ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10180
2025-04-06 14:05:32 +00:00 · 2002-11-07 02:34:46 +00:00 · 2002-11-07 02:34:46 +00:00 · 2d39fda4e3
commit 2d39fda4e3
parent f8f62de907
14 changed files with 286 additions and 205 deletions
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@ -1833,10 +1833,13 @@ static const char * const
 _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
    "U_REGEX_ERROR_START",
    "U_REGEX_INTERNAL_ERROR",
+    "U_REGEX_RULE_SYNTAX",
    "U_REGEX_INVALID_STATE",
    "U_REGEX_BAD_ESCAPE_SEQUENCE",
    "U_REGEX_PROPERTY_SYNTAX",
-    "U_REGEX_UNIMPLEMENTED"
+    "U_REGEX_UNIMPLEMENTED",
+    "U_REGEX_MISMATCHED_PAREN",
+    "U_REGEX_MATCH_MODE_ERROR"
 };

 U_CAPI const char * U_EXPORT2
@ -1852,7 +1855,7 @@ u_errorName(UErrorCode code) {
    } else if (U_BRK_ERROR_START <= code  && code < U_BRK_ERROR_LIMIT){
        return _uBrkErrorName[code - U_BRK_ERROR_START];
    } else if (U_REGEX_ERROR_START <= code && code < U_REGEX_ERROR_LIMIT) {
-        return _uBrkErrorName[code - U_REGEX_ERROR_START];
+        return _uRegexErrorName[code - U_REGEX_ERROR_START];
    } else {
        return "[BOGUS UErrorCode]";
    }
--- a/icu4c/source/common/unicode/uconfig.h
+++ b/icu4c/source/common/unicode/uconfig.h
@ -52,6 +52,7 @@
 #   endif
 #   define UCONFIG_NO_FORMATTING 1
 #   define UCONFIG_NO_TRANSLITERATION 1
+#   define UCONFIG_NO_REGULAR_EXPRESSIONS 1
 #endif

 /* common library switches -------------------------------------------------- */
@ -114,5 +115,16 @@
 #   define UCONFIG_NO_TRANSLITERATION 0
 #endif

+/**
+ * \def UCONFIG_NO_REGULAR_EXPRESSIONS
+ * This switch turns off regular expressions.
+ *
+ * @draft ICU 2.6
+ */
+#ifndef UCONFIG_NO_REGULAR_EXPRESSIONS
+#   define UCONFIG_NO_REGULAR_EXPRESSIONS 0
+#endif
+
+

 #endif
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -502,10 +502,13 @@ typedef enum UErrorCode {
     */
     U_REGEX_ERROR_START=0x10300,
     U_REGEX_INTERNAL_ERROR,
+     U_REGEX_RULE_SYNTAX,
     U_REGEX_INVALID_STATE,
     U_REGEX_BAD_ESCAPE_SEQUENCE,
     U_REGEX_PROPERTY_SYNTAX,
     U_REGEX_UNIMPLEMENTED,
+     U_REGEX_MISMATCHED_PAREN,
+     U_REGEX_MATCH_MODE_ERROR,
     U_REGEX_ERROR_LIMIT,

    U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -5,11 +5,14 @@
 //  Copyright (C) 2002, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
-//  This file contains the ICU regular expression scanner, which is responsible
-//  for preprocessing a regular expression pattern into the tokenized form that
+//  This file contains the ICU regular expression compiler, which is responsible
+//  for processing a regular expression pattern into the compiled form that
 //  is used by the match finding engine.
 //

+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS

 #include "unicode/unistr.h"
 #include "unicode/uniset.h"
@ -18,18 +21,18 @@
 #include "unicode/parsepos.h"
 #include "unicode/parseerr.h"
 #include "unicode/regex.h"
-#include "regeximp.h"
 #include "uprops.h"
 #include "cmemory.h"
 #include "cstring.h"
+#include "uassert.h"

 #include "stdio.h"    // TODO:  Get rid of this

+#include "regeximp.h"
 #include "regexcst.h"   // Contains state table for the regex pattern parser.
                       //   generated by a Perl script.
 #include "regexcmp.h"

-#include "uassert.h"


 U_NAMESPACE_BEGIN
@ -52,10 +55,10 @@ static const int RESCAN_DEBUG = 0;

 // Characters that have no special meaning, and thus do not need to be escaped.  Expressed
 //    as the inverse of those needing escaping --  [^\*\?\+\[\(\)\{\}\^\$\|\\\.]
-static const UChar gRuleSet_rule_char_pattern[]       = { 
+static const UChar gRuleSet_rule_char_pattern[]       = {
 //   [    ^      \     *     \     ?     \     +     \     [     \     (     /     )
-    0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29, 
- //   \     {    \     }     \     ^     \     $     \     |     \     \     \     .     ]   
+    0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
+ //   \     {    \     }     \     ^     \     $     \     |     \     \     \     .     ]
    0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};


@ -72,7 +75,7 @@ static UnicodeSet  *gUnescapeCharSet;
 //    will handle.
 //
 static const UChar gUnescapeCharPattern[] = {
-//    [     a     c     e     f     n     r     t     u     U     ] 
+//    [     a     c     e     f     n     r     t     u     U     ]
    0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};


@ -123,7 +126,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)

    //
    //  Set up the constant (static) Unicode Sets.
-    //    
+    //
    if (gRuleSets[kRuleSet_rule_char-128] == NULL) {
        //  TODO:  Make thread safe.
        //  TODO:  Memory Cleanup on ICU shutdown.
@ -131,8 +134,8 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
        gRuleSets[kRuleSet_white_space-128]     = (UnicodeSet*) uprv_openRuleWhiteSpaceSet(&status);
        gRuleSets[kRuleSet_digit_char-128]      = new UnicodeSet(gRuleSet_digit_char_pattern,      status);
        gUnescapeCharSet                        = new UnicodeSet(gUnescapeCharPattern,             status);
-        gPropSets[URX_ISWORD_SET]               = new UnicodeSet(gIsWordPattern,                   status); 
-        gPropSets[URX_ISSPACE_SET]              = new UnicodeSet(gIsSpacePattern,                  status); 
+        gPropSets[URX_ISWORD_SET]               = new UnicodeSet(gIsWordPattern,                   status);
+        gPropSets[URX_ISSPACE_SET]              = new UnicodeSet(gIsSpacePattern,                  status);

        if (U_FAILURE(status)) {
            delete gRuleSets[kRuleSet_rule_char-128];
@ -171,7 +174,7 @@ RegexCompile::~RegexCompile() {
 //                         script regexcst.pl
 //
 //---------------------------------------------------------------------------------
-void    RegexCompile::compile(                    
+void    RegexCompile::compile(
                         RegexPattern &rxp,          // User level pattern object to receive
                                                     //    the compiled pattern.
                         const UnicodeString &pat,   // Source pat to be compiled.
@ -285,7 +288,7 @@ void    RegexCompile::compile(
        if (tableEl->fPushState != 0) {
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
-                error(U_BRK_INTERNAL_ERROR);
+                error(U_REGEX_INTERNAL_ERROR);
                printf("RegexCompile::parse() - state stack overflow.\n");
                fStackPtr--;
            }
@ -304,7 +307,7 @@ void    RegexCompile::compile(
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0) {
-                error(U_BRK_INTERNAL_ERROR);
+                error(U_REGEX_INTERNAL_ERROR);
                printf("RegexCompile::compile() - state stack underflow.\n");
                fStackPtr++;
            }
@ -358,12 +361,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
        //  Encountering end of pattern also behaves like a close paren,
        //   and forces fixups of the State Save at the beginning of the compiled pattern
        //   and of any OR operations at the top level.
-        // 
+        //
        handleCloseParen();
-        
+        if (fParenStack.size() > 0) {
+            // Missing close paren in pattern.
+            error(U_REGEX_MISMATCHED_PAREN);
+        }
+
        // add the END operation to the compiled pattern.
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
-        
+
        // Terminate the pattern compilation state machine.
        returnVal = FALSE;
        break;
@ -405,7 +412,7 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doOpenCaptureParen:
        // Open Paren.
-        //   Compile to a 
+        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - START_CAPTURE
@ -430,7 +437,7 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doOpenNonCaptureParen:
        // Open non-caputuring (grouping only) Paren.
-        //   Compile to a 
+        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - NOP, which may later be replaced by a save-state if there
@ -440,7 +447,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);

            // On the Parentheses stack, start a new frame and add the postions
-            //   of the two NOPs.  
+            //   of the two NOPs.
            fParenStack.push(-1, *fStatus);                               // Begin a new frame.
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
@ -475,6 +482,10 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doCloseParen:
        handleCloseParen();
+        if (fParenStack.size() <= 0) {
+            //  Extra close paren, or missing open paren.
+            error(U_REGEX_MISMATCHED_PAREN);
+        }
        break;

    case doNOP:
@ -483,11 +494,16 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doBadOpenParenType:
    case doRuleError:
-        error(U_BRK_RULE_SYNTAX);
+        error(U_REGEX_RULE_SYNTAX);
        returnVal = FALSE;
        break;


+    case doMismatchedParenErr:
+        error(U_REGEX_MISMATCHED_PAREN);
+        returnVal = FALSE;
+        break;
+
    case doPlus:
        //  Normal '+'  compiles to
        //     1.   stuff to be repeated  (already built)
@ -532,7 +548,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
        //     3. ...
        // Insert the state save into the compiled pattern, and we're done.
        {
-            int32_t   saveStateLoc = blockTopLoc(TRUE);      
+            int32_t   saveStateLoc = blockTopLoc(TRUE);
            int32_t   saveStateOp  = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
        }
@ -572,9 +588,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
        //       3.   JMP  0
        //       4.   ...
        //
-        { 
+        {
            // location of item #1, the STATE_SAVE
-            int32_t   saveStateLoc = blockTopLoc(TRUE);       
+            int32_t   saveStateLoc = blockTopLoc(TRUE);

            // Locate the position in the compiled pattern where the match will continue
            //   after completing the *.   (4 in the comment above)
@ -599,7 +615,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
        //     3.   STATE_SAVE  2
        //     4    ...
        {
-            int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1. 
+            int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
            int32_t     jmpOp   = URX_BUILD(URX_JMP, saveLoc);
            int32_t     stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
@ -607,7 +623,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
            fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
        }
        break;
-        
+

    case doStartString:
        // We've just scanned a single "normal" character from the pattern,
@ -678,7 +694,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
        //   has only one character, emit the single character token instead.
        {
            int32_t   strLength = fRXPat->fLiteralText.length() - fStringOpStart;
-            U_ASSERT(strLength > 0);  
+            U_ASSERT(strLength > 0);
            int32_t  lastCharIdx = fRXPat->fLiteralText.length()-1;
            lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx);
            if (lastCharIdx == fStringOpStart) {
@ -735,7 +751,7 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doBackslashG:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
-        break;        
+        break;

    case doBackslashS:
        fRXPat->fCompiledPat->addElement(
@ -750,31 +766,31 @@ UBool RegexCompile::doParseActions(EParseAction action)
    case doBackslashW:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
-        break;        
+        break;

    case doBackslashw:
        fRXPat->fCompiledPat->addElement(
            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
-        break;        
+        break;

    case doBackslashX:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
-        break;        
+        break;

    case doBackslashx:              // \x{abcd}   alternate hex format
-        //  TODO:  implement 
+        //  TODO:  implement
        error(U_REGEX_UNIMPLEMENTED);
        break;
-            
+


    case doBackslashZ:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
-        break;        
+        break;

    case doBackslashz:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
-        break;        
+        break;

    case doExit:
        returnVal = FALSE;
@ -806,12 +822,23 @@ UBool RegexCompile::doParseActions(EParseAction action)
        break;

    case doNamedChar:            // \N{NAMED_CHAR}
-        //  TODO:  implement 
+        //  TODO:  implement
        error(U_REGEX_UNIMPLEMENTED);
        break;
-            
+
+    case doMatchMode:   //  (?i)    and similar
+        // TODO:  implement
+        error(U_REGEX_UNIMPLEMENTED);
+        break;
+
+    case doNotImplementedError:
+        // TODO:  get rid of this once everything is implemented.
+        error(U_REGEX_UNIMPLEMENTED);
+        break;
+
+
    default:
-        error(U_BRK_INTERNAL_ERROR);
+        error(U_REGEX_INTERNAL_ERROR);
        returnVal = FALSE;
        break;
    }
@ -838,7 +865,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
 //------------------------------------------------------------------------------
 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
    int32_t   theLoc;
-    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)    
+    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
    {
        // The item just processed is a parenthesized block.
        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
@ -878,8 +905,11 @@ int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
 void  RegexCompile::handleCloseParen() {
    int32_t   patIdx;
    int32_t   patOp;
-    U_ASSERT(fParenStack.size() >= 1);
-    
+    if (fParenStack.size() <= 0) {
+        error(U_REGEX_MISMATCHED_PAREN);
+        return;
+    }
+
    // Fixup any operations within the just-closed parenthesized group
    //    that need to reference the end of the (block).
    //    (The first one on popped from the stack is an unused slot for
@ -896,17 +926,17 @@ void  RegexCompile::handleCloseParen() {
        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
        fMatchOpenParen     = patIdx;
    }
-    
+
    // DO any additional fixups, depending on the specific kind of
    // parentesized grouping this is
-    
+
    switch (patIdx) {
    case -1:
        // No additional fixups required.
        //   This is the case with most kinds of groupings.
        break;
    case -2:
-        // Capturing Parentheses.  
+        // Capturing Parentheses.
        //   Insert a End Capture op into the pattern.
        //   Grab the group number from the start capture op
        //      and put it into the end-capture op.
@ -1039,7 +1069,7 @@ UChar32  RegexCompile::nextCharLL() {
        fLineNum++;
        fCharNum=0;
        if (fQuoteMode) {
-            error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
+            error(U_REGEX_RULE_SYNTAX);
            fQuoteMode = FALSE;
        }
    }
@ -1120,7 +1150,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
        //  Use UnicodeString::unescapeAt() to handle those that it can.
        //  Otherwise just return the '\', and let the pattern parser deal with it.
        //
-        int32_t startX = fNextIndex;  // start and end positions of the 
+        int32_t startX = fNextIndex;  // start and end positions of the
        int32_t endX   = fNextIndex;  //   sequence following the '\'
        if (c.fChar == chBackSlash) {
            if (gUnescapeCharSet->contains(peekCharLL())) {
@ -1148,7 +1178,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
 //
 //             The scan position is normally under the control of the state machine
 //             that controls pattern parsing.  UnicodeSets, however, are parsed by
-//             the UnicodeSet constructor, not by the Regex pattern parser.  
+//             the UnicodeSet constructor, not by the Regex pattern parser.
 //
 //---------------------------------------------------------------------------------
 UnicodeSet *RegexCompile::scanSet() {
@ -1193,7 +1223,7 @@ UnicodeSet *RegexCompile::scanSet() {
 //---------------------------------------------------------------------------------
 //
 //  scanProp   Construct a UnicodeSet from the text at the current scan
-//             position, which will be of the form \p{whaterver} 
+//             position, which will be of the form \p{whatever}
 //
 //             The scan position will be at the 'p' or 'P'.  On return
 //             the scan position should be just after the '}'
@ -1240,6 +1270,5 @@ UnicodeSet *RegexCompile::scanProp() {
    return uset;
 };

-
 U_NAMESPACE_END
-
+#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -4,8 +4,10 @@
 //  Copyright (C) 2002, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
-//  This file contains declarations for the class RegexCompile and for compiled
-//  regular expression data format
+//  This file contains declarations for the class RegexCompile
+//
+//  This class is internal to the regular expression implementation.
+//  For the public Regular Expression API, see the file "unicode/regex.h"
 //


@ -13,6 +15,8 @@
 #define RBBISCAN_H

 #include "unicode/utypes.h"
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+
 #include "unicode/uobject.h"
 #include "unicode/uniset.h"
 #include "unicode/parseerr.h"
@ -28,12 +32,7 @@ static const UBool REGEX_DEBUG = TRUE;

 //--------------------------------------------------------------------------------
 //
-//  class RegexCompile    does the lowest level, character-at-a-time
-//                        scanning of a regular expression.  
-//
-//                        The output of the scanner is a tokenized form
-//                        of the RE, plus prebuilt UnicodeSet objects for each
-//                        set of charcters that is referenced.
+//  class RegexCompile    Contains the regular expression compiler.
 //
 //--------------------------------------------------------------------------------
 static const int    kStackSize = 100;               // The size of the state stack for
@ -161,5 +160,5 @@ private:
 };

 U_NAMESPACE_END
-
-#endif
+#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
+#endif   // RBBISCAN_H
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -24,7 +24,6 @@ U_NAMESPACE_BEGIN
 enum Regex_PatternParseAction {
    doCloseParen,
    doProperty,
-    doTagValue,
    doOrOperator,
    doOpenCaptureParen,
    doBadOpenParenType,
@ -35,6 +34,7 @@ enum Regex_PatternParseAction {
    doNamedChar,
    doBackslashw,
    doPossesiveStar,
+    doMismatchedParenErr,
    doOpenLookBehind,
    doBackslashx,
    doBackslashz,
@ -43,6 +43,7 @@ enum Regex_PatternParseAction {
    doEnterQuoteMode,
    doPossesivePlus,
    doNGStar,
+    doMatchMode,
    doOpenLookAheadNeg,
    doPlus,
    doOpenNonCaptureParen,
@ -51,14 +52,11 @@ enum Regex_PatternParseAction {
    doNGPlus,
    doPatFinish,
    doBackslashD,
-    doIntervalMinValue,
-    doIntervalDigit,
    doPossesiveOpt,
    doBackslashG,
    doOpt,
    doOpenAtomicParen,
    doBackslashS,
-    doNumberExpectedError,
    doStringChar,
    doOpenLookAhead,
    doBackRef,
@ -74,6 +72,7 @@ enum Regex_PatternParseAction {
    doBackslashb,
    doEndString,
    doBackslashd,
+    doNotImplementedError,
    doOpenLookBehindNeg,
    doSplitString,
    rbbiLastAction};
@ -100,13 +99,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doStartString, 254, 13,0,  TRUE}     //  3      term
    , {doStartString, 130, 13,0,  TRUE}     //  4 
    , {doScanUnicodeSet, 91 /* [ */, 20,0,  TRUE}     //  5 
-    , {doNOP, 40 /* ( */, 27, 20, TRUE}     //  6 
+    , {doNOP, 40 /* ( */, 28, 20, TRUE}     //  6 
    , {doDotAny, 46 /* . */, 20,0,  TRUE}     //  7 
    , {doCaret, 94 /* ^ */, 3,0,  TRUE}     //  8 
    , {doDollar, 36 /* $ */, 3,0,  TRUE}     //  9 
-    , {doNOP, 92 /* \ */, 60,0,  TRUE}     //  10 
+    , {doNOP, 92 /* \ */, 67,0,  TRUE}     //  10 
    , {doNOP, 253, 2,0,  FALSE}     //  11 
-    , {doRuleError, 255, 80,0,  FALSE}     //  12 
+    , {doRuleError, 255, 87,0,  FALSE}     //  12 
    , {doStringChar, 254, 13,0,  TRUE}     //  13      string
    , {doStringChar, 130, 13,0,  TRUE}     //  14 
    , {doSplitString, 63 /* ? */, 20,0,  FALSE}     //  15 
@ -114,67 +113,74 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doSplitString, 42 /* * */, 20,0,  FALSE}     //  17 
    , {doSplitString, 123 /* { */, 20,0,  FALSE}     //  18 
    , {doEndString, 255, 20,0,  FALSE}     //  19 
-    , {doNOP, 42 /* * */, 41,0,  TRUE}     //  20      expr-quant
-    , {doNOP, 43 /* + */, 44,0,  TRUE}     //  21 
-    , {doNOP, 63 /* ? */, 47,0,  TRUE}     //  22 
-    , {doNOP, 255, 24,0,  FALSE}     //  23 
-    , {doOrOperator, 124 /* | */, 3,0,  TRUE}     //  24      expr-cont
-    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  25 
-    , {doNOP, 255, 3,0,  FALSE}     //  26 
-    , {doNOP, 63 /* ? */, 29,0,  TRUE}     //  27      open-paren
-    , {doOpenCaptureParen, 255, 3, 20, FALSE}     //  28 
-    , {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE}     //  29      open-paren-extended
-    , {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE}     //  30 
-    , {doOpenLookAhead, 61 /* = */, 3, 24, TRUE}     //  31 
-    , {doOpenLookAheadNeg, 33 /* ! */, 3, 24, TRUE}     //  32 
-    , {doNOP, 60 /* < */, 36,0,  TRUE}     //  33 
-    , {doNOP, 35 /* # */, 39,0,  TRUE}     //  34 
-    , {doBadOpenParenType, 255, 80,0,  FALSE}     //  35 
-    , {doOpenLookBehind, 61 /* = */, 3, 24, TRUE}     //  36      open-paren-lookbehind
-    , {doOpenLookBehindNeg, 33 /* ! */, 3, 24, TRUE}     //  37 
-    , {doBadOpenParenType, 255, 80,0,  FALSE}     //  38 
-    , {doNOP, 41 /* ) */, 3,0,  TRUE}     //  39      paren-comment
-    , {doNOP, 255, 39,0,  TRUE}     //  40 
-    , {doNGStar, 63 /* ? */, 24,0,  TRUE}     //  41      quant-star
-    , {doPossesiveStar, 43 /* + */, 24,0,  TRUE}     //  42 
-    , {doStar, 255, 24,0,  FALSE}     //  43 
-    , {doNGPlus, 63 /* ? */, 24,0,  TRUE}     //  44      quant-plus
-    , {doPossesivePlus, 43 /* + */, 24,0,  TRUE}     //  45 
-    , {doPlus, 255, 24,0,  FALSE}     //  46 
-    , {doNGOpt, 63 /* ? */, 24,0,  TRUE}     //  47      quant-opt
-    , {doPossesiveOpt, 43 /* + */, 24,0,  TRUE}     //  48 
-    , {doOpt, 255, 24,0,  FALSE}     //  49 
-    , {doNOP, 129, 50,0,  TRUE}     //  50      interval-open
-    , {doIntervalMinValue, 128, 53,0,  FALSE}     //  51 
-    , {doNumberExpectedError, 255, 80,0,  FALSE}     //  52 
-    , {doNOP, 129, 57,0,  TRUE}     //  53      interval-value
-    , {doNOP, 125 /* } */, 57,0,  FALSE}     //  54 
-    , {doIntervalDigit, 128, 53,0,  TRUE}     //  55 
-    , {doNumberExpectedError, 255, 80,0,  FALSE}     //  56 
-    , {doNOP, 129, 57,0,  TRUE}     //  57      interval-close
-    , {doTagValue, 125 /* } */, 24,0,  TRUE}     //  58 
-    , {doNumberExpectedError, 255, 80,0,  FALSE}     //  59 
-    , {doBackslashA, 65 /* A */, 3,0,  TRUE}     //  60      backslash
-    , {doBackslashB, 66 /* B */, 3,0,  TRUE}     //  61 
-    , {doBackslashb, 98 /* b */, 3,0,  TRUE}     //  62 
-    , {doBackslashd, 100 /* d */, 20,0,  TRUE}     //  63 
-    , {doBackslashD, 68 /* D */, 20,0,  TRUE}     //  64 
-    , {doBackslashG, 71 /* G */, 3,0,  TRUE}     //  65 
-    , {doNamedChar, 78 /* N */, 20,0,  TRUE}     //  66 
-    , {doProperty, 112 /* p */, 20,0,  FALSE}     //  67 
-    , {doProperty, 80 /* P */, 20,0,  FALSE}     //  68 
-    , {doEnterQuoteMode, 81 /* Q */, 3,0,  TRUE}     //  69 
-    , {doBackslashS, 83 /* S */, 20,0,  TRUE}     //  70 
-    , {doBackslashs, 115 /* s */, 20,0,  TRUE}     //  71 
-    , {doBackslashW, 87 /* W */, 20,0,  TRUE}     //  72 
-    , {doBackslashw, 119 /* w */, 20,0,  TRUE}     //  73 
-    , {doBackslashX, 88 /* X */, 20,0,  TRUE}     //  74 
-    , {doBackslashx, 120 /* x */, 20,0,  TRUE}     //  75 
-    , {doBackslashZ, 90 /* Z */, 3,0,  TRUE}     //  76 
-    , {doBackslashz, 122 /* z */, 3,0,  TRUE}     //  77 
-    , {doBackRef, 128, 20,0,  TRUE}     //  78 
-    , {doStartString, 255, 13,0,  TRUE}     //  79 
-    , {doExit, 255, 80,0,  TRUE}     //  80      errorDeath
+    , {doNOP, 42 /* * */, 56,0,  TRUE}     //  20      expr-quant
+    , {doNOP, 43 /* + */, 59,0,  TRUE}     //  21 
+    , {doNOP, 63 /* ? */, 62,0,  TRUE}     //  22 
+    , {doNOP, 123 /* { */, 65,0,  TRUE}     //  23 
+    , {doNOP, 255, 25,0,  FALSE}     //  24 
+    , {doOrOperator, 124 /* | */, 3,0,  TRUE}     //  25      expr-cont
+    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  26 
+    , {doNOP, 255, 3,0,  FALSE}     //  27 
+    , {doNOP, 63 /* ? */, 30,0,  TRUE}     //  28      open-paren
+    , {doOpenCaptureParen, 255, 3, 20, FALSE}     //  29 
+    , {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE}     //  30      open-paren-extended
+    , {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE}     //  31 
+    , {doOpenLookAhead, 61 /* = */, 3, 25, TRUE}     //  32 
+    , {doOpenLookAheadNeg, 33 /* ! */, 3, 25, TRUE}     //  33 
+    , {doNOP, 60 /* < */, 42,0,  TRUE}     //  34 
+    , {doNOP, 35 /* # */, 45,0,  TRUE}     //  35 
+    , {doMatchMode, 105 /* i */, 48,0,  TRUE}     //  36 
+    , {doMatchMode, 120 /* x */, 48,0,  TRUE}     //  37 
+    , {doMatchMode, 115 /* s */, 48,0,  TRUE}     //  38 
+    , {doMatchMode, 109 /* m */, 48,0,  TRUE}     //  39 
+    , {doMatchMode, 45 /* - */, 48,0,  TRUE}     //  40 
+    , {doBadOpenParenType, 255, 87,0,  FALSE}     //  41 
+    , {doOpenLookBehind, 61 /* = */, 3, 25, TRUE}     //  42      open-paren-lookbehind
+    , {doOpenLookBehindNeg, 33 /* ! */, 3, 25, TRUE}     //  43 
+    , {doBadOpenParenType, 255, 87,0,  FALSE}     //  44 
+    , {doNOP, 41 /* ) */, 3,0,  TRUE}     //  45      paren-comment
+    , {doMismatchedParenErr, 253, 87,0,  FALSE}     //  46 
+    , {doNOP, 255, 45,0,  TRUE}     //  47 
+    , {doMatchMode, 105 /* i */, 48,0,  TRUE}     //  48      paren-flag
+    , {doMatchMode, 115 /* s */, 48,0,  TRUE}     //  49 
+    , {doMatchMode, 109 /* m */, 48,0,  TRUE}     //  50 
+    , {doMatchMode, 120 /* x */, 48,0,  TRUE}     //  51 
+    , {doMatchMode, 45 /* - */, 48,0,  TRUE}     //  52 
+    , {doNOP, 41 /* ) */, 3,0,  TRUE}     //  53 
+    , {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE}     //  54 
+    , {doNOP, 255, 87,0,  FALSE}     //  55 
+    , {doNGStar, 63 /* ? */, 25,0,  TRUE}     //  56      quant-star
+    , {doPossesiveStar, 43 /* + */, 25,0,  TRUE}     //  57 
+    , {doStar, 255, 25,0,  FALSE}     //  58 
+    , {doNGPlus, 63 /* ? */, 25,0,  TRUE}     //  59      quant-plus
+    , {doPossesivePlus, 43 /* + */, 25,0,  TRUE}     //  60 
+    , {doPlus, 255, 25,0,  FALSE}     //  61 
+    , {doNGOpt, 63 /* ? */, 25,0,  TRUE}     //  62      quant-opt
+    , {doPossesiveOpt, 43 /* + */, 25,0,  TRUE}     //  63 
+    , {doOpt, 255, 25,0,  FALSE}     //  64 
+    , {doNOP, 129, 65,0,  TRUE}     //  65      interval-open
+    , {doNotImplementedError, 255, 87,0,  FALSE}     //  66 
+    , {doBackslashA, 65 /* A */, 3,0,  TRUE}     //  67      backslash
+    , {doBackslashB, 66 /* B */, 3,0,  TRUE}     //  68 
+    , {doBackslashb, 98 /* b */, 3,0,  TRUE}     //  69 
+    , {doBackslashd, 100 /* d */, 20,0,  TRUE}     //  70 
+    , {doBackslashD, 68 /* D */, 20,0,  TRUE}     //  71 
+    , {doBackslashG, 71 /* G */, 3,0,  TRUE}     //  72 
+    , {doNamedChar, 78 /* N */, 20,0,  TRUE}     //  73 
+    , {doProperty, 112 /* p */, 20,0,  FALSE}     //  74 
+    , {doProperty, 80 /* P */, 20,0,  FALSE}     //  75 
+    , {doEnterQuoteMode, 81 /* Q */, 3,0,  TRUE}     //  76 
+    , {doBackslashS, 83 /* S */, 20,0,  TRUE}     //  77 
+    , {doBackslashs, 115 /* s */, 20,0,  TRUE}     //  78 
+    , {doBackslashW, 87 /* W */, 20,0,  TRUE}     //  79 
+    , {doBackslashw, 119 /* w */, 20,0,  TRUE}     //  80 
+    , {doBackslashX, 88 /* X */, 20,0,  TRUE}     //  81 
+    , {doBackslashx, 120 /* x */, 20,0,  TRUE}     //  82 
+    , {doBackslashZ, 90 /* Z */, 3,0,  TRUE}     //  83 
+    , {doBackslashz, 122 /* z */, 3,0,  TRUE}     //  84 
+    , {doBackRef, 128, 20,0,  TRUE}     //  85 
+    , {doStartString, 255, 13,0,  TRUE}     //  86 
+    , {doExit, 255, 87,0,  TRUE}     //  87      errorDeath
 };
 static const char *RegexStateNames[] = {    0,
     "start",
@ -199,6 +205,7 @@ static const char *RegexStateNames[] = {    0,
     "expr-quant",
    0,
    0,
+    0,
    0,
     "expr-cont",
    0,
@ -211,11 +218,25 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
+    0,
+    0,
+    0,
+    0,
    0,
     "open-paren-lookbehind",
    0,
    0,
     "paren-comment",
+    0,
+    0,
+     "paren-flag",
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
    0,
     "quant-star",
    0,
@ -227,14 +248,6 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
     "interval-open",
-    0,
-    0,
-     "interval-value",
-    0,
-    0,
-    0,
-     "interval-close",
-    0,
    0,
     "backslash",
    0,
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -107,7 +107,8 @@ string:
 expr-quant:
    '*'                  n  quant-star                       
    '+'                  n  quant-plus                              
-    '?'                  n  quant-opt        
+    '?'                  n  quant-opt     
+    '{'                  n  interval-open
    default                 expr-cont 
    
    
@ -136,6 +137,11 @@ open-paren-extended:
    '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
    '<'                  n  open-paren-lookbehind
    '#'                  n  paren-comment
+    'i'                  n  paren-flag                              doMatchMode
+    'x'                  n  paren-flag                              doMatchMode
+    's'                  n  paren-flag                              doMatchMode
+    'm'                  n  paren-flag                              doMatchMode
+    '-'                  n  paren-flag                              doMatchMode
    default                 errorDeath                              doBadOpenParenType
    
 open-paren-lookbehind:
@ -150,7 +156,21 @@ open-paren-lookbehind:
 #
 paren-comment:
    ')'                  n  term
+    eof		            errorDeath                              doMismatchedParenErr
    default              n  paren-comment
+
+#
+#  paren-flag    Scanned a (?ismx-ismx  flag-setting construct
+#                TODO:  this is not fully implemented yet.
+paren-flag:
+    'i'                  n  paren-flag                              doMatchMode
+    's'                  n  paren-flag                              doMatchMode
+    'm'                  n  paren-flag                              doMatchMode
+    'x'                  n  paren-flag                              doMatchMode
+    '-'                  n  paren-flag                              doMatchMode
+    ')'                  n  term
+    ':'                  n  term              ^expr-quant           doOpenNonCaptureParen
+    default                 errorDeath
    
    
 #
@ -189,19 +209,8 @@ quant-opt:
 #
 interval-open:
    white_space          n  interval-open
-    digit_char              interval-value                          doIntervalMinValue
-    default                 errorDeath                              doNumberExpectedError
+    default                 errorDeath                              doNotImplementedError
    
-interval-value:
-    white_space          n  interval-close
-    '}'                     interval-close
-    digit_char           n  interval-value                          doIntervalDigit
-    default                 errorDeath                              doNumberExpectedError
-    
-interval-close:
-    white_space          n  interval-close
-    '}'                  n  expr-cont                               doTagValue
-    default                 errorDeath                              doNumberExpectedError
    
    
    
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -4,9 +4,9 @@
 //
 //   file:  regeximp.h
 //
-//           ICU Regular Expressions, declarations of internal implementation types
-//           and constants that are common between the pattern compiler and the 
-//           runtime execution engine.
+//           ICU Regular Expressions,
+//               Definitions of constant values used in the compiled form of
+//               a regular expression pattern.
 //

 #ifndef _REGEXIMP_H
@ -19,7 +19,7 @@
 //
 static const uint32_t     URX_UNUSED1       = 1;
 static const uint32_t     URX_END           = 2;
-static const uint32_t     URX_ONECHAR       = 3;
+static const uint32_t     URX_ONECHAR       = 3;    // Value field is the 21 bit unicode char to match
 static const uint32_t     URX_STRING        = 4;    // Value field is index of string start
 static const uint32_t     URX_STRING_LEN    = 5;    // Value field is string length (code units)
 static const uint32_t     URX_STATE_SAVE    = 6;    // Value field is pattern position to push
@ -55,7 +55,7 @@ static const uint32_t     URX_DOLLAR        = 24;   // Also for \Z

                
 //
-//  Access to Unicode Sets for composite properties
+//  Access to Unicode Sets for Perl-like composite character properties
 //     The sets are accessed by the match engine for things like \w (word boundary)
 //     
 static const uint32_t     URX_ISWORD_SET  = 1;
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -1,6 +1,9 @@
 //
 //  file:  rematch.cpp    
 //
+//         Contains the implementation of class RegexMatcher,
+//         which is one of the main API classes for the ICU regular expression package.
+//
 /*
 **********************************************************************
 *   Copyright (C) 2002 International Business Machines Corporation   *
@ -9,6 +12,8 @@
 */

 #include "unicode/utypes.h"
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+
 #include "unicode/regex.h"
 #include "unicode/uniset.h"
 #include "unicode/uchar.h"
@ -443,10 +448,11 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
 //   isWordBoundary 
 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
 //                     For us,
-//                       If the current char is a combining mark, \b is FALSE
-//                       Scan backwards to the first non-combining char
-//                       Pos is a boundary if the current and previous chars are
-//                            opposite in membership in \w set
+//                       If the current char is a combining mark,
+//                          \b is FALSE.
+//                       Else scan backwards to the first non-combining char.
+//                            We are at a boundary if this char and the original char are
+//                               opposite in membership in \w set
 //
 //--------------------------------------------------------------------------------
 UBool RegexMatcher::isWordBoundary(int32_t pos) {
@ -486,27 +492,6 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
 }


-//--------------------------------------------------------------------------------
-//
-//    getCaptureText    We have encountered a '\' that might preceed a
-//                      capture group specification. 
-//                      If a valid capture group number follows the '\', 
-//                      return the indicies to the start & end of the captured
-//                      text, and update the patIdx to the position following the
-//                      \n sequence.
-//
-//                      This function is used during find and replace operations when
-//                      processing caputure references in the replacement text.
-//
-//--------------------------------------------------------------------------------
-UBool  RegexMatcher::getCaptureText(const UnicodeString &rep,
-                                int32_t &repIdx,
-                                int32_t &textStart,
-                                int32_t &textEnd)
-{
-    return FALSE;
-}
-
 //--------------------------------------------------------------------------------
 //
 //     backTrack    Within the match engine, this function is called when
@ -915,10 +900,9 @@ breakFromLoop:



-
-
 const char RegexMatcher::fgClassID = 0;

 U_NAMESPACE_END

+#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS

--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -9,6 +9,9 @@
 */

 #include "unicode/utypes.h"
+
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+
 #include "unicode/regex.h"
 #include "uassert.h"
 #include "uvector.h"
@ -66,6 +69,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    fBadState         = other.fBadState;
    fNumCaptureGroups = other.fNumCaptureGroups;
    fMaxCaptureDigits = other.fMaxCaptureDigits;
+    fStaticSets       = other.fStaticSets;    
    if (fBadState) {
        return *this;
    }
@ -110,6 +114,7 @@ void RegexPattern::init() {
    fBadState         = FALSE;
    fNumCaptureGroups = 0;
    fMaxCaptureDigits = 1;     // TODO:  calculate for real.
+    fStaticSets       = NULL;
    fMatcher          = NULL;
    
    UErrorCode status=U_ZERO_ERROR;
@ -384,15 +389,6 @@ int32_t  RegexPattern::split(const UnicodeString &input,



-//---------------------------------------------------------------------
-//
-//   hashcode
-//
-//---------------------------------------------------------------------
-int32_t   RegexPattern::hashCode(void) const {
-    return 0;           // TODO:   Do something better here
-};
-

 //---------------------------------------------------------------------
 //
@ -512,8 +508,8 @@ breakFromLoop:
    printf("\n\n");
 };

-
-
 const char RegexPattern::fgClassID = 0;

+
 U_NAMESPACE_END
+#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -9,6 +9,9 @@
 #define REGEX_H

 #include "unicode/utypes.h"
+
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+
 #include "unicode/uobject.h"
 #include "unicode/unistr.h"
 #include "unicode/parseerr.h"
@ -69,9 +72,6 @@ public:
    RegexPattern  &operator =(const RegexPattern &other);
    virtual RegexPattern  *clone() const;

-    // TODO:  Do we really want a hashCode function on this class?
-    virtual int32_t         hashCode(void) const;
-    
    
   /**
    *     Compiles the given regular expression into a pattern 
@ -428,10 +428,6 @@ private:
    //
    void         MatchAt(int32_t startIdx, UErrorCode &status);   
    inline  void backTrack(int32_t &inputIdx, int32_t &patIdx);
-    UBool        getCaptureText(const UnicodeString &rep,
-                                int32_t &repIdx,
-                                int32_t &textStart,
-                                int32_t &textEnd);
    UBool        isWordBoundary(int32_t pos);         // perform the \b test


@ -448,7 +444,6 @@ private:

 };  

-
-
 U_NAMESPACE_END
+#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
 #endif
--- a/icu4c/source/test/intltest/itmajor.cpp
+++ b/icu4c/source/test/intltest/itmajor.cpp
@ -70,11 +70,13 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
                break;

        case 3: name = "regex";
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
                if (exec) {
                    logln("TestSuite Regex---"); logln();
                    RegexTest test;
                    callTest( test, par );
                }
+#endif
                break;

        case 4: name = "format";
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -11,6 +11,8 @@
 //

 #include "unicode/utypes.h"
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+
 #include "unicode/uchar.h"
 #include "intltest.h"
 #include "regextst.h"
@ -1195,8 +1197,38 @@ void RegexTest::Errors() {
    REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
-    //REGEX_ERR("Comment (?# with no close", 1, 0, U_REGEX_INTERNAL_ERROR);
+    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
+    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
+    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
+
+    // Extra close paren
+    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
+    REGEX_ERR(")))))))", 1, 1, U_REGEX_RULE_SYNTAX);
+    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
+
+    // Flag settings not yet implemented
+    REGEX_ERR("(?i:stuff*)", 1, 3, U_REGEX_UNIMPLEMENTED);
+    REGEX_ERR("(?-si) stuff", 1, 3, U_REGEX_UNIMPLEMENTED);
+
+    // Look-ahead, Look-behind
+    REGEX_ERR("abc(?=xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED);    // look-ahead
+    REGEX_ERR("abc(?!xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED);    // negated look-ahead
+    REGEX_ERR("abc(?<=xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED);   // look-behind
+    REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED);   // negated look-behind
+    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
+
+    // Atomic Grouping
+    REGEX_ERR("abc(?>xyz)", 1, 6, U_REGEX_UNIMPLEMENTED);
+
+    // {Numeric Quantifiers}
+    REGEX_ERR("abc{4}", 1, 5, U_REGEX_UNIMPLEMENTED);
+
+
+    // Quantifiers are allowed only after something that can be quantified.
+    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
+    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
+    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
 }

-
+#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */

--- a/icu4c/source/test/intltest/regextst.h
+++ b/icu4c/source/test/intltest/regextst.h
@ -8,6 +8,8 @@
 #ifndef REGEXTST_H
 #define REGEXTST_H

+#include "unicode/utypes.h"
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS

 #include "intltest.h"
 #include "unicode/regex.h"
@ -35,4 +37,6 @@ public:
    virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
                            UErrorCode expectedStatus, int line);
 };
+
+#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
 #endif