ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10220
2025-04-16 18:25:57 +00:00 · 2002-11-11 18:49:49 +00:00 · 2002-11-11 18:49:49 +00:00 · fa16d0f578
commit fa16d0f578
parent 1560a2fb59
8 changed files with 552 additions and 336 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -244,13 +244,13 @@ void    RegexCompile::compile(
        //
        tableEl = &gRuleParseStateTable[state];
        if (RESCAN_DEBUG) {
-            printf("char, line, col = (\'%c\', %d, %d)    state=%s ",
+            printf( "char, line, col = (\'%c\', %d, %d)    state=%s ",
                fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
        }

        for (;;) {    // loop through table rows belonging to this state, looking for one
                      //   that matches the current input char.
-            if (RESCAN_DEBUG) { printf(".");}
+            if (RESCAN_DEBUG) { printf( ".");}
            if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
                // Table row specified an individual character, not a set, and
                //   the input character is not quoted, and
@ -284,7 +284,7 @@ void    RegexCompile::compile(
            // No match on this row, advance to the next  row for this state,
            tableEl++;
        }
-        if (RESCAN_DEBUG) { printf("\n");}
+        if (RESCAN_DEBUG) { printf( "\n");}

        //
        // We've found the row of the state table that matches the current input
@ -301,7 +301,7 @@ void    RegexCompile::compile(
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                error(U_REGEX_INTERNAL_ERROR);
-                printf("RegexCompile::parse() - state stack overflow.\n");
+                // printf( "RegexCompile::parse() - state stack overflow.\n");
                fStackPtr--;
            }
            fStack[fStackPtr] = tableEl->fPushState;
@ -319,9 +319,12 @@ void    RegexCompile::compile(
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0) {
-                error(U_REGEX_INTERNAL_ERROR);
-                printf("RegexCompile::compile() - state stack underflow.\n");
+                // state stack underflow
+                // This will occur if the user pattern has mis-matched parentheses,
+                //   with extra close parens.
+                // 
                fStackPtr++;
+                error(U_REGEX_MISMATCHED_PAREN);
            }
        }

@ -637,94 +640,12 @@ UBool RegexCompile::doParseActions(EParseAction action)
        break;


-    case doStartString:
-        // We've just scanned a single "normal" character from the pattern,
-        // which is a character without special meaning that will need to be
-        // matched literally.   Save it away.  It may be the start of a string.
-        {
-            fStringOpStart = fRXPat->fLiteralText.length();
-            fRXPat->fLiteralText.append(fC.fChar);
-            break;
-        }

-    case doStringChar:
-        // We've just scanned a "normal" character from the pattern, which now
-        //   needs to be appended the the literal match string being that is
-        //   already being assembled.
-        {
-            fRXPat->fLiteralText.append(fC.fChar);
-            break;
-        }
-
-
-
-    case doSplitString:
-        // We've just peeked at a quantifier, e.g. a *, following a scanned string.
-        //   Separate the last character from the string, because the quantifier
-        //   only applies to it, not to the entire string.  Emit into the compiled
-        //   pattern:
-        //      -  string chars[0..n-2]     (as a string, assuming more than one char)
-        //      -  string char [n-1]        (as a single character)
-        {
-            // Locate the positions of the last and next-to-last characters
-            //  in the string.  Requires a bit of futzing around to account for
-            //  surrogate pairs, since we want 32 bit code points, not 16 bit code units.
-            int32_t  strLength = fRXPat->fLiteralText.length() - fStringOpStart;
-            U_ASSERT(strLength > 0);
-            int32_t  lastCharIdx = fRXPat->fLiteralText.length()-1;
-            lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx);
-            int32_t nextToLastCharIdx = lastCharIdx-1;
-            if (nextToLastCharIdx > fStringOpStart) {
-                nextToLastCharIdx = fRXPat->fLiteralText.getChar32Start(nextToLastCharIdx);
-            }
-
-            if (nextToLastCharIdx > fStringOpStart) {
-                // The string contains three or more code units.
-                // emit the first through the next-to-last as a string.
-                int32_t  stringToken = URX_BUILD(URX_STRING, fStringOpStart);
-                fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
-                stringToken = URX_BUILD(URX_STRING_LEN, lastCharIdx - fStringOpStart);
-                fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
-            }
-            else if (nextToLastCharIdx == fStringOpStart) {
-                // The string contains exactly two code units.
-                // emit the first into the compiled pattern as a single char
-                UChar32  c = fRXPat->fLiteralText.char32At(nextToLastCharIdx);
-                int32_t  charToken = URX_BUILD(URX_ONECHAR, c);
-                fRXPat->fCompiledPat->addElement(charToken, *fStatus);
-            }
-            // In all cases emit the last char as a single character.
-            UChar32  c = fRXPat->fLiteralText.char32At(lastCharIdx);
-            int32_t  charToken = URX_BUILD(URX_ONECHAR, c);
-            fRXPat->fCompiledPat->addElement(charToken, *fStatus);
-        }
+    case doLiteralChar:
+        // We've just scanned a "normal" character from the pattern; compile it as a literal.
+        literalChar();
        break;

-    case doEndString:
-        // We have reached the end of a literal string in the pattern.
-        // Emit the string token into the compiled pattern, or if the string
-        //   has only one character, emit the single character token instead.
-        {
-            int32_t   strLength = fRXPat->fLiteralText.length() - fStringOpStart;
-            U_ASSERT(strLength > 0);
-            int32_t  lastCharIdx = fRXPat->fLiteralText.length()-1;
-            lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx);
-            if (lastCharIdx == fStringOpStart) {
-                // The string contains exactly one character.
-                //  Emit it into the compiled pattern as a single char.
-                int32_t  charToken = URX_BUILD(URX_ONECHAR, fRXPat->fLiteralText.char32At(fStringOpStart));
-                fRXPat->fCompiledPat->addElement(charToken, *fStatus);
-            } else {
-                // The string contains two or more chars.  Emit as a string.
-                // Compiled string consumes two tokens in the compiled pattern, one
-                //   for the index of the start-of-string, and one for the length.
-                int32_t  stringToken = URX_BUILD(URX_STRING, fStringOpStart);
-                fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
-                stringToken = URX_BUILD(URX_STRING_LEN, strLength);
-                fRXPat->fCompiledPat->addElement(stringToken, *fStatus);
-            }
-        }
-        break;


    case doDotAny:
@ -858,6 +779,151 @@ UBool RegexCompile::doParseActions(EParseAction action)
 };


+
+//------------------------------------------------------------------------------
+//
+//   literalChar           Compile the current pattern character (fC.fChar), which is
+//                             a literal char or an escape that reduces to one.
+//                         The char is always appended to fLiteralText.  The first char
+//                             of a run is emitted as URX_ONECHAR; a second adjacent
+//                             char rewrites that op into URX_STRING / URX_STRING_LEN,
+//                             and later chars just bump the length in URX_STRING_LEN.
+//
+//------------------------------------------------------------------------------
+void RegexCompile::literalChar()  {
+    int32_t           op;            // An operation in the compiled pattern.
+    int32_t           opType;        // The opcode type extracted from op.
+    int32_t           patternLoc;   // A position in the compiled pattern.
+    int32_t           stringLen;     // Length of the current literal string, in fLiteralText units.
+
+
+    // If the last thing compiled into the pattern was not a literal char,
+    //   force this new literal char to begin a new string, and not append to the previous.
+    op     = fRXPat->fCompiledPat->lastElementi();
+    opType = URX_TYPE(op);
+    if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR)) {
+        fixLiterals();
+    }
+
+    if (fStringOpStart == -1) {
+        // First char of a string in the pattern  (-1 means no literal string is open).
+        // Emit a OneChar op into the compiled pattern.
+        op = URX_BUILD(URX_ONECHAR, fC.fChar);
+        fRXPat->fCompiledPat->addElement(op, *fStatus);
+
+        // Also add it to the string pool, in case we get a second adjacent literal
+        //   and want to change from URX_ONECHAR to URX_STRING
+        fStringOpStart = fRXPat->fLiteralText.length();
+        fRXPat->fLiteralText.append(fC.fChar);
+        return;
+    }
+    
+    // We are adding onto an existing string
+    fRXPat->fLiteralText.append(fC.fChar);
+
+    // If the most recently emitted op is a URX_ONECHAR, change it to a string op.
+    op     = fRXPat->fCompiledPat->lastElementi();
+    opType = URX_TYPE(op);
+    U_ASSERT(opType == URX_ONECHAR || opType == URX_STRING_LEN);
+    if (opType == URX_ONECHAR) {
+        op         = URX_BUILD(URX_STRING, fStringOpStart);
+        patternLoc = fRXPat->fCompiledPat->size() - 1;
+        fRXPat->fCompiledPat->setElementAt(op, patternLoc);
+        op         = URX_BUILD(URX_STRING_LEN, 0);     // Placeholder length, overwritten just below.
+        fRXPat->fCompiledPat->addElement(op, *fStatus);
+    }
+
+    // The pattern contains a URX_STRING / URX_STRING_LEN.  Update the
+    //  string length to reflect the new char we just added to the string.
+    stringLen  = fRXPat->fLiteralText.length() - fStringOpStart;
+    op         = URX_BUILD(URX_STRING_LEN, stringLen);
+    patternLoc = fRXPat->fCompiledPat->size() - 1;
+    fRXPat->fCompiledPat->setElementAt(op, patternLoc);
+}
+
+
+
+//------------------------------------------------------------------------------
+//
+//    fixLiterals           Called when compiling something that can follow a literal
+//                          string in a pattern.  Ends ("fixes") the current literal
+//                          run by resetting fStringOpStart to -1, so that any
+//                          subsequent literal begins a new string rather than
+//                          appending to the old one.
+//
+//                          If split is TRUE and the last compiled op is a
+//                          URX_STRING_LEN, also split the final code point of that
+//                          string off into its own URX_ONECHAR op, so that a
+//                          quantifier applies to it alone.  Example:   abc*
+//                          The * needs to apply to the 'c' only.
+//------------------------------------------------------------------------------
+void    RegexCompile::fixLiterals(UBool split) {
+    int32_t  stringStart = fStringOpStart;    // start index of the current literal string (saved before the reset below)
+    int32_t  op;                              // An op from/for the compiled pattern.
+    int32_t  opType;                          // An opcode type from the compiled pattern.
+    int32_t  stringLastCharIdx;               // Index in fLiteralText of the string's last code point.
+    UChar32  lastChar;                        // The last code point of the string.
+    int32_t  stringNextToLastCharIdx;         // Index in fLiteralText of the code point before the last.
+    UChar32  nextToLastChar;                  // The code point before the last.
+    int32_t  stringLen;                       // Value field of the existing URX_STRING_LEN op.
+
+    fStringOpStart = -1;    
+    if (!split) {
+        return;
+    }
+
+    // Split:  We need to  ensure that the last item in the compiled pattern does
+    //   not refer to a literal string of more than one char.  If it does,
+    //   separate the last char from the rest of the string.
+
+    // If the last operation from the compiled pattern is not a string,
+    //   nothing needs to be done.
+    op     = fRXPat->fCompiledPat->lastElementi();
+    opType = URX_TYPE(op);
+    if (opType != URX_STRING_LEN) {
+        return;
+    }
+    stringLen = URX_VAL(op);
+
+    //
+    // Find the position of the last code point in the string  (might be a surrogate pair)
+    //
+    stringLastCharIdx = fRXPat->fLiteralText.length();
+    stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
+    lastChar          = fRXPat->fLiteralText.char32At(stringLastCharIdx);
+
+    // The string should always be at least two code points long, meaning that there
+    //   should be something before the last char position that we just found.
+    U_ASSERT(stringLastCharIdx > stringStart);
+    stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1);
+    U_ASSERT(stringNextToLastCharIdx >= stringStart);
+    nextToLastChar          = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx);
+
+    if (stringNextToLastCharIdx > stringStart) {
+        // The length of string remaining after removing one char is two or more.
+        // Leave the string in the compiled pattern, shorten it by one char,
+        //   and append a URX_ONECHAR op for the last char.
+        stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);   // Remove the last code point's units.
+        op = URX_BUILD(URX_STRING_LEN, stringLen);
+        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
+        op = URX_BUILD(URX_ONECHAR, lastChar);
+        fRXPat->fCompiledPat->addElement(op, *fStatus);
+    } else {
+        // The original string consisted of exactly two characters.  Replace
+        // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
+        // of URX_ONECHARs, overwriting the two ops in place.
+        op = URX_BUILD(URX_ONECHAR, nextToLastChar);
+        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -2);
+        op = URX_BUILD(URX_ONECHAR, lastChar);
+        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
+    }
+}
+
+
+
+
+
+
 //------------------------------------------------------------------------------
 //
 //   blockTopLoc()          Find or create a location in the compiled pattern
@ -889,6 +955,7 @@ int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
        // Item just compiled is a single thing, a ".", or a single char, or a set reference.
        // No slot for STATE_SAVE was pre-reserved in the compiled code.
        // We need to make space now.
+        fixLiterals(TRUE);  // If last item was a string, separate the last char.
        theLoc = fRXPat->fCompiledPat->size()-1;
        if (reserveLoc) {
            int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
@ -922,6 +989,10 @@ void  RegexCompile::handleCloseParen() {
        return;
    }

+    // Force any literal chars that may follow the close paren to start a new string,
+    //   and not attach to any preceding it.
+    fixLiterals(FALSE);
+
    // Fixup any operations within the just-closed parenthesized group
    //    that need to reference the end of the (block).
    //    (The first one on popped from the stack is an unused slot for
@ -1211,7 +1282,7 @@ UnicodeSet *RegexCompile::scanSet() {
    if (U_FAILURE(localStatus)) {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
-        printf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
+        printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
        error(localStatus);
        delete uset;
        return NULL;
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -97,6 +97,8 @@ private:
                                                     //  there is space to add an opcode there.
    void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for
                                                     //   a reference to a UnicodeSet.
+    void        literalChar();                       // Compile a literal char
+    void        fixLiterals(UBool split=FALSE);      // Fix literal strings.


    UErrorCode                    *fStatus;
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -29,7 +29,6 @@ enum Regex_PatternParseAction {
    doBadOpenParenType,
    doRuleError,
    doBackslashs,
-    doStartString,
    doNGOpt,
    doNamedChar,
    doBackslashw,
@ -55,9 +54,9 @@ enum Regex_PatternParseAction {
    doPossesiveOpt,
    doBackslashG,
    doOpt,
+    doLiteralChar,
    doOpenAtomicParen,
    doBackslashS,
-    doStringChar,
    doOpenLookAhead,
    doBackRef,
    doDollar,
@ -70,11 +69,9 @@ enum Regex_PatternParseAction {
    doExit,
    doPatStart,
    doBackslashb,
-    doEndString,
-    doBackslashd,
    doNotImplementedError,
+    doBackslashd,
    doOpenLookBehindNeg,
-    doSplitString,
    rbbiLastAction};

 //-------------------------------------------------------------------------------
@ -94,97 +91,88 @@ struct RegexTableEl {

 static const struct RegexTableEl gRuleParseStateTable[] = {
    {doNOP, 0, 0, 0, TRUE}
-    , {doPatStart, 255, 3, 2, FALSE}     //  1      start
-    , {doPatFinish, 255, 2,0,  FALSE}     //  2      finish
-    , {doStartString, 254, 13,0,  TRUE}     //  3      term
-    , {doStartString, 130, 13,0,  TRUE}     //  4 
-    , {doScanUnicodeSet, 91 /* [ */, 20,0,  TRUE}     //  5 
-    , {doNOP, 40 /* ( */, 28, 20, TRUE}     //  6 
-    , {doDotAny, 46 /* . */, 20,0,  TRUE}     //  7 
-    , {doCaret, 94 /* ^ */, 3,0,  TRUE}     //  8 
-    , {doDollar, 36 /* $ */, 3,0,  TRUE}     //  9 
-    , {doNOP, 92 /* \ */, 67,0,  TRUE}     //  10 
-    , {doNOP, 253, 2,0,  FALSE}     //  11 
-    , {doRuleError, 255, 87,0,  FALSE}     //  12 
-    , {doStringChar, 254, 13,0,  TRUE}     //  13      string
-    , {doStringChar, 130, 13,0,  TRUE}     //  14 
-    , {doSplitString, 63 /* ? */, 20,0,  FALSE}     //  15 
-    , {doSplitString, 43 /* + */, 20,0,  FALSE}     //  16 
-    , {doSplitString, 42 /* * */, 20,0,  FALSE}     //  17 
-    , {doSplitString, 123 /* { */, 20,0,  FALSE}     //  18 
-    , {doEndString, 255, 20,0,  FALSE}     //  19 
-    , {doNOP, 42 /* * */, 56,0,  TRUE}     //  20      expr-quant
-    , {doNOP, 43 /* + */, 59,0,  TRUE}     //  21 
-    , {doNOP, 63 /* ? */, 62,0,  TRUE}     //  22 
-    , {doNOP, 123 /* { */, 65,0,  TRUE}     //  23 
-    , {doNOP, 255, 25,0,  FALSE}     //  24 
-    , {doOrOperator, 124 /* | */, 3,0,  TRUE}     //  25      expr-cont
-    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  26 
-    , {doNOP, 255, 3,0,  FALSE}     //  27 
-    , {doNOP, 63 /* ? */, 30,0,  TRUE}     //  28      open-paren
-    , {doOpenCaptureParen, 255, 3, 20, FALSE}     //  29 
-    , {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE}     //  30      open-paren-extended
-    , {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE}     //  31 
-    , {doOpenLookAhead, 61 /* = */, 3, 25, TRUE}     //  32 
-    , {doOpenLookAheadNeg, 33 /* ! */, 3, 25, TRUE}     //  33 
-    , {doNOP, 60 /* < */, 42,0,  TRUE}     //  34 
-    , {doNOP, 35 /* # */, 45,0,  TRUE}     //  35 
-    , {doMatchMode, 105 /* i */, 48,0,  TRUE}     //  36 
-    , {doMatchMode, 120 /* x */, 48,0,  TRUE}     //  37 
-    , {doMatchMode, 115 /* s */, 48,0,  TRUE}     //  38 
-    , {doMatchMode, 109 /* m */, 48,0,  TRUE}     //  39 
-    , {doMatchMode, 45 /* - */, 48,0,  TRUE}     //  40 
-    , {doBadOpenParenType, 255, 87,0,  FALSE}     //  41 
-    , {doOpenLookBehind, 61 /* = */, 3, 25, TRUE}     //  42      open-paren-lookbehind
-    , {doOpenLookBehindNeg, 33 /* ! */, 3, 25, TRUE}     //  43 
-    , {doBadOpenParenType, 255, 87,0,  FALSE}     //  44 
-    , {doNOP, 41 /* ) */, 3,0,  TRUE}     //  45      paren-comment
-    , {doMismatchedParenErr, 253, 87,0,  FALSE}     //  46 
-    , {doNOP, 255, 45,0,  TRUE}     //  47 
-    , {doMatchMode, 105 /* i */, 48,0,  TRUE}     //  48      paren-flag
-    , {doMatchMode, 115 /* s */, 48,0,  TRUE}     //  49 
-    , {doMatchMode, 109 /* m */, 48,0,  TRUE}     //  50 
-    , {doMatchMode, 120 /* x */, 48,0,  TRUE}     //  51 
-    , {doMatchMode, 45 /* - */, 48,0,  TRUE}     //  52 
-    , {doNOP, 41 /* ) */, 3,0,  TRUE}     //  53 
-    , {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE}     //  54 
-    , {doNOP, 255, 87,0,  FALSE}     //  55 
-    , {doNGStar, 63 /* ? */, 25,0,  TRUE}     //  56      quant-star
-    , {doPossesiveStar, 43 /* + */, 25,0,  TRUE}     //  57 
-    , {doStar, 255, 25,0,  FALSE}     //  58 
-    , {doNGPlus, 63 /* ? */, 25,0,  TRUE}     //  59      quant-plus
-    , {doPossesivePlus, 43 /* + */, 25,0,  TRUE}     //  60 
-    , {doPlus, 255, 25,0,  FALSE}     //  61 
-    , {doNGOpt, 63 /* ? */, 25,0,  TRUE}     //  62      quant-opt
-    , {doPossesiveOpt, 43 /* + */, 25,0,  TRUE}     //  63 
-    , {doOpt, 255, 25,0,  FALSE}     //  64 
-    , {doNOP, 129, 65,0,  TRUE}     //  65      interval-open
-    , {doNotImplementedError, 255, 87,0,  FALSE}     //  66 
-    , {doBackslashA, 65 /* A */, 3,0,  TRUE}     //  67      backslash
-    , {doBackslashB, 66 /* B */, 3,0,  TRUE}     //  68 
-    , {doBackslashb, 98 /* b */, 3,0,  TRUE}     //  69 
-    , {doBackslashd, 100 /* d */, 20,0,  TRUE}     //  70 
-    , {doBackslashD, 68 /* D */, 20,0,  TRUE}     //  71 
-    , {doBackslashG, 71 /* G */, 3,0,  TRUE}     //  72 
-    , {doNamedChar, 78 /* N */, 20,0,  TRUE}     //  73 
-    , {doProperty, 112 /* p */, 20,0,  FALSE}     //  74 
-    , {doProperty, 80 /* P */, 20,0,  FALSE}     //  75 
-    , {doEnterQuoteMode, 81 /* Q */, 3,0,  TRUE}     //  76 
-    , {doBackslashS, 83 /* S */, 20,0,  TRUE}     //  77 
-    , {doBackslashs, 115 /* s */, 20,0,  TRUE}     //  78 
-    , {doBackslashW, 87 /* W */, 20,0,  TRUE}     //  79 
-    , {doBackslashw, 119 /* w */, 20,0,  TRUE}     //  80 
-    , {doBackslashX, 88 /* X */, 20,0,  TRUE}     //  81 
-    , {doBackslashx, 120 /* x */, 20,0,  TRUE}     //  82 
-    , {doBackslashZ, 90 /* Z */, 3,0,  TRUE}     //  83 
-    , {doBackslashz, 122 /* z */, 3,0,  TRUE}     //  84 
-    , {doBackRef, 128, 20,0,  TRUE}     //  85 
-    , {doStartString, 255, 13,0,  TRUE}     //  86 
-    , {doExit, 255, 87,0,  TRUE}     //  87      errorDeath
+    , {doPatStart, 255, 2,0,  FALSE}     //  1      start
+    , {doLiteralChar, 254, 12,0,  TRUE}     //  2      term
+    , {doLiteralChar, 130, 12,0,  TRUE}     //  3 
+    , {doScanUnicodeSet, 91 /* [ */, 12,0,  TRUE}     //  4 
+    , {doNOP, 40 /* ( */, 20,0,  TRUE}     //  5 
+    , {doDotAny, 46 /* . */, 12,0,  TRUE}     //  6 
+    , {doCaret, 94 /* ^ */, 2,0,  TRUE}     //  7 
+    , {doDollar, 36 /* $ */, 2,0,  TRUE}     //  8 
+    , {doNOP, 92 /* \ */, 59,0,  TRUE}     //  9 
+    , {doPatFinish, 253, 2,0,  FALSE}     //  10 
+    , {doRuleError, 255, 79,0,  FALSE}     //  11 
+    , {doNOP, 42 /* * */, 48,0,  TRUE}     //  12      expr-quant
+    , {doNOP, 43 /* + */, 51,0,  TRUE}     //  13 
+    , {doNOP, 63 /* ? */, 54,0,  TRUE}     //  14 
+    , {doNOP, 123 /* { */, 57,0,  TRUE}     //  15 
+    , {doNOP, 255, 17,0,  FALSE}     //  16 
+    , {doOrOperator, 124 /* | */, 2,0,  TRUE}     //  17      expr-cont
+    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  18 
+    , {doNOP, 255, 2,0,  FALSE}     //  19 
+    , {doNOP, 63 /* ? */, 22,0,  TRUE}     //  20      open-paren
+    , {doOpenCaptureParen, 255, 2, 12, FALSE}     //  21 
+    , {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE}     //  22      open-paren-extended
+    , {doOpenAtomicParen, 62 /* > */, 2, 12, TRUE}     //  23 
+    , {doOpenLookAhead, 61 /* = */, 2, 17, TRUE}     //  24 
+    , {doOpenLookAheadNeg, 33 /* ! */, 2, 17, TRUE}     //  25 
+    , {doNOP, 60 /* < */, 34,0,  TRUE}     //  26 
+    , {doNOP, 35 /* # */, 37,0,  TRUE}     //  27 
+    , {doMatchMode, 105 /* i */, 40,0,  TRUE}     //  28 
+    , {doMatchMode, 120 /* x */, 40,0,  TRUE}     //  29 
+    , {doMatchMode, 115 /* s */, 40,0,  TRUE}     //  30 
+    , {doMatchMode, 109 /* m */, 40,0,  TRUE}     //  31 
+    , {doMatchMode, 45 /* - */, 40,0,  TRUE}     //  32 
+    , {doBadOpenParenType, 255, 79,0,  FALSE}     //  33 
+    , {doOpenLookBehind, 61 /* = */, 2, 17, TRUE}     //  34      open-paren-lookbehind
+    , {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE}     //  35 
+    , {doBadOpenParenType, 255, 79,0,  FALSE}     //  36 
+    , {doNOP, 41 /* ) */, 2,0,  TRUE}     //  37      paren-comment
+    , {doMismatchedParenErr, 253, 79,0,  FALSE}     //  38 
+    , {doNOP, 255, 37,0,  TRUE}     //  39 
+    , {doMatchMode, 105 /* i */, 40,0,  TRUE}     //  40      paren-flag
+    , {doMatchMode, 115 /* s */, 40,0,  TRUE}     //  41 
+    , {doMatchMode, 109 /* m */, 40,0,  TRUE}     //  42 
+    , {doMatchMode, 120 /* x */, 40,0,  TRUE}     //  43 
+    , {doMatchMode, 45 /* - */, 40,0,  TRUE}     //  44 
+    , {doNOP, 41 /* ) */, 2,0,  TRUE}     //  45 
+    , {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE}     //  46 
+    , {doNOP, 255, 79,0,  FALSE}     //  47 
+    , {doNGStar, 63 /* ? */, 17,0,  TRUE}     //  48      quant-star
+    , {doPossesiveStar, 43 /* + */, 17,0,  TRUE}     //  49 
+    , {doStar, 255, 17,0,  FALSE}     //  50 
+    , {doNGPlus, 63 /* ? */, 17,0,  TRUE}     //  51      quant-plus
+    , {doPossesivePlus, 43 /* + */, 17,0,  TRUE}     //  52 
+    , {doPlus, 255, 17,0,  FALSE}     //  53 
+    , {doNGOpt, 63 /* ? */, 17,0,  TRUE}     //  54      quant-opt
+    , {doPossesiveOpt, 43 /* + */, 17,0,  TRUE}     //  55 
+    , {doOpt, 255, 17,0,  FALSE}     //  56 
+    , {doNOP, 129, 57,0,  TRUE}     //  57      interval-open
+    , {doNotImplementedError, 255, 79,0,  FALSE}     //  58 
+    , {doBackslashA, 65 /* A */, 2,0,  TRUE}     //  59      backslash
+    , {doBackslashB, 66 /* B */, 2,0,  TRUE}     //  60 
+    , {doBackslashb, 98 /* b */, 2,0,  TRUE}     //  61 
+    , {doBackslashd, 100 /* d */, 12,0,  TRUE}     //  62 
+    , {doBackslashD, 68 /* D */, 12,0,  TRUE}     //  63 
+    , {doBackslashG, 71 /* G */, 2,0,  TRUE}     //  64 
+    , {doNamedChar, 78 /* N */, 12,0,  TRUE}     //  65 
+    , {doProperty, 112 /* p */, 12,0,  FALSE}     //  66 
+    , {doProperty, 80 /* P */, 12,0,  FALSE}     //  67 
+    , {doEnterQuoteMode, 81 /* Q */, 2,0,  TRUE}     //  68 
+    , {doBackslashS, 83 /* S */, 12,0,  TRUE}     //  69 
+    , {doBackslashs, 115 /* s */, 12,0,  TRUE}     //  70 
+    , {doBackslashW, 87 /* W */, 12,0,  TRUE}     //  71 
+    , {doBackslashw, 119 /* w */, 12,0,  TRUE}     //  72 
+    , {doBackslashX, 88 /* X */, 12,0,  TRUE}     //  73 
+    , {doBackslashx, 120 /* x */, 12,0,  TRUE}     //  74 
+    , {doBackslashZ, 90 /* Z */, 2,0,  TRUE}     //  75 
+    , {doBackslashz, 122 /* z */, 2,0,  TRUE}     //  76 
+    , {doBackRef, 128, 12,0,  TRUE}     //  77 
+    , {doLiteralChar, 255, 12,0,  TRUE}     //  78 
+    , {doExit, 255, 79,0,  TRUE}     //  79      errorDeath
 };
 static const char *RegexStateNames[] = {    0,
     "start",
-     "finish",
     "term",
    0,
    0,
@ -194,13 +182,6 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
    0,
-    0,
-     "string",
-    0,
-    0,
-    0,
-    0,
-    0,
    0,
     "expr-quant",
    0,
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -55,50 +55,27 @@
 #  start state, scan position is at the beginning of the pattern.
 #
 start:
-   default                 term                 ^finish             doPatStart
+   default                 term                                     doPatStart
    
-#
-#  finish  -  We've scanned off the end of the pattern string.
-#             The "doPatFinish" action will stop the pattern scanning state machine.
-#
-finish:
-    default                finish                                   doPatFinish
-     

    
    
 #
-#  term.  Eat through a single rule character, or a composite thing, which
-#         could be a parenthesized expression  or a Unicode Set.
+#  term.  At a position where we can accept the start of most items in a pattern.
 #
 term:
-    quoted               n string                                   doStartString
-    rule_char            n string                                   doStartString
-    '['                  n expr-quant     		            doScanUnicodeSet
-    '('                  n open-paren            ^expr-quant          
+    quoted               n expr-quant     		                    doLiteralChar
+    rule_char            n expr-quant     		                    doLiteralChar
+    '['                  n expr-quant                               doScanUnicodeSet
+    '('                  n open-paren                     
    '.'                  n expr-quant                               doDotAny
    '^'                  n term                                     doCaret
    '$'                  n term                                     doDollar
    '\'                  n backslash
-    eof		           finish
+    eof		               term                                     doPatFinish
    default                errorDeath                               doRuleError
    

-#
-#   string        We've encountered a literal character, or an escaped character.
-#                 Continue with any additional literal chars, building the sequence
-#                 into a string.
-#
-string:
-    quoted                n string                                  doStringChar
-    rule_char             n string                                  doStringChar
-    # If the string ends in a quatinfier, we need to split off the last character so that
-    #   the quantifier effects only it, and not the entire string.  (e.g.  "ABC*")
-    '?'                     expr-quant                              doSplitString
-    '+'                     expr-quant                              doSplitString
-    '*'                     expr-quant                              doSplitString
-    '{'                     expr-quant                              doSplitString
-    default                 expr-quant                              doEndString

 #
 #   expr-quant    We've just finished scanning a term, now look for the optional
@ -223,12 +200,12 @@ backslash:
   'A'                   n  term                                    doBackslashA
   'B'                   n  term                                    doBackslashB
   'b'                   n  term                                    doBackslashb
-   'd'			 n  expr-quant				    doBackslashd
+   'd'                   n  expr-quant                              doBackslashd
   'D'                   n  expr-quant                              doBackslashD
   'G'                   n  term                                    doBackslashG
-   'N'			 n  expr-quant                              doNamedChar      #   \N{NAME}  named char
-   'p'			    expr-quant                              doProperty       #   \p{Lu}  style property
-   'P'			    expr-quant                              doProperty
+   'N'                   n  expr-quant                              doNamedChar      #   \N{NAME}  named char
+   'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
+   'P'                      expr-quant                              doProperty
   'Q'                   n  term                                    doEnterQuoteMode
   'S'                   n  expr-quant                              doBackslashS
   's'                   n  expr-quant                              doBackslashs
@ -238,9 +215,8 @@ backslash:
   'x'                   n  expr-quant                              doBackslashx
   'Z'                   n  term                                    doBackslashZ
   'z'                   n  term                                    doBackslashz
-   digit_char	         n  expr-quant			            doBackRef
-   
-   default               n  string				    doStartString   
+   digit_char	         n  expr-quant                              doBackRef
+   default               n  expr-quant		                        doLiteralChar     #  Escaped literal char.		       

    
    
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -14,36 +14,38 @@


 //
-//  Opcode types     In the compiled form of the regex, these are the type, or opcodes,
+//  Opcode types     In the compiled form of the regexp, these are the type, or opcodes,
 //                   of the entries.
 //
-static const uint32_t     URX_UNUSED1       = 1;
-static const uint32_t     URX_END           = 2;
-static const uint32_t     URX_ONECHAR       = 3;    // Value field is the 21 bit unicode char to match
-static const uint32_t     URX_STRING        = 4;    // Value field is index of string start
-static const uint32_t     URX_STRING_LEN    = 5;    // Value field is string length (code units)
-static const uint32_t     URX_STATE_SAVE    = 6;    // Value field is pattern position to push
-static const uint32_t     URX_NOP           = 7;
-static const uint32_t     URX_START_CAPTURE = 8;    // Value field is capture group number.
-static const uint32_t     URX_END_CAPTURE   = 9;    // Value field is capture group number
-static const uint32_t     URX_STATIC_SETREF = 10;   // Value field is index of set in array of sets.   
-static const uint32_t     URX_SETREF        = 11;   // Value field is index of set in array of sets.
-static const uint32_t     URX_DOTANY        = 12; 
-static const uint32_t     URX_JMP           = 13;   // Value field is destination position in
+enum {
+     URX_UNUSED1       = 1,
+     URX_END           = 2,
+     URX_ONECHAR       = 3,    // Value field is the 21 bit unicode char to match
+     URX_STRING        = 4,    // Value field is index of string start
+     URX_STRING_LEN    = 5,    // Value field is string length (code units)
+     URX_STATE_SAVE    = 6,    // Value field is pattern position to push
+     URX_NOP           = 7,
+     URX_START_CAPTURE = 8,    // Value field is capture group number.
+     URX_END_CAPTURE   = 9,    // Value field is capture group number
+     URX_STATIC_SETREF = 10,   // Value field is index of set in array of sets.   
+     URX_SETREF        = 11,   // Value field is index of set in array of sets.
+     URX_DOTANY        = 12, 
+     URX_JMP           = 13,   // Value field is destination position in
                                                    //   the pattern.
-static const uint32_t     URX_FAIL          = 14;   // Stop match operation;  No match.
+     URX_FAIL          = 14,   // Stop match operation;  No match.

-static const uint32_t     URX_BACKSLASH_A   = 15;   
-static const uint32_t     URX_BACKSLASH_B   = 16;   // Value field:  0:  \b    1:  \B
-static const uint32_t     URX_BACKSLASH_G   = 17; 
-static const uint32_t     URX_BACKSLASH_W   = 18;   // Value field:  0:  \w    1:  \W
-static const uint32_t     URX_BACKSLASH_X   = 19;
-static const uint32_t     URX_BACKSLASH_Z   = 20;   // \z   Unconditional end of line.
+     URX_BACKSLASH_A   = 15,   
+     URX_BACKSLASH_B   = 16,   // Value field:  0:  \b    1:  \B
+     URX_BACKSLASH_G   = 17, 
+     URX_BACKSLASH_W   = 18,   // Value field:  0:  \w    1:  \W
+     URX_BACKSLASH_X   = 19,
+     URX_BACKSLASH_Z   = 20,   // \z   Unconditional end of line.

-static const uint32_t     URX_DOTANY_ALL    = 21;   // ., in the . matches any mode.
-static const uint32_t     URX_BACKSLASH_D   = 22;   // Value field:  0:  \d    1:  \D
-static const uint32_t     URX_CARET         = 23;   // Value field:  1:  multi-line mode.
-static const uint32_t     URX_DOLLAR        = 24;   // Also for \Z
+     URX_DOTANY_ALL    = 21,   // ., in the . matches any mode.
+     URX_BACKSLASH_D   = 22,   // Value field:  0:  \d    1:  \D
+     URX_CARET         = 23,   // Value field:  1:  multi-line mode.
+     URX_DOLLAR        = 24   // Also for \Z
+};


 //
@ -58,13 +60,16 @@ static const uint32_t     URX_DOLLAR        = 24;   // Also for \Z
 //  Access to Unicode Sets for Perl-like composite character properties
 //     The sets are accessed by the match engine for things like \w (word boundary)
 //     
-static const uint32_t     URX_ISWORD_SET  = 1;
-static const uint32_t     URX_ISALNUM_SET = 2;
-static const uint32_t     URX_ISALPHA_SET = 3;
-static const uint32_t     URX_ISSPACE_SET = 4;
-static const uint32_t     URX_LAST_SET    = 5;
+enum {
+     URX_ISWORD_SET  = 1,
+     URX_ISALNUM_SET = 2,
+     URX_ISALPHA_SET = 3,
+     URX_ISSPACE_SET = 4,
+     URX_LAST_SET    = 5,
+
+     URX_NEG_SET     = 0x800000          // Flag bit to reverse sense of set
+                                         //   membership test.
+};

-static const uint32_t     URX_NEG_SET     = 0x800000;  // Flag bit to reverse sense of set
-                                                       //   membership test.
 #endif

--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -204,6 +204,11 @@ RegexPattern  *RegexPattern::compile(
    if (U_FAILURE(err)) {
        return NULL;
    }
+    if (flags != 0) {
+        err = U_REGEX_UNIMPLEMENTED;
+        return NULL;
+    }
+
    RegexPattern *This = new RegexPattern;
    if (This == NULL) {
        err = U_MEMORY_ALLOCATION_ERROR;
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -4,10 +4,35 @@
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
+//
+//   file:   regex.h
+//
+//           ICU Regular Expressions, API for C++
+//

 #ifndef REGEX_H
 #define REGEX_H

+
+/**
+  * \file
+  * \brief  C++ API:  Regular Expressions
+  *
+  * <h2>Regular Expression API</h2>
+  *
+  * <p>The ICU API for processing regular expressions consists of two classes,
+  *    <code>RegexPattern</code> and <code>RegexMatcher</code>. 
+  *    <code>RegexPattern</code> objects represent a pre-processed, or compiled
+  *    regular expression.  They are created from a regular expression pattern string,
+  *    and can be used to create <code>RegexMatcher</code> objects for the pattern. </p>
+  *
+  * <p> Class <code>RegexMatcher</code> bundles together a regular expression pattern
+  *     and a target string to which the search pattern will be applied. 
+  *     <code>RegexMatcher</code> includes API for doing plain find or search
+  *     operations, for search and replace operations, and for obtaining detailed
+  *     information about bounds of a match. </p>
+  */
+
 #include "unicode/utypes.h"

 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
@ -25,56 +50,120 @@ class UStack;
 class UnicodeSet;


-//---------------------------------------------------------------------------------
-//
-//  Flags for Regular Expression Modes.
-//   TODO:  Move to C header once one exists.
-//   All flags default to off or false
-//   All are as defined by Java Regexps.
-//
-//---------------------------------------------------------------------------------
+/**
+ * Constants for Regular Expression Match Modes.
+ * <p>Note that non-default match modes will not be supported until ICU 2.6</p>
+ * @draft ICU 2.4 
+ */
 enum {
-        UREGEX_CANON_EQ         = 128,    // Forces normalization of pattern and strings.
-        UREGEX_CASE_INSENSITIVE = 2,      // Enable case insensitive matching.
-        UREGEX_COMMENTS         = 4,      // Allow white space and comments within patterns
-        UREGEX_DOTALL           = 32,     // If set, "." matches line terminators.
-                                          //   otherwise . matching stops at line end.
-        UREGEX_MULTILINE        = 8,      // Control behavior of "$" and "^". 
-                                          //   If set, recognize line terminators within string
-                                          //   otherwise, match only at start and end of
-                                          //   input string
-        UREGEX_UNICODE_CASE     = 64,     // If set, use full Unicode case folding for case
-                                          //   insensitive matches.  Otherwise, case insensitive
-                                          //   matching only affects chars in the ASCII range.
-                                          //   TODO:  do we want to support this option at all?
-        UREGEX_UNIX_LINES       = 1       // If set, only \n is recognized as a line terminator.
-                                          //   otherwise recognize all Unicode line endings.
+        /** Forces normalization of pattern and strings.  @draft ICU 2.4 */
+        UREGEX_CANON_EQ         = 128, 
+        /**  Enable case insensitive matching.  @draft ICU 2.4 */
+        UREGEX_CASE_INSENSITIVE = 2,  
+        /**  Allow white space and comments within patterns  @draft ICU 2.4 */
+        UREGEX_COMMENTS         = 4,  
+        /**  If set, '.' matches line terminators,  otherwise '.' matching stops at line end.
+          *  @draft ICU 2.4 */
+        UREGEX_DOTALL           = 32,  
+        /**   Control behavior of "$" and "^"
+          *    If set, recognize line terminators within string,
+          *    otherwise, match only at start and end of input string.
+          *   @draft ICU 2.4 */
+        UREGEX_MULTILINE        = 8  
 };



-//---------------------------------------------------------------------------------
-//
-//    class  RegexPattern
-//
-//---------------------------------------------------------------------------------
+/**
+  * Class <code>RegexPattern</code> represents a compiled regular expression.  It includes
+  * factory methods for creating a RegexPattern object from the source (string) form
+  * of a regular expression, methods for creating RegexMatchers that allow the pattern
+  * to be applied to input text, and a few convenience methods for simple common
+  * uses of regular expressions.
+  *
+  * @draft ICU 2.4
+  */
 class U_I18N_API RegexPattern: public UObject {
 public:
    
-    
+    /**
+      * default constructor.  Create a RegexPattern object that refers to no actual
+      *   pattern.  Not normally needed; RegexPattern objects are usually
+      *   created using the factory method <code>compile()</code>.
+      *
+      * @draft ICU 2.4
+      */
    RegexPattern();
-    RegexPattern(const RegexPattern &other);
+
+
+    /**
+      * Copy Constructor.  Create a new RegexPattern object that is equivalent
+      *                    to the source object. 
+      * @draft ICU 2.4
+      */
+    RegexPattern(const RegexPattern &source);
+
+    /**
+      * Destructor.  Note that a RegexPattern object must persist so long as any
+      *  RegexMatcher objects that were created from the RegexPattern are active.
+      * @draft ICU 2.4
+      */
    virtual ~RegexPattern();
    
+    /**
+      * Comparison operator.  Two RegexPattern objects are considered equal if they
+      * were constructed from identical source patterns using the same match flag
+      * settings.
+      * @param that a RegexPattern object to compare with "this".
+      * @return TRUE if the objects are equivalent.
+      * @draft ICU 2.4
+      */
    UBool                  operator==(const RegexPattern& that) const;
+
+    /**
+      * Comparison operator.  Two RegexPattern objects are considered equal if they
+      * were constructed from identical source patterns using the same match flag
+      * settings.
+      * @param that a RegexPattern object to compare with "this".
+      * @return TRUE if the objects are different.
+      * @draft ICU 2.4
+      */
    inline UBool           operator!=(const RegexPattern& that) const {return ! operator ==(that);};
    
-    RegexPattern  &operator =(const RegexPattern &other);
+    /**
+     * Assignment operator.  After assignment, this RegexPattern will behave identically
+     *     to the source object.
+     * @draft ICU 2.4
+     */
+    RegexPattern  &operator =(const RegexPattern &source);
+
+    /**
+     * Create an exact copy of this RegexPattern object.  Since RegexPattern is not
+     * intended to be subclassed, <code>clone()</code> and the copy construction are
+     * equivalent operations.
+     */
    virtual RegexPattern  *clone() const;

    
   /**
-    *     Compiles the given regular expression into a pattern 
+    *     <p>Compiles the given regular expression in string form into a RegexPattern
+    *     object.  The compile methods, rather than the constructors, are the usual
+    *     way that RegexPattern objects are created.</p>
+    *
+    *     <p>Note that RegexPattern objects must not be deleted while RegexMatcher
+    *     objects created from the pattern are active.  RegexMatchers keep a pointer
+    *     back to their pattern, so premature deletion of the pattern is a
+    *     catastrophic error.</p>
+    *
+    *     <p>All pattern match mode flags are set to their default values.</p>
+    *
+    *    @param regex The regular expression to be compiled.
+    *    @param pe    Receives the position (line and column numbers) of any error
+    *                 within the regular expression.
+    *    @param err A reference to a UErrorCode to receive any errors.
+    *    @return      A regexPattern object for the compiled pattern.
+    *
+    *    @draft ICU 2.4
    */
    static RegexPattern *compile( const UnicodeString &regex,
        UParseError          &pe,
@ -83,6 +172,26 @@ public:
   /**
    *     Compiles the given regular expression into a pattern with the given flags 
    */
+   /**
+    *     <p>Compiles the given regular expression in string form into a RegexPattern
+    *     object using the specified match mode flags.  The compile methods,
+    *     rather than the constructors, are the usual way that RegexPattern objects
+    *     are created.</p>
+    *
+    *     <p>Note that RegexPattern objects must not be deleted while RegexMatcher
+    *     objects created from the pattern are active.  RegexMatchers keep a pointer
+    *     back to their pattern, so premature deletion of the pattern is a
+    *     catastrophic error.</p>
+    *
+    *    @param regex The regular expression to be compiled.
+    *    @param flags The match mode flags to be used.
+    *    @param pe    Receives the position (line and column numbers) of any error
+    *                 within the regular expression.
+    *    @param err   A reference to a UErrorCode to receive any errors.
+    *    @return      A regexPattern object for the compiled pattern.
+    *
+    *    @draft ICU 2.4
+    */
    static RegexPattern *compile( const UnicodeString &regex,
        int32_t              flags,
        UParseError          &pe,
@ -90,19 +199,41 @@ public:


   /**
-    *     Return the flags for this pattern
+    *     Get the match mode flags that were used when compiling this pattern.
+    *     @return  the match mode flags
+    *     @draft ICU 2.4
    */
    virtual int32_t flags() const;
    
   /*
-    *  Creates a matcher that will match the given input against this pattern.
+    *  Creates a RegexMatcher that will match the given input against this pattern.  The
+    *   RegexMatcher can then be used to perform match, find or replace operations
+    *   on the input.  Note that a RegexPattern object must not be deleted while
+    *   any RegexMatchers created from it still exist and might possibly be used again.
+    *
+    *   @param input The input string to which the regular expression will be applied.
+    *   @param err   A reference to a UErrorCode to receive any errors.
+    *   @return      A RegexMatcher object for this pattern and input.
+    *
+    *   @draft ICU 2.4
    */
    virtual RegexMatcher *matcher(const UnicodeString &input,
        UErrorCode          &err) const;
    
    
-   /*
-    *  Compiles the given regular expression and attempts to match the given input against it.
+   /**
+    *  Test whether a string matches a regular expression.  This convenience function
+    *   both compiles the regular expression and applies it in a single operation.
+    *   Note that if the same pattern needs to be applied repeatedly, this method will be
+    *   less efficient than creating and reusing a RegexPattern object.
+    *
+    *  @param regex The regular expression
+    *  @param input The string data to be matched
+    *  @param pe Receives the position of any syntax errors within the regular expression
+    *  @param err A reference to a UErrorCode to receive any errors.
+    *  @return True if the regular expression exactly matches the full input string.
+    *
+    *  @draft ICU 2.4
    */
    static UBool matches(const UnicodeString   &regex,
        const UnicodeString   &input,
@ -112,12 +243,13 @@ public:
    
   /*
    *    Returns the regular expression from which this pattern was compiled. 
+    *    @draft ICU 2.4
    */
    virtual UnicodeString pattern() const;
    
    
    /*
-    *    Split a string around matches of the pattern.  Somewhat like split() form Perl.
+    *    Split a string around matches of the pattern.  Somewhat like split() from Perl.
    *    @param input   The string to be split into fields.  The field delimiters
    *                   match the pattern (in the "this" object)
    *    @param dest    An array of UnicodeStrings to receive the results of the split.
@ -131,6 +263,7 @@ public:
    *                   of fields, the trailing part of the input string, including any
    *                   field delimiters, is placed in the last destination string.
    *    @return        The number of fields into which the input string was split.
+    *    @draft ICU 2.4
    */
    virtual int32_t  split(const UnicodeString &input,
        UnicodeString    dest[],
@ -147,14 +280,14 @@ public:
    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
-     * @draft ICU 2.2
+     * @draft ICU 2.4
     */
    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
    
    /**
    * ICU "poor man's RTTI", returns a UClassID for this class.
    *
-    * @draft ICU 2.2
+    * @draft ICU 2.4
    */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
    
@ -167,12 +300,12 @@ private:
    UnicodeString   fPattern;      // The original pattern string.
    int32_t         fFlags;        // The flags used when compiling the pattern.
                                   //   
-    UVector         *fCompiledPat; // The compiled, tokenized pattern.
+    UVector         *fCompiledPat; // The compiled pattern.
    UnicodeString   fLiteralText;  // Any literal string data from the pattern, 
                                   //   after un-escaping, for use during the match.
    UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
-    UBool           fBadState;     // True if any prior error has left this
-                                   //  RegexPattern unusable.
+    UBool           fBadState;     // True if some prior error has left this
+                                   //  RegexPattern in an unusable state.

    RegexMatcher    *fMatcher;     // A cached matcher for this pattern, used for
                                   //  split(), to avoid having to
@ -205,61 +338,77 @@ private:



-//--------------------------------------------------------------------------------
-//
-//    class RegexMatcher 
-//
-//--------------------------------------------------------------------------------
-class U_I18N_API RegexMatcher: public UObject {
+/**
+  *  class RegexMatcher bundles together a regular expression pattern and
+  *  input text to which the expression can be applied.  It includes methods
+  *  for testing for matches, and for find and replace operations.
+  *
+  * @draft ICU 2.4
+  */
+  class U_I18N_API RegexMatcher: public UObject {
 public:
-    
-   /*   Destructor.  Note that there are no public constructors; creation is
+   /**
+    *   Destructor.  Note that there are no public constructors; creation is
    *   done with RegexPattern::matcher().
+    *
+    *  @draft ICU 2.4
    */
    virtual ~RegexMatcher();

-   /*
+   /**
    *   Implements a replace operation intended to be used as part of an
    *   incremental find-and-replace.
    *
-    *   The input sequence, starting from the append position and ending at
-    *   the start of the current match is appended to the destination string.
+    *   The input string, starting from the end of the previous match and ending at
+    *   the start of the current match, is appended to the destination string.
    *
    *   Then the replacement string is appended to the output string,
    *   including handling any substitutions of captured text.
    *
-    *   The append position is set to the position of the first
-    *   character following the match in the input string.
-    *
-    *   For complete, prepackaged, non-incremental find-and-replace
+    *   For simple, prepackaged, non-incremental find-and-replace
    *   operations, see replaceFirst() or replaceAll().
    *
-    *   Returns:  This Matcher
+    *   @param   dest        A UnicodeString to which the results of the find-and-replace are appended.
+    *   @param   replacement A UnicodeString that provides the text to be substituted for
+    *                        the input text that matched the regexp pattern.  The replacement
+    *                        text may contain references to captured text from the
+    *                        input.
+    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible 
+    *                        errors are  U_REGEX_INVALID_STATE if no match has been
+    *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
+    *                        if the replacement text specifies a capture group that
+    *                        does not exist in the pattern.
+    *                        
+    *   @return  this  RegexMatcher
+    *   @draft ICU 2.4
    *
-    *    error:  Illegal state - no match yet attemtped, or last match failed.
-    *            IndexOutOfBounds - caputure string number from replacement string.
    */
    virtual RegexMatcher &appendReplacement(UnicodeString &dest,
        const UnicodeString &replacement, UErrorCode &status);
    
    
-   /*
-    * This method reads characters from the input sequence,
-    * starting at the append position, and appends them to the
-    * destination string. It is intended to be invoked after one
-    * or more invocations of the appendReplacement method in order
-    * to copy the remainder of the input sequence. 
+   /**
+    * As the final step in a find-and-replace operation, append the remainder
+    * of the input string, starting at the position following the last match,
+    * to the destination string. It is intended to be invoked after one
+    * or more invocations of the <code>RegexMatcher::appendReplacement()</code>. 
    *
+    *  @param dest A UnicodeString to which the results of the find-and-replace are appended.
    *  @return  the destination string.
+    *  @draft ICU 2.4
    */
    virtual UnicodeString &appendTail(UnicodeString &dest); 
    
    
-    /*
-    *    Returns the index of the last character matched, plus one.
-    *    error:  Illegal state - no match yet attemtped, or last match failed.
+   /**
+    *    Find the ending position of the most recent match.
+    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible 
+    *                        errors are  U_REGEX_INVALID_STATE if no match has been
+    *                        attempted or the last match failed.
+    *    @return the index of the last character matched, plus one.
+    *   @draft ICU 2.4
    */
-    virtual int32_t end(UErrorCode &err) const;
+    virtual int32_t end(UErrorCode &status) const;
    
    
    /*
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -367,7 +367,7 @@ void RegexTest::Basic() {
 //
 #if 0
    {
-    REGEX_FIND("\\D+", "<0>non digits</0>");
+    REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
    }
    exit(1);
 #endif
@ -856,17 +856,21 @@ void RegexTest::API_Pattern() {
    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
    REGEX_ASSERT(*pat1a == *pat1);

+#if 0
    // Compile with different flags should be not equal
    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
    REGEX_CHECK_STATUS;
+
    REGEX_ASSERT(*pat1b != *pat1a);
    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
    REGEX_ASSERT(pat1a->flags() == 0);
+    delete pat1b;
+#endif    // add test back in when we actually support flag settings.

    // clone
-    RegexPattern *pat1c = pat1b->clone();
-    REGEX_ASSERT(*pat1b == *pat1c);
-    REGEX_ASSERT(*pat1a != *pat1c);
+    RegexPattern *pat1c = pat1->clone();
+    REGEX_ASSERT(*pat1c == *pat1);
+    REGEX_ASSERT(*pat1c != *pat2);


    // TODO:  Actually do some matches with the cloned/copied/assigned patterns.
@ -874,7 +878,6 @@ void RegexTest::API_Pattern() {


    delete pat1c;
-    delete pat1b;
    delete pat1a;
    delete pat1;
    delete pat2;
@ -1081,6 +1084,18 @@ void RegexTest::Extended() {
    // (?# comment) doesn't muck up pattern
    REGEX_FIND("Hello (?# this is a comment) world", "  <0>Hello  world</0>...");

+    // Check some implementation corner cases based on the way literal strings are compiled.
+    REGEX_FIND("A", "<0>A</0>");
+    REGEX_FIND("AB", "<0>AB</0>ABABAB");
+    REGEX_FIND("AB+", "<0>ABBB</0>A");
+    REGEX_FIND("AB+", "<0>AB</0>ABAB");
+    REGEX_FIND("ABC+", "<0>ABC</0>ABC");
+    REGEX_FIND("ABC+", "<0>ABCCCC</0>ABC");
+    REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
+    REGEX_FIND("(?:ABC)DEF+", "<0>ABCDEFFF</0>D");
+    REGEX_FIND("AB\\.C\\eD\\u0666E", "<0>AB.C\\u001BD\\u0666E</0>F");
+
+
 }


@ -1123,6 +1138,18 @@ void RegexTest::Errors() {
    // {Numeric Quantifiers}
    REGEX_ERR("abc{4}", 1, 5, U_REGEX_UNIMPLEMENTED);

+    // Attempt to use non-default flags 
+    {
+        UParseError   pe;
+        UErrorCode    status = U_ZERO_ERROR;
+        int32_t       flags  = UREGEX_CASE_INSENSITIVE | UREGEX_CANON_EQ |
+                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
+                               UREGEX_MULTILINE;
+        RegexPattern *pat1= RegexPattern::compile(".*", UREGEX_CASE_INSENSITIVE, pe, status);
+        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
+        delete pat1;
+    }
+

    // Quantifiers are allowed only after something that can be quantified.
    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);