ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10132
2025-04-13 08:53:20 +00:00 · 2002-10-31 01:58:01 +00:00 · 2002-10-31 01:58:01 +00:00 · ccd8fc3536
commit ccd8fc3536
parent 0ad551db67
8 changed files with 267 additions and 50 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -72,10 +72,27 @@ static UnicodeSet  *gUnescapeCharSet;
 //    will handle.
 //
 static const UChar gUnescapeCharPattern[] = {
-//    [     a     b     c     e     f     n     r     t     u     U     ] 
-    0x5b, 0x61, 0x62, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d};
+//    [     a     c     e     f     n     r     t     u     U     ] 
+    0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};


+
+//----------------------------------------------------------------------------------------
+//
+//  Unicode Set Definitions for Regular Expression composite properties
+//    gIsWordPattern spells out [\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}], the set used for \w, \W and \b.
+//----------------------------------------------------------------------------------------
+
+static const UChar gIsWordPattern[] = {
+//    [     \     p     {     L     l     }     \     p     {     L     u     }
+    0x5b, 0x5c, 0x70, 0x7b, 0x4c, 0x6c, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x75, 0x7d,
+//          \     p     {     L     t     }     \     p     {     L     o     }
+          0x5c, 0x70, 0x7b, 0x4c, 0x74, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x6f, 0x7d,
+//          \     p     {     N     d     }     ]
+          0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, 0x5d, 0};
+
+static const UnicodeSet *gPropSets[URX_LAST_SET];   // Indexed by the URX_*_SET constants; handed to each pattern as fStaticSets.
+
 //----------------------------------------------------------------------------------------
 //
 //  Constructor.
@ -101,7 +118,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
    }

    //
-    //  Set up the constant Unicode Sets.
+    //  Set up the constant (static) Unicode Sets.
    //    
    if (gRuleSets[kRuleSet_rule_char-128] == NULL) {
        //  TODO:  Make thread safe.
@ -110,6 +127,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
        gRuleSets[kRuleSet_white_space-128]     = new UnicodeSet(UnicodePropertySet::getRuleWhiteSpaceSet(status));
        gRuleSets[kRuleSet_digit_char-128]      = new UnicodeSet(gRuleSet_digit_char_pattern,      status);
        gUnescapeCharSet                        = new UnicodeSet(gUnescapeCharPattern,             status);
+        gPropSets[URX_ISWORD_SET]               = new UnicodeSet(gIsWordPattern,                   status); 
        if (U_FAILURE(status)) {
            delete gRuleSets[kRuleSet_rule_char-128];
            delete gRuleSets[kRuleSet_white_space-128];
@ -119,6 +137,11 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
            gRuleSets[kRuleSet_white_space-128] = NULL;
            gRuleSets[kRuleSet_digit_char-128]  = NULL;
            gUnescapeCharSet = NULL;
+            int i;
+            for (i=0; i<URX_LAST_SET; i++) {
+                delete gPropSets[i];
+                gPropSets[i] = NULL;
+            }
            return;
        }
    }
@ -164,6 +187,7 @@ void    RegexCompile::compile(

    // Prepare the RegexPattern object to receive the compiled pattern.
    fRXPat->fPattern        = pat;
+    fRXPat->fStaticSets     = gPropSets;


    // Initialize the pattern scanning state machine
@ -685,16 +709,26 @@ UBool RegexCompile::doParseActions(EParseAction action)
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus);
        break;

+    case doBackslashD:
+        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
+        break;
+
+    case doBackslashd:
+        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
+        break;
+
    case doBackslashG:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
        break;        

    case doBackslashW:
-        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 1), *fStatus);
+        fRXPat->fCompiledPat->addElement(
+            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
        break;        

    case doBackslashw:
-        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_W, 0), *fStatus);
+        fRXPat->fCompiledPat->addElement(
+            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
        break;        

    case doBackslashX:
@ -772,7 +806,6 @@ int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
        if (reserveLoc) {
            int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);
            int32_t prevType = URX_TYPE(opAtTheLoc);
-            U_ASSERT(prevType==URX_ONECHAR || prevType==URX_SETREF || prevType==URX_DOTANY);
            int32_t  nop = URX_BUILD(URX_NOP, 0);
            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
        }
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -47,6 +47,7 @@ enum Regex_PatternParseAction {
    doBackslashB,
    doNGPlus,
    doPatFinish,
+    doBackslashD,
    doIntervalMinValue,
    doIntervalDigit,
    doPossesiveOpt,
@ -66,6 +67,7 @@ enum Regex_PatternParseAction {
    doPatStart,
    doBackslashb,
    doEndString,
+    doBackslashd,
    doOpenLookBehindNeg,
    doSplitString,
    rbbiLastAction};
@ -96,7 +98,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doDotAny, 46 /* . */, 18,0,  TRUE}     //  7 
    , {doNOP, 92 /* \ */, 59,0,  TRUE}     //  8 
    , {doNOP, 253, 2,0,  FALSE}     //  9 
-    , {doRuleError, 255, 71,0,  FALSE}     //  10 
+    , {doRuleError, 255, 73,0,  FALSE}     //  10 
    , {doStringChar, 254, 11,0,  TRUE}     //  11      string
    , {doStringChar, 130, 11,0,  TRUE}     //  12 
    , {doSplitString, 63 /* ? */, 18,0,  FALSE}     //  13 
@ -118,10 +120,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doOpenLookAhead, 61 /* = */, 3, 22, TRUE}     //  29 
    , {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE}     //  30 
    , {doNOP, 60 /* < */, 33,0,  TRUE}     //  31 
-    , {doBadOpenParenType, 255, 71,0,  FALSE}     //  32 
+    , {doBadOpenParenType, 255, 73,0,  FALSE}     //  32 
    , {doOpenLookBehind, 61 /* = */, 3, 22, TRUE}     //  33      open-paren-lookbehind
    , {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE}     //  34 
-    , {doBadOpenParenType, 255, 71,0,  FALSE}     //  35 
+    , {doBadOpenParenType, 255, 73,0,  FALSE}     //  35 
    , {doNGStar, 63 /* ? */, 22,0,  TRUE}     //  36      quant-star
    , {doPossesiveStar, 43 /* + */, 22,0,  TRUE}     //  37 
    , {doStar, 255, 22,0,  FALSE}     //  38 
@ -133,14 +135,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doOpt, 255, 22,0,  FALSE}     //  44 
    , {doNOP, 129, 45,0,  TRUE}     //  45      interval-open
    , {doIntervalMinValue, 128, 48,0,  FALSE}     //  46 
-    , {doNumberExpectedError, 255, 71,0,  FALSE}     //  47 
+    , {doNumberExpectedError, 255, 73,0,  FALSE}     //  47 
    , {doNOP, 129, 52,0,  TRUE}     //  48      interval-value
    , {doNOP, 125 /* } */, 52,0,  FALSE}     //  49 
    , {doIntervalDigit, 128, 48,0,  TRUE}     //  50 
-    , {doNumberExpectedError, 255, 71,0,  FALSE}     //  51 
+    , {doNumberExpectedError, 255, 73,0,  FALSE}     //  51 
    , {doNOP, 129, 52,0,  TRUE}     //  52      interval-close
    , {doTagValue, 125 /* } */, 55,0,  TRUE}     //  53 
-    , {doNumberExpectedError, 255, 71,0,  FALSE}     //  54 
+    , {doNumberExpectedError, 255, 73,0,  FALSE}     //  54 
    , {doNOP, 254, 3,0,  FALSE}     //  55      expr-cont-no-interval
    , {doExprOrOperator, 124 /* | */, 3,0,  TRUE}     //  56 
    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  57 
@ -148,16 +150,18 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doBackslashA, 65 /* A */, 3,0,  TRUE}     //  59      backslash
    , {doBackslashB, 66 /* B */, 3,0,  TRUE}     //  60 
    , {doBackslashb, 98 /* b */, 3,0,  TRUE}     //  61 
-    , {doBackslashG, 71 /* G */, 3,0,  TRUE}     //  62 
-    , {doProperty, 112 /* p */, 18,0,  FALSE}     //  63 
-    , {doProperty, 80 /* P */, 18,0,  FALSE}     //  64 
-    , {doBackslashW, 87 /* W */, 3,0,  TRUE}     //  65 
-    , {doBackslashw, 119 /* w */, 3,0,  TRUE}     //  66 
-    , {doBackslashX, 88 /* X */, 3,0,  TRUE}     //  67 
-    , {doBackslashZ, 90 /* Z */, 3,0,  TRUE}     //  68 
-    , {doBackslashz, 122 /* z */, 3,0,  TRUE}     //  69 
-    , {doStartString, 255, 11,0,  TRUE}     //  70 
-    , {doExit, 255, 71,0,  TRUE}     //  71      errorDeath
+    , {doBackslashd, 100 /* d */, 18,0,  TRUE}     //  62 
+    , {doBackslashD, 68 /* D */, 18,0,  TRUE}     //  63 
+    , {doBackslashG, 71 /* G */, 3,0,  TRUE}     //  64 
+    , {doProperty, 112 /* p */, 18,0,  FALSE}     //  65 
+    , {doProperty, 80 /* P */, 18,0,  FALSE}     //  66 
+    , {doBackslashW, 87 /* W */, 18,0,  TRUE}     //  67 
+    , {doBackslashw, 119 /* w */, 18,0,  TRUE}     //  68 
+    , {doBackslashX, 88 /* X */, 18,0,  TRUE}     //  69 
+    , {doBackslashZ, 90 /* Z */, 3,0,  TRUE}     //  70 
+    , {doBackslashz, 122 /* z */, 3,0,  TRUE}     //  71 
+    , {doStartString, 255, 11,0,  TRUE}     //  72 
+    , {doExit, 255, 73,0,  TRUE}     //  73      errorDeath
 };
 static const char *RegexStateNames[] = {    0,
     "start",
@ -229,6 +233,8 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
+    0,
    0,
     "errorDeath",
    0};
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -214,12 +214,14 @@ backslash:
   'A'                   n  term                                    doBackslashA
   'B'                   n  term                                    doBackslashB
   'b'                   n  term                                    doBackslashb
+   'd'			 n  expr-quant				    doBackslashd
+   'D'                   n  expr-quant                              doBackslashD
   'G'                   n  term                                    doBackslashG
   'p'			    expr-quant                              doProperty       #   \p{Lu}  style property
   'P'			    expr-quant                              doProperty
-   'W'                   n  term                                    doBackslashW
-   'w'                   n  term                                    doBackslashw
-   'X'                   n  term                                    doBackslashX
+   'W'                   n  expr-quant                              doBackslashW
+   'w'                   n  expr-quant                              doBackslashw
+   'X'                   n  expr-quant                              doBackslashX
   'Z'                   n  term                                    doBackslashZ
   'z'                   n  term                                    doBackslashz
   
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -26,7 +26,7 @@ static const uint32_t     URX_STATE_SAVE    = 6;    // Value field is pattern po
 static const uint32_t     URX_NOP           = 7;
 static const uint32_t     URX_START_CAPTURE = 8;    // Value field is capture group number.
 static const uint32_t     URX_END_CAPTURE   = 9;    // Value field is capture group number
-static const uint32_t     URX_UNUSED10      = 10;   
+static const uint32_t     URX_STATIC_SETREF = 10;   // Value field is index of set in array of sets.   
 static const uint32_t     URX_SETREF        = 11;   // Value field is index of set in array of sets.
 static const uint32_t     URX_DOTANY        = 12; 
 static const uint32_t     URX_JMP           = 13;   // Value field is destination position in
@ -35,11 +35,14 @@ static const uint32_t     URX_FAIL          = 14;   // Stop match operation;  No

 static const uint32_t     URX_BACKSLASH_A   = 15;   
 static const uint32_t     URX_BACKSLASH_B   = 16;   // Value field:  0:  \b    1:  \B
+static const uint32_t     URX_BACKSLASH_D   = 22;   // Value field:  0:  \d    1:  \D
 static const uint32_t     URX_BACKSLASH_G   = 17; 
 static const uint32_t     URX_BACKSLASH_W   = 18;   // Value field:  0:  \w    1:  \W
 static const uint32_t     URX_BACKSLASH_X   = 19;
 static const uint32_t     URX_BACKSLASH_Z   = 20;   // Value field:  0:  \z    1:  \Z

+static const uint32_t     URX_DOTANY_ALL    = 21;   // '.' in dot-matches-all mode (also matches line ends).
+

 //
 //  Convenience macros for assembling and disassembling a compiled operation.
@ -49,5 +52,16 @@ static const uint32_t     URX_BACKSLASH_Z   = 20;   // Value field:  0:  \z    1
 #define URX_VAL(x)           ((x) & 0xffffff)

                
+//
+//  Access to Unicode Sets for composite properties
+//     The sets are accessed by the match engine for things like \w (word characters)
+//     and \b (word boundary).  Values index the array of static sets; 0 is not assigned.
+static const uint32_t     URX_ISWORD_SET  = 1;
+static const uint32_t     URX_ISALNUM_SET = 2;
+static const uint32_t     URX_ISALPHA_SET = 3;
+static const uint32_t     URX_LAST_SET    = 4;         // Size of the static set array.
+//  URX_NEG_SET is or-ed into the operand of a URX_STATIC_SETREF op (used for \W).
+static const uint32_t     URX_NEG_SET     = 0x800000;  // Flag bit to reverse sense of set
+                                                       //   membership test.
 #endif

--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -183,15 +183,15 @@ int32_t RegexMatcher::end(UErrorCode &err) const {

 int32_t RegexMatcher::end(int group, UErrorCode &err) const {
    if (U_FAILURE(err)) {
-        return 0;
+        return -1;
    }
    if (fMatch == FALSE) {
        err = U_REGEX_INVALID_STATE;
-        return 0;
+        return -1;
    }
    if (group < 0 || group > fPattern->fNumCaptureGroups) {
        err = U_INDEX_OUTOFBOUNDS_ERROR;
-        return 0;
+        return -1;
    }
    int32_t e = -1;
    if (group == 0) {
@ -404,15 +404,15 @@ int32_t RegexMatcher::start(UErrorCode &err) const {

 int32_t RegexMatcher::start(int group, UErrorCode &err) const {
    if (U_FAILURE(err)) {
-        return 0;
+        return -1;
    }
    if (fMatch == FALSE) {
        err = U_REGEX_INVALID_STATE;
-        return 0;
+        return -1;
    }
    if (group < 0 || group > fPattern->fNumCaptureGroups) {
        err = U_INDEX_OUTOFBOUNDS_ERROR;
-        return 0;
+        return -1;
    }
    int32_t s;
    if (group == 0) {
@ -426,6 +426,54 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {



+//--------------------------------------------------------------------------------
+//
+//   isWordBoundary    Perform the \b test at input index pos.
+//                     In perl, for "xab..cd..", \b is true at positions 0,3,5,7.
+//                     Here:
+//                       FALSE if pos is at or past the end of the input, or if the
+//                         char at pos is a non-spacing or enclosing (combining) mark.
+//                       Otherwise scan backwards to the nearest non-combining char;
+//                         pos is a boundary if that char and the char at pos differ
+//                         in \w set membership (no such earlier char == non-word).
+//--------------------------------------------------------------------------------
+UBool RegexMatcher::isWordBoundary(int32_t pos) {
+    UBool isBoundary = FALSE;
+    if (pos >=  fInputLength) {
+        // At or past the end of the input: never reported as a boundary.
+        return FALSE;
+    }
+    
+    // Look at char c at pos: is it a combining mark, and is it in the word set?
+    UChar32  c = fInput->char32At(pos);
+    int8_t ctype = u_charType(c);
+    if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
+        // Current char is a combining one.  Not a boundary.
+        return FALSE;
+    }
+    UBool cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
+    
+    // Back up over any combining marks to the nearest non-combining char and
+    //  record whether that char is a word char.
+    UBool prevCIsWord = FALSE;
+    int32_t prevPos = pos;
+    for (;;) {
+        if (prevPos == 0) {      // Reached start of input; prevCIsWord stays FALSE.
+            break;
+        }
+        prevPos = fInput->moveIndex32(prevPos, -1);
+        UChar32 prevChar = fInput->char32At(prevPos);
+        int8_t prevCType = u_charType(prevChar);
+        if (!(prevCType==U_NON_SPACING_MARK || prevCType==U_ENCLOSING_MARK)) {
+            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
+            break;
+        }
+    }
+    isBoundary = cIsWord ^ prevCIsWord;     // Boundary when exactly one side is a word char.
+    return isBoundary;
+}
+
+
 //--------------------------------------------------------------------------------
 //
 //    getCaptureText    We have encountered a '\' that might preceed a
@ -597,23 +645,44 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
            break;

        case URX_BACKSLASH_B:          // Test for word boundaries
-            if (FALSE) {
-                backTrack(inputIdx, patIdx);
+            {
+                UBool success = isWordBoundary(inputIdx);
+                success ^= (opValue != 0);     // flip sense for \B
+                if (!success) {
+                    backTrack(inputIdx, patIdx);
+                }
            }
            break;


+        case URX_BACKSLASH_D:
+            {
+                if (inputIdx >= fInputLength) {
+                    backTrack(inputIdx, patIdx);
+                    break;
+                }
+
+                UChar32 c = fInput->char32At(inputIdx);   
+                int8_t ctype = u_charType(c);
+                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
+                success ^= (opValue != 0);
+                if (success) {
+                    inputIdx = fInput->moveIndex32(inputIdx, 1);
+                } else {
+                    backTrack(inputIdx, patIdx);
+                }
+            }
+            break;
+
+
+
+
        case URX_BACKSLASH_G:          // Test for position at end of previous match
            if (FALSE) {
                backTrack(inputIdx, patIdx);
            }
            break;

-        case URX_BACKSLASH_W:          // Match word chars   (TODO:  doesn't belong here?
-            if (FALSE) {
-                backTrack(inputIdx, patIdx);
-            }
-            break;

        case URX_BACKSLASH_X:          // Match combining character sequence
            if (FALSE) {
@ -629,6 +698,33 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {



+        case URX_STATIC_SETREF:
+            {
+                // Test input character against one of the predefined sets
+                //    (Word Characters, for example)
+                // The high bit of the op value is a flag for the match polarity.
+                //    0:   success if input char is in set.
+                //    1:   success if input char is not in set.
+                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);  
+                opValue &= ~URX_NEG_SET;
+                if (inputIdx < fInputLength) {
+                    // There is input left.  Pick up one char and test it for set membership.
+                    UChar32  c = fInput->char32At(inputIdx);
+                    U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
+                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
+                    if (s->contains(c)) {
+                        success = !success;
+                    }
+                }
+                if (success) {
+                    inputIdx = fInput->moveIndex32(inputIdx, 1);
+                } else {
+                    backTrack(inputIdx, patIdx);
+                }
+            }
+            break;
+            
+
        case URX_SETREF:
            if (inputIdx < fInputLength) {
                // There is input left.  Pick up one char and test it for set membership.
@ -648,12 +744,44 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
            

        case URX_DOTANY:
-            // . matches anything, but does not match if we've run out of input.
-            if (inputIdx < fInputLength) {
-                // There is input left.  Advance one character in it.
+            {
+                // . matches anything
+                if (inputIdx >= fInputLength) {
+                    // At end of input.  Match failed.  Backtrack out.
+                    backTrack(inputIdx, patIdx);
+                    break;
+                }
+                // There is input left.  Advance over one char, unless we've hit end-of-line
+                UChar32 c = fInput->char32At(inputIdx);
                inputIdx = fInput->moveIndex32(inputIdx, 1);
-            } else {
-            backTrack(inputIdx, patIdx);
+                if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
+                    // End of line in normal mode.   . does not match.
+                    backTrack(inputIdx, patIdx);
+                    break;
+                }
+            }
+            break;
+            
+            
+        case URX_DOTANY_ALL:
+            {
+                // ., in dot-matches-all (including new lines) mode
+                // . matches anything
+                if (inputIdx >= fInputLength) {
+                    // At end of input.  Match failed.  Backtrack out.
+                    backTrack(inputIdx, patIdx);
+                    break;
+                }
+                // There is input left.  Advance over one char; a CR/LF pair is consumed together.
+                UChar32 c = fInput->char32At(inputIdx);
+                inputIdx = fInput->moveIndex32(inputIdx, 1);
+                if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
+                    // In the case of a CR/LF, we need to advance over both.
+                    UChar32 nextc = fInput->char32At(inputIdx);
+                    if (c == 0x0d && nextc == 0x0a) {
+                        inputIdx = fInput->moveIndex32(inputIdx, 1);
+                    }
+                }
            }
            break;

--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -411,7 +411,7 @@ static char *opNames[] = {
        "NOP",
        "START_CAPTURE",
        "END_CAPTURE",
-        "UNUSED10",
+        "URX_STATIC_SETREF",
        "SETREF",
        "DOTANY",
        "JMP",
@ -466,6 +466,7 @@ void   RegexPattern::dump() {
        case URX_START_CAPTURE:
        case URX_END_CAPTURE:
        case URX_SETREF:
+        case URX_STATIC_SETREF:
        case URX_STATE_SAVE:
        case URX_JMP:
        case URX_BACKSLASH_B:
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -19,6 +19,7 @@ U_NAMESPACE_BEGIN
 class RegexMatcher;
 class UVector;
 class UStack;
+class UnicodeSet;


 //---------------------------------------------------------------------------------
@ -26,7 +27,7 @@ class UStack;
 //  Flags for Regular Expression Modes.
 //   TODO:  Move to C header once one exists.
 //   All flags default to off or false
-//   All are as defined by Java Regexes.
+//   All are as defined by Java Regexps.
 //
 //---------------------------------------------------------------------------------
 enum {
@ -34,7 +35,7 @@ enum {
        UREGEX_CASE_INSENSITIVE = 2,      // Enable case insensitive matching.
        UREGEX_COMMENTS         = 4,      // Allow white space and comments within patterns
        UREGEX_DOTALL           = 32,     // If set, "." matches line terminators.
-                                          //   otherwise matching stops at line end.
+                                          //   otherwise . matching stops at line end.
        UREGEX_MULTILINE        = 8,      // Control behavior of "$" and "^". 
                                          //   If set, recognize line terminators within string
                                          //   otherwise, match only at start and end of
@ -165,7 +166,7 @@ private:
    //
    UnicodeString   fPattern;      // The original pattern string.
    int32_t         fFlags;        // The flags used when compiling the pattern.
-                                   //   TODO:  make an enum type for the flags.
+                                   //   
    UVector         *fCompiledPat; // The compiled, tokenized pattern.
    UnicodeString   fLiteralText;  // Any literal string data from the pattern, 
                                   //   after un-escaping, for use during the match.
@ -180,6 +181,9 @@ private:
    int32_t         fNumCaptureGroups;
    int32_t         fMaxCaptureDigits;

+    const UnicodeSet  **fStaticSets;  // Ptr to static (shared) sets for predefined
+                                    //   regex character classes, e.g. Word.
+
    friend class RegexCompile;
    friend class RegexMatcher;

@ -428,6 +432,7 @@ private:
                                int32_t &repIdx,
                                int32_t &textStart,
                                int32_t &textEnd);
+    UBool        isWordBoundary(int32_t pos);         // perform the \b test


    const RegexPattern  *fPattern;
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -263,6 +263,7 @@ void RegexTest::regex_find(char *pat, char *input, UErrorCode expectedStatus, in
    // matcher->groupCount does not include group 0, the entire match, hence the +1.
    if (isMatch == FALSE && groupStarts.size() != 0) {
        errln("Error at line %d:  Match expected, but none found.\n", line);
+        failed = true;
        goto cleanupAndReturn;
    }
    int i;
@ -315,7 +316,7 @@ void RegexTest::Basic() {
 //
 #if 0
    {
-    REGEX_FIND( "\\p{Lu}+", "here we go ... <0>ABC</0> and no more.")
+    REGEX_FIND("\\D+", "<0>non digits</0>");
    }
    exit(1);
 #endif
@ -428,7 +429,6 @@ void RegexTest::Basic() {
    
    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
-    REGEX_TESTLM("\\b", "\\u0008", TRUE, TRUE);        // BS
    // REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L (or whatever) TODO: bug in Unescape
    // REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape  TODO: bug in Unescape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
@ -1012,6 +1012,34 @@ void RegexTest::Extended() {
    REGEX_FIND( "\\p{Lu}+", "here we go ... <0>ABC</0> and no more.");
    REGEX_FIND( "(\\p{L}+)(\\P{L}*?) (\\p{Zs}*)",  "7999<0><1>letters</1><2>4949%^&*(</2> <3>   </3></0>");

+    // \w and \W
+    REGEX_FIND( "\\w+", "  $%^&*( <0>hello123</0>%^&*(");
+    REGEX_FIND( "\\W+", "<0>  $%^&*( </0>hello123%^&*(");
+
+    // \b \B
+    REGEX_FIND( ".*?\\b(.).*", "<0>  $%^&*( <1>h</1>ello123%^&*()gxx</0>");
+
+                 // Finds first chars of up to 5 words
+    REGEX_FIND( "(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?",
+        "<0><1>T</1>the <2>q</2>ick <3>b</3>rown <4>f</4></0>ox");
+    REGEX_FIND( "H.*?((?:\\B.)+)", "<0>H<1>ello</1></0> ");
+    REGEX_FIND( ".*?((?:\\B.)+).*?((?:\\B.)+).*?((?:\\B.)+)",
+        "<0>H<1>ello</1> <2>    </2>g<3>oodbye</3></0> ");
+
+    REGEX_FIND("(?:.*?\\b(.))?(?:.*?\\b(.))?(?:.*?\\b(.))?(?:.*?\\b(.))?(?:.*?\\b(.))?.*",
+        "<0>   \\u0301 \\u0301<1>A</1>\\u0302BC\\u0303\\u0304<2> </2>\\u0305 \\u0306"
+        "<3>X</3>\\u0307Y\\u0308</0>");
+
+    // . does not match new-lines
+    REGEX_FIND(".", "\\u000a\\u000d\\u0085\\u000c\\u2028\\u2029<0>X</0>\\u000aY");
+    REGEX_FIND("A.", "A\\u000a ");  // no match
+
+    // \d for decimal digits
+    REGEX_FIND("\\d*", "<0>0123456789\\u0660\\u06F9\\u0969\\u0A66\\u1369"
+        "\\u17E2\\uFF10\\U0001D7CE\\U0001D7FF</0>non-digits");  
+    REGEX_FIND("\\D+", "<0>non digits</0>");
+    REGEX_FIND("\\D*(\\d*)(\\D*)", "<0>non-digits<1>3456666</1><2>more non digits</2></0>");
+
 }