ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10157
2025-04-10 07:39:16 +00:00 · 2002-11-06 02:35:20 +00:00 · 2002-11-06 02:35:20 +00:00 · 96ec073b83
commit 96ec073b83
parent ee0d1cd5db
10 changed files with 329 additions and 120 deletions
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@ -1832,8 +1832,11 @@ _uBrkErrorName[U_BRK_ERROR_LIMIT - U_BRK_ERROR_START] = {
 static const char * const
 _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
    "U_REGEX_ERROR_START",
-    "U_REGEX_INTERNAL_ERROR"
-    "U_REGEX_INVALID_STATE"
+    "U_REGEX_INTERNAL_ERROR",
+    "U_REGEX_INVALID_STATE",
+    "U_REGEX_BAD_ESCAPE_SEQUENCE",
+    "U_REGEX_PROPERTY_SYNTAX",
+    "U_REGEX_UNIMPLEMENTED"
 };

 U_CAPI const char * U_EXPORT2
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -505,6 +505,7 @@ typedef enum UErrorCode {
     U_REGEX_INVALID_STATE,
     U_REGEX_BAD_ESCAPE_SEQUENCE,
     U_REGEX_PROPERTY_SYNTAX,
+     U_REGEX_UNIMPLEMENTED,
     U_REGEX_ERROR_LIMIT,

    U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -449,26 +449,28 @@ UBool RegexCompile::doParseActions(EParseAction action)


    case doOpenAtomicParen:
-        // Open Paren.
+        // Open Atomic Paren.
+        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doOpenLookAhead:
        // Open Paren.
+        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doOpenLookAheadNeg:
        // Open Paren.
+        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doOpenLookBehind:
        // Open Paren.
+        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doOpenLookBehindNeg:
        // Open Paren.
-        break;
-
-    case doExprRParen:
+        error(U_REGEX_UNIMPLEMENTED);
        break;

    case doCloseParen:
@ -702,6 +704,14 @@ UBool RegexCompile::doParseActions(EParseAction action)
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
        break;

+    case doCaret:       // TODO:  multi-line mode flag.
+        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
+        break;
+
+
+    case doDollar:       // TODO:  multi-line mode flag.
+        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
+        break;

    case doBackslashA:
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
@ -751,8 +761,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
        break;        

+    case doBackslashx:              // \x{abcd}   alternate hex format
+        //  TODO:  implement 
+        error(U_REGEX_UNIMPLEMENTED);
+        break;
+            
+
+
    case doBackslashZ:
-        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 1), *fStatus);
+        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
        break;        

    case doBackslashz:
@ -782,6 +799,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
        // Just scanned a \Q.  Put character scanner into quote mode.
        fQuoteMode = TRUE;
        break;
+
+    case doBackRef:
+        //  TODO:  implement back references.
+        error(U_REGEX_UNIMPLEMENTED);
+        break;
+
+    case doNamedChar:            // \N{NAMED_CHAR}
+        //  TODO:  implement 
+        error(U_REGEX_UNIMPLEMENTED);
+        break;
            
    default:
-        error(U_BRK_INTERNAL_ERROR);
+        error(U_REGEX_INTERNAL_ERROR);   // unhandled parse action: report a regex error code, not a break-iterator one
@ -951,7 +978,7 @@ void RegexCompile::error(UErrorCode e) {
        *fStatus = e;
        fParseErr->line  = fLineNum;
        fParseErr->offset = fCharNum;
-        fParseErr->preContext[0] = 0;
+        fParseErr->preContext[0] = 0;    // TODO:  copy in some input pattern text
-        fParseErr->preContext[0] = 0;
+        fParseErr->postContext[0] = 0;   // also clear the trailing context (was a duplicated preContext store)
    }
 }
@ -959,11 +986,6 @@ void RegexCompile::error(UErrorCode e) {



-
-
-
-
-
 //
 //  Assorted Unicode character constants.
 //     Numeric because there is no portable way to enter them as literals.
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -22,7 +22,6 @@ U_NAMESPACE_BEGIN


 enum Regex_PatternParseAction {
-    doExprOrOperator,
    doCloseParen,
    doProperty,
    doTagValue,
@ -33,12 +32,14 @@ enum Regex_PatternParseAction {
    doBackslashs,
    doStartString,
    doNGOpt,
+    doNamedChar,
    doBackslashw,
    doPossesiveStar,
    doOpenLookBehind,
-    doExprRParen,
+    doBackslashx,
    doBackslashz,
    doStar,
+    doCaret,
    doEnterQuoteMode,
    doPossesivePlus,
    doNGStar,
@ -57,9 +58,11 @@ enum Regex_PatternParseAction {
    doOpt,
    doOpenAtomicParen,
    doBackslashS,
+    doNumberExpectedError,
    doStringChar,
    doOpenLookAhead,
-    doNumberExpectedError,
+    doBackRef,
+    doDollar,
    doDotAny,
    doBackslashW,
    doBackslashX,
@ -94,80 +97,84 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    {doNOP, 0, 0, 0, TRUE}
    , {doPatStart, 255, 3, 2, FALSE}     //  1      start
    , {doPatFinish, 255, 2,0,  FALSE}     //  2      finish
-    , {doStartString, 254, 11,0,  TRUE}     //  3      term
-    , {doStartString, 130, 11,0,  TRUE}     //  4 
-    , {doScanUnicodeSet, 91 /* [ */, 18,0,  TRUE}     //  5 
-    , {doNOP, 40 /* ( */, 25, 18, TRUE}     //  6 
-    , {doDotAny, 46 /* . */, 18,0,  TRUE}     //  7 
-    , {doNOP, 92 /* \ */, 59,0,  TRUE}     //  8 
-    , {doNOP, 253, 2,0,  FALSE}     //  9 
-    , {doRuleError, 255, 76,0,  FALSE}     //  10 
-    , {doStringChar, 254, 11,0,  TRUE}     //  11      string
-    , {doStringChar, 130, 11,0,  TRUE}     //  12 
-    , {doSplitString, 63 /* ? */, 18,0,  FALSE}     //  13 
-    , {doSplitString, 43 /* + */, 18,0,  FALSE}     //  14 
-    , {doSplitString, 42 /* * */, 18,0,  FALSE}     //  15 
-    , {doSplitString, 123 /* { */, 18,0,  FALSE}     //  16 
-    , {doEndString, 255, 18,0,  FALSE}     //  17 
-    , {doNOP, 42 /* * */, 36,0,  TRUE}     //  18      expr-quant
-    , {doNOP, 43 /* + */, 39,0,  TRUE}     //  19 
-    , {doNOP, 63 /* ? */, 42,0,  TRUE}     //  20 
-    , {doNOP, 255, 22,0,  FALSE}     //  21 
-    , {doOrOperator, 124 /* | */, 3,0,  TRUE}     //  22      expr-cont
-    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  23 
-    , {doNOP, 255, 3,0,  FALSE}     //  24 
-    , {doNOP, 63 /* ? */, 27,0,  TRUE}     //  25      open-paren
-    , {doOpenCaptureParen, 255, 3, 18, FALSE}     //  26 
-    , {doOpenNonCaptureParen, 58 /* : */, 3, 18, TRUE}     //  27      open-paren-extended
-    , {doOpenAtomicParen, 62 /* > */, 3, 18, TRUE}     //  28 
-    , {doOpenLookAhead, 61 /* = */, 3, 22, TRUE}     //  29 
-    , {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE}     //  30 
-    , {doNOP, 60 /* < */, 33,0,  TRUE}     //  31 
-    , {doBadOpenParenType, 255, 76,0,  FALSE}     //  32 
-    , {doOpenLookBehind, 61 /* = */, 3, 22, TRUE}     //  33      open-paren-lookbehind
-    , {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE}     //  34 
-    , {doBadOpenParenType, 255, 76,0,  FALSE}     //  35 
-    , {doNGStar, 63 /* ? */, 22,0,  TRUE}     //  36      quant-star
-    , {doPossesiveStar, 43 /* + */, 22,0,  TRUE}     //  37 
-    , {doStar, 255, 22,0,  FALSE}     //  38 
-    , {doNGPlus, 63 /* ? */, 22,0,  TRUE}     //  39      quant-plus
-    , {doPossesivePlus, 43 /* + */, 22,0,  TRUE}     //  40 
-    , {doPlus, 255, 22,0,  FALSE}     //  41 
-    , {doNGOpt, 63 /* ? */, 22,0,  TRUE}     //  42      quant-opt
-    , {doPossesiveOpt, 43 /* + */, 22,0,  TRUE}     //  43 
-    , {doOpt, 255, 22,0,  FALSE}     //  44 
-    , {doNOP, 129, 45,0,  TRUE}     //  45      interval-open
-    , {doIntervalMinValue, 128, 48,0,  FALSE}     //  46 
-    , {doNumberExpectedError, 255, 76,0,  FALSE}     //  47 
-    , {doNOP, 129, 52,0,  TRUE}     //  48      interval-value
-    , {doNOP, 125 /* } */, 52,0,  FALSE}     //  49 
-    , {doIntervalDigit, 128, 48,0,  TRUE}     //  50 
-    , {doNumberExpectedError, 255, 76,0,  FALSE}     //  51 
-    , {doNOP, 129, 52,0,  TRUE}     //  52      interval-close
-    , {doTagValue, 125 /* } */, 55,0,  TRUE}     //  53 
-    , {doNumberExpectedError, 255, 76,0,  FALSE}     //  54 
-    , {doNOP, 254, 3,0,  FALSE}     //  55      expr-cont-no-interval
-    , {doExprOrOperator, 124 /* | */, 3,0,  TRUE}     //  56 
-    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  57 
-    , {doNOP, 255, 3,0,  FALSE}     //  58 
-    , {doBackslashA, 65 /* A */, 3,0,  TRUE}     //  59      backslash
-    , {doBackslashB, 66 /* B */, 3,0,  TRUE}     //  60 
-    , {doBackslashb, 98 /* b */, 3,0,  TRUE}     //  61 
-    , {doBackslashd, 100 /* d */, 18,0,  TRUE}     //  62 
-    , {doBackslashD, 68 /* D */, 18,0,  TRUE}     //  63 
-    , {doBackslashG, 71 /* G */, 3,0,  TRUE}     //  64 
-    , {doProperty, 112 /* p */, 18,0,  FALSE}     //  65 
-    , {doProperty, 80 /* P */, 18,0,  FALSE}     //  66 
-    , {doEnterQuoteMode, 81 /* Q */, 3,0,  TRUE}     //  67 
-    , {doBackslashS, 83 /* S */, 18,0,  TRUE}     //  68 
-    , {doBackslashs, 115 /* s */, 18,0,  TRUE}     //  69 
-    , {doBackslashW, 87 /* W */, 18,0,  TRUE}     //  70 
-    , {doBackslashw, 119 /* w */, 18,0,  TRUE}     //  71 
-    , {doBackslashX, 88 /* X */, 18,0,  TRUE}     //  72 
-    , {doBackslashZ, 90 /* Z */, 3,0,  TRUE}     //  73 
-    , {doBackslashz, 122 /* z */, 3,0,  TRUE}     //  74 
-    , {doStartString, 255, 11,0,  TRUE}     //  75 
-    , {doExit, 255, 76,0,  TRUE}     //  76      errorDeath
+    , {doStartString, 254, 13,0,  TRUE}     //  3      term
+    , {doStartString, 130, 13,0,  TRUE}     //  4 
+    , {doScanUnicodeSet, 91 /* [ */, 20,0,  TRUE}     //  5 
+    , {doNOP, 40 /* ( */, 27, 20, TRUE}     //  6 
+    , {doDotAny, 46 /* . */, 20,0,  TRUE}     //  7 
+    , {doCaret, 94 /* ^ */, 3,0,  TRUE}     //  8 
+    , {doDollar, 36 /* $ */, 3,0,  TRUE}     //  9 
+    , {doNOP, 92 /* \ */, 60,0,  TRUE}     //  10 
+    , {doNOP, 253, 2,0,  FALSE}     //  11 
+    , {doRuleError, 255, 80,0,  FALSE}     //  12 
+    , {doStringChar, 254, 13,0,  TRUE}     //  13      string
+    , {doStringChar, 130, 13,0,  TRUE}     //  14 
+    , {doSplitString, 63 /* ? */, 20,0,  FALSE}     //  15 
+    , {doSplitString, 43 /* + */, 20,0,  FALSE}     //  16 
+    , {doSplitString, 42 /* * */, 20,0,  FALSE}     //  17 
+    , {doSplitString, 123 /* { */, 20,0,  FALSE}     //  18 
+    , {doEndString, 255, 20,0,  FALSE}     //  19 
+    , {doNOP, 42 /* * */, 41,0,  TRUE}     //  20      expr-quant
+    , {doNOP, 43 /* + */, 44,0,  TRUE}     //  21 
+    , {doNOP, 63 /* ? */, 47,0,  TRUE}     //  22 
+    , {doNOP, 255, 24,0,  FALSE}     //  23 
+    , {doOrOperator, 124 /* | */, 3,0,  TRUE}     //  24      expr-cont
+    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  25 
+    , {doNOP, 255, 3,0,  FALSE}     //  26 
+    , {doNOP, 63 /* ? */, 29,0,  TRUE}     //  27      open-paren
+    , {doOpenCaptureParen, 255, 3, 20, FALSE}     //  28 
+    , {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE}     //  29      open-paren-extended
+    , {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE}     //  30 
+    , {doOpenLookAhead, 61 /* = */, 3, 24, TRUE}     //  31 
+    , {doOpenLookAheadNeg, 33 /* ! */, 3, 24, TRUE}     //  32 
+    , {doNOP, 60 /* < */, 36,0,  TRUE}     //  33 
+    , {doNOP, 35 /* # */, 39,0,  TRUE}     //  34 
+    , {doBadOpenParenType, 255, 80,0,  FALSE}     //  35 
+    , {doOpenLookBehind, 61 /* = */, 3, 24, TRUE}     //  36      open-paren-lookbehind
+    , {doOpenLookBehindNeg, 33 /* ! */, 3, 24, TRUE}     //  37 
+    , {doBadOpenParenType, 255, 80,0,  FALSE}     //  38 
+    , {doNOP, 41 /* ) */, 3,0,  TRUE}     //  39      paren-comment
+    , {doNOP, 255, 39,0,  TRUE}     //  40 
+    , {doNGStar, 63 /* ? */, 24,0,  TRUE}     //  41      quant-star
+    , {doPossesiveStar, 43 /* + */, 24,0,  TRUE}     //  42 
+    , {doStar, 255, 24,0,  FALSE}     //  43 
+    , {doNGPlus, 63 /* ? */, 24,0,  TRUE}     //  44      quant-plus
+    , {doPossesivePlus, 43 /* + */, 24,0,  TRUE}     //  45 
+    , {doPlus, 255, 24,0,  FALSE}     //  46 
+    , {doNGOpt, 63 /* ? */, 24,0,  TRUE}     //  47      quant-opt
+    , {doPossesiveOpt, 43 /* + */, 24,0,  TRUE}     //  48 
+    , {doOpt, 255, 24,0,  FALSE}     //  49 
+    , {doNOP, 129, 50,0,  TRUE}     //  50      interval-open
+    , {doIntervalMinValue, 128, 53,0,  FALSE}     //  51 
+    , {doNumberExpectedError, 255, 80,0,  FALSE}     //  52 
+    , {doNOP, 129, 57,0,  TRUE}     //  53      interval-value
+    , {doNOP, 125 /* } */, 57,0,  FALSE}     //  54 
+    , {doIntervalDigit, 128, 53,0,  TRUE}     //  55 
+    , {doNumberExpectedError, 255, 80,0,  FALSE}     //  56 
+    , {doNOP, 129, 57,0,  TRUE}     //  57      interval-close
+    , {doTagValue, 125 /* } */, 24,0,  TRUE}     //  58 
+    , {doNumberExpectedError, 255, 80,0,  FALSE}     //  59 
+    , {doBackslashA, 65 /* A */, 3,0,  TRUE}     //  60      backslash
+    , {doBackslashB, 66 /* B */, 3,0,  TRUE}     //  61 
+    , {doBackslashb, 98 /* b */, 3,0,  TRUE}     //  62 
+    , {doBackslashd, 100 /* d */, 20,0,  TRUE}     //  63 
+    , {doBackslashD, 68 /* D */, 20,0,  TRUE}     //  64 
+    , {doBackslashG, 71 /* G */, 3,0,  TRUE}     //  65 
+    , {doNamedChar, 78 /* N */, 20,0,  TRUE}     //  66 
+    , {doProperty, 112 /* p */, 20,0,  FALSE}     //  67 
+    , {doProperty, 80 /* P */, 20,0,  FALSE}     //  68 
+    , {doEnterQuoteMode, 81 /* Q */, 3,0,  TRUE}     //  69 
+    , {doBackslashS, 83 /* S */, 20,0,  TRUE}     //  70 
+    , {doBackslashs, 115 /* s */, 20,0,  TRUE}     //  71 
+    , {doBackslashW, 87 /* W */, 20,0,  TRUE}     //  72 
+    , {doBackslashw, 119 /* w */, 20,0,  TRUE}     //  73 
+    , {doBackslashX, 88 /* X */, 20,0,  TRUE}     //  74 
+    , {doBackslashx, 120 /* x */, 20,0,  TRUE}     //  75 
+    , {doBackslashZ, 90 /* Z */, 3,0,  TRUE}     //  76 
+    , {doBackslashz, 122 /* z */, 3,0,  TRUE}     //  77 
+    , {doBackRef, 128, 20,0,  TRUE}     //  78 
+    , {doStartString, 255, 13,0,  TRUE}     //  79 
+    , {doExit, 255, 80,0,  TRUE}     //  80      errorDeath
 };
 static const char *RegexStateNames[] = {    0,
     "start",
@ -179,6 +186,8 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
+    0,
    0,
     "string",
    0,
@ -201,9 +210,12 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
    0,
     "open-paren-lookbehind",
    0,
+    0,
+     "paren-comment",
    0,
     "quant-star",
    0,
@ -223,10 +235,6 @@ static const char *RegexStateNames[] = {    0,
    0,
     "interval-close",
    0,
-    0,
-     "expr-cont-no-interval",
-    0,
-    0,
    0,
     "backslash",
    0,
@ -244,6 +252,9 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
+    0,
+    0,
    0,
     "errorDeath",
    0};
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -77,6 +77,8 @@ term:
    '['                  n expr-quant     		            doScanUnicodeSet
    '('                  n open-paren            ^expr-quant          
    '.'                  n expr-quant                               doDotAny
+    '^'                  n term                                     doCaret
+    '$'                  n term                                     doDollar
    '\'                  n backslash
    eof		           finish
    default                errorDeath                               doRuleError
@ -133,6 +135,7 @@ open-paren-extended:
    '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=
    '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
    '<'                  n  open-paren-lookbehind
+    '#'                  n  paren-comment
    default                 errorDeath                              doBadOpenParenType
    
 open-paren-lookbehind:
@ -141,6 +144,15 @@ open-paren-lookbehind:
    default                 errorDeath                              doBadOpenParenType
    

+#
+#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
+#                    TODO:  should parens nest here?  Check what perl does.
+#
+paren-comment:
+    ')'                  n  term
+    default              n  paren-comment
+    
+    
 #
 #  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
 #                 between plain '*', '*?', '*+'
@ -188,23 +200,11 @@ interval-value:
    
 interval-close:
    white_space          n  interval-close
-    '}'                  n  expr-cont-no-interval                   doTagValue
+    '}'                  n  expr-cont                               doTagValue
    default                 errorDeath                              doNumberExpectedError
    
    
    
-#
-#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
-#                                            allowed, but not required.  Just like
-#                                            expr-cont, above, except that no interval
-#                                            specification {min, max}  is permitted.
-#
-expr-cont-no-interval:
-    quoted                  term                                    
-    '|'                  n  term                                    doExprOrOperator
-    ')'                  n  pop                                     doExprRParen
-    default                 term                   
-    
    
 #
 #  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
@ -217,6 +217,7 @@ backslash:
   'd'			 n  expr-quant				    doBackslashd
   'D'                   n  expr-quant                              doBackslashD
   'G'                   n  term                                    doBackslashG
+   'N'			 n  expr-quant                              doNamedChar      #   \N{NAME}  named char
   'p'			    expr-quant                              doProperty       #   \p{Lu}  style property
   'P'			    expr-quant                              doProperty
   'Q'                   n  term                                    doEnterQuoteMode
@ -225,8 +226,10 @@ backslash:
   'W'                   n  expr-quant                              doBackslashW
   'w'                   n  expr-quant                              doBackslashw
   'X'                   n  expr-quant                              doBackslashX
+   'x'                   n  expr-quant                              doBackslashx
   'Z'                   n  term                                    doBackslashZ
   'z'                   n  term                                    doBackslashz
+   digit_char	         n  expr-quant			            doBackRef
   
   default               n  string				    doStartString   

--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -35,13 +35,15 @@ static const uint32_t     URX_FAIL          = 14;   // Stop match operation;  No

 static const uint32_t     URX_BACKSLASH_A   = 15;   
 static const uint32_t     URX_BACKSLASH_B   = 16;   // Value field:  0:  \b    1:  \B
-static const uint32_t     URX_BACKSLASH_D   = 22;   // Value field:  0:  \d    1:  \D
 static const uint32_t     URX_BACKSLASH_G   = 17; 
 static const uint32_t     URX_BACKSLASH_W   = 18;   // Value field:  0:  \w    1:  \W
 static const uint32_t     URX_BACKSLASH_X   = 19;
-static const uint32_t     URX_BACKSLASH_Z   = 20;   // Value field:  0:  \z    1:  \Z
+static const uint32_t     URX_BACKSLASH_Z   = 20;   // \z   Unconditional end of input.

 static const uint32_t     URX_DOTANY_ALL    = 21;   // ., in the . matches any mode.
+static const uint32_t     URX_BACKSLASH_D   = 22;   // Value field:  0:  \d    1:  \D
+static const uint32_t     URX_CARET         = 23;   // Value field:  1:  multi-line mode.
+static const uint32_t     URX_DOLLAR        = 24;   // Also for \Z


 //
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -560,9 +560,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
    //  Cache frequently referenced items from the compiled pattern
    //  in local variables.
    //
-    UVector             *pat     = fPattern->fCompiledPat;
-    const UnicodeString *litText = &fPattern->fLiteralText;
-    UVector             *sets    = fPattern->fSets;
+    UVector             *pat      = fPattern->fCompiledPat;
+    const UnicodeString *litText  = &fPattern->fLiteralText;
+    UVector             *sets     = fPattern->fSets;
+    int32_t              inputLen = fInput->length();
    

    //
@ -658,6 +659,46 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
            fCaptureEnds->setElementAt(inputIdx, opValue);
            break;

+
+        case URX_DOLLAR:                   //  $, test for End of line
+                                           //     or for position before new line at end of input
+            if (inputIdx < inputLen-2) {
+                // We are nowhere near the end of input.  Fail.
+                backTrack(inputIdx, patIdx);
+                break;
+            }
+            if (inputIdx >= inputLen) {
+                // We really are at the end of input.  Success.
+                break;
+            }
+            // If we are positioned just before a new-line that is located at the
+            //   end of input, succeed.
+            if (inputIdx == inputLen-1) {
+                UChar32 c = fInput->char32At(inputIdx);
+                if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
+                    break;                         // At new-line at end of input. Success
+                }
+            }
+
+            if (inputIdx == inputLen-2) {
+                if (fInput->char32At(inputIdx) == 0x0d && fInput->char32At(inputIdx+1) == 0x0a) {
+                    break;                         // At CR/LF at end of input.  Success
+                }
+            }
+
+            backTrack(inputIdx, patIdx);
+
+            // TODO:  support for multi-line mode.
+            break;
+
+
+        case URX_CARET:                    //  ^, test for start of line
+            if (inputIdx != 0) {
+                backTrack(inputIdx, patIdx);
+            }                              // TODO:  support for multi-line mode.
+            break;
+
+
        case URX_BACKSLASH_A:          // Test for start of input
            if (inputIdx != 0) {
                backTrack(inputIdx, patIdx);
@ -731,18 +772,24 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
                    for(;;) {   
                        c = fInput->char32At(inputIdx);   
                        ctype = u_charType(c);
-                        // TODO:  make a set and add the "othe grapheme extend" chars
+                        // TODO:  make a set and add the "other grapheme extend" chars
                        //        to the list of stuff to be skipped over.
                        if (!(ctype == U_NON_SPACING_MARK || ctype == U_ENCLOSING_MARK)) {
                            break;
                        }
+                        inputIdx = fInput->moveIndex32(inputIdx, 1);
+                        if (inputIdx >= fInputLength) {
+                            break; 
+                        }
                    }
                }
            }
            break;

+
+
        case URX_BACKSLASH_Z:          // Test for end of line
-            if (FALSE) {
+            if (inputIdx < inputLen) {
                backTrack(inputIdx, patIdx);
            }
            break;
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -421,7 +421,11 @@ static char *opNames[] = {
        "URX_BACKSLASH_G",
        "URX_BACKSLASH_W",
        "URX_BACKSLASH_X",
-        "URX_BACKSLASH_Z"
+        "URX_BACKSLASH_Z",
+        "URX_DOTANY_ALL",
+        "URX_BACKSLASH_D",
+        "URX_CARET",
+        "URX_DOLLAR"
 };

 void   RegexPattern::dump() {
@ -470,8 +474,11 @@ void   RegexPattern::dump() {
        case URX_STATE_SAVE:
        case URX_JMP:
        case URX_BACKSLASH_B:
+        case URX_BACKSLASH_D:
        case URX_BACKSLASH_W:
        case URX_BACKSLASH_Z:
+        case URX_CARET:
+        case URX_DOLLAR:
            // types with an integer operand field.
            printf("%d", val);
            break;
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -54,6 +54,10 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
        case 4: name = "Extended";
            if (exec) Extended(); 
            break;
+        case 5: name = "Errors";
+            if (exec) Errors(); 
+            break;
+
        default: name = ""; 
            break; //needed to end loop
    }
@ -163,7 +167,7 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
 //
 //       usage:
 //          REGEX_FIND("pattern",  "input text");
-//          REGEX_FIND_S("pattern",  "input text",  expected status);
+//          REGEX_ERR("pattern",   expected error line, column, expected status);
 //
 //          The input text is unescaped.  The pattern is not.
 //          The input text is marked with the expected match positions
@ -177,7 +181,6 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
 // REGEX_FIND is invoked via a macro, which allows capturing the source file line
 //            number for use in error messages.
 #define REGEX_FIND(pat, text) regex_find(pat, text, U_ZERO_ERROR, __LINE__);
-#define REGEX_FIND_S(pat, text, status) regex_find(pat, text, status, __LINE__);


 //  Set a value into a UVector at position specified by a decimal number in
@ -301,6 +304,52 @@ cleanupAndReturn:
 }
 

+
+
+
+
+
+
+//---------------------------------------------------------------------------
+//
+//    REGEX_ERR       Macro + invocation function to simplify writing
+//                       regex tests for incorrect patterns
+//
+//       usage:
+//          REGEX_ERR("pattern",   expected error line, column, expected status);
+//
+//---------------------------------------------------------------------------
+#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
+
+void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
+                          UErrorCode expectedStatus, int line) {
+    // Compile pat; fail the test unless status == expectedStatus and, on error,
+    // the UParseError position is errLine/errCol.  line is the caller's __LINE__.
+    UErrorCode          status         = U_ZERO_ERROR;
+    UParseError         pe;
+    RegexPattern        *callerPattern = NULL;
+
+    //
+    //  Compile the caller's pattern
+    //
+    UnicodeString patString(pat);
+    callerPattern = RegexPattern::compile(patString, 0, pe, status);
+    if (status != expectedStatus) {
+        errln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
+    } else {
+        if (status != U_ZERO_ERROR) {    // the error position is only checked for failures
+            if (pe.line != errLine || pe.offset != errCol) {
+                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
+                    line, errLine, errCol, pe.line, pe.offset);
+            }
+        }
+    }
+
+    delete callerPattern;
+}
+
+
+
 //---------------------------------------------------------------------------
 //
 //      Basic      Check for basic functionality of regex pattern matching.
@ -429,8 +478,8 @@ void RegexTest::Basic() {
    
    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
-    // REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L (or whatever) TODO: bug in Unescape
-    // REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape  TODO: bug in Unescape
+    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L 
+    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape 
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
@ -1087,7 +1136,66 @@ void RegexTest::Extended() {

    // \X  consume one combining char sequence.
    REGEX_FIND("(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?",
-        "<0><1>A</1><2>B</2><3> </3></0>");
+        "<0><1>A</1><2>B</2><3> </3><4>\\r\\n</4></0>");
+    REGEX_FIND("(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?(\\X)?",
+        "<0><1>A\\u0301</1><2>\n</2><3>\\u0305</3><4>a\\u0302\\u0303\\u0304</4></0>");
+
+    // ^ matches only at beginning of line
+    REGEX_FIND(".*^(Hello)", "<0><1>Hello</1></0> Hello Hello Hello Goodbye");
+    REGEX_FIND(".*(Hello)",  "<0>Hello Hello Hello <1>Hello</1></0> Goodbye");
+    REGEX_FIND(".*^(Hello)", " Hello Hello Hello Hello Goodbye");   // No Match
+
+    // $ matches only at end of line, or before a newline preceding the end of line
+    REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>");
+    REGEX_FIND(".*?(Goodbye)", "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye");
+    REGEX_FIND(".*?(Goodbye)$", "Hello Goodbye> Goodbye Goodbye ");  // No Match
+
+    REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
+    REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
+    REGEX_FIND(".*?(Goodbye)$", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\r\\n");
+    REGEX_FIND(".*?(Goodbye)$", "Hello Goodbye Goodbye Goodbye\\n\\n");  // No Match
+    
+    // \Z matches at end of input, like $ with default flags.
+    REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>");
+    REGEX_FIND(".*?(Goodbye)", "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye");
+    REGEX_FIND(".*?(Goodbye)\\Z", "Hello Goodbye> Goodbye Goodbye ");  // No Match
+    REGEX_FIND("here$", "here\\nthe end");   // No Match
+
+    REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
+    REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\n");
+    REGEX_FIND(".*?(Goodbye)\\Z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\\r\\n");
+    REGEX_FIND(".*?(Goodbye)\\Z", "Hello Goodbye Goodbye Goodbye\\n\\n");  // No Match
+    
+    // \z matches only at the end of string.
+    //    no special treatment of new lines.
+    //    no dependencies on flag settings.
+    REGEX_FIND(".*?(Goodbye)\\z", "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>");
+    REGEX_FIND(".*?(Goodbye)\\z", "Hello Goodbye Goodbye Goodbye ");  // No Match
+    REGEX_FIND("here$", "here\\nthe end");   // No Match
+
+    REGEX_FIND(".*?(Goodbye)\\z", "Hello Goodbye Goodbye Goodbye\\n");   // No Match
+    REGEX_FIND(".*?(Goodbye)\\n\\z", "<0>Hello Goodbye Goodbye <1>Goodbye</1>\\n</0>");
+    
+    // (?# comment) doesn't muck up pattern
+    REGEX_FIND("Hello (?# this is a comment) world", "  <0>Hello  world</0>...");
+
+}
+
+
+
+//---------------------------------------------------------------------------
+//
+//      Errors     Check for error handling in patterns.
+//
+//---------------------------------------------------------------------------
+void RegexTest::Errors() {
+    // Escape sequences that aren't implemented yet.  Arguments: pattern, expected error line, column, status.
+    REGEX_ERR("No (support) for \\1 BackReferences yet.", 1, 19,  U_REGEX_UNIMPLEMENTED);
+    REGEX_ERR("named chars \\N{GREEK CAPITAL LETTER ALPHA} not implementd", 1, 14, U_REGEX_UNIMPLEMENTED);
+    REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
+
+    // Missing close parentheses.  NOTE(review): disabled; the paren-comment parse state has no eof rule - confirm an unterminated (?# is diagnosed.
+    //REGEX_ERR("Comment (?# with no close", 1, 0, U_REGEX_INTERNAL_ERROR);
+}


--- a/icu4c/source/test/intltest/regextst.h
+++ b/icu4c/source/test/intltest/regextst.h
@ -21,13 +21,18 @@ public:

    virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL );

+    // The following are test functions that are visible from the intltest test framework.
    virtual void API_Match();
    virtual void API_Pattern();
    virtual void API_Replace();
    virtual void Basic();
    virtual void Extended();
+    virtual void Errors();

+    // The following functions are internal to the regexp tests.
    virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line);
    virtual void regex_find(const char *pat, const char *input, UErrorCode expectedStatus, int line);
+    virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
+                            UErrorCode expectedStatus, int line);
 };
 #endif