ICU-2422 regexp, tests from perl, and some bug fixes

X-SVN-Rev: 10901
2025-04-13 08:53:20 +00:00 · 2003-01-24 02:05:03 +00:00 · 2003-01-24 02:05:03 +00:00 · a92820c54b
commit a92820c54b
parent f092768650
7 changed files with 453 additions and 97 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -154,6 +154,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
    fQuoteMode      = FALSE;
    fFreeForm       = FALSE;
    fMatcherDataEnd = 0;
+    fBackRefMax     = 0;

    fMatchOpenParen  = -1;
    fMatchCloseParen = -1;
@ -371,6 +372,24 @@ void    RegexCompile::compile(
    // The pattern has now been read and processed, and the compiled code generated.
    //

+    // Back-reference fixup
+    //
+    int32_t loc;
+    for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
+        int32_t op = fRXPat->fCompiledPat->elementAti(loc);
+        if (URX_TYPE(op) == URX_BACKREF) {
+            int32_t where = URX_VAL(op);
+            if (where > fRXPat->fGroupMap->size()) {
+                error(U_REGEX_INVALID_BACK_REF);
+                break;
+            }
+            where = fRXPat->fGroupMap->elementAti(where-1);
+            op    = URX_BUILD(URX_BACKREF, where);
+            fRXPat->fCompiledPat->setElementAt(op, loc);
+        }
+    }
+
+
    //
    // Compute the number of digits required for the largest capture group number.
    //
@ -608,6 +627,14 @@ UBool RegexCompile::doParseActions(EParseAction action)
        error(U_REGEX_UNIMPLEMENTED);
        break;

+    case doConditionalExpr:
+        // Conditionals such as (?(1)a|b)
+    case doPerlInline:
+        // Perl inline-conditionals.  (?{perl code}a|b) We're not perl, no way to do them.
+        error(U_REGEX_UNIMPLEMENTED);
+        break;
+
+
    case doCloseParen:
        handleCloseParen();
        if (fParenStack.size() <= 0) {
@ -896,6 +923,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
        break;

+    case doEscapeError:
+        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
+        break;
+
    case doExit:
        returnVal = FALSE;
        break;
@ -929,9 +960,8 @@ UBool RegexCompile::doParseActions(EParseAction action)
            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
            int32_t  groupNum = 0;
            UChar32  c        = fC.fChar;
-            int32_t  t;

-            for (t=numCaptureGroups; t>0; t=t/10) {
+            for (;;) {
                // Loop once per digit, for max allowed number of digits in a back reference.
                groupNum = groupNum * 10 + u_charDigitValue(c);
                if (groupNum >= numCaptureGroups) {
@ -943,16 +973,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
                }
                nextCharLL();
            }
-            if (groupNum > numCaptureGroups) {
-                error(U_REGEX_INVALID_BACK_REF);
-                break;
-            }

            // Scan of the back reference in the source regexp is complete.  Now generate
-            //  the compiled code for it.
+            //  the compiled code for it. 
+            // Because capture groups can be forward-referenced by back-references,
+            //  we fill the operand with the capture group number.  At the end
+            //  of compilation, it will be changed to the variable's location.
            U_ASSERT(groupNum > 0);
-            int32_t  varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1);
-            int32_t  op = URX_BUILD(URX_BACKREF, varsLoc);
+            // int32_t  varsLoc = fRXPat->fGroupMap->elementAti(groupNum-1);
+            int32_t  op = URX_BUILD(URX_BACKREF, groupNum);
            fRXPat->fCompiledPat->addElement(op, *fStatus);
        }
        break;
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -154,6 +154,12 @@ private:

    int32_t                       fMatcherDataEnd;   // Location Counter for allocation of data
                                                     //   to be used by the matcher at match time.
+
+    int32_t                       fBackRefMax;       // Number of the largest capture group with a
+                                                     //   back reference.  Capture groups can be forward-
+                                                     //   referenced, so we can't flag an error on
+                                                     //   a too-big back ref number until the end of the
+                                                     //   pattern is reached.
 };

 U_NAMESPACE_END
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -56,6 +56,7 @@ enum Regex_PatternParseAction {
    doPatFinish,
    doBackslashD,
    doPossesiveOpt,
+    doEscapeError,
    doBackslashG,
    doOpt,
    doInterval,
@ -72,7 +73,9 @@ enum Regex_PatternParseAction {
    doBackslashX,
    doScanUnicodeSet,
    doBackslashZ,
+    doPerlInline,
    doNOP,
+    doConditionalExpr,
    doExit,
    doNGInterval,
    doPatStart,
@ -107,13 +110,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doDotAny, 46 /* . */, 12,0,  TRUE}     //  6 
    , {doCaret, 94 /* ^ */, 2,0,  TRUE}     //  7 
    , {doDollar, 36 /* $ */, 2,0,  TRUE}     //  8 
-    , {doNOP, 92 /* \ */, 70,0,  TRUE}     //  9 
+    , {doNOP, 92 /* \ */, 72,0,  TRUE}     //  9 
    , {doPatFinish, 253, 2,0,  FALSE}     //  10 
-    , {doRuleError, 255, 91,0,  FALSE}     //  11 
-    , {doNOP, 42 /* * */, 48,0,  TRUE}     //  12      expr-quant
-    , {doNOP, 43 /* + */, 51,0,  TRUE}     //  13 
-    , {doNOP, 63 /* ? */, 54,0,  TRUE}     //  14 
-    , {doIntervalInit, 123 /* { */, 57,0,  TRUE}     //  15 
+    , {doRuleError, 255, 94,0,  FALSE}     //  11 
+    , {doNOP, 42 /* * */, 50,0,  TRUE}     //  12      expr-quant
+    , {doNOP, 43 /* + */, 53,0,  TRUE}     //  13 
+    , {doNOP, 63 /* ? */, 56,0,  TRUE}     //  14 
+    , {doIntervalInit, 123 /* { */, 59,0,  TRUE}     //  15 
    , {doNOP, 255, 17,0,  FALSE}     //  16 
    , {doOrOperator, 124 /* | */, 2,0,  TRUE}     //  17      expr-cont
    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  18 
@ -124,72 +127,75 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doOpenAtomicParen, 62 /* > */, 2, 12, TRUE}     //  23 
    , {doOpenLookAhead, 61 /* = */, 2, 17, TRUE}     //  24 
    , {doOpenLookAheadNeg, 33 /* ! */, 2, 17, TRUE}     //  25 
-    , {doNOP, 60 /* < */, 34,0,  TRUE}     //  26 
-    , {doNOP, 35 /* # */, 37,0,  TRUE}     //  27 
-    , {doMatchMode, 105 /* i */, 40,0,  TRUE}     //  28 
-    , {doMatchMode, 120 /* x */, 40,0,  TRUE}     //  29 
-    , {doMatchMode, 115 /* s */, 40,0,  TRUE}     //  30 
-    , {doMatchMode, 109 /* m */, 40,0,  TRUE}     //  31 
-    , {doMatchMode, 45 /* - */, 40,0,  TRUE}     //  32 
-    , {doBadOpenParenType, 255, 91,0,  FALSE}     //  33 
-    , {doOpenLookBehind, 61 /* = */, 2, 17, TRUE}     //  34      open-paren-lookbehind
-    , {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE}     //  35 
-    , {doBadOpenParenType, 255, 91,0,  FALSE}     //  36 
-    , {doNOP, 41 /* ) */, 2,0,  TRUE}     //  37      paren-comment
-    , {doMismatchedParenErr, 253, 91,0,  FALSE}     //  38 
-    , {doNOP, 255, 37,0,  TRUE}     //  39 
-    , {doMatchMode, 105 /* i */, 40,0,  TRUE}     //  40      paren-flag
-    , {doMatchMode, 115 /* s */, 40,0,  TRUE}     //  41 
-    , {doMatchMode, 109 /* m */, 40,0,  TRUE}     //  42 
-    , {doMatchMode, 120 /* x */, 40,0,  TRUE}     //  43 
-    , {doMatchMode, 45 /* - */, 40,0,  TRUE}     //  44 
-    , {doNOP, 41 /* ) */, 2,0,  TRUE}     //  45 
-    , {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE}     //  46 
-    , {doNOP, 255, 91,0,  FALSE}     //  47 
-    , {doNGStar, 63 /* ? */, 17,0,  TRUE}     //  48      quant-star
-    , {doPossesiveStar, 43 /* + */, 17,0,  TRUE}     //  49 
-    , {doStar, 255, 17,0,  FALSE}     //  50 
-    , {doNGPlus, 63 /* ? */, 17,0,  TRUE}     //  51      quant-plus
-    , {doPossesivePlus, 43 /* + */, 17,0,  TRUE}     //  52 
-    , {doPlus, 255, 17,0,  FALSE}     //  53 
-    , {doNGOpt, 63 /* ? */, 17,0,  TRUE}     //  54      quant-opt
-    , {doPossesiveOpt, 43 /* + */, 17,0,  TRUE}     //  55 
-    , {doOpt, 255, 17,0,  FALSE}     //  56 
-    , {doNOP, 129, 57,0,  TRUE}     //  57      interval-open
-    , {doNOP, 128, 60,0,  FALSE}     //  58 
-    , {doIntervalError, 255, 91,0,  FALSE}     //  59 
-    , {doIntevalLowerDigit, 128, 60,0,  TRUE}     //  60      interval-lower
-    , {doNOP, 44 /* , */, 64,0,  TRUE}     //  61 
-    , {doIntervalSame, 125 /* } */, 67,0,  TRUE}     //  62 
-    , {doIntervalError, 255, 91,0,  FALSE}     //  63 
-    , {doIntervalUpperDigit, 128, 64,0,  TRUE}     //  64      interval-upper
-    , {doNOP, 125 /* } */, 67,0,  TRUE}     //  65 
-    , {doIntervalError, 255, 91,0,  FALSE}     //  66 
-    , {doNGInterval, 63 /* ? */, 17,0,  TRUE}     //  67      interval-type
-    , {doPossesiveInterval, 43 /* + */, 17,0,  TRUE}     //  68 
-    , {doInterval, 255, 17,0,  FALSE}     //  69 
-    , {doBackslashA, 65 /* A */, 2,0,  TRUE}     //  70      backslash
-    , {doBackslashB, 66 /* B */, 2,0,  TRUE}     //  71 
-    , {doBackslashb, 98 /* b */, 2,0,  TRUE}     //  72 
-    , {doBackslashd, 100 /* d */, 12,0,  TRUE}     //  73 
-    , {doBackslashD, 68 /* D */, 12,0,  TRUE}     //  74 
-    , {doBackslashG, 71 /* G */, 2,0,  TRUE}     //  75 
-    , {doNamedChar, 78 /* N */, 12,0,  TRUE}     //  76 
-    , {doProperty, 112 /* p */, 12,0,  FALSE}     //  77 
-    , {doProperty, 80 /* P */, 12,0,  FALSE}     //  78 
-    , {doEnterQuoteMode, 81 /* Q */, 2,0,  TRUE}     //  79 
-    , {doBackslashS, 83 /* S */, 12,0,  TRUE}     //  80 
-    , {doBackslashs, 115 /* s */, 12,0,  TRUE}     //  81 
-    , {doBackslashW, 87 /* W */, 12,0,  TRUE}     //  82 
-    , {doBackslashw, 119 /* w */, 12,0,  TRUE}     //  83 
-    , {doBackslashX, 88 /* X */, 12,0,  TRUE}     //  84 
-    , {doBackslashx, 120 /* x */, 12,0,  TRUE}     //  85 
-    , {doBackslashZ, 90 /* Z */, 2,0,  TRUE}     //  86 
-    , {doBackslashz, 122 /* z */, 2,0,  TRUE}     //  87 
-    , {doOctal, 48 /* 0 */, 12,0,  TRUE}     //  88 
-    , {doBackRef, 128, 12,0,  TRUE}     //  89 
-    , {doLiteralChar, 255, 12,0,  TRUE}     //  90 
-    , {doExit, 255, 91,0,  TRUE}     //  91      errorDeath
+    , {doNOP, 60 /* < */, 36,0,  TRUE}     //  26 
+    , {doNOP, 35 /* # */, 39,0,  TRUE}     //  27 
+    , {doMatchMode, 105 /* i */, 42,0,  TRUE}     //  28 
+    , {doMatchMode, 120 /* x */, 42,0,  TRUE}     //  29 
+    , {doMatchMode, 115 /* s */, 42,0,  TRUE}     //  30 
+    , {doMatchMode, 109 /* m */, 42,0,  TRUE}     //  31 
+    , {doMatchMode, 45 /* - */, 42,0,  TRUE}     //  32 
+    , {doConditionalExpr, 40 /* ( */, 94,0,  TRUE}     //  33 
+    , {doPerlInline, 123 /* { */, 94,0,  TRUE}     //  34 
+    , {doBadOpenParenType, 255, 94,0,  FALSE}     //  35 
+    , {doOpenLookBehind, 61 /* = */, 2, 17, TRUE}     //  36      open-paren-lookbehind
+    , {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE}     //  37 
+    , {doBadOpenParenType, 255, 94,0,  FALSE}     //  38 
+    , {doNOP, 41 /* ) */, 2,0,  TRUE}     //  39      paren-comment
+    , {doMismatchedParenErr, 253, 94,0,  FALSE}     //  40 
+    , {doNOP, 255, 39,0,  TRUE}     //  41 
+    , {doMatchMode, 105 /* i */, 42,0,  TRUE}     //  42      paren-flag
+    , {doMatchMode, 115 /* s */, 42,0,  TRUE}     //  43 
+    , {doMatchMode, 109 /* m */, 42,0,  TRUE}     //  44 
+    , {doMatchMode, 120 /* x */, 42,0,  TRUE}     //  45 
+    , {doMatchMode, 45 /* - */, 42,0,  TRUE}     //  46 
+    , {doNOP, 41 /* ) */, 2,0,  TRUE}     //  47 
+    , {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE}     //  48 
+    , {doNOP, 255, 94,0,  FALSE}     //  49 
+    , {doNGStar, 63 /* ? */, 17,0,  TRUE}     //  50      quant-star
+    , {doPossesiveStar, 43 /* + */, 17,0,  TRUE}     //  51 
+    , {doStar, 255, 17,0,  FALSE}     //  52 
+    , {doNGPlus, 63 /* ? */, 17,0,  TRUE}     //  53      quant-plus
+    , {doPossesivePlus, 43 /* + */, 17,0,  TRUE}     //  54 
+    , {doPlus, 255, 17,0,  FALSE}     //  55 
+    , {doNGOpt, 63 /* ? */, 17,0,  TRUE}     //  56      quant-opt
+    , {doPossesiveOpt, 43 /* + */, 17,0,  TRUE}     //  57 
+    , {doOpt, 255, 17,0,  FALSE}     //  58 
+    , {doNOP, 129, 59,0,  TRUE}     //  59      interval-open
+    , {doNOP, 128, 62,0,  FALSE}     //  60 
+    , {doIntervalError, 255, 94,0,  FALSE}     //  61 
+    , {doIntevalLowerDigit, 128, 62,0,  TRUE}     //  62      interval-lower
+    , {doNOP, 44 /* , */, 66,0,  TRUE}     //  63 
+    , {doIntervalSame, 125 /* } */, 69,0,  TRUE}     //  64 
+    , {doIntervalError, 255, 94,0,  FALSE}     //  65 
+    , {doIntervalUpperDigit, 128, 66,0,  TRUE}     //  66      interval-upper
+    , {doNOP, 125 /* } */, 69,0,  TRUE}     //  67 
+    , {doIntervalError, 255, 94,0,  FALSE}     //  68 
+    , {doNGInterval, 63 /* ? */, 17,0,  TRUE}     //  69      interval-type
+    , {doPossesiveInterval, 43 /* + */, 17,0,  TRUE}     //  70 
+    , {doInterval, 255, 17,0,  FALSE}     //  71 
+    , {doBackslashA, 65 /* A */, 2,0,  TRUE}     //  72      backslash
+    , {doBackslashB, 66 /* B */, 2,0,  TRUE}     //  73 
+    , {doBackslashb, 98 /* b */, 2,0,  TRUE}     //  74 
+    , {doBackslashd, 100 /* d */, 12,0,  TRUE}     //  75 
+    , {doBackslashD, 68 /* D */, 12,0,  TRUE}     //  76 
+    , {doBackslashG, 71 /* G */, 2,0,  TRUE}     //  77 
+    , {doNamedChar, 78 /* N */, 12,0,  TRUE}     //  78 
+    , {doProperty, 112 /* p */, 12,0,  FALSE}     //  79 
+    , {doProperty, 80 /* P */, 12,0,  FALSE}     //  80 
+    , {doEnterQuoteMode, 81 /* Q */, 2,0,  TRUE}     //  81 
+    , {doBackslashS, 83 /* S */, 12,0,  TRUE}     //  82 
+    , {doBackslashs, 115 /* s */, 12,0,  TRUE}     //  83 
+    , {doBackslashW, 87 /* W */, 12,0,  TRUE}     //  84 
+    , {doBackslashw, 119 /* w */, 12,0,  TRUE}     //  85 
+    , {doBackslashX, 88 /* X */, 12,0,  TRUE}     //  86 
+    , {doBackslashx, 120 /* x */, 12,0,  TRUE}     //  87 
+    , {doBackslashZ, 90 /* Z */, 2,0,  TRUE}     //  88 
+    , {doBackslashz, 122 /* z */, 2,0,  TRUE}     //  89 
+    , {doOctal, 48 /* 0 */, 12,0,  TRUE}     //  90 
+    , {doBackRef, 128, 12,0,  TRUE}     //  91 
+    , {doEscapeError, 253, 94,0,  FALSE}     //  92 
+    , {doLiteralChar, 255, 12,0,  TRUE}     //  93 
+    , {doExit, 255, 94,0,  TRUE}     //  94      errorDeath
 };
 static const char * const RegexStateNames[] = {    0,
     "start",
@ -224,6 +230,8 @@ static const char * const RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
+    0,
    0,
     "open-paren-lookbehind",
    0,
@ -281,6 +289,7 @@ static const char * const RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
    0,
     "errorDeath",
    0};
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -119,6 +119,8 @@ open-paren-extended:
    's'                  n  paren-flag                              doMatchMode
    'm'                  n  paren-flag                              doMatchMode
    '-'                  n  paren-flag                              doMatchMode
+    '('                  n  errorDeath                              doConditionalExpr
+    '{'                  n  errorDeath                              doPerlInline
    default                 errorDeath                              doBadOpenParenType
    
 open-paren-lookbehind:
@ -230,8 +232,9 @@ backslash:
   'Z'                   n  term                                    doBackslashZ
   'z'                   n  term                                    doBackslashz
   '0'                   n  expr-quant                              doOctal
-   digit_char	            expr-quant                              doBackRef         #  Will scan multiple digits
-   default               n  expr-quant		                        doLiteralChar     #  Escaped literal char.		       
+   digit_char	         n  expr-quant                              doBackRef         #  Will scan multiple digits
+   eof                      errorDeath                              doEscapeError
+   default               n  expr-quant		                    doLiteralChar     #  Escaped literal char.		       

    
    
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -24,8 +24,6 @@
 #include "uvectr32.h"
 #include "regeximp.h"

-//#include "stdio.h"
-//#include "malloc.h"

 U_NAMESPACE_BEGIN

@ -222,9 +220,14 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
 UBool RegexMatcher::find() {
    // Start at the position of the last match end.  (Will be zero if the
    //   matcher has been reset.
+    //
+    // TODO:  Needs optimization
    UErrorCode status = U_ZERO_ERROR;

    int32_t  startPos;
+    // TODO:  needs to go up to the very end, so a pattern that can match a zero length
+    //        string can match at the end of a string.  Can't do until loop-breaking
+    //        is added to the engine, though, otherwise it triggers too many bugs.
    for (startPos=fMatchEnd; startPos < fInputLength; startPos = fInput->moveIndex32(startPos, 1)) {
        MatchAt(startPos, status);
        if (U_FAILURE(status)) {
@ -477,22 +480,27 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
 //                            We are at a boundary if the this char and the original chars are
 //                               opposite in membership in \w set
 //
+//          parameters:   pos   - the current position in the input buffer
+//                        start - the position where the match operation started.
+//                                don't backup before this position when looking back
+//                                for a preceding base char.
+//
 //--------------------------------------------------------------------------------
 UBool RegexMatcher::isWordBoundary(int32_t pos) {
    UBool isBoundary = FALSE;
-    if (pos >=  fInputLength) {
-        // off end of string.  Not a boundary.
-        return FALSE;
-    }
+    UBool cIsWord    = FALSE;
    
-    // Determine whether char c at Pos is a member of the word set of chars.
-    UChar32  c = fInput->char32At(pos);
-    int8_t ctype = u_charType(c);
-    if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
-        // Current char is a combining one.  Not a boundary.
-        return FALSE;
+    // Determine whether char c at current position is a member of the word set of chars.
+    // If we're off the end of the string, behave as though we're not at a word char.
+    if (pos < fInputLength) {
+        UChar32  c = fInput->char32At(pos);
+        int8_t ctype = u_charType(c);
+        if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
+            // Current char is a combining one.  Not a boundary.
+            return FALSE;
+        }
+        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
    }
-    UBool cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
    
    // Back up until we come to a non-combining char, determine whether
    //  that char is a word char.
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -14,6 +14,7 @@
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

 #include "unicode/uchar.h"
+#include "unicode/ucnv.h"
 #include "intltest.h"
 #include "regextst.h"
 #include "uvector.h"
@ -59,6 +60,10 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
        case 5: name = "Errors";
            if (exec) Errors(); 
            break;
+        case 6: name = "PerlTests";
+            // if (exec) PerlTests();
+            break;
+

        default: name = ""; 
            break; //needed to end loop
@ -368,7 +373,7 @@ void RegexTest::Basic() {
 //
 #if 0
    {
-    REGEX_TESTLM("(abc)*+a", "abcabcabc", FALSE, FALSE);
+    REGEX_TESTLM("\\ba\\b", "-a", FALSE, TRUE);
    // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
    // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
@ -1109,6 +1114,8 @@ void RegexTest::Extended() {

    // \b \B
    REGEX_FIND( ".*?\\b(.).*", "<0>  $%^&*( <1>h</1>ello123%^&*()gxx</0>");
+    REGEX_FIND( "\\ba\\b", "-<0>a</0>");
+    REGEX_FIND("\\by\\b",  "xy");

                 // Finds first chars of up to 5 words
    REGEX_FIND( "(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?(?:.*?\\b(\\w))?",
@ -1319,5 +1326,297 @@ void RegexTest::Errors() {

 }

+
+//---------------------------------------------------------------------------
+//
+//      PerlTests     Run Perl's regexp tests.
+//
+//---------------------------------------------------------------------------
+//
+//  ReplaceFirst   Replace the first match of a regular expression within a string.
+//
+//      target       The string to search.  It is overwritten with the result of
+//                   RegexMatcher::replaceFirst() when a matcher could be created.
+//      pattern      Source of the regular expression to compile.
+//      replacement  Replacement text passed to RegexMatcher::replaceFirst().
+//      status       ICU error code.  Nothing is done, and FALSE is returned,
+//                   if it already indicates a failure on entry.
+//
+//      Returns TRUE if, after the replacement, the matcher reports a start
+//      position other than -1 for group 0.  Returns FALSE otherwise, including
+//      when the pattern or the matcher could not be created.
+//
+static UBool ReplaceFirst(UnicodeString &target, const UnicodeString &pattern,
+                         const UnicodeString &replacement, UErrorCode &status)
+{
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    UParseError pe;
+    RegexPattern *pat = NULL;
+    RegexMatcher *mat = NULL;
+    UBool retVal      = FALSE;
+
+    pat = RegexPattern::compile(pattern, 0, pe, status);
+    if (pat != NULL) {
+        mat = pat->matcher(target, status);
+    }
+    if (mat != NULL) {
+        target = mat->replaceFirst(replacement, status);
+        // Query the matcher only when it exists.  mat stays NULL when the
+        //   pattern failed to compile, and must not be dereferenced then.
+        retVal = (mat->start(0, status) != -1);
+    }
+    delete mat;
+    delete pat;
+    return retVal;
+}
+
+//
+//  cstar   Extract the contents of a UnicodeString into a char buffer, for use
+//          where a plain C string is required (file names, messages).
+//
+//          The result is held in a single static buffer that is shared by all
+//          calls; each call overwrites the result of the previous one.
+//
+static char *cstar(const UnicodeString &s) {
+    const int32_t  kBufSize = 1000;
+    static char    charBuf[kBufSize];
+    UErrorCode     ec = U_ZERO_ERROR;
+
+    s.extract(charBuf, kBufSize, NULL, ec);
+    charBuf[kBufSize-1] = 0;     // Force a terminator into the last slot, whatever extract() did.
+    return charBuf;
+}
+
+//-------------------------------------------------------------------------------
+//
+//  Read a text data file, convert it to UChars, and return the data
+//    in one big UChar * buffer, which the caller must release with delete [].
+//
+//    fileName   Path of the file to read.  It is opened in binary mode.
+//    ulen       Receives the length, in UChars, reported by the converter for
+//               the converted text.  Set to zero when the conversion fails.
+//    status     ICU error code.  Nothing is done if it indicates a failure
+//               on entry.  It is set by the converter functions on failure.
+//
+//    Returns the newly allocated buffer, or NULL on failure.
+//    NOTE:  a failure to open or read the file is reported through errln()
+//           and a NULL return only; status is left unchanged in that case,
+//           so callers need to check the returned pointer as well as status.
+//
+//--------------------------------------------------------------------------------
+UChar *RegexTest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
+    UChar       *retPtr  = NULL;
+    char        *fileBuf = NULL;
+    UConverter* conv     = NULL;
+    FILE        *f       = NULL;
+
+    ulen = 0;
+    // The braces keep the locals declared below out of the scope of the
+    //   cleanUpAndReturn label, so that the gotos only ever leave a scope.
+    {
+        if (U_FAILURE(status)) {
+            return retPtr;
+        }
+
+        //
+        //  Open the file.
+        //
+        f = fopen(fileName, "rb");
+        if (f == 0) {
+            errln("Error opening test data file %s\n", fileName);
+            goto cleanUpAndReturn;
+        }
+        //
+        //  Read it in
+        //
+        fseek( f, 0, SEEK_END);
+        int fileSize = ftell(f);
+        if (fileSize <= 0) {
+            // Check the size before allocating.  ftell() reports failure with a
+            //   negative value, which must not be used as an array size.
+            errln("Error reading test data file.");
+            goto cleanUpAndReturn;
+        }
+        fileBuf = new char[fileSize];
+        fseek(f, 0, SEEK_SET);
+        int amt_read = fread(fileBuf, 1, fileSize, f);
+        if (amt_read != fileSize) {
+            errln("Error reading test data file.");
+            goto cleanUpAndReturn;
+        }
+
+        //
+        // Look for a Unicode Signature (BOM) on the data just read
+        //
+        int32_t        signatureLength;
+        const char *   fileBufC = fileBuf;
+        const char*    encoding = ucnv_detectUnicodeSignature(
+            fileBuf, fileSize, &signatureLength, &status);
+        if(encoding!=NULL ){
+            // Skip over the signature bytes; they are not part of the text.
+            fileBufC  += signatureLength;
+            fileSize  -= signatureLength;
+        }
+
+        //
+        // Open a converter to take the rule file to UTF-16
+        //
+        conv = ucnv_open(encoding, &status);
+        if (U_FAILURE(status)) {
+            goto cleanUpAndReturn;
+        }
+
+        //
+        // Convert the rules to UChar.
+        //  Preflight first to determine required buffer size.
+        //
+        ulen = ucnv_toUChars(conv,
+            NULL,           //  dest,
+            0,              //  destCapacity,
+            fileBufC,
+            fileSize,
+            &status);
+        if (status == U_BUFFER_OVERFLOW_ERROR) {
+            // Buffer Overflow is expected from the preflight operation.
+            status = U_ZERO_ERROR;
+        }
+
+        retPtr = new UChar[ulen+1];
+        ucnv_toUChars(conv,
+            retPtr,       //  dest,
+            ulen+1,
+            fileBufC,
+            fileSize,
+            &status);
+    }
+
+cleanUpAndReturn:
+    if (f != NULL) {
+        // f is still NULL when the open failed; fclose() must not be given NULL.
+        fclose(f);
+    }
+    delete [] fileBuf;          // Allocated with new [], so release with delete [].
+    if (conv != NULL) {
+        ucnv_close(conv);
+    }
+    if (U_FAILURE(status)) {
+        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
+        delete [] retPtr;
+        retPtr = 0;
+        ulen   = 0;
+    }
+    return retPtr;
+}
+
+
+//-------------------------------------------------------------------------------
+//
+//   PerlTests  - Run Perl's regular expression tests
+//
+//                Reads the file re_tests.txt, located relative to the test data
+//                directory, and processes it one line at a time.  Each line is
+//                split on tabs into fields:
+//                    fields[0]   the pattern, optionally quoted and followed by flags.
+//                    fields[1]   the text to be searched.
+//                    fields[2]   expected result codes:  'y' a match is expected,
+//                                'c' a pattern compile error is expected,
+//                                'i' an accepted ICU incompatibility; skip the test.
+//                Patterns that fail with U_REGEX_UNIMPLEMENTED are counted and skipped.
+//
+//-------------------------------------------------------------------------------
+void RegexTest::PerlTests() {
+    UErrorCode  status = U_ZERO_ERROR;
+    UParseError pe;
+
+    //
+    //  Open and read the test data file.
+    //
+    const char *testDataDirectory = loadTestData(status);
+    UnicodeString tdd(testDataDirectory);
+    ReplaceFirst(tdd, "([/\\\\])out[/\\\\]testdata", "$1re_tests.txt", status);
+
+    int    len;
+    UChar *testData = ReadAndConvertFile(cstar(tdd), len, status);
+    if (U_FAILURE(status) || testData == NULL) {
+        // Nothing to test with.  Stop here, before anything dereferences
+        //   the results of operations that depend on the file contents.
+        errln("PerlTests: unable to read the test data file.  ICU status \"%s\"\n", u_errorName(status));
+        delete [] testData;
+        return;
+    }
+
+    //
+    //  Put the test data into a UnicodeString
+    //    (The string refers to the testData buffer; the buffer is deleted
+    //     at the end of this function, after all of the matchers.)
+    //
+    UnicodeString ruleSourceS(FALSE, testData, len);
+
+    //
+    //  Regex to break the input file into lines, and strip the new lines.
+    //     One line per match, capture group one is the desired data.
+    //
+    RegexPattern* linePat = RegexPattern::compile("(.+?)[\\r\\n]+", 0, pe, status);
+
+    //
+    //  Regex to split a test file line into fields.
+    //    There are six fields, separated by tabs.
+    //
+    RegexPattern* fieldPat = RegexPattern::compile("\\t", 0, pe, status);
+
+    //
+    //  Regex to identify test patterns with flag settings, and to separate them.
+    //    Test patterns with flags look like 'pattern'i
+    //    Test patterns without flags are not quoted:   pattern
+    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
+    //
+    RegexPattern *flagPat = RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe, status);
+
+    //
+    // Regex to find ${bang}.  Perl doesn't put literal '!'s into patterns.
+    //
+    RegexPattern *bangPat = RegexPattern::compile("\\$\\{bang\\}", 0, pe, status);
+
+    //
+    //  Create the matchers only after all of the patterns are known to be good,
+    //    so that a failed compile is never dereferenced.
+    //
+    RegexMatcher* lineMat = NULL;
+    RegexMatcher* flagMat = NULL;
+    RegexMatcher* bangMat = NULL;
+    if (U_SUCCESS(status) && linePat != NULL && fieldPat != NULL &&
+        flagPat != NULL && bangPat != NULL) {
+        lineMat = linePat->matcher(ruleSourceS, status);
+        flagMat = flagPat->matcher("", status);
+        bangMat = bangPat->matcher("", status);
+    }
+
+    if (U_FAILURE(status) || lineMat == NULL || flagMat == NULL || bangMat == NULL) {
+        errln("PerlTests: error setting up the helper regular expressions.  ICU status \"%s\"\n",
+            u_errorName(status));
+    } else {
+        // Code units for the letters tested for in the flag and result fields.
+        const UChar UChar_c = 0x63;
+        const UChar UChar_i = 0x69;
+        const UChar UChar_m = 0x6d;
+        const UChar UChar_x = 0x78;
+        const UChar UChar_y = 0x79;
+
+        int32_t  lineNum = 0;
+        int32_t  skippedUnimplementedCount = 0;
+        while (lineMat->find()) {
+            lineNum++;
+            UnicodeString line = lineMat->group(1, status);
+            UnicodeString fields[7];
+            fieldPat->split(line, fields, 7, status);
+
+            flagMat->reset(fields[0]);
+            flagMat->matches(status);
+            UnicodeString pattern  = flagMat->group(2, status);
+            bangMat->reset(pattern);
+            pattern = bangMat->replaceAll("\\u0021", status);
+            UnicodeString flagStr = flagMat->group(3, status);
+            // printf("pattern = %s\n", cstar(pattern));
+            // printf("   flags = %s\n", cstar(flags));
+            if (U_FAILURE(status)) {
+                // Failure in the test harness itself.  Stop, but leave the loop
+                //   with break rather than return so the clean up below runs.
+                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
+                break;
+            }
+
+            int32_t flags = 0;
+            if (flagStr.indexOf(UChar_i) != -1) {
+                flags |= UREGEX_CASE_INSENSITIVE;
+            }
+            if (flagStr.indexOf(UChar_m) != -1) {
+                flags |= UREGEX_MULTILINE;
+            }
+            if (flagStr.indexOf(UChar_x) != -1) {
+                flags |= UREGEX_COMMENTS;
+            }
+
+            //
+            // Compile the test pattern.
+            //
+            status = U_ZERO_ERROR;
+            RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
+            if (status == U_REGEX_UNIMPLEMENTED) {
+                skippedUnimplementedCount++;
+                delete testPat;
+                status = U_ZERO_ERROR;
+                continue;
+            }
+
+            if (U_FAILURE(status)) {
+                // Some tests are supposed to generate errors.
+                //   Only report an error for tests that are supposed to succeed.
+                if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
+                    fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
+                {
+                    errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
+                }
+                status = U_ZERO_ERROR;
+                delete testPat;
+                continue;
+            }
+
+            if (fields[2].indexOf(UChar_i) >= 0) {
+                // ICU should skip this test.
+                delete testPat;
+                continue;
+            }
+
+            if (fields[2].indexOf(UChar_c) >= 0) {
+                // This pattern should have caused a compilation error, but didn't.
+                errln("line %d: Expected a pattern compile error, got success.", lineNum);
+                delete testPat;
+                continue;
+            }
+
+
+            //
+            // Run the test
+            //
+            RegexMatcher *testMat = testPat->matcher(fields[1], status);
+            if (U_FAILURE(status) || testMat == NULL) {
+                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
+                status = U_ZERO_ERROR;
+                delete testMat;
+                delete testPat;
+                continue;
+            }
+            UBool found = testMat->find();
+            UBool expected = FALSE;
+            if (fields[2].indexOf(UChar_y) >=0) {
+                expected = TRUE;
+            }
+            if (expected != found) {
+                errln("line %d: Expected %smatch, got %smatch", 
+                    lineNum, expected?"":"no ", found?"":"no " );
+            }
+
+            delete testMat;
+            delete testPat;
+        }
+
+        logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
+    }
+
+    //
+    //  Clean up.  Matchers go before the patterns that created them, and the
+    //    raw test data buffer goes last.  Deleting a NULL pointer is harmless.
+    //
+    delete bangMat;
+    delete bangPat;
+    delete flagMat;
+    delete flagPat;
+    delete fieldPat;
+    delete lineMat;
+    delete linePat;
+    delete [] testData;
+}
+
+
+
+
+
+
 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */

--- a/icu4c/source/test/intltest/regextst.h
+++ b/icu4c/source/test/intltest/regextst.h
@ -30,12 +30,14 @@ public:
    virtual void Basic();
    virtual void Extended();
    virtual void Errors();
+    virtual void PerlTests();

    // The following functions are internal to the regexp tests.
    virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int line);
    virtual void regex_find(const char *pat, const char *input, UErrorCode expectedStatus, int line);
    virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
                            UErrorCode expectedStatus, int line);
+    virtual UChar *ReadAndConvertFile(const char *fileName, int &len, UErrorCode &status);
 };

 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS