ICU-2422 Regexp \N{NAME} added. (ZeroLengthMatch)* optimizations improved.

X-SVN-Rev: 11228
2025-04-08 23:10:40 +00:00 · 2003-03-04 01:10:46 +00:00 · 2003-03-04 01:10:46 +00:00 · 09baec1685
commit 09baec1685
parent 234855629c
6 changed files with 85 additions and 52 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -1018,7 +1018,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
            int32_t   saveStateLoc = blockTopLoc(TRUE);
            int32_t   dataLoc = -1;

-            if (possibleNullMatch(saveStateLoc, fRXPat->fCompiledPat->size()-1)) {
+            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
                insertOp(saveStateLoc);
                dataLoc =  fRXPat->fFrameSize;
                fRXPat->fFrameSize++;
@ -1132,7 +1132,7 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doLiteralChar:
        // We've just scanned a "normal" character from the pattern, 
-        literalChar();
+        literalChar(fC.fChar);
        break;


@ -1302,10 +1302,6 @@ UBool RegexCompile::doParseActions(EParseAction action)
        break;


-    case doNamedChar:            // \N{NAMED_CHAR}
-        //  TODO:  implement
-        error(U_REGEX_UNIMPLEMENTED);
-        break;

    case doPossesivePlus:
        // Possessive ++ quantifier.
@ -1503,7 +1499,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
 //                         If we aren't in a pattern string, begin one now.
 //
 //------------------------------------------------------------------------------
-void RegexCompile::literalChar()  {
+void RegexCompile::literalChar(UChar32 c)  {
    int32_t           op;            // An operation in the compiled pattern.
    int32_t           opType;
    int32_t           patternLoc;   // A position in the compiled pattern.
@ -1521,17 +1517,17 @@ void RegexCompile::literalChar()  {
    if (fStringOpStart == -1) {
        // First char of a string in the pattern.
        // Emit a OneChar op into the compiled pattern.
-        emitONE_CHAR(fC.fChar);
+        emitONE_CHAR(c);

        // Also add it to the string pool, in case we get a second adjacent literal
        //   and want to change form ONE_CHAR to STRING
        fStringOpStart = fRXPat->fLiteralText.length();
-        fRXPat->fLiteralText.append(fC.fChar);
+        fRXPat->fLiteralText.append(c);
        return;
    }
    
    // We are adding onto an existing string
-    fRXPat->fLiteralText.append(fC.fChar);
+    fRXPat->fLiteralText.append(c);

    // If the most recently emitted op is a URX_ONECHAR, change it to a string op.
    op     = fRXPat->fCompiledPat->lastElementi();
@ -1987,8 +1983,7 @@ void        RegexCompile::compileSet(UnicodeSet *theSet)
            // The set contains only a single code point.  Put it into
            //   the compiled pattern as a single char operation rather
            //   than a set, and discard the set itself.
-            int32_t  charToken = URX_BUILD(URX_ONECHAR, firstSetChar);
-            fRXPat->fCompiledPat->addElement(charToken, *fStatus);
+            literalChar(firstSetChar);
            delete theSet;
        }
        break;
@ -2083,6 +2078,9 @@ UBool  RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
 //                     value may be shorter than the actual minimum; it must
 //                     never be longer.
 //
+//                     start and end are the range of p-code operations to be
+//                     examined.  The endpoints are included in the range.
+//
 //----------------------------------------------------------------------------------------
 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
    if (U_FAILURE(*fStatus)) {
@ -2097,11 +2095,17 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
    int32_t    op;
    int32_t    opType;
    int32_t    currentLen = 0;
-    UVector32  lengthSoFar(end+1, *fStatus);
-    lengthSoFar.setSize(end+1);

-    for (loc=start; loc<=end; loc++) {
-        lengthSoFar.setElementAt(INT32_MAX, loc);
+
+    // forwardedLength is a vector holding minimum-match-length values that
+    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
+    //   It must be one longer than the pattern being checked because some  ops
+    //   will jmp to a end-of-block+1 location from within a block, and we must
+    //   count those when checking the block.
+    UVector32  forwardedLength(end+2, *fStatus);
+    forwardedLength.setSize(end+2);
+    for (loc=start; loc<=end+1; loc++) {
+        forwardedLength.setElementAt(INT32_MAX, loc);
    }

    for (loc = start; loc<=end; loc++) {
@ -2112,8 +2116,8 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
        // If the op we are now at was the destination of a branch in the pattern,
        // and that path has a shorter minimum length than the current accumulated value,
        // replace the current accumulated value.
-        if (lengthSoFar.elementAti(loc) < currentLen) {
-            currentLen = lengthSoFar.elementAti(loc);
+        if (forwardedLength.elementAti(loc) < currentLen) {
+            currentLen = forwardedLength.elementAti(loc);
        }

        switch (opType) {
@ -2165,12 +2169,13 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
                if (jmpDest < loc) {
                    // Loop of some kind.  Can safely ignore, the worst that will happen
                    //  is that we understate the true minimum length
-                    currentLen = lengthSoFar.elementAti(loc+1);
+                    currentLen = forwardedLength.elementAti(loc+1);
                   
                } else {
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
-                    if (lengthSoFar.elementAti(jmpDest) > currentLen) {
-                        lengthSoFar.setElementAt(currentLen, jmpDest);
+                    U_ASSERT(jmpDest <= end+1);
+                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
+                        forwardedLength.setElementAt(currentLen, jmpDest);
                    }
                }
            }
@ -2179,7 +2184,7 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
        case URX_FAIL:
            // Fails are kind of like a branch, except that the min length was
            //   propagated already, by the state save.
-            currentLen = lengthSoFar.elementAti(loc+1);
+            currentLen = forwardedLength.elementAti(loc+1);
            break;


@ -2189,8 +2194,8 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
                //             of the state save.
                int32_t  jmpDest = URX_VAL(op);
                if (jmpDest > loc) {
-                    if (currentLen < lengthSoFar.elementAti(jmpDest)) {
-                        lengthSoFar.setElementAt(currentLen, jmpDest);
+                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
+                        forwardedLength.setElementAt(currentLen, jmpDest);
                    }
                } 
            }
@ -2268,8 +2273,14 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
            }
            
        }
-        return currentLen;
-        
+
+    // We have finished walking through the ops.  Check whether some forward jump
+    //   propagated a shorter length to location end+1.
+    if (forwardedLength.elementAti(end+1) < currentLen) {
+        currentLen = forwardedLength.elementAti(end+1);
+    }
+            
+    return currentLen;
 }


@ -2297,11 +2308,11 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
    int32_t    op;
    int32_t    opType;
    int32_t    currentLen = 0;
-    UVector32  lengthSoFar(end+1, *fStatus);
-    lengthSoFar.setSize(end+1);
+    UVector32  forwardedLength(end+1, *fStatus);
+    forwardedLength.setSize(end+1);

    for (loc=start; loc<=end; loc++) {
-        lengthSoFar.setElementAt(0, loc);
+        forwardedLength.setElementAt(0, loc);
    }

    for (loc = start; loc<=end; loc++) {
@ -2312,8 +2323,8 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
        // If the op we are now at was the destination of a branch in the pattern,
        // and that path has a longer maximum length than the current accumulated value,
        // replace the current accumulated value.
-        if (lengthSoFar.elementAti(loc) > currentLen) {
-            currentLen = lengthSoFar.elementAti(loc);
+        if (forwardedLength.elementAti(loc) > currentLen) {
+            currentLen = forwardedLength.elementAti(loc);
        }

        switch (opType) {
@ -2388,8 +2399,8 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
                    currentLen = INT32_MAX;
                } else {
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
-                    if (lengthSoFar.elementAti(jmpDest) < currentLen) {
-                        lengthSoFar.setElementAt(currentLen, jmpDest);
+                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
+                        forwardedLength.setElementAt(currentLen, jmpDest);
                    }
                    currentLen = 0;
                }
@ -2399,7 +2410,7 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
        case URX_FAIL:
            // Fails are kind of like a branch, except that the max length was
            //   propagated already, by the state save.
-            currentLen = lengthSoFar.elementAti(loc+1);
+            currentLen = forwardedLength.elementAti(loc+1);
            break;


@ -2411,8 +2422,8 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
                //               match length is unbounded.
                int32_t  jmpDest = URX_VAL(op);
                if (jmpDest > loc) {
-                    if (currentLen > lengthSoFar.elementAti(jmpDest)) {
-                        lengthSoFar.setElementAt(currentLen, jmpDest);
+                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
+                        forwardedLength.setElementAt(currentLen, jmpDest);
                    }
                } else {
                    currentLen = INT32_MAX;
@ -2536,6 +2547,7 @@ static const UChar      chRParen    = 0x29;
 static const UChar      chLBracket  = 0x5b;
 static const UChar      chRBracket  = 0x5d;
 static const UChar      chRBrace    = 0x7d;
+static const UChar      chUpperN    = 0x4E;
 static const UChar      chLowerP    = 0x70;
 static const UChar      chUpperP    = 0x50;

@ -2780,7 +2792,7 @@ UnicodeSet *RegexCompile::scanProp() {
        return NULL;
    }

-    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP);
+    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP || fC.fChar == chUpperN);

    // enclose the \p{property} from the regex pattern source in  [brackets]
    UnicodeString setPattern;
@ -2800,8 +2812,16 @@ UnicodeSet *RegexCompile::scanProp() {
    }
    setPattern.append(chRBracket);

+    uint32_t   usetFlags = 0;
+    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
+        usetFlags |= USET_CASE_INSENSITIVE;
+    }
+    if (fModeFlags & UREGEX_COMMENTS) {
+        usetFlags |= USET_IGNORE_SPACE;
+    }
+
    // Build the UnicodeSet from the set pattern we just built up in a string.
-    uset = new UnicodeSet(setPattern, *fStatus);
+    uset = new UnicodeSet(setPattern,  usetFlags, *fStatus);
    if (U_FAILURE(*fStatus)) {
        delete uset;
        uset =  NULL;
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -97,9 +97,9 @@ private:
                                                     //  there is space to add an opcode there.
    void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for
                                                     //   a reference to a UnicodeSet.
-    void        compileInterval(int32_t InitOp,       // Generate the code for a {min,max} quantifier.
+    void        compileInterval(int32_t InitOp,      // Generate the code for a {min,max} quantifier.
                               int32_t LoopOp);
-    void        literalChar();                       // Compile a literal char
+    void        literalChar(UChar32 c);              // Compile a literal char
    void        fixLiterals(UBool split=FALSE);      // Fix literal strings.
    void        insertOp(int32_t where);             // Open up a slot for a new op in the
                                                     //   generated code at the specified location.
@ -186,10 +186,12 @@ private:
    int32_t                       fIntervalUpper;    // Placed here temporarily, when pattern is
                                                     //   initially scanned.  Each new interval
                                                     //   encountered overwrites these values.
-
                                                     //   -1 for the upper interval value means none
                                                     //   was specified (unlimited occurences.)

+    int32_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
+                                                     //   pattern, valid while remainder of name is
+                                                     //   scanned.
 };

 U_NAMESPACE_END
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -33,7 +33,6 @@ enum Regex_PatternParseAction {
    doBackslashs,
    doOctal,
    doNGOpt,
-    doNamedChar,
    doBackslashw,
    doPossesiveStar,
    doMismatchedParenErr,
@ -190,7 +189,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doBackslashd, 100 /* d */, 14,0,  TRUE}     //  82 
    , {doBackslashD, 68 /* D */, 14,0,  TRUE}     //  83 
    , {doBackslashG, 71 /* G */, 2,0,  TRUE}     //  84 
-    , {doNamedChar, 78 /* N */, 14,0,  TRUE}     //  85 
+    , {doProperty, 78 /* N */, 14,0,  FALSE}     //  85 
    , {doProperty, 112 /* p */, 14,0,  FALSE}     //  86 
    , {doProperty, 80 /* P */, 14,0,  FALSE}     //  87 
    , {doEnterQuoteMode, 81 /* Q */, 2,0,  TRUE}     //  88 
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -237,7 +237,7 @@ backslash:
   'd'                   n  expr-quant                              doBackslashd
   'D'                   n  expr-quant                              doBackslashD
   'G'                   n  term                                    doBackslashG
-   'N'                   n  expr-quant                              doNamedChar      #   \N{NAME}  named char
+   'N'                      expr-quant                              doProperty       #   \N{NAME}  named char
   'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
   'P'                      expr-quant                              doProperty
   'Q'                   n  term                                    doEnterQuoteMode
@ -255,9 +255,6 @@ backslash:
   default               n  expr-quant		                    doLiteralChar     #  Escaped literal char.		       

    
-    
-
-    
 #
 # errorDeath.   This state is specified as the next state whenever a syntax error
 #               in the source rules is detected.  Barring bugs, the state machine will never
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -394,10 +394,10 @@ void RegexTest::Basic() {
 //
 #if 0
    {
-        // REGEX_TESTLM("^a (?#xxx) (?#yyy) {3}c", "accc", FALSE, FALSE);
+        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
-        UErrorCode  status;
-        RegexPattern::compile("^a (?#xxx) (?#yyy) {3}c", UREGEX_COMMENTS, pe, status);
+        UErrorCode  status = U_ZERO_ERROR;
+        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
@ -1219,7 +1219,6 @@ void RegexTest::Extended() {
 //---------------------------------------------------------------------------
 void RegexTest::Errors() {
    // \escape sequences that aren't implemented yet.
-    REGEX_ERR("named chars \\N{GREEK CAPITAL LETTER ALPHA} not implementd", 1, 14, U_REGEX_UNIMPLEMENTED);
    REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -262,4 +262,20 @@

 ".*(?<=: ?)(\w*)"                "<0>1:one  2: two 3:<1>three</1></0>   "

-#"(?<=abc*)xyz"           dt    "abcccc<0>xyz</0>"
+#
+# Named Characters
+#
+"a\N{LATIN SMALL LETTER B}c"    "<0>abc</0>"
+"a\N{LATIN SMALL LETTER B}c"  i  "<0>abc</0>"
+"a\N{LATIN SMALL LETTER B}c"  i  "<0>aBc</0>"
+"a\N{LATIN SMALL LETTER B}c"     "aBc"
+
+"\N{FULL STOP}*"                 "<0>...</0>abc"
+
+
+#
+#  Random debugging, Temporary
+#
+#"^(?:a?b?)*$"	        d         "a--"	
+"^(?:a?b?)*$"	                  "a--"
+