ICU-2422 regexp, free-format support added, some match mode bugs fixed

X-SVN-Rev: 11035
2025-04-13 08:53:20 +00:00 · 2003-02-13 01:10:22 +00:00 · 2003-02-13 01:10:22 +00:00 · 14da9b81e3
commit 14da9b81e3
parent 22d336f746
8 changed files with 118 additions and 52 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -175,20 +175,21 @@ static void caseClose(UnicodeSet *theSet) {
 //----------------------------------------------------------------------------------------
 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status)
 {
-    fStatus             = &status;
+    fStatus           = &status;

-    fRXPat          = rxp;
-    fScanIndex      = 0;
-    fNextIndex      = 0;
-    fPeekChar       = -1;
-    fLineNum        = 1;
-    fCharNum        = 0;
-    fQuoteMode      = FALSE;
-    fFreeForm       = FALSE;
-    fModeFlags      = fRXPat->fFlags;
+    fRXPat            = rxp;
+    fScanIndex        = 0;
+    fNextIndex        = 0;
+    fPeekChar         = -1;
+    fLineNum          = 1;
+    fCharNum          = 0;
+    fQuoteMode        = FALSE;
+    fInBackslashQuote = FALSE;
+    fModeFlags        = fRXPat->fFlags;
+    fEOLComments      = TRUE;

-    fMatchOpenParen  = -1;
-    fMatchCloseParen = -1;
+    fMatchOpenParen   = -1;
+    fMatchCloseParen  = -1;

    if (U_FAILURE(status)) {
        return;
@ -304,7 +305,7 @@ void    RegexCompile::compile(

        U_ASSERT(state != 0);

-        // Find the state table element that matches the input char from the rule, or the
+        // Find the state table element that matches the input char from the pattern, or the
        //    class of the input character.  Start with the first table row for this
        //    state, then linearly scan forward until we find a row that matches the
        //    character.  The last row for each state always matches all characters, so
@ -337,7 +338,7 @@ void    RegexCompile::compile(
            }

            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
-                fC.fQuoted == FALSE &&                                      //   char is not escaped &&
+                fC.fQuoted == FALSE &&                                       //   char is not escaped &&
                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
                UnicodeSet *uniset = gRuleSets[tableEl->fCharClass-128];
                if (uniset->contains(fC.fChar)) {
@ -373,6 +374,10 @@ void    RegexCompile::compile(
            fStack[fStackPtr] = tableEl->fPushState;
        }

+        //
+        //  NextChar.  This is where characters are actually fetched from the pattern.
+        //             Happens under control of the 'n' tag in the state table.
+        //
        if (tableEl->fNextChar) {
            nextChar(fC);
        }
@ -1300,9 +1305,17 @@ UBool RegexCompile::doParseActions(EParseAction action)
        }
        break;

+    case doSuppressComments:
+        // We have just scanned a '(?'.  We now need to prevent the character scanner from
+        // treating a '#' as a to-the-end-of-line comment.
+        //   (This Perl compatibility just gets uglier and uglier to do...)
+        fEOLComments = FALSE;
+        break;
+


    default:
+        U_ASSERT(FALSE);
        error(U_REGEX_INTERNAL_ERROR);
        returnVal = FALSE;
        break;
@ -1863,8 +1876,8 @@ static const UChar      chUpperP    = 0x50;
 //----------------------------------------------------------------------------------------
 //
 //  nextCharLL    Low Level Next Char from the regex pattern.
-//                Get a char from the string,
-//                keep track of input position for error reporting.
+//                Get a char from the string, keep track of input position
+//                     for error reporting.
 //
 //----------------------------------------------------------------------------------------
 UChar32  RegexCompile::nextCharLL() {
@ -1929,9 +1942,6 @@ UChar32  RegexCompile::peekCharLL() {
 //---------------------------------------------------------------------------------
 void RegexCompile::nextChar(RegexPatternChar &c) {

-    // Unicode Character constants needed for the processing done by nextChar(),
-    //   in hex because literals wont work on EBCDIC machines.
-
    fScanIndex = fNextIndex;
    c.fChar    = nextCharLL();
    c.fQuoted  = FALSE;
@ -1944,39 +1954,60 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
            nextChar(c);        // recurse to get the real next char
        }
    }
+    else if (fInBackslashQuote) {
+        // The current character immediately follows a '\'
+        // Don't check for any further escapes, just return it as-is.
+        // Don't set c.fQuoted, because that would prevent the state machine from
+        //    dispatching on the character.
+        fInBackslashQuote = FALSE;
+    }
    else
    {
-        // We are not in a 'quoted region' of the source.
+        // We are not in a \Q quoted region \E of the source.
        //
-        if (fFreeForm && c.fChar == chPound) {
-            // Start of a comment.  Consume the rest of it.
-            //  The new-line char that terminates the comment is always returned.
-            //  It will be treated as white-space, and serves to break up anything
-            //    that might otherwise incorrectly clump together with a comment in
-            //    the middle (a variable name, for example.)
+        if (fModeFlags & UREGEX_COMMENTS) {
+            //
+            // We are in free-spacing and comments mode.
+            //  Scan through any white space and comments, until we 
+            //  reach a significant character or the end of input.
            for (;;) {
+                if (c.fChar == (UChar32)-1) {
+                    break;     // End of Input
+                }
+                if  (c.fChar == chPound && fEOLComments == TRUE) {
+                    // Start of a comment.  Consume the rest of it, until EOF or a new line
+                    for (;;) {
+                        c.fChar = nextCharLL();
+                        if (c.fChar == (UChar32)-1 ||  // EOF
+                            c.fChar == chCR        ||
+                            c.fChar == chLF        ||
+                            c.fChar == chNEL       ||
+                            c.fChar == chLS)       {
+                            break;
+                        }
+                    }
+                }
+                if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
+                    //  TODO:  is RuleWhiteSpace the right thing to use here?
+                    break;
+                }
                c.fChar = nextCharLL();
-                if (c.fChar == (UChar32)-1 ||  // EOF
-                    c.fChar == chCR     ||
-                    c.fChar == chLF     ||
-                    c.fChar == chNEL    ||
-                    c.fChar == chLS)       {break;}
            }
        }
-        if (c.fChar == (UChar32)-1) {
-            return;
-        }

        //
        //  check for backslash escaped characters.
-        //  Use UnicodeString::unescapeAt() to handle those that it can.
-        //  Otherwise just return the '\', and let the pattern parser deal with it.
        //
-        int32_t startX = fNextIndex;  // start and end positions of the
-        int32_t endX   = fNextIndex;  //   sequence following the '\'
+                int32_t startX = fNextIndex;  // start and end positions of the
+                int32_t endX   = fNextIndex;  //   sequence following the '\'
        if (c.fChar == chBackSlash) {
            if (gUnescapeCharSet->contains(peekCharLL())) {
-                nextCharLL();     // get & discard the peeked char.
+                //
+                // A '\' sequence that is handled by ICU's standard unescapeAt function.
+                //   Includes \uxxxx, \n, \r, many others.
+                //   Return the single equivalent character.
+                //
+                nextCharLL();                 // get & discard the peeked char.
                c.fQuoted = TRUE;
                c.fChar = fRXPat->fPattern.unescapeAt(endX);
                if (startX == endX) {
@ -1985,8 +2016,21 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
                fCharNum += endX - startX;
                fNextIndex = endX;
            }
+            else
+            {
+                // We are in a '\' escape that will be handled by the state table scanner.
+                // Just return the backslash, but remember that the following char is to
+                //  be taken literally.  TODO:  this is awkward
+                fInBackslashQuote = TRUE;
+            }
        }
    }
+
+    // re-enable # to end-of-line comments, in case they were disabled.
+    // They are disabled by the parser upon seeing '(?', but this lasts for
+    //  the fetching of the next character only.
+    fEOLComments = TRUE;
+
    // putc(c.fChar, stdout);
 }

--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -118,8 +118,10 @@ private:
                                                     //   in the rule input string.
    int32_t                       fNextIndex;        // Index of the next character, which
                                                     //   is the first character not yet scanned.
-    UBool                         fQuoteMode;        // Scan is in a quoted region
-    UBool                         fFreeForm;         // Scan mode is free-form, ignore spaces.
+    UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
+    UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
+    UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to 
+                                                     //   end of line comments, in favor of (?#...) comments.
    int                           fLineNum;          // Line number in input file.
    int                           fCharNum;          // Char position within the line.
    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -60,6 +60,7 @@ enum Regex_PatternParseAction {
    doPossesiveOpt,
    doEscapeError,
    doBackslashG,
+    doSuppressComments,
    doMatchModeParen,
    doOpt,
    doInterval,
@ -127,11 +128,11 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doOrOperator, 124 /* | */, 2,0,  TRUE}     //  20      expr-cont
    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  21 
    , {doNOP, 255, 2,0,  FALSE}     //  22 
-    , {doNOP, 63 /* ? */, 25,0,  TRUE}     //  23      open-paren-quant
+    , {doSuppressComments, 63 /* ? */, 25,0,  TRUE}     //  23      open-paren-quant
    , {doNOP, 255, 27,0,  FALSE}     //  24 
    , {doNOP, 35 /* # */, 46, 14, TRUE}     //  25      open-paren-quant2
    , {doNOP, 255, 29,0,  FALSE}     //  26 
-    , {doNOP, 63 /* ? */, 29,0,  TRUE}     //  27      open-paren
+    , {doSuppressComments, 63 /* ? */, 29,0,  TRUE}     //  27      open-paren
    , {doOpenCaptureParen, 255, 2, 14, FALSE}     //  28 
    , {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE}     //  29      open-paren-extended
    , {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE}     //  30 
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -64,7 +64,7 @@ start:
 #  term.  At a position where we can accept the start most items in a pattern.
 #
 term:
-    quoted               n expr-quant     	                        doLiteralChar
+    quoted               n expr-quant                               doLiteralChar
    rule_char            n expr-quant                               doLiteralChar
    '['                  n expr-quant                               doScanUnicodeSet
    '('                  n open-paren                     
@ -109,7 +109,7 @@ expr-cont:
 #                      branches into the normal parenthesis sequence as quickly as possible.
 #
 open-paren-quant:
-    '?'                  n  open-paren-quant2
+    '?'                  n  open-paren-quant2                      doSuppressComments
    default                 open-paren
    
 open-paren-quant2:
@ -122,7 +122,7 @@ open-paren-quant2:
 #                 determine what kind of quantifier it is - plain (, (?:, (?>, or whatever.
 #
 open-paren:
-    '?'                  n  open-paren-extended
+    '?'                  n  open-paren-extended                     doSuppressComments
    default                 term            ^expr-quant             doOpenCaptureParen
    
 open-paren-extended:
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -844,9 +844,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
                   // We are at the start input.  Success.
                   break;
               }
-               // Check the character just before the current pos.
+               // Check whether character just before the current pos is a new-line
+               //   unless we are at the end of input
               UChar  c = inputBuf[fp->fInputIdx - 1]; 
-               if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
+               if ((fp->fInputIdx < inputLen) && 
+                   (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
                   //  It's a new-line.  ^ is true.  Success.
                   break;                        
               }
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -215,7 +215,7 @@ RegexPattern  *RegexPattern::compile(
        return NULL;
    }

-    if ((flags & (UREGEX_CANON_EQ | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE)) != 0) {
+    if ((flags & UREGEX_CANON_EQ) != 0) {
        status = U_REGEX_UNIMPLEMENTED;
        return NULL;
    }
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -394,9 +394,12 @@ void RegexTest::Basic() {
 //
 #if 0
    {
-    REGEX_TESTLM("(a)|\\1", "x", FALSE, FALSE);
-    // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
-    // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
+        // REGEX_TESTLM("^a (?#xxx) (?#yyy) {3}c", "accc", FALSE, FALSE);
+        UParseError pe;
+        UErrorCode  status;
+        RegexPattern::compile("^a (?#xxx) (?#yyy) {3}c", UREGEX_COMMENTS, pe, status);
+        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
+        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
 #endif
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -216,3 +216,17 @@
 "(?:(?i)a)b"                    "<0>Ab</0>"
 "ab(?i)cd"	                 "<0>abCd</0>"
 "ab$cd"                         "abcd"
+
+# White space handling
+"a b"                           "ab"
+"abc "                          "abc"
+"abc "                          "<0>abc </0>"
+#"ab[cd e]z"                     "<0>ab z</0>"           #TODO:  white space handling in Unicode Sets.
+
+
+#Multi-line mode
+'b\s^'                     m     "a\nb\n"
+
+# Free-spacing mode
+"a b c  # this is a comment"  x "<0>abc</0> "
+'^a (?#xxx) (?#yyy) {3}c'  x    "<0>aaac</0>"