diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 3376638a6ff..2f69ec4ff6f 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -175,20 +175,21 @@ static void caseClose(UnicodeSet *theSet) { //---------------------------------------------------------------------------------------- RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status) { - fStatus = &status; + fStatus = &status; - fRXPat = rxp; - fScanIndex = 0; - fNextIndex = 0; - fPeekChar = -1; - fLineNum = 1; - fCharNum = 0; - fQuoteMode = FALSE; - fFreeForm = FALSE; - fModeFlags = fRXPat->fFlags; + fRXPat = rxp; + fScanIndex = 0; + fNextIndex = 0; + fPeekChar = -1; + fLineNum = 1; + fCharNum = 0; + fQuoteMode = FALSE; + fInBackslashQuote = FALSE; + fModeFlags = fRXPat->fFlags; + fEOLComments = TRUE; - fMatchOpenParen = -1; - fMatchCloseParen = -1; + fMatchOpenParen = -1; + fMatchCloseParen = -1; if (U_FAILURE(status)) { return; @@ -304,7 +305,7 @@ void RegexCompile::compile( U_ASSERT(state != 0); - // Find the state table element that matches the input char from the rule, or the + // Find the state table element that matches the input char from the pattern, or the // class of the input character. Start with the first table row for this // state, then linearly scan forward until we find a row that matches the // character. The last row for each state always matches all characters, so @@ -337,7 +338,7 @@ void RegexCompile::compile( } if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && - fC.fQuoted == FALSE && // char is not escaped && + fC.fQuoted == FALSE && // char is not escaped && fC.fChar != (UChar32)-1) { // char is not EOF UnicodeSet *uniset = gRuleSets[tableEl->fCharClass-128]; if (uniset->contains(fC.fChar)) { @@ -373,6 +374,10 @@ void RegexCompile::compile( fStack[fStackPtr] = tableEl->fPushState; } + // + // NextChar. 
This is where characters are actually fetched from the pattern. + // Happens under control of the 'n' tag in the state table. + // if (tableEl->fNextChar) { nextChar(fC); } @@ -1300,9 +1305,17 @@ UBool RegexCompile::doParseActions(EParseAction action) } break; + case doSuppressComments: + // We have just scanned a '(?'. We now need to prevent the character scanner from + // treating a '#' as a to-the-end-of-line comment. + // (This Perl compatibility just gets uglier and uglier to do...) + fEOLComments = FALSE; + break; + default: + U_ASSERT(FALSE); error(U_REGEX_INTERNAL_ERROR); returnVal = FALSE; break; @@ -1863,8 +1876,8 @@ static const UChar chUpperP = 0x50; //---------------------------------------------------------------------------------------- // // nextCharLL Low Level Next Char from the regex pattern. -// Get a char from the string, -// keep track of input position for error reporting. +// Get a char from the string, keep track of input position +// for error reporting. // //---------------------------------------------------------------------------------------- UChar32 RegexCompile::nextCharLL() { @@ -1929,9 +1942,6 @@ UChar32 RegexCompile::peekCharLL() { //--------------------------------------------------------------------------------- void RegexCompile::nextChar(RegexPatternChar &c) { - // Unicode Character constants needed for the processing done by nextChar(), - // in hex because literals wont work on EBCDIC machines. - fScanIndex = fNextIndex; c.fChar = nextCharLL(); c.fQuoted = FALSE; @@ -1944,39 +1954,60 @@ void RegexCompile::nextChar(RegexPatternChar &c) { nextChar(c); // recurse to get the real next char } } + else if (fInBackslashQuote) { + // The current character immediately follows a '\' + // Don't check for any further escapes, just return it as-is. + // Don't set c.fQuoted, because that would prevent the state machine from + // dispatching on the character. 
+ fInBackslashQuote = FALSE; + } else { - // We are not in a 'quoted region' of the source. + // We are not in a \Q quoted region \E of the source. // - if (fFreeForm && c.fChar == chPound) { - // Start of a comment. Consume the rest of it. - // The new-line char that terminates the comment is always returned. - // It will be treated as white-space, and serves to break up anything - // that might otherwise incorrectly clump together with a comment in - // the middle (a variable name, for example.) + if (fModeFlags & UREGEX_COMMENTS) { + // + // We are in free-spacing and comments mode. + // Scan through any white space and comments, until we + // reach a significant character or the end of input. for (;;) { + if (c.fChar == (UChar32)-1) { + break; // End of Input + } + if (c.fChar == chPound && fEOLComments == TRUE) { + // Start of a comment. Consume the rest of it, until EOF or a new line + for (;;) { + c.fChar = nextCharLL(); + if (c.fChar == (UChar32)-1 || // EOF + c.fChar == chCR || + c.fChar == chLF || + c.fChar == chNEL || + c.fChar == chLS) { + break; + } + } + } + if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) { + // TODO: is RuleWhiteSpace the right thing to use here? + break; + } c.fChar = nextCharLL(); - if (c.fChar == (UChar32)-1 || // EOF - c.fChar == chCR || - c.fChar == chLF || - c.fChar == chNEL || - c.fChar == chLS) {break;} } } - if (c.fChar == (UChar32)-1) { - return; - } // // check for backslash escaped characters. - // Use UnicodeString::unescapeAt() to handle those that it can. - // Otherwise just return the '\', and let the pattern parser deal with it. // - int32_t startX = fNextIndex; // start and end positions of the - int32_t endX = fNextIndex; // sequence following the '\' + int32_t startX = fNextIndex; // start and end positions of the + int32_t endX = fNextIndex; // sequence following the '\' if (c.fChar == chBackSlash) { if (gUnescapeCharSet->contains(peekCharLL())) { - nextCharLL(); // get & discard the peeked char. 
+ // + // A '\' sequence that is handled by ICU's standard unescapeAt function. + // Includes \uxxxx, \n, \r, many others. + // Return the single equivalent character. + // + nextCharLL(); // get & discard the peeked char. c.fQuoted = TRUE; c.fChar = fRXPat->fPattern.unescapeAt(endX); if (startX == endX) { @@ -1985,8 +2016,21 @@ void RegexCompile::nextChar(RegexPatternChar &c) { fCharNum += endX - startX; fNextIndex = endX; } + else + { + // We are in a '\' escape that will be handled by the state table scanner. + // Just return the backslash, but remember that the following char is to + // be taken literally. TODO: this is awkward + fInBackslashQuote = TRUE; + } } } + + // re-enable # to end-of-line comments, in case they were disabled. + // They are disabled by the parser upon seeing '(?', but this lasts for + // the fetching of the next character only. + fEOLComments = TRUE; + // putc(c.fChar, stdout); } diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index e92528aebd5..4840849bacd 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -118,8 +118,10 @@ private: // in the rule input string. int32_t fNextIndex; // Index of the next character, which // is the first character not yet scanned. - UBool fQuoteMode; // Scan is in a quoted region - UBool fFreeForm; // Scan mode is free-form, ignore spaces. + UBool fQuoteMode; // Scan is in a \Q...\E quoted region + UBool fInBackslashQuote; // Scan is between a '\' and the following char. + UBool fEOLComments; // When scan is just after '(?', inhibit #... to + // end of line comments, in favor of (?#...) comments. int fLineNum; // Line number in input file. int fCharNum; // Char position within the line. 
UChar32 fLastChar; // Previous char, needed to count CR-LF diff --git a/icu4c/source/i18n/regexcst.h b/icu4c/source/i18n/regexcst.h index 813c6ac89be..40742724aa1 100644 --- a/icu4c/source/i18n/regexcst.h +++ b/icu4c/source/i18n/regexcst.h @@ -60,6 +60,7 @@ enum Regex_PatternParseAction { doPossesiveOpt, doEscapeError, doBackslashG, + doSuppressComments, doMatchModeParen, doOpt, doInterval, @@ -127,11 +128,11 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 21 , {doNOP, 255, 2,0, FALSE} // 22 - , {doNOP, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant + , {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant , {doNOP, 255, 27,0, FALSE} // 24 , {doNOP, 35 /* # */, 46, 14, TRUE} // 25 open-paren-quant2 , {doNOP, 255, 29,0, FALSE} // 26 - , {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren + , {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren , {doOpenCaptureParen, 255, 2, 14, FALSE} // 28 , {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 29 open-paren-extended , {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30 diff --git a/icu4c/source/i18n/regexcst.txt b/icu4c/source/i18n/regexcst.txt index 56e71c26acf..f2343d3861c 100644 --- a/icu4c/source/i18n/regexcst.txt +++ b/icu4c/source/i18n/regexcst.txt @@ -64,7 +64,7 @@ start: # term. At a position where we can accept the start most items in a pattern. # term: - quoted n expr-quant doLiteralChar + quoted n expr-quant doLiteralChar rule_char n expr-quant doLiteralChar '[' n expr-quant doScanUnicodeSet '(' n open-paren @@ -109,7 +109,7 @@ expr-cont: # branches into the normal parenthesis sequence as quickly as possible. # open-paren-quant: - '?' n open-paren-quant2 + '?' n open-paren-quant2 doSuppressComments default open-paren open-paren-quant2: @@ -122,7 +122,7 @@ open-paren-quant2: # determine what kind of quantifier it is - plain (, (?:, (?>, or whatever. # open-paren: - '?' 
n open-paren-extended + '?' n open-paren-extended doSuppressComments default term ^expr-quant doOpenCaptureParen open-paren-extended: diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 04ee0a6f082..2e6728f043d 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -844,9 +844,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { // We are at the start input. Success. break; } - // Check the character just before the current pos. + // Check whether character just before the current pos is a new-line + // unless we are at the end of input UChar c = inputBuf[fp->fInputIdx - 1]; - if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) { + if ((fp->fInputIdx < inputLen) && + (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) { // It's a new-line. ^ is true. Success. break; } diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index eae23c3d0f9..b558126357e 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -215,7 +215,7 @@ RegexPattern *RegexPattern::compile( return NULL; } - if ((flags & (UREGEX_CANON_EQ | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE)) != 0) { + if ((flags & UREGEX_CANON_EQ) != 0) { status = U_REGEX_UNIMPLEMENTED; return NULL; } diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index f7126bf4df2..d30e3503c32 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -394,9 +394,12 @@ void RegexTest::Basic() { // #if 0 { - REGEX_TESTLM("(a)|\\1", "x", FALSE, FALSE); - // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc<2>cccddd"); - // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX===================="); + // REGEX_TESTLM("^a (?#xxx) (?#yyy) {3}c", "accc", FALSE, FALSE); + UParseError pe; + UErrorCode status; + RegexPattern::compile("^a (?#xxx) (?#yyy) {3}c", UREGEX_COMMENTS, pe, status); + // 
REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc<2>cccddd"); + // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX===================="); } exit(1); #endif diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index f54a4fab517..44c5cb113f7 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -216,3 +216,17 @@ "(?:(?i)a)b" "<0>Ab" "ab(?i)cd" "<0>abCd" "ab$cd" "abcd" + +# White space handling +"a b" "ab" +"abc " "abc" +"abc " "<0>abc " +#"ab[cd e]z" "<0>ab z" #TODO: white space handling in Unicode Sets. + + +#Multi-line mode +'b\s^' m "a\nb\n" + +# Free-spacing mode +"a b c # this is a comment" x "<0>abc " +'^a (?#xxx) (?#yyy) {3}c' x "<0>aaac"