diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index 70d774f2694..55361e9d3b7 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -503,6 +503,7 @@ typedef enum UErrorCode { U_REGEX_ERROR_START=0x10300, U_REGEX_INTERNAL_ERROR, U_REGEX_INVALID_STATE, + U_REGEX_BAD_ESCAPE_SEQUENCE, U_REGEX_ERROR_LIMIT, U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index b94967b32c0..b3fa5352dcb 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -59,22 +59,21 @@ static const UChar gRuleSet_rule_char_pattern[] = { 0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0}; -static const UChar gRuleSet_name_char_pattern[] = { -// [ _ \ p { L } \ p { N } ] - 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; - static const UChar gRuleSet_digit_char_pattern[] = { // [ 0 - 9 ] 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; -static const UChar gRuleSet_name_start_char_pattern[] = { -// [ _ \ p { L } ] - 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; - -static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any" static UnicodeSet *gRuleSets[10]; // Array of ptrs to the actual UnicodeSet objects. +static UnicodeSet *gUnescapeCharSet; +// +// These are the backslash escape characters that ICU's unescape +// will handle. 
+//    NUL-terminated like the pattern arrays above: the UnicodeSet constructor receives a bare UChar pointer, no length.
+static const UChar gUnescapeCharPattern[] = {
+//    [     a     b     c     e     f     n     r     t     u     U     ]    NUL
+    0x5b, 0x61, 0x62, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};
 
 
 //----------------------------------------------------------------------------------------
@@ -88,7 +87,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
 
     fScanIndex        = 0;
     fNextIndex        = 0;
-
+    fPeekChar         = -1;
     fLineNum          = 1;
     fCharNum          = 0;
     fQuoteMode        = FALSE;
@@ -110,13 +109,16 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
         gRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(gRuleSet_rule_char_pattern, status);
         gRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodePropertySet::getRuleWhiteSpaceSet(status));
         gRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, status);
+        gUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, status);
         if (U_FAILURE(status)) {
             delete gRuleSets[kRuleSet_rule_char-128];
             delete gRuleSets[kRuleSet_white_space-128];
             delete gRuleSets[kRuleSet_digit_char-128];
+            delete gUnescapeCharSet;
             gRuleSets[kRuleSet_rule_char-128] = NULL;
             gRuleSets[kRuleSet_white_space-128] = NULL;
             gRuleSets[kRuleSet_digit_char-128] = NULL;
+            gUnescapeCharSet = NULL;
             return;
         }
     }
@@ -218,7 +220,7 @@ void RegexCompile::compile(
                 // Table row specified "quoted" and the char was quoted.
                 break;
             }
-            if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {
+            if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
                 // Table row specified eof and we hit eof on the input.
                 break;
             }
@@ -605,14 +607,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
         break;
 
 
-
     case doDotAny:
         // scanned a ".", match any single character.
         fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
         break;
 
 
-    case doExprFinished:
+    case doBackslashA:
+        // Scanned a "\A".
+ fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus); break; case doExit: @@ -816,6 +819,11 @@ UChar32 RegexCompile::nextCharLL() { UChar32 ch; UnicodeString &pattern = fRXPat->fPattern; + if (fPeekChar != -1) { + ch = fPeekChar; + fPeekChar = -1; + return ch; + } if (fPatternLength==0 || fNextIndex >= fPatternLength) { return (UChar32)-1; } @@ -846,12 +854,25 @@ UChar32 RegexCompile::nextCharLL() { return ch; } +//--------------------------------------------------------------------------------- +// +// peekCharLL Low Level Character Scanning, sneak a peek at the next +// character without actually getting it. +// +//--------------------------------------------------------------------------------- +UChar32 RegexCompile::peekCharLL() { + if (fPeekChar == -1) { + fPeekChar = nextCharLL(); + } + return fPeekChar; +} + //--------------------------------------------------------------------------------- // -// nextChar for rules scanning. At this level, we handle stripping -// out comments and processing backslash character escapes. -// The rest of the rules grammar is handled at the next level up. +// nextChar for pattern scanning. At this level, we handle stripping +// out comments and processing some backslash character escapes. +// The rest of the pattern grammar is handled at the next level up. // //--------------------------------------------------------------------------------- void RegexCompile::nextChar(RegexPatternChar &c) { @@ -870,7 +891,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { { // We are not in a 'quoted region' of the source. // - if (c.fChar == chPound) { + if (fFreeForm && c.fChar == chPound) { // Start of a comment. Consume the rest of it. // The new-line char that terminates the comment is always returned. // It will be treated as white-space, and serves to break up anything @@ -891,16 +912,22 @@ void RegexCompile::nextChar(RegexPatternChar &c) { // // check for backslash escaped characters. 
- // Use UnicodeString::unescapeAt() to handle them. + // Use UnicodeString::unescapeAt() to handle those that it can. + // Otherwise just return the '\', and let the pattern parser deal with it. // + int32_t startX = fNextIndex; // start and end positions of the + int32_t endX = fNextIndex; // sequence following the '\' if (c.fChar == chBackSlash) { - c.fQuoted = TRUE; - int32_t startX = fNextIndex; - c.fChar = fRXPat->fPattern.unescapeAt(fNextIndex); - if (fNextIndex == startX) { - error(U_BRK_HEX_DIGITS_EXPECTED); + if (gUnescapeCharSet->contains(peekCharLL())) { + nextCharLL(); // get & discard the peeked char. + c.fQuoted = TRUE; + c.fChar = fRXPat->fPattern.unescapeAt(endX); + if (startX == endX) { + error(U_REGEX_BAD_ESCAPE_SEQUENCE); + } + fCharNum += endX - startX; + fNextIndex = endX; } - fCharNum += fNextIndex-startX; } } // putc(c.fChar, stdout); diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index 66b0e994f22..0416a0533bf 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -65,8 +65,6 @@ public: void nextChar(RegexPatternChar &c); // Get the next char from the input stream. - UBool push(const RegexPatternChar &c); // Push (unget) one character. - // Only a single character may be pushed. /** * ICU "poor man's RTTI", returns a UClassID for the actual class. @@ -88,6 +86,7 @@ private: void error(UErrorCode e); // error reporting convenience function. UChar32 nextCharLL(); + UChar32 peekCharLL(); UnicodeSet *scanSet(); void handleCloseParen(); int32_t blockTopLoc(); // Locate a position in the compiled pattern @@ -99,6 +98,9 @@ private: RegexPattern *fRXPat; UParseError *fParseErr; + // + // Data associated with low level character scanning + // int32_t fScanIndex; // Index of current character being processed // in the rule input string. int32_t fNextIndex; // Index of the next character, which @@ -109,6 +111,8 @@ private: int fCharNum; // Char position within the line. 
UChar32 fLastChar; // Previous char, needed to count CR-LF // as a single line, not two. + UChar32 fPeekChar; // Saved char, if we've scanned ahead. + RegexPatternChar fC; // Current char for parse state machine // processing. diff --git a/icu4c/source/i18n/regexcst.h b/icu4c/source/i18n/regexcst.h index 38fa01107d1..cd68bc72a1c 100644 --- a/icu4c/source/i18n/regexcst.h +++ b/icu4c/source/i18n/regexcst.h @@ -40,6 +40,7 @@ enum Regex_PatternParseAction { doOpenLookAheadNeg, doPlus, doOpenNonCaptureParen, + doBackslashA, doNGPlus, doPatFinish, doIntervalMinValue, @@ -51,7 +52,6 @@ enum Regex_PatternParseAction { doOpenLookAhead, doNumberExpectedError, doDotAny, - doExprFinished, doScanUnicodeSet, doNOP, doExit, @@ -80,71 +80,65 @@ static const struct RegexTableEl gRuleParseStateTable[] = { {doNOP, 0, 0, 0, TRUE} , {doPatStart, 255, 3, 2, FALSE} // 1 start , {doPatFinish, 255, 2,0, FALSE} // 2 finish - , {doStartString, 254, 10,0, TRUE} // 3 term - , {doStartString, 130, 10,0, TRUE} // 4 - , {doScanUnicodeSet, 91 /* [ */, 17,0, TRUE} // 5 - , {doNOP, 40 /* ( */, 29, 17, TRUE} // 6 - , {doDotAny, 46 /* . */, 17,0, TRUE} // 7 - , {doNOP, 253, 255,0, FALSE} // 8 - , {doRuleError, 255, 67,0, FALSE} // 9 - , {doStringChar, 254, 10,0, TRUE} // 10 string - , {doStringChar, 130, 10,0, TRUE} // 11 - , {doSplitString, 63 /* ? */, 17,0, FALSE} // 12 - , {doSplitString, 43 /* + */, 17,0, FALSE} // 13 - , {doSplitString, 42 /* * */, 17,0, FALSE} // 14 - , {doSplitString, 123 /* { */, 17,0, FALSE} // 15 - , {doEndString, 255, 17,0, FALSE} // 16 - , {doNOP, 42 /* * */, 40,0, TRUE} // 17 expr-quant - , {doNOP, 43 /* + */, 43,0, TRUE} // 18 - , {doNOP, 63 /* ? */, 46,0, TRUE} // 19 - , {doNOP, 255, 21,0, FALSE} // 20 - , {doNOP, 254, 3,0, FALSE} // 21 expr-cont - , {doNOP, 130, 3,0, FALSE} // 22 - , {doNOP, 91 /* [ */, 3,0, FALSE} // 23 - , {doNOP, 40 /* ( */, 3,0, FALSE} // 24 - , {doNOP, 46 /* . 
*/, 3,0, FALSE} // 25 - , {doOrOperator, 124 /* | */, 3,0, TRUE} // 26 - , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 27 - , {doExprFinished, 255, 255,0, FALSE} // 28 - , {doNOP, 63 /* ? */, 31,0, TRUE} // 29 open-paren - , {doOpenCaptureParen, 255, 3, 17, FALSE} // 30 - , {doOpenNonCaptureParen, 58 /* : */, 3, 17, TRUE} // 31 open-paren-extended - , {doOpenAtomicParen, 62 /* > */, 3, 17, TRUE} // 32 - , {doOpenLookAhead, 61 /* = */, 3, 21, TRUE} // 33 - , {doOpenLookAheadNeg, 33 /* ! */, 3, 21, TRUE} // 34 - , {doNOP, 60 /* < */, 37,0, TRUE} // 35 - , {doBadOpenParenType, 255, 67,0, FALSE} // 36 - , {doOpenLookBehind, 61 /* = */, 3, 21, TRUE} // 37 open-paren-lookbehind - , {doOpenLookBehindNeg, 33 /* ! */, 3, 21, TRUE} // 38 - , {doBadOpenParenType, 255, 67,0, FALSE} // 39 - , {doNGStar, 63 /* ? */, 21,0, TRUE} // 40 quant-star - , {doPossesiveStar, 43 /* + */, 21,0, TRUE} // 41 - , {doStar, 255, 21,0, FALSE} // 42 - , {doNGPlus, 63 /* ? */, 21,0, TRUE} // 43 quant-plus - , {doPossesivePlus, 43 /* + */, 21,0, TRUE} // 44 - , {doPlus, 255, 21,0, FALSE} // 45 - , {doNGOpt, 63 /* ? */, 21,0, TRUE} // 46 quant-opt - , {doPossesiveOpt, 43 /* + */, 21,0, TRUE} // 47 - , {doOpt, 255, 21,0, FALSE} // 48 - , {doNOP, 129, 49,0, TRUE} // 49 interval-open - , {doIntervalMinValue, 128, 52,0, FALSE} // 50 - , {doNumberExpectedError, 255, 67,0, FALSE} // 51 - , {doNOP, 129, 56,0, TRUE} // 52 interval-value - , {doNOP, 125 /* } */, 56,0, FALSE} // 53 - , {doIntervalDigit, 128, 52,0, TRUE} // 54 - , {doNumberExpectedError, 255, 67,0, FALSE} // 55 - , {doNOP, 129, 56,0, TRUE} // 56 interval-close - , {doTagValue, 125 /* } */, 59,0, TRUE} // 57 - , {doNumberExpectedError, 255, 67,0, FALSE} // 58 - , {doNOP, 254, 3,0, FALSE} // 59 expr-cont-no-interval - , {doNOP, 130, 3,0, FALSE} // 60 - , {doNOP, 91 /* [ */, 3,0, FALSE} // 61 - , {doNOP, 40 /* ( */, 3,0, FALSE} // 62 - , {doNOP, 46 /* . 
*/, 3,0, FALSE} // 63 - , {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 64 - , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 65 - , {doExprFinished, 255, 255,0, FALSE} // 66 - , {doExit, 255, 67,0, TRUE} // 67 errorDeath + , {doStartString, 254, 11,0, TRUE} // 3 term + , {doStartString, 130, 11,0, TRUE} // 4 + , {doScanUnicodeSet, 91 /* [ */, 18,0, TRUE} // 5 + , {doNOP, 40 /* ( */, 25, 18, TRUE} // 6 + , {doDotAny, 46 /* . */, 18,0, TRUE} // 7 + , {doNOP, 92 /* \ */, 59,0, TRUE} // 8 + , {doNOP, 253, 2,0, FALSE} // 9 + , {doRuleError, 255, 61,0, FALSE} // 10 + , {doStringChar, 254, 11,0, TRUE} // 11 string + , {doStringChar, 130, 11,0, TRUE} // 12 + , {doSplitString, 63 /* ? */, 18,0, FALSE} // 13 + , {doSplitString, 43 /* + */, 18,0, FALSE} // 14 + , {doSplitString, 42 /* * */, 18,0, FALSE} // 15 + , {doSplitString, 123 /* { */, 18,0, FALSE} // 16 + , {doEndString, 255, 18,0, FALSE} // 17 + , {doNOP, 42 /* * */, 36,0, TRUE} // 18 expr-quant + , {doNOP, 43 /* + */, 39,0, TRUE} // 19 + , {doNOP, 63 /* ? */, 42,0, TRUE} // 20 + , {doNOP, 255, 22,0, FALSE} // 21 + , {doOrOperator, 124 /* | */, 3,0, TRUE} // 22 expr-cont + , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 23 + , {doNOP, 255, 3,0, FALSE} // 24 + , {doNOP, 63 /* ? */, 27,0, TRUE} // 25 open-paren + , {doOpenCaptureParen, 255, 3, 18, FALSE} // 26 + , {doOpenNonCaptureParen, 58 /* : */, 3, 18, TRUE} // 27 open-paren-extended + , {doOpenAtomicParen, 62 /* > */, 3, 18, TRUE} // 28 + , {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29 + , {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30 + , {doNOP, 60 /* < */, 33,0, TRUE} // 31 + , {doBadOpenParenType, 255, 61,0, FALSE} // 32 + , {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind + , {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34 + , {doBadOpenParenType, 255, 61,0, FALSE} // 35 + , {doNGStar, 63 /* ? 
*/, 22,0, TRUE} // 36 quant-star + , {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37 + , {doStar, 255, 22,0, FALSE} // 38 + , {doNGPlus, 63 /* ? */, 22,0, TRUE} // 39 quant-plus + , {doPossesivePlus, 43 /* + */, 22,0, TRUE} // 40 + , {doPlus, 255, 22,0, FALSE} // 41 + , {doNGOpt, 63 /* ? */, 22,0, TRUE} // 42 quant-opt + , {doPossesiveOpt, 43 /* + */, 22,0, TRUE} // 43 + , {doOpt, 255, 22,0, FALSE} // 44 + , {doNOP, 129, 45,0, TRUE} // 45 interval-open + , {doIntervalMinValue, 128, 48,0, FALSE} // 46 + , {doNumberExpectedError, 255, 61,0, FALSE} // 47 + , {doNOP, 129, 52,0, TRUE} // 48 interval-value + , {doNOP, 125 /* } */, 52,0, FALSE} // 49 + , {doIntervalDigit, 128, 48,0, TRUE} // 50 + , {doNumberExpectedError, 255, 61,0, FALSE} // 51 + , {doNOP, 129, 52,0, TRUE} // 52 interval-close + , {doTagValue, 125 /* } */, 55,0, TRUE} // 53 + , {doNumberExpectedError, 255, 61,0, FALSE} // 54 + , {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval + , {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56 + , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57 + , {doNOP, 255, 3,0, FALSE} // 58 + , {doBackslashA, 65 /* A */, 3,0, TRUE} // 59 backslash + , {doStartString, 255, 11,0, TRUE} // 60 + , {doExit, 255, 61,0, TRUE} // 61 errorDeath }; static const char *RegexStateNames[] = { 0, "start", @@ -155,6 +149,7 @@ static const char *RegexStateNames[] = { 0, 0, 0, 0, + 0, 0, "string", 0, @@ -169,11 +164,6 @@ static const char *RegexStateNames[] = { 0, 0, "expr-cont", 0, - 0, - 0, - 0, - 0, - 0, 0, "open-paren", 0, @@ -209,9 +199,7 @@ static const char *RegexStateNames[] = { 0, 0, 0, 0, - 0, - 0, - 0, + "backslash", 0, "errorDeath", 0}; diff --git a/icu4c/source/i18n/regexcst.txt b/icu4c/source/i18n/regexcst.txt index 020cb8d7ff8..11b92be501b 100644 --- a/icu4c/source/i18n/regexcst.txt +++ b/icu4c/source/i18n/regexcst.txt @@ -77,7 +77,8 @@ term: '[' n expr-quant doScanUnicodeSet '(' n open-paren ^expr-quant '.' 
n expr-quant doDotAny - eof pop + '\' n backslash + eof finish default errorDeath doRuleError @@ -110,17 +111,12 @@ expr-quant: # # expr-cont Expression, continuation. At a point where additional terms are -# allowed, but not required. +# allowed, but not required. No Quantifiers # expr-cont: - quoted term - rule_char term - '[' term - '(' term - '.' term '|' n term doOrOperator ')' n pop doCloseParen - default pop doExprFinished + default term # @@ -205,16 +201,18 @@ interval-close: # expr-cont-no-interval: quoted term - rule_char term - '[' term - '(' term - '.' term '|' n term doExprOrOperator ')' n pop doExprRParen - default pop doExprFinished + default term - +# +# backslash # Backslash. Figure out which of the \thingies we have encountered. +# The low level next-char function will have preprocessed +# some of them already; those won't come here. +backslash: + 'A' n term doBackslashA + default n string doStartString diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index b53a0418681..900b7b9b68d 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -26,7 +26,7 @@ static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern po static const uint32_t URX_NOP = 7; static const uint32_t URX_START_CAPTURE = 8; // Value field is capture group number. static const uint32_t URX_END_CAPTURE = 9; // Value field is capture group number -static const uint32_t URX_UNUSED10 = 10; // Value field is index in pattern to +static const uint32_t URX_BACKSLASH_A = 10; // Value field is index in pattern to // loop back to. static const uint32_t URX_SETREF = 11; // Value field is index of set in array of sets. 
static const uint32_t URX_DOTANY = 12;
diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp
index f1dc22d14ae..5898bfd216d 100644
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@@ -11,6 +11,7 @@
 #include "unicode/utypes.h"
 #include "unicode/regex.h"
 #include "unicode/uniset.h"
+#include "unicode/uchar.h"
 #include "uassert.h"
 #include "uvector.h"
 #include "regeximp.h"
@@ -54,20 +55,126 @@ RegexMatcher::~RegexMatcher() {
 
 
 
-
+static const UChar BACKSLASH  = 0x5c;
+static const UChar DOLLARSIGN = 0x24;
+//--------------------------------------------------------------------------------
+//
+//    appendReplacement   Append to dest the input between the previous match and this one, then the replacement text with $n groups expanded and \-escaped chars copied literally.
+//
+//--------------------------------------------------------------------------------
 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
-                                              const UnicodeString &replacement) {
+                                              const UnicodeString &replacement,
+                                              UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    if (fMatch == FALSE) {
+        status = U_REGEX_INVALID_STATE;
+        return *this;
+    }
+
+    // Copy input string from the end of previous match to start of current match
+    int32_t  len = fMatchStart-fLastMatchEnd;
+    if (len > 0) {
+        dest.append(*fInput, fLastMatchEnd, len);
+    }
+
+
+    // scan the replacement text, looking for substitutions ($n) and \escapes.
+    int32_t  replLen = replacement.length();
+    int32_t  replIdx;
+    for (replIdx = 0; replIdx < replLen; replIdx++) {
+        UChar  c = replacement.charAt(replIdx);
+        if (c == BACKSLASH) {
+            // Backslash escape.  Copy the char that follows the '\' out
+            //    without further checks; a trailing '\' ends the scan.
+            if (++replIdx >= replLen) {
+                break;
+            }
+            c = replacement.charAt(replIdx);
+            dest.append(c);
+            continue;
+        }
+
+        if (c != DOLLARSIGN) {
+            // Normal char, not a $.  Copy it out without further checks.
+            dest.append(c);
+            continue;
+        }
+
+        // We've got a $.  Pick up a capture group number if one follows.
+        // Consume at most the number of digits necessary for the largest capture
+        // number that is valid for this pattern.
+        if (++replIdx >= replLen) {
+            // $ was at the end of the replacement string.  Dump it out and be done.
+            dest.append(c);
+            break;
+        }
+
+        int32_t numDigits = 0;
+        int32_t groupNum  = 0;
+        for (;;) {
+            c = replacement.charAt(replIdx);
+            if (u_isdigit(c) == FALSE) {
+                break;
+            }
+            groupNum=groupNum*10 + u_charDigitValue(c);
+            numDigits++;
+            if (++replIdx >= replLen) {
+                break;
+            }
+            if (numDigits >= fPattern->fMaxCaptureDigits) {
+                break;
+            }
+        }
+
+        // We've scanned one char ahead in the replacement text.  Back up so the
+        //  next iteration of the loop picks the char again.
+        --replIdx;
+
+        if (numDigits == 0) {
+            // The $ didn't introduce a group number at all.
+            // Treat it as just part of the substitution text.
+            dest.append(DOLLARSIGN);
+            continue;
+        }
+
+        // Finally, append the capture group data to the destination.
+        dest.append(group(groupNum, status));
+        if (U_FAILURE(status)) {
+            // Can fail if group number is out of range.
+            return *this;
+        }
+
+    }
+
     return *this;
 }
 
 
 
+//--------------------------------------------------------------------------------
+//
+//    appendTail     Intended to be used in conjunction with appendReplacement()
+//                   To the destination string, append everything following
+//                   the last match position from the input string.
+// +//-------------------------------------------------------------------------------- UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { + int32_t len = fInputLength-fMatchEnd; + if (len > 0) { + dest.append(*fInput, fMatchEnd, len); + } return dest; } +//-------------------------------------------------------------------------------- +// +// end +// +//-------------------------------------------------------------------------------- int32_t RegexMatcher::end(UErrorCode &err) const { return end(0, err); } @@ -78,7 +185,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const { if (U_FAILURE(err)) { return 0; } - if (fLastMatch == FALSE) { + if (fMatch == FALSE) { err = U_REGEX_INVALID_STATE; return 0; } @@ -88,7 +195,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const { } int32_t e = 0; if (group == 0) { - e = fLastMatchEnd; + e = fMatchEnd; } else { int32_t s = fCaptureEnds->elementAti(group); // TODO: what to do if no match on this specific group? @@ -101,11 +208,16 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const { +//-------------------------------------------------------------------------------- +// +// find() +// +//-------------------------------------------------------------------------------- UBool RegexMatcher::find() { // Start at the position of the last match end. (Will be zero if the // matcher has been reset. 
UErrorCode status = U_ZERO_ERROR;
-    return find(fLastMatchEnd, status);
+    return find(fMatchEnd, status);
 }
 
 
@@ -128,16 +240,20 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
         if (U_FAILURE(status)) {
             return FALSE;
         }
-        if (fLastMatch) {
+        if (fMatch) {
             return TRUE;
         }
     }
-    fLastMatchStart = fLastMatchEnd = fInputLength;
     return FALSE;
 }
 
 
 
+//--------------------------------------------------------------------------------
+//
+//  group()
+//
+//--------------------------------------------------------------------------------
 UnicodeString RegexMatcher::group(UErrorCode &status) const {
     return group(0, status);
 }
@@ -181,7 +297,7 @@ UBool RegexMatcher::lookingAt(UErrorCode &status) {
     }
     reset();
     MatchAt(0, status);
-    return fLastMatch;
+    return fMatch;
 }
 
 
@@ -192,7 +308,7 @@ UBool RegexMatcher::matches(UErrorCode &status) {
     }
     reset();
     MatchAt(0, status);
-    UBool   success = (fLastMatch && fLastMatchEnd==fInputLength);
+    UBool   success = (fMatch && fMatchEnd==fInputLength);
     return success;
 }
 
@@ -205,23 +321,58 @@ const RegexPattern &RegexMatcher::pattern() const {
 
 
 
-UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &err) {
-    return UnicodeString();
+//--------------------------------------------------------------------------------
+//
+//    replaceAll    Reset, then replace each match in turn; stops at the first replacement error (left in status) and still appends the tail.
+//
+//--------------------------------------------------------------------------------
+UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *fInput;
+    }
+    UnicodeString destString;
+    for (reset(); U_SUCCESS(status) && find(); ) {
+        appendReplacement(destString, replacement, status);
+    }
+    appendTail(destString);
+    return destString;
 }
 
 
 
-UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &err) {
-    return UnicodeString();
+//--------------------------------------------------------------------------------
+//
+//    replaceFirst
+//
+//-------------------------------------------------------------------------------- +UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { + if (U_FAILURE(status)) { + return *fInput; + } + reset(); + if (!find()) { + return *fInput; + } + + UnicodeString destString; + appendReplacement(destString, replacement, status); + appendTail(destString); + return destString; } +//-------------------------------------------------------------------------------- +// +// reset +// +//-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::reset() { - fLastMatchStart = 0; - fLastMatchEnd = 0; - fLastMatch = FALSE; + fMatchStart = 0; + fMatchEnd = 0; + fLastMatchEnd = 0; + fMatch = FALSE; int i; for (i=0; i<=fPattern->fNumCaptureGroups; i++) { fCaptureStarts->setElementAt(i, -1); @@ -252,7 +403,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const { if (U_FAILURE(err)) { return 0; } - if (fLastMatch == FALSE) { + if (fMatch == FALSE) { err = U_REGEX_INVALID_STATE; return 0; } @@ -262,7 +413,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const { } int32_t s; if (group == 0) { - s = fLastMatchStart; + s = fMatchStart; } else { s = fCaptureStarts->elementAti(group); // TODO: what to do if no match on this specific group? @@ -272,6 +423,26 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const { +//-------------------------------------------------------------------------------- +// +// getCaptureText We have encountered a '\' that might preceed a +// capture group specification. +// If a valid capture group number follows the '\', +// return the indicies to the start & end of the captured +// text, and update the patIdx to the position following the +// \n sequence. +// +// This function is used during find and replace operations when +// processing caputure references in the replacement text. 
+// +//-------------------------------------------------------------------------------- +UBool RegexMatcher::getCaptureText(const UnicodeString &rep, + int32_t &repIdx, + int32_t &textStart, + int32_t &textEnd) +{ + return FALSE; +} //-------------------------------------------------------------------------------- // @@ -408,6 +579,12 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { fCaptureEnds->setElementAt(inputIdx, opValue); break; + case URX_BACKSLASH_A: + if (inputIdx != 0) { + backTrack(inputIdx, patIdx); + } + break; + case URX_SETREF: if (inputIdx < fInputLength) { @@ -449,7 +626,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { default: // Trouble. The compiled pattern contains an entry with an // unrecognized type tag. - U_ASSERT(false); + U_ASSERT(FALSE); } if (U_FAILURE(status)) { @@ -458,10 +635,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) { } breakFromLoop: - fLastMatch = isMatch; + fMatch = isMatch; if (isMatch) { - fLastMatchStart = startIdx; - fLastMatchEnd = inputIdx; + fLastMatchEnd = fMatchEnd; + fMatchStart = startIdx; + fMatchEnd = inputIdx; } return; } diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index 519a1db8dc3..406be3aff68 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -65,6 +65,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { fLiteralText = other.fLiteralText; fBadState = other.fBadState; fNumCaptureGroups = other.fNumCaptureGroups; + fMaxCaptureDigits = other.fMaxCaptureDigits; if (fBadState) { return *this; } @@ -108,6 +109,7 @@ void RegexPattern::init() { fFlags = 0; fBadState = FALSE; fNumCaptureGroups = 0; + fMaxCaptureDigits = 1; // TODO: calculate for real. 
fMatcher = NULL; UErrorCode status=U_ZERO_ERROR; @@ -301,6 +303,8 @@ UnicodeString RegexPattern::pattern() const { //--------------------------------------------------------------------- // // split +// TODO: perl returns captured strings intermixed with the +// fields. Should we do this too? // //--------------------------------------------------------------------- int32_t RegexPattern::split(const UnicodeString &input, @@ -359,9 +363,9 @@ int32_t RegexPattern::split(const UnicodeString &input, if (fMatcher->find()) { // We found another delimiter. Move everything from where we started looking // up until the start of the delimiter into the next output string. - int32_t fieldLen = fMatcher->fLastMatchStart - nextOutputStringStart; + int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart; dest[i].setTo(input, nextOutputStringStart, fieldLen); - nextOutputStringStart = fMatcher->fLastMatchEnd; + nextOutputStringStart = fMatcher->fMatchEnd; if (nextOutputStringStart == inputLen) { // The delimiter was at the end of the string. We're done. break; @@ -407,7 +411,7 @@ static char *opNames[] = { "NOP", "START_CAPTURE", "END_CAPTURE", - "?10", + "URX_BACKSLASH_A", "SETREF", "DOTANY", "JMP", diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h index 92ed6e6e76e..9dab258935e 100644 --- a/icu4c/source/i18n/unicode/regex.h +++ b/icu4c/source/i18n/unicode/regex.h @@ -178,6 +178,7 @@ private: // make new ones on each call. int32_t fNumCaptureGroups; + int32_t fMaxCaptureDigits; friend class RegexCompile; friend class RegexMatcher; @@ -226,13 +227,16 @@ public: * The append position is set to the position of the first * character following the match in the input string. * + * For complete, prepackaged, non-incremental find-and-replace + * operations, see replaceFirst() or replaceAll(). + * * Returns: This Matcher * * error: Illegal state - no match yet attemtped, or last match failed. 
* IndexOutOfBounds - caputure string number from replacement string. */ virtual RegexMatcher &appendReplacement(UnicodeString &dest, - const UnicodeString &replacement); + const UnicodeString &replacement, UErrorCode &status); /* @@ -329,7 +333,8 @@ public: /* * Replaces every subsequence of the input sequence that matches the pattern - * with the given replacement string. + * with the given replacement string. This is a convenience function that + * provides a complete find-and-replace-all operation. * * This method first resets this matcher. It then scans the input sequence * looking for matches of the pattern. Characters that are not part of any @@ -337,10 +342,7 @@ public: * replacement string. The replacement string may contain references to * captured subsequences as in the appendReplacement method. * - * @return The target string. Depending on how the RegexMatcher was - * created, this may either be the original input string or a copy - * - * Error: Index out of bounds (replacement string capture group) + * @return A string containing the results of the find and replace. * */ virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &err); @@ -348,16 +350,15 @@ public: /* * Replaces the first subsequence of the input sequence that matches - * the pattern with the given replacement string. + * the pattern with the given replacement string. This is a convenience + * function that provides a complete find-and-replace operation. + * * This method first resets this matcher. It then scans the input sequence * looking for a match of the pattern. Characters that are not part * of the match are appended directly to the result string; the match is replaced * in the result by the replacement string. The replacement string may contain * references to captured subsequences as in the appendReplacement method. 
* - * Error: Index out of bounds (replacement string capture group) - * Illegal state (no match) - * Note: Javadoc doesn't list exceptions, but they gotta be there for consistency */ virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &err); @@ -409,27 +410,33 @@ public: private: // Constructors and other object boilerplate are private. - // Creation by users is through factory method in RegexPattern + // Instances of RegexMatcher can not be assigned, copied, cloned, etc. + // Creation by users is only through the factory method in class RegexPattern RegexMatcher(const RegexPattern *pat); RegexMatcher(const RegexMatcher &other); RegexMatcher &operator =(const RegexMatcher &rhs); friend class RegexPattern; - inline void backTrack(int32_t &inputIdx, int32_t &patIdx); // // MatchAt This is the internal interface to the match engine itself. // Match status comes back in matcher member variables. // - virtual void MatchAt(int32_t startIdx, UErrorCode &status); + void MatchAt(int32_t startIdx, UErrorCode &status); + inline void backTrack(int32_t &inputIdx, int32_t &patIdx); + UBool getCaptureText(const UnicodeString &rep, + int32_t &repIdx, + int32_t &textStart, + int32_t &textEnd); const RegexPattern *fPattern; const UnicodeString *fInput; int32_t fInputLength; - UBool fLastMatch; // True if the last match was successful. - int32_t fLastMatchStart; - int32_t fLastMatchEnd; + UBool fMatch; // True if the last match was successful. + int32_t fMatchStart; // Position of the start of the most recent match + int32_t fMatchEnd; // First position after the end of the most recent match + int32_t fLastMatchEnd; // First position after the end of the previous match. 
UStack *fBackTrackStack; UVector *fCaptureStarts; UVector *fCaptureEnds; diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index ca9761103bd..b68f1d0981f 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -31,12 +31,12 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch if (exec) logln("TestSuite RegexTest: "); switch (index) { - case 0: name = "API_Match"; - if (exec) API_Match(); - break; - case 1: name = "Basic"; + case 0: name = "Basic"; if (exec) Basic(); break; + case 1: name = "API_Match"; + if (exec) API_Match(); + break; case 2: name = "API_Replace"; if (exec) API_Replace(); break; @@ -87,6 +87,7 @@ UBool RegexTest::doRegexLMTest(char *pat, char *text, UBool looking, UBool match errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %d\n", line, status); return FALSE; } + // REPattern->dump(); UnicodeString inputString(inputText); UnicodeString unEscapedInput = inputString.unescape(); @@ -295,6 +296,101 @@ void RegexTest::API_Match() { delete matcher; delete pat; } + + // + // Replace + // + { + int32_t flags=0; + UParseError pe; + UErrorCode status=U_ZERO_ERROR; + + UnicodeString re("abc"); + RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + UnicodeString data = ".abc..abc...abc.."; + // 012345678901234567 + RegexMatcher *matcher = pat->matcher(data, status); + + // + // Plain vanilla matches. + // + UnicodeString dest; + dest = matcher->replaceFirst("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ".yz..abc...abc.."); + + dest = matcher->replaceAll("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ".yz..yz...yz.."); + + // + // Plain vanilla non-matches. 
+ // + UnicodeString d2 = ".abx..abx...abx.."; + matcher->reset(d2); + dest = matcher->replaceFirst("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ".abx..abx...abx.."); + + dest = matcher->replaceAll("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ".abx..abx...abx.."); + + // + // Empty source string + // + UnicodeString d3 = ""; + matcher->reset(d3); + dest = matcher->replaceFirst("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ""); + + dest = matcher->replaceAll("yz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == ""); + + // + // Empty substitution string + // + matcher->reset(data); // ".abc..abc...abc.." + dest = matcher->replaceFirst("", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "...abc...abc.."); + + dest = matcher->replaceAll("", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "........"); + + // + // match whole string + // + UnicodeString d4 = "abc"; + matcher->reset(d4); + dest = matcher->replaceFirst("xyz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "xyz"); + + dest = matcher->replaceAll("xyz", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "xyz"); + + // + // Capture Group, simple case + // + UnicodeString re2("a(..)"); + RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); + REGEX_CHECK_STATUS; + UnicodeString d5 = "abcdefg"; + RegexMatcher *matcher2 = pat2->matcher(d5, status); + REGEX_CHECK_STATUS; + dest = matcher2->replaceFirst("$1$1", status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(dest == "bcbcdefg"); + + } + + } @@ -314,6 +410,7 @@ void RegexTest::Basic() { // #if 0 { + REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input } return; #endif @@ -419,6 +516,26 @@ void RegexTest::Basic() { REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE); REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE); + // + // Escape sequences that become single literal chars, handled internally + // by ICU's Unescape. 
+ // + + // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet. + REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL + REGEX_TESTLM("\\b", "\\u0008", TRUE, TRUE); // BS + // REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L (or whatever) TODO: bug in Unescape + // REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape TODO: bug in Unescape + REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed + REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line + REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR + REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab + REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE); + REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE); + + REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input + REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input + };