From ce09d8a4bcee3cf83ad34a7cc1ac6759eca19bf0 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Tue, 24 Feb 2015 00:24:59 +0000 Subject: [PATCH] ICU-11393 Regex, add pattern chars R v and h X-SVN-Rev: 37057 --- icu4c/source/i18n/regexcmp.cpp | 113 ++++++ icu4c/source/i18n/regexcst.h | 461 +++++++++++++----------- icu4c/source/i18n/regexcst.pl | 4 +- icu4c/source/i18n/regexcst.txt | 9 + icu4c/source/i18n/regeximp.h | 12 +- icu4c/source/i18n/rematch.cpp | 189 ++++++++-- icu4c/source/i18n/repattrn.cpp | 3 + icu4c/source/test/testdata/regextst.txt | 72 ++++ 8 files changed, 611 insertions(+), 252 deletions(-) diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index cd6ca2b2467..e518e84cd35 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -1188,6 +1188,21 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_BACKSLASH_G, 0); break; + case doBackslashH: + fixLiterals(FALSE); + appendOp(URX_BACKSLASH_H, 1); + break; + + case doBackslashh: + fixLiterals(FALSE); + appendOp(URX_BACKSLASH_H, 0); + break; + + case doBackslashR: + fixLiterals(FALSE); + appendOp(URX_BACKSLASH_R, 0); + break; + case doBackslashS: fixLiterals(FALSE); appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); @@ -1198,6 +1213,16 @@ UBool RegexCompile::doParseActions(int32_t action) appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); break; + case doBackslashV: + fixLiterals(FALSE); + appendOp(URX_BACKSLASH_V, 1); + break; + + case doBackslashv: + fixLiterals(FALSE); + appendOp(URX_BACKSLASH_V, 0); + break; + case doBackslashW: fixLiterals(FALSE); appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); @@ -1548,6 +1573,48 @@ UBool RegexCompile::doParseActions(int32_t action) break; } + case doSetBackslash_h: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + UnicodeSet h; + h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); + h.add((UChar32)9); // Tab + set->addAll(h); + break; + } + + case doSetBackslash_H: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + UnicodeSet h; + h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); + h.add((UChar32)9); // Tab + h.complement(); + set->addAll(h); + break; + } + + case doSetBackslash_v: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + set->add((UChar32)0x0a, (UChar32)0x0d); // add range + set->add((UChar32)0x85); + set->add((UChar32)0x2028, (UChar32)0x2029); + break; + } + + case doSetBackslash_V: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + UnicodeSet v; + v.add((UChar32)0x0a, (UChar32)0x0d); // add range + v.add((UChar32)0x85); + v.add((UChar32)0x2028, (UChar32)0x2029); + v.complement(); + set->addAll(v); + break; + } + case doSetBackslash_w: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); @@ -2749,6 +2816,43 @@ void RegexCompile::matchStartType() { break; + case URX_BACKSLASH_H: + // Horiz white space + if (currentLen == 0) { + UnicodeSet s; + s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); + s.add((UChar32)9); // Tab + if (URX_VAL(op) != 0) { + s.complement(); + } + fRXPat->fInitialChars->addAll(s); + numInitialStrings += 2; + } + currentLen++; + atStart = FALSE; + break; + + + case URX_BACKSLASH_R: // Any line ending sequence + case URX_BACKSLASH_V: // Any line ending code point, with optional negation + if (currentLen == 0) { + UnicodeSet s; + s.add((UChar32)0x0a, (UChar32)0x0d); // add range + s.add((UChar32)0x85); + s.add((UChar32)0x2028, (UChar32)0x2029); + if (URX_VAL(op) != 0) { + // Complement option applies to URX_BACKSLASH_V only. + s.complement(); + } + fRXPat->fInitialChars->addAll(s); + numInitialStrings += 2; + } + currentLen++; + atStart = FALSE; + break; + + + case URX_ONECHAR_I: // Case Insensitive Single Character. if (currentLen == 0) { @@ -3137,6 +3241,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { case URX_STAT_SETREF_N: case URX_SETREF: case URX_BACKSLASH_D: + case URX_BACKSLASH_H: + case URX_BACKSLASH_R: + case URX_BACKSLASH_V: case URX_ONECHAR_I: case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. case URX_DOTANY_ALL: // . matches one or two. @@ -3418,6 +3525,9 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { case URX_STAT_SETREF_N: case URX_SETREF: case URX_BACKSLASH_D: + case URX_BACKSLASH_H: + case URX_BACKSLASH_R: + case URX_BACKSLASH_V: case URX_ONECHAR_I: case URX_DOTANY_ALL: case URX_DOTANY: @@ -3746,6 +3856,9 @@ void RegexCompile::stripNOPs() { case URX_LOOP_C: case URX_DOLLAR_D: case URX_DOLLAR_MD: + case URX_BACKSLASH_H: + case URX_BACKSLASH_R: + case URX_BACKSLASH_V: // These instructions are unaltered by the relocation. fRXPat->fCompiledPat->setElementAt(op, dst); dst++; diff --git a/icu4c/source/i18n/regexcst.h b/icu4c/source/i18n/regexcst.h index e754be4bd1a..3e62485f7c9 100644 --- a/icu4c/source/i18n/regexcst.h +++ b/icu4c/source/i18n/regexcst.h @@ -16,108 +16,117 @@ U_NAMESPACE_BEGIN // // Character classes for regex pattern scanning. // - static const uint8_t kRuleSet_digit_char = 128; - static const uint8_t kRuleSet_ascii_letter = 129; + static const uint8_t kRuleSet_ascii_letter = 128; + static const uint8_t kRuleSet_digit_char = 129; static const uint8_t kRuleSet_rule_char = 130; enum Regex_PatternParseAction { - doIntervalUpperDigit, - doPossessiveOpt, - doOpenLookBehindNeg, - doDotAny, - doSetBackslash_D, - doSetLiteral, - doSetBackslash_S, - doEscapeError, - doSetBackslash_W, - doDollar, - doBackslashb, - doSetOpError, - doBackslashG, - doPatStart, - doMismatchedParenErr, - doPossessivePlus, - doBackslashX, - doSetBackslash_s, - doSetBackslash_w, - doBackslashW, - doBackslashw, - doSetMatchMode, - doOrOperator, - doOpenLookAheadNeg, - doOpenLookBehind, - doBackslashS, - doBeginMatchMode, - doNOP, - doSetProp, - doBackslashA, - doIntervalInit, - doOpenCaptureParen, - doNGPlus, - doIntervalError, - doSetDifference2, - doNGOpt, - doEscapedLiteralChar, - doSetNegate, - doSetBegin, - doMatchModeParen, - doLiteralChar, - doOpt, - doSetIntersection2, - doBadOpenParenType, - doSuppressComments, - doCloseParen, - doPatFinish, - doSetBeginUnion, - doSetBackslash_d, - doProperty, - doNGInterval, - doNGStar, - doOpenLookAhead, - doSetBeginIntersection1, - doBeginNamedCapture, - doInterval, - doMatchMode, - doSetNoCloseError, - doSetBeginDifference1, - doPlus, - doBackslashD, - doSetLiteralEscaped, - doContinueNamedCapture, - doSetPosixProp, - doBackslashz, - doSetNamedRange, - doPossessiveStar, - doBadModeFlag, - doContinueNamedBackRef, - doPerlInline, - doBackslashd, - doOpenNonCaptureParen, - doSetEnd, - doSetAddDash, - doSetFinish, - doCaret, - doConditionalExpr, - doExit, - doNamedChar, - doSetRange, - doPossessiveInterval, - doBackslashs, - doIntervalSame, - doEnterQuoteMode, - doOpenAtomicParen, - doSetNamedChar, - doRuleError, - doStar, - doSetAddAmp, - doBackslashB, - doCompleteNamedBackRef, - doBackslashZ, - doIntevalLowerDigit, + doSetBackslash_V, + doSetBackslash_h, doBeginNamedBackRef, - doBackRef, + doSetMatchMode, + doEnterQuoteMode, + doOpenCaptureParen, + doContinueNamedCapture, + doSetBackslash_d, + doBeginMatchMode, + doBackslashX, + doSetPosixProp, + doIntervalError, + doSetLiteralEscaped, + doSetBackslash_s, + doNOP, + doBackslashv, + doOpenLookBehind, + doPatStart, + doPossessiveInterval, + doOpenAtomicParen, + doOpenLookAheadNeg, + doBackslashd, + doBackslashZ, + doIntervalUpperDigit, doBadNamedCapture, + doSetDifference2, + doSetAddAmp, + doSetNamedChar, + doNamedChar, + doSetBackslash_H, + doBackslashb, + doBackslashz, + doSetBeginDifference1, + doOpenLookAhead, + doMatchModeParen, + doBackslashV, + doIntevalLowerDigit, + doCaret, + doSetEnd, + doSetNegate, + doBackslashS, + doOrOperator, + doBackslashB, + doBackslashw, + doBackslashR, + doRuleError, + doDotAny, + doMatchMode, + doSetBackslash_W, + doNGPlus, + doSetBackslash_D, + doPossessiveOpt, + doSetNamedRange, + doConditionalExpr, + doBackslashs, + doPossessiveStar, + doPlus, + doBadOpenParenType, + doCloseParen, + doNGInterval, + doSetProp, + doBackRef, + doSetBeginUnion, + doEscapeError, + doOpt, + doSetBeginIntersection1, + doPossessivePlus, + doBackslashD, + doOpenLookBehindNeg, + doSetBegin, + doSetIntersection2, + doCompleteNamedBackRef, + doSetRange, + doDollar, + doBackslashH, + doExit, + doNGOpt, + doOpenNonCaptureParen, + doBackslashA, + doSetBackslash_v, + doBackslashh, + doBadModeFlag, + doSetNoCloseError, + doIntervalSame, + doSetAddDash, + doBackslashW, + doPerlInline, + doSetOpError, + doSetLiteral, + doPatFinish, + doBeginNamedCapture, + doEscapedLiteralChar, + doLiteralChar, + doSuppressComments, + doMismatchedParenErr, + doNGStar, + doSetFinish, + doInterval, + doBackslashG, + doStar, + doSetBackslash_w, + doSetBackslash_S, + doProperty, + doContinueNamedBackRef, + doIntervalInit, rbbiLastAction}; //------------------------------------------------------------------------------- @@ -140,7 +149,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doPatStart, 255, 2,0, FALSE} // 1 start , {doLiteralChar, 254, 14,0, TRUE} // 2 term , {doLiteralChar, 130, 14,0, TRUE} // 3 - , {doSetBegin, 91 /* [ */, 118, 196, TRUE} // 4 + , {doSetBegin, 91 /* [ */, 123, 205, TRUE} // 4 , {doNOP, 40 /* ( */, 27,0, TRUE} // 5 , {doDotAny, 46 /* . */, 14,0, TRUE} // 6 , {doCaret, 94 /* ^ */, 14,0, TRUE} // 7 @@ -149,7 +158,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doOrOperator, 124 /* | */, 2,0, TRUE} // 10 , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11 , {doPatFinish, 253, 2,0, FALSE} // 12 - , {doRuleError, 255, 197,0, FALSE} // 13 + , {doRuleError, 255, 206,0, FALSE} // 13 , {doNOP, 42 /* * */, 68,0, TRUE} // 14 expr-quant , {doNOP, 43 /* + */, 71,0, TRUE} // 15 , {doNOP, 63 /* ? */, 74,0, TRUE} // 16 @@ -179,15 +188,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doBeginMatchMode, 119 /* w */, 53,0, FALSE} // 40 , {doBeginMatchMode, 120 /* x */, 53,0, FALSE} // 41 , {doBeginMatchMode, 45 /* - */, 53,0, FALSE} // 42 - , {doConditionalExpr, 40 /* ( */, 197,0, TRUE} // 43 - , {doPerlInline, 123 /* { */, 197,0, TRUE} // 44 - , {doBadOpenParenType, 255, 197,0, FALSE} // 45 + , {doConditionalExpr, 40 /* ( */, 206,0, TRUE} // 43 + , {doPerlInline, 123 /* { */, 206,0, TRUE} // 44 + , {doBadOpenParenType, 255, 206,0, FALSE} // 45 , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47 - , {doBeginNamedCapture, 129, 64,0, FALSE} // 48 - , {doBadOpenParenType, 255, 197,0, FALSE} // 49 + , {doBeginNamedCapture, 128, 64,0, FALSE} // 48 + , {doBadOpenParenType, 255, 206,0, FALSE} // 49 , {doNOP, 41 /* ) */, 255,0, TRUE} // 50 paren-comment - , {doMismatchedParenErr, 253, 197,0, FALSE} // 51 + , {doMismatchedParenErr, 253, 206,0, FALSE} // 51 , {doNOP, 255, 50,0, TRUE} // 52 , {doMatchMode, 105 /* i */, 53,0, TRUE} // 53 paren-flag , {doMatchMode, 100 /* d */, 53,0, TRUE} // 54 @@ -199,11 +208,11 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doMatchMode, 45 /* - */, 53,0, TRUE} // 60 , {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 61 , {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 62 - , {doBadModeFlag, 255, 197,0, FALSE} // 63 - , {doContinueNamedCapture, 129, 64,0, TRUE} // 64 named-capture - , {doContinueNamedCapture, 128, 64,0, TRUE} // 65 + , {doBadModeFlag, 255, 206,0, FALSE} // 63 + , {doContinueNamedCapture, 128, 64,0, TRUE} // 64 named-capture + , {doContinueNamedCapture, 129, 64,0, TRUE} // 65 , {doOpenCaptureParen, 62 /* > */, 2, 14, TRUE} // 66 - , {doBadNamedCapture, 255, 197,0, FALSE} // 67 + , {doBadNamedCapture, 255, 206,0, FALSE} // 67 , {doNGStar, 63 /* ? */, 20,0, TRUE} // 68 quant-star , {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 69 , {doStar, 255, 20,0, FALSE} // 70 @@ -213,15 +222,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doNGOpt, 63 /* ? */, 20,0, TRUE} // 74 quant-opt , {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 75 , {doOpt, 255, 20,0, FALSE} // 76 - , {doNOP, 128, 79,0, FALSE} // 77 interval-open - , {doIntervalError, 255, 197,0, FALSE} // 78 - , {doIntevalLowerDigit, 128, 79,0, TRUE} // 79 interval-lower + , {doNOP, 129, 79,0, FALSE} // 77 interval-open + , {doIntervalError, 255, 206,0, FALSE} // 78 + , {doIntevalLowerDigit, 129, 79,0, TRUE} // 79 interval-lower , {doNOP, 44 /* , */, 83,0, TRUE} // 80 , {doIntervalSame, 125 /* } */, 86,0, TRUE} // 81 - , {doIntervalError, 255, 197,0, FALSE} // 82 - , {doIntervalUpperDigit, 128, 83,0, TRUE} // 83 interval-upper + , {doIntervalError, 255, 206,0, FALSE} // 82 + , {doIntervalUpperDigit, 129, 83,0, TRUE} // 83 interval-upper , {doNOP, 125 /* } */, 86,0, TRUE} // 84 - , {doIntervalError, 255, 197,0, FALSE} // 85 + , {doIntervalError, 255, 206,0, FALSE} // 85 , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 86 interval-type , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 87 , {doInterval, 255, 20,0, FALSE} // 88 @@ -231,109 +240,118 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doBackslashd, 100 /* d */, 14,0, TRUE} // 92 , {doBackslashD, 68 /* D */, 14,0, TRUE} // 93 , {doBackslashG, 71 /* G */, 2,0, TRUE} // 94 - , {doNOP, 107 /* k */, 110,0, TRUE} // 95 - , {doNamedChar, 78 /* N */, 14,0, FALSE} // 96 - , {doProperty, 112 /* p */, 14,0, FALSE} // 97 - , {doProperty, 80 /* P */, 14,0, FALSE} // 98 - , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 99 - , {doBackslashS, 83 /* S */, 14,0, TRUE} // 100 - , {doBackslashs, 115 /* s */, 14,0, TRUE} // 101 - , {doBackslashW, 87 /* W */, 14,0, TRUE} // 102 - , {doBackslashw, 119 /* w */, 14,0, TRUE} // 103 - , {doBackslashX, 88 /* X */, 14,0, TRUE} // 104 - , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 105 - , {doBackslashz, 122 /* z */, 2,0, TRUE} // 106 - , {doBackRef, 128, 14,0, TRUE} // 107 - , {doEscapeError, 253, 197,0, FALSE} // 108 - , {doEscapedLiteralChar, 255, 14,0, TRUE} // 109 - , {doBeginNamedBackRef, 60 /* < */, 112,0, TRUE} // 110 named-backref - , {doBadNamedCapture, 255, 197,0, FALSE} // 111 - , {doContinueNamedBackRef, 129, 114,0, TRUE} // 112 named-backref-2 - , {doBadNamedCapture, 255, 197,0, FALSE} // 113 - , {doContinueNamedBackRef, 129, 114,0, TRUE} // 114 named-backref-3 - , {doContinueNamedBackRef, 128, 114,0, TRUE} // 115 - , {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 116 - , {doBadNamedCapture, 255, 197,0, FALSE} // 117 - , {doSetNegate, 94 /* ^ */, 121,0, TRUE} // 118 set-open - , {doSetPosixProp, 58 /* : */, 123,0, FALSE} // 119 - , {doNOP, 255, 121,0, FALSE} // 120 - , {doSetLiteral, 93 /* ] */, 136,0, TRUE} // 121 set-open2 - , {doNOP, 255, 126,0, FALSE} // 122 - , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 123 set-posix - , {doNOP, 58 /* : */, 126,0, FALSE} // 124 - , {doRuleError, 255, 197,0, FALSE} // 125 - , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 126 set-start - , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 127 - , {doNOP, 92 /* \ */, 186,0, TRUE} // 128 - , {doNOP, 45 /* - */, 132,0, TRUE} // 129 - , {doNOP, 38 /* & */, 134,0, TRUE} // 130 - , {doSetLiteral, 255, 136,0, TRUE} // 131 - , {doRuleError, 45 /* - */, 197,0, FALSE} // 132 set-start-dash - , {doSetAddDash, 255, 136,0, FALSE} // 133 - , {doRuleError, 38 /* & */, 197,0, FALSE} // 134 set-start-amp - , {doSetAddAmp, 255, 136,0, FALSE} // 135 - , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 136 set-after-lit - , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 137 - , {doNOP, 45 /* - */, 173,0, TRUE} // 138 - , {doNOP, 38 /* & */, 164,0, TRUE} // 139 - , {doNOP, 92 /* \ */, 186,0, TRUE} // 140 - , {doSetNoCloseError, 253, 197,0, FALSE} // 141 - , {doSetLiteral, 255, 136,0, TRUE} // 142 - , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 143 set-after-set - , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 144 - , {doNOP, 45 /* - */, 166,0, TRUE} // 145 - , {doNOP, 38 /* & */, 161,0, TRUE} // 146 - , {doNOP, 92 /* \ */, 186,0, TRUE} // 147 - , {doSetNoCloseError, 253, 197,0, FALSE} // 148 - , {doSetLiteral, 255, 136,0, TRUE} // 149 - , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 150 set-after-range - , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 151 - , {doNOP, 45 /* - */, 169,0, TRUE} // 152 - , {doNOP, 38 /* & */, 171,0, TRUE} // 153 - , {doNOP, 92 /* \ */, 186,0, TRUE} // 154 - , {doSetNoCloseError, 253, 197,0, FALSE} // 155 - , {doSetLiteral, 255, 136,0, TRUE} // 156 - , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 157 set-after-op - , {doSetOpError, 93 /* ] */, 197,0, FALSE} // 158 - , {doNOP, 92 /* \ */, 186,0, TRUE} // 159 - , {doSetLiteral, 255, 136,0, TRUE} // 160 - , {doSetBeginIntersection1, 91 /* [ */, 118, 143, TRUE} // 161 set-set-amp - , {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 162 - , {doSetAddAmp, 255, 136,0, FALSE} // 163 - , {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 164 set-lit-amp - , {doSetAddAmp, 255, 136,0, FALSE} // 165 - , {doSetBeginDifference1, 91 /* [ */, 118, 143, TRUE} // 166 set-set-dash - , {doSetDifference2, 45 /* - */, 157,0, TRUE} // 167 - , {doSetAddDash, 255, 136,0, FALSE} // 168 - , {doSetDifference2, 45 /* - */, 157,0, TRUE} // 169 set-range-dash - , {doSetAddDash, 255, 136,0, FALSE} // 170 - , {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 171 set-range-amp - , {doSetAddAmp, 255, 136,0, FALSE} // 172 - , {doSetDifference2, 45 /* - */, 157,0, TRUE} // 173 set-lit-dash - , {doSetAddDash, 91 /* [ */, 136,0, FALSE} // 174 - , {doSetAddDash, 93 /* ] */, 136,0, FALSE} // 175 - , {doNOP, 92 /* \ */, 178,0, TRUE} // 176 - , {doSetRange, 255, 150,0, TRUE} // 177 - , {doSetOpError, 115 /* s */, 197,0, FALSE} // 178 set-lit-dash-escape - , {doSetOpError, 83 /* S */, 197,0, FALSE} // 179 - , {doSetOpError, 119 /* w */, 197,0, FALSE} // 180 - , {doSetOpError, 87 /* W */, 197,0, FALSE} // 181 - , {doSetOpError, 100 /* d */, 197,0, FALSE} // 182 - , {doSetOpError, 68 /* D */, 197,0, FALSE} // 183 - , {doSetNamedRange, 78 /* N */, 150,0, FALSE} // 184 - , {doSetRange, 255, 150,0, TRUE} // 185 - , {doSetProp, 112 /* p */, 143,0, FALSE} // 186 set-escape - , {doSetProp, 80 /* P */, 143,0, FALSE} // 187 - , {doSetNamedChar, 78 /* N */, 136,0, FALSE} // 188 - , {doSetBackslash_s, 115 /* s */, 150,0, TRUE} // 189 - , {doSetBackslash_S, 83 /* S */, 150,0, TRUE} // 190 - , {doSetBackslash_w, 119 /* w */, 150,0, TRUE} // 191 - , {doSetBackslash_W, 87 /* W */, 150,0, TRUE} // 192 - , {doSetBackslash_d, 100 /* d */, 150,0, TRUE} // 193 - , {doSetBackslash_D, 68 /* D */, 150,0, TRUE} // 194 - , {doSetLiteralEscaped, 255, 136,0, TRUE} // 195 - , {doSetFinish, 255, 14,0, FALSE} // 196 set-finish - , {doExit, 255, 197,0, TRUE} // 197 errorDeath + , {doBackslashh, 104 /* h */, 14,0, TRUE} // 95 + , {doBackslashH, 72 /* H */, 14,0, TRUE} // 96 + , {doNOP, 107 /* k */, 115,0, TRUE} // 97 + , {doNamedChar, 78 /* N */, 14,0, FALSE} // 98 + , {doProperty, 112 /* p */, 14,0, FALSE} // 99 + , {doProperty, 80 /* P */, 14,0, FALSE} // 100 + , {doBackslashR, 82 /* R */, 14,0, TRUE} // 101 + , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 102 + , {doBackslashS, 83 /* S */, 14,0, TRUE} // 103 + , {doBackslashs, 115 /* s */, 14,0, TRUE} // 104 + , {doBackslashv, 118 /* v */, 14,0, TRUE} // 105 + , {doBackslashV, 86 /* V */, 14,0, TRUE} // 106 + , {doBackslashW, 87 /* W */, 14,0, TRUE} // 107 + , {doBackslashw, 119 /* w */, 14,0, TRUE} // 108 + , {doBackslashX, 88 /* X */, 14,0, TRUE} // 109 + , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 110 + , {doBackslashz, 122 /* z */, 2,0, TRUE} // 111 + , {doBackRef, 129, 14,0, TRUE} // 112 + , {doEscapeError, 253, 206,0, FALSE} // 113 + , {doEscapedLiteralChar, 255, 14,0, TRUE} // 114 + , {doBeginNamedBackRef, 60 /* < */, 117,0, TRUE} // 115 named-backref + , {doBadNamedCapture, 255, 206,0, FALSE} // 116 + , {doContinueNamedBackRef, 128, 119,0, TRUE} // 117 named-backref-2 + , {doBadNamedCapture, 255, 206,0, FALSE} // 118 + , {doContinueNamedBackRef, 128, 119,0, TRUE} // 119 named-backref-3 + , {doContinueNamedBackRef, 129, 119,0, TRUE} // 120 + , {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 121 + , {doBadNamedCapture, 255, 206,0, FALSE} // 122 + , {doSetNegate, 94 /* ^ */, 126,0, TRUE} // 123 set-open + , {doSetPosixProp, 58 /* : */, 128,0, FALSE} // 124 + , {doNOP, 255, 126,0, FALSE} // 125 + , {doSetLiteral, 93 /* ] */, 141,0, TRUE} // 126 set-open2 + , {doNOP, 255, 131,0, FALSE} // 127 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 128 set-posix + , {doNOP, 58 /* : */, 131,0, FALSE} // 129 + , {doRuleError, 255, 206,0, FALSE} // 130 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 131 set-start + , {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 132 + , {doNOP, 92 /* \ */, 191,0, TRUE} // 133 + , {doNOP, 45 /* - */, 137,0, TRUE} // 134 + , {doNOP, 38 /* & */, 139,0, TRUE} // 135 + , {doSetLiteral, 255, 141,0, TRUE} // 136 + , {doRuleError, 45 /* - */, 206,0, FALSE} // 137 set-start-dash + , {doSetAddDash, 255, 141,0, FALSE} // 138 + , {doRuleError, 38 /* & */, 206,0, FALSE} // 139 set-start-amp + , {doSetAddAmp, 255, 141,0, FALSE} // 140 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 141 set-after-lit + , {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 142 + , {doNOP, 45 /* - */, 178,0, TRUE} // 143 + , {doNOP, 38 /* & */, 169,0, TRUE} // 144 + , {doNOP, 92 /* \ */, 191,0, TRUE} // 145 + , {doSetNoCloseError, 253, 206,0, FALSE} // 146 + , {doSetLiteral, 255, 141,0, TRUE} // 147 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 148 set-after-set + , {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 149 + , {doNOP, 45 /* - */, 171,0, TRUE} // 150 + , {doNOP, 38 /* & */, 166,0, TRUE} // 151 + , {doNOP, 92 /* \ */, 191,0, TRUE} // 152 + , {doSetNoCloseError, 253, 206,0, FALSE} // 153 + , {doSetLiteral, 255, 141,0, TRUE} // 154 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 155 set-after-range + , {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 156 + , {doNOP, 45 /* - */, 174,0, TRUE} // 157 + , {doNOP, 38 /* & */, 176,0, TRUE} // 158 + , {doNOP, 92 /* \ */, 191,0, TRUE} // 159 + , {doSetNoCloseError, 253, 206,0, FALSE} // 160 + , {doSetLiteral, 255, 141,0, TRUE} // 161 + , {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 162 set-after-op + , {doSetOpError, 93 /* ] */, 206,0, FALSE} // 163 + , {doNOP, 92 /* \ */, 191,0, TRUE} // 164 + , {doSetLiteral, 255, 141,0, TRUE} // 165 + , {doSetBeginIntersection1, 91 /* [ */, 123, 148, TRUE} // 166 set-set-amp + , {doSetIntersection2, 38 /* & */, 162,0, TRUE} // 167 + , {doSetAddAmp, 255, 141,0, FALSE} // 168 + , {doSetIntersection2, 38 /* & */, 162,0, TRUE} // 169 set-lit-amp + , {doSetAddAmp, 255, 141,0, FALSE} // 170 + , {doSetBeginDifference1, 91 /* [ */, 123, 148, TRUE} // 171 set-set-dash + , {doSetDifference2, 45 /* - */, 162,0, TRUE} // 172 + , {doSetAddDash, 255, 141,0, FALSE} // 173 + , {doSetDifference2, 45 /* - */, 162,0, TRUE} // 174 set-range-dash + , {doSetAddDash, 255, 141,0, FALSE} // 175 + , {doSetIntersection2, 38 /* & */, 162,0, TRUE} // 176 set-range-amp + , {doSetAddAmp, 255, 141,0, FALSE} // 177 + , {doSetDifference2, 45 /* - */, 162,0, TRUE} // 178 set-lit-dash + , {doSetAddDash, 91 /* [ */, 141,0, FALSE} // 179 + , {doSetAddDash, 93 /* ] */, 141,0, FALSE} // 180 + , {doNOP, 92 /* \ */, 183,0, TRUE} // 181 + , {doSetRange, 255, 155,0, TRUE} // 182 + , {doSetOpError, 115 /* s */, 206,0, FALSE} // 183 set-lit-dash-escape + , {doSetOpError, 83 /* S */, 206,0, FALSE} // 184 + , {doSetOpError, 119 /* w */, 206,0, FALSE} // 185 + , {doSetOpError, 87 /* W */, 206,0, FALSE} // 186 + , {doSetOpError, 100 /* d */, 206,0, FALSE} // 187 + , {doSetOpError, 68 /* D */, 206,0, FALSE} // 188 + , {doSetNamedRange, 78 /* N */, 155,0, FALSE} // 189 + , {doSetRange, 255, 155,0, TRUE} // 190 + , {doSetProp, 112 /* p */, 148,0, FALSE} // 191 set-escape + , {doSetProp, 80 /* P */, 148,0, FALSE} // 192 + , {doSetNamedChar, 78 /* N */, 141,0, FALSE} // 193 + , {doSetBackslash_s, 115 /* s */, 155,0, TRUE} // 194 + , {doSetBackslash_S, 83 /* S */, 155,0, TRUE} // 195 + , {doSetBackslash_w, 119 /* w */, 155,0, TRUE} // 196 + , {doSetBackslash_W, 87 /* W */, 155,0, TRUE} // 197 + , {doSetBackslash_d, 100 /* d */, 155,0, TRUE} // 198 + , {doSetBackslash_D, 68 /* D */, 155,0, TRUE} // 199 + , {doSetBackslash_h, 104 /* h */, 155,0, TRUE} // 200 + , {doSetBackslash_H, 72 /* H */, 155,0, TRUE} // 201 + , {doSetBackslash_v, 118 /* v */, 155,0, TRUE} // 202 + , {doSetBackslash_V, 86 /* V */, 155,0, TRUE} // 203 + , {doSetLiteralEscaped, 255, 141,0, TRUE} // 204 + , {doSetFinish, 255, 14,0, FALSE} // 205 set-finish + , {doExit, 255, 206,0, TRUE} // 206 errorDeath }; static const char * const RegexStateNames[] = { 0, "start", @@ -444,6 +462,11 @@ static const char * const RegexStateNames[] = { 0, 0, 0, 0, + 0, + 0, + 0, + 0, + 0, 0, "named-backref", 0, @@ -530,6 +553,10 @@ static const char * const RegexStateNames[] = { 0, 0, 0, 0, + 0, + 0, + 0, + 0, 0, "set-finish", "errorDeath", diff --git a/icu4c/source/i18n/regexcst.pl b/icu4c/source/i18n/regexcst.pl index f1dc06af7ec..d52093629c5 100755 --- a/icu4c/source/i18n/regexcst.pl +++ b/icu4c/source/i18n/regexcst.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl # ******************************************************************** # * COPYRIGHT: -# * Copyright (c) 2002-2007, International Business Machines Corporation and +# * Copyright (c) 2002-2015, International Business Machines Corporation and # * others. All Rights Reserved. # ******************************************************************** # @@ -206,7 +206,7 @@ print "// This file contains the state table for the ICU Regular Expression P print "// It is generated by the Perl script \"regexcst.pl\" from\n"; print "// the rule parser state definitions file \"regexcst.txt\".\n"; print "//\n"; -print "// Copyright (C) 2002-2007 International Business Machines Corporation \n"; +print "// Copyright (C) 2002-2015 International Business Machines Corporation \n"; print "// and others. All rights reserved. \n"; print "//\n"; print "//---------------------------------------------------------------------------------\n"; diff --git a/icu4c/source/i18n/regexcst.txt b/icu4c/source/i18n/regexcst.txt index fe9bc6e74cb..1b88e446540 100644 --- a/icu4c/source/i18n/regexcst.txt +++ b/icu4c/source/i18n/regexcst.txt @@ -250,13 +250,18 @@ backslash: 'd' n expr-quant doBackslashd 'D' n expr-quant doBackslashD 'G' n term doBackslashG + 'h' n expr-quant doBackslashh + 'H' n expr-quant doBackslashH 'k' n named-backref 'N' expr-quant doNamedChar # \N{NAME} named char 'p' expr-quant doProperty # \p{Lu} style property 'P' expr-quant doProperty + 'R' n expr-quant doBackslashR 'Q' n term doEnterQuoteMode 'S' n expr-quant doBackslashS 's' n expr-quant doBackslashs + 'v' n expr-quant doBackslashv + 'V' n expr-quant doBackslashV 'W' n expr-quant doBackslashW 'w' n expr-quant doBackslashw 'X' n expr-quant doBackslashX @@ -472,6 +477,10 @@ set-escape: 'W' n set-after-range doSetBackslash_W 'd' n set-after-range doSetBackslash_d 'D' n set-after-range doSetBackslash_D + 'h' n set-after-range doSetBackslash_h + 'H' n set-after-range doSetBackslash_H + 'v' n set-after-range doSetBackslash_v + 'V' n set-after-range doSetBackslash_V default n set-after-lit doSetLiteralEscaped # diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index fdd9c76e6f4..52ea662633e 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2002-2014 International Business Machines Corporation +// Copyright (C) 2002-2015 International Business Machines Corporation // and others. All rights reserved. // // file: regeximp.h @@ -173,7 +173,10 @@ enum { URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style // word boundaries. URX_DOLLAR_D = 54, // $ end of input test, in UNIX_LINES mode. - URX_DOLLAR_MD = 55 // $ end of input test, in MULTI_LINE and UNIX_LINES mode. + URX_DOLLAR_MD = 55, // $ end of input test, in MULTI_LINE and UNIX_LINES mode. + URX_BACKSLASH_H = 56, // Value field: 0: \h 1: \H + URX_BACKSLASH_R = 57, // Any line break sequence. + URX_BACKSLASH_V = 58 // Value field: 0: \v 1: \V }; @@ -235,7 +238,10 @@ enum { "LOOP_DOT_I", \ "BACKSLASH_BU", \ "DOLLAR_D", \ - "DOLLAR_MD" + "DOLLAR_MD", \ + "URX_BACKSLASH_H", \ + "URX_BACKSLASH_R", \ + "URX_BACKSLASH_V" // diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 41330332a50..341d29f13a8 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -49,6 +49,15 @@ static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; // This constant determines that state saves per tick number. static const int32_t TIMER_INITIAL_VALUE = 10000; + +// Test for any of the Unicode line terminating characters. +static inline UBool isLineTerminator(UChar32 c) { + if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) { + return false; + } + return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029; +} + //----------------------------------------------------------------------------- // // Constructor and Destructor @@ -837,20 +846,19 @@ UBool RegexMatcher::find(UErrorCode &status) { } } else { for (;;) { - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { - if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { - (void)UTEXT_NEXT32(fInputText); - startPos = UTEXT_GETNATIVEINDEX(fInputText); - } - MatchAt(startPos, FALSE, status); - if (U_FAILURE(status)) { - return FALSE; - } - if (fMatch) { - return TRUE; - } - UTEXT_SETNATIVEINDEX(fInputText, startPos); + if (isLineTerminator(c)) { + if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { + (void)UTEXT_NEXT32(fInputText); + startPos = UTEXT_GETNATIVEINDEX(fInputText); + } + MatchAt(startPos, FALSE, status); + if (U_FAILURE(status)) { + return FALSE; + } + if (fMatch) { + return TRUE; + } + UTEXT_SETNATIVEINDEX(fInputText, startPos); } if (startPos >= testStartLimit) { fMatch = FALSE; @@ -1098,8 +1106,7 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) { } else { for (;;) { c = inputBuf[startPos-1]; - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { + if (isLineTerminator(c)) { if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { startPos++; } @@ -2927,9 +2934,9 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { // end of input, succeed. UChar32 c = UTEXT_NEXT32(fInputText); if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { - if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { + if (isLineTerminator(c)) { // If not in the middle of a CR/LF sequence - if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { + if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { // At new-line at end of input. Success fHitEnd = TRUE; fRequireEnd = TRUE; @@ -2985,7 +2992,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { // It makes no difference where the new-line is within the input. UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_CURRENT32(fInputText); - if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { + if (isLineTerminator(c)) { // At a line end, except for the odd chance of being in the middle of a CR/LF sequence // In multi-line mode, hitting a new-line just before the end of input does not // set the hitEnd or requireEnd flags @@ -3034,8 +3041,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { // unless we are at the end of input UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_PREVIOUS32(fInputText); - if ((fp->fInputIdx < fAnchorLimit) && - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) { // It's a new-line. ^ is true. Success. // TODO: what should be done with positions between a CR and LF? break; @@ -3116,6 +3122,68 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { break; + case URX_BACKSLASH_H: // Test for \h, horizontal white space. + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); + int8_t ctype = u_charType(c); + UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB + success ^= (UBool)(opValue != 0); // flip sense for \H + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_R: // Test for \R, any line break sequence. + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); + if (isLineTerminator(c)) { + if (c == 0x0d && utext_current32(fInputText) == 0x0a) { + utext_next32(fInputText); + } + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_V: // \v, any single line ending character. + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); + UChar32 c = UTEXT_NEXT32(fInputText); + UBool success = isLineTerminator(c); + success ^= (UBool)(opValue != 0); // flip sense for \V + if (success) { + fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + case URX_BACKSLASH_X: // Match a Grapheme, as defined by Unicode TR 29. // Differs slightly from Perl, which consumes combining marks independently @@ -3343,8 +3411,7 @@ GC_Done: // There is input left. Advance over one char, unless we've hit end-of-line UChar32 c = UTEXT_NEXT32(fInputText); - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + if (isLineTerminator(c)) { // End of line in normal mode. . does not match. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -4101,7 +4168,7 @@ GC_Done: if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s if ((c == 0x0a) || // 0x0a is newline in both modes. (((opValue & 2) == 0) && // IF not UNIX_LINES mode - (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { + isLineTerminator(c))) { // char is a line ending. Exit the scanning loop. break; } @@ -4432,7 +4499,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu UChar32 c; U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); - if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { + if (isLineTerminator(c)) { if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { // At new-line at end of input. Success fHitEnd = TRUE; @@ -4486,7 +4553,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu // If we are positioned just before a new-line, succeed. // It makes no difference where the new-line is within the input. UChar32 c = inputBuf[fp->fInputIdx]; - if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { + if (isLineTerminator(c)) { // At a line end, except for the odd chance of being in the middle of a CR/LF sequence // In multi-line mode, hitting a new-line just before the end of input does not // set the hitEnd or requireEnd flags @@ -4534,7 +4601,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu // unless we are at the end of input UChar c = inputBuf[fp->fInputIdx - 1]; if ((fp->fInputIdx < fAnchorLimit) && - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + isLineTerminator(c)) { // It's a new-line. ^ is true. Success. // TODO: what should be done with positions between a CR and LF? break; @@ -4611,6 +4678,69 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu break; + case URX_BACKSLASH_H: // Test for \h, horizontal white space. + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + int8_t ctype = u_charType(c); + UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB + success ^= (UBool)(opValue != 0); // flip sense for \H + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_R: // Test for \R, any line break sequence. + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + if (isLineTerminator(c)) { + if (c == 0x0d && fp->fInputIdx < fActiveLimit) { + // Check for CR/LF sequence. Consume both together when found. + UChar c2; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2); + if (c2 != 0x0a) { + U16_PREV(inputBuf, 0, fp->fInputIdx, c2); + } + } + } else { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_V: // Any single code point line ending. + { + if (fp->fInputIdx >= fActiveLimit) { + fHitEnd = TRUE; + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + break; + } + UChar32 c; + U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); + UBool success = isLineTerminator(c); + success ^= (UBool)(opValue != 0); // flip sense for \V + if (!success) { + fp = (REStackFrame *)fStack->popFrame(fFrameSize); + } + } + break; + + + case URX_BACKSLASH_X: // Match a Grapheme, as defined by Unicode TR 29. // Differs slightly from Perl, which consumes combining marks independently @@ -4820,8 +4950,7 @@ GC_Done: // There is input left. Advance over one char, unless we've hit end-of-line UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); - if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible - ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { + if (isLineTerminator(c)) { // End of line in normal mode. . does not match. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; @@ -5535,7 +5664,7 @@ GC_Done: if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s if ((c == 0x0a) || // 0x0a is newline in both modes. (((opValue & 2) == 0) && // IF not UNIX_LINES mode - ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { + isLineTerminator(c))) { // char is a line ending. Put the input pos back to the // line ending char, and exit the scanning loop. U16_BACK_1(inputBuf, 0, ix); diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index 14454e25f8f..58650d11374 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -742,6 +742,9 @@ void RegexPattern::dumpOp(int32_t index) const { case URX_LBN_END: case URX_LOOP_C: case URX_LOOP_DOT_I: + case URX_BACKSLASH_H: + case URX_BACKSLASH_R: + case URX_BACKSLASH_V: // types with an integer operand field. printf("%d", val); break; diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index e0f8b27d758..15d13bf156e 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -693,6 +693,78 @@ "abc\jkl" "<0>abcjkl" # escape of a non-special letter is just itself. "abc[ \j]kl" "<0>abcjkl" +# +# \R all newline sequences. +# +"abc\Rxyz" "<0>abc\u000axyzgh" +"abc\Rxyz" "<0>abc\u000bxyzgh" +"abc\Rxyz" "<0>abc\u000cxyzgh" +"abc\Rxyz" "<0>abc\u000dxyzgh" +"abc\Rxyz" "<0>abc\u0085xyzgh" +"abc\Rxyz" "<0>abc\u2028xyzgh" +"abc\Rxyz" "<0>abc\u2029xyzgh" +"abc\Rxyz" "<0>abc\u000d\u000axyzgh" + +"abc\R\nxyz" "abc\u000d\u000axyzgh" # \R cannot match only the CR from a CR/LF sequence. +"abc\r\nxyz" "<0>abc\u000d\u000axyzgh" + +"abc\Rxyz" "abc\u0009xyz" # Assorted non-matches. +"abc\Rxyz" "abc\u000exyz" +"abc\Rxyz" "abc\u202axyz" + +# \v \V single character new line sequences. + +"abc\vxyz" "<0>abc\u000axyzgh" +"abc\vxyz" "<0>abc\u000bxyzgh" +"abc\vxyz" "<0>abc\u000cxyzgh" +"abc\vxyz" "<0>abc\u000dxyzgh" +"abc\vxyz" "<0>abc\u0085xyzgh" +"abc\vxyz" "<0>abc\u2028xyzgh" +"abc\vxyz" "<0>abc\u2029xyzgh" +"abc\vxyz" "abc\u000d\u000axyzgh" +"abc\vxyz" "abc?xyzgh" + +"abc[\v]xyz" "<0>abc\u000axyzgh" +"abc[\v]xyz" "<0>abc\u000bxyzgh" +"abc[\v]xyz" "<0>abc\u000cxyzgh" +"abc[\v]xyz" "<0>abc\u000dxyzgh" +"abc[\v]xyz" "<0>abc\u0085xyzgh" +"abc[\v]xyz" "<0>abc\u2028xyzgh" +"abc[\v]xyz" "<0>abc\u2029xyzgh" +"abc[\v]xyz" "abc\u000d\u000axyzgh" +"abc[\v]xyz" "abc?xyzgh" + +"abc\Vxyz" "abc\u000axyzgh" +"abc\Vxyz" "abc\u000bxyzgh" +"abc\Vxyz" "abc\u000cxyzgh" +"abc\Vxyz" "abc\u000dxyzgh" +"abc\Vxyz" "abc\u0085xyzgh" +"abc\Vxyz" "abc\u2028xyzgh" +"abc\Vxyz" "abc\u2029xyzgh" +"abc\Vxyz" "abc\u000d\u000axyzgh" +"abc\Vxyz" "<0>abc?xyzgh" + +# \h \H horizontal white space. Defined as gc=space_separator plus ascii tab + +"abc\hxyz" "<0>abc xyzgh" +"abc\Hxyz" "abc xyzgh" +"abc\hxyz" "<0>abc\u2003xyzgh" +"abc\Hxyz" "abc\u2003xyzgh" +"abc\hxyz" "<0>abc\u0009xyzgh" +"abc\Hxyz" "abc\u0009xyzgh" +"abc\hxyz" "abc?xyzgh" +"abc\Hxyz" "<0>abc?xyzgh" + +"abc[\h]xyz" "<0>abc xyzgh" +"abc[\H]xyz" "abc xyzgh" +"abc[\h]xyz" "<0>abc\u2003xyzgh" +"abc[\H]xyz" "abc\u2003xyzgh" +"abc[\h]xyz" "<0>abc\u0009xyzgh" +"abc[\H]xyz" "abc\u0009xyzgh" +"abc[\h]xyz" "abc?xyzgh" +"abc[\H]xyz" "<0>abc?xyzgh" + + # # Bug xxxx #