mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-11393 Regex, add pattern chars R v and h
X-SVN-Rev: 37057
This commit is contained in:
parent
77775f1b2d
commit
ce09d8a4bc
8 changed files with 611 additions and 252 deletions
|
@ -1188,6 +1188,21 @@ UBool RegexCompile::doParseActions(int32_t action)
|
|||
appendOp(URX_BACKSLASH_G, 0);
|
||||
break;
|
||||
|
||||
case doBackslashH:
|
||||
fixLiterals(FALSE);
|
||||
appendOp(URX_BACKSLASH_H, 1);
|
||||
break;
|
||||
|
||||
case doBackslashh:
|
||||
fixLiterals(FALSE);
|
||||
appendOp(URX_BACKSLASH_H, 0);
|
||||
break;
|
||||
|
||||
case doBackslashR:
|
||||
fixLiterals(FALSE);
|
||||
appendOp(URX_BACKSLASH_R, 0);
|
||||
break;
|
||||
|
||||
case doBackslashS:
|
||||
fixLiterals(FALSE);
|
||||
appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
|
||||
|
@ -1198,6 +1213,16 @@ UBool RegexCompile::doParseActions(int32_t action)
|
|||
appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
|
||||
break;
|
||||
|
||||
case doBackslashV:
|
||||
fixLiterals(FALSE);
|
||||
appendOp(URX_BACKSLASH_V, 1);
|
||||
break;
|
||||
|
||||
case doBackslashv:
|
||||
fixLiterals(FALSE);
|
||||
appendOp(URX_BACKSLASH_V, 0);
|
||||
break;
|
||||
|
||||
case doBackslashW:
|
||||
fixLiterals(FALSE);
|
||||
appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
|
||||
|
@ -1548,6 +1573,48 @@ UBool RegexCompile::doParseActions(int32_t action)
|
|||
break;
|
||||
}
|
||||
|
||||
case doSetBackslash_h:
|
||||
{
|
||||
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
||||
UnicodeSet h;
|
||||
h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
|
||||
h.add((UChar32)9); // Tab
|
||||
set->addAll(h);
|
||||
break;
|
||||
}
|
||||
|
||||
case doSetBackslash_H:
|
||||
{
|
||||
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
||||
UnicodeSet h;
|
||||
h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
|
||||
h.add((UChar32)9); // Tab
|
||||
h.complement();
|
||||
set->addAll(h);
|
||||
break;
|
||||
}
|
||||
|
||||
case doSetBackslash_v:
|
||||
{
|
||||
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
||||
set->add((UChar32)0x0a, (UChar32)0x0d); // add range
|
||||
set->add((UChar32)0x85);
|
||||
set->add((UChar32)0x2028, (UChar32)0x2029);
|
||||
break;
|
||||
}
|
||||
|
||||
case doSetBackslash_V:
|
||||
{
|
||||
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
||||
UnicodeSet v;
|
||||
v.add((UChar32)0x0a, (UChar32)0x0d); // add range
|
||||
v.add((UChar32)0x85);
|
||||
v.add((UChar32)0x2028, (UChar32)0x2029);
|
||||
v.complement();
|
||||
set->addAll(v);
|
||||
break;
|
||||
}
|
||||
|
||||
case doSetBackslash_w:
|
||||
{
|
||||
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
||||
|
@ -2749,6 +2816,43 @@ void RegexCompile::matchStartType() {
|
|||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_H:
|
||||
// Horiz white space
|
||||
if (currentLen == 0) {
|
||||
UnicodeSet s;
|
||||
s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
|
||||
s.add((UChar32)9); // Tab
|
||||
if (URX_VAL(op) != 0) {
|
||||
s.complement();
|
||||
}
|
||||
fRXPat->fInitialChars->addAll(s);
|
||||
numInitialStrings += 2;
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_R: // Any line ending sequence
|
||||
case URX_BACKSLASH_V: // Any line ending code point, with optional negation
|
||||
if (currentLen == 0) {
|
||||
UnicodeSet s;
|
||||
s.add((UChar32)0x0a, (UChar32)0x0d); // add range
|
||||
s.add((UChar32)0x85);
|
||||
s.add((UChar32)0x2028, (UChar32)0x2029);
|
||||
if (URX_VAL(op) != 0) {
|
||||
// Complement option applies to URX_BACKSLASH_V only.
|
||||
s.complement();
|
||||
}
|
||||
fRXPat->fInitialChars->addAll(s);
|
||||
numInitialStrings += 2;
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_ONECHAR_I:
|
||||
// Case Insensitive Single Character.
|
||||
if (currentLen == 0) {
|
||||
|
@ -3137,6 +3241,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
|||
case URX_STAT_SETREF_N:
|
||||
case URX_SETREF:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_BACKSLASH_H:
|
||||
case URX_BACKSLASH_R:
|
||||
case URX_BACKSLASH_V:
|
||||
case URX_ONECHAR_I:
|
||||
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
|
||||
case URX_DOTANY_ALL: // . matches one or two.
|
||||
|
@ -3418,6 +3525,9 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
|||
case URX_STAT_SETREF_N:
|
||||
case URX_SETREF:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_BACKSLASH_H:
|
||||
case URX_BACKSLASH_R:
|
||||
case URX_BACKSLASH_V:
|
||||
case URX_ONECHAR_I:
|
||||
case URX_DOTANY_ALL:
|
||||
case URX_DOTANY:
|
||||
|
@ -3746,6 +3856,9 @@ void RegexCompile::stripNOPs() {
|
|||
case URX_LOOP_C:
|
||||
case URX_DOLLAR_D:
|
||||
case URX_DOLLAR_MD:
|
||||
case URX_BACKSLASH_H:
|
||||
case URX_BACKSLASH_R:
|
||||
case URX_BACKSLASH_V:
|
||||
// These instructions are unaltered by the relocation.
|
||||
fRXPat->fCompiledPat->setElementAt(op, dst);
|
||||
dst++;
|
||||
|
|
|
@ -16,108 +16,117 @@ U_NAMESPACE_BEGIN
|
|||
//
|
||||
// Character classes for regex pattern scanning.
|
||||
//
|
||||
static const uint8_t kRuleSet_digit_char = 128;
|
||||
static const uint8_t kRuleSet_ascii_letter = 129;
|
||||
static const uint8_t kRuleSet_ascii_letter = 128;
|
||||
static const uint8_t kRuleSet_digit_char = 129;
|
||||
static const uint8_t kRuleSet_rule_char = 130;
|
||||
|
||||
|
||||
enum Regex_PatternParseAction {
|
||||
doIntervalUpperDigit,
|
||||
doPossessiveOpt,
|
||||
doOpenLookBehindNeg,
|
||||
doDotAny,
|
||||
doSetBackslash_D,
|
||||
doSetLiteral,
|
||||
doSetBackslash_S,
|
||||
doEscapeError,
|
||||
doSetBackslash_W,
|
||||
doDollar,
|
||||
doBackslashb,
|
||||
doSetOpError,
|
||||
doBackslashG,
|
||||
doPatStart,
|
||||
doMismatchedParenErr,
|
||||
doPossessivePlus,
|
||||
doBackslashX,
|
||||
doSetBackslash_s,
|
||||
doSetBackslash_w,
|
||||
doBackslashW,
|
||||
doBackslashw,
|
||||
doSetMatchMode,
|
||||
doOrOperator,
|
||||
doOpenLookAheadNeg,
|
||||
doOpenLookBehind,
|
||||
doBackslashS,
|
||||
doBeginMatchMode,
|
||||
doNOP,
|
||||
doSetProp,
|
||||
doBackslashA,
|
||||
doIntervalInit,
|
||||
doOpenCaptureParen,
|
||||
doNGPlus,
|
||||
doIntervalError,
|
||||
doSetDifference2,
|
||||
doNGOpt,
|
||||
doEscapedLiteralChar,
|
||||
doSetNegate,
|
||||
doSetBegin,
|
||||
doMatchModeParen,
|
||||
doLiteralChar,
|
||||
doOpt,
|
||||
doSetIntersection2,
|
||||
doBadOpenParenType,
|
||||
doSuppressComments,
|
||||
doCloseParen,
|
||||
doPatFinish,
|
||||
doSetBeginUnion,
|
||||
doSetBackslash_d,
|
||||
doProperty,
|
||||
doNGInterval,
|
||||
doNGStar,
|
||||
doOpenLookAhead,
|
||||
doSetBeginIntersection1,
|
||||
doBeginNamedCapture,
|
||||
doInterval,
|
||||
doMatchMode,
|
||||
doSetNoCloseError,
|
||||
doSetBeginDifference1,
|
||||
doPlus,
|
||||
doBackslashD,
|
||||
doSetLiteralEscaped,
|
||||
doContinueNamedCapture,
|
||||
doSetPosixProp,
|
||||
doBackslashz,
|
||||
doSetNamedRange,
|
||||
doPossessiveStar,
|
||||
doBadModeFlag,
|
||||
doContinueNamedBackRef,
|
||||
doPerlInline,
|
||||
doBackslashd,
|
||||
doOpenNonCaptureParen,
|
||||
doSetEnd,
|
||||
doSetAddDash,
|
||||
doSetFinish,
|
||||
doCaret,
|
||||
doConditionalExpr,
|
||||
doExit,
|
||||
doNamedChar,
|
||||
doSetRange,
|
||||
doPossessiveInterval,
|
||||
doBackslashs,
|
||||
doIntervalSame,
|
||||
doEnterQuoteMode,
|
||||
doOpenAtomicParen,
|
||||
doSetNamedChar,
|
||||
doRuleError,
|
||||
doStar,
|
||||
doSetAddAmp,
|
||||
doBackslashB,
|
||||
doCompleteNamedBackRef,
|
||||
doBackslashZ,
|
||||
doIntevalLowerDigit,
|
||||
doSetBackslash_V,
|
||||
doSetBackslash_h,
|
||||
doBeginNamedBackRef,
|
||||
doBackRef,
|
||||
doSetMatchMode,
|
||||
doEnterQuoteMode,
|
||||
doOpenCaptureParen,
|
||||
doContinueNamedCapture,
|
||||
doSetBackslash_d,
|
||||
doBeginMatchMode,
|
||||
doBackslashX,
|
||||
doSetPosixProp,
|
||||
doIntervalError,
|
||||
doSetLiteralEscaped,
|
||||
doSetBackslash_s,
|
||||
doNOP,
|
||||
doBackslashv,
|
||||
doOpenLookBehind,
|
||||
doPatStart,
|
||||
doPossessiveInterval,
|
||||
doOpenAtomicParen,
|
||||
doOpenLookAheadNeg,
|
||||
doBackslashd,
|
||||
doBackslashZ,
|
||||
doIntervalUpperDigit,
|
||||
doBadNamedCapture,
|
||||
doSetDifference2,
|
||||
doSetAddAmp,
|
||||
doSetNamedChar,
|
||||
doNamedChar,
|
||||
doSetBackslash_H,
|
||||
doBackslashb,
|
||||
doBackslashz,
|
||||
doSetBeginDifference1,
|
||||
doOpenLookAhead,
|
||||
doMatchModeParen,
|
||||
doBackslashV,
|
||||
doIntevalLowerDigit,
|
||||
doCaret,
|
||||
doSetEnd,
|
||||
doSetNegate,
|
||||
doBackslashS,
|
||||
doOrOperator,
|
||||
doBackslashB,
|
||||
doBackslashw,
|
||||
doBackslashR,
|
||||
doRuleError,
|
||||
doDotAny,
|
||||
doMatchMode,
|
||||
doSetBackslash_W,
|
||||
doNGPlus,
|
||||
doSetBackslash_D,
|
||||
doPossessiveOpt,
|
||||
doSetNamedRange,
|
||||
doConditionalExpr,
|
||||
doBackslashs,
|
||||
doPossessiveStar,
|
||||
doPlus,
|
||||
doBadOpenParenType,
|
||||
doCloseParen,
|
||||
doNGInterval,
|
||||
doSetProp,
|
||||
doBackRef,
|
||||
doSetBeginUnion,
|
||||
doEscapeError,
|
||||
doOpt,
|
||||
doSetBeginIntersection1,
|
||||
doPossessivePlus,
|
||||
doBackslashD,
|
||||
doOpenLookBehindNeg,
|
||||
doSetBegin,
|
||||
doSetIntersection2,
|
||||
doCompleteNamedBackRef,
|
||||
doSetRange,
|
||||
doDollar,
|
||||
doBackslashH,
|
||||
doExit,
|
||||
doNGOpt,
|
||||
doOpenNonCaptureParen,
|
||||
doBackslashA,
|
||||
doSetBackslash_v,
|
||||
doBackslashh,
|
||||
doBadModeFlag,
|
||||
doSetNoCloseError,
|
||||
doIntervalSame,
|
||||
doSetAddDash,
|
||||
doBackslashW,
|
||||
doPerlInline,
|
||||
doSetOpError,
|
||||
doSetLiteral,
|
||||
doPatFinish,
|
||||
doBeginNamedCapture,
|
||||
doEscapedLiteralChar,
|
||||
doLiteralChar,
|
||||
doSuppressComments,
|
||||
doMismatchedParenErr,
|
||||
doNGStar,
|
||||
doSetFinish,
|
||||
doInterval,
|
||||
doBackslashG,
|
||||
doStar,
|
||||
doSetBackslash_w,
|
||||
doSetBackslash_S,
|
||||
doProperty,
|
||||
doContinueNamedBackRef,
|
||||
doIntervalInit,
|
||||
rbbiLastAction};
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
|
@ -140,7 +149,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doPatStart, 255, 2,0, FALSE} // 1 start
|
||||
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
|
||||
, {doLiteralChar, 130, 14,0, TRUE} // 3
|
||||
, {doSetBegin, 91 /* [ */, 118, 196, TRUE} // 4
|
||||
, {doSetBegin, 91 /* [ */, 123, 205, TRUE} // 4
|
||||
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
|
||||
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
|
||||
, {doCaret, 94 /* ^ */, 14,0, TRUE} // 7
|
||||
|
@ -149,7 +158,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 12
|
||||
, {doRuleError, 255, 197,0, FALSE} // 13
|
||||
, {doRuleError, 255, 206,0, FALSE} // 13
|
||||
, {doNOP, 42 /* * */, 68,0, TRUE} // 14 expr-quant
|
||||
, {doNOP, 43 /* + */, 71,0, TRUE} // 15
|
||||
, {doNOP, 63 /* ? */, 74,0, TRUE} // 16
|
||||
|
@ -179,15 +188,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doBeginMatchMode, 119 /* w */, 53,0, FALSE} // 40
|
||||
, {doBeginMatchMode, 120 /* x */, 53,0, FALSE} // 41
|
||||
, {doBeginMatchMode, 45 /* - */, 53,0, FALSE} // 42
|
||||
, {doConditionalExpr, 40 /* ( */, 197,0, TRUE} // 43
|
||||
, {doPerlInline, 123 /* { */, 197,0, TRUE} // 44
|
||||
, {doBadOpenParenType, 255, 197,0, FALSE} // 45
|
||||
, {doConditionalExpr, 40 /* ( */, 206,0, TRUE} // 43
|
||||
, {doPerlInline, 123 /* { */, 206,0, TRUE} // 44
|
||||
, {doBadOpenParenType, 255, 206,0, FALSE} // 45
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47
|
||||
, {doBeginNamedCapture, 129, 64,0, FALSE} // 48
|
||||
, {doBadOpenParenType, 255, 197,0, FALSE} // 49
|
||||
, {doBeginNamedCapture, 128, 64,0, FALSE} // 48
|
||||
, {doBadOpenParenType, 255, 206,0, FALSE} // 49
|
||||
, {doNOP, 41 /* ) */, 255,0, TRUE} // 50 paren-comment
|
||||
, {doMismatchedParenErr, 253, 197,0, FALSE} // 51
|
||||
, {doMismatchedParenErr, 253, 206,0, FALSE} // 51
|
||||
, {doNOP, 255, 50,0, TRUE} // 52
|
||||
, {doMatchMode, 105 /* i */, 53,0, TRUE} // 53 paren-flag
|
||||
, {doMatchMode, 100 /* d */, 53,0, TRUE} // 54
|
||||
|
@ -199,11 +208,11 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doMatchMode, 45 /* - */, 53,0, TRUE} // 60
|
||||
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 61
|
||||
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 62
|
||||
, {doBadModeFlag, 255, 197,0, FALSE} // 63
|
||||
, {doContinueNamedCapture, 129, 64,0, TRUE} // 64 named-capture
|
||||
, {doContinueNamedCapture, 128, 64,0, TRUE} // 65
|
||||
, {doBadModeFlag, 255, 206,0, FALSE} // 63
|
||||
, {doContinueNamedCapture, 128, 64,0, TRUE} // 64 named-capture
|
||||
, {doContinueNamedCapture, 129, 64,0, TRUE} // 65
|
||||
, {doOpenCaptureParen, 62 /* > */, 2, 14, TRUE} // 66
|
||||
, {doBadNamedCapture, 255, 197,0, FALSE} // 67
|
||||
, {doBadNamedCapture, 255, 206,0, FALSE} // 67
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 68 quant-star
|
||||
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 69
|
||||
, {doStar, 255, 20,0, FALSE} // 70
|
||||
|
@ -213,15 +222,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 74 quant-opt
|
||||
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 75
|
||||
, {doOpt, 255, 20,0, FALSE} // 76
|
||||
, {doNOP, 128, 79,0, FALSE} // 77 interval-open
|
||||
, {doIntervalError, 255, 197,0, FALSE} // 78
|
||||
, {doIntevalLowerDigit, 128, 79,0, TRUE} // 79 interval-lower
|
||||
, {doNOP, 129, 79,0, FALSE} // 77 interval-open
|
||||
, {doIntervalError, 255, 206,0, FALSE} // 78
|
||||
, {doIntevalLowerDigit, 129, 79,0, TRUE} // 79 interval-lower
|
||||
, {doNOP, 44 /* , */, 83,0, TRUE} // 80
|
||||
, {doIntervalSame, 125 /* } */, 86,0, TRUE} // 81
|
||||
, {doIntervalError, 255, 197,0, FALSE} // 82
|
||||
, {doIntervalUpperDigit, 128, 83,0, TRUE} // 83 interval-upper
|
||||
, {doIntervalError, 255, 206,0, FALSE} // 82
|
||||
, {doIntervalUpperDigit, 129, 83,0, TRUE} // 83 interval-upper
|
||||
, {doNOP, 125 /* } */, 86,0, TRUE} // 84
|
||||
, {doIntervalError, 255, 197,0, FALSE} // 85
|
||||
, {doIntervalError, 255, 206,0, FALSE} // 85
|
||||
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 86 interval-type
|
||||
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 87
|
||||
, {doInterval, 255, 20,0, FALSE} // 88
|
||||
|
@ -231,109 +240,118 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 92
|
||||
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 93
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 94
|
||||
, {doNOP, 107 /* k */, 110,0, TRUE} // 95
|
||||
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 96
|
||||
, {doProperty, 112 /* p */, 14,0, FALSE} // 97
|
||||
, {doProperty, 80 /* P */, 14,0, FALSE} // 98
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 99
|
||||
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 100
|
||||
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 101
|
||||
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 102
|
||||
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 103
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 104
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 105
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 106
|
||||
, {doBackRef, 128, 14,0, TRUE} // 107
|
||||
, {doEscapeError, 253, 197,0, FALSE} // 108
|
||||
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 109
|
||||
, {doBeginNamedBackRef, 60 /* < */, 112,0, TRUE} // 110 named-backref
|
||||
, {doBadNamedCapture, 255, 197,0, FALSE} // 111
|
||||
, {doContinueNamedBackRef, 129, 114,0, TRUE} // 112 named-backref-2
|
||||
, {doBadNamedCapture, 255, 197,0, FALSE} // 113
|
||||
, {doContinueNamedBackRef, 129, 114,0, TRUE} // 114 named-backref-3
|
||||
, {doContinueNamedBackRef, 128, 114,0, TRUE} // 115
|
||||
, {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 116
|
||||
, {doBadNamedCapture, 255, 197,0, FALSE} // 117
|
||||
, {doSetNegate, 94 /* ^ */, 121,0, TRUE} // 118 set-open
|
||||
, {doSetPosixProp, 58 /* : */, 123,0, FALSE} // 119
|
||||
, {doNOP, 255, 121,0, FALSE} // 120
|
||||
, {doSetLiteral, 93 /* ] */, 136,0, TRUE} // 121 set-open2
|
||||
, {doNOP, 255, 126,0, FALSE} // 122
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 123 set-posix
|
||||
, {doNOP, 58 /* : */, 126,0, FALSE} // 124
|
||||
, {doRuleError, 255, 197,0, FALSE} // 125
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 126 set-start
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 127
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 128
|
||||
, {doNOP, 45 /* - */, 132,0, TRUE} // 129
|
||||
, {doNOP, 38 /* & */, 134,0, TRUE} // 130
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 131
|
||||
, {doRuleError, 45 /* - */, 197,0, FALSE} // 132 set-start-dash
|
||||
, {doSetAddDash, 255, 136,0, FALSE} // 133
|
||||
, {doRuleError, 38 /* & */, 197,0, FALSE} // 134 set-start-amp
|
||||
, {doSetAddAmp, 255, 136,0, FALSE} // 135
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 136 set-after-lit
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 137
|
||||
, {doNOP, 45 /* - */, 173,0, TRUE} // 138
|
||||
, {doNOP, 38 /* & */, 164,0, TRUE} // 139
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 140
|
||||
, {doSetNoCloseError, 253, 197,0, FALSE} // 141
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 142
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 143 set-after-set
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 144
|
||||
, {doNOP, 45 /* - */, 166,0, TRUE} // 145
|
||||
, {doNOP, 38 /* & */, 161,0, TRUE} // 146
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 147
|
||||
, {doSetNoCloseError, 253, 197,0, FALSE} // 148
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 149
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 150 set-after-range
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 151
|
||||
, {doNOP, 45 /* - */, 169,0, TRUE} // 152
|
||||
, {doNOP, 38 /* & */, 171,0, TRUE} // 153
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 154
|
||||
, {doSetNoCloseError, 253, 197,0, FALSE} // 155
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 156
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 157 set-after-op
|
||||
, {doSetOpError, 93 /* ] */, 197,0, FALSE} // 158
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 159
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 160
|
||||
, {doSetBeginIntersection1, 91 /* [ */, 118, 143, TRUE} // 161 set-set-amp
|
||||
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 162
|
||||
, {doSetAddAmp, 255, 136,0, FALSE} // 163
|
||||
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 164 set-lit-amp
|
||||
, {doSetAddAmp, 255, 136,0, FALSE} // 165
|
||||
, {doSetBeginDifference1, 91 /* [ */, 118, 143, TRUE} // 166 set-set-dash
|
||||
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 167
|
||||
, {doSetAddDash, 255, 136,0, FALSE} // 168
|
||||
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 169 set-range-dash
|
||||
, {doSetAddDash, 255, 136,0, FALSE} // 170
|
||||
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 171 set-range-amp
|
||||
, {doSetAddAmp, 255, 136,0, FALSE} // 172
|
||||
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 173 set-lit-dash
|
||||
, {doSetAddDash, 91 /* [ */, 136,0, FALSE} // 174
|
||||
, {doSetAddDash, 93 /* ] */, 136,0, FALSE} // 175
|
||||
, {doNOP, 92 /* \ */, 178,0, TRUE} // 176
|
||||
, {doSetRange, 255, 150,0, TRUE} // 177
|
||||
, {doSetOpError, 115 /* s */, 197,0, FALSE} // 178 set-lit-dash-escape
|
||||
, {doSetOpError, 83 /* S */, 197,0, FALSE} // 179
|
||||
, {doSetOpError, 119 /* w */, 197,0, FALSE} // 180
|
||||
, {doSetOpError, 87 /* W */, 197,0, FALSE} // 181
|
||||
, {doSetOpError, 100 /* d */, 197,0, FALSE} // 182
|
||||
, {doSetOpError, 68 /* D */, 197,0, FALSE} // 183
|
||||
, {doSetNamedRange, 78 /* N */, 150,0, FALSE} // 184
|
||||
, {doSetRange, 255, 150,0, TRUE} // 185
|
||||
, {doSetProp, 112 /* p */, 143,0, FALSE} // 186 set-escape
|
||||
, {doSetProp, 80 /* P */, 143,0, FALSE} // 187
|
||||
, {doSetNamedChar, 78 /* N */, 136,0, FALSE} // 188
|
||||
, {doSetBackslash_s, 115 /* s */, 150,0, TRUE} // 189
|
||||
, {doSetBackslash_S, 83 /* S */, 150,0, TRUE} // 190
|
||||
, {doSetBackslash_w, 119 /* w */, 150,0, TRUE} // 191
|
||||
, {doSetBackslash_W, 87 /* W */, 150,0, TRUE} // 192
|
||||
, {doSetBackslash_d, 100 /* d */, 150,0, TRUE} // 193
|
||||
, {doSetBackslash_D, 68 /* D */, 150,0, TRUE} // 194
|
||||
, {doSetLiteralEscaped, 255, 136,0, TRUE} // 195
|
||||
, {doSetFinish, 255, 14,0, FALSE} // 196 set-finish
|
||||
, {doExit, 255, 197,0, TRUE} // 197 errorDeath
|
||||
, {doBackslashh, 104 /* h */, 14,0, TRUE} // 95
|
||||
, {doBackslashH, 72 /* H */, 14,0, TRUE} // 96
|
||||
, {doNOP, 107 /* k */, 115,0, TRUE} // 97
|
||||
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 98
|
||||
, {doProperty, 112 /* p */, 14,0, FALSE} // 99
|
||||
, {doProperty, 80 /* P */, 14,0, FALSE} // 100
|
||||
, {doBackslashR, 82 /* R */, 14,0, TRUE} // 101
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 102
|
||||
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 103
|
||||
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 104
|
||||
, {doBackslashv, 118 /* v */, 14,0, TRUE} // 105
|
||||
, {doBackslashV, 86 /* V */, 14,0, TRUE} // 106
|
||||
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 107
|
||||
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 108
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 109
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 110
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 111
|
||||
, {doBackRef, 129, 14,0, TRUE} // 112
|
||||
, {doEscapeError, 253, 206,0, FALSE} // 113
|
||||
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 114
|
||||
, {doBeginNamedBackRef, 60 /* < */, 117,0, TRUE} // 115 named-backref
|
||||
, {doBadNamedCapture, 255, 206,0, FALSE} // 116
|
||||
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 117 named-backref-2
|
||||
, {doBadNamedCapture, 255, 206,0, FALSE} // 118
|
||||
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 119 named-backref-3
|
||||
, {doContinueNamedBackRef, 129, 119,0, TRUE} // 120
|
||||
, {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 121
|
||||
, {doBadNamedCapture, 255, 206,0, FALSE} // 122
|
||||
, {doSetNegate, 94 /* ^ */, 126,0, TRUE} // 123 set-open
|
||||
, {doSetPosixProp, 58 /* : */, 128,0, FALSE} // 124
|
||||
, {doNOP, 255, 126,0, FALSE} // 125
|
||||
, {doSetLiteral, 93 /* ] */, 141,0, TRUE} // 126 set-open2
|
||||
, {doNOP, 255, 131,0, FALSE} // 127
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 128 set-posix
|
||||
, {doNOP, 58 /* : */, 131,0, FALSE} // 129
|
||||
, {doRuleError, 255, 206,0, FALSE} // 130
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 131 set-start
|
||||
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 132
|
||||
, {doNOP, 92 /* \ */, 191,0, TRUE} // 133
|
||||
, {doNOP, 45 /* - */, 137,0, TRUE} // 134
|
||||
, {doNOP, 38 /* & */, 139,0, TRUE} // 135
|
||||
, {doSetLiteral, 255, 141,0, TRUE} // 136
|
||||
, {doRuleError, 45 /* - */, 206,0, FALSE} // 137 set-start-dash
|
||||
, {doSetAddDash, 255, 141,0, FALSE} // 138
|
||||
, {doRuleError, 38 /* & */, 206,0, FALSE} // 139 set-start-amp
|
||||
, {doSetAddAmp, 255, 141,0, FALSE} // 140
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 141 set-after-lit
|
||||
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 142
|
||||
, {doNOP, 45 /* - */, 178,0, TRUE} // 143
|
||||
, {doNOP, 38 /* & */, 169,0, TRUE} // 144
|
||||
, {doNOP, 92 /* \ */, 191,0, TRUE} // 145
|
||||
, {doSetNoCloseError, 253, 206,0, FALSE} // 146
|
||||
, {doSetLiteral, 255, 141,0, TRUE} // 147
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 148 set-after-set
|
||||
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 149
|
||||
, {doNOP, 45 /* - */, 171,0, TRUE} // 150
|
||||
, {doNOP, 38 /* & */, 166,0, TRUE} // 151
|
||||
, {doNOP, 92 /* \ */, 191,0, TRUE} // 152
|
||||
, {doSetNoCloseError, 253, 206,0, FALSE} // 153
|
||||
, {doSetLiteral, 255, 141,0, TRUE} // 154
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 155 set-after-range
|
||||
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 156
|
||||
, {doNOP, 45 /* - */, 174,0, TRUE} // 157
|
||||
, {doNOP, 38 /* & */, 176,0, TRUE} // 158
|
||||
, {doNOP, 92 /* \ */, 191,0, TRUE} // 159
|
||||
, {doSetNoCloseError, 253, 206,0, FALSE} // 160
|
||||
, {doSetLiteral, 255, 141,0, TRUE} // 161
|
||||
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 162 set-after-op
|
||||
, {doSetOpError, 93 /* ] */, 206,0, FALSE} // 163
|
||||
, {doNOP, 92 /* \ */, 191,0, TRUE} // 164
|
||||
, {doSetLiteral, 255, 141,0, TRUE} // 165
|
||||
, {doSetBeginIntersection1, 91 /* [ */, 123, 148, TRUE} // 166 set-set-amp
|
||||
, {doSetIntersection2, 38 /* & */, 162,0, TRUE} // 167
|
||||
, {doSetAddAmp, 255, 141,0, FALSE} // 168
|
||||
, {doSetIntersection2, 38 /* & */, 162,0, TRUE} // 169 set-lit-amp
|
||||
, {doSetAddAmp, 255, 141,0, FALSE} // 170
|
||||
, {doSetBeginDifference1, 91 /* [ */, 123, 148, TRUE} // 171 set-set-dash
|
||||
, {doSetDifference2, 45 /* - */, 162,0, TRUE} // 172
|
||||
, {doSetAddDash, 255, 141,0, FALSE} // 173
|
||||
, {doSetDifference2, 45 /* - */, 162,0, TRUE} // 174 set-range-dash
|
||||
, {doSetAddDash, 255, 141,0, FALSE} // 175
|
||||
, {doSetIntersection2, 38 /* & */, 162,0, TRUE} // 176 set-range-amp
|
||||
, {doSetAddAmp, 255, 141,0, FALSE} // 177
|
||||
, {doSetDifference2, 45 /* - */, 162,0, TRUE} // 178 set-lit-dash
|
||||
, {doSetAddDash, 91 /* [ */, 141,0, FALSE} // 179
|
||||
, {doSetAddDash, 93 /* ] */, 141,0, FALSE} // 180
|
||||
, {doNOP, 92 /* \ */, 183,0, TRUE} // 181
|
||||
, {doSetRange, 255, 155,0, TRUE} // 182
|
||||
, {doSetOpError, 115 /* s */, 206,0, FALSE} // 183 set-lit-dash-escape
|
||||
, {doSetOpError, 83 /* S */, 206,0, FALSE} // 184
|
||||
, {doSetOpError, 119 /* w */, 206,0, FALSE} // 185
|
||||
, {doSetOpError, 87 /* W */, 206,0, FALSE} // 186
|
||||
, {doSetOpError, 100 /* d */, 206,0, FALSE} // 187
|
||||
, {doSetOpError, 68 /* D */, 206,0, FALSE} // 188
|
||||
, {doSetNamedRange, 78 /* N */, 155,0, FALSE} // 189
|
||||
, {doSetRange, 255, 155,0, TRUE} // 190
|
||||
, {doSetProp, 112 /* p */, 148,0, FALSE} // 191 set-escape
|
||||
, {doSetProp, 80 /* P */, 148,0, FALSE} // 192
|
||||
, {doSetNamedChar, 78 /* N */, 141,0, FALSE} // 193
|
||||
, {doSetBackslash_s, 115 /* s */, 155,0, TRUE} // 194
|
||||
, {doSetBackslash_S, 83 /* S */, 155,0, TRUE} // 195
|
||||
, {doSetBackslash_w, 119 /* w */, 155,0, TRUE} // 196
|
||||
, {doSetBackslash_W, 87 /* W */, 155,0, TRUE} // 197
|
||||
, {doSetBackslash_d, 100 /* d */, 155,0, TRUE} // 198
|
||||
, {doSetBackslash_D, 68 /* D */, 155,0, TRUE} // 199
|
||||
, {doSetBackslash_h, 104 /* h */, 155,0, TRUE} // 200
|
||||
, {doSetBackslash_H, 72 /* H */, 155,0, TRUE} // 201
|
||||
, {doSetBackslash_v, 118 /* v */, 155,0, TRUE} // 202
|
||||
, {doSetBackslash_V, 86 /* V */, 155,0, TRUE} // 203
|
||||
, {doSetLiteralEscaped, 255, 141,0, TRUE} // 204
|
||||
, {doSetFinish, 255, 14,0, FALSE} // 205 set-finish
|
||||
, {doExit, 255, 206,0, TRUE} // 206 errorDeath
|
||||
};
|
||||
static const char * const RegexStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -444,6 +462,11 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"named-backref",
|
||||
0,
|
||||
|
@ -530,6 +553,10 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-finish",
|
||||
"errorDeath",
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/perl
|
||||
# ********************************************************************
|
||||
# * COPYRIGHT:
|
||||
# * Copyright (c) 2002-2007, International Business Machines Corporation and
|
||||
# * Copyright (c) 2002-2015, International Business Machines Corporation and
|
||||
# * others. All Rights Reserved.
|
||||
# ********************************************************************
|
||||
#
|
||||
|
@ -206,7 +206,7 @@ print "// This file contains the state table for the ICU Regular Expression P
|
|||
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
|
||||
print "// the rule parser state definitions file \"regexcst.txt\".\n";
|
||||
print "//\n";
|
||||
print "// Copyright (C) 2002-2007 International Business Machines Corporation \n";
|
||||
print "// Copyright (C) 2002-2015 International Business Machines Corporation \n";
|
||||
print "// and others. All rights reserved. \n";
|
||||
print "//\n";
|
||||
print "//---------------------------------------------------------------------------------\n";
|
||||
|
|
|
@ -250,13 +250,18 @@ backslash:
|
|||
'd' n expr-quant doBackslashd
|
||||
'D' n expr-quant doBackslashD
|
||||
'G' n term doBackslashG
|
||||
'h' n expr-quant doBackslashh
|
||||
'H' n expr-quant doBackslashH
|
||||
'k' n named-backref
|
||||
'N' expr-quant doNamedChar # \N{NAME} named char
|
||||
'p' expr-quant doProperty # \p{Lu} style property
|
||||
'P' expr-quant doProperty
|
||||
'R' n expr-quant doBackslashR
|
||||
'Q' n term doEnterQuoteMode
|
||||
'S' n expr-quant doBackslashS
|
||||
's' n expr-quant doBackslashs
|
||||
'v' n expr-quant doBackslashv
|
||||
'V' n expr-quant doBackslashV
|
||||
'W' n expr-quant doBackslashW
|
||||
'w' n expr-quant doBackslashw
|
||||
'X' n expr-quant doBackslashX
|
||||
|
@ -472,6 +477,10 @@ set-escape:
|
|||
'W' n set-after-range doSetBackslash_W
|
||||
'd' n set-after-range doSetBackslash_d
|
||||
'D' n set-after-range doSetBackslash_D
|
||||
'h' n set-after-range doSetBackslash_h
|
||||
'H' n set-after-range doSetBackslash_H
|
||||
'v' n set-after-range doSetBackslash_v
|
||||
'V' n set-after-range doSetBackslash_V
|
||||
default n set-after-lit doSetLiteralEscaped
|
||||
|
||||
#
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (C) 2002-2014 International Business Machines Corporation
|
||||
// Copyright (C) 2002-2015 International Business Machines Corporation
|
||||
// and others. All rights reserved.
|
||||
//
|
||||
// file: regeximp.h
|
||||
|
@ -173,7 +173,10 @@ enum {
|
|||
URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style
|
||||
// word boundaries.
|
||||
URX_DOLLAR_D = 54, // $ end of input test, in UNIX_LINES mode.
|
||||
URX_DOLLAR_MD = 55 // $ end of input test, in MULTI_LINE and UNIX_LINES mode.
|
||||
URX_DOLLAR_MD = 55, // $ end of input test, in MULTI_LINE and UNIX_LINES mode.
|
||||
URX_BACKSLASH_H = 56, // Value field: 0: \h 1: \H
|
||||
URX_BACKSLASH_R = 57, // Any line break sequence.
|
||||
URX_BACKSLASH_V = 58 // Value field: 0: \v 1: \V
|
||||
|
||||
};
|
||||
|
||||
|
@ -235,7 +238,10 @@ enum {
|
|||
"LOOP_DOT_I", \
|
||||
"BACKSLASH_BU", \
|
||||
"DOLLAR_D", \
|
||||
"DOLLAR_MD"
|
||||
"DOLLAR_MD", \
|
||||
"URX_BACKSLASH_H", \
|
||||
"URX_BACKSLASH_R", \
|
||||
"URX_BACKSLASH_V"
|
||||
|
||||
|
||||
//
|
||||
|
|
|
@ -49,6 +49,15 @@ static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
|
|||
// This constant determines that state saves per tick number.
|
||||
static const int32_t TIMER_INITIAL_VALUE = 10000;
|
||||
|
||||
|
||||
// Test for any of the Unicode line terminating characters.
|
||||
static inline UBool isLineTerminator(UChar32 c) {
|
||||
if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
|
||||
return false;
|
||||
}
|
||||
return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Constructor and Destructor
|
||||
|
@ -837,20 +846,19 @@ UBool RegexMatcher::find(UErrorCode &status) {
|
|||
}
|
||||
} else {
|
||||
for (;;) {
|
||||
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
||||
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
|
||||
if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
|
||||
(void)UTEXT_NEXT32(fInputText);
|
||||
startPos = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
}
|
||||
MatchAt(startPos, FALSE, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return FALSE;
|
||||
}
|
||||
if (fMatch) {
|
||||
return TRUE;
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(fInputText, startPos);
|
||||
if (isLineTerminator(c)) {
|
||||
if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
|
||||
(void)UTEXT_NEXT32(fInputText);
|
||||
startPos = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
}
|
||||
MatchAt(startPos, FALSE, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return FALSE;
|
||||
}
|
||||
if (fMatch) {
|
||||
return TRUE;
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(fInputText, startPos);
|
||||
}
|
||||
if (startPos >= testStartLimit) {
|
||||
fMatch = FALSE;
|
||||
|
@ -1098,8 +1106,7 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
|
|||
} else {
|
||||
for (;;) {
|
||||
c = inputBuf[startPos-1];
|
||||
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
||||
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
|
||||
if (isLineTerminator(c)) {
|
||||
if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
|
||||
startPos++;
|
||||
}
|
||||
|
@ -2927,9 +2934,9 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
|||
// end of input, succeed.
|
||||
UChar32 c = UTEXT_NEXT32(fInputText);
|
||||
if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
|
||||
if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
|
||||
if (isLineTerminator(c)) {
|
||||
// If not in the middle of a CR/LF sequence
|
||||
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
|
||||
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
|
||||
// At new-line at end of input. Success
|
||||
fHitEnd = TRUE;
|
||||
fRequireEnd = TRUE;
|
||||
|
@ -2985,7 +2992,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
|||
// It makes no difference where the new-line is within the input.
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
UChar32 c = UTEXT_CURRENT32(fInputText);
|
||||
if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
if (isLineTerminator(c)) {
|
||||
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence
|
||||
// In multi-line mode, hitting a new-line just before the end of input does not
|
||||
// set the hitEnd or requireEnd flags
|
||||
|
@ -3034,8 +3041,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
|||
// unless we are at the end of input
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
UChar32 c = UTEXT_PREVIOUS32(fInputText);
|
||||
if ((fp->fInputIdx < fAnchorLimit) &&
|
||||
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
|
||||
if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
|
||||
// It's a new-line. ^ is true. Success.
|
||||
// TODO: what should be done with positions between a CR and LF?
|
||||
break;
|
||||
|
@ -3116,6 +3122,68 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
|||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_H: // Test for \h, horizontal white space.
|
||||
{
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
UChar32 c = UTEXT_NEXT32(fInputText);
|
||||
int8_t ctype = u_charType(c);
|
||||
UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
|
||||
success ^= (UBool)(opValue != 0); // flip sense for \H
|
||||
if (success) {
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
} else {
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_R: // Test for \R, any line break sequence.
|
||||
{
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
UChar32 c = UTEXT_NEXT32(fInputText);
|
||||
if (isLineTerminator(c)) {
|
||||
if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
|
||||
utext_next32(fInputText);
|
||||
}
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
} else {
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_V: // \v, any single line ending character.
|
||||
{
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
UChar32 c = UTEXT_NEXT32(fInputText);
|
||||
UBool success = isLineTerminator(c);
|
||||
success ^= (UBool)(opValue != 0); // flip sense for \V
|
||||
if (success) {
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
} else {
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_X:
|
||||
// Match a Grapheme, as defined by Unicode TR 29.
|
||||
// Differs slightly from Perl, which consumes combining marks independently
|
||||
|
@ -3343,8 +3411,7 @@ GC_Done:
|
|||
|
||||
// There is input left. Advance over one char, unless we've hit end-of-line
|
||||
UChar32 c = UTEXT_NEXT32(fInputText);
|
||||
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
||||
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
|
||||
if (isLineTerminator(c)) {
|
||||
// End of line in normal mode. . does not match.
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
|
@ -4101,7 +4168,7 @@ GC_Done:
|
|||
if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
|
||||
if ((c == 0x0a) || // 0x0a is newline in both modes.
|
||||
(((opValue & 2) == 0) && // IF not UNIX_LINES mode
|
||||
(c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
isLineTerminator(c))) {
|
||||
// char is a line ending. Exit the scanning loop.
|
||||
break;
|
||||
}
|
||||
|
@ -4432,7 +4499,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
|||
UChar32 c;
|
||||
U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
|
||||
|
||||
if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
|
||||
if (isLineTerminator(c)) {
|
||||
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
|
||||
// At new-line at end of input. Success
|
||||
fHitEnd = TRUE;
|
||||
|
@ -4486,7 +4553,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
|||
// If we are positioned just before a new-line, succeed.
|
||||
// It makes no difference where the new-line is within the input.
|
||||
UChar32 c = inputBuf[fp->fInputIdx];
|
||||
if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
if (isLineTerminator(c)) {
|
||||
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence
|
||||
// In multi-line mode, hitting a new-line just before the end of input does not
|
||||
// set the hitEnd or requireEnd flags
|
||||
|
@ -4534,7 +4601,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
|||
// unless we are at the end of input
|
||||
UChar c = inputBuf[fp->fInputIdx - 1];
|
||||
if ((fp->fInputIdx < fAnchorLimit) &&
|
||||
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
|
||||
isLineTerminator(c)) {
|
||||
// It's a new-line. ^ is true. Success.
|
||||
// TODO: what should be done with positions between a CR and LF?
|
||||
break;
|
||||
|
@ -4611,6 +4678,69 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
|||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_H: // Test for \h, horizontal white space.
|
||||
{
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
}
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
int8_t ctype = u_charType(c);
|
||||
UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
|
||||
success ^= (UBool)(opValue != 0); // flip sense for \H
|
||||
if (!success) {
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_R: // Test for \R, any line break sequence.
|
||||
{
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
}
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (isLineTerminator(c)) {
|
||||
if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
|
||||
// Check for CR/LF sequence. Consume both together when found.
|
||||
UChar c2;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
|
||||
if (c2 != 0x0a) {
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_V: // Any single code point line ending.
|
||||
{
|
||||
if (fp->fInputIdx >= fActiveLimit) {
|
||||
fHitEnd = TRUE;
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
}
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
UBool success = isLineTerminator(c);
|
||||
success ^= (UBool)(opValue != 0); // flip sense for \V
|
||||
if (!success) {
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_BACKSLASH_X:
|
||||
// Match a Grapheme, as defined by Unicode TR 29.
|
||||
// Differs slightly from Perl, which consumes combining marks independently
|
||||
|
@ -4820,8 +4950,7 @@ GC_Done:
|
|||
// There is input left. Advance over one char, unless we've hit end-of-line
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
||||
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
|
||||
if (isLineTerminator(c)) {
|
||||
// End of line in normal mode. . does not match.
|
||||
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
|
||||
break;
|
||||
|
@ -5535,7 +5664,7 @@ GC_Done:
|
|||
if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
|
||||
if ((c == 0x0a) || // 0x0a is newline in both modes.
|
||||
(((opValue & 2) == 0) && // IF not UNIX_LINES mode
|
||||
((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
|
||||
isLineTerminator(c))) {
|
||||
// char is a line ending. Put the input pos back to the
|
||||
// line ending char, and exit the scanning loop.
|
||||
U16_BACK_1(inputBuf, 0, ix);
|
||||
|
|
|
@ -742,6 +742,9 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
case URX_LBN_END:
|
||||
case URX_LOOP_C:
|
||||
case URX_LOOP_DOT_I:
|
||||
case URX_BACKSLASH_H:
|
||||
case URX_BACKSLASH_R:
|
||||
case URX_BACKSLASH_V:
|
||||
// types with an integer operand field.
|
||||
printf("%d", val);
|
||||
break;
|
||||
|
|
72
icu4c/source/test/testdata/regextst.txt
vendored
72
icu4c/source/test/testdata/regextst.txt
vendored
|
@ -693,6 +693,78 @@
|
|||
"abc\jkl" "<0>abcjkl</0>" # escape of a non-special letter is just itself.
|
||||
"abc[ \j]kl" "<0>abcjkl</0>"
|
||||
|
||||
#
|
||||
# \R all newline sequences.
|
||||
#
|
||||
"abc\Rxyz" "<0>abc\u000axyz</0>gh"
|
||||
"abc\Rxyz" "<0>abc\u000bxyz</0>gh"
|
||||
"abc\Rxyz" "<0>abc\u000cxyz</0>gh"
|
||||
"abc\Rxyz" "<0>abc\u000dxyz</0>gh"
|
||||
"abc\Rxyz" "<0>abc\u0085xyz</0>gh"
|
||||
"abc\Rxyz" "<0>abc\u2028xyz</0>gh"
|
||||
"abc\Rxyz" "<0>abc\u2029xyz</0>gh"
|
||||
"abc\Rxyz" "<0>abc\u000d\u000axyz</0>gh"
|
||||
|
||||
"abc\R\nxyz" "abc\u000d\u000axyzgh" # \R cannot match only the CR from a CR/LF sequence.
|
||||
"abc\r\nxyz" "<0>abc\u000d\u000axyz</0>gh"
|
||||
|
||||
"abc\Rxyz" "abc\u0009xyz" # Assorted non-matches.
|
||||
"abc\Rxyz" "abc\u000exyz"
|
||||
"abc\Rxyz" "abc\u202axyz"
|
||||
|
||||
# \v \V single character new line sequences.
|
||||
|
||||
"abc\vxyz" "<0>abc\u000axyz</0>gh"
|
||||
"abc\vxyz" "<0>abc\u000bxyz</0>gh"
|
||||
"abc\vxyz" "<0>abc\u000cxyz</0>gh"
|
||||
"abc\vxyz" "<0>abc\u000dxyz</0>gh"
|
||||
"abc\vxyz" "<0>abc\u0085xyz</0>gh"
|
||||
"abc\vxyz" "<0>abc\u2028xyz</0>gh"
|
||||
"abc\vxyz" "<0>abc\u2029xyz</0>gh"
|
||||
"abc\vxyz" "abc\u000d\u000axyzgh"
|
||||
"abc\vxyz" "abc?xyzgh"
|
||||
|
||||
"abc[\v]xyz" "<0>abc\u000axyz</0>gh"
|
||||
"abc[\v]xyz" "<0>abc\u000bxyz</0>gh"
|
||||
"abc[\v]xyz" "<0>abc\u000cxyz</0>gh"
|
||||
"abc[\v]xyz" "<0>abc\u000dxyz</0>gh"
|
||||
"abc[\v]xyz" "<0>abc\u0085xyz</0>gh"
|
||||
"abc[\v]xyz" "<0>abc\u2028xyz</0>gh"
|
||||
"abc[\v]xyz" "<0>abc\u2029xyz</0>gh"
|
||||
"abc[\v]xyz" "abc\u000d\u000axyzgh"
|
||||
"abc[\v]xyz" "abc?xyzgh"
|
||||
|
||||
"abc\Vxyz" "abc\u000axyzgh"
|
||||
"abc\Vxyz" "abc\u000bxyzgh"
|
||||
"abc\Vxyz" "abc\u000cxyzgh"
|
||||
"abc\Vxyz" "abc\u000dxyzgh"
|
||||
"abc\Vxyz" "abc\u0085xyzgh"
|
||||
"abc\Vxyz" "abc\u2028xyzgh"
|
||||
"abc\Vxyz" "abc\u2029xyzgh"
|
||||
"abc\Vxyz" "abc\u000d\u000axyzgh"
|
||||
"abc\Vxyz" "<0>abc?xyz</0>gh"
|
||||
|
||||
# \h \H horizontal white space. Defined as gc=space_separator plus ascii tab
|
||||
|
||||
"abc\hxyz" "<0>abc xyz</0>gh"
|
||||
"abc\Hxyz" "abc xyzgh"
|
||||
"abc\hxyz" "<0>abc\u2003xyz</0>gh"
|
||||
"abc\Hxyz" "abc\u2003xyzgh"
|
||||
"abc\hxyz" "<0>abc\u0009xyz</0>gh"
|
||||
"abc\Hxyz" "abc\u0009xyzgh"
|
||||
"abc\hxyz" "abc?xyzgh"
|
||||
"abc\Hxyz" "<0>abc?xyz</0>gh"
|
||||
|
||||
"abc[\h]xyz" "<0>abc xyz</0>gh"
|
||||
"abc[\H]xyz" "abc xyzgh"
|
||||
"abc[\h]xyz" "<0>abc\u2003xyz</0>gh"
|
||||
"abc[\H]xyz" "abc\u2003xyzgh"
|
||||
"abc[\h]xyz" "<0>abc\u0009xyz</0>gh"
|
||||
"abc[\H]xyz" "abc\u0009xyzgh"
|
||||
"abc[\h]xyz" "abc?xyzgh"
|
||||
"abc[\H]xyz" "<0>abc?xyz</0>gh"
|
||||
|
||||
|
||||
#
|
||||
# Bug xxxx
|
||||
#
|
||||
|
|
Loading…
Add table
Reference in a new issue