ICU-11393 Regex, add pattern chars R v and h

X-SVN-Rev: 37057
This commit is contained in:
Andy Heninger 2015-02-24 00:24:59 +00:00
parent 77775f1b2d
commit ce09d8a4bc
8 changed files with 611 additions and 252 deletions

View file

@ -1188,6 +1188,21 @@ UBool RegexCompile::doParseActions(int32_t action)
appendOp(URX_BACKSLASH_G, 0);
break;
// \H - negated horizontal white space.
//      Emits URX_BACKSLASH_H with operand 1; the matcher treats a non-zero
//      operand as "flip the sense" of the \h test.
// NOTE(review): fixLiterals(FALSE) presumably flushes any pending literal text
//      into the compiled pattern before the new op is appended - confirm.
case doBackslashH:
fixLiterals(FALSE);
appendOp(URX_BACKSLASH_H, 1);
break;
// \h - horizontal white space.  Same op as \H, operand 0 (not negated).
case doBackslashh:
fixLiterals(FALSE);
appendOp(URX_BACKSLASH_H, 0);
break;
// \R - any line break sequence.  The operand is unused and always 0.
case doBackslashR:
fixLiterals(FALSE);
appendOp(URX_BACKSLASH_R, 0);
break;
case doBackslashS:
fixLiterals(FALSE);
appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
@ -1198,6 +1213,16 @@ UBool RegexCompile::doParseActions(int32_t action)
appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
break;
// \V - any character that is NOT a single line ending character.
//      Emits URX_BACKSLASH_V with operand 1 (negated form).
case doBackslashV:
fixLiterals(FALSE);
appendOp(URX_BACKSLASH_V, 1);
break;
// \v - any single line ending character.  Same op, operand 0 (not negated).
case doBackslashv:
fixLiterals(FALSE);
appendOp(URX_BACKSLASH_V, 0);
break;
case doBackslashW:
fixLiterals(FALSE);
appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
@ -1548,6 +1573,48 @@ UBool RegexCompile::doParseActions(int32_t action)
break;
}
// \h inside a [set] expression.
// Adds the horizontal white space characters - general category Zs
// (space separator) plus TAB (U+0009) - to the set obtained from fSetStack.peek().
case doSetBackslash_h:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
UnicodeSet h;
h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
h.add((UChar32)9); // Tab
set->addAll(h);
break;
}
// \H inside a [set] expression.
// Builds the same Zs + TAB set as \h, complements it, and adds the result
// to the set obtained from fSetStack.peek().
case doSetBackslash_H:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
UnicodeSet h;
h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
h.add((UChar32)9); // Tab
h.complement();
set->addAll(h);
break;
}
// \v inside a [set] expression.
// Adds the line ending characters U+000A..U+000D, U+0085, U+2028 and U+2029
// directly to the set obtained from fSetStack.peek().
case doSetBackslash_v:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
set->add((UChar32)0x0a, (UChar32)0x0d); // add range
set->add((UChar32)0x85);
set->add((UChar32)0x2028, (UChar32)0x2029);
break;
}
// \V inside a [set] expression.
// Builds the line ending set in a temporary, complements it, and adds the
// result to the set obtained from fSetStack.peek().  The temporary is needed
// so that only the \V contribution is complemented, not the enclosing set.
case doSetBackslash_V:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
UnicodeSet v;
v.add((UChar32)0x0a, (UChar32)0x0d); // add range
v.add((UChar32)0x85);
v.add((UChar32)0x2028, (UChar32)0x2029);
v.complement();
set->addAll(v);
break;
}
case doSetBackslash_w:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
@ -2749,6 +2816,43 @@ void RegexCompile::matchStartType() {
break;
case URX_BACKSLASH_H:
// Horiz white space
// When currentLen is still 0, the characters this op can match
// (Zs + TAB, complemented for \H, i.e. a non-zero operand) are
// added to fRXPat->fInitialChars.
// NOTE(review): currentLen presumably tracks the minimum match length
// accumulated so far, and the += 2 on numInitialStrings presumably
// rules out the "single literal string start" case - confirm.
if (currentLen == 0) {
UnicodeSet s;
s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
s.add((UChar32)9); // Tab
if (URX_VAL(op) != 0) {
s.complement();
}
fRXPat->fInitialChars->addAll(s);
numInitialStrings += 2;
}
currentLen++;
atStart = FALSE;
break;
case URX_BACKSLASH_R: // Any line ending sequence
case URX_BACKSLASH_V: // Any line ending code point, with optional negation
// Both ops share one handler: their possible first characters are the
// line ending characters U+000A..U+000D, U+0085, U+2028, U+2029.
// \R is always compiled with operand 0, so the complement branch below
// is only ever taken for \V.
if (currentLen == 0) {
UnicodeSet s;
s.add((UChar32)0x0a, (UChar32)0x0d); // add range
s.add((UChar32)0x85);
s.add((UChar32)0x2028, (UChar32)0x2029);
if (URX_VAL(op) != 0) {
// Complement option applies to URX_BACKSLASH_V only.
s.complement();
}
fRXPat->fInitialChars->addAll(s);
numInitialStrings += 2;
}
currentLen++;
atStart = FALSE;
break;
case URX_ONECHAR_I:
// Case Insensitive Single Character.
if (currentLen == 0) {
@ -3137,6 +3241,9 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_STAT_SETREF_N:
case URX_SETREF:
case URX_BACKSLASH_D:
case URX_BACKSLASH_H:
case URX_BACKSLASH_R:
case URX_BACKSLASH_V:
case URX_ONECHAR_I:
case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_ALL: // . matches one or two.
@ -3418,6 +3525,9 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_STAT_SETREF_N:
case URX_SETREF:
case URX_BACKSLASH_D:
case URX_BACKSLASH_H:
case URX_BACKSLASH_R:
case URX_BACKSLASH_V:
case URX_ONECHAR_I:
case URX_DOTANY_ALL:
case URX_DOTANY:
@ -3746,6 +3856,9 @@ void RegexCompile::stripNOPs() {
case URX_LOOP_C:
case URX_DOLLAR_D:
case URX_DOLLAR_MD:
case URX_BACKSLASH_H:
case URX_BACKSLASH_R:
case URX_BACKSLASH_V:
// These instructions are unaltered by the relocation.
fRXPat->fCompiledPat->setElementAt(op, dst);
dst++;

View file

@ -16,108 +16,117 @@ U_NAMESPACE_BEGIN
//
// Character classes for regex pattern scanning.
//
static const uint8_t kRuleSet_digit_char = 128;
static const uint8_t kRuleSet_ascii_letter = 129;
static const uint8_t kRuleSet_ascii_letter = 128;
static const uint8_t kRuleSet_digit_char = 129;
static const uint8_t kRuleSet_rule_char = 130;
enum Regex_PatternParseAction {
doIntervalUpperDigit,
doPossessiveOpt,
doOpenLookBehindNeg,
doDotAny,
doSetBackslash_D,
doSetLiteral,
doSetBackslash_S,
doEscapeError,
doSetBackslash_W,
doDollar,
doBackslashb,
doSetOpError,
doBackslashG,
doPatStart,
doMismatchedParenErr,
doPossessivePlus,
doBackslashX,
doSetBackslash_s,
doSetBackslash_w,
doBackslashW,
doBackslashw,
doSetMatchMode,
doOrOperator,
doOpenLookAheadNeg,
doOpenLookBehind,
doBackslashS,
doBeginMatchMode,
doNOP,
doSetProp,
doBackslashA,
doIntervalInit,
doOpenCaptureParen,
doNGPlus,
doIntervalError,
doSetDifference2,
doNGOpt,
doEscapedLiteralChar,
doSetNegate,
doSetBegin,
doMatchModeParen,
doLiteralChar,
doOpt,
doSetIntersection2,
doBadOpenParenType,
doSuppressComments,
doCloseParen,
doPatFinish,
doSetBeginUnion,
doSetBackslash_d,
doProperty,
doNGInterval,
doNGStar,
doOpenLookAhead,
doSetBeginIntersection1,
doBeginNamedCapture,
doInterval,
doMatchMode,
doSetNoCloseError,
doSetBeginDifference1,
doPlus,
doBackslashD,
doSetLiteralEscaped,
doContinueNamedCapture,
doSetPosixProp,
doBackslashz,
doSetNamedRange,
doPossessiveStar,
doBadModeFlag,
doContinueNamedBackRef,
doPerlInline,
doBackslashd,
doOpenNonCaptureParen,
doSetEnd,
doSetAddDash,
doSetFinish,
doCaret,
doConditionalExpr,
doExit,
doNamedChar,
doSetRange,
doPossessiveInterval,
doBackslashs,
doIntervalSame,
doEnterQuoteMode,
doOpenAtomicParen,
doSetNamedChar,
doRuleError,
doStar,
doSetAddAmp,
doBackslashB,
doCompleteNamedBackRef,
doBackslashZ,
doIntevalLowerDigit,
doSetBackslash_V,
doSetBackslash_h,
doBeginNamedBackRef,
doBackRef,
doSetMatchMode,
doEnterQuoteMode,
doOpenCaptureParen,
doContinueNamedCapture,
doSetBackslash_d,
doBeginMatchMode,
doBackslashX,
doSetPosixProp,
doIntervalError,
doSetLiteralEscaped,
doSetBackslash_s,
doNOP,
doBackslashv,
doOpenLookBehind,
doPatStart,
doPossessiveInterval,
doOpenAtomicParen,
doOpenLookAheadNeg,
doBackslashd,
doBackslashZ,
doIntervalUpperDigit,
doBadNamedCapture,
doSetDifference2,
doSetAddAmp,
doSetNamedChar,
doNamedChar,
doSetBackslash_H,
doBackslashb,
doBackslashz,
doSetBeginDifference1,
doOpenLookAhead,
doMatchModeParen,
doBackslashV,
doIntevalLowerDigit,
doCaret,
doSetEnd,
doSetNegate,
doBackslashS,
doOrOperator,
doBackslashB,
doBackslashw,
doBackslashR,
doRuleError,
doDotAny,
doMatchMode,
doSetBackslash_W,
doNGPlus,
doSetBackslash_D,
doPossessiveOpt,
doSetNamedRange,
doConditionalExpr,
doBackslashs,
doPossessiveStar,
doPlus,
doBadOpenParenType,
doCloseParen,
doNGInterval,
doSetProp,
doBackRef,
doSetBeginUnion,
doEscapeError,
doOpt,
doSetBeginIntersection1,
doPossessivePlus,
doBackslashD,
doOpenLookBehindNeg,
doSetBegin,
doSetIntersection2,
doCompleteNamedBackRef,
doSetRange,
doDollar,
doBackslashH,
doExit,
doNGOpt,
doOpenNonCaptureParen,
doBackslashA,
doSetBackslash_v,
doBackslashh,
doBadModeFlag,
doSetNoCloseError,
doIntervalSame,
doSetAddDash,
doBackslashW,
doPerlInline,
doSetOpError,
doSetLiteral,
doPatFinish,
doBeginNamedCapture,
doEscapedLiteralChar,
doLiteralChar,
doSuppressComments,
doMismatchedParenErr,
doNGStar,
doSetFinish,
doInterval,
doBackslashG,
doStar,
doSetBackslash_w,
doSetBackslash_S,
doProperty,
doContinueNamedBackRef,
doIntervalInit,
rbbiLastAction};
//-------------------------------------------------------------------------------
@ -140,7 +149,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doPatStart, 255, 2,0, FALSE} // 1 start
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
, {doLiteralChar, 130, 14,0, TRUE} // 3
, {doSetBegin, 91 /* [ */, 118, 196, TRUE} // 4
, {doSetBegin, 91 /* [ */, 123, 205, TRUE} // 4
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
, {doCaret, 94 /* ^ */, 14,0, TRUE} // 7
@ -149,7 +158,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
, {doPatFinish, 253, 2,0, FALSE} // 12
, {doRuleError, 255, 197,0, FALSE} // 13
, {doRuleError, 255, 206,0, FALSE} // 13
, {doNOP, 42 /* * */, 68,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 71,0, TRUE} // 15
, {doNOP, 63 /* ? */, 74,0, TRUE} // 16
@ -179,15 +188,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBeginMatchMode, 119 /* w */, 53,0, FALSE} // 40
, {doBeginMatchMode, 120 /* x */, 53,0, FALSE} // 41
, {doBeginMatchMode, 45 /* - */, 53,0, FALSE} // 42
, {doConditionalExpr, 40 /* ( */, 197,0, TRUE} // 43
, {doPerlInline, 123 /* { */, 197,0, TRUE} // 44
, {doBadOpenParenType, 255, 197,0, FALSE} // 45
, {doConditionalExpr, 40 /* ( */, 206,0, TRUE} // 43
, {doPerlInline, 123 /* { */, 206,0, TRUE} // 44
, {doBadOpenParenType, 255, 206,0, FALSE} // 45
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47
, {doBeginNamedCapture, 129, 64,0, FALSE} // 48
, {doBadOpenParenType, 255, 197,0, FALSE} // 49
, {doBeginNamedCapture, 128, 64,0, FALSE} // 48
, {doBadOpenParenType, 255, 206,0, FALSE} // 49
, {doNOP, 41 /* ) */, 255,0, TRUE} // 50 paren-comment
, {doMismatchedParenErr, 253, 197,0, FALSE} // 51
, {doMismatchedParenErr, 253, 206,0, FALSE} // 51
, {doNOP, 255, 50,0, TRUE} // 52
, {doMatchMode, 105 /* i */, 53,0, TRUE} // 53 paren-flag
, {doMatchMode, 100 /* d */, 53,0, TRUE} // 54
@ -199,11 +208,11 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doMatchMode, 45 /* - */, 53,0, TRUE} // 60
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 61
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 62
, {doBadModeFlag, 255, 197,0, FALSE} // 63
, {doContinueNamedCapture, 129, 64,0, TRUE} // 64 named-capture
, {doContinueNamedCapture, 128, 64,0, TRUE} // 65
, {doBadModeFlag, 255, 206,0, FALSE} // 63
, {doContinueNamedCapture, 128, 64,0, TRUE} // 64 named-capture
, {doContinueNamedCapture, 129, 64,0, TRUE} // 65
, {doOpenCaptureParen, 62 /* > */, 2, 14, TRUE} // 66
, {doBadNamedCapture, 255, 197,0, FALSE} // 67
, {doBadNamedCapture, 255, 206,0, FALSE} // 67
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 68 quant-star
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 69
, {doStar, 255, 20,0, FALSE} // 70
@ -213,15 +222,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 74 quant-opt
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 75
, {doOpt, 255, 20,0, FALSE} // 76
, {doNOP, 128, 79,0, FALSE} // 77 interval-open
, {doIntervalError, 255, 197,0, FALSE} // 78
, {doIntevalLowerDigit, 128, 79,0, TRUE} // 79 interval-lower
, {doNOP, 129, 79,0, FALSE} // 77 interval-open
, {doIntervalError, 255, 206,0, FALSE} // 78
, {doIntevalLowerDigit, 129, 79,0, TRUE} // 79 interval-lower
, {doNOP, 44 /* , */, 83,0, TRUE} // 80
, {doIntervalSame, 125 /* } */, 86,0, TRUE} // 81
, {doIntervalError, 255, 197,0, FALSE} // 82
, {doIntervalUpperDigit, 128, 83,0, TRUE} // 83 interval-upper
, {doIntervalError, 255, 206,0, FALSE} // 82
, {doIntervalUpperDigit, 129, 83,0, TRUE} // 83 interval-upper
, {doNOP, 125 /* } */, 86,0, TRUE} // 84
, {doIntervalError, 255, 197,0, FALSE} // 85
, {doIntervalError, 255, 206,0, FALSE} // 85
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 86 interval-type
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 87
, {doInterval, 255, 20,0, FALSE} // 88
@ -231,109 +240,118 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 92
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 93
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 94
, {doNOP, 107 /* k */, 110,0, TRUE} // 95
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 96
, {doProperty, 112 /* p */, 14,0, FALSE} // 97
, {doProperty, 80 /* P */, 14,0, FALSE} // 98
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 99
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 100
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 101
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 102
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 103
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 104
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 105
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 106
, {doBackRef, 128, 14,0, TRUE} // 107
, {doEscapeError, 253, 197,0, FALSE} // 108
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 109
, {doBeginNamedBackRef, 60 /* < */, 112,0, TRUE} // 110 named-backref
, {doBadNamedCapture, 255, 197,0, FALSE} // 111
, {doContinueNamedBackRef, 129, 114,0, TRUE} // 112 named-backref-2
, {doBadNamedCapture, 255, 197,0, FALSE} // 113
, {doContinueNamedBackRef, 129, 114,0, TRUE} // 114 named-backref-3
, {doContinueNamedBackRef, 128, 114,0, TRUE} // 115
, {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 116
, {doBadNamedCapture, 255, 197,0, FALSE} // 117
, {doSetNegate, 94 /* ^ */, 121,0, TRUE} // 118 set-open
, {doSetPosixProp, 58 /* : */, 123,0, FALSE} // 119
, {doNOP, 255, 121,0, FALSE} // 120
, {doSetLiteral, 93 /* ] */, 136,0, TRUE} // 121 set-open2
, {doNOP, 255, 126,0, FALSE} // 122
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 123 set-posix
, {doNOP, 58 /* : */, 126,0, FALSE} // 124
, {doRuleError, 255, 197,0, FALSE} // 125
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 126 set-start
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 127
, {doNOP, 92 /* \ */, 186,0, TRUE} // 128
, {doNOP, 45 /* - */, 132,0, TRUE} // 129
, {doNOP, 38 /* & */, 134,0, TRUE} // 130
, {doSetLiteral, 255, 136,0, TRUE} // 131
, {doRuleError, 45 /* - */, 197,0, FALSE} // 132 set-start-dash
, {doSetAddDash, 255, 136,0, FALSE} // 133
, {doRuleError, 38 /* & */, 197,0, FALSE} // 134 set-start-amp
, {doSetAddAmp, 255, 136,0, FALSE} // 135
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 136 set-after-lit
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 137
, {doNOP, 45 /* - */, 173,0, TRUE} // 138
, {doNOP, 38 /* & */, 164,0, TRUE} // 139
, {doNOP, 92 /* \ */, 186,0, TRUE} // 140
, {doSetNoCloseError, 253, 197,0, FALSE} // 141
, {doSetLiteral, 255, 136,0, TRUE} // 142
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 143 set-after-set
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 144
, {doNOP, 45 /* - */, 166,0, TRUE} // 145
, {doNOP, 38 /* & */, 161,0, TRUE} // 146
, {doNOP, 92 /* \ */, 186,0, TRUE} // 147
, {doSetNoCloseError, 253, 197,0, FALSE} // 148
, {doSetLiteral, 255, 136,0, TRUE} // 149
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 150 set-after-range
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 151
, {doNOP, 45 /* - */, 169,0, TRUE} // 152
, {doNOP, 38 /* & */, 171,0, TRUE} // 153
, {doNOP, 92 /* \ */, 186,0, TRUE} // 154
, {doSetNoCloseError, 253, 197,0, FALSE} // 155
, {doSetLiteral, 255, 136,0, TRUE} // 156
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 157 set-after-op
, {doSetOpError, 93 /* ] */, 197,0, FALSE} // 158
, {doNOP, 92 /* \ */, 186,0, TRUE} // 159
, {doSetLiteral, 255, 136,0, TRUE} // 160
, {doSetBeginIntersection1, 91 /* [ */, 118, 143, TRUE} // 161 set-set-amp
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 162
, {doSetAddAmp, 255, 136,0, FALSE} // 163
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 164 set-lit-amp
, {doSetAddAmp, 255, 136,0, FALSE} // 165
, {doSetBeginDifference1, 91 /* [ */, 118, 143, TRUE} // 166 set-set-dash
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 167
, {doSetAddDash, 255, 136,0, FALSE} // 168
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 169 set-range-dash
, {doSetAddDash, 255, 136,0, FALSE} // 170
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 171 set-range-amp
, {doSetAddAmp, 255, 136,0, FALSE} // 172
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 173 set-lit-dash
, {doSetAddDash, 91 /* [ */, 136,0, FALSE} // 174
, {doSetAddDash, 93 /* ] */, 136,0, FALSE} // 175
, {doNOP, 92 /* \ */, 178,0, TRUE} // 176
, {doSetRange, 255, 150,0, TRUE} // 177
, {doSetOpError, 115 /* s */, 197,0, FALSE} // 178 set-lit-dash-escape
, {doSetOpError, 83 /* S */, 197,0, FALSE} // 179
, {doSetOpError, 119 /* w */, 197,0, FALSE} // 180
, {doSetOpError, 87 /* W */, 197,0, FALSE} // 181
, {doSetOpError, 100 /* d */, 197,0, FALSE} // 182
, {doSetOpError, 68 /* D */, 197,0, FALSE} // 183
, {doSetNamedRange, 78 /* N */, 150,0, FALSE} // 184
, {doSetRange, 255, 150,0, TRUE} // 185
, {doSetProp, 112 /* p */, 143,0, FALSE} // 186 set-escape
, {doSetProp, 80 /* P */, 143,0, FALSE} // 187
, {doSetNamedChar, 78 /* N */, 136,0, FALSE} // 188
, {doSetBackslash_s, 115 /* s */, 150,0, TRUE} // 189
, {doSetBackslash_S, 83 /* S */, 150,0, TRUE} // 190
, {doSetBackslash_w, 119 /* w */, 150,0, TRUE} // 191
, {doSetBackslash_W, 87 /* W */, 150,0, TRUE} // 192
, {doSetBackslash_d, 100 /* d */, 150,0, TRUE} // 193
, {doSetBackslash_D, 68 /* D */, 150,0, TRUE} // 194
, {doSetLiteralEscaped, 255, 136,0, TRUE} // 195
, {doSetFinish, 255, 14,0, FALSE} // 196 set-finish
, {doExit, 255, 197,0, TRUE} // 197 errorDeath
, {doBackslashh, 104 /* h */, 14,0, TRUE} // 95
, {doBackslashH, 72 /* H */, 14,0, TRUE} // 96
, {doNOP, 107 /* k */, 115,0, TRUE} // 97
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 98
, {doProperty, 112 /* p */, 14,0, FALSE} // 99
, {doProperty, 80 /* P */, 14,0, FALSE} // 100
, {doBackslashR, 82 /* R */, 14,0, TRUE} // 101
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 102
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 103
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 104
, {doBackslashv, 118 /* v */, 14,0, TRUE} // 105
, {doBackslashV, 86 /* V */, 14,0, TRUE} // 106
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 107
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 108
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 109
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 110
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 111
, {doBackRef, 129, 14,0, TRUE} // 112
, {doEscapeError, 253, 206,0, FALSE} // 113
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 114
, {doBeginNamedBackRef, 60 /* < */, 117,0, TRUE} // 115 named-backref
, {doBadNamedCapture, 255, 206,0, FALSE} // 116
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 117 named-backref-2
, {doBadNamedCapture, 255, 206,0, FALSE} // 118
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 119 named-backref-3
, {doContinueNamedBackRef, 129, 119,0, TRUE} // 120
, {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 121
, {doBadNamedCapture, 255, 206,0, FALSE} // 122
, {doSetNegate, 94 /* ^ */, 126,0, TRUE} // 123 set-open
, {doSetPosixProp, 58 /* : */, 128,0, FALSE} // 124
, {doNOP, 255, 126,0, FALSE} // 125
, {doSetLiteral, 93 /* ] */, 141,0, TRUE} // 126 set-open2
, {doNOP, 255, 131,0, FALSE} // 127
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 128 set-posix
, {doNOP, 58 /* : */, 131,0, FALSE} // 129
, {doRuleError, 255, 206,0, FALSE} // 130
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 131 set-start
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 132
, {doNOP, 92 /* \ */, 191,0, TRUE} // 133
, {doNOP, 45 /* - */, 137,0, TRUE} // 134
, {doNOP, 38 /* & */, 139,0, TRUE} // 135
, {doSetLiteral, 255, 141,0, TRUE} // 136
, {doRuleError, 45 /* - */, 206,0, FALSE} // 137 set-start-dash
, {doSetAddDash, 255, 141,0, FALSE} // 138
, {doRuleError, 38 /* & */, 206,0, FALSE} // 139 set-start-amp
, {doSetAddAmp, 255, 141,0, FALSE} // 140
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 141 set-after-lit
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 142
, {doNOP, 45 /* - */, 178,0, TRUE} // 143
, {doNOP, 38 /* & */, 169,0, TRUE} // 144
, {doNOP, 92 /* \ */, 191,0, TRUE} // 145
, {doSetNoCloseError, 253, 206,0, FALSE} // 146
, {doSetLiteral, 255, 141,0, TRUE} // 147
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 148 set-after-set
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 149
, {doNOP, 45 /* - */, 171,0, TRUE} // 150
, {doNOP, 38 /* & */, 166,0, TRUE} // 151
, {doNOP, 92 /* \ */, 191,0, TRUE} // 152
, {doSetNoCloseError, 253, 206,0, FALSE} // 153
, {doSetLiteral, 255, 141,0, TRUE} // 154
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 155 set-after-range
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 156
, {doNOP, 45 /* - */, 174,0, TRUE} // 157
, {doNOP, 38 /* & */, 176,0, TRUE} // 158
, {doNOP, 92 /* \ */, 191,0, TRUE} // 159
, {doSetNoCloseError, 253, 206,0, FALSE} // 160
, {doSetLiteral, 255, 141,0, TRUE} // 161
, {doSetBeginUnion, 91 /* [ */, 123, 148, TRUE} // 162 set-after-op
, {doSetOpError, 93 /* ] */, 206,0, FALSE} // 163
, {doNOP, 92 /* \ */, 191,0, TRUE} // 164
, {doSetLiteral, 255, 141,0, TRUE} // 165
, {doSetBeginIntersection1, 91 /* [ */, 123, 148, TRUE} // 166 set-set-amp
, {doSetIntersection2, 38 /* & */, 162,0, TRUE} // 167
, {doSetAddAmp, 255, 141,0, FALSE} // 168
, {doSetIntersection2, 38 /* & */, 162,0, TRUE} // 169 set-lit-amp
, {doSetAddAmp, 255, 141,0, FALSE} // 170
, {doSetBeginDifference1, 91 /* [ */, 123, 148, TRUE} // 171 set-set-dash
, {doSetDifference2, 45 /* - */, 162,0, TRUE} // 172
, {doSetAddDash, 255, 141,0, FALSE} // 173
, {doSetDifference2, 45 /* - */, 162,0, TRUE} // 174 set-range-dash
, {doSetAddDash, 255, 141,0, FALSE} // 175
, {doSetIntersection2, 38 /* & */, 162,0, TRUE} // 176 set-range-amp
, {doSetAddAmp, 255, 141,0, FALSE} // 177
, {doSetDifference2, 45 /* - */, 162,0, TRUE} // 178 set-lit-dash
, {doSetAddDash, 91 /* [ */, 141,0, FALSE} // 179
, {doSetAddDash, 93 /* ] */, 141,0, FALSE} // 180
, {doNOP, 92 /* \ */, 183,0, TRUE} // 181
, {doSetRange, 255, 155,0, TRUE} // 182
, {doSetOpError, 115 /* s */, 206,0, FALSE} // 183 set-lit-dash-escape
, {doSetOpError, 83 /* S */, 206,0, FALSE} // 184
, {doSetOpError, 119 /* w */, 206,0, FALSE} // 185
, {doSetOpError, 87 /* W */, 206,0, FALSE} // 186
, {doSetOpError, 100 /* d */, 206,0, FALSE} // 187
, {doSetOpError, 68 /* D */, 206,0, FALSE} // 188
, {doSetNamedRange, 78 /* N */, 155,0, FALSE} // 189
, {doSetRange, 255, 155,0, TRUE} // 190
, {doSetProp, 112 /* p */, 148,0, FALSE} // 191 set-escape
, {doSetProp, 80 /* P */, 148,0, FALSE} // 192
, {doSetNamedChar, 78 /* N */, 141,0, FALSE} // 193
, {doSetBackslash_s, 115 /* s */, 155,0, TRUE} // 194
, {doSetBackslash_S, 83 /* S */, 155,0, TRUE} // 195
, {doSetBackslash_w, 119 /* w */, 155,0, TRUE} // 196
, {doSetBackslash_W, 87 /* W */, 155,0, TRUE} // 197
, {doSetBackslash_d, 100 /* d */, 155,0, TRUE} // 198
, {doSetBackslash_D, 68 /* D */, 155,0, TRUE} // 199
, {doSetBackslash_h, 104 /* h */, 155,0, TRUE} // 200
, {doSetBackslash_H, 72 /* H */, 155,0, TRUE} // 201
, {doSetBackslash_v, 118 /* v */, 155,0, TRUE} // 202
, {doSetBackslash_V, 86 /* V */, 155,0, TRUE} // 203
, {doSetLiteralEscaped, 255, 141,0, TRUE} // 204
, {doSetFinish, 255, 14,0, FALSE} // 205 set-finish
, {doExit, 255, 206,0, TRUE} // 206 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@ -444,6 +462,11 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
"named-backref",
0,
@ -530,6 +553,10 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
0,
0,
"set-finish",
"errorDeath",

View file

@ -1,7 +1,7 @@
#!/usr/bin/perl
# ********************************************************************
# * COPYRIGHT:
# * Copyright (c) 2002-2007, International Business Machines Corporation and
# * Copyright (c) 2002-2015, International Business Machines Corporation and
# * others. All Rights Reserved.
# ********************************************************************
#
@ -206,7 +206,7 @@ print "// This file contains the state table for the ICU Regular Expression P
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
print "// the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
print "// Copyright (C) 2002-2007 International Business Machines Corporation \n";
print "// Copyright (C) 2002-2015 International Business Machines Corporation \n";
print "// and others. All rights reserved. \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";

View file

@ -250,13 +250,18 @@ backslash:
'd' n expr-quant doBackslashd
'D' n expr-quant doBackslashD
'G' n term doBackslashG
'h' n expr-quant doBackslashh
'H' n expr-quant doBackslashH
'k' n named-backref
'N' expr-quant doNamedChar # \N{NAME} named char
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
'R' n expr-quant doBackslashR
'Q' n term doEnterQuoteMode
'S' n expr-quant doBackslashS
's' n expr-quant doBackslashs
'v' n expr-quant doBackslashv
'V' n expr-quant doBackslashV
'W' n expr-quant doBackslashW
'w' n expr-quant doBackslashw
'X' n expr-quant doBackslashX
@ -472,6 +477,10 @@ set-escape:
'W' n set-after-range doSetBackslash_W
'd' n set-after-range doSetBackslash_d
'D' n set-after-range doSetBackslash_D
'h' n set-after-range doSetBackslash_h
'H' n set-after-range doSetBackslash_H
'v' n set-after-range doSetBackslash_v
'V' n set-after-range doSetBackslash_V
default n set-after-lit doSetLiteralEscaped
#

View file

@ -1,5 +1,5 @@
//
// Copyright (C) 2002-2014 International Business Machines Corporation
// Copyright (C) 2002-2015 International Business Machines Corporation
// and others. All rights reserved.
//
// file: regeximp.h
@ -173,7 +173,10 @@ enum {
URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style
// word boundaries.
URX_DOLLAR_D = 54, // $ end of input test, in UNIX_LINES mode.
URX_DOLLAR_MD = 55 // $ end of input test, in MULTI_LINE and UNIX_LINES mode.
URX_DOLLAR_MD = 55, // $ end of input test, in MULTI_LINE and UNIX_LINES mode.
URX_BACKSLASH_H = 56, // Value field: 0: \h 1: \H
URX_BACKSLASH_R = 57, // Any line break sequence.
URX_BACKSLASH_V = 58 // Value field: 0: \v 1: \V
};
@ -235,7 +238,10 @@ enum {
"LOOP_DOT_I", \
"BACKSLASH_BU", \
"DOLLAR_D", \
"DOLLAR_MD"
"DOLLAR_MD", \
"URX_BACKSLASH_H", \
"URX_BACKSLASH_R", \
"URX_BACKSLASH_V"
//

View file

@ -49,6 +49,15 @@ static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
// This constant determines that state saves per tick number.
static const int32_t TIMER_INITIAL_VALUE = 10000;
// Report whether a code point is one of the Unicode line terminators:
//    U+000A..U+000D, U+0085, U+2028 or U+2029.
//
// The common case (an ordinary character) is rejected with a single mask test:
// OR-ing all seven terminators together gives the bits that any terminator may
// have set, so a code point with some other bit set cannot be a terminator.
// Only the few code points that survive the mask get the exact comparison.
static inline UBool isLineTerminator(UChar32 c) {
    const UChar32 terminatorBits = 0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029;
    if ((c & ~terminatorBits) != 0) {
        return false;
    }
    switch (c) {
    case 0x0a:
    case 0x0b:
    case 0x0c:
    case 0x0d:
    case 0x85:
    case 0x2028:
    case 0x2029:
        return true;
    default:
        return false;
    }
}
//-----------------------------------------------------------------------------
//
// Constructor and Destructor
@ -837,20 +846,19 @@ UBool RegexMatcher::find(UErrorCode &status) {
}
} else {
for (;;) {
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
(void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
}
MatchAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
if (isLineTerminator(c)) {
if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
(void)UTEXT_NEXT32(fInputText);
startPos = UTEXT_GETNATIVEINDEX(fInputText);
}
MatchAt(startPos, FALSE, status);
if (U_FAILURE(status)) {
return FALSE;
}
if (fMatch) {
return TRUE;
}
UTEXT_SETNATIVEINDEX(fInputText, startPos);
}
if (startPos >= testStartLimit) {
fMatch = FALSE;
@ -1098,8 +1106,7 @@ UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
} else {
for (;;) {
c = inputBuf[startPos-1];
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
if (isLineTerminator(c)) {
if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
startPos++;
}
@ -2927,9 +2934,9 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
// end of input, succeed.
UChar32 c = UTEXT_NEXT32(fInputText);
if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
if (isLineTerminator(c)) {
// If not in the middle of a CR/LF sequence
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
// At new-line at end of input. Success
fHitEnd = TRUE;
fRequireEnd = TRUE;
@ -2985,7 +2992,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
// It makes no difference where the new-line is within the input.
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_CURRENT32(fInputText);
if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
if (isLineTerminator(c)) {
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence
// In multi-line mode, hitting a new-line just before the end of input does not
// set the hitEnd or requireEnd flags
@ -3034,8 +3041,7 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
// unless we are at the end of input
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_PREVIOUS32(fInputText);
if ((fp->fInputIdx < fAnchorLimit) &&
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
// It's a new-line. ^ is true. Success.
// TODO: what should be done with positions between a CR and LF?
break;
@ -3116,6 +3122,68 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
break;
case URX_BACKSLASH_H: // Test for \h, horizontal white space.
{
// At or past the end of the active region there is nothing to match:
// record that the end was hit and backtrack.
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
int8_t ctype = u_charType(c);
UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
success ^= (UBool)(opValue != 0); // flip sense for \H
if (success) {
// Consume the character: continue matching from just after it.
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
// No match at this position: backtrack.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_R: // Test for \R, any line break sequence.
{
// At or past the end of the active region there is nothing to match:
// record that the end was hit and backtrack.
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
if (isLineTerminator(c)) {
// A CR immediately followed by LF is consumed as a single line break.
// NOTE(review): this look-ahead for LF is not bounded by fActiveLimit,
// unlike the CR/LF checks in find() - confirm whether a CR that is the
// last character of the active region should consume an LF beyond it.
if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
utext_next32(fInputText);
}
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
// Not a line terminator: backtrack.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_V: // \v, any single line ending character.
{
// At or past the end of the active region there is nothing to match:
// record that the end was hit and backtrack.
if (fp->fInputIdx >= fActiveLimit) {
fHitEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
}
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
UChar32 c = UTEXT_NEXT32(fInputText);
UBool success = isLineTerminator(c);
success ^= (UBool)(opValue != 0); // flip sense for \V
if (success) {
// Consume exactly one code point; unlike \R, CR LF is not paired here.
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
} else {
// No match at this position: backtrack.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
}
}
break;
case URX_BACKSLASH_X:
// Match a Grapheme, as defined by Unicode TR 29.
// Differs slightly from Perl, which consumes combining marks independently
@ -3343,8 +3411,7 @@ GC_Done:
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c = UTEXT_NEXT32(fInputText);
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
if (isLineTerminator(c)) {
// End of line in normal mode. . does not match.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
@ -4101,7 +4168,7 @@ GC_Done:
if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
if ((c == 0x0a) || // 0x0a is newline in both modes.
(((opValue & 2) == 0) && // IF not UNIX_LINES mode
(c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
isLineTerminator(c))) {
// char is a line ending. Exit the scanning loop.
break;
}
@ -4432,7 +4499,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
UChar32 c;
U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
if (isLineTerminator(c)) {
if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
// At new-line at end of input. Success
fHitEnd = TRUE;
@ -4486,7 +4553,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
// If we are positioned just before a new-line, succeed.
// It makes no difference where the new-line is within the input.
UChar32 c = inputBuf[fp->fInputIdx];
if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
if (isLineTerminator(c)) {
// At a line end, except for the odd chance of being in the middle of a CR/LF sequence
// In multi-line mode, hitting a new-line just before the end of input does not
// set the hitEnd or requireEnd flags
@ -4534,7 +4601,7 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
// unless we are at the end of input
UChar c = inputBuf[fp->fInputIdx - 1];
if ((fp->fInputIdx < fAnchorLimit) &&
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
isLineTerminator(c)) {
// It's a new-line. ^ is true. Success.
// TODO: what should be done with positions between a CR and LF?
break;
@ -4611,6 +4678,69 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
break;
case URX_BACKSLASH_H:            // \h (operand 0) or \H (operand non-zero): horizontal white space test.
    {
        if (fp->fInputIdx >= fActiveLimit) {
            // Nothing left to examine in the active region.
            fHitEnd = TRUE;
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
            break;
        }
        // Read one code point; U16_NEXT advances fp->fInputIdx past it.
        UChar32 ch;
        U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, ch);
        // Horizontal white space is general category Zs (space separator) plus ASCII TAB.
        const int8_t category = u_charType(ch);
        const UBool isHorizontalSpace = (category == U_SPACE_SEPARATOR || ch == 9);
        const UBool invert = (UBool)(opValue != 0);     // set for \H
        if (isHorizontalSpace == invert) {
            // Either \h saw a non-space, or \H saw a space. No match.
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
        }
    }
    break;
case URX_BACKSLASH_R:       // Test for \R, any line break sequence.
    {
        // No input left in the active region: note that the end was hit, then fail.
        if (fp->fInputIdx >= fActiveLimit) {
            fHitEnd = TRUE;
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
            break;
        }
        // Read one code point; U16_NEXT advances fp->fInputIdx past it.
        UChar32 c;
        U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
        if (isLineTerminator(c)) {
            // Check for CR/LF sequence. Consume both together when found.
            // LF is a single BMP code unit, so peek at the next code unit directly.
            // Decoding a whole code point into a 16-bit variable instead would
            // truncate supplementary characters (U+1000A becomes 0x000A) and
            // make them look like a LF.
            if (c == 0x0d &&
                    fp->fInputIdx < fActiveLimit &&
                    inputBuf[fp->fInputIdx] == 0x0a) {
                fp->fInputIdx++;
            }
        } else {
            // Not a line terminator. \R does not match here.
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
        }
    }
    break;
case URX_BACKSLASH_V:         // \v (operand 0) or \V (operand non-zero): single code point line ending test.
    {
        if (fp->fInputIdx >= fActiveLimit) {
            // Nothing left to examine in the active region.
            fHitEnd = TRUE;
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
            break;
        }
        // Read one code point; U16_NEXT advances fp->fInputIdx past it.
        UChar32 ch;
        U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, ch);
        // A non-zero operand inverts the line terminator test, giving \V.
        const UBool invert = (UBool)(opValue != 0);
        const UBool matched = isLineTerminator(ch) ^ invert;
        if (!matched) {
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
        }
    }
    break;
case URX_BACKSLASH_X:
// Match a Grapheme, as defined by Unicode TR 29.
// Differs slightly from Perl, which consumes combining marks independently
@ -4820,8 +4950,7 @@ GC_Done:
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
if (isLineTerminator(c)) {
// End of line in normal mode. . does not match.
fp = (REStackFrame *)fStack->popFrame(fFrameSize);
break;
@ -5535,7 +5664,7 @@ GC_Done:
if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
if ((c == 0x0a) || // 0x0a is newline in both modes.
(((opValue & 2) == 0) && // IF not UNIX_LINES mode
((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
isLineTerminator(c))) {
// char is a line ending. Put the input pos back to the
// line ending char, and exit the scanning loop.
U16_BACK_1(inputBuf, 0, ix);

View file

@ -742,6 +742,9 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_LBN_END:
case URX_LOOP_C:
case URX_LOOP_DOT_I:
case URX_BACKSLASH_H:
case URX_BACKSLASH_R:
case URX_BACKSLASH_V:
// types with an integer operand field.
printf("%d", val);
break;

View file

@ -693,6 +693,78 @@
"abc\jkl" "<0>abcjkl</0>" # escape of a non-special letter is just itself.
"abc[ \j]kl" "<0>abcjkl</0>"
#
# \R all newline sequences.
#
"abc\Rxyz" "<0>abc\u000axyz</0>gh"
"abc\Rxyz" "<0>abc\u000bxyz</0>gh"
"abc\Rxyz" "<0>abc\u000cxyz</0>gh"
"abc\Rxyz" "<0>abc\u000dxyz</0>gh"
"abc\Rxyz" "<0>abc\u0085xyz</0>gh"
"abc\Rxyz" "<0>abc\u2028xyz</0>gh"
"abc\Rxyz" "<0>abc\u2029xyz</0>gh"
"abc\Rxyz" "<0>abc\u000d\u000axyz</0>gh"
"abc\R\nxyz" "abc\u000d\u000axyzgh" # \R cannot match only the CR from a CR/LF sequence.
"abc\r\nxyz" "<0>abc\u000d\u000axyz</0>gh"
"abc\Rxyz" "abc\u0009xyz" # Assorted non-matches.
"abc\Rxyz" "abc\u000exyz"
"abc\Rxyz" "abc\u202axyz"
# \v \V single character new line sequences.
"abc\vxyz" "<0>abc\u000axyz</0>gh"
"abc\vxyz" "<0>abc\u000bxyz</0>gh"
"abc\vxyz" "<0>abc\u000cxyz</0>gh"
"abc\vxyz" "<0>abc\u000dxyz</0>gh"
"abc\vxyz" "<0>abc\u0085xyz</0>gh"
"abc\vxyz" "<0>abc\u2028xyz</0>gh"
"abc\vxyz" "<0>abc\u2029xyz</0>gh"
"abc\vxyz" "abc\u000d\u000axyzgh"
"abc\vxyz" "abc?xyzgh"
"abc[\v]xyz" "<0>abc\u000axyz</0>gh"
"abc[\v]xyz" "<0>abc\u000bxyz</0>gh"
"abc[\v]xyz" "<0>abc\u000cxyz</0>gh"
"abc[\v]xyz" "<0>abc\u000dxyz</0>gh"
"abc[\v]xyz" "<0>abc\u0085xyz</0>gh"
"abc[\v]xyz" "<0>abc\u2028xyz</0>gh"
"abc[\v]xyz" "<0>abc\u2029xyz</0>gh"
"abc[\v]xyz" "abc\u000d\u000axyzgh"
"abc[\v]xyz" "abc?xyzgh"
"abc\Vxyz" "abc\u000axyzgh"
"abc\Vxyz" "abc\u000bxyzgh"
"abc\Vxyz" "abc\u000cxyzgh"
"abc\Vxyz" "abc\u000dxyzgh"
"abc\Vxyz" "abc\u0085xyzgh"
"abc\Vxyz" "abc\u2028xyzgh"
"abc\Vxyz" "abc\u2029xyzgh"
"abc\Vxyz" "abc\u000d\u000axyzgh"
"abc\Vxyz" "<0>abc?xyz</0>gh"
# \h \H horizontal white space. Defined as gc=space_separator plus ascii tab
"abc\hxyz" "<0>abc xyz</0>gh"
"abc\Hxyz" "abc xyzgh"
"abc\hxyz" "<0>abc\u2003xyz</0>gh"
"abc\Hxyz" "abc\u2003xyzgh"
"abc\hxyz" "<0>abc\u0009xyz</0>gh"
"abc\Hxyz" "abc\u0009xyzgh"
"abc\hxyz" "abc?xyzgh"
"abc\Hxyz" "<0>abc?xyz</0>gh"
"abc[\h]xyz" "<0>abc xyz</0>gh"
"abc[\H]xyz" "abc xyzgh"
"abc[\h]xyz" "<0>abc\u2003xyz</0>gh"
"abc[\H]xyz" "abc\u2003xyzgh"
"abc[\h]xyz" "<0>abc\u0009xyz</0>gh"
"abc[\H]xyz" "abc\u0009xyzgh"
"abc[\h]xyz" "abc?xyzgh"
"abc[\H]xyz" "<0>abc?xyz</0>gh"
#
# Bug xxxx
#