ICU-5312 Regular Expressions Named Capture.

X-SVN-Rev: 37040
This commit is contained in:
Andy Heninger 2015-02-18 23:56:19 +00:00
parent da811f1dfe
commit ec3f77f878
16 changed files with 1050 additions and 384 deletions

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1996-2014, International Business Machines
* Copyright (C) 1996-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -651,8 +651,9 @@ typedef enum UErrorCode {
U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by user callback fn. */
#ifndef U_HIDE_DRAFT_API
U_REGEX_PATTERN_TOO_BIG, /**< Pattern exceeds limits on size or complexity. @draft ICU 55 */
U_REGEX_INVALID_CAPTURE_GROUP_NAME, /**< Invalid capture group name. @draft ICU 55 */
#endif /* U_HIDE_DRAFT_API */
U_REGEX_ERROR_LIMIT=U_REGEX_STOPPED_BY_CALLER+2, /**< This must always be the last value to indicate the limit for regexp errors */
U_REGEX_ERROR_LIMIT=U_REGEX_STOPPED_BY_CALLER+3, /**< This must always be the last value to indicate the limit for regexp errors */
/*
* The error code in the range 0x10400-0x104ff are reserved for IDNA related error codes

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 1997-2014, International Business Machines
* Copyright (C) 1997-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -166,7 +166,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_STACK_OVERFLOW",
"U_REGEX_TIME_OUT",
"U_REGEX_STOPPED_BY_CALLER",
"U_REGEX_PATTERN_TOO_BIG"
"U_REGEX_PATTERN_TOO_BIG",
"U_REGEX_INVALID_CAPTURE_GROUP_NAME"
};
static const char * const

View file

@ -70,6 +70,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
fMatchOpenParen = -1;
fMatchCloseParen = -1;
fCaptureName = NULL;
if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
status = rxp->fDeferredStatus;
@ -86,6 +87,8 @@ static const UChar chDash = 0x2d; // '-'
//
//------------------------------------------------------------------------------
RegexCompile::~RegexCompile() {
// Release any capture group name that was still being accumulated.
// fCaptureName is normally NULL at this point; it can be non-NULL only
// if pattern compilation stopped with a syntax error.
delete fCaptureName;
}
static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
@ -286,17 +289,6 @@ void RegexCompile::compile(
// The pattern has now been read and processed, and the compiled code generated.
//
//
// Compute the number of digits requried for the largest capture group number.
//
fRXPat->fMaxCaptureDigits = 1;
int32_t n = 10;
int32_t groupCount = fRXPat->fGroupMap->size();
while (n <= groupCount) {
fRXPat->fMaxCaptureDigits++;
n *= 10;
}
//
// The pattern's fFrameSize so far has accumulated the requirements for
// storage for capture parentheses, counters, etc. that are encountered
@ -438,8 +430,25 @@ UBool RegexCompile::doParseActions(int32_t action)
break;
case doBeginNamedCapture:
// Scanning (?<letter.
// The first letter of the name will come through again under doContinueNamedCapture.
// Allocate the string in which the group name is accumulated.
fCaptureName = new UnicodeString();
if (fCaptureName == NULL) {
error(U_MEMORY_ALLOCATION_ERROR);
}
break;
case doContinueNamedCapture:
// Append the current pattern character (fC.fChar) to the name being built.
fCaptureName->append(fC.fChar);
break;
case doBadNamedCapture:
// Report an invalid capture group name.
error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
break;
case doOpenCaptureParen:
// Open Paren.
// Open Capturing Paren, possibly named.
// Compile to a
// - NOP, which later may be replaced by a save-state if the
// parenthesized group gets a * quantifier, followed by
@ -474,8 +483,18 @@ UBool RegexCompile::doParseActions(int32_t action)
// Save the mapping from group number to stack frame variable position.
fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
// If this is a named capture group, add the name->group number mapping.
if (fCaptureName != NULL) {
int32_t groupNumber = fRXPat->fGroupMap->size();
int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
fCaptureName = NULL; // hash table takes ownership of the name (key) string.
if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
}
}
}
break;
break;
case doOpenNonCaptureParen:
// Open non-capturing (grouping only) Paren.
@ -1270,7 +1289,41 @@ UBool RegexCompile::doParseActions(int32_t action)
}
break;
case doBeginNamedBackRef:
// Start of a named back reference. No name may be pending at this point.
// Allocate the string in which the referenced group name is accumulated.
U_ASSERT(fCaptureName == NULL);
fCaptureName = new UnicodeString;
if (fCaptureName == NULL) {
error(U_MEMORY_ALLOCATION_ERROR);
}
break;
case doContinueNamedBackRef:
// Append the current pattern character (fC.fChar) to the referenced name.
fCaptureName->append(fC.fChar);
break;
case doCompleteNamedBackRef:
// The complete name has been scanned. Look up its group number and
// emit the same back reference op that a numbered reference would use.
{
int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
if (groupNumber == 0) {
// Group name has not been defined.
// Could be a forward reference. If we choose to support them at some
// future time, extra mechanism will be required at this point.
error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
} else {
// Given the number, handle identically to a \n numbered back reference.
// See comments above, under doBackRef
fixLiterals(FALSE);
if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
appendOp(URX_BACKREF_I, groupNumber);
} else {
appendOp(URX_BACKREF, groupNumber);
}
}
// The name was only needed for the lookup; release it on both paths.
delete fCaptureName;
fCaptureName = NULL;
break;
}
case doPossessivePlus:
// Possessive ++ quantifier.
// Compiles to

View file

@ -1,7 +1,7 @@
//
// regexcmp.h
//
// Copyright (C) 2002-2014, International Business Machines Corporation and others.
// Copyright (C) 2002-2015, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for the class RegexCompile
@ -220,6 +220,9 @@ private:
UChar32 fLastSetLiteral; // The last single code point added to a set.
// needed when "-y" is scanned, and we need
// to turn "x-y" into a range.
UnicodeString *fCaptureName; // Named Capture, the group name is built up
// in this string while being scanned.
};
// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]

View file

@ -5,7 +5,7 @@
// It is generated by the Perl script "regexcst.pl" from
// the rule parser state definitions file "regexcst.txt".
//
// Copyright (C) 2002-2007 International Business Machines Corporation
// Copyright (C) 2002-2015 International Business Machines Corporation
// and others. All rights reserved.
//
//---------------------------------------------------------------------------------
@ -17,100 +17,107 @@ U_NAMESPACE_BEGIN
// Character classes for regex pattern scanning.
//
static const uint8_t kRuleSet_digit_char = 128;
static const uint8_t kRuleSet_rule_char = 129;
static const uint8_t kRuleSet_ascii_letter = 129;
static const uint8_t kRuleSet_rule_char = 130;
enum Regex_PatternParseAction {
doLiteralChar,
doSetEnd,
doBackslashA,
doSetBeginUnion,
doNOP,
doSetBackslash_w,
doSetRange,
doBackslashG,
doPerlInline,
doSetAddDash,
doIntevalLowerDigit,
doProperty,
doBackslashX,
doOpenAtomicParen,
doSetLiteralEscaped,
doPatFinish,
doSetBackslash_D,
doSetDifference2,
doNamedChar,
doNGPlus,
doOpenLookBehindNeg,
doIntervalError,
doIntervalSame,
doBackRef,
doPlus,
doOpenCaptureParen,
doMismatchedParenErr,
doBeginMatchMode,
doEscapeError,
doOpenNonCaptureParen,
doDollar,
doSetProp,
doIntervalUpperDigit,
doSetBegin,
doBackslashs,
doOpenLookBehind,
doPossessiveOpt,
doOpenLookBehindNeg,
doDotAny,
doSetBackslash_D,
doSetLiteral,
doSetBackslash_S,
doEscapeError,
doSetBackslash_W,
doDollar,
doBackslashb,
doSetOpError,
doBackslashG,
doPatStart,
doMismatchedParenErr,
doPossessivePlus,
doBackslashX,
doSetBackslash_s,
doSetBackslash_w,
doBackslashW,
doBackslashw,
doSetMatchMode,
doOrOperator,
doCaret,
doMatchModeParen,
doStar,
doOpt,
doMatchMode,
doSuppressComments,
doPossessiveInterval,
doOpenLookAheadNeg,
doBackslashW,
doCloseParen,
doSetOpError,
doIntervalInit,
doSetFinish,
doSetIntersection2,
doNGStar,
doEnterQuoteMode,
doSetAddAmp,
doBackslashB,
doBackslashw,
doPossessiveOpt,
doSetNegate,
doRuleError,
doBackslashb,
doConditionalExpr,
doPossessivePlus,
doBadOpenParenType,
doNGInterval,
doSetLiteral,
doSetNamedChar,
doBackslashd,
doSetBeginDifference1,
doBackslashD,
doExit,
doSetBackslash_S,
doInterval,
doSetNoCloseError,
doNGOpt,
doSetPosixProp,
doOpenLookBehind,
doBackslashS,
doBackslashZ,
doSetBeginIntersection1,
doSetBackslash_W,
doBeginMatchMode,
doNOP,
doSetProp,
doBackslashA,
doIntervalInit,
doOpenCaptureParen,
doNGPlus,
doIntervalError,
doSetDifference2,
doNGOpt,
doEscapedLiteralChar,
doSetNegate,
doSetBegin,
doMatchModeParen,
doLiteralChar,
doOpt,
doSetIntersection2,
doBadOpenParenType,
doSuppressComments,
doCloseParen,
doPatFinish,
doSetBeginUnion,
doSetBackslash_d,
doProperty,
doNGInterval,
doNGStar,
doOpenLookAhead,
doBadModeFlag,
doPatStart,
doSetBeginIntersection1,
doBeginNamedCapture,
doInterval,
doMatchMode,
doSetNoCloseError,
doSetBeginDifference1,
doPlus,
doBackslashD,
doSetLiteralEscaped,
doContinueNamedCapture,
doSetPosixProp,
doBackslashz,
doSetNamedRange,
doPossessiveStar,
doEscapedLiteralChar,
doSetBackslash_s,
doBackslashz,
doDotAny,
doBadModeFlag,
doContinueNamedBackRef,
doPerlInline,
doBackslashd,
doOpenNonCaptureParen,
doSetEnd,
doSetAddDash,
doSetFinish,
doCaret,
doConditionalExpr,
doExit,
doNamedChar,
doSetRange,
doPossessiveInterval,
doBackslashs,
doIntervalSame,
doEnterQuoteMode,
doOpenAtomicParen,
doSetNamedChar,
doRuleError,
doStar,
doSetAddAmp,
doBackslashB,
doCompleteNamedBackRef,
doBackslashZ,
doIntevalLowerDigit,
doBeginNamedBackRef,
doBackRef,
doBadNamedCapture,
rbbiLastAction};
//-------------------------------------------------------------------------------
@ -132,21 +139,21 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doPatStart, 255, 2,0, FALSE} // 1 start
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
, {doLiteralChar, 129, 14,0, TRUE} // 3
, {doSetBegin, 91 /* [ */, 104, 182, TRUE} // 4
, {doLiteralChar, 130, 14,0, TRUE} // 3
, {doSetBegin, 91 /* [ */, 118, 196, TRUE} // 4
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
, {doCaret, 94 /* ^ */, 14,0, TRUE} // 7
, {doDollar, 36 /* $ */, 14,0, TRUE} // 8
, {doNOP, 92 /* \ */, 84,0, TRUE} // 9
, {doNOP, 92 /* \ */, 89,0, TRUE} // 9
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
, {doPatFinish, 253, 2,0, FALSE} // 12
, {doRuleError, 255, 183,0, FALSE} // 13
, {doNOP, 42 /* * */, 63,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 66,0, TRUE} // 15
, {doNOP, 63 /* ? */, 69,0, TRUE} // 16
, {doIntervalInit, 123 /* { */, 72,0, TRUE} // 17
, {doRuleError, 255, 197,0, FALSE} // 13
, {doNOP, 42 /* * */, 68,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 71,0, TRUE} // 15
, {doNOP, 63 /* ? */, 74,0, TRUE} // 16
, {doIntervalInit, 123 /* { */, 77,0, TRUE} // 17
, {doNOP, 40 /* ( */, 23,0, TRUE} // 18
, {doNOP, 255, 20,0, FALSE} // 19
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
@ -154,7 +161,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doNOP, 255, 2,0, FALSE} // 22
, {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
, {doNOP, 255, 27,0, FALSE} // 24
, {doNOP, 35 /* # */, 49, 14, TRUE} // 25 open-paren-quant2
, {doNOP, 35 /* # */, 50, 14, TRUE} // 25 open-paren-quant2
, {doNOP, 255, 29,0, FALSE} // 26
, {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
@ -163,156 +170,170 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
, {doNOP, 60 /* < */, 46,0, TRUE} // 33
, {doNOP, 35 /* # */, 49, 2, TRUE} // 34
, {doBeginMatchMode, 105 /* i */, 52,0, FALSE} // 35
, {doBeginMatchMode, 100 /* d */, 52,0, FALSE} // 36
, {doBeginMatchMode, 109 /* m */, 52,0, FALSE} // 37
, {doBeginMatchMode, 115 /* s */, 52,0, FALSE} // 38
, {doBeginMatchMode, 117 /* u */, 52,0, FALSE} // 39
, {doBeginMatchMode, 119 /* w */, 52,0, FALSE} // 40
, {doBeginMatchMode, 120 /* x */, 52,0, FALSE} // 41
, {doBeginMatchMode, 45 /* - */, 52,0, FALSE} // 42
, {doConditionalExpr, 40 /* ( */, 183,0, TRUE} // 43
, {doPerlInline, 123 /* { */, 183,0, TRUE} // 44
, {doBadOpenParenType, 255, 183,0, FALSE} // 45
, {doNOP, 35 /* # */, 50, 2, TRUE} // 34
, {doBeginMatchMode, 105 /* i */, 53,0, FALSE} // 35
, {doBeginMatchMode, 100 /* d */, 53,0, FALSE} // 36
, {doBeginMatchMode, 109 /* m */, 53,0, FALSE} // 37
, {doBeginMatchMode, 115 /* s */, 53,0, FALSE} // 38
, {doBeginMatchMode, 117 /* u */, 53,0, FALSE} // 39
, {doBeginMatchMode, 119 /* w */, 53,0, FALSE} // 40
, {doBeginMatchMode, 120 /* x */, 53,0, FALSE} // 41
, {doBeginMatchMode, 45 /* - */, 53,0, FALSE} // 42
, {doConditionalExpr, 40 /* ( */, 197,0, TRUE} // 43
, {doPerlInline, 123 /* { */, 197,0, TRUE} // 44
, {doBadOpenParenType, 255, 197,0, FALSE} // 45
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47
, {doBadOpenParenType, 255, 183,0, FALSE} // 48
, {doNOP, 41 /* ) */, 255,0, TRUE} // 49 paren-comment
, {doMismatchedParenErr, 253, 183,0, FALSE} // 50
, {doNOP, 255, 49,0, TRUE} // 51
, {doMatchMode, 105 /* i */, 52,0, TRUE} // 52 paren-flag
, {doMatchMode, 100 /* d */, 52,0, TRUE} // 53
, {doMatchMode, 109 /* m */, 52,0, TRUE} // 54
, {doMatchMode, 115 /* s */, 52,0, TRUE} // 55
, {doMatchMode, 117 /* u */, 52,0, TRUE} // 56
, {doMatchMode, 119 /* w */, 52,0, TRUE} // 57
, {doMatchMode, 120 /* x */, 52,0, TRUE} // 58
, {doMatchMode, 45 /* - */, 52,0, TRUE} // 59
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 60
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 61
, {doBadModeFlag, 255, 183,0, FALSE} // 62
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 63 quant-star
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 64
, {doStar, 255, 20,0, FALSE} // 65
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 66 quant-plus
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 67
, {doPlus, 255, 20,0, FALSE} // 68
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 69 quant-opt
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 70
, {doOpt, 255, 20,0, FALSE} // 71
, {doNOP, 128, 74,0, FALSE} // 72 interval-open
, {doIntervalError, 255, 183,0, FALSE} // 73
, {doIntevalLowerDigit, 128, 74,0, TRUE} // 74 interval-lower
, {doNOP, 44 /* , */, 78,0, TRUE} // 75
, {doIntervalSame, 125 /* } */, 81,0, TRUE} // 76
, {doIntervalError, 255, 183,0, FALSE} // 77
, {doIntervalUpperDigit, 128, 78,0, TRUE} // 78 interval-upper
, {doNOP, 125 /* } */, 81,0, TRUE} // 79
, {doIntervalError, 255, 183,0, FALSE} // 80
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 81 interval-type
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 82
, {doInterval, 255, 20,0, FALSE} // 83
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 84 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 85
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 86
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 87
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 88
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 89
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 90
, {doProperty, 112 /* p */, 14,0, FALSE} // 91
, {doProperty, 80 /* P */, 14,0, FALSE} // 92
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 93
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 94
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 95
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 96
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 97
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 98
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 99
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 100
, {doBackRef, 128, 14,0, TRUE} // 101
, {doEscapeError, 253, 183,0, FALSE} // 102
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 103
, {doSetNegate, 94 /* ^ */, 107,0, TRUE} // 104 set-open
, {doSetPosixProp, 58 /* : */, 109,0, FALSE} // 105
, {doNOP, 255, 107,0, FALSE} // 106
, {doSetLiteral, 93 /* ] */, 122,0, TRUE} // 107 set-open2
, {doNOP, 255, 112,0, FALSE} // 108
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 109 set-posix
, {doNOP, 58 /* : */, 112,0, FALSE} // 110
, {doRuleError, 255, 183,0, FALSE} // 111
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 112 set-start
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 113
, {doNOP, 92 /* \ */, 172,0, TRUE} // 114
, {doNOP, 45 /* - */, 118,0, TRUE} // 115
, {doNOP, 38 /* & */, 120,0, TRUE} // 116
, {doSetLiteral, 255, 122,0, TRUE} // 117
, {doRuleError, 45 /* - */, 183,0, FALSE} // 118 set-start-dash
, {doSetAddDash, 255, 122,0, FALSE} // 119
, {doRuleError, 38 /* & */, 183,0, FALSE} // 120 set-start-amp
, {doSetAddAmp, 255, 122,0, FALSE} // 121
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 122 set-after-lit
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 123
, {doNOP, 45 /* - */, 159,0, TRUE} // 124
, {doNOP, 38 /* & */, 150,0, TRUE} // 125
, {doNOP, 92 /* \ */, 172,0, TRUE} // 126
, {doSetNoCloseError, 253, 183,0, FALSE} // 127
, {doSetLiteral, 255, 122,0, TRUE} // 128
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 129 set-after-set
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 130
, {doNOP, 45 /* - */, 152,0, TRUE} // 131
, {doNOP, 38 /* & */, 147,0, TRUE} // 132
, {doNOP, 92 /* \ */, 172,0, TRUE} // 133
, {doSetNoCloseError, 253, 183,0, FALSE} // 134
, {doSetLiteral, 255, 122,0, TRUE} // 135
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 136 set-after-range
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 137
, {doNOP, 45 /* - */, 155,0, TRUE} // 138
, {doNOP, 38 /* & */, 157,0, TRUE} // 139
, {doNOP, 92 /* \ */, 172,0, TRUE} // 140
, {doSetNoCloseError, 253, 183,0, FALSE} // 141
, {doSetLiteral, 255, 122,0, TRUE} // 142
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 143 set-after-op
, {doSetOpError, 93 /* ] */, 183,0, FALSE} // 144
, {doNOP, 92 /* \ */, 172,0, TRUE} // 145
, {doSetLiteral, 255, 122,0, TRUE} // 146
, {doSetBeginIntersection1, 91 /* [ */, 104, 129, TRUE} // 147 set-set-amp
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 148
, {doSetAddAmp, 255, 122,0, FALSE} // 149
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 150 set-lit-amp
, {doSetAddAmp, 255, 122,0, FALSE} // 151
, {doSetBeginDifference1, 91 /* [ */, 104, 129, TRUE} // 152 set-set-dash
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 153
, {doSetAddDash, 255, 122,0, FALSE} // 154
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 155 set-range-dash
, {doSetAddDash, 255, 122,0, FALSE} // 156
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 157 set-range-amp
, {doSetAddAmp, 255, 122,0, FALSE} // 158
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 159 set-lit-dash
, {doSetAddDash, 91 /* [ */, 122,0, FALSE} // 160
, {doSetAddDash, 93 /* ] */, 122,0, FALSE} // 161
, {doNOP, 92 /* \ */, 164,0, TRUE} // 162
, {doSetRange, 255, 136,0, TRUE} // 163
, {doSetOpError, 115 /* s */, 183,0, FALSE} // 164 set-lit-dash-escape
, {doSetOpError, 83 /* S */, 183,0, FALSE} // 165
, {doSetOpError, 119 /* w */, 183,0, FALSE} // 166
, {doSetOpError, 87 /* W */, 183,0, FALSE} // 167
, {doSetOpError, 100 /* d */, 183,0, FALSE} // 168
, {doSetOpError, 68 /* D */, 183,0, FALSE} // 169
, {doSetNamedRange, 78 /* N */, 136,0, FALSE} // 170
, {doSetRange, 255, 136,0, TRUE} // 171
, {doSetProp, 112 /* p */, 129,0, FALSE} // 172 set-escape
, {doSetProp, 80 /* P */, 129,0, FALSE} // 173
, {doSetNamedChar, 78 /* N */, 122,0, FALSE} // 174
, {doSetBackslash_s, 115 /* s */, 136,0, TRUE} // 175
, {doSetBackslash_S, 83 /* S */, 136,0, TRUE} // 176
, {doSetBackslash_w, 119 /* w */, 136,0, TRUE} // 177
, {doSetBackslash_W, 87 /* W */, 136,0, TRUE} // 178
, {doSetBackslash_d, 100 /* d */, 136,0, TRUE} // 179
, {doSetBackslash_D, 68 /* D */, 136,0, TRUE} // 180
, {doSetLiteralEscaped, 255, 122,0, TRUE} // 181
, {doSetFinish, 255, 14,0, FALSE} // 182 set-finish
, {doExit, 255, 183,0, TRUE} // 183 errorDeath
, {doBeginNamedCapture, 129, 64,0, FALSE} // 48
, {doBadOpenParenType, 255, 197,0, FALSE} // 49
, {doNOP, 41 /* ) */, 255,0, TRUE} // 50 paren-comment
, {doMismatchedParenErr, 253, 197,0, FALSE} // 51
, {doNOP, 255, 50,0, TRUE} // 52
, {doMatchMode, 105 /* i */, 53,0, TRUE} // 53 paren-flag
, {doMatchMode, 100 /* d */, 53,0, TRUE} // 54
, {doMatchMode, 109 /* m */, 53,0, TRUE} // 55
, {doMatchMode, 115 /* s */, 53,0, TRUE} // 56
, {doMatchMode, 117 /* u */, 53,0, TRUE} // 57
, {doMatchMode, 119 /* w */, 53,0, TRUE} // 58
, {doMatchMode, 120 /* x */, 53,0, TRUE} // 59
, {doMatchMode, 45 /* - */, 53,0, TRUE} // 60
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 61
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 62
, {doBadModeFlag, 255, 197,0, FALSE} // 63
, {doContinueNamedCapture, 129, 64,0, TRUE} // 64 named-capture
, {doContinueNamedCapture, 128, 64,0, TRUE} // 65
, {doOpenCaptureParen, 62 /* > */, 2, 14, TRUE} // 66
, {doBadNamedCapture, 255, 197,0, FALSE} // 67
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 68 quant-star
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 69
, {doStar, 255, 20,0, FALSE} // 70
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 71 quant-plus
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 72
, {doPlus, 255, 20,0, FALSE} // 73
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 74 quant-opt
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 75
, {doOpt, 255, 20,0, FALSE} // 76
, {doNOP, 128, 79,0, FALSE} // 77 interval-open
, {doIntervalError, 255, 197,0, FALSE} // 78
, {doIntevalLowerDigit, 128, 79,0, TRUE} // 79 interval-lower
, {doNOP, 44 /* , */, 83,0, TRUE} // 80
, {doIntervalSame, 125 /* } */, 86,0, TRUE} // 81
, {doIntervalError, 255, 197,0, FALSE} // 82
, {doIntervalUpperDigit, 128, 83,0, TRUE} // 83 interval-upper
, {doNOP, 125 /* } */, 86,0, TRUE} // 84
, {doIntervalError, 255, 197,0, FALSE} // 85
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 86 interval-type
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 87
, {doInterval, 255, 20,0, FALSE} // 88
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 89 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 90
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 91
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 92
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 93
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 94
, {doNOP, 107 /* k */, 110,0, TRUE} // 95
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 96
, {doProperty, 112 /* p */, 14,0, FALSE} // 97
, {doProperty, 80 /* P */, 14,0, FALSE} // 98
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 99
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 100
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 101
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 102
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 103
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 104
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 105
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 106
, {doBackRef, 128, 14,0, TRUE} // 107
, {doEscapeError, 253, 197,0, FALSE} // 108
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 109
, {doBeginNamedBackRef, 60 /* < */, 112,0, TRUE} // 110 named-backref
, {doBadNamedCapture, 255, 197,0, FALSE} // 111
, {doContinueNamedBackRef, 129, 114,0, TRUE} // 112 named-backref-2
, {doBadNamedCapture, 255, 197,0, FALSE} // 113
, {doContinueNamedBackRef, 129, 114,0, TRUE} // 114 named-backref-3
, {doContinueNamedBackRef, 128, 114,0, TRUE} // 115
, {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 116
, {doBadNamedCapture, 255, 197,0, FALSE} // 117
, {doSetNegate, 94 /* ^ */, 121,0, TRUE} // 118 set-open
, {doSetPosixProp, 58 /* : */, 123,0, FALSE} // 119
, {doNOP, 255, 121,0, FALSE} // 120
, {doSetLiteral, 93 /* ] */, 136,0, TRUE} // 121 set-open2
, {doNOP, 255, 126,0, FALSE} // 122
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 123 set-posix
, {doNOP, 58 /* : */, 126,0, FALSE} // 124
, {doRuleError, 255, 197,0, FALSE} // 125
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 126 set-start
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 127
, {doNOP, 92 /* \ */, 186,0, TRUE} // 128
, {doNOP, 45 /* - */, 132,0, TRUE} // 129
, {doNOP, 38 /* & */, 134,0, TRUE} // 130
, {doSetLiteral, 255, 136,0, TRUE} // 131
, {doRuleError, 45 /* - */, 197,0, FALSE} // 132 set-start-dash
, {doSetAddDash, 255, 136,0, FALSE} // 133
, {doRuleError, 38 /* & */, 197,0, FALSE} // 134 set-start-amp
, {doSetAddAmp, 255, 136,0, FALSE} // 135
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 136 set-after-lit
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 137
, {doNOP, 45 /* - */, 173,0, TRUE} // 138
, {doNOP, 38 /* & */, 164,0, TRUE} // 139
, {doNOP, 92 /* \ */, 186,0, TRUE} // 140
, {doSetNoCloseError, 253, 197,0, FALSE} // 141
, {doSetLiteral, 255, 136,0, TRUE} // 142
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 143 set-after-set
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 144
, {doNOP, 45 /* - */, 166,0, TRUE} // 145
, {doNOP, 38 /* & */, 161,0, TRUE} // 146
, {doNOP, 92 /* \ */, 186,0, TRUE} // 147
, {doSetNoCloseError, 253, 197,0, FALSE} // 148
, {doSetLiteral, 255, 136,0, TRUE} // 149
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 150 set-after-range
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 151
, {doNOP, 45 /* - */, 169,0, TRUE} // 152
, {doNOP, 38 /* & */, 171,0, TRUE} // 153
, {doNOP, 92 /* \ */, 186,0, TRUE} // 154
, {doSetNoCloseError, 253, 197,0, FALSE} // 155
, {doSetLiteral, 255, 136,0, TRUE} // 156
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 157 set-after-op
, {doSetOpError, 93 /* ] */, 197,0, FALSE} // 158
, {doNOP, 92 /* \ */, 186,0, TRUE} // 159
, {doSetLiteral, 255, 136,0, TRUE} // 160
, {doSetBeginIntersection1, 91 /* [ */, 118, 143, TRUE} // 161 set-set-amp
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 162
, {doSetAddAmp, 255, 136,0, FALSE} // 163
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 164 set-lit-amp
, {doSetAddAmp, 255, 136,0, FALSE} // 165
, {doSetBeginDifference1, 91 /* [ */, 118, 143, TRUE} // 166 set-set-dash
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 167
, {doSetAddDash, 255, 136,0, FALSE} // 168
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 169 set-range-dash
, {doSetAddDash, 255, 136,0, FALSE} // 170
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 171 set-range-amp
, {doSetAddAmp, 255, 136,0, FALSE} // 172
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 173 set-lit-dash
, {doSetAddDash, 91 /* [ */, 136,0, FALSE} // 174
, {doSetAddDash, 93 /* ] */, 136,0, FALSE} // 175
, {doNOP, 92 /* \ */, 178,0, TRUE} // 176
, {doSetRange, 255, 150,0, TRUE} // 177
, {doSetOpError, 115 /* s */, 197,0, FALSE} // 178 set-lit-dash-escape
, {doSetOpError, 83 /* S */, 197,0, FALSE} // 179
, {doSetOpError, 119 /* w */, 197,0, FALSE} // 180
, {doSetOpError, 87 /* W */, 197,0, FALSE} // 181
, {doSetOpError, 100 /* d */, 197,0, FALSE} // 182
, {doSetOpError, 68 /* D */, 197,0, FALSE} // 183
, {doSetNamedRange, 78 /* N */, 150,0, FALSE} // 184
, {doSetRange, 255, 150,0, TRUE} // 185
, {doSetProp, 112 /* p */, 143,0, FALSE} // 186 set-escape
, {doSetProp, 80 /* P */, 143,0, FALSE} // 187
, {doSetNamedChar, 78 /* N */, 136,0, FALSE} // 188
, {doSetBackslash_s, 115 /* s */, 150,0, TRUE} // 189
, {doSetBackslash_S, 83 /* S */, 150,0, TRUE} // 190
, {doSetBackslash_w, 119 /* w */, 150,0, TRUE} // 191
, {doSetBackslash_W, 87 /* W */, 150,0, TRUE} // 192
, {doSetBackslash_d, 100 /* d */, 150,0, TRUE} // 193
, {doSetBackslash_D, 68 /* D */, 150,0, TRUE} // 194
, {doSetLiteralEscaped, 255, 136,0, TRUE} // 195
, {doSetFinish, 255, 14,0, FALSE} // 196 set-finish
, {doExit, 255, 197,0, TRUE} // 197 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@ -362,6 +383,7 @@ static const char * const RegexStateNames[] = { 0,
0,
"open-paren-lookbehind",
0,
0,
0,
"paren-comment",
0,
@ -376,6 +398,10 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
"named-capture",
0,
0,
0,
"quant-star",
0,
@ -417,6 +443,15 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
"named-backref",
0,
"named-backref-2",
0,
"named-backref-3",
0,
0,
0,
"set-open",
0,

View file

@ -1,7 +1,7 @@
#*****************************************************************************
#
# Copyright (C) 2002-2007, International Business Machines Corporation and others.
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
# All Rights Reserved.
#
#*****************************************************************************
@ -147,6 +147,7 @@ open-paren-extended:
open-paren-lookbehind:
'=' n term ^expr-cont doOpenLookBehind # (?<=
'!' n term ^expr-cont doOpenLookBehindNeg # (?<!
ascii_letter named-capture doBeginNamedCapture # (?<name
default errorDeath doBadOpenParenType
@ -174,6 +175,14 @@ paren-flag:
':' n term ^expr-quant doMatchModeParen
default errorDeath doBadModeFlag
#
# named-capture (?<name> ... ), position currently on the name.
#
named-capture:
ascii_letter n named-capture doContinueNamedCapture
digit_char n named-capture doContinueNamedCapture
'>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
default errorDeath doBadNamedCapture
#
# quant-star Scanning a '*' quantifier. Need to look ahead to decide
@ -241,6 +250,7 @@ backslash:
'd' n expr-quant doBackslashd
'D' n expr-quant doBackslashD
'G' n term doBackslashG
'k' n named-backref
'N' expr-quant doNamedChar # \N{NAME} named char
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
@ -257,6 +267,24 @@ backslash:
default n expr-quant doEscapedLiteralChar
# named-backref Scanned \k
# Leading to \k<captureName>
# Failure to get the full sequence is an error.
#
named-backref:
'<' n named-backref-2 doBeginNamedBackRef
default errorDeath doBadNamedCapture
named-backref-2:
ascii_letter n named-backref-3 doContinueNamedBackRef
default errorDeath doBadNamedCapture
named-backref-3:
ascii_letter n named-backref-3 doContinueNamedBackRef
digit_char n named-backref-3 doContinueNamedBackRef
'>' n expr-quant doCompleteNamedBackRef
default errorDeath doBadNamedCapture
#
# [set expression] parsing,

View file

@ -1,7 +1,7 @@
//
// regexst.h
//
// Copyright (C) 2004-2013, International Business Machines Corporation and others.
// Copyright (C) 2004-2015, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains class RegexStaticSets
@ -55,11 +55,6 @@ static const UChar gRuleSet_rule_char_pattern[] = {
// \ { \ } \ ^ \ $ \ | \ \ \ . ]
0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};
static const UChar gRuleSet_digit_char_pattern[] = {
// [ 0 - 9 ]
0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
//
// Here are the backslash escape characters that ICU's unescape() function
// will handle.
@ -213,23 +208,29 @@ fEmptyText(NULL)
// Sets used while parsing rules, but not referenced from the parse state table
fRuleSets[kRuleSet_rule_char-128] = UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
fRuleSets[kRuleSet_digit_char-128] = UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
fRuleSets[kRuleSet_digit_char-128].add((UChar)0x30, (UChar)0x39); // [0-9]
fRuleSets[kRuleSet_ascii_letter-128].add((UChar)0x41, (UChar)0x5A); // [A-Z]
fRuleSets[kRuleSet_ascii_letter-128].add((UChar)0x61, (UChar)0x7A); // [a-z]
fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
for (i=0; i<UPRV_LENGTHOF(fRuleSets); i++) {
fRuleSets[i].compact();
}
// Finally, initialize an empty string for utility purposes
fEmptyText = utext_openUChars(NULL, NULL, 0, status);
return; // If we reached this point, everything is fine so just exit
if (U_SUCCESS(*status)) {
return;
}
ExitConstrDeleteAll: // Remove fPropSets and fRuleSets and return error
for (i=0; i<URX_LAST_SET; i++) {
delete fPropSets[i];
fPropSets[i] = NULL;
}
*status = U_MEMORY_ALLOCATION_ERROR;
if (U_SUCCESS(*status)) {
*status = U_MEMORY_ALLOCATION_ERROR;
}
}

View file

@ -257,6 +257,9 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) {
static const UChar BACKSLASH = 0x5c;
static const UChar DOLLARSIGN = 0x24;
static const UChar LEFTBRACKET = 0x7b;
static const UChar RIGHTBRACKET = 0x7d;
//--------------------------------------------------------------------------------
//
// appendReplacement
@ -331,8 +334,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
// TODO: optimize this loop by efficiently scanning for '$' or '\',
// move entire ranges not containing substitutions.
UTEXT_SETNATIVEINDEX(replacement, 0);
UChar32 c = UTEXT_NEXT32(replacement);
while (c != U_SENTINEL) {
for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
if (c == BACKSLASH) {
// Backslash Escape. Copy the following char out without further checks.
// Note: Surrogate pairs don't need any special handling
@ -398,51 +400,69 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
}
}
} else {
// We've got a $. Pick up a capture group number if one follows.
// Consume at most the number of digits necessary for the largest capture
// number that is valid for this pattern.
// We've got a $. Pick up a capture group name or number if one follows.
// Consume digits so long as the resulting group number <= the number of
// capture groups in the pattern.
int32_t numDigits = 0;
int32_t groupNum = 0;
UChar32 digitC;
for (;;) {
digitC = UTEXT_CURRENT32(replacement);
if (digitC == U_SENTINEL) {
break;
int32_t numDigits = 0;
UChar32 nextChar = utext_current32(replacement);
if (nextChar == LEFTBRACKET) {
// Scan for a Named Capture Group, ${name}.
UnicodeString groupName;
utext_next32(replacement);
while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
nextChar = utext_next32(replacement);
if (nextChar == U_SENTINEL) {
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
} else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
(nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
(nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
groupName.append(nextChar);
} else if (nextChar == RIGHTBRACKET) {
groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
if (groupNum == 0) {
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
} else {
// Character was something other than a name char or a closing '}'
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
}
if (u_isdigit(digitC) == FALSE) {
break;
} else if (u_isdigit(nextChar)) {
// $n Scan for a capture group number
int32_t numCaptureGroups = fPattern->fGroupMap->size();
for (;;) {
nextChar = UTEXT_CURRENT32(replacement);
if (nextChar == U_SENTINEL) {
break;
}
if (u_isdigit(nextChar) == FALSE) {
break;
}
int32_t nextDigitVal = u_charDigitValue(nextChar);
if (groupNum*10 + nextDigitVal > numCaptureGroups) {
// Don't consume the next digit if it makes the capture group number too big.
if (numDigits == 0) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
}
break;
}
(void)UTEXT_NEXT32(replacement);
groupNum=groupNum*10 + nextDigitVal;
++numDigits;
}
(void)UTEXT_NEXT32(replacement);
groupNum=groupNum*10 + u_charDigitValue(digitC);
numDigits++;
if (numDigits >= fPattern->fMaxCaptureDigits) {
break;
}
}
if (numDigits == 0) {
// The $ didn't introduce a group number at all.
// Treat it as just part of the substitution text.
UChar c16 = DOLLARSIGN;
destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
} else {
// Finally, append the capture group data to the destination.
destLen += appendGroup(groupNum, dest, status);
if (U_FAILURE(status)) {
// Can fail if group number is out of range.
break;
}
// $ not followed by capture group name or number.
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
}
if (U_FAILURE(status)) {
break;
} else {
c = UTEXT_NEXT32(replacement);
}
}
if (U_SUCCESS(status)) {
destLen += appendGroup(groupNum, dest, status);
}
} // End of $ capture group handling
} // End of per-character loop through the replacement string.
return *this;
}
@ -1201,7 +1221,6 @@ UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
}
//--------------------------------------------------------------------------------
//
// appendGroup() -- currently internal only, appends a group to a UText rather
@ -1282,8 +1301,6 @@ int32_t RegexMatcher::groupCount() const {
return fPattern->fGroupMap->size();
}
//--------------------------------------------------------------------------------
//
// hasAnchoringBounds()

View file

@ -3,7 +3,7 @@
//
/*
***************************************************************************
* Copyright (C) 2002-2014 International Business Machines Corporation *
* Copyright (C) 2002-2015 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
@ -15,6 +15,7 @@
#include "unicode/regex.h"
#include "unicode/uclean.h"
#include "uassert.h"
#include "uhash.h"
#include "uvector.h"
#include "uvectr32.h"
#include "uvectr64.h"
@ -92,7 +93,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fMinMatchLen = other.fMinMatchLen;
fFrameSize = other.fFrameSize;
fDataSize = other.fDataSize;
fMaxCaptureDigits = other.fMaxCaptureDigits;
fStaticSets = other.fStaticSets;
fStaticSets8 = other.fStaticSets8;
@ -133,6 +133,21 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fSets8[i] = other.fSets8[i];
}
// Copy the named capture group hash map.
int32_t hashPos = UHASH_FIRST;
while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
if (U_FAILURE(fDeferredStatus)) {
break;
}
const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
UnicodeString *key = new UnicodeString(*name);
int32_t val = hashEl->value.integer;
if (key == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
} else {
uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
}
}
return *this;
}
@ -154,7 +169,6 @@ void RegexPattern::init() {
fFrameSize = 0;
fDataSize = 0;
fGroupMap = NULL;
fMaxCaptureDigits = 1;
fStaticSets = NULL;
fStaticSets8 = NULL;
fStartType = START_NO_INFO;
@ -164,6 +178,7 @@ void RegexPattern::init() {
fInitialChar = 0;
fInitialChars8 = NULL;
fNeedsAltInput = FALSE;
fNamedCaptureMap = NULL;
fPattern = NULL; // will be set later
fPatternString = NULL; // may be set later
@ -172,17 +187,24 @@ void RegexPattern::init() {
fSets = new UVector(fDeferredStatus);
fInitialChars = new UnicodeSet;
fInitialChars8 = new Regex8BitSet;
fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function
uhash_compareUnicodeString, // Key comparator function
uhash_compareLong, // Value comparator function
&fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return;
}
if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
fInitialChars == NULL || fInitialChars8 == NULL) {
fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
// Slot zero of the vector of sets is reserved. Fill it here.
fSets->addElement((int32_t)0, fDeferredStatus);
// fNamedCaptureMap owns its key strings, type (UnicodeString *)
uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
}
@ -220,6 +242,8 @@ void RegexPattern::zap() {
delete fPatternString;
fPatternString = NULL;
}
uhash_close(fNamedCaptureMap);
fNamedCaptureMap = NULL;
}
@ -577,6 +601,34 @@ UText *RegexPattern::patternText(UErrorCode &status) const {
}
//--------------------------------------------------------------------------------
//
// groupNumberFromName()
//
//--------------------------------------------------------------------------------
// Look up the capture group number registered under groupName.
//
//   groupName  The name to look up in fNamedCaptureMap.
//   status     If already a failure on entry, it is left untouched and 0 is
//              returned. Set to U_REGEX_INVALID_CAPTURE_GROUP_NAME when the
//              lookup finds nothing.
//
// Returns the number stored in the map for groupName, or 0 on any failure.
int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }

    // No need to explicitly check for syntactically valid names.
    // Invalid ones will never be in the map, and the lookup will fail.
    //
    // fNamedCaptureMap is NULL when init() could not create it; treat that
    // the same as "name not found" rather than passing NULL to uhash_geti().
    int32_t number = (fNamedCaptureMap != NULL) ? uhash_geti(fNamedCaptureMap, &groupName) : 0;
    if (number == 0) {
        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
    }
    return number;
}
// Overload taking the group name as invariant-character char data.
// Returns 0 without touching the name if status is already a failure;
// otherwise converts the name and defers to the UnicodeString overload.
int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const {
    if (U_SUCCESS(status)) {
        const UnicodeString convertedName(groupName, nameLength, US_INV);
        return groupNumberFromName(convertedName, status);
    }
    return 0;
}
//---------------------------------------------------------------------
//
@ -754,6 +806,7 @@ void RegexPattern::dumpOp(int32_t index) const {
void RegexPattern::dumpPattern() const {
#if defined(REGEX_DEBUG)
// TODO: This function assumes an ASCII based charset.
int index;
int i;
@ -805,6 +858,21 @@ void RegexPattern::dumpPattern() const {
}
}
printf("Named Capture Groups:\n");
if (uhash_count(fNamedCaptureMap) == 0) {
printf(" None\n");
} else {
int32_t pos = UHASH_FIRST;
const UHashElement *el = NULL;
while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
const UnicodeString *name = (const UnicodeString *)el->key.pointer;
char s[100];
name->extract(0, 99, s, sizeof(s), US_INV); // capture group names are invariant.
int32_t number = el->value.integer;
printf(" %d\t%s\n", number, s);
}
}
printf("\nIndex Binary Type Operand\n" \
"-------------------------------------------\n");
for (index = 0; index<fCompiledPat->size(); index++) {

View file

@ -55,6 +55,8 @@
// Forward Declarations
struct UHashtable;
U_NAMESPACE_BEGIN
struct Regex8BitSet;
@ -136,7 +138,7 @@ public:
/**
* Create an exact copy of this RegexPattern object. Since RegexPattern is not
* intended to be subclasses, <code>clone()</code> and the copy construction are
* intended to be subclassed, <code>clone()</code> and the copy construction are
* equivalent operations.
* @return the copy of this RegexPattern
* @stable ICU 2.4
@ -437,6 +439,41 @@ public:
virtual UText *patternText(UErrorCode &status) const;
/**
* Get the group number corresponding to a named capture group.
* The returned number can be used with any function that accesses
* capture groups by number.
*
* The function returns an error status if the specified name does not
* appear in the pattern.
*
* @param groupName The capture group name.
* @param status A UErrorCode to receive any errors.
*
* @draft ICU 55
*/
virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
/**
* Get the group number corresponding to a named capture group.
* The returned number can be used with any function that accesses
* capture groups by number.
*
* The function returns an error status if the specified name does not
* appear in the pattern.
*
* @param groupName The capture group name,
* platform invariant characters only.
* @param nameLength The length of the name, or -1 if the name is
* nul-terminated.
* @param status A UErrorCode to receive any errors.
*
* @draft ICU 55
*/
virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
/**
* Split a string into fields. Somewhat like split() from Perl or Java.
* Pattern matches identify delimiters that separate the input
@ -573,8 +610,6 @@ private:
UVector32 *fGroupMap; // Map from capture group number to position of
// the group's variables in the matcher stack frame.
int32_t fMaxCaptureDigits;
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
// regex character classes, e.g. Word.
@ -589,6 +624,8 @@ private:
Regex8BitSet *fInitialChars8;
UBool fNeedsAltInput;
UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
friend class RegexCompile;
friend class RegexMatcher;
friend class RegexCImpl;
@ -854,7 +891,6 @@ public:
*/
virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
/**
* Returns the number of capturing groups in this matcher's pattern.
* @return the number of capture groups
@ -945,7 +981,6 @@ public:
*/
virtual int64_t start64(int32_t group, UErrorCode &status) const;
/**
* Returns the index in the input string of the first character following the
* text matched during the previous match operation.
@ -1015,7 +1050,6 @@ public:
*/
virtual int64_t end64(int32_t group, UErrorCode &status) const;
/**
* Resets this matcher. The effect is to remove any memory of previous matches,
* and to cause subsequent find() operations to begin at the beginning of

View file

@ -607,6 +607,53 @@ U_STABLE int32_t U_EXPORT2
uregex_groupCount(URegularExpression *regexp,
UErrorCode *status);
/**
* Get the group number corresponding to a named capture group.
* The returned number can be used with any function that accesses
* capture groups by number.
*
* The function returns an error status if the specified name does not
* appear in the pattern.
*
* @param regexp The compiled regular expression.
* @param groupName The capture group name.
* @param nameLength The length of the name, or -1 if the name is a
* nul-terminated string.
* @param status A pointer to a UErrorCode to receive any errors.
*
* @draft ICU 55
*/
U_DRAFT int32_t U_EXPORT2
uregex_groupNumberFromName(URegularExpression *regexp,
const UChar *groupName,
int32_t nameLength,
UErrorCode *status);
/**
* Get the group number corresponding to a named capture group.
* The returned number can be used with any function that accesses
* capture groups by number.
*
* The function returns an error status if the specified name does not
* appear in the pattern.
*
* @param regexp The compiled regular expression.
* @param groupName The capture group name,
* platform invariant characters only.
* @param nameLength The length of the name, or -1 if the name is
* nul-terminated.
* @param status A pointer to a UErrorCode to receive any errors.
*
* @draft ICU 55
*/
U_DRAFT int32_t U_EXPORT2
uregex_groupNumberFromCName(URegularExpression *regexp,
const char *groupName,
int32_t nameLength,
UErrorCode *status);
/** Extract the string for the specified matching expression or subexpression.
* Group #0 is the complete string of matched text.
* Group #1 is the text matched by the first set of capturing parentheses.
@ -630,8 +677,8 @@ uregex_group(URegularExpression *regexp,
int32_t destCapacity,
UErrorCode *status);
/** Returns a shallow immutable clone of the entire input string. The returned UText current native index
* is set to the beginning of the requested capture group. The capture group length is also
/** Returns a shallow immutable clone of the entire input string with the current index set
* to the beginning of the requested capture group. The capture group length is also
* returned via groupLength.
* Group #0 is the complete string of matched text.
* Group #1 is the text matched by the first set of capturing parentheses.
@ -644,7 +691,7 @@ uregex_group(URegularExpression *regexp,
* @param dest A mutable UText in which to store the current input.
* If NULL, a new UText will be created as an immutable shallow clone
* of the entire input string.
* @param groupLength The group length of the desired capture group.
* @param groupLength The group length of the desired capture group. Output parameter.
* @param status A reference to a UErrorCode to receive any errors.
* @return The subject text currently associated with this regular expression.
* If a pre-allocated UText was provided, it will always be used and returned.

View file

@ -17,14 +17,14 @@
#include "unicode/uchar.h"
#include "unicode/uobject.h"
#include "unicode/utf16.h"
#include "umutex.h"
#include "uassert.h"
#include "cmemory.h"
#include "uassert.h"
#include "uhash.h"
#include "umutex.h"
#include "uvectr32.h"
#include "regextxt.h"
#include <stdio.h>
U_NAMESPACE_BEGIN
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
@ -625,6 +625,36 @@ uregex_groupCount(URegularExpression *regexp2,
}
//------------------------------------------------------------------------------
//
// uregex_groupNumberFromName
//
//------------------------------------------------------------------------------
// C API: map a capture group name, given as UChar text, to its group number.
// Returns 0 if validateRE() rejects the handle or incoming status; otherwise
// forwards to RegexPattern::groupNumberFromName(), which reports errors via status.
int32_t
uregex_groupNumberFromName(URegularExpression *regexp2,
                           const UChar        *groupName,
                           int32_t             nameLength,
                           UErrorCode         *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return 0;
    }
    // Wrap the caller's buffer in a UnicodeString for the C++ lookup.
    const UnicodeString name(groupName, nameLength);
    return re->fPat->groupNumberFromName(name, *status);
}
// C API: map a capture group name, given as char data, to its group number.
// Returns 0 if validateRE() rejects the handle or incoming status; otherwise
// forwards to the char* overload of RegexPattern::groupNumberFromName().
int32_t
uregex_groupNumberFromCName(URegularExpression *regexp2,
                            const char         *groupName,
                            int32_t             nameLength,
                            UErrorCode         *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    int32_t groupNumber = 0;
    if (validateRE(re, FALSE, status)) {
        groupNumber = re->fPat->groupNumberFromName(groupName, nameLength, *status);
    }
    return groupNumber;
}
//------------------------------------------------------------------------------
//
// uregex_group
@ -1285,6 +1315,8 @@ U_NAMESPACE_END
static const UChar BACKSLASH = 0x5c;
static const UChar DOLLARSIGN = 0x24;
static const UChar LEFTBRACKET = 0x7b;
static const UChar RIGHTBRACKET = 0x7d;
//
// Move a character to an output buffer, with bounds checking on the index.
@ -1359,10 +1391,10 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
matchStart = (int32_t)m->fMatchStart;
} else {
// !!!: Would like a better way to do this!
UErrorCode status = U_ZERO_ERROR;
lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
status = U_ZERO_ERROR;
matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
UErrorCode tempStatus = U_ZERO_ERROR;
lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
tempStatus = U_ZERO_ERROR;
matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
}
for (i=lastMatchEnd; i<matchStart; i++) {
appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
@ -1377,7 +1409,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
// scan the replacement text, looking for substitutions ($n) and \escapes.
int32_t replIdx = 0;
while (replIdx < replacementLength) {
while (replIdx < replacementLength && U_SUCCESS(*status)) {
UChar c = replacementText[replIdx];
replIdx++;
if (c != DOLLARSIGN && c != BACKSLASH) {
@ -1426,55 +1458,84 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
continue;
}
// We've got a $. Pick up the following capture group name or number.
// For numbers, consume only digits that produce a valid capture group for the pattern.
// We've got a $. Pick up a capture group number if one follows.
// Consume at most the number of digits necessary for the largest capture
// number that is valid for this pattern.
int32_t numDigits = 0;
int32_t groupNum = 0;
UChar32 digitC;
for (;;) {
if (replIdx >= replacementLength) {
break;
}
U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
if (u_isdigit(digitC) == FALSE) {
break;
}
U_ASSERT(c == DOLLARSIGN);
UChar32 c32;
U16_GET(replacementText, 0, replIdx, replacementLength, c32);
if (u_isdigit(c32)) {
int32_t numDigits = 0;
int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
for (;;) {
if (replIdx >= replacementLength) {
break;
}
U16_GET(replacementText, 0, replIdx, replacementLength, c32);
if (u_isdigit(c32) == FALSE) {
break;
}
int32_t digitVal = u_charDigitValue(c32);
if (groupNum * 10 + digitVal <= numCaptureGroups) {
groupNum = groupNum * 10 + digitVal;
U16_FWD_1(replacementText, replIdx, replacementLength);
numDigits++;
} else {
if (numDigits == 0) {
*status = U_INDEX_OUTOFBOUNDS_ERROR;
}
break;
}
}
} else if (c32 == LEFTBRACKET) {
// Scan for Named Capture Group, ${name}.
UnicodeString groupName;
U16_FWD_1(replacementText, replIdx, replacementLength);
groupNum=groupNum*10 + u_charDigitValue(digitC);
numDigits++;
if (numDigits >= m->fPattern->fMaxCaptureDigits) {
break;
while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
if (replIdx >= replacementLength) {
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
break;
}
U16_NEXT(replacementText, replIdx, replacementLength, c32);
if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z
(c32 >= 0x61 && c32 <= 0x7a) || // a..z
(c32 >= 0x31 && c32 <= 0x39)) { // 0..9
groupName.append(c32);
} else if (c32 == RIGHTBRACKET) {
groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
if (groupNum == 0) {
// Name not defined by pattern.
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
} else {
// Character was something other than a name char or a closing '}'
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
}
} else {
// $ not followed by {name} or digits.
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
if (numDigits == 0) {
// The $ didn't introduce a group number at all.
// Treat it as just part of the substitution text.
appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
continue;
}
// Finally, append the capture group data to the destination.
destIdx += uregex_group((URegularExpression*)regexp, groupNum,
dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
if (*status == U_BUFFER_OVERFLOW_ERROR) {
// Ignore buffer overflow when extracting the group. We need to
// continue on to get full size of the untruncated result. We will
// raise our own buffer overflow error at the end.
*status = U_ZERO_ERROR;
if (U_SUCCESS(*status)) {
destIdx += uregex_group((URegularExpression*)regexp, groupNum,
dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
if (*status == U_BUFFER_OVERFLOW_ERROR) {
// Ignore buffer overflow when extracting the group. We need to
// continue on to get full size of the untruncated result. We will
// raise our own buffer overflow error at the end.
*status = U_ZERO_ERROR;
}
}
if (U_FAILURE(*status)) {
// Can fail if group number is out of range.
// bad group number or name.
break;
}
}
//
@ -1483,10 +1544,12 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
//
if (destIdx < capacity) {
dest[destIdx] = 0;
} else if (destIdx == *destCapacity) {
*status = U_STRING_NOT_TERMINATED_WARNING;
} else {
*status = U_BUFFER_OVERFLOW_ERROR;
} else if (U_SUCCESS(*status)) {
if (destIdx == *destCapacity) {
*status = U_STRING_NOT_TERMINATED_WARNING;
} else {
*status = U_BUFFER_OVERFLOW_ERROR;
}
}
//

View file

@ -1022,7 +1022,7 @@ static void TestRegexCAPI(void) {
TEST_ASSERT_SUCCESS(status);
bufPtr = buf;
bufCap = UPRV_LENGTHOF(buf);
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", UPRV_LENGTHOF(repl));
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ \\$ \\abc", UPRV_LENGTHOF(repl));
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE);
@ -1817,7 +1817,8 @@ static void TestUTextAPI(void) {
UText *result;
const char str_Replxxx[] = { 0x52, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x20, 0x3c, 0x61, 0x61, 0x3e, 0x20, 0x78, 0x31, 0x78, 0x20, 0x78, 0x2e, 0x2e, 0x2e, 0x78, 0x2e, 0x00 }; /* Replace <aa> x1x x...x. */
const char str_Nomatchhere[] = { 0x4e, 0x6f, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x68, 0x65, 0x72, 0x65, 0x2e, 0x00 }; /* No match here. */
const char str_u00411U00000042a[] = { 0x5c, 0x5c, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x31, 0x24, 0x31, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x34, 0x32, 0x24, 0x5c, 0x61, 0x00 }; /* \\\u0041$1\U00000042$\a */
const char str_u00411U00000042a[] = { 0x5c, 0x5c, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x31, 0x24, 0x31,
0x5c, 0x55, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x34, 0x32, 0x5c, 0x24, 0x5c, 0x61, 0x00 }; /* \\\u0041$1\U00000042\$\a */
const char str_1x[] = { 0x3c, 0x24, 0x31, 0x3e, 0x00 }; /* <$1> */
const char str_ReplaceAaaBax1xxx[] = { 0x52, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x20, 0x5c, 0x41, 0x61, 0x61, 0x42, 0x24, 0x61, 0x20, 0x78, 0x31, 0x78, 0x20, 0x78, 0x2e, 0x2e, 0x2e, 0x78, 0x2e, 0x00 }; /* Replace \AaaB$a x1x x...x. */
status = U_ZERO_ERROR;
@ -1925,7 +1926,7 @@ static void TestUTextAPI(void) {
TEST_ASSERT_SUCCESS(status);
bufPtr = buf;
bufCap = UPRV_LENGTHOF(buf);
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", UPRV_LENGTHOF(repl));
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ \\$ \\abc", UPRV_LENGTHOF(repl));
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE);

View file

@ -148,6 +148,15 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
case 25: name = "TestBug11371";
if (exec) TestBug11371();
break;
case 26: name = "TestBug11480";
if (exec) TestBug11480();
break;
case 27: name = "NamedCapture";
if (exec) NamedCapture();
break;
case 28: name = "NamedCaptureLimits";
if (exec) NamedCaptureLimits();
break;
default: name = "";
break; //needed to end loop
}
@ -1429,8 +1438,8 @@ void RegexTest::API_Replace() {
REGEX_ASSERT(dest == "The value of $1 is bc.defg");
dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
REGEX_ASSERT(U_FAILURE(status));
status = U_ZERO_ERROR;
UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
replacement = replacement.unescape();
@ -2633,7 +2642,9 @@ void RegexTest::API_Replace_UTF8() {
REGEX_ASSERT(result == &destText);
REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
result = matcher2->replaceFirst(&replText, NULL, status);
REGEX_CHECK_STATUS;
@ -3108,7 +3119,7 @@ void RegexTest::API_Pattern_UTF8() {
UnicodeString stringToSplit("first:second:third");
UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
REGEX_CHECK_STATUS;
UText *splits[10] = {NULL};
int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
REGEX_CHECK_STATUS;
@ -5137,7 +5148,7 @@ void RegexTest::PreAllocatedUTextCAPI () {
/* Unicode escapes */
uregex_setText(re, text1, -1, &status);
regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
REGEX_CHECK_STATUS;
@ -5196,6 +5207,276 @@ void RegexTest::PreAllocatedUTextCAPI () {
utext_close(&patternText);
}
//--------------------------------------------------------------
//
// NamedCapture Check basic named capture group functionality
//
//--------------------------------------------------------------
void RegexTest::NamedCapture() {
UErrorCode status = U_ZERO_ERROR;
RegexPattern *pat = RegexPattern::compile(UnicodeString(
"abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
REGEX_CHECK_STATUS;
int32_t group = pat->groupNumberFromName("five", -1, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(5 == group);
group = pat->groupNumberFromName("three", -1, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(3 == group);
status = U_ZERO_ERROR;
group = pat->groupNumberFromName(UnicodeString("six"), status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(6 == group);
status = U_ZERO_ERROR;
group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
status = U_ZERO_ERROR;
// After copying a pattern, named capture should still work in the copy.
RegexPattern *copiedPat = new RegexPattern(*pat);
REGEX_ASSERT(*copiedPat == *pat);
delete pat; pat = NULL; // Delete original, copy should have no references back to it.
group = copiedPat->groupNumberFromName("five", -1, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(5 == group);
group = copiedPat->groupNumberFromName("three", -1, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(3 == group);
delete copiedPat;
// ReplaceAll with named capture group.
status = U_ZERO_ERROR;
UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
REGEX_CHECK_STATUS;
// m.pattern().dumpPattern();
UnicodeString replacedText = m->replaceAll("'${mid}'", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
delete m;
// ReplaceAll, allowed capture group numbers.
text = UnicodeString("abcmxyz");
m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<$3>"), status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<$4>"), status);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
REGEX_CHECK_STATUS; // that push group num out of range.
REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("<${one"), status);
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
status = U_ZERO_ERROR;
replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
delete m;
// Repeat the above replaceAll() tests using the plain C API, which
// has a separate implementation internally.
// TODO: factor out the test data.
status = U_ZERO_ERROR;
URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
REGEX_CHECK_STATUS;
text = UnicodeString("abcmxyz");
uregex_setText(re, text.getBuffer(), text.length(), &status);
REGEX_CHECK_STATUS;
UChar resultBuf[100];
int32_t resultLength;
UnicodeString repl;
status = U_ZERO_ERROR;
repl = UnicodeString("<$0>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
status = U_ZERO_ERROR;
repl = UnicodeString("<$1>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
status = U_ZERO_ERROR;
repl = UnicodeString("<${one}>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
status = U_ZERO_ERROR;
repl = UnicodeString("<$2>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
status = U_ZERO_ERROR;
repl = UnicodeString("<$3>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
status = U_ZERO_ERROR;
repl = UnicodeString("<$4>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
status = U_ZERO_ERROR;
repl = UnicodeString("<$04>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
status = U_ZERO_ERROR;
repl = UnicodeString("<$000016>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
status = U_ZERO_ERROR;
repl = UnicodeString("<$3$2$1${one}>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
status = U_ZERO_ERROR;
repl = UnicodeString("$3$2$1${one}");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
status = U_ZERO_ERROR;
repl = UnicodeString("<${noSuchName}>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
status = U_ZERO_ERROR;
repl = UnicodeString("<${invalid-name}>");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
status = U_ZERO_ERROR;
repl = UnicodeString("<${one");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
status = U_ZERO_ERROR;
repl = UnicodeString("$not a capture group");
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
uregex_close(re);
}
//--------------------------------------------------------------
//
// NamedCaptureLimits Patterns with huge numbers of named capture groups.
// The point is not so much what the exact limit is,
// but that a largish number doesn't hit bad non-linear performance,
// and that exceeding the limit fails cleanly.
//
//--------------------------------------------------------------
void RegexTest::NamedCaptureLimits() {
if (quick) {
logln("Skipping test. Runs in exhuastive mode only.");
return;
}
const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
char nnbuf[100];
UnicodeString pattern;
int32_t nn;
for (nn=1; nn<goodLimit; nn++) {
sprintf(nnbuf, "(?<nn%d>)", nn);
pattern.append(UnicodeString(nnbuf, -1, US_INV));
}
UErrorCode status = U_ZERO_ERROR;
RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
REGEX_CHECK_STATUS;
for (nn=1; nn<goodLimit; nn++) {
sprintf(nnbuf, "nn%d", nn);
int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
REGEX_ASSERT(nn == groupNum);
if (nn != groupNum) {
break;
}
}
delete pat;
pattern.remove();
for (nn=1; nn<failLimit; nn++) {
sprintf(nnbuf, "(?<nn%d>)", nn);
pattern.append(UnicodeString(nnbuf, -1, US_INV));
}
status = U_ZERO_ERROR;
pat = RegexPattern::compile(pattern, 0, status);
REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
delete pat;
}
//--------------------------------------------------------------
//
// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
@ -5487,5 +5768,26 @@ void RegexTest::TestBug11371() {
}
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
void RegexTest::TestBug11480() {
    // C API: fetch the text of a capture group that does not participate in the match.
    //        The expected result is a zero length string, with nul termination,
    //        indistinguishable from a group with a zero length match.
    UErrorCode status = U_ZERO_ERROR;
    URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
    REGEX_CHECK_STATUS;
    UnicodeString text = UNICODE_STRING_SIMPLE("A");
    uregex_setText(re, text.getBuffer(), text.length(), &status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(uregex_lookingAt(re, 0, &status));

    // The first four elements hold the sentinel value 13. The group text is requested
    // at buf+1, so buf[0] and buf[2] show whether anything other than the single
    // terminating nul at buf[1] was written.
    UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
    int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);

    // The expression is not used past this point. Close it before checking status so
    // the handle is released even if REGEX_CHECK_STATUS (defined outside this view)
    // leaves the test early.
    uregex_close(re);

    // Surface any error reported through status by uregex_lookingAt() or uregex_group();
    // previously it was never inspected after these calls.
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(length == 0);
    REGEX_ASSERT(buf[0] == 13);
    REGEX_ASSERT(buf[1] == 0);
    REGEX_ASSERT(buf[2] == 13);
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2002-2014, International Business Machines Corporation and
* Copyright (c) 2002-2015, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -41,6 +41,8 @@ public:
virtual void API_Replace_UTF8();
virtual void PerlTestsUTF8();
virtual void PreAllocatedUTextCAPI();
virtual void NamedCapture();
virtual void NamedCaptureLimits();
virtual void Bug7651();
virtual void Bug7740();
virtual void Bug8479();
@ -51,6 +53,7 @@ public:
virtual void TestCaseInsensitiveStarters();
virtual void TestBug11049();
virtual void TestBug11371();
virtual void TestBug11480();
// The following functions are internal to the regexp tests.
virtual void assertUText(const char *expected, UText *actual, const char *file, int line);

View file

@ -1,4 +1,4 @@
# Copyright (c) 2001-2014 International Business Machines
# Copyright (c) 2001-2015 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file:
@ -513,6 +513,15 @@
"ab(?:(c)|(d))\1" i "abde"
"ab(?:(c)|(d))\1" i "<0>ab<1>c</1>c</0>e"
# Named back references
"(?<one>abcd)\k<one>" "<0><1>abcd</1>abcd</0>"
"(no)?(?<one>abcd)\k<one>" "<0><2>abcd</2>abcd</0>"
"(?<a_1>...)" E " " # capture group names are ascii letters & numbers only
"(?<1a>...)" E " " # capture group names must begin with a letter
"(?<a>.)(?<a>.)" E " " # Repeated names are illegal.
# Case Insensitive
"aBc" i "<0>ABC</0>"
"a[^bc]d" i "ABD"