mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 23:10:40 +00:00
ICU-5312 Regular Expressions Named Capture.
X-SVN-Rev: 37040
This commit is contained in:
parent
da811f1dfe
commit
ec3f77f878
16 changed files with 1050 additions and 384 deletions
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines
|
||||
* Copyright (C) 1996-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
|
@ -651,8 +651,9 @@ typedef enum UErrorCode {
|
|||
U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by user callback fn. */
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
U_REGEX_PATTERN_TOO_BIG, /**< Pattern exceeds limits on size or complexity. @draft ICU 55 */
|
||||
U_REGEX_INVALID_CAPTURE_GROUP_NAME, /**< Invalid capture group name. @draft ICU 55 */
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
U_REGEX_ERROR_LIMIT=U_REGEX_STOPPED_BY_CALLER+2, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||
U_REGEX_ERROR_LIMIT=U_REGEX_STOPPED_BY_CALLER+3, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||
|
||||
/*
|
||||
* The error codes in the range 0x10400-0x104ff are reserved for IDNA related error codes
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2014, International Business Machines
|
||||
* Copyright (C) 1997-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -166,7 +166,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
|||
"U_REGEX_STACK_OVERFLOW",
|
||||
"U_REGEX_TIME_OUT",
|
||||
"U_REGEX_STOPPED_BY_CALLER",
|
||||
"U_REGEX_PATTERN_TOO_BIG"
|
||||
"U_REGEX_PATTERN_TOO_BIG",
|
||||
"U_REGEX_INVALID_CAPTURE_GROUP_NAME"
|
||||
};
|
||||
|
||||
static const char * const
|
||||
|
|
|
@ -70,6 +70,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
|
|||
|
||||
fMatchOpenParen = -1;
|
||||
fMatchCloseParen = -1;
|
||||
fCaptureName = NULL;
|
||||
|
||||
if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
|
||||
status = rxp->fDeferredStatus;
|
||||
|
@ -86,6 +87,8 @@ static const UChar chDash = 0x2d; // '-'
|
|||
//
|
||||
//------------------------------------------------------------------------------
|
||||
RegexCompile::~RegexCompile() {
|
||||
delete fCaptureName; // Normally will be NULL, but can exist if pattern
|
||||
// compilation stops with a syntax error.
|
||||
}
|
||||
|
||||
static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
|
||||
|
@ -286,17 +289,6 @@ void RegexCompile::compile(
|
|||
// The pattern has now been read and processed, and the compiled code generated.
|
||||
//
|
||||
|
||||
//
|
||||
// Compute the number of digits required for the largest capture group number.
|
||||
//
|
||||
fRXPat->fMaxCaptureDigits = 1;
|
||||
int32_t n = 10;
|
||||
int32_t groupCount = fRXPat->fGroupMap->size();
|
||||
while (n <= groupCount) {
|
||||
fRXPat->fMaxCaptureDigits++;
|
||||
n *= 10;
|
||||
}
|
||||
|
||||
//
|
||||
// The pattern's fFrameSize so far has accumulated the requirements for
|
||||
// storage for capture parentheses, counters, etc. that are encountered
|
||||
|
@ -438,8 +430,25 @@ UBool RegexCompile::doParseActions(int32_t action)
|
|||
break;
|
||||
|
||||
|
||||
case doBeginNamedCapture:
|
||||
// Scanning (?<letter.
|
||||
// The first letter of the name will come through again under doContinueNamedCapture.
|
||||
fCaptureName = new UnicodeString();
|
||||
if (fCaptureName == NULL) {
|
||||
error(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
break;
|
||||
|
||||
case doContinueNamedCapture:
|
||||
fCaptureName->append(fC.fChar);
|
||||
break;
|
||||
|
||||
case doBadNamedCapture:
|
||||
error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
break;
|
||||
|
||||
case doOpenCaptureParen:
|
||||
// Open Paren.
|
||||
// Open Capturing Paren, possibly named.
|
||||
// Compile to a
|
||||
// - NOP, which later may be replaced by a save-state if the
|
||||
// parenthesized group gets a * quantifier, followed by
|
||||
|
@ -474,8 +483,18 @@ UBool RegexCompile::doParseActions(int32_t action)
|
|||
|
||||
// Save the mapping from group number to stack frame variable position.
|
||||
fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
|
||||
|
||||
// If this is a named capture group, add the name->group number mapping.
|
||||
if (fCaptureName != NULL) {
|
||||
int32_t groupNumber = fRXPat->fGroupMap->size();
|
||||
int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
|
||||
fCaptureName = NULL; // hash table takes ownership of the name (key) string.
|
||||
if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
|
||||
error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
break;
|
||||
|
||||
case doOpenNonCaptureParen:
|
||||
// Open non-capturing (grouping only) Paren.
|
||||
|
@ -1270,7 +1289,41 @@ UBool RegexCompile::doParseActions(int32_t action)
|
|||
}
|
||||
break;
|
||||
|
||||
case doBeginNamedBackRef:
|
||||
U_ASSERT(fCaptureName == NULL);
|
||||
fCaptureName = new UnicodeString;
|
||||
if (fCaptureName == NULL) {
|
||||
error(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
break;
|
||||
|
||||
case doContinueNamedBackRef:
|
||||
fCaptureName->append(fC.fChar);
|
||||
break;
|
||||
|
||||
case doCompleteNamedBackRef:
|
||||
{
|
||||
int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
|
||||
if (groupNumber == 0) {
|
||||
// Group name has not been defined.
|
||||
// Could be a forward reference. If we choose to support them at some
|
||||
// future time, extra mechanism will be required at this point.
|
||||
error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
} else {
|
||||
// Given the number, handle identically to a \n numbered back reference.
|
||||
// See comments above, under doBackRef
|
||||
fixLiterals(FALSE);
|
||||
if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
|
||||
appendOp(URX_BACKREF_I, groupNumber);
|
||||
} else {
|
||||
appendOp(URX_BACKREF, groupNumber);
|
||||
}
|
||||
}
|
||||
delete fCaptureName;
|
||||
fCaptureName = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
case doPossessivePlus:
|
||||
// Possessive ++ quantifier.
|
||||
// Compiles to
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
//
|
||||
// regexcmp.h
|
||||
//
|
||||
// Copyright (C) 2002-2014, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2002-2015, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains declarations for the class RegexCompile
|
||||
|
@ -220,6 +220,9 @@ private:
|
|||
UChar32 fLastSetLiteral; // The last single code point added to a set.
|
||||
// needed when "-y" is scanned, and we need
|
||||
// to turn "x-y" into a range.
|
||||
|
||||
UnicodeString *fCaptureName; // Named Capture, the group name is built up
|
||||
// in this string while being scanned.
|
||||
};
|
||||
|
||||
// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// It is generated by the Perl script "regexcst.pl" from
|
||||
// the rule parser state definitions file "regexcst.txt".
|
||||
//
|
||||
// Copyright (C) 2002-2007 International Business Machines Corporation
|
||||
// Copyright (C) 2002-2015 International Business Machines Corporation
|
||||
// and others. All rights reserved.
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
|
@ -17,100 +17,107 @@ U_NAMESPACE_BEGIN
|
|||
// Character classes for regex pattern scanning.
|
||||
//
|
||||
static const uint8_t kRuleSet_digit_char = 128;
|
||||
static const uint8_t kRuleSet_rule_char = 129;
|
||||
static const uint8_t kRuleSet_ascii_letter = 129;
|
||||
static const uint8_t kRuleSet_rule_char = 130;
|
||||
|
||||
|
||||
enum Regex_PatternParseAction {
|
||||
doLiteralChar,
|
||||
doSetEnd,
|
||||
doBackslashA,
|
||||
doSetBeginUnion,
|
||||
doNOP,
|
||||
doSetBackslash_w,
|
||||
doSetRange,
|
||||
doBackslashG,
|
||||
doPerlInline,
|
||||
doSetAddDash,
|
||||
doIntevalLowerDigit,
|
||||
doProperty,
|
||||
doBackslashX,
|
||||
doOpenAtomicParen,
|
||||
doSetLiteralEscaped,
|
||||
doPatFinish,
|
||||
doSetBackslash_D,
|
||||
doSetDifference2,
|
||||
doNamedChar,
|
||||
doNGPlus,
|
||||
doOpenLookBehindNeg,
|
||||
doIntervalError,
|
||||
doIntervalSame,
|
||||
doBackRef,
|
||||
doPlus,
|
||||
doOpenCaptureParen,
|
||||
doMismatchedParenErr,
|
||||
doBeginMatchMode,
|
||||
doEscapeError,
|
||||
doOpenNonCaptureParen,
|
||||
doDollar,
|
||||
doSetProp,
|
||||
doIntervalUpperDigit,
|
||||
doSetBegin,
|
||||
doBackslashs,
|
||||
doOpenLookBehind,
|
||||
doPossessiveOpt,
|
||||
doOpenLookBehindNeg,
|
||||
doDotAny,
|
||||
doSetBackslash_D,
|
||||
doSetLiteral,
|
||||
doSetBackslash_S,
|
||||
doEscapeError,
|
||||
doSetBackslash_W,
|
||||
doDollar,
|
||||
doBackslashb,
|
||||
doSetOpError,
|
||||
doBackslashG,
|
||||
doPatStart,
|
||||
doMismatchedParenErr,
|
||||
doPossessivePlus,
|
||||
doBackslashX,
|
||||
doSetBackslash_s,
|
||||
doSetBackslash_w,
|
||||
doBackslashW,
|
||||
doBackslashw,
|
||||
doSetMatchMode,
|
||||
doOrOperator,
|
||||
doCaret,
|
||||
doMatchModeParen,
|
||||
doStar,
|
||||
doOpt,
|
||||
doMatchMode,
|
||||
doSuppressComments,
|
||||
doPossessiveInterval,
|
||||
doOpenLookAheadNeg,
|
||||
doBackslashW,
|
||||
doCloseParen,
|
||||
doSetOpError,
|
||||
doIntervalInit,
|
||||
doSetFinish,
|
||||
doSetIntersection2,
|
||||
doNGStar,
|
||||
doEnterQuoteMode,
|
||||
doSetAddAmp,
|
||||
doBackslashB,
|
||||
doBackslashw,
|
||||
doPossessiveOpt,
|
||||
doSetNegate,
|
||||
doRuleError,
|
||||
doBackslashb,
|
||||
doConditionalExpr,
|
||||
doPossessivePlus,
|
||||
doBadOpenParenType,
|
||||
doNGInterval,
|
||||
doSetLiteral,
|
||||
doSetNamedChar,
|
||||
doBackslashd,
|
||||
doSetBeginDifference1,
|
||||
doBackslashD,
|
||||
doExit,
|
||||
doSetBackslash_S,
|
||||
doInterval,
|
||||
doSetNoCloseError,
|
||||
doNGOpt,
|
||||
doSetPosixProp,
|
||||
doOpenLookBehind,
|
||||
doBackslashS,
|
||||
doBackslashZ,
|
||||
doSetBeginIntersection1,
|
||||
doSetBackslash_W,
|
||||
doBeginMatchMode,
|
||||
doNOP,
|
||||
doSetProp,
|
||||
doBackslashA,
|
||||
doIntervalInit,
|
||||
doOpenCaptureParen,
|
||||
doNGPlus,
|
||||
doIntervalError,
|
||||
doSetDifference2,
|
||||
doNGOpt,
|
||||
doEscapedLiteralChar,
|
||||
doSetNegate,
|
||||
doSetBegin,
|
||||
doMatchModeParen,
|
||||
doLiteralChar,
|
||||
doOpt,
|
||||
doSetIntersection2,
|
||||
doBadOpenParenType,
|
||||
doSuppressComments,
|
||||
doCloseParen,
|
||||
doPatFinish,
|
||||
doSetBeginUnion,
|
||||
doSetBackslash_d,
|
||||
doProperty,
|
||||
doNGInterval,
|
||||
doNGStar,
|
||||
doOpenLookAhead,
|
||||
doBadModeFlag,
|
||||
doPatStart,
|
||||
doSetBeginIntersection1,
|
||||
doBeginNamedCapture,
|
||||
doInterval,
|
||||
doMatchMode,
|
||||
doSetNoCloseError,
|
||||
doSetBeginDifference1,
|
||||
doPlus,
|
||||
doBackslashD,
|
||||
doSetLiteralEscaped,
|
||||
doContinueNamedCapture,
|
||||
doSetPosixProp,
|
||||
doBackslashz,
|
||||
doSetNamedRange,
|
||||
doPossessiveStar,
|
||||
doEscapedLiteralChar,
|
||||
doSetBackslash_s,
|
||||
doBackslashz,
|
||||
doDotAny,
|
||||
doBadModeFlag,
|
||||
doContinueNamedBackRef,
|
||||
doPerlInline,
|
||||
doBackslashd,
|
||||
doOpenNonCaptureParen,
|
||||
doSetEnd,
|
||||
doSetAddDash,
|
||||
doSetFinish,
|
||||
doCaret,
|
||||
doConditionalExpr,
|
||||
doExit,
|
||||
doNamedChar,
|
||||
doSetRange,
|
||||
doPossessiveInterval,
|
||||
doBackslashs,
|
||||
doIntervalSame,
|
||||
doEnterQuoteMode,
|
||||
doOpenAtomicParen,
|
||||
doSetNamedChar,
|
||||
doRuleError,
|
||||
doStar,
|
||||
doSetAddAmp,
|
||||
doBackslashB,
|
||||
doCompleteNamedBackRef,
|
||||
doBackslashZ,
|
||||
doIntevalLowerDigit,
|
||||
doBeginNamedBackRef,
|
||||
doBackRef,
|
||||
doBadNamedCapture,
|
||||
rbbiLastAction};
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
|
@ -132,21 +139,21 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doPatStart, 255, 2,0, FALSE} // 1 start
|
||||
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
|
||||
, {doLiteralChar, 129, 14,0, TRUE} // 3
|
||||
, {doSetBegin, 91 /* [ */, 104, 182, TRUE} // 4
|
||||
, {doLiteralChar, 130, 14,0, TRUE} // 3
|
||||
, {doSetBegin, 91 /* [ */, 118, 196, TRUE} // 4
|
||||
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
|
||||
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
|
||||
, {doCaret, 94 /* ^ */, 14,0, TRUE} // 7
|
||||
, {doDollar, 36 /* $ */, 14,0, TRUE} // 8
|
||||
, {doNOP, 92 /* \ */, 84,0, TRUE} // 9
|
||||
, {doNOP, 92 /* \ */, 89,0, TRUE} // 9
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 12
|
||||
, {doRuleError, 255, 183,0, FALSE} // 13
|
||||
, {doNOP, 42 /* * */, 63,0, TRUE} // 14 expr-quant
|
||||
, {doNOP, 43 /* + */, 66,0, TRUE} // 15
|
||||
, {doNOP, 63 /* ? */, 69,0, TRUE} // 16
|
||||
, {doIntervalInit, 123 /* { */, 72,0, TRUE} // 17
|
||||
, {doRuleError, 255, 197,0, FALSE} // 13
|
||||
, {doNOP, 42 /* * */, 68,0, TRUE} // 14 expr-quant
|
||||
, {doNOP, 43 /* + */, 71,0, TRUE} // 15
|
||||
, {doNOP, 63 /* ? */, 74,0, TRUE} // 16
|
||||
, {doIntervalInit, 123 /* { */, 77,0, TRUE} // 17
|
||||
, {doNOP, 40 /* ( */, 23,0, TRUE} // 18
|
||||
, {doNOP, 255, 20,0, FALSE} // 19
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
|
||||
|
@ -154,7 +161,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doNOP, 255, 2,0, FALSE} // 22
|
||||
, {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
|
||||
, {doNOP, 255, 27,0, FALSE} // 24
|
||||
, {doNOP, 35 /* # */, 49, 14, TRUE} // 25 open-paren-quant2
|
||||
, {doNOP, 35 /* # */, 50, 14, TRUE} // 25 open-paren-quant2
|
||||
, {doNOP, 255, 29,0, FALSE} // 26
|
||||
, {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren
|
||||
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
|
||||
|
@ -163,156 +170,170 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
|
||||
, {doNOP, 60 /* < */, 46,0, TRUE} // 33
|
||||
, {doNOP, 35 /* # */, 49, 2, TRUE} // 34
|
||||
, {doBeginMatchMode, 105 /* i */, 52,0, FALSE} // 35
|
||||
, {doBeginMatchMode, 100 /* d */, 52,0, FALSE} // 36
|
||||
, {doBeginMatchMode, 109 /* m */, 52,0, FALSE} // 37
|
||||
, {doBeginMatchMode, 115 /* s */, 52,0, FALSE} // 38
|
||||
, {doBeginMatchMode, 117 /* u */, 52,0, FALSE} // 39
|
||||
, {doBeginMatchMode, 119 /* w */, 52,0, FALSE} // 40
|
||||
, {doBeginMatchMode, 120 /* x */, 52,0, FALSE} // 41
|
||||
, {doBeginMatchMode, 45 /* - */, 52,0, FALSE} // 42
|
||||
, {doConditionalExpr, 40 /* ( */, 183,0, TRUE} // 43
|
||||
, {doPerlInline, 123 /* { */, 183,0, TRUE} // 44
|
||||
, {doBadOpenParenType, 255, 183,0, FALSE} // 45
|
||||
, {doNOP, 35 /* # */, 50, 2, TRUE} // 34
|
||||
, {doBeginMatchMode, 105 /* i */, 53,0, FALSE} // 35
|
||||
, {doBeginMatchMode, 100 /* d */, 53,0, FALSE} // 36
|
||||
, {doBeginMatchMode, 109 /* m */, 53,0, FALSE} // 37
|
||||
, {doBeginMatchMode, 115 /* s */, 53,0, FALSE} // 38
|
||||
, {doBeginMatchMode, 117 /* u */, 53,0, FALSE} // 39
|
||||
, {doBeginMatchMode, 119 /* w */, 53,0, FALSE} // 40
|
||||
, {doBeginMatchMode, 120 /* x */, 53,0, FALSE} // 41
|
||||
, {doBeginMatchMode, 45 /* - */, 53,0, FALSE} // 42
|
||||
, {doConditionalExpr, 40 /* ( */, 197,0, TRUE} // 43
|
||||
, {doPerlInline, 123 /* { */, 197,0, TRUE} // 44
|
||||
, {doBadOpenParenType, 255, 197,0, FALSE} // 45
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47
|
||||
, {doBadOpenParenType, 255, 183,0, FALSE} // 48
|
||||
, {doNOP, 41 /* ) */, 255,0, TRUE} // 49 paren-comment
|
||||
, {doMismatchedParenErr, 253, 183,0, FALSE} // 50
|
||||
, {doNOP, 255, 49,0, TRUE} // 51
|
||||
, {doMatchMode, 105 /* i */, 52,0, TRUE} // 52 paren-flag
|
||||
, {doMatchMode, 100 /* d */, 52,0, TRUE} // 53
|
||||
, {doMatchMode, 109 /* m */, 52,0, TRUE} // 54
|
||||
, {doMatchMode, 115 /* s */, 52,0, TRUE} // 55
|
||||
, {doMatchMode, 117 /* u */, 52,0, TRUE} // 56
|
||||
, {doMatchMode, 119 /* w */, 52,0, TRUE} // 57
|
||||
, {doMatchMode, 120 /* x */, 52,0, TRUE} // 58
|
||||
, {doMatchMode, 45 /* - */, 52,0, TRUE} // 59
|
||||
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 60
|
||||
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 61
|
||||
, {doBadModeFlag, 255, 183,0, FALSE} // 62
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 63 quant-star
|
||||
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 64
|
||||
, {doStar, 255, 20,0, FALSE} // 65
|
||||
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 66 quant-plus
|
||||
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 67
|
||||
, {doPlus, 255, 20,0, FALSE} // 68
|
||||
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 69 quant-opt
|
||||
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 70
|
||||
, {doOpt, 255, 20,0, FALSE} // 71
|
||||
, {doNOP, 128, 74,0, FALSE} // 72 interval-open
|
||||
, {doIntervalError, 255, 183,0, FALSE} // 73
|
||||
, {doIntevalLowerDigit, 128, 74,0, TRUE} // 74 interval-lower
|
||||
, {doNOP, 44 /* , */, 78,0, TRUE} // 75
|
||||
, {doIntervalSame, 125 /* } */, 81,0, TRUE} // 76
|
||||
, {doIntervalError, 255, 183,0, FALSE} // 77
|
||||
, {doIntervalUpperDigit, 128, 78,0, TRUE} // 78 interval-upper
|
||||
, {doNOP, 125 /* } */, 81,0, TRUE} // 79
|
||||
, {doIntervalError, 255, 183,0, FALSE} // 80
|
||||
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 81 interval-type
|
||||
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 82
|
||||
, {doInterval, 255, 20,0, FALSE} // 83
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 84 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 85
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 86
|
||||
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 87
|
||||
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 88
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 89
|
||||
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 90
|
||||
, {doProperty, 112 /* p */, 14,0, FALSE} // 91
|
||||
, {doProperty, 80 /* P */, 14,0, FALSE} // 92
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 93
|
||||
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 94
|
||||
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 95
|
||||
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 96
|
||||
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 97
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 98
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 99
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 100
|
||||
, {doBackRef, 128, 14,0, TRUE} // 101
|
||||
, {doEscapeError, 253, 183,0, FALSE} // 102
|
||||
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 103
|
||||
, {doSetNegate, 94 /* ^ */, 107,0, TRUE} // 104 set-open
|
||||
, {doSetPosixProp, 58 /* : */, 109,0, FALSE} // 105
|
||||
, {doNOP, 255, 107,0, FALSE} // 106
|
||||
, {doSetLiteral, 93 /* ] */, 122,0, TRUE} // 107 set-open2
|
||||
, {doNOP, 255, 112,0, FALSE} // 108
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 109 set-posix
|
||||
, {doNOP, 58 /* : */, 112,0, FALSE} // 110
|
||||
, {doRuleError, 255, 183,0, FALSE} // 111
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 112 set-start
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 113
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 114
|
||||
, {doNOP, 45 /* - */, 118,0, TRUE} // 115
|
||||
, {doNOP, 38 /* & */, 120,0, TRUE} // 116
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 117
|
||||
, {doRuleError, 45 /* - */, 183,0, FALSE} // 118 set-start-dash
|
||||
, {doSetAddDash, 255, 122,0, FALSE} // 119
|
||||
, {doRuleError, 38 /* & */, 183,0, FALSE} // 120 set-start-amp
|
||||
, {doSetAddAmp, 255, 122,0, FALSE} // 121
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 122 set-after-lit
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 123
|
||||
, {doNOP, 45 /* - */, 159,0, TRUE} // 124
|
||||
, {doNOP, 38 /* & */, 150,0, TRUE} // 125
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 126
|
||||
, {doSetNoCloseError, 253, 183,0, FALSE} // 127
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 128
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 129 set-after-set
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 130
|
||||
, {doNOP, 45 /* - */, 152,0, TRUE} // 131
|
||||
, {doNOP, 38 /* & */, 147,0, TRUE} // 132
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 133
|
||||
, {doSetNoCloseError, 253, 183,0, FALSE} // 134
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 135
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 136 set-after-range
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 137
|
||||
, {doNOP, 45 /* - */, 155,0, TRUE} // 138
|
||||
, {doNOP, 38 /* & */, 157,0, TRUE} // 139
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 140
|
||||
, {doSetNoCloseError, 253, 183,0, FALSE} // 141
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 142
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 143 set-after-op
|
||||
, {doSetOpError, 93 /* ] */, 183,0, FALSE} // 144
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 145
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 146
|
||||
, {doSetBeginIntersection1, 91 /* [ */, 104, 129, TRUE} // 147 set-set-amp
|
||||
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 148
|
||||
, {doSetAddAmp, 255, 122,0, FALSE} // 149
|
||||
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 150 set-lit-amp
|
||||
, {doSetAddAmp, 255, 122,0, FALSE} // 151
|
||||
, {doSetBeginDifference1, 91 /* [ */, 104, 129, TRUE} // 152 set-set-dash
|
||||
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 153
|
||||
, {doSetAddDash, 255, 122,0, FALSE} // 154
|
||||
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 155 set-range-dash
|
||||
, {doSetAddDash, 255, 122,0, FALSE} // 156
|
||||
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 157 set-range-amp
|
||||
, {doSetAddAmp, 255, 122,0, FALSE} // 158
|
||||
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 159 set-lit-dash
|
||||
, {doSetAddDash, 91 /* [ */, 122,0, FALSE} // 160
|
||||
, {doSetAddDash, 93 /* ] */, 122,0, FALSE} // 161
|
||||
, {doNOP, 92 /* \ */, 164,0, TRUE} // 162
|
||||
, {doSetRange, 255, 136,0, TRUE} // 163
|
||||
, {doSetOpError, 115 /* s */, 183,0, FALSE} // 164 set-lit-dash-escape
|
||||
, {doSetOpError, 83 /* S */, 183,0, FALSE} // 165
|
||||
, {doSetOpError, 119 /* w */, 183,0, FALSE} // 166
|
||||
, {doSetOpError, 87 /* W */, 183,0, FALSE} // 167
|
||||
, {doSetOpError, 100 /* d */, 183,0, FALSE} // 168
|
||||
, {doSetOpError, 68 /* D */, 183,0, FALSE} // 169
|
||||
, {doSetNamedRange, 78 /* N */, 136,0, FALSE} // 170
|
||||
, {doSetRange, 255, 136,0, TRUE} // 171
|
||||
, {doSetProp, 112 /* p */, 129,0, FALSE} // 172 set-escape
|
||||
, {doSetProp, 80 /* P */, 129,0, FALSE} // 173
|
||||
, {doSetNamedChar, 78 /* N */, 122,0, FALSE} // 174
|
||||
, {doSetBackslash_s, 115 /* s */, 136,0, TRUE} // 175
|
||||
, {doSetBackslash_S, 83 /* S */, 136,0, TRUE} // 176
|
||||
, {doSetBackslash_w, 119 /* w */, 136,0, TRUE} // 177
|
||||
, {doSetBackslash_W, 87 /* W */, 136,0, TRUE} // 178
|
||||
, {doSetBackslash_d, 100 /* d */, 136,0, TRUE} // 179
|
||||
, {doSetBackslash_D, 68 /* D */, 136,0, TRUE} // 180
|
||||
, {doSetLiteralEscaped, 255, 122,0, TRUE} // 181
|
||||
, {doSetFinish, 255, 14,0, FALSE} // 182 set-finish
|
||||
, {doExit, 255, 183,0, TRUE} // 183 errorDeath
|
||||
, {doBeginNamedCapture, 129, 64,0, FALSE} // 48
|
||||
, {doBadOpenParenType, 255, 197,0, FALSE} // 49
|
||||
, {doNOP, 41 /* ) */, 255,0, TRUE} // 50 paren-comment
|
||||
, {doMismatchedParenErr, 253, 197,0, FALSE} // 51
|
||||
, {doNOP, 255, 50,0, TRUE} // 52
|
||||
, {doMatchMode, 105 /* i */, 53,0, TRUE} // 53 paren-flag
|
||||
, {doMatchMode, 100 /* d */, 53,0, TRUE} // 54
|
||||
, {doMatchMode, 109 /* m */, 53,0, TRUE} // 55
|
||||
, {doMatchMode, 115 /* s */, 53,0, TRUE} // 56
|
||||
, {doMatchMode, 117 /* u */, 53,0, TRUE} // 57
|
||||
, {doMatchMode, 119 /* w */, 53,0, TRUE} // 58
|
||||
, {doMatchMode, 120 /* x */, 53,0, TRUE} // 59
|
||||
, {doMatchMode, 45 /* - */, 53,0, TRUE} // 60
|
||||
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 61
|
||||
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 62
|
||||
, {doBadModeFlag, 255, 197,0, FALSE} // 63
|
||||
, {doContinueNamedCapture, 129, 64,0, TRUE} // 64 named-capture
|
||||
, {doContinueNamedCapture, 128, 64,0, TRUE} // 65
|
||||
, {doOpenCaptureParen, 62 /* > */, 2, 14, TRUE} // 66
|
||||
, {doBadNamedCapture, 255, 197,0, FALSE} // 67
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 68 quant-star
|
||||
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 69
|
||||
, {doStar, 255, 20,0, FALSE} // 70
|
||||
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 71 quant-plus
|
||||
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 72
|
||||
, {doPlus, 255, 20,0, FALSE} // 73
|
||||
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 74 quant-opt
|
||||
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 75
|
||||
, {doOpt, 255, 20,0, FALSE} // 76
|
||||
, {doNOP, 128, 79,0, FALSE} // 77 interval-open
|
||||
, {doIntervalError, 255, 197,0, FALSE} // 78
|
||||
, {doIntevalLowerDigit, 128, 79,0, TRUE} // 79 interval-lower
|
||||
, {doNOP, 44 /* , */, 83,0, TRUE} // 80
|
||||
, {doIntervalSame, 125 /* } */, 86,0, TRUE} // 81
|
||||
, {doIntervalError, 255, 197,0, FALSE} // 82
|
||||
, {doIntervalUpperDigit, 128, 83,0, TRUE} // 83 interval-upper
|
||||
, {doNOP, 125 /* } */, 86,0, TRUE} // 84
|
||||
, {doIntervalError, 255, 197,0, FALSE} // 85
|
||||
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 86 interval-type
|
||||
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 87
|
||||
, {doInterval, 255, 20,0, FALSE} // 88
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 89 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 90
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 91
|
||||
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 92
|
||||
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 93
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 94
|
||||
, {doNOP, 107 /* k */, 110,0, TRUE} // 95
|
||||
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 96
|
||||
, {doProperty, 112 /* p */, 14,0, FALSE} // 97
|
||||
, {doProperty, 80 /* P */, 14,0, FALSE} // 98
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 99
|
||||
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 100
|
||||
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 101
|
||||
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 102
|
||||
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 103
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 104
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 105
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 106
|
||||
, {doBackRef, 128, 14,0, TRUE} // 107
|
||||
, {doEscapeError, 253, 197,0, FALSE} // 108
|
||||
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 109
|
||||
, {doBeginNamedBackRef, 60 /* < */, 112,0, TRUE} // 110 named-backref
|
||||
, {doBadNamedCapture, 255, 197,0, FALSE} // 111
|
||||
, {doContinueNamedBackRef, 129, 114,0, TRUE} // 112 named-backref-2
|
||||
, {doBadNamedCapture, 255, 197,0, FALSE} // 113
|
||||
, {doContinueNamedBackRef, 129, 114,0, TRUE} // 114 named-backref-3
|
||||
, {doContinueNamedBackRef, 128, 114,0, TRUE} // 115
|
||||
, {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 116
|
||||
, {doBadNamedCapture, 255, 197,0, FALSE} // 117
|
||||
, {doSetNegate, 94 /* ^ */, 121,0, TRUE} // 118 set-open
|
||||
, {doSetPosixProp, 58 /* : */, 123,0, FALSE} // 119
|
||||
, {doNOP, 255, 121,0, FALSE} // 120
|
||||
, {doSetLiteral, 93 /* ] */, 136,0, TRUE} // 121 set-open2
|
||||
, {doNOP, 255, 126,0, FALSE} // 122
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 123 set-posix
|
||||
, {doNOP, 58 /* : */, 126,0, FALSE} // 124
|
||||
, {doRuleError, 255, 197,0, FALSE} // 125
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 126 set-start
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 127
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 128
|
||||
, {doNOP, 45 /* - */, 132,0, TRUE} // 129
|
||||
, {doNOP, 38 /* & */, 134,0, TRUE} // 130
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 131
|
||||
, {doRuleError, 45 /* - */, 197,0, FALSE} // 132 set-start-dash
|
||||
, {doSetAddDash, 255, 136,0, FALSE} // 133
|
||||
, {doRuleError, 38 /* & */, 197,0, FALSE} // 134 set-start-amp
|
||||
, {doSetAddAmp, 255, 136,0, FALSE} // 135
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 136 set-after-lit
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 137
|
||||
, {doNOP, 45 /* - */, 173,0, TRUE} // 138
|
||||
, {doNOP, 38 /* & */, 164,0, TRUE} // 139
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 140
|
||||
, {doSetNoCloseError, 253, 197,0, FALSE} // 141
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 142
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 143 set-after-set
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 144
|
||||
, {doNOP, 45 /* - */, 166,0, TRUE} // 145
|
||||
, {doNOP, 38 /* & */, 161,0, TRUE} // 146
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 147
|
||||
, {doSetNoCloseError, 253, 197,0, FALSE} // 148
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 149
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 150 set-after-range
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 151
|
||||
, {doNOP, 45 /* - */, 169,0, TRUE} // 152
|
||||
, {doNOP, 38 /* & */, 171,0, TRUE} // 153
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 154
|
||||
, {doSetNoCloseError, 253, 197,0, FALSE} // 155
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 156
|
||||
, {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE} // 157 set-after-op
|
||||
, {doSetOpError, 93 /* ] */, 197,0, FALSE} // 158
|
||||
, {doNOP, 92 /* \ */, 186,0, TRUE} // 159
|
||||
, {doSetLiteral, 255, 136,0, TRUE} // 160
|
||||
, {doSetBeginIntersection1, 91 /* [ */, 118, 143, TRUE} // 161 set-set-amp
|
||||
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 162
|
||||
, {doSetAddAmp, 255, 136,0, FALSE} // 163
|
||||
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 164 set-lit-amp
|
||||
, {doSetAddAmp, 255, 136,0, FALSE} // 165
|
||||
, {doSetBeginDifference1, 91 /* [ */, 118, 143, TRUE} // 166 set-set-dash
|
||||
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 167
|
||||
, {doSetAddDash, 255, 136,0, FALSE} // 168
|
||||
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 169 set-range-dash
|
||||
, {doSetAddDash, 255, 136,0, FALSE} // 170
|
||||
, {doSetIntersection2, 38 /* & */, 157,0, TRUE} // 171 set-range-amp
|
||||
, {doSetAddAmp, 255, 136,0, FALSE} // 172
|
||||
, {doSetDifference2, 45 /* - */, 157,0, TRUE} // 173 set-lit-dash
|
||||
, {doSetAddDash, 91 /* [ */, 136,0, FALSE} // 174
|
||||
, {doSetAddDash, 93 /* ] */, 136,0, FALSE} // 175
|
||||
, {doNOP, 92 /* \ */, 178,0, TRUE} // 176
|
||||
, {doSetRange, 255, 150,0, TRUE} // 177
|
||||
, {doSetOpError, 115 /* s */, 197,0, FALSE} // 178 set-lit-dash-escape
|
||||
, {doSetOpError, 83 /* S */, 197,0, FALSE} // 179
|
||||
, {doSetOpError, 119 /* w */, 197,0, FALSE} // 180
|
||||
, {doSetOpError, 87 /* W */, 197,0, FALSE} // 181
|
||||
, {doSetOpError, 100 /* d */, 197,0, FALSE} // 182
|
||||
, {doSetOpError, 68 /* D */, 197,0, FALSE} // 183
|
||||
, {doSetNamedRange, 78 /* N */, 150,0, FALSE} // 184
|
||||
, {doSetRange, 255, 150,0, TRUE} // 185
|
||||
, {doSetProp, 112 /* p */, 143,0, FALSE} // 186 set-escape
|
||||
, {doSetProp, 80 /* P */, 143,0, FALSE} // 187
|
||||
, {doSetNamedChar, 78 /* N */, 136,0, FALSE} // 188
|
||||
, {doSetBackslash_s, 115 /* s */, 150,0, TRUE} // 189
|
||||
, {doSetBackslash_S, 83 /* S */, 150,0, TRUE} // 190
|
||||
, {doSetBackslash_w, 119 /* w */, 150,0, TRUE} // 191
|
||||
, {doSetBackslash_W, 87 /* W */, 150,0, TRUE} // 192
|
||||
, {doSetBackslash_d, 100 /* d */, 150,0, TRUE} // 193
|
||||
, {doSetBackslash_D, 68 /* D */, 150,0, TRUE} // 194
|
||||
, {doSetLiteralEscaped, 255, 136,0, TRUE} // 195
|
||||
, {doSetFinish, 255, 14,0, FALSE} // 196 set-finish
|
||||
, {doExit, 255, 197,0, TRUE} // 197 errorDeath
|
||||
};
|
||||
static const char * const RegexStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -362,6 +383,7 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
"open-paren-lookbehind",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"paren-comment",
|
||||
0,
|
||||
|
@ -376,6 +398,10 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"named-capture",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"quant-star",
|
||||
0,
|
||||
|
@ -417,6 +443,15 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"named-backref",
|
||||
0,
|
||||
"named-backref-2",
|
||||
0,
|
||||
"named-backref-3",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-open",
|
||||
0,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
|
||||
#*****************************************************************************
|
||||
#
|
||||
# Copyright (C) 2002-2007, International Business Machines Corporation and others.
|
||||
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
#*****************************************************************************
|
||||
|
@ -147,6 +147,7 @@ open-paren-extended:
|
|||
open-paren-lookbehind:
|
||||
'=' n term ^expr-cont doOpenLookBehind # (?<=
|
||||
'!' n term ^expr-cont doOpenLookBehindNeg # (?<!
|
||||
ascii_letter named-capture doBeginNamedCapture # (?<name
|
||||
default errorDeath doBadOpenParenType
|
||||
|
||||
|
||||
|
@ -174,6 +175,14 @@ paren-flag:
|
|||
':' n term ^expr-quant doMatchModeParen
|
||||
default errorDeath doBadModeFlag
|
||||
|
||||
#
|
||||
# named-capture (?<name> ... ), position currently on the name.
|
||||
#
|
||||
named-capture:
|
||||
ascii_letter n named-capture doContinueNamedCapture
|
||||
digit_char n named-capture doContinueNamedCapture
|
||||
'>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
|
||||
default errorDeath doBadNamedCapture
|
||||
|
||||
#
|
||||
# quant-star Scanning a '*' quantifier. Need to look ahead to decide
|
||||
|
@ -241,6 +250,7 @@ backslash:
|
|||
'd' n expr-quant doBackslashd
|
||||
'D' n expr-quant doBackslashD
|
||||
'G' n term doBackslashG
|
||||
'k' n named-backref
|
||||
'N' expr-quant doNamedChar # \N{NAME} named char
|
||||
'p' expr-quant doProperty # \p{Lu} style property
|
||||
'P' expr-quant doProperty
|
||||
|
@ -257,6 +267,24 @@ backslash:
|
|||
default n expr-quant doEscapedLiteralChar
|
||||
|
||||
|
||||
# named-backref Scanned \k
|
||||
# Leading to \k<captureName>
|
||||
# Failure to get the full sequence is an error.
|
||||
#
|
||||
named-backref:
|
||||
'<' n named-backref-2 doBeginNamedBackRef
|
||||
default errorDeath doBadNamedCapture
|
||||
|
||||
named-backref-2:
|
||||
ascii_letter n named-backref-3 doContinueNamedBackRef
|
||||
default errorDeath doBadNamedCapture
|
||||
|
||||
named-backref-3:
|
||||
ascii_letter n named-backref-3 doContinueNamedBackRef
|
||||
digit_char n named-backref-3 doContinueNamedBackRef
|
||||
'>' n expr-quant doCompleteNamedBackRef
|
||||
default errorDeath doBadNamedCapture
|
||||
|
||||
|
||||
#
|
||||
# [set expression] parsing,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
//
|
||||
// regexst.h
|
||||
//
|
||||
// Copyright (C) 2004-2013, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2004-2015, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains class RegexStaticSets
|
||||
|
@ -55,11 +55,6 @@ static const UChar gRuleSet_rule_char_pattern[] = {
|
|||
// \ { \ } \ ^ \ $ \ | \ \ \ . ]
|
||||
0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};
|
||||
|
||||
|
||||
static const UChar gRuleSet_digit_char_pattern[] = {
|
||||
// [ 0 - 9 ]
|
||||
0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
|
||||
|
||||
//
|
||||
// Here are the backslash escape characters that ICU's unescape() function
|
||||
// will handle.
|
||||
|
@ -213,23 +208,29 @@ fEmptyText(NULL)
|
|||
|
||||
// Sets used while parsing rules, but not referenced from the parse state table
|
||||
fRuleSets[kRuleSet_rule_char-128] = UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
|
||||
fRuleSets[kRuleSet_digit_char-128] = UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
|
||||
fRuleSets[kRuleSet_digit_char-128].add((UChar)0x30, (UChar)0x39); // [0-9]
|
||||
fRuleSets[kRuleSet_ascii_letter-128].add((UChar)0x41, (UChar)0x5A); // [A-Z]
|
||||
fRuleSets[kRuleSet_ascii_letter-128].add((UChar)0x61, (UChar)0x7A); // [a-z]
|
||||
fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
|
||||
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
|
||||
for (i=0; i<UPRV_LENGTHOF(fRuleSets); i++) {
|
||||
fRuleSets[i].compact();
|
||||
}
|
||||
|
||||
// Finally, initialize an empty string for utility purposes
|
||||
fEmptyText = utext_openUChars(NULL, NULL, 0, status);
|
||||
|
||||
return; // If we reached this point, everything is fine so just exit
|
||||
if (U_SUCCESS(*status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
ExitConstrDeleteAll: // Remove fPropSets and fRuleSets and return error
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
delete fPropSets[i];
|
||||
fPropSets[i] = NULL;
|
||||
}
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
if (U_SUCCESS(*status)) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -257,6 +257,9 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) {
|
|||
|
||||
static const UChar BACKSLASH = 0x5c;
|
||||
static const UChar DOLLARSIGN = 0x24;
|
||||
static const UChar LEFTBRACKET = 0x7b;
|
||||
static const UChar RIGHTBRACKET = 0x7d;
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// appendReplacement
|
||||
|
@ -331,8 +334,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
|
|||
// TODO: optimize this loop by efficiently scanning for '$' or '\',
|
||||
// move entire ranges not containing substitutions.
|
||||
UTEXT_SETNATIVEINDEX(replacement, 0);
|
||||
UChar32 c = UTEXT_NEXT32(replacement);
|
||||
while (c != U_SENTINEL) {
|
||||
for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
|
||||
if (c == BACKSLASH) {
|
||||
// Backslash Escape. Copy the following char out without further checks.
|
||||
// Note: Surrogate pairs don't need any special handling
|
||||
|
@ -398,51 +400,69 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
|
|||
}
|
||||
}
|
||||
} else {
|
||||
// We've got a $. Pick up a capture group number if one follows.
|
||||
// Consume at most the number of digits necessary for the largest capture
|
||||
// number that is valid for this pattern.
|
||||
// We've got a $. Pick up a capture group name or number if one follows.
|
||||
// Consume digits so long as the resulting group number <= the number of
|
||||
// capture groups in the pattern.
|
||||
|
||||
int32_t numDigits = 0;
|
||||
int32_t groupNum = 0;
|
||||
UChar32 digitC;
|
||||
for (;;) {
|
||||
digitC = UTEXT_CURRENT32(replacement);
|
||||
if (digitC == U_SENTINEL) {
|
||||
break;
|
||||
int32_t numDigits = 0;
|
||||
UChar32 nextChar = utext_current32(replacement);
|
||||
if (nextChar == LEFTBRACKET) {
|
||||
// Scan for a Named Capture Group, ${name}.
|
||||
UnicodeString groupName;
|
||||
utext_next32(replacement);
|
||||
while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
|
||||
nextChar = utext_next32(replacement);
|
||||
if (nextChar == U_SENTINEL) {
|
||||
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
} else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
|
||||
(nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
|
||||
(nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
|
||||
groupName.append(nextChar);
|
||||
} else if (nextChar == RIGHTBRACKET) {
|
||||
groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
|
||||
if (groupNum == 0) {
|
||||
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
}
|
||||
} else {
|
||||
// Character was something other than a name char or a closing '}'
|
||||
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
}
|
||||
}
|
||||
if (u_isdigit(digitC) == FALSE) {
|
||||
break;
|
||||
|
||||
} else if (u_isdigit(nextChar)) {
|
||||
// $n Scan for a capture group number
|
||||
int32_t numCaptureGroups = fPattern->fGroupMap->size();
|
||||
for (;;) {
|
||||
nextChar = UTEXT_CURRENT32(replacement);
|
||||
if (nextChar == U_SENTINEL) {
|
||||
break;
|
||||
}
|
||||
if (u_isdigit(nextChar) == FALSE) {
|
||||
break;
|
||||
}
|
||||
int32_t nextDigitVal = u_charDigitValue(nextChar);
|
||||
if (groupNum*10 + nextDigitVal > numCaptureGroups) {
|
||||
// Don't consume the next digit if it makes the capture group number too big.
|
||||
if (numDigits == 0) {
|
||||
status = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
}
|
||||
break;
|
||||
}
|
||||
(void)UTEXT_NEXT32(replacement);
|
||||
groupNum=groupNum*10 + nextDigitVal;
|
||||
++numDigits;
|
||||
}
|
||||
(void)UTEXT_NEXT32(replacement);
|
||||
groupNum=groupNum*10 + u_charDigitValue(digitC);
|
||||
numDigits++;
|
||||
if (numDigits >= fPattern->fMaxCaptureDigits) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (numDigits == 0) {
|
||||
// The $ didn't introduce a group number at all.
|
||||
// Treat it as just part of the substitution text.
|
||||
UChar c16 = DOLLARSIGN;
|
||||
destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
|
||||
} else {
|
||||
// Finally, append the capture group data to the destination.
|
||||
destLen += appendGroup(groupNum, dest, status);
|
||||
if (U_FAILURE(status)) {
|
||||
// Can fail if group number is out of range.
|
||||
break;
|
||||
}
|
||||
// $ not followed by capture group name or number.
|
||||
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
}
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
break;
|
||||
} else {
|
||||
c = UTEXT_NEXT32(replacement);
|
||||
}
|
||||
}
|
||||
if (U_SUCCESS(status)) {
|
||||
destLen += appendGroup(groupNum, dest, status);
|
||||
}
|
||||
} // End of $ capture group handling
|
||||
} // End of per-character loop through the replacement string.
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
@ -1201,7 +1221,6 @@ UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// appendGroup() -- currently internal only, appends a group to a UText rather
|
||||
|
@ -1282,8 +1301,6 @@ int32_t RegexMatcher::groupCount() const {
|
|||
return fPattern->fGroupMap->size();
|
||||
}
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// hasAnchoringBounds()
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
//
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2002-2014 International Business Machines Corporation *
|
||||
* Copyright (C) 2002-2015 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
***************************************************************************
|
||||
*/
|
||||
|
@ -15,6 +15,7 @@
|
|||
#include "unicode/regex.h"
|
||||
#include "unicode/uclean.h"
|
||||
#include "uassert.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
#include "uvectr32.h"
|
||||
#include "uvectr64.h"
|
||||
|
@ -92,7 +93,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fMinMatchLen = other.fMinMatchLen;
|
||||
fFrameSize = other.fFrameSize;
|
||||
fDataSize = other.fDataSize;
|
||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||
fStaticSets = other.fStaticSets;
|
||||
fStaticSets8 = other.fStaticSets8;
|
||||
|
||||
|
@ -133,6 +133,21 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fSets8[i] = other.fSets8[i];
|
||||
}
|
||||
|
||||
// Copy the named capture group hash map.
|
||||
int32_t hashPos = UHASH_FIRST;
|
||||
while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
break;
|
||||
}
|
||||
const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
|
||||
UnicodeString *key = new UnicodeString(*name);
|
||||
int32_t val = hashEl->value.integer;
|
||||
if (key == NULL) {
|
||||
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
} else {
|
||||
uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -154,7 +169,6 @@ void RegexPattern::init() {
|
|||
fFrameSize = 0;
|
||||
fDataSize = 0;
|
||||
fGroupMap = NULL;
|
||||
fMaxCaptureDigits = 1;
|
||||
fStaticSets = NULL;
|
||||
fStaticSets8 = NULL;
|
||||
fStartType = START_NO_INFO;
|
||||
|
@ -164,6 +178,7 @@ void RegexPattern::init() {
|
|||
fInitialChar = 0;
|
||||
fInitialChars8 = NULL;
|
||||
fNeedsAltInput = FALSE;
|
||||
fNamedCaptureMap = NULL;
|
||||
|
||||
fPattern = NULL; // will be set later
|
||||
fPatternString = NULL; // may be set later
|
||||
|
@ -172,17 +187,24 @@ void RegexPattern::init() {
|
|||
fSets = new UVector(fDeferredStatus);
|
||||
fInitialChars = new UnicodeSet;
|
||||
fInitialChars8 = new Regex8BitSet;
|
||||
fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function
|
||||
uhash_compareUnicodeString, // Key comparator function
|
||||
uhash_compareLong, // Value comparator function
|
||||
&fDeferredStatus);
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
return;
|
||||
}
|
||||
if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
|
||||
fInitialChars == NULL || fInitialChars8 == NULL) {
|
||||
fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
|
||||
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// Slot zero of the vector of sets is reserved. Fill it here.
|
||||
fSets->addElement((int32_t)0, fDeferredStatus);
|
||||
|
||||
// fNamedCaptureMap owns its key strings, type (UnicodeString *)
|
||||
uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
|
||||
}
|
||||
|
||||
|
||||
|
@ -220,6 +242,8 @@ void RegexPattern::zap() {
|
|||
delete fPatternString;
|
||||
fPatternString = NULL;
|
||||
}
|
||||
uhash_close(fNamedCaptureMap);
|
||||
fNamedCaptureMap = NULL;
|
||||
}
|
||||
|
||||
|
||||
|
@ -577,6 +601,34 @@ UText *RegexPattern::patternText(UErrorCode &status) const {
|
|||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// groupNumberFromName()
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const {
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// No need to explicitly check for syntactically valid names.
|
||||
// Invalid ones will never be in the map, and the lookup will fail.
|
||||
|
||||
int32_t number = uhash_geti(fNamedCaptureMap, &groupName);
|
||||
if (number == 0) {
|
||||
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
}
|
||||
return number;
|
||||
}
|
||||
|
||||
int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const {
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
UnicodeString name(groupName, nameLength, US_INV);
|
||||
return groupNumberFromName(name, status);
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
|
@ -754,6 +806,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
|
||||
void RegexPattern::dumpPattern() const {
|
||||
#if defined(REGEX_DEBUG)
|
||||
// TODO: This function assumes an ASCII based charset.
|
||||
int index;
|
||||
int i;
|
||||
|
||||
|
@ -805,6 +858,21 @@ void RegexPattern::dumpPattern() const {
|
|||
}
|
||||
}
|
||||
|
||||
printf("Named Capture Groups:\n");
|
||||
if (uhash_count(fNamedCaptureMap) == 0) {
|
||||
printf(" None\n");
|
||||
} else {
|
||||
int32_t pos = UHASH_FIRST;
|
||||
const UHashElement *el = NULL;
|
||||
while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
|
||||
const UnicodeString *name = (const UnicodeString *)el->key.pointer;
|
||||
char s[100];
|
||||
name->extract(0, 99, s, sizeof(s), US_INV); // capture group names are invariant.
|
||||
int32_t number = el->value.integer;
|
||||
printf(" %d\t%s\n", number, s);
|
||||
}
|
||||
}
|
||||
|
||||
printf("\nIndex Binary Type Operand\n" \
|
||||
"-------------------------------------------\n");
|
||||
for (index = 0; index<fCompiledPat->size(); index++) {
|
||||
|
|
|
@ -55,6 +55,8 @@
|
|||
|
||||
// Forward Declarations
|
||||
|
||||
struct UHashtable;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct Regex8BitSet;
|
||||
|
@ -136,7 +138,7 @@ public:
|
|||
|
||||
/**
|
||||
* Create an exact copy of this RegexPattern object. Since RegexPattern is not
|
||||
* intended to be subclasses, <code>clone()</code> and the copy construction are
|
||||
* intended to be subclassed, <code>clone()</code> and the copy construction are
|
||||
* equivalent operations.
|
||||
* @return the copy of this RegexPattern
|
||||
* @stable ICU 2.4
|
||||
|
@ -437,6 +439,41 @@ public:
|
|||
virtual UText *patternText(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Get the group number corresponding to a named capture group.
|
||||
* The returned number can be used with any function that accesses
|
||||
* capture groups by number.
|
||||
*
|
||||
* The function returns an error status if the specified name does not
|
||||
* appear in the pattern.
|
||||
*
|
||||
* @param groupName The capture group name.
|
||||
* @param status A UErrorCode to receive any errors.
|
||||
* Set to U_REGEX_INVALID_CAPTURE_GROUP_NAME if the name is not found.
|
||||
* @return the group number for the name, or 0 if the name is not found
|
||||
* or if status already indicated a failure on entry.
|
||||
*
|
||||
* @draft ICU 55
|
||||
*/
|
||||
virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Get the group number corresponding to a named capture group.
|
||||
* The returned number can be used with any function that accesses
|
||||
* capture groups by number.
|
||||
*
|
||||
* The function returns an error status if the specified name does not
|
||||
* appear in the pattern.
|
||||
*
|
||||
* @param groupName The capture group name,
|
||||
* platform invariant characters only.
|
||||
* @param nameLength The length of the name, or -1 if the name is
|
||||
* nul-terminated.
|
||||
* @param status A UErrorCode to receive any errors.
|
||||
* Set to U_REGEX_INVALID_CAPTURE_GROUP_NAME if the name is not found.
|
||||
* @return the group number for the name, or 0 if the name is not found
|
||||
* or if status already indicated a failure on entry.
|
||||
*
|
||||
* @draft ICU 55
|
||||
*/
|
||||
virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Split a string into fields. Somewhat like split() from Perl or Java.
|
||||
* Pattern matches identify delimiters that separate the input
|
||||
|
@ -573,8 +610,6 @@ private:
|
|||
UVector32 *fGroupMap; // Map from capture group number to position of
|
||||
// the group's variables in the matcher stack frame.
|
||||
|
||||
int32_t fMaxCaptureDigits;
|
||||
|
||||
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
||||
// regex character classes, e.g. Word.
|
||||
|
||||
|
@ -589,6 +624,8 @@ private:
|
|||
Regex8BitSet *fInitialChars8;
|
||||
UBool fNeedsAltInput;
|
||||
|
||||
UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
|
||||
|
||||
friend class RegexCompile;
|
||||
friend class RegexMatcher;
|
||||
friend class RegexCImpl;
|
||||
|
@ -854,7 +891,6 @@ public:
|
|||
*/
|
||||
virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the number of capturing groups in this matcher's pattern.
|
||||
* @return the number of capture groups
|
||||
|
@ -945,7 +981,6 @@ public:
|
|||
*/
|
||||
virtual int64_t start64(int32_t group, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Returns the index in the input string of the first character following the
|
||||
* text matched during the previous match operation.
|
||||
|
@ -1015,7 +1050,6 @@ public:
|
|||
*/
|
||||
virtual int64_t end64(int32_t group, UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Resets this matcher. The effect is to remove any memory of previous matches,
|
||||
* and to cause subsequent find() operations to begin at the beginning of
|
||||
|
|
|
@ -607,6 +607,53 @@ U_STABLE int32_t U_EXPORT2
|
|||
uregex_groupCount(URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Get the group number corresponding to a named capture group.
|
||||
* The returned number can be used with any function that accesses
|
||||
* capture groups by number.
|
||||
*
|
||||
* The function returns an error status if the specified name does not
|
||||
* appear in the pattern.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param groupName The capture group name.
|
||||
* @param nameLength The length of the name, or -1 if the name is a
|
||||
* nul-terminated string.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @return The group number for the name, or 0 if the regular
|
||||
* expression fails validation or the name is not found.
|
||||
*
|
||||
* @draft ICU 55
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uregex_groupNumberFromName(URegularExpression *regexp,
|
||||
const UChar *groupName,
|
||||
int32_t nameLength,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Get the group number corresponding to a named capture group.
|
||||
* The returned number can be used with any function that accesses
|
||||
* capture groups by number.
|
||||
*
|
||||
* The function returns an error status if the specified name does not
|
||||
* appear in the pattern.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param groupName The capture group name,
|
||||
* platform invariant characters only.
|
||||
* @param nameLength The length of the name, or -1 if the name is
|
||||
* nul-terminated.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @return The group number for the name, or 0 if the regular
|
||||
* expression fails validation or the name is not found.
|
||||
*
|
||||
* @draft ICU 55
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uregex_groupNumberFromCName(URegularExpression *regexp,
|
||||
const char *groupName,
|
||||
int32_t nameLength,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/** Extract the string for the specified matching expression or subexpression.
|
||||
* Group #0 is the complete string of matched text.
|
||||
* Group #1 is the text matched by the first set of capturing parentheses.
|
||||
|
@ -630,8 +677,8 @@ uregex_group(URegularExpression *regexp,
|
|||
int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
|
||||
/** Returns a shallow immutable clone of the entire input string. The returned UText current native index
|
||||
* is set to the beginning of the requested capture group. The capture group length is also
|
||||
/** Returns a shallow immutable clone of the entire input string with the current index set
|
||||
* to the beginning of the requested capture group. The capture group length is also
|
||||
* returned via groupLength.
|
||||
* Group #0 is the complete string of matched text.
|
||||
* Group #1 is the text matched by the first set of capturing parentheses.
|
||||
|
@ -644,7 +691,7 @@ uregex_group(URegularExpression *regexp,
|
|||
* @param dest A mutable UText in which to store the current input.
|
||||
* If NULL, a new UText will be created as an immutable shallow clone
|
||||
* of the entire input string.
|
||||
* @param groupLength The group length of the desired capture group.
|
||||
* @param groupLength The group length of the desired capture group. Output parameter.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The subject text currently associated with this regular expression.
|
||||
* If a pre-allocated UText was provided, it will always be used and returned.
|
||||
|
|
|
@ -17,14 +17,14 @@
|
|||
#include "unicode/uchar.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "umutex.h"
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
#include "uhash.h"
|
||||
#include "umutex.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
#include "regextxt.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
|
||||
|
@ -625,6 +625,36 @@ uregex_groupCount(URegularExpression *regexp2,
|
|||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_groupNumberFromName
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
int32_t
|
||||
uregex_groupNumberFromName(URegularExpression *regexp2,
|
||||
const UChar *groupName,
|
||||
int32_t nameLength,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, FALSE, status) == FALSE) {
|
||||
return 0;
|
||||
}
|
||||
int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status);
|
||||
return result;
|
||||
}
|
||||
|
||||
int32_t
|
||||
uregex_groupNumberFromCName(URegularExpression *regexp2,
|
||||
const char *groupName,
|
||||
int32_t nameLength,
|
||||
UErrorCode *status) {
|
||||
RegularExpression *regexp = (RegularExpression*)regexp2;
|
||||
if (validateRE(regexp, FALSE, status) == FALSE) {
|
||||
return 0;
|
||||
}
|
||||
return regexp->fPat->groupNumberFromName(groupName, nameLength, *status);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_group
|
||||
|
@ -1285,6 +1315,8 @@ U_NAMESPACE_END
|
|||
|
||||
static const UChar BACKSLASH = 0x5c;
|
||||
static const UChar DOLLARSIGN = 0x24;
|
||||
static const UChar LEFTBRACKET = 0x7b;
|
||||
static const UChar RIGHTBRACKET = 0x7d;
|
||||
|
||||
//
|
||||
// Move a character to an output buffer, with bounds checking on the index.
|
||||
|
@ -1359,10 +1391,10 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
|||
matchStart = (int32_t)m->fMatchStart;
|
||||
} else {
|
||||
// !!!: Would like a better way to do this!
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
|
||||
status = U_ZERO_ERROR;
|
||||
matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
|
||||
UErrorCode tempStatus = U_ZERO_ERROR;
|
||||
lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
|
||||
tempStatus = U_ZERO_ERROR;
|
||||
matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
|
||||
}
|
||||
for (i=lastMatchEnd; i<matchStart; i++) {
|
||||
appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
|
||||
|
@ -1377,7 +1409,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
|||
|
||||
// scan the replacement text, looking for substitutions ($n) and \escapes.
|
||||
int32_t replIdx = 0;
|
||||
while (replIdx < replacementLength) {
|
||||
while (replIdx < replacementLength && U_SUCCESS(*status)) {
|
||||
UChar c = replacementText[replIdx];
|
||||
replIdx++;
|
||||
if (c != DOLLARSIGN && c != BACKSLASH) {
|
||||
|
@ -1426,55 +1458,84 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
|||
continue;
|
||||
}
|
||||
|
||||
// We've got a $. Pick up the following capture group name or number.
|
||||
// For numbers, consume only digits that produce a valid capture group for the pattern.
|
||||
|
||||
|
||||
// We've got a $. Pick up a capture group number if one follows.
|
||||
// Consume at most the number of digits necessary for the largest capture
|
||||
// number that is valid for this pattern.
|
||||
|
||||
int32_t numDigits = 0;
|
||||
int32_t groupNum = 0;
|
||||
UChar32 digitC;
|
||||
for (;;) {
|
||||
if (replIdx >= replacementLength) {
|
||||
break;
|
||||
}
|
||||
U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
|
||||
if (u_isdigit(digitC) == FALSE) {
|
||||
break;
|
||||
}
|
||||
U_ASSERT(c == DOLLARSIGN);
|
||||
UChar32 c32;
|
||||
U16_GET(replacementText, 0, replIdx, replacementLength, c32);
|
||||
if (u_isdigit(c32)) {
|
||||
int32_t numDigits = 0;
|
||||
int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
|
||||
for (;;) {
|
||||
if (replIdx >= replacementLength) {
|
||||
break;
|
||||
}
|
||||
U16_GET(replacementText, 0, replIdx, replacementLength, c32);
|
||||
if (u_isdigit(c32) == FALSE) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t digitVal = u_charDigitValue(c32);
|
||||
if (groupNum * 10 + digitVal <= numCaptureGroups) {
|
||||
groupNum = groupNum * 10 + digitVal;
|
||||
U16_FWD_1(replacementText, replIdx, replacementLength);
|
||||
numDigits++;
|
||||
} else {
|
||||
if (numDigits == 0) {
|
||||
*status = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (c32 == LEFTBRACKET) {
|
||||
// Scan for Named Capture Group, ${name}.
|
||||
UnicodeString groupName;
|
||||
U16_FWD_1(replacementText, replIdx, replacementLength);
|
||||
groupNum=groupNum*10 + u_charDigitValue(digitC);
|
||||
numDigits++;
|
||||
if (numDigits >= m->fPattern->fMaxCaptureDigits) {
|
||||
break;
|
||||
while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
|
||||
if (replIdx >= replacementLength) {
|
||||
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
break;
|
||||
}
|
||||
U16_NEXT(replacementText, replIdx, replacementLength, c32);
|
||||
if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z
|
||||
(c32 >= 0x61 && c32 <= 0x7a) || // a..z
|
||||
(c32 >= 0x31 && c32 <= 0x39)) { // 0..9
|
||||
groupName.append(c32);
|
||||
} else if (c32 == RIGHTBRACKET) {
|
||||
groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
|
||||
if (groupNum == 0) {
|
||||
// Name not defined by pattern.
|
||||
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
}
|
||||
} else {
|
||||
// Character was something other than a name char or a closing '}'
|
||||
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// $ not followed by {name} or digits.
|
||||
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
}
|
||||
|
||||
|
||||
if (numDigits == 0) {
|
||||
// The $ didn't introduce a group number at all.
|
||||
// Treat it as just part of the substitution text.
|
||||
appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Finally, append the capture group data to the destination.
|
||||
destIdx += uregex_group((URegularExpression*)regexp, groupNum,
|
||||
dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
|
||||
if (*status == U_BUFFER_OVERFLOW_ERROR) {
|
||||
// Ignore buffer overflow when extracting the group. We need to
|
||||
// continue on to get full size of the untruncated result. We will
|
||||
// raise our own buffer overflow error at the end.
|
||||
*status = U_ZERO_ERROR;
|
||||
if (U_SUCCESS(*status)) {
|
||||
destIdx += uregex_group((URegularExpression*)regexp, groupNum,
|
||||
dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
|
||||
if (*status == U_BUFFER_OVERFLOW_ERROR) {
|
||||
// Ignore buffer overflow when extracting the group. We need to
|
||||
// continue on to get full size of the untruncated result. We will
|
||||
// raise our own buffer overflow error at the end.
|
||||
*status = U_ZERO_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
if (U_FAILURE(*status)) {
|
||||
// Can fail if group number is out of range.
|
||||
// bad group number or name.
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -1483,10 +1544,12 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
|||
//
|
||||
if (destIdx < capacity) {
|
||||
dest[destIdx] = 0;
|
||||
} else if (destIdx == *destCapacity) {
|
||||
*status = U_STRING_NOT_TERMINATED_WARNING;
|
||||
} else {
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
} else if (U_SUCCESS(*status)) {
|
||||
if (destIdx == *destCapacity) {
|
||||
*status = U_STRING_NOT_TERMINATED_WARNING;
|
||||
} else {
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
|
|
@ -1022,7 +1022,7 @@ static void TestRegexCAPI(void) {
|
|||
TEST_ASSERT_SUCCESS(status);
|
||||
bufPtr = buf;
|
||||
bufCap = UPRV_LENGTHOF(buf);
|
||||
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", UPRV_LENGTHOF(repl));
|
||||
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ \\$ \\abc", UPRV_LENGTHOF(repl));
|
||||
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE);
|
||||
|
@ -1817,7 +1817,8 @@ static void TestUTextAPI(void) {
|
|||
UText *result;
|
||||
const char str_Replxxx[] = { 0x52, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x20, 0x3c, 0x61, 0x61, 0x3e, 0x20, 0x78, 0x31, 0x78, 0x20, 0x78, 0x2e, 0x2e, 0x2e, 0x78, 0x2e, 0x00 }; /* Replace <aa> x1x x...x. */
|
||||
const char str_Nomatchhere[] = { 0x4e, 0x6f, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x68, 0x65, 0x72, 0x65, 0x2e, 0x00 }; /* No match here. */
|
||||
const char str_u00411U00000042a[] = { 0x5c, 0x5c, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x31, 0x24, 0x31, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x34, 0x32, 0x24, 0x5c, 0x61, 0x00 }; /* \\\u0041$1\U00000042$\a */
|
||||
const char str_u00411U00000042a[] = { 0x5c, 0x5c, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x31, 0x24, 0x31,
|
||||
0x5c, 0x55, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x34, 0x32, 0x5c, 0x24, 0x5c, 0x61, 0x00 }; /* \\\u0041$1\U00000042\$\a */
|
||||
const char str_1x[] = { 0x3c, 0x24, 0x31, 0x3e, 0x00 }; /* <$1> */
|
||||
const char str_ReplaceAaaBax1xxx[] = { 0x52, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x20, 0x5c, 0x41, 0x61, 0x61, 0x42, 0x24, 0x61, 0x20, 0x78, 0x31, 0x78, 0x20, 0x78, 0x2e, 0x2e, 0x2e, 0x78, 0x2e, 0x00 }; /* Replace \AaaB$a x1x x...x. */
|
||||
status = U_ZERO_ERROR;
|
||||
|
@ -1925,7 +1926,7 @@ static void TestUTextAPI(void) {
|
|||
TEST_ASSERT_SUCCESS(status);
|
||||
bufPtr = buf;
|
||||
bufCap = UPRV_LENGTHOF(buf);
|
||||
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", UPRV_LENGTHOF(repl));
|
||||
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ \\$ \\abc", UPRV_LENGTHOF(repl));
|
||||
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE);
|
||||
|
|
|
@ -148,6 +148,15 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
|
|||
case 25: name = "TestBug11371";
|
||||
if (exec) TestBug11371();
|
||||
break;
|
||||
case 26: name = "TestBug11480";
|
||||
if (exec) TestBug11480();
|
||||
break;
|
||||
case 27: name = "NamedCapture";
|
||||
if (exec) NamedCapture();
|
||||
break;
|
||||
case 28: name = "NamedCaptureLimits";
|
||||
if (exec) NamedCaptureLimits();
|
||||
break;
|
||||
default: name = "";
|
||||
break; //needed to end loop
|
||||
}
|
||||
|
@ -1429,8 +1438,8 @@ void RegexTest::API_Replace() {
|
|||
REGEX_ASSERT(dest == "The value of $1 is bc.defg");
|
||||
|
||||
dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
|
||||
REGEX_ASSERT(U_FAILURE(status));
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
|
||||
replacement = replacement.unescape();
|
||||
|
@ -2633,7 +2642,9 @@ void RegexTest::API_Replace_UTF8() {
|
|||
REGEX_ASSERT(result == &destText);
|
||||
REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
|
||||
|
||||
const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
|
||||
const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
|
||||
0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
|
||||
0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
|
||||
utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
|
||||
result = matcher2->replaceFirst(&replText, NULL, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
@ -3108,7 +3119,7 @@ void RegexTest::API_Pattern_UTF8() {
|
|||
UnicodeString stringToSplit("first:second:third");
|
||||
UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
|
||||
UText *splits[10] = {NULL};
|
||||
int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
@ -5137,7 +5148,7 @@ void RegexTest::PreAllocatedUTextCAPI () {
|
|||
|
||||
/* Unicode escapes */
|
||||
uregex_setText(re, text1, -1, &status);
|
||||
regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
|
||||
regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
|
||||
utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
|
||||
result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
@ -5196,6 +5207,276 @@ void RegexTest::PreAllocatedUTextCAPI () {
|
|||
utext_close(&patternText);
|
||||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------
|
||||
//
|
||||
// NamedCapture Check basic named capture group functionality
|
||||
//
|
||||
//--------------------------------------------------------------
|
||||
void RegexTest::NamedCapture() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
RegexPattern *pat = RegexPattern::compile(UnicodeString(
|
||||
"abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
int32_t group = pat->groupNumberFromName("five", -1, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(5 == group);
|
||||
group = pat->groupNumberFromName("three", -1, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(3 == group);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
group = pat->groupNumberFromName(UnicodeString("six"), status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(6 == group);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
|
||||
U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
// After copying a pattern, named capture should still work in the copy.
|
||||
RegexPattern *copiedPat = new RegexPattern(*pat);
|
||||
REGEX_ASSERT(*copiedPat == *pat);
|
||||
delete pat; pat = NULL; // Delete original, copy should have no references back to it.
|
||||
|
||||
group = copiedPat->groupNumberFromName("five", -1, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(5 == group);
|
||||
group = copiedPat->groupNumberFromName("three", -1, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(3 == group);
|
||||
delete copiedPat;
|
||||
|
||||
// ReplaceAll with named capture group.
|
||||
status = U_ZERO_ERROR;
|
||||
UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
|
||||
RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
// m.pattern().dumpPattern();
|
||||
UnicodeString replacedText = m->replaceAll("'${mid}'", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
|
||||
delete m;
|
||||
|
||||
// ReplaceAll, allowed capture group numbers.
|
||||
text = UnicodeString("abcmxyz");
|
||||
m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<$3>"), status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<$4>"), status);
|
||||
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
|
||||
REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
|
||||
REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
|
||||
REGEX_CHECK_STATUS; // that push group num out of range.
|
||||
REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
|
||||
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
|
||||
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("<${one"), status);
|
||||
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
|
||||
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
|
||||
delete m;
|
||||
|
||||
// Repeat the above replaceAll() tests using the plain C API, which
|
||||
// has a separate implementation internally.
|
||||
// TODO: factor out the test data.
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
text = UnicodeString("abcmxyz");
|
||||
uregex_setText(re, text.getBuffer(), text.length(), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
UChar resultBuf[100];
|
||||
int32_t resultLength;
|
||||
UnicodeString repl;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<$0>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<$1>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<${one}>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<$2>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<$3>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<$4>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<$04>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<$000016>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<$3$2$1${one}>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("$3$2$1${one}");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<${noSuchName}>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<${invalid-name}>");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("<${one");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
repl = UnicodeString("$not a capture group");
|
||||
resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
|
||||
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
|
||||
|
||||
uregex_close(re);
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------
|
||||
//
|
||||
// NamedCaptureLimits Patterns with huge numbers of named capture groups.
|
||||
// The point is not so much what the exact limit is,
|
||||
// but that a largish number doesn't hit bad non-linear performance,
|
||||
// and that exceeding the limit fails cleanly.
|
||||
//
|
||||
//--------------------------------------------------------------
|
||||
void RegexTest::NamedCaptureLimits() {
|
||||
if (quick) {
|
||||
logln("Skipping test. Runs in exhuastive mode only.");
|
||||
return;
|
||||
}
|
||||
const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
|
||||
const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
|
||||
char nnbuf[100];
|
||||
UnicodeString pattern;
|
||||
int32_t nn;
|
||||
|
||||
for (nn=1; nn<goodLimit; nn++) {
|
||||
sprintf(nnbuf, "(?<nn%d>)", nn);
|
||||
pattern.append(UnicodeString(nnbuf, -1, US_INV));
|
||||
}
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
for (nn=1; nn<goodLimit; nn++) {
|
||||
sprintf(nnbuf, "nn%d", nn);
|
||||
int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
|
||||
REGEX_ASSERT(nn == groupNum);
|
||||
if (nn != groupNum) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
delete pat;
|
||||
|
||||
pattern.remove();
|
||||
for (nn=1; nn<failLimit; nn++) {
|
||||
sprintf(nnbuf, "(?<nn%d>)", nn);
|
||||
pattern.append(UnicodeString(nnbuf, -1, US_INV));
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
pat = RegexPattern::compile(pattern, 0, status);
|
||||
REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
|
||||
delete pat;
|
||||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------
|
||||
//
|
||||
// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
|
||||
|
@ -5487,5 +5768,26 @@ void RegexTest::TestBug11371() {
|
|||
}
|
||||
}
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
|
||||
void RegexTest::TestBug11480() {
|
||||
// C API, get capture group of a group that does not participate in the match.
|
||||
// (Returns a zero length string, with nul termination,
|
||||
// indistinguishable from a group with a zero lenght match.)
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
UnicodeString text = UNICODE_STRING_SIMPLE("A");
|
||||
uregex_setText(re, text.getBuffer(), text.length(), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
|
||||
UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
|
||||
int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
|
||||
REGEX_ASSERT(length == 0);
|
||||
REGEX_ASSERT(buf[0] == 13);
|
||||
REGEX_ASSERT(buf[1] == 0);
|
||||
REGEX_ASSERT(buf[2] == 13);
|
||||
uregex_close(re);
|
||||
}
|
||||
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2002-2014, International Business Machines Corporation and
|
||||
* Copyright (c) 2002-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -41,6 +41,8 @@ public:
|
|||
virtual void API_Replace_UTF8();
|
||||
virtual void PerlTestsUTF8();
|
||||
virtual void PreAllocatedUTextCAPI();
|
||||
virtual void NamedCapture();
|
||||
virtual void NamedCaptureLimits();
|
||||
virtual void Bug7651();
|
||||
virtual void Bug7740();
|
||||
virtual void Bug8479();
|
||||
|
@ -51,6 +53,7 @@ public:
|
|||
virtual void TestCaseInsensitiveStarters();
|
||||
virtual void TestBug11049();
|
||||
virtual void TestBug11371();
|
||||
virtual void TestBug11480();
|
||||
|
||||
// The following functions are internal to the regexp tests.
|
||||
virtual void assertUText(const char *expected, UText *actual, const char *file, int line);
|
||||
|
|
11
icu4c/source/test/testdata/regextst.txt
vendored
11
icu4c/source/test/testdata/regextst.txt
vendored
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2001-2014 International Business Machines
|
||||
# Copyright (c) 2001-2015 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file:
|
||||
|
@ -513,6 +513,15 @@
|
|||
"ab(?:(c)|(d))\1" i "abde"
|
||||
"ab(?:(c)|(d))\1" i "<0>ab<1>c</1>c</0>e"
|
||||
|
||||
# Named back references
|
||||
"(?<one>abcd)\k<one>" "<0><1>abcd</1>abcd</0>"
|
||||
"(no)?(?<one>abcd)\k<one>" "<0><2>abcd</2>abcd</0>"
|
||||
|
||||
"(?<a_1>...)" E " " # backref names are ascii letters & numbers only
|
||||
"(?<1a>...)" E " " # backref names must begin with a letter
|
||||
"(?<a>.)(?<a>.)" E " " # Repeated names are illegal.
|
||||
|
||||
|
||||
# Case Insensitive
|
||||
"aBc" i "<0>ABC</0>"
|
||||
"a[^bc]d" i "ABD"
|
||||
|
|
Loading…
Add table
Reference in a new issue