mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 09:21:03 +00:00
ICU-2422 regexp, {min,max} quantifiers added.
X-SVN-Rev: 10859
This commit is contained in:
parent
c11f4d65b3
commit
01147100bd
12 changed files with 584 additions and 229 deletions
|
@ -1840,6 +1840,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
|||
"U_REGEX_PROPERTY_SYNTAX",
|
||||
"U_REGEX_UNIMPLEMENTED",
|
||||
"U_REGEX_MISMATCHED_PAREN",
|
||||
"U_REGEX_NUMBER_TOO_BIG",
|
||||
"U_REGEX_BAD_INTERVAL"
|
||||
};
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
|
|
|
@ -625,6 +625,8 @@ typedef enum UErrorCode {
|
|||
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
|
||||
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
|
||||
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
|
||||
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
|
||||
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
|
||||
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||
|
||||
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
|
|
|
@ -204,8 +204,8 @@ public:
|
|||
int32_t push(int32_t i, UErrorCode &status);
|
||||
|
||||
int32_t *reserveBlock(int32_t size, UErrorCode &status);
|
||||
int32_t *popBlock(int32_t amount);
|
||||
};
|
||||
int32_t *popFrame(int32_t size);
|
||||
};
|
||||
|
||||
|
||||
// UVector32 inlines
|
||||
|
@ -237,6 +237,15 @@ inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) {
|
|||
return rp;
|
||||
}
|
||||
|
||||
inline int32_t *UVector32::popFrame(int32_t size) {
    // Drop the last `size` elements from the vector, then hand back a pointer
    // positioned `size` elements below the new end of the data.
    // NOTE(review): presumably this is the start of the frame that is now on
    // top, assuming every frame is `size` elements long -- confirm with callers.
    // If fewer than `size` elements remain after the pop, the returned pointer
    // lies before `elements`.
    U_ASSERT(count >= size);

    int32_t remaining = count - size;
    count = (remaining < 0) ? 0 : remaining;   // Never let the count go negative.

    return elements + count - size;
}
|
||||
|
||||
|
||||
|
||||
inline int32_t UVector32::size(void) const {
|
||||
|
@ -288,13 +297,6 @@ inline int32_t UVector32::popi(void) {
|
|||
return result;
|
||||
}
|
||||
|
||||
inline int32_t *UVector32::popBlock(int32_t amount) {
|
||||
U_ASSERT(amount <= count);
|
||||
count -= amount;
|
||||
if (count < 0) {count = 0;}
|
||||
return elements + count;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
|
|
@ -145,13 +145,14 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
|||
{
|
||||
fStatus = &status;
|
||||
|
||||
fScanIndex = 0;
|
||||
fNextIndex = 0;
|
||||
fPeekChar = -1;
|
||||
fLineNum = 1;
|
||||
fCharNum = 0;
|
||||
fQuoteMode = FALSE;
|
||||
fFreeForm = FALSE;
|
||||
fScanIndex = 0;
|
||||
fNextIndex = 0;
|
||||
fPeekChar = -1;
|
||||
fLineNum = 1;
|
||||
fCharNum = 0;
|
||||
fQuoteMode = FALSE;
|
||||
fFreeForm = FALSE;
|
||||
fMatcherDataEnd = 0;
|
||||
|
||||
fMatchOpenParen = -1;
|
||||
fMatchCloseParen = -1;
|
||||
|
@ -374,13 +375,22 @@ void RegexCompile::compile(
|
|||
fRXPat->fMaxCaptureDigits = 1;
|
||||
int32_t n = 10;
|
||||
for (;;) {
|
||||
if (n > fRXPat->fNumCaptureGroups) {
|
||||
if (n > fRXPat->fGroupMap->size()) {
|
||||
break;
|
||||
}
|
||||
fRXPat->fMaxCaptureDigits++;
|
||||
n *= 10;
|
||||
}
|
||||
|
||||
//
|
||||
// The pattern's fFrameSize so far has accumulated the requirements for
|
||||
// storage for capture parentheses, counters, etc. that are encountered
|
||||
// in the pattern. Add space for the two variables that are always
|
||||
// present in the saved state: the input string position and the
|
||||
// position in the compiled pattern.
|
||||
//
|
||||
fRXPat->fFrameSize+=2;
|
||||
|
||||
//
|
||||
// A stupid bit of nonsense to prevent code coverage testing from complaining
|
||||
// about the pattern.dump() debug function. Go through the motions of dumping,
|
||||
|
@ -499,8 +509,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// is an '|' alternation within the parens.
|
||||
{
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
fRXPat->fNumCaptureGroups++;
|
||||
int32_t cop = URX_BUILD(URX_START_CAPTURE, fRXPat->fNumCaptureGroups);
|
||||
int32_t varsLoc = fRXPat->fFrameSize; // Reserve two slots in match stack frame.
|
||||
fRXPat->fFrameSize += 2;
|
||||
int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
|
||||
fRXPat->fCompiledPat->addElement(cop, *fStatus);
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
|
||||
|
@ -511,6 +522,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
fParenStack.push(-2, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
|
||||
// Save the mapping from group number to stack frame variable position.
|
||||
fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -704,6 +718,64 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
break;
|
||||
|
||||
|
||||
case doIntervalInit:
|
||||
// The '{' opening an interval quantifier was just scanned.
|
||||
// Init the counter variables that will accumulate the values as the digits
|
||||
// are scanned.
|
||||
fIntervalLow = 0;
|
||||
fIntervalUpper = -1;
|
||||
break;
|
||||
|
||||
case doIntevalLowerDigit:
|
||||
// Scanned a digit from the lower value of an {lower,upper} interval
|
||||
{
|
||||
int32_t digitValue = u_charDigitValue(fC.fChar);
|
||||
U_ASSERT(digitValue >= 0);
|
||||
fIntervalLow = fIntervalLow*10 + digitValue;
|
||||
if (fIntervalLow < 0) {
|
||||
error(U_REGEX_NUMBER_TOO_BIG);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case doIntervalUpperDigit:
|
||||
// Scanned a digit from the upper value of an {lower,upper} interval
|
||||
{
|
||||
if (fIntervalUpper < 0) {
|
||||
fIntervalUpper = 0;
|
||||
}
|
||||
int32_t digitValue = u_charDigitValue(fC.fChar);
|
||||
U_ASSERT(digitValue >= 0);
|
||||
fIntervalUpper = fIntervalUpper*10 + digitValue;
|
||||
if (fIntervalLow < 0) {
|
||||
error(U_REGEX_NUMBER_TOO_BIG);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case doIntervalSame:
|
||||
// Scanned a single value interval like {27}. Upper = Lower.
|
||||
fIntervalUpper = fIntervalLow;
|
||||
break;
|
||||
|
||||
case doInterval:
|
||||
// Finished scanning a normal {lower,upper} interval. Generate the code for it.
|
||||
compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
|
||||
break;
|
||||
|
||||
case doPossesiveInterval:
|
||||
// Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
|
||||
compileInterval(URX_CTR_INIT_P, URX_CTR_LOOP_P);
|
||||
break;
|
||||
|
||||
case doNGInterval:
|
||||
// Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
|
||||
compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
|
||||
break;
|
||||
|
||||
case doIntervalError:
|
||||
error(U_REGEX_BAD_INTERVAL);
|
||||
break;
|
||||
|
||||
case doLiteralChar:
|
||||
// We've just scanned a "normal" character from the pattern,
|
||||
|
@ -835,11 +907,6 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doNotImplementedError:
|
||||
// TODO: get rid of this once everything is implemented.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
|
@ -995,6 +1062,54 @@ void RegexCompile::fixLiterals(UBool split) {
|
|||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// insertOp() Insert a slot for a new opcode into the already
|
||||
// compiled pattern code.
|
||||
//
|
||||
// Fill the slot with a NOP. Our caller will replace it
|
||||
// with what they really wanted.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
void RegexCompile::insertOp(int32_t where) {
    // Opens a one-word slot at index `where` in the compiled pattern, filled
    // with a NOP, then repairs every stored code location that the insertion
    // pushed down by one.
    UVector32 *code = fRXPat->fCompiledPat;
    U_ASSERT(where>0 && where < code->size());

    int32_t  nop = URX_BUILD(URX_NOP, 0);
    code->insertElementAt(nop, where, *fStatus);

    // Walk through the pattern, looking for any ops with targets that
    //  were moved down by the insert.  Fix them.
    // All three flavors of CTR_LOOP carry the location of their CTR_INIT op
    //  as an operand, so all three must be relocated, not only the greedy one.
    // NOTE(review): every word is decoded as an op here, including raw data
    //  words such as the min/max counts of an interval.  Confirm that their
    //  type bits can never alias one of the relocatable opcodes below.
    int32_t loc;
    for (loc=0; loc<code->size(); loc++) {
        int32_t op = code->elementAti(loc);
        int32_t opType = URX_TYPE(op);
        int32_t opValue = URX_VAL(op);
        if ((opType == URX_JMP         ||
             opType == URX_STATE_SAVE  ||
             opType == URX_CTR_LOOP    ||
             opType == URX_CTR_LOOP_NG ||
             opType == URX_CTR_LOOP_P  ||
             opType == URX_RELOC_OPRND)    && opValue > where) {
            // Target location for this opcode is after the insertion point and
            //   needs to be incremented to adjust for the insertion.
            opValue++;
            op = URX_BUILD(opType, opValue);
            code->setElementAt(op, loc);
        }
    }

    // Now fix up the parentheses stack.  All positive values in it are locations in
    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
    for (loc=0; loc<fParenStack.size(); loc++) {
        int32_t x = fParenStack.elementAti(loc);
        if (x>where) {
            x++;
            fParenStack.setElementAt(x, loc);
        }
    }
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// blockTopLoc() Find or create a location in the compiled pattern
|
||||
|
@ -1007,9 +1122,10 @@ void RegexCompile::fixLiterals(UBool split) {
|
|||
// is reserved for this purpose. .* or similar don't
|
||||
// and a slot needs to be added.
|
||||
//
|
||||
// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
|
||||
// at the returned location.
|
||||
// FALSE - just return the address, reserve a location there.
|
||||
// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
|
||||
// at the returned location.
|
||||
// FALSE - just return the address,
|
||||
// do not reserve a location there.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
|
||||
|
@ -1097,9 +1213,8 @@ void RegexCompile::handleCloseParen() {
|
|||
{
|
||||
int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
|
||||
U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
|
||||
int32_t captureGroupNumber = URX_VAL(captureOp);
|
||||
U_ASSERT(captureGroupNumber > 0);
|
||||
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, captureGroupNumber);
|
||||
int32_t framVarLocation = URX_VAL(captureOp);
|
||||
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, framVarLocation+1);
|
||||
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
@ -1167,6 +1282,53 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
|
|||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// compileInterval Generate the code for a {min, max} style interval quantifier.
|
||||
// Except for the specific opcodes used, the code is the same
|
||||
// for all three types (greedy, non-greedy, possessive) of
|
||||
// intervals. The opcodes are supplied as parameters.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
{
    // Layout produced at the top of the quantified block (four words):
    //     +0   InitOp, operand = frame slot holding the loop counter
    //     +1   URX_RELOC_OPRND, operand = end of the compiled code at this point
    //     +2   minimum count (raw value)
    //     +3   maximum count (raw value; -1 when no upper limit was given)
    // and LoopOp, with the location of word +0 as its operand, is appended
    // after the block.

    // blockTopLoc(TRUE) supplies the first slot; open up three more at the same spot.
    const int32_t initLoc = blockTopLoc(TRUE);
    for (int32_t extra = 0; extra < 3; extra++) {
        insertOp(initLoc);
    }

    // Allocate a slot for the counter at the current end of the frame.
    const int32_t counterSlot = fRXPat->fFrameSize++;

    // Word +0: the counter-init op itself.
    fRXPat->fCompiledPat->setElementAt(URX_BUILD(InitOp, counterSlot), initLoc);

    // Word +1: a code location.  It is wrapped in URX_RELOC_OPRND so that it
    //   gets adjusted if later compilation inserts ops ahead of the target.
    const int32_t codeEnd = fRXPat->fCompiledPat->size();
    fRXPat->fCompiledPat->setElementAt(URX_BUILD(URX_RELOC_OPRND, codeEnd), initLoc+1);

    // Words +2 and +3: the interval limits gathered while scanning {min,max}.
    fRXPat->fCompiledPat->setElementAt(fIntervalLow,   initLoc+2);
    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, initLoc+3);

    // Append the loop op.  It goes at the end of the block being repeated,
    //   which is simply the end of the code generated so far.
    fRXPat->fCompiledPat->addElement(URX_BUILD(LoopOp, initLoc), *fStatus);
}
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// Error Report a rule parse error.
|
||||
|
|
|
@ -81,8 +81,12 @@ private:
|
|||
// there is space to add an opcode there.
|
||||
void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for
|
||||
// a reference to a UnicodeSet.
|
||||
void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier.
|
||||
int32_t LoopOp);
|
||||
void literalChar(); // Compile a literal char
|
||||
void fixLiterals(UBool split=FALSE); // Fix literal strings.
|
||||
void insertOp(int32_t where); // Open up a slot for a new op in the
|
||||
// generated code at the specified location.
|
||||
|
||||
|
||||
UErrorCode *fStatus;
|
||||
|
@ -121,9 +125,9 @@ private:
|
|||
|
||||
int32_t fPatternLength; // Length of the input pattern string.
|
||||
|
||||
UStack fParenStack; // parentheses stack. Each frame consists of
|
||||
UVector32 fParenStack; // parentheses stack. Each frame consists of
|
||||
// the positions of compiled pattern operations
|
||||
// needing fixup, followed by negative vallue. The
|
||||
// needing fixup, followed by negative value. The
|
||||
// first entry in each frame is the position of the
|
||||
// spot reserved for use when a quantifier
|
||||
// needs to add a SAVE at the start of a (block)
|
||||
|
@ -140,6 +144,16 @@ private:
|
|||
// location after the most recently processed
|
||||
// parenthesized block.
|
||||
|
||||
int32_t fIntervalLow; // {lower, upper} interval quantifier values.
|
||||
int32_t fIntervalUpper; // Placed here temporarily, when pattern is
|
||||
// initially scanned. Each new interval
|
||||
// encountered overwrites these values.
|
||||
|
||||
// -1 for the upper interval value means none
|
||||
// was specified (unlimited occurrences.)
|
||||
|
||||
int32_t fMatcherDataEnd; // Location Counter for allocation of data
|
||||
// to be used by the matcher at match time.
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -28,6 +28,7 @@ enum Regex_PatternParseAction {
|
|||
doOpenCaptureParen,
|
||||
doBadOpenParenType,
|
||||
doRuleError,
|
||||
doIntevalLowerDigit,
|
||||
doBackslashs,
|
||||
doNGOpt,
|
||||
doNamedChar,
|
||||
|
@ -37,12 +38,14 @@ enum Regex_PatternParseAction {
|
|||
doOpenLookBehind,
|
||||
doBackslashx,
|
||||
doBackslashz,
|
||||
doIntervalError,
|
||||
doStar,
|
||||
doCaret,
|
||||
doEnterQuoteMode,
|
||||
doPossesivePlus,
|
||||
doNGStar,
|
||||
doMatchMode,
|
||||
doIntervalUpperDigit,
|
||||
doOpenLookAheadNeg,
|
||||
doPlus,
|
||||
doOpenNonCaptureParen,
|
||||
|
@ -54,7 +57,10 @@ enum Regex_PatternParseAction {
|
|||
doPossesiveOpt,
|
||||
doBackslashG,
|
||||
doOpt,
|
||||
doInterval,
|
||||
doLiteralChar,
|
||||
doPossesiveInterval,
|
||||
doIntervalInit,
|
||||
doOpenAtomicParen,
|
||||
doBackslashS,
|
||||
doOpenLookAhead,
|
||||
|
@ -67,10 +73,11 @@ enum Regex_PatternParseAction {
|
|||
doBackslashZ,
|
||||
doNOP,
|
||||
doExit,
|
||||
doNGInterval,
|
||||
doPatStart,
|
||||
doBackslashb,
|
||||
doNotImplementedError,
|
||||
doBackslashd,
|
||||
doIntervalSame,
|
||||
doOpenLookBehindNeg,
|
||||
rbbiLastAction};
|
||||
|
||||
|
@ -99,13 +106,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doDotAny, 46 /* . */, 12,0, TRUE} // 6
|
||||
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
|
||||
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
|
||||
, {doNOP, 92 /* \ */, 59,0, TRUE} // 9
|
||||
, {doNOP, 92 /* \ */, 70,0, TRUE} // 9
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 10
|
||||
, {doRuleError, 255, 79,0, FALSE} // 11
|
||||
, {doRuleError, 255, 90,0, FALSE} // 11
|
||||
, {doNOP, 42 /* * */, 48,0, TRUE} // 12 expr-quant
|
||||
, {doNOP, 43 /* + */, 51,0, TRUE} // 13
|
||||
, {doNOP, 63 /* ? */, 54,0, TRUE} // 14
|
||||
, {doNOP, 123 /* { */, 57,0, TRUE} // 15
|
||||
, {doIntervalInit, 123 /* { */, 57,0, TRUE} // 15
|
||||
, {doNOP, 255, 17,0, FALSE} // 16
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 17 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 18
|
||||
|
@ -123,12 +130,12 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 30
|
||||
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 31
|
||||
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 32
|
||||
, {doBadOpenParenType, 255, 79,0, FALSE} // 33
|
||||
, {doBadOpenParenType, 255, 90,0, FALSE} // 33
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 34 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 35
|
||||
, {doBadOpenParenType, 255, 79,0, FALSE} // 36
|
||||
, {doBadOpenParenType, 255, 90,0, FALSE} // 36
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 37 paren-comment
|
||||
, {doMismatchedParenErr, 253, 79,0, FALSE} // 38
|
||||
, {doMismatchedParenErr, 253, 90,0, FALSE} // 38
|
||||
, {doNOP, 255, 37,0, TRUE} // 39
|
||||
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 40 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 41
|
||||
|
@ -137,7 +144,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 44
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 45
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 46
|
||||
, {doNOP, 255, 79,0, FALSE} // 47
|
||||
, {doNOP, 255, 90,0, FALSE} // 47
|
||||
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 48 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 49
|
||||
, {doStar, 255, 17,0, FALSE} // 50
|
||||
|
@ -148,28 +155,39 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 55
|
||||
, {doOpt, 255, 17,0, FALSE} // 56
|
||||
, {doNOP, 129, 57,0, TRUE} // 57 interval-open
|
||||
, {doNotImplementedError, 255, 79,0, FALSE} // 58
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 59 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 60
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 61
|
||||
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 62
|
||||
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 63
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 64
|
||||
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 65
|
||||
, {doProperty, 112 /* p */, 12,0, FALSE} // 66
|
||||
, {doProperty, 80 /* P */, 12,0, FALSE} // 67
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 68
|
||||
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 69
|
||||
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 70
|
||||
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 71
|
||||
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 72
|
||||
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 73
|
||||
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 74
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 75
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 76
|
||||
, {doBackRef, 128, 12,0, TRUE} // 77
|
||||
, {doLiteralChar, 255, 12,0, TRUE} // 78
|
||||
, {doExit, 255, 79,0, TRUE} // 79 errorDeath
|
||||
, {doNOP, 128, 60,0, FALSE} // 58
|
||||
, {doIntervalError, 255, 90,0, FALSE} // 59
|
||||
, {doIntevalLowerDigit, 128, 60,0, TRUE} // 60 interval-lower
|
||||
, {doNOP, 44 /* , */, 64,0, TRUE} // 61
|
||||
, {doIntervalSame, 125 /* } */, 67,0, TRUE} // 62
|
||||
, {doIntervalError, 255, 90,0, FALSE} // 63
|
||||
, {doIntervalUpperDigit, 128, 64,0, TRUE} // 64 interval-upper
|
||||
, {doNOP, 125 /* } */, 67,0, TRUE} // 65
|
||||
, {doIntervalError, 255, 90,0, FALSE} // 66
|
||||
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 67 interval-type
|
||||
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 68
|
||||
, {doInterval, 255, 17,0, FALSE} // 69
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 70 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 71
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 72
|
||||
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 73
|
||||
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 74
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 75
|
||||
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 76
|
||||
, {doProperty, 112 /* p */, 12,0, FALSE} // 77
|
||||
, {doProperty, 80 /* P */, 12,0, FALSE} // 78
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 79
|
||||
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 80
|
||||
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 81
|
||||
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 82
|
||||
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 83
|
||||
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 84
|
||||
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 85
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 86
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 87
|
||||
, {doBackRef, 128, 12,0, TRUE} // 88
|
||||
, {doLiteralChar, 255, 12,0, TRUE} // 89
|
||||
, {doExit, 255, 90,0, TRUE} // 90 errorDeath
|
||||
};
|
||||
static const char * const RegexStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -229,6 +247,17 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
"interval-open",
|
||||
0,
|
||||
0,
|
||||
"interval-lower",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"interval-upper",
|
||||
0,
|
||||
0,
|
||||
"interval-type",
|
||||
0,
|
||||
0,
|
||||
"backslash",
|
||||
0,
|
||||
|
|
|
@ -64,15 +64,15 @@ start:
|
|||
# term. At a position where we can accept the start most items in a pattern.
|
||||
#
|
||||
term:
|
||||
quoted n expr-quant doLiteralChar
|
||||
rule_char n expr-quant doLiteralChar
|
||||
quoted n expr-quant doLiteralChar
|
||||
rule_char n expr-quant doLiteralChar
|
||||
'[' n expr-quant doScanUnicodeSet
|
||||
'(' n open-paren
|
||||
'.' n expr-quant doDotAny
|
||||
'^' n term doCaret
|
||||
'$' n term doDollar
|
||||
'\' n backslash
|
||||
eof term doPatFinish
|
||||
eof term doPatFinish
|
||||
default errorDeath doRuleError
|
||||
|
||||
|
||||
|
@ -85,7 +85,7 @@ expr-quant:
|
|||
'*' n quant-star
|
||||
'+' n quant-plus
|
||||
'?' n quant-opt
|
||||
'{' n interval-open
|
||||
'{' n interval-open doIntervalInit
|
||||
default expr-cont
|
||||
|
||||
|
||||
|
@ -182,14 +182,28 @@ quant-opt:
|
|||
|
||||
#
|
||||
# Interval scanning a '{', the opening delimiter for an interval specification
|
||||
# {number} or {min, max}
|
||||
# {number} or {min, max} or {min, }
|
||||
#
|
||||
interval-open:
|
||||
white_space n interval-open
|
||||
default errorDeath doNotImplementedError
|
||||
|
||||
white_space n interval-open # TODO: is white space allowed here in non-free mode?
|
||||
digit_char interval-lower
|
||||
default errorDeath doIntervalError
|
||||
|
||||
interval-lower:
|
||||
digit_char n interval-lower doIntevalLowerDigit
|
||||
',' n interval-upper
|
||||
'}' n interval-type doIntervalSame # {n}
|
||||
default errorDeath doIntervalError
|
||||
|
||||
interval-upper:
|
||||
digit_char n interval-upper doIntervalUpperDigit
|
||||
'}' n interval-type
|
||||
default errorDeath doIntervalError
|
||||
|
||||
interval-type:
|
||||
'?' n expr-cont doNGInterval # {n,m}?
|
||||
'+' n expr-cont doPossesiveInterval # {n,m}+
|
||||
default expr-cont doInterval # {m,n}
|
||||
|
||||
|
||||
#
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#ifndef _REGEXIMP_H
|
||||
#define _REGEXIMP_H
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//
|
||||
// debugging support. Enable one or more of the #defines immediately following
|
||||
|
@ -50,7 +51,7 @@
|
|||
// of the entries.
|
||||
//
|
||||
enum {
|
||||
URX_RESERVED_OP = 0,
|
||||
URX_RESERVED_OP = 0, // For multi-operand ops, most non-first words.
|
||||
URX_BACKTRACK = 1,
|
||||
URX_END = 2,
|
||||
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
|
||||
|
@ -77,13 +78,24 @@ enum {
|
|||
URX_DOTANY_ALL = 21, // ., in the . matches any mode.
|
||||
URX_BACKSLASH_D = 22, // Value field: 0: \d 1: \D
|
||||
URX_CARET = 23, // Value field: 1: multi-line mode.
|
||||
URX_DOLLAR = 24 // Also for \Z
|
||||
URX_DOLLAR = 24, // Also for \Z
|
||||
|
||||
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
|
||||
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possesive.
|
||||
URX_CTR_INIT_P = 27, // These are 4 word opcodes. See description.
|
||||
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
|
||||
URX_CTR_LOOP_NG = 29, // Also in three flavors.
|
||||
URX_CTR_LOOP_P = 30,
|
||||
|
||||
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
|
||||
// back into compiled pattern code, and thus must
|
||||
// be relocated when inserting/deleting ops in code.
|
||||
};
|
||||
|
||||
// Keep this list of opcode names in sync with the above enum
|
||||
// Used for debug printing only.
|
||||
#define URX_OPCODE_NAMES \
|
||||
"URX_RESERVED_OP", \
|
||||
" ", \
|
||||
"URX_BACKTRACK", \
|
||||
"END", \
|
||||
"ONECHAR", \
|
||||
|
@ -107,7 +119,14 @@ enum {
|
|||
"URX_DOTANY_ALL", \
|
||||
"URX_BACKSLASH_D", \
|
||||
"URX_CARET", \
|
||||
"URX_DOLLAR"
|
||||
"URX_DOLLAR", \
|
||||
"CTR_INIT", \
|
||||
"CTR_INIT_NG", \
|
||||
"CTR_INIT_P", \
|
||||
"CTR_LOOP", \
|
||||
"CTR_LOOP_NG", \
|
||||
"CTR_LOOP_P", \
|
||||
"RELOC_OPRND"
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
|
@ -132,5 +151,18 @@ enum {
|
|||
// membership test.
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// Match Engine State Stack Frame Layout.
|
||||
//
|
||||
struct REStackFrame {
|
||||
int32_t fInputIdx; // Position of next character in the input string
|
||||
int32_t fPatIdx; // Position of next Op in the compiled pattern
|
||||
int32_t fExtra[2]; // Extra state, for capture group start/ends
|
||||
// atomic parentheses, repeat counts, etc.
|
||||
// Locations assigned at pattern compile time.
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
||||
|
|
|
@ -38,23 +38,14 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
|
|||
fInputUC = NULL;
|
||||
fInputLength = 0;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fBackTrackStack = new UVector32(status); // TODO: do something with status.
|
||||
fCaptureStarts = new UVector32(status);
|
||||
fCaptureEnds = new UVector32(status);
|
||||
int i;
|
||||
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
|
||||
fCaptureStarts->addElement(-1, status);
|
||||
fCaptureEnds ->addElement(-1, status);
|
||||
}
|
||||
fStack = new UVector32(status); // TODO: do something with status.
|
||||
reset();
|
||||
}
|
||||
|
||||
|
||||
|
||||
RegexMatcher::~RegexMatcher() {
|
||||
delete fBackTrackStack;
|
||||
delete fCaptureStarts;
|
||||
delete fCaptureEnds;
|
||||
delete fStack;
|
||||
}
|
||||
|
||||
|
||||
|
@ -193,7 +184,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
|
|||
err = U_REGEX_INVALID_STATE;
|
||||
return -1;
|
||||
}
|
||||
if (group < 0 || group > fPattern->fNumCaptureGroups) {
|
||||
if (group < 0 || group > fPattern->fGroupMap->size()) {
|
||||
err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
@ -201,13 +192,19 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
|
|||
if (group == 0) {
|
||||
e = fMatchEnd;
|
||||
} else {
|
||||
// Get the position within the stack frame of the variables for
|
||||
// this capture group.
|
||||
int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
|
||||
U_ASSERT(groupOffset < fPattern->fFrameSize);
|
||||
U_ASSERT(groupOffset >= 0);
|
||||
|
||||
// Note: When the match engine backs out of a capture group, it sets the
|
||||
// group's start position to -1. The end position is left with junk.
|
||||
// So, before returning an end position, we must first check that
|
||||
// the start position indicates that the group matched something.
|
||||
int32_t s = fCaptureStarts->elementAti(group);
|
||||
int32_t s = fFrame->fExtra[groupOffset];
|
||||
if (s != -1) {
|
||||
e = fCaptureEnds->elementAti(group);
|
||||
e = fFrame->fExtra[groupOffset + 1];
|
||||
}
|
||||
}
|
||||
return e;
|
||||
|
@ -301,7 +298,7 @@ UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
|
|||
|
||||
|
||||
int32_t RegexMatcher::groupCount() const {
|
||||
return fPattern->fNumCaptureGroups;
|
||||
return fPattern->fGroupMap->size();
|
||||
}
|
||||
|
||||
|
||||
|
@ -398,11 +395,7 @@ RegexMatcher &RegexMatcher::reset() {
|
|||
fMatchEnd = 0;
|
||||
fLastMatchEnd = 0;
|
||||
fMatch = FALSE;
|
||||
int i;
|
||||
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
|
||||
fCaptureStarts->setElementAt(-1, i);
|
||||
}
|
||||
|
||||
resetStack();
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -418,6 +411,20 @@ RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
|
|||
|
||||
|
||||
|
||||
REStackFrame *RegexMatcher::resetStack() {
    // Throw away whatever is on the state save stack and start over with a
    //   single frame of fPattern->fFrameSize words, every one set to -1.
    //   For capture group limits, -1 marks a group that has not matched anything yet.
    fStack->removeAllElements();

    UErrorCode status = U_ZERO_ERROR;      // TODO:  do something with status
    int32_t *frameWords = fStack->reserveBlock(fPattern->fFrameSize, status);

    int32_t slot = fPattern->fFrameSize;
    while (slot > 0) {
        slot--;
        frameWords[slot] = -1;
    }
    return (REStackFrame *)frameWords;
}
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// start
|
||||
|
@ -438,7 +445,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
|
|||
err = U_REGEX_INVALID_STATE;
|
||||
return -1;
|
||||
}
|
||||
if (group < 0 || group > fPattern->fNumCaptureGroups) {
|
||||
if (group < 0 || group > fPattern->fGroupMap->size()) {
|
||||
err = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
@ -446,7 +453,10 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
|
|||
if (group == 0) {
|
||||
s = fMatchStart;
|
||||
} else {
|
||||
s = fCaptureStarts->elementAti(group);
|
||||
int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
|
||||
U_ASSERT(groupOffset < fPattern->fFrameSize);
|
||||
U_ASSERT(groupOffset >= 0);
|
||||
s = fFrame->fExtra[groupOffset];
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
@ -501,28 +511,37 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
|
|||
return isBoundary;
|
||||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// backTrack Within the match engine, this function is called when
|
||||
// a local match failure occurs, and the match needs to back
|
||||
// track and proceed down another path.
|
||||
// StateSave
|
||||
// Make a new stack frame, initialized as a copy of the current stack frame.
|
||||
// Set the pattern index in the original stack frame from the operand value
|
||||
// in the opcode. Execution of the engine continues with the state in
|
||||
// the newly created stack frame
|
||||
//
|
||||
// Note: Inline function. Keep its body above MatchAt().
|
||||
// Note that reserveBlock() may grow the stack, resulting in the
|
||||
// whole thing being relocated in memory.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
void RegexMatcher::backTrack(int32_t &inputIdx, int32_t &patIdx) {
|
||||
int32_t *sp = fBackTrackStack->popBlock(fCaptureStateSize);
|
||||
int i;
|
||||
for (i=fPattern->fNumCaptureGroups; i>=1; i--) {
|
||||
fCapStarts[i] = *sp++;
|
||||
fCapEnds[i] = *sp++;
|
||||
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) {
|
||||
// push storage for a new frame.
|
||||
int32_t *newFP = fStack->reserveBlock(frameSize, status);
|
||||
fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack.
|
||||
|
||||
// New stack frame = copy of old top frame.
|
||||
int32_t *source = (int32_t *)fp;
|
||||
int32_t *dest = newFP;
|
||||
for (;;) {
|
||||
*dest++ = *source++;
|
||||
if (source == newFP) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
patIdx = *sp++;
|
||||
inputIdx = *sp++;
|
||||
|
||||
fp->fPatIdx = savePatIdx;
|
||||
return (REStackFrame *)newFP;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -530,8 +549,6 @@ void RegexMatcher::backTrack(int32_t &inputIdx, int32_t &patIdx) {
|
|||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
int32_t inputIdx = startIdx; // Current position in the input string.
|
||||
int32_t patIdx = 0; // Current position in the compiled pattern.
|
||||
UBool isMatch = FALSE; // True if the we have a match.
|
||||
|
||||
int32_t op; // Operation from the compiled pattern, split into
|
||||
|
@ -565,40 +582,35 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
return;
|
||||
}
|
||||
|
||||
// Clear out capture results from any previous match.
|
||||
// Required for capture groups in patterns with | operations that may not match at all,
|
||||
// although the pattern as a whole does match.
|
||||
int i;
|
||||
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
|
||||
fCaptureStarts->setElementAt(-1, i);
|
||||
}
|
||||
|
||||
// Cache frequently referenced items from the compiled pattern
|
||||
// in local variables.
|
||||
//
|
||||
int32_t *pat = fPattern->fCompiledPat->getBuffer();
|
||||
fCapStarts = fCaptureStarts->getBuffer();
|
||||
fCapEnds = fCaptureEnds->getBuffer();
|
||||
fCaptureStateSize = fPattern->fNumCaptureGroups*2 + 2;
|
||||
int32_t *pat = fPattern->fCompiledPat->getBuffer();
|
||||
|
||||
const UChar *litText = fPattern->fLiteralText.getBuffer();
|
||||
UVector *sets = fPattern->fSets;
|
||||
int32_t inputLen = fInput->length();
|
||||
|
||||
REStackFrame *fp = resetStack();
|
||||
int32_t frameSize = fPattern->fFrameSize;
|
||||
|
||||
fp->fPatIdx = 0;
|
||||
fp->fInputIdx = startIdx;
|
||||
|
||||
|
||||
//
|
||||
// Main loop for interpreting the compiled pattern.
|
||||
// One iteration of the loop per pattern operation performed.
|
||||
//
|
||||
for (;;) {
|
||||
op = pat[patIdx];
|
||||
op = pat[fp->fPatIdx];
|
||||
opType = URX_TYPE(op);
|
||||
opValue = URX_VAL(op);
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
printf("inputIdx=%d inputChar=%c ", inputIdx, fInput->char32At(inputIdx));
|
||||
fPattern->dumpOp(patIdx);
|
||||
printf("inputIdx=%d inputChar=%c ", fp->fInputIdx, fInput->char32At(fp->fInputIdx));
|
||||
fPattern->dumpOp(fp->fPatIdx);
|
||||
#endif
|
||||
patIdx++;
|
||||
fp->fPatIdx++;
|
||||
|
||||
switch (opType) {
|
||||
|
||||
|
@ -611,19 +623,19 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
// Force a backtrack. In some circumstances, the pattern compiler
|
||||
// will notice that the pattern can't possibly match anything, and will
|
||||
// emit one of these at that point.
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
|
||||
|
||||
case URX_ONECHAR:
|
||||
if (inputIdx < fInputLength) {
|
||||
if (fp->fInputIdx < fInputLength) {
|
||||
UChar32 c;
|
||||
U16_NEXT(fInputUC, inputIdx, fInputLength, c);
|
||||
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
|
||||
if (c == opValue) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
|
||||
|
||||
|
@ -635,21 +647,21 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
int32_t stringStartIdx, stringLen;
|
||||
stringStartIdx = opValue;
|
||||
|
||||
op = pat[patIdx];
|
||||
patIdx++;
|
||||
op = pat[fp->fPatIdx];
|
||||
fp->fPatIdx++;
|
||||
opType = URX_TYPE(op);
|
||||
opValue = URX_VAL(op);
|
||||
U_ASSERT(opType == URX_STRING_LEN);
|
||||
stringLen = opValue;
|
||||
|
||||
int32_t stringEndIndex = inputIdx + stringLen;
|
||||
int32_t stringEndIndex = fp->fInputIdx + stringLen;
|
||||
if (stringEndIndex <= inputLen &&
|
||||
u_strncmp(fInputUC+inputIdx, litText+stringStartIdx, stringLen) == 0) {
|
||||
u_strncmp(fInputUC+fp->fInputIdx, litText+stringStartIdx, stringLen) == 0) {
|
||||
// Success. Advance the current input position.
|
||||
inputIdx = stringEndIndex;
|
||||
fp->fInputIdx = stringEndIndex;
|
||||
} else {
|
||||
// No match. Back up matching to a saved state
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -657,18 +669,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
|
||||
|
||||
case URX_STATE_SAVE:
|
||||
// Save the state of all capture groups, the pattern continuation
|
||||
// position and the input position.
|
||||
{
|
||||
int32_t *stackPtr = fBackTrackStack->reserveBlock(fCaptureStateSize, status);
|
||||
int i;
|
||||
for (i=fPattern->fNumCaptureGroups; i>0; i--) {
|
||||
*stackPtr++ = fCapStarts[i];
|
||||
*stackPtr++ = fCapEnds[i];
|
||||
}
|
||||
*stackPtr++ = opValue;
|
||||
*stackPtr++ = inputIdx;
|
||||
}
|
||||
fp = StateSave(fp, opValue, frameSize, status);
|
||||
break;
|
||||
|
||||
|
||||
|
@ -679,69 +680,69 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
goto breakFromLoop;
|
||||
|
||||
case URX_START_CAPTURE:
|
||||
U_ASSERT(opValue > 0 && opValue <= fPattern->fNumCaptureGroups);
|
||||
fCapStarts[opValue] = inputIdx;
|
||||
U_ASSERT(opValue >= 0 && opValue < frameSize-3);
|
||||
fp->fExtra[opValue] = fp->fInputIdx;
|
||||
break;
|
||||
|
||||
|
||||
case URX_END_CAPTURE:
|
||||
U_ASSERT(opValue > 0 && opValue <= fPattern->fNumCaptureGroups);
|
||||
U_ASSERT(fCaptureStarts->elementAti(opValue) >= 0);
|
||||
fCapEnds[opValue] = inputIdx;
|
||||
U_ASSERT(opValue > 0 && opValue < frameSize-2);
|
||||
U_ASSERT(fp->fExtra[opValue-1] >= 0); // Start pos for this group must be set.
|
||||
fp->fExtra[opValue] = fp->fInputIdx;
|
||||
break;
|
||||
|
||||
|
||||
case URX_DOLLAR: // $, test for End of line
|
||||
// or for position before new line at end of input
|
||||
if (inputIdx < inputLen-2) {
|
||||
if (fp->fInputIdx < inputLen-2) {
|
||||
// We are nowhere near the end of input. Fail.
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
if (inputIdx >= inputLen) {
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
// We really are at the end of input. Success.
|
||||
break;
|
||||
}
|
||||
// If we are positioned just before a new-line that is located at the
|
||||
// end of input, succeed.
|
||||
if (inputIdx == inputLen-1) {
|
||||
UChar32 c = fInput->char32At(inputIdx);
|
||||
if (fp->fInputIdx == inputLen-1) {
|
||||
UChar32 c = fInput->char32At(fp->fInputIdx);
|
||||
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
break; // At new-line at end of input. Success
|
||||
}
|
||||
}
|
||||
|
||||
if (inputIdx == inputLen-2) {
|
||||
if (fInput->char32At(inputIdx) == 0x0d && fInput->char32At(inputIdx+1) == 0x0a) {
|
||||
if (fp->fInputIdx == inputLen-2) {
|
||||
if (fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
|
||||
break; // At CR/LF at end of input. Success
|
||||
}
|
||||
}
|
||||
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
|
||||
// TODO: support for multi-line mode.
|
||||
break;
|
||||
|
||||
|
||||
case URX_CARET: // ^, test for start of line
|
||||
if (inputIdx != 0) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
if (fp->fInputIdx != 0) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
} // TODO: support for multi-line mode.
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_A: // Test for start of input
|
||||
if (inputIdx != 0) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
if (fp->fInputIdx != 0) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_BACKSLASH_B: // Test for word boundaries
|
||||
{
|
||||
UBool success = isWordBoundary(inputIdx);
|
||||
UBool success = isWordBoundary(fp->fInputIdx);
|
||||
success ^= (opValue != 0); // flip sense for \B
|
||||
if (!success) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -749,19 +750,19 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
|
||||
case URX_BACKSLASH_D: // Test for decimal digit
|
||||
{
|
||||
if (inputIdx >= fInputLength) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
if (fp->fInputIdx >= fInputLength) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
|
||||
UChar32 c = fInput->char32At(inputIdx);
|
||||
UChar32 c = fInput->char32At(fp->fInputIdx);
|
||||
int8_t ctype = u_charType(c);
|
||||
UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
|
||||
success ^= (opValue != 0); // flip sense for \D
|
||||
if (success) {
|
||||
inputIdx = fInput->moveIndex32(inputIdx, 1);
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
} else {
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -770,8 +771,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
|
||||
|
||||
case URX_BACKSLASH_G: // Test for position at end of previous match
|
||||
if (!((fMatch && inputIdx==fMatchEnd) || fMatch==FALSE && inputIdx==0)) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==0)) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -779,20 +780,20 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
case URX_BACKSLASH_X: // Match combining character sequence
|
||||
{ // Closer to Grapheme cluster than to Perl \X
|
||||
// Fail if at end of input
|
||||
if (inputIdx >= fInputLength) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
if (fp->fInputIdx >= fInputLength) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
|
||||
// Always consume one char
|
||||
UChar32 c = fInput->char32At(inputIdx);
|
||||
inputIdx = fInput->moveIndex32(inputIdx, 1);
|
||||
UChar32 c = fInput->char32At(fp->fInputIdx);
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
|
||||
// Consume CR/LF as a pair
|
||||
if (c == 0x0d) {
|
||||
UChar32 c = fInput->char32At(inputIdx);
|
||||
UChar32 c = fInput->char32At(fp->fInputIdx);
|
||||
if (c == 0x0a) {
|
||||
inputIdx = fInput->moveIndex32(inputIdx, 1);
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -801,15 +802,15 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
int8_t ctype = u_charType(c);
|
||||
if (ctype != U_CONTROL_CHAR) {
|
||||
for(;;) {
|
||||
c = fInput->char32At(inputIdx);
|
||||
c = fInput->char32At(fp->fInputIdx);
|
||||
ctype = u_charType(c);
|
||||
// TODO: make a set and add the "other grapheme extend" chars
|
||||
// to the list of stuff to be skipped over.
|
||||
if (!(ctype == U_NON_SPACING_MARK || ctype == U_ENCLOSING_MARK)) {
|
||||
break;
|
||||
}
|
||||
inputIdx = fInput->moveIndex32(inputIdx, 1);
|
||||
if (inputIdx >= fInputLength) {
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
if (fp->fInputIdx >= fInputLength) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -820,8 +821,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
|
||||
|
||||
case URX_BACKSLASH_Z: // Test for end of line
|
||||
if (inputIdx < inputLen) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
if (fp->fInputIdx < inputLen) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -836,10 +837,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
// 1: success if input char is not in set.
|
||||
UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
|
||||
opValue &= ~URX_NEG_SET;
|
||||
if (inputIdx < fInputLength) {
|
||||
if (fp->fInputIdx < fInputLength) {
|
||||
// There is input left. Pick up one char and test it for set membership.
|
||||
UChar32 c;
|
||||
U16_NEXT(fInputUC, inputIdx, fInputLength, c);
|
||||
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
|
||||
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c)) {
|
||||
|
@ -847,17 +848,17 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
}
|
||||
}
|
||||
if (!success) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_SETREF:
|
||||
if (inputIdx < fInputLength) {
|
||||
if (fp->fInputIdx < fInputLength) {
|
||||
// There is input left. Pick up one char and test it for set membership.
|
||||
UChar32 c;
|
||||
U16_NEXT(fInputUC, inputIdx, fInputLength, c);
|
||||
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
|
||||
U_ASSERT(opValue > 0 && opValue < sets->size());
|
||||
UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
|
||||
if (s->contains(c)) {
|
||||
|
@ -867,25 +868,25 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
}
|
||||
// Either at end of input, or the character wasn't in the set.
|
||||
// Either way, we need to back track out.
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
|
||||
|
||||
case URX_DOTANY:
|
||||
{
|
||||
// . matches anything
|
||||
if (inputIdx >= fInputLength) {
|
||||
if (fp->fInputIdx >= fInputLength) {
|
||||
// At end of input. Match failed. Backtrack out.
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
// There is input left. Advance over one char, unless we've hit end-of-line
|
||||
UChar32 c;
|
||||
U16_NEXT(fInputUC, inputIdx, fInputLength, c);
|
||||
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
|
||||
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
|
||||
(c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
|
||||
// End of line in normal mode. . does not match.
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -896,32 +897,81 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
{
|
||||
// ., in dot-matches-all (including new lines) mode
|
||||
// . matches anything
|
||||
if (inputIdx >= fInputLength) {
|
||||
if (fp->fInputIdx >= fInputLength) {
|
||||
// At end of input. Match failed. Backtrack out.
|
||||
backTrack(inputIdx, patIdx);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
// There is input left. Advance over one char; in this mode line terminators match too.
|
||||
UChar32 c = fInput->char32At(inputIdx);
|
||||
inputIdx = fInput->moveIndex32(inputIdx, 1);
|
||||
UChar32 c = fInput->char32At(fp->fInputIdx);
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
// In the case of a CR/LF, we need to advance over both.
|
||||
UChar32 nextc = fInput->char32At(inputIdx);
|
||||
UChar32 nextc = fInput->char32At(fp->fInputIdx);
|
||||
if (c == 0x0d && nextc == 0x0a) {
|
||||
inputIdx = fInput->moveIndex32(inputIdx, 1);
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_JMP:
|
||||
patIdx = opValue;
|
||||
fp->fPatIdx = opValue;
|
||||
break;
|
||||
|
||||
case URX_FAIL:
|
||||
isMatch = FALSE;
|
||||
goto breakFromLoop;
|
||||
|
||||
case URX_CTR_INIT:
|
||||
{
|
||||
U_ASSERT(opValue >= 0 && opValue < frameSize-2);
|
||||
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
|
||||
|
||||
// Pick up the three extra operands that CTR_INIT has, and
|
||||
// skip the pattern location counter past
|
||||
int32_t instrOperandLoc = fp->fPatIdx;
|
||||
fp->fPatIdx += 3;
|
||||
int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
|
||||
int32_t minCount = pat[instrOperandLoc+1];
|
||||
int32_t maxCount = pat[instrOperandLoc+2];
|
||||
U_ASSERT(minCount>=0);
|
||||
U_ASSERT(maxCount>=minCount || maxCount==-1);
|
||||
U_ASSERT(loopLoc>fp->fPatIdx);
|
||||
|
||||
if (minCount == 0) {
|
||||
fp = StateSave(fp, loopLoc+1, frameSize, status);
|
||||
}
|
||||
if (maxCount == 0) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_CTR_LOOP:
|
||||
{
|
||||
U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
|
||||
int32_t initOp = pat[opValue];
|
||||
U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
|
||||
int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
|
||||
int32_t minCount = pat[opValue+2];
|
||||
int32_t maxCount = pat[opValue+3];
|
||||
// Increment the counter. Note: we're not worrying about counter
|
||||
// overflow, since the data comes from UnicodeStrings, which
|
||||
// stores its length in an int32_t.
|
||||
(*pCounter)++;
|
||||
U_ASSERT(*pCounter > 0);
|
||||
if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
|
||||
U_ASSERT(*pCounter == maxCount || maxCount == -1);
|
||||
break;
|
||||
}
|
||||
if (*pCounter >= minCount) {
|
||||
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
|
||||
}
|
||||
fp->fPatIdx = opValue + 4; // Loop back.
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
// Trouble. The compiled pattern contains an entry with an
|
||||
|
@ -939,13 +989,18 @@ breakFromLoop:
|
|||
if (isMatch) {
|
||||
fLastMatchEnd = fMatchEnd;
|
||||
fMatchStart = startIdx;
|
||||
fMatchEnd = inputIdx;
|
||||
fMatchEnd = fp->fInputIdx;
|
||||
REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd);
|
||||
}
|
||||
else
|
||||
{
|
||||
REGEX_RUN_DEBUG_PRINTF("No match\n\n");
|
||||
}
|
||||
|
||||
fFrame = fp; // The active stack frame when the engine stopped.
|
||||
// Contains the capture group results that we need to
|
||||
// access later.
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -66,7 +66,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fFlags = other.fFlags;
|
||||
fLiteralText = other.fLiteralText;
|
||||
fBadState = other.fBadState;
|
||||
fNumCaptureGroups = other.fNumCaptureGroups;
|
||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||
fStaticSets = other.fStaticSets;
|
||||
if (fBadState) {
|
||||
|
@ -74,11 +73,10 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
}
|
||||
|
||||
// Copy the pattern. It's just values, nothing deep to copy.
|
||||
int i;
|
||||
// TODO: something with status
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
for (i=0; i<other.fCompiledPat->size(); i++) {
|
||||
fCompiledPat->addElement(other.fCompiledPat->elementAti(i), status);
|
||||
}
|
||||
fCompiledPat->assign(*other.fCompiledPat, status);
|
||||
fGroupMap->assign(*other.fGroupMap, status);
|
||||
|
||||
// Note: do not copy fMatcher. It'll be created on first use if the
|
||||
// destination needs one.
|
||||
|
@ -87,6 +85,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
// Could be made more efficient if the sets were reference counted and shared,
|
||||
// but I doubt that pattern copying will be particularly common.
|
||||
// Note: init() already added an empty element zero to fSets
|
||||
int32_t i;
|
||||
for (i=1; i<other.fSets->size(); i++) {
|
||||
UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
|
||||
UnicodeSet *newSet = new UnicodeSet(*sourceSet);
|
||||
|
@ -112,14 +111,15 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
void RegexPattern::init() {
|
||||
fFlags = 0;
|
||||
fBadState = FALSE;
|
||||
fNumCaptureGroups = 0;
|
||||
fMaxCaptureDigits = 1; // TODO: calculate for real.
|
||||
fStaticSets = NULL;
|
||||
fMatcher = NULL;
|
||||
fFrameSize = 0;
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
// Init of a completely new RegexPattern.
|
||||
fCompiledPat = new UVector32(status);
|
||||
fGroupMap = new UVector32(status);
|
||||
fSets = new UVector(status);
|
||||
if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL) {
|
||||
fBadState = TRUE;
|
||||
|
@ -151,6 +151,8 @@ void RegexPattern::zap() {
|
|||
}
|
||||
delete fSets;
|
||||
fSets = NULL;
|
||||
delete fGroupMap;
|
||||
fGroupMap = NULL;
|
||||
}
|
||||
|
||||
|
||||
|
@ -367,6 +369,7 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
|||
// Loop through the input text, searching for the delimiter pattern
|
||||
//
|
||||
int i;
|
||||
int32_t numCaptureGroups = fGroupMap->size();
|
||||
for (i=0; ; i++) {
|
||||
if (i==destCapacity-1) {
|
||||
// There is only one output string left.
|
||||
|
@ -384,7 +387,7 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
|||
// If the delimiter pattern has capturing parentheses, the captured
|
||||
// text goes out into the next n destination strings.
|
||||
int32_t groupNum;
|
||||
for (groupNum=1; groupNum<=this->fNumCaptureGroups; groupNum++) {
|
||||
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
|
||||
if (i==destCapacity-1) {
|
||||
break;
|
||||
}
|
||||
|
@ -446,6 +449,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
// Types with no operand field of interest.
|
||||
break;
|
||||
|
||||
case URX_RESERVED_OP:
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_STATE_SAVE:
|
||||
|
@ -457,6 +461,14 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
case URX_STRING_LEN:
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_INIT_P:
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
case URX_RELOC_OPRND:
|
||||
|
||||
// types with an integer operand field.
|
||||
REGEX_DUMP_DEBUG_PRINTF("%d", val);
|
||||
break;
|
||||
|
|
|
@ -48,6 +48,7 @@ class RegexMatcher;
|
|||
class UVector;
|
||||
class UVector32;
|
||||
class UnicodeSet;
|
||||
struct REStackFrame;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -312,7 +313,12 @@ private:
|
|||
// split(), to avoid having to
|
||||
// make new ones on each call.
|
||||
|
||||
int32_t fNumCaptureGroups;
|
||||
int32_t fFrameSize; // Size of a state stack frame in the
|
||||
// execution engine.
|
||||
|
||||
UVector32 *fGroupMap; // Map from capture group number to position of
|
||||
// the group's variables in the matcher stack frame.
|
||||
|
||||
int32_t fMaxCaptureDigits;
|
||||
|
||||
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
||||
|
@ -658,9 +664,12 @@ private:
|
|||
// MatchAt This is the internal interface to the match engine itself.
|
||||
// Match status comes back in matcher member variables.
|
||||
//
|
||||
void MatchAt(int32_t startIdx, UErrorCode &status);
|
||||
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
|
||||
UBool isWordBoundary(int32_t pos); // perform the \b test
|
||||
void MatchAt(int32_t startIdx, UErrorCode &status);
|
||||
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
|
||||
UBool isWordBoundary(int32_t pos); // perform the \b test
|
||||
REStackFrame *resetStack();
|
||||
inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
|
||||
int32_t frameSize, UErrorCode &status);
|
||||
|
||||
|
||||
const RegexPattern *fPattern;
|
||||
|
@ -672,14 +681,11 @@ private:
|
|||
int32_t fMatchStart; // Position of the start of the most recent match
|
||||
int32_t fMatchEnd; // First position after the end of the most recent match
|
||||
int32_t fLastMatchEnd; // First position after the end of the previous match.
|
||||
UVector32 *fBackTrackStack;
|
||||
UVector32 *fCaptureStarts;
|
||||
UVector32 *fCaptureEnds;
|
||||
|
||||
// Cache the capture vector data pointers, for faster access.
|
||||
int32_t *fCapStarts;
|
||||
int32_t *fCapEnds;
|
||||
int32_t fCaptureStateSize;
|
||||
UVector32 *fStack;
|
||||
REStackFrame *fFrame; // After finding a match, the last active stack
|
||||
// frame, which will contain the capture group results.
|
||||
// NOT valid while match engine is running.
|
||||
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
|
|
|
@ -368,8 +368,8 @@ void RegexTest::Basic() {
|
|||
//
|
||||
#if 0
|
||||
{
|
||||
//REGEX_TESTLM("X(.+)+X", "nomatch", TRUE, TRUE);
|
||||
REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
|
||||
REGEX_TESTLM("A{3}BC", "AAABC", TRUE, TRUE);
|
||||
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
|
||||
}
|
||||
exit(1);
|
||||
#endif
|
||||
|
@ -1195,6 +1195,34 @@ void RegexTest::Extended() {
|
|||
REGEX_FIND("AB\\.C\\eD\\u0666E", "<0>AB.C\\u001BD\\u0666E</0>F");
|
||||
|
||||
|
||||
// {min,max} iteration quantifier
|
||||
REGEX_TESTLM("A{3}BC", "AAABC", TRUE, TRUE);
|
||||
|
||||
REGEX_FIND("(ABC){2,3}AB", "no matchAB");
|
||||
REGEX_FIND("(ABC){2,3}AB", "ABCAB");
|
||||
REGEX_FIND("(ABC){2,3}AB", "<0>ABC<1>ABC</1>AB</0>");
|
||||
REGEX_FIND("(ABC){2,3}AB", "<0>ABCABC<1>ABC</1>AB</0>");
|
||||
REGEX_FIND("(ABC){2,3}AB", "<0>ABCABC<1>ABC</1>AB</0>CAB");
|
||||
|
||||
REGEX_FIND("(ABC){2}AB", "ABCAB");
|
||||
REGEX_FIND("(ABC){2}AB", "<0>ABC<1>ABC</1>AB</0>");
|
||||
REGEX_FIND("(ABC){2}AB", "<0>ABC<1>ABC</1>AB</0>CAB");
|
||||
REGEX_FIND("(ABC){2}AB", "<0>ABC<1>ABC</1>AB</0>CABCAB");
|
||||
|
||||
REGEX_FIND("(ABC){2,}AB", "ABCAB");
|
||||
REGEX_FIND("(ABC){2,}AB", "<0>ABC<1>ABC</1>AB</0>");
|
||||
REGEX_FIND("(ABC){2,}AB", "<0>ABCABC<1>ABC</1>AB</0>");
|
||||
REGEX_FIND("(ABC){2,}AB", "<0>ABCABCABC<1>ABC</1>AB</0>");
|
||||
|
||||
REGEX_FIND("X{0,0}ABC", "<0>ABC</0>");
|
||||
REGEX_FIND("X{0,1}ABC", "<0>ABC</0>");
|
||||
|
||||
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "Hello there");
|
||||
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "<0>Hello<1>!</1> there</0>");
|
||||
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "<0>Hello<1>!!</1> there</0>");
|
||||
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "<0>Hello<1>!!!</1> there</0>");
|
||||
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "Hello!!!! there");
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -1234,9 +1262,6 @@ void RegexTest::Errors() {
|
|||
// Atomic Grouping
|
||||
REGEX_ERR("abc(?>xyz)", 1, 6, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// {Numeric Quantifiers}
|
||||
REGEX_ERR("abc{4}", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// Possessive Quantifiers
|
||||
REGEX_ERR("abc++d", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("abc*+d", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
|
|
Loading…
Add table
Reference in a new issue