ICU-2422 regexp, {min,max} quantifiers added.

X-SVN-Rev: 10859
This commit is contained in:
Andy Heninger 2003-01-16 01:12:04 +00:00
parent c11f4d65b3
commit 01147100bd
12 changed files with 584 additions and 229 deletions

View file

@ -1840,6 +1840,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_PROPERTY_SYNTAX",
"U_REGEX_UNIMPLEMENTED",
"U_REGEX_MISMATCHED_PAREN",
"U_REGEX_NUMBER_TOO_BIG",
"U_REGEX_BAD_INTERVAL"
};
U_CAPI const char * U_EXPORT2

View file

@ -625,6 +625,8 @@ typedef enum UErrorCode {
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */

View file

@ -204,8 +204,8 @@ public:
int32_t push(int32_t i, UErrorCode &status);
int32_t *reserveBlock(int32_t size, UErrorCode &status);
int32_t *popBlock(int32_t amount);
};
int32_t *popFrame(int32_t size);
};
// UVector32 inlines
@ -237,6 +237,15 @@ inline int32_t *UVector32::reserveBlock(int32_t size, UErrorCode &status) {
return rp;
}
inline int32_t *UVector32::popFrame(int32_t size) {
    // Discard the top `size` words of the vector and hand back a pointer to
    // the `size` words that are now on top, i.e. the frame sitting directly
    // beneath the one that was just removed.
    //
    // If fewer than `size` words remain after the pop, the returned pointer
    // lies before `elements`; such a result must not be dereferenced.
    U_ASSERT(count >= size);
    int32_t remaining = count - size;
    if (remaining < 0) {
        // Only reachable when the assertion above is compiled out.
        remaining = 0;
    }
    count = remaining;
    return elements + count - size;
}
inline int32_t UVector32::size(void) const {
@ -288,13 +297,6 @@ inline int32_t UVector32::popi(void) {
return result;
}
inline int32_t *UVector32::popBlock(int32_t amount) {
    // Remove `amount` words from the end of the vector and return a pointer
    // to the first removed word (the new end of the live data).
    U_ASSERT(amount <= count);
    int32_t newCount = count - amount;
    // Clamp rather than go negative when the assertion is compiled out.
    count = (newCount < 0) ? 0 : newCount;
    return elements + count;
}
U_NAMESPACE_END
#endif

View file

@ -145,13 +145,14 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
{
fStatus = &status;
fScanIndex = 0;
fNextIndex = 0;
fPeekChar = -1;
fLineNum = 1;
fCharNum = 0;
fQuoteMode = FALSE;
fFreeForm = FALSE;
fScanIndex = 0;
fNextIndex = 0;
fPeekChar = -1;
fLineNum = 1;
fCharNum = 0;
fQuoteMode = FALSE;
fFreeForm = FALSE;
fMatcherDataEnd = 0;
fMatchOpenParen = -1;
fMatchCloseParen = -1;
@ -374,13 +375,22 @@ void RegexCompile::compile(
fRXPat->fMaxCaptureDigits = 1;
int32_t n = 10;
for (;;) {
if (n > fRXPat->fNumCaptureGroups) {
if (n > fRXPat->fGroupMap->size()) {
break;
}
fRXPat->fMaxCaptureDigits++;
n *= 10;
}
//
// The pattern's fFrameSize so far has accumulated the requirements for
// storage for capture parentheses, counters, etc. that are encountered
// in the pattern. Add space for the two variables that are always
// present in the saved state: the input string position and the
// position in the compiled pattern.
//
fRXPat->fFrameSize+=2;
//
// A stupid bit of non-sense to prevent code coverage testing from complaining
// about the pattern.dump() debug function. Go through the motions of dumping,
@ -499,8 +509,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
// is an '|' alternation within the parens.
{
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
fRXPat->fNumCaptureGroups++;
int32_t cop = URX_BUILD(URX_START_CAPTURE, fRXPat->fNumCaptureGroups);
int32_t varsLoc = fRXPat->fFrameSize; // Reserve two slots in match stack frame.
fRXPat->fFrameSize += 2;
int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
fRXPat->fCompiledPat->addElement(cop, *fStatus);
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
@ -511,6 +522,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
fParenStack.push(-2, *fStatus); // Begin a new frame.
fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
// Save the mapping from group number to stack frame variable position.
fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
}
break;
@ -704,6 +718,64 @@ UBool RegexCompile::doParseActions(EParseAction action)
break;
case doIntervalInit:
// The '{' opening an interval quantifier was just scanned.
// Init the counter variables that will accumulate the values as the digits
// are scanned.
fIntervalLow = 0;
fIntervalUpper = -1;
break;
case doIntevalLowerDigit:
    // Scanned a digit from the lower value of an {lower,upper} interval.
    // Append it to the decimal value accumulated so far in fIntervalLow.
    {
        int32_t digitValue = u_charDigitValue(fC.fChar);
        U_ASSERT(digitValue >= 0);

        // Test for overflow BEFORE doing the arithmetic.  Checking for a
        // negative result afterwards is unreliable: signed overflow is
        // undefined, and a wrapped product can just as well come out positive
        // (429496730*10 wraps to 4), which would silently accept a bogus count.
        const int32_t maxInt32 = 0x7fffffff;
        if (fIntervalLow > maxInt32/10 ||
            (fIntervalLow == maxInt32/10 && digitValue > maxInt32%10)) {
            // Leave fIntervalLow at its last good value and report the error.
            error(U_REGEX_NUMBER_TOO_BIG);
        } else {
            fIntervalLow = fIntervalLow*10 + digitValue;
        }
    }
    break;
case doIntervalUpperDigit:
    // Scanned a digit from the upper value of an {lower,upper} interval.
    // Append it to the decimal value accumulated so far in fIntervalUpper.
    {
        // fIntervalUpper is negative until the first upper digit is seen
        // (meaning "no upper limit given"); start accumulating from zero.
        if (fIntervalUpper < 0) {
            fIntervalUpper = 0;
        }
        int32_t digitValue = u_charDigitValue(fC.fChar);
        U_ASSERT(digitValue >= 0);

        // Overflow must be tested on the UPPER value (the previous code
        // tested fIntervalLow here), and before the arithmetic is done:
        // signed overflow is undefined and a wrapped product can be positive.
        const int32_t maxInt32 = 0x7fffffff;
        if (fIntervalUpper > maxInt32/10 ||
            (fIntervalUpper == maxInt32/10 && digitValue > maxInt32%10)) {
            // Leave fIntervalUpper at its last good value and report the error.
            error(U_REGEX_NUMBER_TOO_BIG);
        } else {
            fIntervalUpper = fIntervalUpper*10 + digitValue;
        }
    }
    break;
case doIntervalSame:
// Scanned a single value interval like {27}. Upper = Lower.
fIntervalUpper = fIntervalLow;
break;
case doInterval:
// Finished scanning a normal {lower,upper} interval. Generate the code for it.
compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
break;
case doPossesiveInterval:
// Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
compileInterval(URX_CTR_INIT_P, URX_CTR_LOOP_P);
break;
case doNGInterval:
// Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
break;
case doIntervalError:
error(U_REGEX_BAD_INTERVAL);
break;
case doLiteralChar:
// We've just scanned a "normal" character from the pattern,
@ -835,11 +907,6 @@ UBool RegexCompile::doParseActions(EParseAction action)
error(U_REGEX_UNIMPLEMENTED);
break;
case doNotImplementedError:
// TODO: get rid of this once everything is implemented.
error(U_REGEX_UNIMPLEMENTED);
break;
default:
error(U_REGEX_INTERNAL_ERROR);
@ -995,6 +1062,54 @@ void RegexCompile::fixLiterals(UBool split) {
//------------------------------------------------------------------------------
//
// insertOp() Insert a slot for a new opcode into the already
// compiled pattern code.
//
// Fill the slot with a NOP. Our caller will replace it
// with what they really wanted.
//
//------------------------------------------------------------------------------
void RegexCompile::insertOp(int32_t where) {
    // Opens a one-word gap at index `where` in the compiled pattern, fills it
    // with a NOP, and then repairs everything that refers to code locations
    // that moved down by one as a result:
    //   - operands of ops whose value is a location in the compiled pattern
    //   - pattern locations saved on the parentheses stack.
    // where:  index at which the new slot is created.  Must be inside the
    //         existing code and may not be position 0.
    UVector32 *code = fRXPat->fCompiledPat;
    U_ASSERT(where>0 && where < code->size());

    int32_t  nop = URX_BUILD(URX_NOP, 0);
    code->insertElementAt(nop, where, *fStatus);

    // Walk through the pattern, looking for any ops with targets that
    // were moved down by the insert.  Fix them.
    //
    // All three flavors of counted-loop op are included: compileInterval()
    // emits URX_CTR_LOOP, URX_CTR_LOOP_NG and URX_CTR_LOOP_P alike with the
    // location of their CTR_INIT op as the operand, so each of them goes
    // stale if a later insertion lands at or before that location.
    //
    // NOTE(review): every word is decoded as an op here, including the raw
    // min/max count words that follow a CTR_INIT.  Presumably small counts
    // decode to a non-relocatable type; confirm against URX_TYPE that a very
    // large count cannot alias one of the types tested below.
    int32_t loc;
    for (loc=0; loc<code->size(); loc++) {
        int32_t op      = code->elementAti(loc);
        int32_t opType  = URX_TYPE(op);
        int32_t opValue = URX_VAL(op);
        if ((opType == URX_JMP         ||
             opType == URX_STATE_SAVE  ||
             opType == URX_CTR_LOOP    ||
             opType == URX_CTR_LOOP_NG ||
             opType == URX_CTR_LOOP_P  ||
             opType == URX_RELOC_OPRND)   && opValue > where) {
            // Target location for this opcode is after the insertion point and
            // needs to be incremented to adjust for the insertion.
            opValue++;
            op = URX_BUILD(opType, opValue);
            code->setElementAt(op, loc);
        }
    }

    // Now fix up the parentheses stack.  All positive values in it are locations in
    // the compiled pattern.  (Negative values are frame boundaries, and don't need fixing.)
    for (loc=0; loc<fParenStack.size(); loc++) {
        int32_t x = fParenStack.elementAti(loc);
        if (x>where) {
            x++;
            fParenStack.setElementAt(x, loc);
        }
    }
}
//------------------------------------------------------------------------------
//
// blockTopLoc() Find or create a location in the compiled pattern
@ -1007,9 +1122,10 @@ void RegexCompile::fixLiterals(UBool split) {
// is reserved for this purpose. .* or similar don't
// and a slot needs to be added.
//
// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
// at the returned location.
// FALSE - just return the address, reserve a location there.
// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
// at the returned location.
// FALSE - just return the address,
// do not reserve a location there.
//
//------------------------------------------------------------------------------
int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
@ -1097,9 +1213,8 @@ void RegexCompile::handleCloseParen() {
{
int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
int32_t captureGroupNumber = URX_VAL(captureOp);
U_ASSERT(captureGroupNumber > 0);
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, captureGroupNumber);
int32_t framVarLocation = URX_VAL(captureOp);
int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, framVarLocation+1);
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
}
break;
@ -1167,6 +1282,53 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
}
//----------------------------------------------------------------------------------------
//
// compileInterval Generate the code for a {min, max} style interval quantifier.
// Except for the specific opcodes used, the code is the same
// for all three types (greedy, non-greedy, possessive) of
// intervals. The opcodes are supplied as parameters.
//
//----------------------------------------------------------------------------------------
void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
{
    // Emits the code for the interval most recently scanned into
    // fIntervalLow / fIntervalUpper.  InitOp and LoopOp select which flavor
    // of counter-init and counter-loop opcodes are generated.
    UVector32 *code = fRXPat->fCompiledPat;

    // The init op occupies four consecutive words at the top of the block
    // being repeated.  blockTopLoc() provides the first; open up three more.
    const int32_t initLoc = blockTopLoc(TRUE);
    int32_t extraSlots;
    for (extraSlots = 3; extraSlots > 0; extraSlots--) {
        insertOp(initLoc);
    }

    // Word 0: the init opcode.  Its operand is the slot in the match stack
    //         frame that will hold this loop's counter; allocate that slot.
    const int32_t counterSlot = fRXPat->fFrameSize;
    fRXPat->fFrameSize = counterSlot + 1;
    int32_t word = URX_BUILD(InitOp, counterSlot);
    code->setElementAt(word, initLoc);

    // Word 1: the current end of the compiled code, recorded as the loop end.
    //         It is wrapped as a URX_RELOC_OPRND so that insertOp() keeps it
    //         correct if code compiled later makes the pattern grow.
    const int32_t endOfLoop = code->size();
    word = URX_BUILD(URX_RELOC_OPRND, endOfLoop);
    code->setElementAt(word, initLoc+1);

    // Words 2 and 3: the minimum and maximum repeat counts, stored raw.
    code->setElementAt(fIntervalLow,   initLoc+2);
    code->setElementAt(fIntervalUpper, initLoc+3);

    // Append the loop op after the block being repeated.  Its operand points
    // back at the init op.
    word = URX_BUILD(LoopOp, initLoc);
    code->addElement(word, *fStatus);
}
//----------------------------------------------------------------------------------------
//
// Error Report a rule parse error.

View file

@ -81,8 +81,12 @@ private:
// there is space to add an opcode there.
void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for
// a reference to a UnicodeSet.
void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier.
int32_t LoopOp);
void literalChar(); // Compile a literal char
void fixLiterals(UBool split=FALSE); // Fix literal strings.
void insertOp(int32_t where); // Open up a slot for a new op in the
// generated code at the specified location.
UErrorCode *fStatus;
@ -121,9 +125,9 @@ private:
int32_t fPatternLength; // Length of the input pattern string.
UStack fParenStack; // parentheses stack. Each frame consists of
UVector32 fParenStack; // parentheses stack. Each frame consists of
// the positions of compiled pattern operations
// needing fixup, followed by negative vallue. The
// needing fixup, followed by negative value. The
// first entry in each frame is the position of the
// spot reserved for use when a quantifier
// needs to add a SAVE at the start of a (block)
@ -140,6 +144,16 @@ private:
// location after the most recently processed
// parenthesized block.
int32_t fIntervalLow; // {lower, upper} interval quantifier values.
int32_t fIntervalUpper; // Placed here temporarily, when pattern is
// initially scanned. Each new interval
// encountered overwrites these values.
// -1 for the upper interval value means none
// was specified (unlimited occurrences.)
int32_t fMatcherDataEnd; // Location Counter for allocation of data
// to be used by the matcher at match time.
};
U_NAMESPACE_END

View file

@ -28,6 +28,7 @@ enum Regex_PatternParseAction {
doOpenCaptureParen,
doBadOpenParenType,
doRuleError,
doIntevalLowerDigit,
doBackslashs,
doNGOpt,
doNamedChar,
@ -37,12 +38,14 @@ enum Regex_PatternParseAction {
doOpenLookBehind,
doBackslashx,
doBackslashz,
doIntervalError,
doStar,
doCaret,
doEnterQuoteMode,
doPossesivePlus,
doNGStar,
doMatchMode,
doIntervalUpperDigit,
doOpenLookAheadNeg,
doPlus,
doOpenNonCaptureParen,
@ -54,7 +57,10 @@ enum Regex_PatternParseAction {
doPossesiveOpt,
doBackslashG,
doOpt,
doInterval,
doLiteralChar,
doPossesiveInterval,
doIntervalInit,
doOpenAtomicParen,
doBackslashS,
doOpenLookAhead,
@ -67,10 +73,11 @@ enum Regex_PatternParseAction {
doBackslashZ,
doNOP,
doExit,
doNGInterval,
doPatStart,
doBackslashb,
doNotImplementedError,
doBackslashd,
doIntervalSame,
doOpenLookBehindNeg,
rbbiLastAction};
@ -99,13 +106,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doDotAny, 46 /* . */, 12,0, TRUE} // 6
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
, {doNOP, 92 /* \ */, 59,0, TRUE} // 9
, {doNOP, 92 /* \ */, 70,0, TRUE} // 9
, {doPatFinish, 253, 2,0, FALSE} // 10
, {doRuleError, 255, 79,0, FALSE} // 11
, {doRuleError, 255, 90,0, FALSE} // 11
, {doNOP, 42 /* * */, 48,0, TRUE} // 12 expr-quant
, {doNOP, 43 /* + */, 51,0, TRUE} // 13
, {doNOP, 63 /* ? */, 54,0, TRUE} // 14
, {doNOP, 123 /* { */, 57,0, TRUE} // 15
, {doIntervalInit, 123 /* { */, 57,0, TRUE} // 15
, {doNOP, 255, 17,0, FALSE} // 16
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 17 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 18
@ -123,12 +130,12 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 30
, {doMatchMode, 109 /* m */, 40,0, TRUE} // 31
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 32
, {doBadOpenParenType, 255, 79,0, FALSE} // 33
, {doBadOpenParenType, 255, 90,0, FALSE} // 33
, {doOpenLookBehind, 61 /* = */, 2, 17, TRUE} // 34 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 17, TRUE} // 35
, {doBadOpenParenType, 255, 79,0, FALSE} // 36
, {doBadOpenParenType, 255, 90,0, FALSE} // 36
, {doNOP, 41 /* ) */, 2,0, TRUE} // 37 paren-comment
, {doMismatchedParenErr, 253, 79,0, FALSE} // 38
, {doMismatchedParenErr, 253, 90,0, FALSE} // 38
, {doNOP, 255, 37,0, TRUE} // 39
, {doMatchMode, 105 /* i */, 40,0, TRUE} // 40 paren-flag
, {doMatchMode, 115 /* s */, 40,0, TRUE} // 41
@ -137,7 +144,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doMatchMode, 45 /* - */, 40,0, TRUE} // 44
, {doNOP, 41 /* ) */, 2,0, TRUE} // 45
, {doOpenNonCaptureParen, 58 /* : */, 2, 12, TRUE} // 46
, {doNOP, 255, 79,0, FALSE} // 47
, {doNOP, 255, 90,0, FALSE} // 47
, {doNGStar, 63 /* ? */, 17,0, TRUE} // 48 quant-star
, {doPossesiveStar, 43 /* + */, 17,0, TRUE} // 49
, {doStar, 255, 17,0, FALSE} // 50
@ -148,28 +155,39 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doPossesiveOpt, 43 /* + */, 17,0, TRUE} // 55
, {doOpt, 255, 17,0, FALSE} // 56
, {doNOP, 129, 57,0, TRUE} // 57 interval-open
, {doNotImplementedError, 255, 79,0, FALSE} // 58
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 59 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 60
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 61
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 62
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 63
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 64
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 65
, {doProperty, 112 /* p */, 12,0, FALSE} // 66
, {doProperty, 80 /* P */, 12,0, FALSE} // 67
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 68
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 69
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 70
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 71
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 72
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 73
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 74
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 75
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 76
, {doBackRef, 128, 12,0, TRUE} // 77
, {doLiteralChar, 255, 12,0, TRUE} // 78
, {doExit, 255, 79,0, TRUE} // 79 errorDeath
, {doNOP, 128, 60,0, FALSE} // 58
, {doIntervalError, 255, 90,0, FALSE} // 59
, {doIntevalLowerDigit, 128, 60,0, TRUE} // 60 interval-lower
, {doNOP, 44 /* , */, 64,0, TRUE} // 61
, {doIntervalSame, 125 /* } */, 67,0, TRUE} // 62
, {doIntervalError, 255, 90,0, FALSE} // 63
, {doIntervalUpperDigit, 128, 64,0, TRUE} // 64 interval-upper
, {doNOP, 125 /* } */, 67,0, TRUE} // 65
, {doIntervalError, 255, 90,0, FALSE} // 66
, {doNGInterval, 63 /* ? */, 17,0, TRUE} // 67 interval-type
, {doPossesiveInterval, 43 /* + */, 17,0, TRUE} // 68
, {doInterval, 255, 17,0, FALSE} // 69
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 70 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 71
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 72
, {doBackslashd, 100 /* d */, 12,0, TRUE} // 73
, {doBackslashD, 68 /* D */, 12,0, TRUE} // 74
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 75
, {doNamedChar, 78 /* N */, 12,0, TRUE} // 76
, {doProperty, 112 /* p */, 12,0, FALSE} // 77
, {doProperty, 80 /* P */, 12,0, FALSE} // 78
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 79
, {doBackslashS, 83 /* S */, 12,0, TRUE} // 80
, {doBackslashs, 115 /* s */, 12,0, TRUE} // 81
, {doBackslashW, 87 /* W */, 12,0, TRUE} // 82
, {doBackslashw, 119 /* w */, 12,0, TRUE} // 83
, {doBackslashX, 88 /* X */, 12,0, TRUE} // 84
, {doBackslashx, 120 /* x */, 12,0, TRUE} // 85
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 86
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 87
, {doBackRef, 128, 12,0, TRUE} // 88
, {doLiteralChar, 255, 12,0, TRUE} // 89
, {doExit, 255, 90,0, TRUE} // 90 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@ -229,6 +247,17 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
"interval-open",
0,
0,
"interval-lower",
0,
0,
0,
"interval-upper",
0,
0,
"interval-type",
0,
0,
"backslash",
0,

View file

@ -64,15 +64,15 @@ start:
# term. At a position where we can accept the start of most items in a pattern.
#
term:
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
'[' n expr-quant doScanUnicodeSet
'(' n open-paren
'.' n expr-quant doDotAny
'^' n term doCaret
'$' n term doDollar
'\' n backslash
eof term doPatFinish
eof term doPatFinish
default errorDeath doRuleError
@ -85,7 +85,7 @@ expr-quant:
'*' n quant-star
'+' n quant-plus
'?' n quant-opt
'{' n interval-open
'{' n interval-open doIntervalInit
default expr-cont
@ -182,14 +182,28 @@ quant-opt:
#
# Interval scanning a '{', the opening delimiter for an interval specification
# {number} or {min, max}
# {number} or {min, max} or {min, }
#
interval-open:
white_space n interval-open
default errorDeath doNotImplementedError
white_space n interval-open # TODO: is white space allowed here in non-free mode?
digit_char interval-lower
default errorDeath doIntervalError
interval-lower:
digit_char n interval-lower doIntevalLowerDigit
',' n interval-upper
'}' n interval-type doIntervalSame # {n}
default errorDeath doIntervalError
interval-upper:
digit_char n interval-upper doIntervalUpperDigit
'}' n interval-type
default errorDeath doIntervalError
interval-type:
'?' n expr-cont doNGInterval # {n,m}?
'+' n expr-cont doPossesiveInterval # {n,m}+
default expr-cont doInterval # {m,n}
#

View file

@ -12,6 +12,7 @@
#ifndef _REGEXIMP_H
#define _REGEXIMP_H
U_NAMESPACE_BEGIN
//
// debugging support. Enable one or more of the #defines immediately following
@ -50,7 +51,7 @@
// of the entries.
//
enum {
URX_RESERVED_OP = 0,
URX_RESERVED_OP = 0, // For multi-operand ops, most non-first words.
URX_BACKTRACK = 1,
URX_END = 2,
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
@ -77,13 +78,24 @@ enum {
URX_DOTANY_ALL = 21, // ., in the . matches any mode.
URX_BACKSLASH_D = 22, // Value field: 0: \d 1: \D
URX_CARET = 23, // Value field: 1: multi-line mode.
URX_DOLLAR = 24 // Also for \Z
URX_DOLLAR = 24, // Also for \Z
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possesive.
URX_CTR_INIT_P = 27, // These are 4 word opcodes. See description.
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
URX_CTR_LOOP_NG = 29, // Also in three flavors.
URX_CTR_LOOP_P = 30,
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
// back into compiled pattern code, and thus must
// be relocated when inserting/deleting ops in code.
};
// Keep this list of opcode names in sync with the above enum
// Used for debug printing only.
#define URX_OPCODE_NAMES \
"URX_RESERVED_OP", \
" ", \
"URX_BACKTRACK", \
"END", \
"ONECHAR", \
@ -107,7 +119,14 @@ enum {
"URX_DOTANY_ALL", \
"URX_BACKSLASH_D", \
"URX_CARET", \
"URX_DOLLAR"
"URX_DOLLAR", \
"CTR_INIT", \
"CTR_INIT_NG", \
"CTR_INIT_P", \
"CTR_LOOP", \
"CTR_LOOP_NG", \
"CTR_LOOP_P", \
"RELOC_OPRND"
//
// Convenience macros for assembling and disassembling a compiled operation.
@ -132,5 +151,18 @@ enum {
// membership test.
};
//
// Match Engine State Stack Frame Layout.
//
struct REStackFrame {
// One saved state of the match engine.  Frames are stored as consecutive
// int32_t words on the matcher's stack and are accessed by casting a pointer
// into that stack to REStackFrame *.
int32_t fInputIdx; // Position of next character in the input string
int32_t fPatIdx; // Position of next Op in the compiled pattern
int32_t fExtra[2]; // Extra state, for capture group start/ends
// atomic parentheses, repeat counts, etc.
// Locations assigned at pattern compile time.
// NOTE: the declared length of 2 is nominal.  The
// matcher indexes fExtra with offsets assigned by the
// pattern compiler, bounded by the pattern's
// fFrameSize rather than by this declaration.
};
U_NAMESPACE_END
#endif

View file

@ -38,23 +38,14 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
fInputUC = NULL;
fInputLength = 0;
UErrorCode status = U_ZERO_ERROR;
fBackTrackStack = new UVector32(status); // TODO: do something with status.
fCaptureStarts = new UVector32(status);
fCaptureEnds = new UVector32(status);
int i;
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
fCaptureStarts->addElement(-1, status);
fCaptureEnds ->addElement(-1, status);
}
fStack = new UVector32(status); // TODO: do something with status.
reset();
}
RegexMatcher::~RegexMatcher() {
delete fBackTrackStack;
delete fCaptureStarts;
delete fCaptureEnds;
delete fStack;
}
@ -193,7 +184,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
err = U_REGEX_INVALID_STATE;
return -1;
}
if (group < 0 || group > fPattern->fNumCaptureGroups) {
if (group < 0 || group > fPattern->fGroupMap->size()) {
err = U_INDEX_OUTOFBOUNDS_ERROR;
return -1;
}
@ -201,13 +192,19 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
if (group == 0) {
e = fMatchEnd;
} else {
// Get the position within the stack frame of the variables for
// this capture group.
int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
U_ASSERT(groupOffset < fPattern->fFrameSize);
U_ASSERT(groupOffset >= 0);
// Note: When the match engine backs out of a capture group, it sets the
// group's start position to -1. The end position is left with junk.
// So, before returning an end position, we must first check that
// the start position indicates that the group matched something.
int32_t s = fCaptureStarts->elementAti(group);
int32_t s = fFrame->fExtra[groupOffset];
if (s != -1) {
e = fCaptureEnds->elementAti(group);
e = fFrame->fExtra[groupOffset + 1];
}
}
return e;
@ -301,7 +298,7 @@ UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
int32_t RegexMatcher::groupCount() const {
return fPattern->fNumCaptureGroups;
return fPattern->fGroupMap->size();
}
@ -398,11 +395,7 @@ RegexMatcher &RegexMatcher::reset() {
fMatchEnd = 0;
fLastMatchEnd = 0;
fMatch = FALSE;
int i;
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
fCaptureStarts->setElementAt(-1, i);
}
resetStack();
return *this;
}
@ -418,6 +411,20 @@ RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
REStackFrame *RegexMatcher::resetStack() {
    // Empties the state save stack and pushes a single fresh frame onto it,
    // with every word set to -1.  For capture group positions, -1 means
    // "this group has not matched anything yet".
    // Returns the new frame.
    const int32_t frameWords = fPattern->fFrameSize;

    fStack->removeAllElements();

    UErrorCode ec = U_ZERO_ERROR;      // TODO:  do something with status
    int32_t *frameBase = fStack->reserveBlock(frameWords, ec);

    int32_t slot;
    for (slot = frameWords - 1; slot >= 0; slot--) {
        frameBase[slot] = -1;
    }
    return (REStackFrame *)frameBase;
}
//--------------------------------------------------------------------------------
//
// start
@ -438,7 +445,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
err = U_REGEX_INVALID_STATE;
return -1;
}
if (group < 0 || group > fPattern->fNumCaptureGroups) {
if (group < 0 || group > fPattern->fGroupMap->size()) {
err = U_INDEX_OUTOFBOUNDS_ERROR;
return -1;
}
@ -446,7 +453,10 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
if (group == 0) {
s = fMatchStart;
} else {
s = fCaptureStarts->elementAti(group);
int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
U_ASSERT(groupOffset < fPattern->fFrameSize);
U_ASSERT(groupOffset >= 0);
s = fFrame->fExtra[groupOffset];
}
return s;
}
@ -501,28 +511,37 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
return isBoundary;
}
//--------------------------------------------------------------------------------
//
// backTrack Within the match engine, this function is called when
// a local match failure occurs, and the match needs to back
// track and proceed down another path.
// StateSave
// Make a new stack frame, initialized as a copy of the current stack frame.
// Set the pattern index in the original stack frame from the operand value
// in the opcode. Execution of the engine continues with the state in
// the newly created stack frame
//
// Note: Inline function. Keep its body above MatchAt().
// Note that reserveBlock() may grow the stack, resulting in the
// whole thing being relocated in memory.
//
//--------------------------------------------------------------------------------
void RegexMatcher::backTrack(int32_t &inputIdx, int32_t &patIdx) {
int32_t *sp = fBackTrackStack->popBlock(fCaptureStateSize);
int i;
for (i=fPattern->fNumCaptureGroups; i>=1; i--) {
fCapStarts[i] = *sp++;
fCapEnds[i] = *sp++;
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) {
// Parameters:
//   fp          the current (top) stack frame.
//   savePatIdx  pattern index to store in the frame that is left behind.
//   frameSize   size of one stack frame, in int32_t words.
//   status      passed through to reserveBlock().
// Returns the newly pushed frame.
// push storage for a new frame.
int32_t *newFP = fStack->reserveBlock(frameSize, status);
// The incoming fp is not used past this point; the old top frame is
// re-derived as the frameSize words immediately below the new block.
fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack.
// New stack frame = copy of old top frame.
int32_t *source = (int32_t *)fp;
int32_t *dest = newFP;
// Copy word by word until the source pointer reaches the start of the new
// frame.  The loop tests after copying, so at least one word is always copied.
for (;;) {
*dest++ = *source++;
if (source == newFP) {
break;
}
}
patIdx = *sp++;
inputIdx = *sp++;
// Only the old frame receives the saved pattern position; the new frame
// keeps the copied value.
fp->fPatIdx = savePatIdx;
return (REStackFrame *)newFP;
}
//--------------------------------------------------------------------------------
//
@ -530,8 +549,6 @@ void RegexMatcher::backTrack(int32_t &inputIdx, int32_t &patIdx) {
//
//--------------------------------------------------------------------------------
void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
int32_t inputIdx = startIdx; // Current position in the input string.
int32_t patIdx = 0; // Current position in the compiled pattern.
UBool isMatch = FALSE; // True if the we have a match.
int32_t op; // Operation from the compiled pattern, split into
@ -565,40 +582,35 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
return;
}
// Clear out capture results from any previous match.
// Required for capture groups in patterns with | operations that may not match at all,
// although the pattern as a whole does match.
int i;
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
fCaptureStarts->setElementAt(-1, i);
}
// Cache frequently referenced items from the compiled pattern
// in local variables.
//
int32_t *pat = fPattern->fCompiledPat->getBuffer();
fCapStarts = fCaptureStarts->getBuffer();
fCapEnds = fCaptureEnds->getBuffer();
fCaptureStateSize = fPattern->fNumCaptureGroups*2 + 2;
int32_t *pat = fPattern->fCompiledPat->getBuffer();
const UChar *litText = fPattern->fLiteralText.getBuffer();
UVector *sets = fPattern->fSets;
int32_t inputLen = fInput->length();
REStackFrame *fp = resetStack();
int32_t frameSize = fPattern->fFrameSize;
fp->fPatIdx = 0;
fp->fInputIdx = startIdx;
//
// Main loop for interpreting the compiled pattern.
// One iteration of the loop per pattern operation performed.
//
for (;;) {
op = pat[patIdx];
op = pat[fp->fPatIdx];
opType = URX_TYPE(op);
opValue = URX_VAL(op);
#ifdef REGEX_RUN_DEBUG
printf("inputIdx=%d inputChar=%c ", inputIdx, fInput->char32At(inputIdx));
fPattern->dumpOp(patIdx);
printf("inputIdx=%d inputChar=%c ", fp->fInputIdx, fInput->char32At(fp->fInputIdx));
fPattern->dumpOp(fp->fPatIdx);
#endif
patIdx++;
fp->fPatIdx++;
switch (opType) {
@ -611,19 +623,19 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
// Force a backtrack. In some circumstances, the pattern compiler
// will notice that the pattern can't possibly match anything, and will
// emit one of these at that point.
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
case URX_ONECHAR:
if (inputIdx < fInputLength) {
if (fp->fInputIdx < fInputLength) {
UChar32 c;
U16_NEXT(fInputUC, inputIdx, fInputLength, c);
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
if (c == opValue) {
break;
}
}
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
@ -635,21 +647,21 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
int32_t stringStartIdx, stringLen;
stringStartIdx = opValue;
op = pat[patIdx];
patIdx++;
op = pat[fp->fPatIdx];
fp->fPatIdx++;
opType = URX_TYPE(op);
opValue = URX_VAL(op);
U_ASSERT(opType == URX_STRING_LEN);
stringLen = opValue;
int32_t stringEndIndex = inputIdx + stringLen;
int32_t stringEndIndex = fp->fInputIdx + stringLen;
if (stringEndIndex <= inputLen &&
u_strncmp(fInputUC+inputIdx, litText+stringStartIdx, stringLen) == 0) {
u_strncmp(fInputUC+fp->fInputIdx, litText+stringStartIdx, stringLen) == 0) {
// Success. Advance the current input position.
inputIdx = stringEndIndex;
fp->fInputIdx = stringEndIndex;
} else {
// No match. Back up matching to a saved state
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
}
break;
@ -657,18 +669,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_STATE_SAVE:
// Save the state of all capture groups, the pattern continuation
// position and the input position.
{
int32_t *stackPtr = fBackTrackStack->reserveBlock(fCaptureStateSize, status);
int i;
for (i=fPattern->fNumCaptureGroups; i>0; i--) {
*stackPtr++ = fCapStarts[i];
*stackPtr++ = fCapEnds[i];
}
*stackPtr++ = opValue;
*stackPtr++ = inputIdx;
}
fp = StateSave(fp, opValue, frameSize, status);
break;
@ -679,69 +680,69 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
goto breakFromLoop;
case URX_START_CAPTURE:
U_ASSERT(opValue > 0 && opValue <= fPattern->fNumCaptureGroups);
fCapStarts[opValue] = inputIdx;
U_ASSERT(opValue >= 0 && opValue < frameSize-3);
fp->fExtra[opValue] = fp->fInputIdx;
break;
case URX_END_CAPTURE:
U_ASSERT(opValue > 0 && opValue <= fPattern->fNumCaptureGroups);
U_ASSERT(fCaptureStarts->elementAti(opValue) >= 0);
fCapEnds[opValue] = inputIdx;
U_ASSERT(opValue > 0 && opValue < frameSize-2);
U_ASSERT(fp->fExtra[opValue-1] >= 0); // Start pos for this group must be set.
fp->fExtra[opValue] = fp->fInputIdx;
break;
case URX_DOLLAR: // $, test for End of line
// or for position before new line at end of input
if (inputIdx < inputLen-2) {
if (fp->fInputIdx < inputLen-2) {
// We are nowhere near the end of input. Fail.
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
if (inputIdx >= inputLen) {
if (fp->fInputIdx >= inputLen) {
// We really are at the end of input. Success.
break;
}
// If we are positioned just before a new-line that is located at the
// end of input, succeed.
if (inputIdx == inputLen-1) {
UChar32 c = fInput->char32At(inputIdx);
if (fp->fInputIdx == inputLen-1) {
UChar32 c = fInput->char32At(fp->fInputIdx);
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
break; // At new-line at end of input. Success
}
}
if (inputIdx == inputLen-2) {
if (fInput->char32At(inputIdx) == 0x0d && fInput->char32At(inputIdx+1) == 0x0a) {
if (fp->fInputIdx == inputLen-2) {
if (fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
break; // At CR/LF at end of input. Success
}
}
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
// TODO: support for multi-line mode.
break;
case URX_CARET: // ^, test for start of line
if (inputIdx != 0) {
backTrack(inputIdx, patIdx);
if (fp->fInputIdx != 0) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
} // TODO: support for multi-line mode.
break;
case URX_BACKSLASH_A: // Test for start of input
if (inputIdx != 0) {
backTrack(inputIdx, patIdx);
if (fp->fInputIdx != 0) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
break;
case URX_BACKSLASH_B: // Test for word boundaries
{
UBool success = isWordBoundary(inputIdx);
UBool success = isWordBoundary(fp->fInputIdx);
success ^= (opValue != 0); // flip sense for \B
if (!success) {
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
}
break;
@ -749,19 +750,19 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_BACKSLASH_D: // Test for decimal digit
{
if (inputIdx >= fInputLength) {
backTrack(inputIdx, patIdx);
if (fp->fInputIdx >= fInputLength) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
UChar32 c = fInput->char32At(inputIdx);
UChar32 c = fInput->char32At(fp->fInputIdx);
int8_t ctype = u_charType(c);
UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
success ^= (opValue != 0); // flip sense for \D
if (success) {
inputIdx = fInput->moveIndex32(inputIdx, 1);
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
} else {
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
}
break;
@ -770,8 +771,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_BACKSLASH_G: // Test for position at end of previous match
if (!((fMatch && inputIdx==fMatchEnd) || fMatch==FALSE && inputIdx==0)) {
backTrack(inputIdx, patIdx);
if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==0)) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
break;
@ -779,20 +780,20 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_BACKSLASH_X: // Match combining character sequence
{ // Closer to Grapheme cluster than to Perl \X
// Fail if at end of input
if (inputIdx >= fInputLength) {
backTrack(inputIdx, patIdx);
if (fp->fInputIdx >= fInputLength) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
// Always consume one char
UChar32 c = fInput->char32At(inputIdx);
inputIdx = fInput->moveIndex32(inputIdx, 1);
UChar32 c = fInput->char32At(fp->fInputIdx);
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
// Consume CR/LF as a pair
if (c == 0x0d) {
UChar32 c = fInput->char32At(inputIdx);
UChar32 c = fInput->char32At(fp->fInputIdx);
if (c == 0x0a) {
inputIdx = fInput->moveIndex32(inputIdx, 1);
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
break;
}
}
@ -801,15 +802,15 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
int8_t ctype = u_charType(c);
if (ctype != U_CONTROL_CHAR) {
for(;;) {
c = fInput->char32At(inputIdx);
c = fInput->char32At(fp->fInputIdx);
ctype = u_charType(c);
// TODO: make a set and add the "other grapheme extend" chars
// to the list of stuff to be skipped over.
if (!(ctype == U_NON_SPACING_MARK || ctype == U_ENCLOSING_MARK)) {
break;
}
inputIdx = fInput->moveIndex32(inputIdx, 1);
if (inputIdx >= fInputLength) {
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
if (fp->fInputIdx >= fInputLength) {
break;
}
}
@ -820,8 +821,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_BACKSLASH_Z: // Test for end of line
if (inputIdx < inputLen) {
backTrack(inputIdx, patIdx);
if (fp->fInputIdx < inputLen) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
break;
@ -836,10 +837,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
// 1: success if input char is not in set.
UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
opValue &= ~URX_NEG_SET;
if (inputIdx < fInputLength) {
if (fp->fInputIdx < fInputLength) {
// There is input left. Pick up one char and test it for set membership.
UChar32 c;
U16_NEXT(fInputUC, inputIdx, fInputLength, c);
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
const UnicodeSet *s = fPattern->fStaticSets[opValue];
if (s->contains(c)) {
@ -847,17 +848,17 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
}
if (!success) {
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
}
break;
case URX_SETREF:
if (inputIdx < fInputLength) {
if (fp->fInputIdx < fInputLength) {
// There is input left. Pick up one char and test it for set membership.
UChar32 c;
U16_NEXT(fInputUC, inputIdx, fInputLength, c);
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
U_ASSERT(opValue > 0 && opValue < sets->size());
UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
if (s->contains(c)) {
@ -867,25 +868,25 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
// Either at end of input, or the character wasn't in the set.
// Either way, we need to back track out.
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
case URX_DOTANY:
{
// . matches anything
if (inputIdx >= fInputLength) {
if (fp->fInputIdx >= fInputLength) {
// At end of input. Match failed. Backtrack out.
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c;
U16_NEXT(fInputUC, inputIdx, fInputLength, c);
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
(c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
// End of line in normal mode. . does not match.
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
}
@ -896,32 +897,81 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
{
// ., in dot-matches-all (including new lines) mode
// . matches anything
if (inputIdx >= fInputLength) {
if (fp->fInputIdx >= fInputLength) {
// At end of input. Match failed. Backtrack out.
backTrack(inputIdx, patIdx);
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
// There is input left. Advance over one char; a CR/LF pair is consumed as a unit.
UChar32 c = fInput->char32At(inputIdx);
inputIdx = fInput->moveIndex32(inputIdx, 1);
UChar32 c = fInput->char32At(fp->fInputIdx);
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
// In the case of a CR/LF, we need to advance over both.
UChar32 nextc = fInput->char32At(inputIdx);
UChar32 nextc = fInput->char32At(fp->fInputIdx);
if (c == 0x0d && nextc == 0x0a) {
inputIdx = fInput->moveIndex32(inputIdx, 1);
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
}
}
}
break;
case URX_JMP:
patIdx = opValue;
fp->fPatIdx = opValue;
break;
case URX_FAIL:
isMatch = FALSE;
goto breakFromLoop;
case URX_CTR_INIT:
{
U_ASSERT(opValue >= 0 && opValue < frameSize-2);
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
// Pick up the three extra operands that CTR_INIT has, and
// skip the pattern location counter past
int32_t instrOperandLoc = fp->fPatIdx;
fp->fPatIdx += 3;
int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
int32_t minCount = pat[instrOperandLoc+1];
int32_t maxCount = pat[instrOperandLoc+2];
U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1);
U_ASSERT(loopLoc>fp->fPatIdx);
if (minCount == 0) {
fp = StateSave(fp, loopLoc+1, frameSize, status);
}
if (maxCount == 0) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
}
break;
case URX_CTR_LOOP:
{
U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
int32_t initOp = pat[opValue];
U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
int32_t minCount = pat[opValue+2];
int32_t maxCount = pat[opValue+3];
// Increment the counter. Note: we're not worrying about counter
// overflow, since the data comes from UnicodeStrings, which
// stores its length in an int32_t.
(*pCounter)++;
U_ASSERT(*pCounter > 0);
if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
U_ASSERT(*pCounter == maxCount || maxCount == -1);
break;
}
if (*pCounter >= minCount) {
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
}
fp->fPatIdx = opValue + 4; // Loop back.
}
break;
default:
// Trouble. The compiled pattern contains an entry with an
@ -939,13 +989,18 @@ breakFromLoop:
if (isMatch) {
fLastMatchEnd = fMatchEnd;
fMatchStart = startIdx;
fMatchEnd = inputIdx;
fMatchEnd = fp->fInputIdx;
REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd);
}
else
{
REGEX_RUN_DEBUG_PRINTF("No match\n\n");
}
fFrame = fp; // The active stack frame when the engine stopped.
// Contains the capture group results that we need to
// access later.
return;
}

View file

@ -66,7 +66,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fFlags = other.fFlags;
fLiteralText = other.fLiteralText;
fBadState = other.fBadState;
fNumCaptureGroups = other.fNumCaptureGroups;
fMaxCaptureDigits = other.fMaxCaptureDigits;
fStaticSets = other.fStaticSets;
if (fBadState) {
@ -74,11 +73,10 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
}
// Copy the pattern. It's just values, nothing deep to copy.
int i;
// TODO: something with status
UErrorCode status = U_ZERO_ERROR;
for (i=0; i<other.fCompiledPat->size(); i++) {
fCompiledPat->addElement(other.fCompiledPat->elementAti(i), status);
}
fCompiledPat->assign(*other.fCompiledPat, status);
fGroupMap->assign(*other.fGroupMap, status);
// Note: do not copy fMatcher. It'll be created on first use if the
// destination needs one.
@ -87,6 +85,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
// Could be made more efficient if the sets were reference counted and shared,
// but I doubt that pattern copying will be particularly common.
// Note: init() already added an empty element zero to fSets
int32_t i;
for (i=1; i<other.fSets->size(); i++) {
UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
UnicodeSet *newSet = new UnicodeSet(*sourceSet);
@ -112,14 +111,15 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
void RegexPattern::init() {
fFlags = 0;
fBadState = FALSE;
fNumCaptureGroups = 0;
fMaxCaptureDigits = 1; // TODO: calculate for real.
fStaticSets = NULL;
fMatcher = NULL;
fFrameSize = 0;
UErrorCode status=U_ZERO_ERROR;
// Init of a completely new RegexPattern.
fCompiledPat = new UVector32(status);
fGroupMap = new UVector32(status);
fSets = new UVector(status);
if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL) {
fBadState = TRUE;
@ -151,6 +151,8 @@ void RegexPattern::zap() {
}
delete fSets;
fSets = NULL;
delete fGroupMap;
fGroupMap = NULL;
}
@ -367,6 +369,7 @@ int32_t RegexPattern::split(const UnicodeString &input,
// Loop through the input text, searching for the delimiter pattern
//
int i;
int32_t numCaptureGroups = fGroupMap->size();
for (i=0; ; i++) {
if (i==destCapacity-1) {
// There is only one output string left.
@ -384,7 +387,7 @@ int32_t RegexPattern::split(const UnicodeString &input,
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
for (groupNum=1; groupNum<=this->fNumCaptureGroups; groupNum++) {
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
if (i==destCapacity-1) {
break;
}
@ -446,6 +449,7 @@ void RegexPattern::dumpOp(int32_t index) const {
// Types with no operand field of interest.
break;
case URX_RESERVED_OP:
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_STATE_SAVE:
@ -457,6 +461,14 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_CARET:
case URX_DOLLAR:
case URX_STRING_LEN:
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_INIT_P:
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
case URX_RELOC_OPRND:
// types with an integer operand field.
REGEX_DUMP_DEBUG_PRINTF("%d", val);
break;

View file

@ -48,6 +48,7 @@ class RegexMatcher;
class UVector;
class UVector32;
class UnicodeSet;
struct REStackFrame;
/**
@ -312,7 +313,12 @@ private:
// split(), to avoid having to
// make new ones on each call.
int32_t fNumCaptureGroups;
int32_t fFrameSize; // Size of a state stack frame in the
// execution engine.
UVector32 *fGroupMap; // Map from capture group number to position of
// the group's variables in the matcher stack frame.
int32_t fMaxCaptureDigits;
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
@ -658,9 +664,12 @@ private:
// MatchAt This is the internal interface to the match engine itself.
// Match status comes back in matcher member variables.
//
void MatchAt(int32_t startIdx, UErrorCode &status);
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
UBool isWordBoundary(int32_t pos); // perform the \b test
void MatchAt(int32_t startIdx, UErrorCode &status);
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
UBool isWordBoundary(int32_t pos); // perform the \b test
REStackFrame *resetStack();
inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
int32_t frameSize, UErrorCode &status);
const RegexPattern *fPattern;
@ -672,14 +681,11 @@ private:
int32_t fMatchStart; // Position of the start of the most recent match
int32_t fMatchEnd; // First position after the end of the most recent match
int32_t fLastMatchEnd; // First position after the end of the previous match.
UVector32 *fBackTrackStack;
UVector32 *fCaptureStarts;
UVector32 *fCaptureEnds;
// Cache the capture vector data pointers, for faster access.
int32_t *fCapStarts;
int32_t *fCapEnds;
int32_t fCaptureStateSize;
UVector32 *fStack;
REStackFrame *fFrame; // After finding a match, the last active stack
// frame, which will contain the capture group results.
// NOT valid while match engine is running.
/**
* The address of this static class variable serves as this class's ID

View file

@ -368,8 +368,8 @@ void RegexTest::Basic() {
//
#if 0
{
//REGEX_TESTLM("X(.+)+X", "nomatch", TRUE, TRUE);
REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
REGEX_TESTLM("A{3}BC", "AAABC", TRUE, TRUE);
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
}
exit(1);
#endif
@ -1195,6 +1195,34 @@ void RegexTest::Extended() {
REGEX_FIND("AB\\.C\\eD\\u0666E", "<0>AB.C\\u001BD\\u0666E</0>F");
// {min,max} iteration qualifier
REGEX_TESTLM("A{3}BC", "AAABC", TRUE, TRUE);
REGEX_FIND("(ABC){2,3}AB", "no matchAB");
REGEX_FIND("(ABC){2,3}AB", "ABCAB");
REGEX_FIND("(ABC){2,3}AB", "<0>ABC<1>ABC</1>AB</0>");
REGEX_FIND("(ABC){2,3}AB", "<0>ABCABC<1>ABC</1>AB</0>");
REGEX_FIND("(ABC){2,3}AB", "<0>ABCABC<1>ABC</1>AB</0>CAB");
REGEX_FIND("(ABC){2}AB", "ABCAB");
REGEX_FIND("(ABC){2}AB", "<0>ABC<1>ABC</1>AB</0>");
REGEX_FIND("(ABC){2}AB", "<0>ABC<1>ABC</1>AB</0>CAB");
REGEX_FIND("(ABC){2}AB", "<0>ABC<1>ABC</1>AB</0>CABCAB");
REGEX_FIND("(ABC){2,}AB", "ABCAB");
REGEX_FIND("(ABC){2,}AB", "<0>ABC<1>ABC</1>AB</0>");
REGEX_FIND("(ABC){2,}AB", "<0>ABCABC<1>ABC</1>AB</0>");
REGEX_FIND("(ABC){2,}AB", "<0>ABCABCABC<1>ABC</1>AB</0>");
REGEX_FIND("X{0,0}ABC", "<0>ABC</0>");
REGEX_FIND("X{0,1}ABC", "<0>ABC</0>");
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "Hello there");
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "<0>Hello<1>!</1> there</0>");
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "<0>Hello<1>!!</1> there</0>");
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "<0>Hello<1>!!!</1> there</0>");
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "Hello!!!! there");
}
@ -1234,9 +1262,6 @@ void RegexTest::Errors() {
// Atomic Grouping
REGEX_ERR("abc(?>xyz)", 1, 6, U_REGEX_UNIMPLEMENTED);
// {Numeric Quantifiers}
REGEX_ERR("abc{4}", 1, 5, U_REGEX_UNIMPLEMENTED);
// Possessive Quantifiers
REGEX_ERR("abc++d", 1, 5, U_REGEX_UNIMPLEMENTED);
REGEX_ERR("abc*+d", 1, 5, U_REGEX_UNIMPLEMENTED);