ICU-2422 regexp, Look-ahead ops added. Bug in caching input

string in Matcher fixed.

X-SVN-Rev: 10938
This commit is contained in:
Andy Heninger 2003-01-31 02:00:52 +00:00
parent 4d7921d1b8
commit 61b188cc37
6 changed files with 178 additions and 52 deletions

View file

@ -500,7 +500,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
fRXPat->fCompiledPat->setElementAt(op, savePosition);
// Append a JMP operation into the compiled pattern. The operand for
// the OR will eventually be the location following the ')' for the
// the JMP will eventually be the location following the ')' for the
// group. This will be patched in later, when the ')' is encountered.
op = URX_BUILD(URX_JMP, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
@ -601,18 +601,70 @@ UBool RegexCompile::doParseActions(EParseAction action)
fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
}
break;
break;
case doOpenLookAhead:
// Open Paren.
error(U_REGEX_UNIMPLEMENTED);
// Positive Look-ahead (?= stuff )
// Compiles to
// 1. START_LA dataLoc
// 2. NOP reserved for use by quantifiers on the block.
// Look-ahead can't have quantifiers, but paren stack
// compile time conventions require the slot anyhow.
// 3. NOP may be replaced if there are '|' ops in the block.
// 4. code for parenthesized stuff.
// 5. ENDLA
//
// Two data slots are reserved, for saving the stack ptr and the input position.
{
int32_t dataLoc = fRXPat->fDataSize;
fRXPat->fDataSize += 2;
int32_t op = URX_BUILD(URX_LA_START, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
op = URX_BUILD(URX_NOP, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
fRXPat->fCompiledPat->addElement(op, *fStatus);
// On the Parentheses stack, start a new frame and add the positions
// of the NOPs.
fParenStack.push(EParenClass::lookAhead, *fStatus); // Begin a new frame.
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
}
break;
case doOpenLookAheadNeg:
// Open Paren.
error(U_REGEX_UNIMPLEMENTED);
// Negated Lookahead. (?! stuff )
// Compiles to
// 1. START_LA dataloc
// 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
// // which continues with the match.
// 3. NOP // Std. Open Paren sequence, for possible '|'
// 4. code for parenthesized stuff.
// 5. END_LA // Cut back stack, remove saved state from step 2.
// 6. FAIL // code in block succeeded, so neg. lookahead fails.
// 7. ...
{
int32_t dataLoc = fRXPat->fDataSize;
fRXPat->fDataSize += 2;
int32_t op = URX_BUILD(URX_LA_START, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later.
fRXPat->fCompiledPat->addElement(op, *fStatus);
op = URX_BUILD(URX_NOP, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
// On the Parentheses stack, start a new frame and add the positions
// of the StateSave and NOP.
fParenStack.push(EParenClass::negLookAhead, *fStatus); // Begin a new frame.
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
// Instructions #5 and #6 will be added when the ')' is encountered.
}
break;
case doOpenLookBehind:
@ -1410,11 +1462,12 @@ void RegexCompile::handleCloseParen() {
// Fixup any operations within the just-closed parenthesized group
// that need to reference the end of the (block).
// (The first one on popped from the stack is an unused slot for
// (The first one popped from the stack is an unused slot for
// alternation (OR) state save, but applying the fixup to it does no harm.)
for (;;) {
patIdx = fParenStack.popi();
if (patIdx < 0) {
// value < 0 flags the start of the frame on the paren stack.
break;
}
U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
@ -1429,11 +1482,11 @@ void RegexCompile::handleCloseParen() {
// parenthesized grouping this is
switch (patIdx) {
case -1:
case plain:
// No additional fixups required.
// This is the case with most kinds of groupings.
// (Grouping-only parentheses)
break;
case -2:
case capturing:
// Capturing Parentheses.
// Insert an End Capture op into the pattern.
// The frame offset of the variables for this cg is obtained from the
@ -1447,7 +1500,7 @@ void RegexCompile::handleCloseParen() {
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
}
break;
case -3:
case atomic:
// Atomic Parenthesis.
// Insert a LD_SP operation to restore the state stack to the position
// it was when the atomic parens were entered.
@ -1460,6 +1513,37 @@ void RegexCompile::handleCloseParen() {
}
break;
case EParenClass::lookAhead:
{
int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
int32_t dataLoc = URX_VAL(startOp);
int32_t op = URX_BUILD(URX_LA_END, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
break;
case negLookAhead:
{
// See comment at doOpenLookAheadNeg
int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
int32_t dataLoc = URX_VAL(startOp);
int32_t op = URX_BUILD(URX_LA_END, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
op = URX_BUILD(URX_FAIL, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
// Patch the URX_STATE_SAVE near the top of the block.
int32_t saveOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
int32_t dest = fRXPat->fCompiledPat->size();
saveOp = URX_BUILD(URX_STATE_SAVE, dest);
fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
}
break;
default:
U_ASSERT(FALSE);
}

View file

@ -65,8 +65,21 @@ public:
static void cleanup(); // Memory cleanup
// Categories of parentheses in pattern.
// The category is saved in the compile-time parentheses stack frame, and
// determines the code to be generated when the matching close ) is encountered.
// Each value is pushed onto the compile-time parentheses stack as the first
// entry of a new frame. All values are negative: when the close-paren code
// pops the stack, a value < 0 flags the start of the frame, and the value
// itself selects the fixup code to emit.
enum EParenClass {
plain = -1, // No special handling
capturing = -2,   // Close paren inserts an End Capture op.
atomic = -3,      // Close paren inserts a LD_SP op to restore the state stack.
lookAhead = -4,   // (?= ... )  Close paren appends URX_LA_END.
negLookAhead = -5 // (?! ... )  Close paren appends URX_LA_END and URX_FAIL,
                  //            then patches the block's URX_STATE_SAVE target.
};
private:
UBool doParseActions(EParseAction a);
void error(UErrorCode e); // error reporting convenience function.

View file

@ -99,12 +99,18 @@ enum {
// capture group variables in the state stack frame.
URX_STO_INP_LOC = 35, // Store the input location. Operand is location
// within the matcher data (not stack).
URX_JMPX = 36 // Conditional JMP.
URX_JMPX = 36, // Conditional JMP.
// First Operand: JMP target location.
// Second Operand: Data location containing an
// input position. If current input position ==
// saved input position, FAIL rather than taking
// the JMP.
// the JMP
URX_LA_START = 37, // Starting a LookAround expression.
// Save InputPos and SP in static data.
// Operand: Static data offset for the save
URX_LA_END = 38 // Ending a Lookaround expression.
// Restore InputPos and Stack to saved values.
// Operand: Static data offset for saved data.
};
// Keep this list of opcode names in sync with the above enum
@ -146,7 +152,9 @@ enum {
"LD_SP", \
"BACKREF", \
"STO_INP_LOC", \
"JMPX"
"JMPX", \
"LA_START", \
"LA_END"
//
// Convenience macros for assembling and disassembling a compiled operation.

View file

@ -35,8 +35,6 @@ U_NAMESPACE_BEGIN
RegexMatcher::RegexMatcher(const RegexPattern *pat) {
fPattern = pat;
fInput = NULL;
fInputUC = NULL;
fInputLength = 0;
UErrorCode status = U_ZERO_ERROR;
fStack = new UVector32(status); // TODO: do something with status.
fData = fSmallData;
@ -177,7 +175,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
//
//--------------------------------------------------------------------------------
UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
int32_t len = fInputLength-fMatchEnd;
int32_t len = fInput->length()-fMatchEnd;
if (len > 0) {
dest.append(*fInput, fMatchEnd, len);
}
@ -237,12 +235,9 @@ UBool RegexMatcher::find() {
// TODO: Needs optimization
UErrorCode status = U_ZERO_ERROR;
int32_t startPos;
// TODO: needs to go up to the very end, so a pattern that can match a zero lenght
// string can match at the end of a string. Can't do until loop-breaking
// is added to the engine, though, otherwise it triggers too many bugs.
startPos = fMatchEnd;
U_ASSERT(startPos >= 0 && startPos <= fInputLength);
int32_t startPos = fMatchEnd;
int32_t inputLen = fInput->length();
U_ASSERT(startPos >= 0 && startPos <= inputLen);
for (;;) {
MatchAt(startPos, status);
if (U_FAILURE(status)) {
@ -251,7 +246,7 @@ UBool RegexMatcher::find() {
if (fMatch) {
return TRUE;
}
if (startPos >= fInputLength) {
if (startPos >= inputLen) {
break;
}
startPos = fInput->moveIndex32(startPos, 1);
@ -265,7 +260,8 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
if (start < 0 || start >= fInputLength) {
int32_t inputLen = fInput->length();
if (start < 0 || start >= inputLen) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
@ -275,7 +271,7 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
// TODO: optimize the search for a leading literal string.
// TODO: optimize based on the minimum length of a possible match
int32_t startPos;
for (startPos=start; startPos < fInputLength; startPos=fInput->moveIndex32(startPos, 1)) {
for (startPos=start; startPos < inputLen; startPos=fInput->moveIndex32(startPos, 1)) {
MatchAt(startPos, status);
if (U_FAILURE(status)) {
return FALSE;
@ -351,7 +347,7 @@ UBool RegexMatcher::matches(UErrorCode &status) {
}
reset();
MatchAt(0, status);
UBool success = (fMatch && fMatchEnd==fInputLength);
UBool success = (fMatch && fMatchEnd==fInput->length());
return success;
}
@ -427,8 +423,6 @@ RegexMatcher &RegexMatcher::reset() {
RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
fInput = &input;
fInputLength = input.length();
fInputUC = fInput->getBuffer();
reset();
return *this;
}
@ -511,7 +505,7 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
// Determine whether char c at current position is a member of the word set of chars.
// If we're off the end of the string, behave as though we're not at a word char.
if (pos < fInputLength) {
if (pos < fInput->length()) {
UChar32 c = fInput->char32At(pos);
int8_t ctype = u_charType(c);
if (ctype==U_NON_SPACING_MARK || ctype==U_ENCLOSING_MARK) {
@ -619,6 +613,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
const UChar *litText = fPattern->fLiteralText.getBuffer();
UVector *sets = fPattern->fSets;
int32_t inputLen = fInput->length();
const UChar *inputBuf = fInput->getBuffer();
REStackFrame *fp = resetStack();
int32_t frameSize = fPattern->fFrameSize;
@ -663,9 +658,9 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_ONECHAR:
if (fp->fInputIdx < fInputLength) {
if (fp->fInputIdx < inputLen) {
UChar32 c;
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
if (c == opValue) {
break;
}
@ -691,7 +686,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
int32_t stringEndIndex = fp->fInputIdx + stringLen;
if (stringEndIndex <= inputLen &&
u_strncmp(fInputUC+fp->fInputIdx, litText+stringStartIdx, stringLen) == 0) {
u_strncmp(inputBuf+fp->fInputIdx, litText+stringStartIdx, stringLen) == 0) {
// Success. Advance the current input position.
fp->fInputIdx = stringEndIndex;
} else {
@ -792,7 +787,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_BACKSLASH_D: // Test for decimal digit
{
if (fp->fInputIdx >= fInputLength) {
if (fp->fInputIdx >= inputLen) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -822,7 +817,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_BACKSLASH_X: // Match combining character sequence
{ // Closer to Grapheme cluster than to Perl \X
// Fail if at end of input
if (fp->fInputIdx >= fInputLength) {
if (fp->fInputIdx >= inputLen) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -852,7 +847,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
break;
}
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
if (fp->fInputIdx >= fInputLength) {
if (fp->fInputIdx >= inputLen) {
break;
}
}
@ -877,7 +872,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
// The high bit of the op value is a flag for the match polarity.
// 0: success if input char is in set.
// 1: success if input char is not in set.
if (fp->fInputIdx >= fInputLength) {
if (fp->fInputIdx >= inputLen) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -886,7 +881,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
opValue &= ~URX_NEG_SET;
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UChar32 c;
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
const UnicodeSet *s = fPattern->fStaticSets[opValue];
if (s->contains(c)) {
success = !success;
@ -899,10 +894,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_SETREF:
if (fp->fInputIdx < fInputLength) {
if (fp->fInputIdx < inputLen) {
// There is input left. Pick up one char and test it for set membership.
UChar32 c;
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
U_ASSERT(opValue > 0 && opValue < sets->size());
UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
if (s->contains(c)) {
@ -919,14 +914,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
case URX_DOTANY:
{
// . matches anything
if (fp->fInputIdx >= fInputLength) {
if (fp->fInputIdx >= inputLen) {
// At end of input. Match failed. Backtrack out.
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
// There is input left. Advance over one char, unless we've hit end-of-line
UChar32 c;
U16_NEXT(fInputUC, fp->fInputIdx, fInputLength, c);
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
(c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029)) {
// End of line in normal mode. . does not match.
@ -941,7 +936,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
{
// ., in dot-matches-all (including new lines) mode
// . matches anything
if (fp->fInputIdx >= fInputLength) {
if (fp->fInputIdx >= inputLen) {
// At end of input. Match failed. Backtrack out.
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
@ -1119,7 +1114,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
break;
}
if ((fp->fInputIdx + len > inputLen) ||
u_strncmp(fInputUC+groupStartIdx, fInputUC+fp->fInputIdx, len) != 0) {
u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) != 0) {
fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
} else {
fp->fInputIdx += len; // Match. Advance current input position.
@ -1149,7 +1144,37 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
}
break;
case URX_LA_START:
{
// Entering a lookahead block.
// Save Stack Ptr, Input Pos.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
fData[opValue] = fStack->size();
fData[opValue+1] = fp->fInputIdx;
}
break;
case URX_LA_END:
{
// Leaving a look-ahead block.
// restore Stack Ptr, Input Pos to positions they had on entry to block.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
int32_t stackSize = fStack->size();
int32_t newStackSize = fData[opValue];
U_ASSERT(stackSize >= newStackSize);
if (stackSize > newStackSize) {
int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
int32_t i;
for (i=0; i<frameSize; i++) {
newFP[i] = ((int32_t *)fp)[i];
}
fp = (REStackFrame *)newFP;
fStack->setSize(newStackSize);
}
fp->fInputIdx = fData[opValue+1];
}
break;
default:
// Trouble. The compiled pattern contains an entry with an

View file

@ -681,9 +681,7 @@ private:
const RegexPattern *fPattern;
const UnicodeString *fInput;
const UChar *fInputUC;
int32_t fInputLength;
UBool fMatch; // True if the last match was successful.
int32_t fMatchStart; // Position of the start of the most recent match
int32_t fMatchEnd; // First position after the end of the most recent match

View file

@ -1294,8 +1294,6 @@ void RegexTest::Errors() {
REGEX_ERR("(?-si) stuff", 1, 3, U_REGEX_UNIMPLEMENTED);
// Look-ahead, Look-behind
REGEX_ERR("abc(?=xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED); // look-ahead
REGEX_ERR("abc(?!xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED); // negated look-ahead
REGEX_ERR("abc(?<=xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // look-behind
REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // negated look-behind
REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
@ -1699,10 +1697,10 @@ void RegexTest::PerlTests() {
//
UnicodeString resultString;
UnicodeString perlExpr = fields[3];
groupsMat->reset(perlExpr);
cgMat->reset(perlExpr);
while (perlExpr.length() > 0) {
groupsMat->reset(perlExpr);
cgMat->reset(perlExpr);
if (perlExpr.startsWith("$&")) {
resultString.append(testMat->group(status));
perlExpr.remove(0, 2);