ICU-2422 regexp, atomic parentheses added, but still buggy

X-SVN-Rev: 10865
This commit is contained in:
Andy Heninger 2003-01-17 01:43:54 +00:00
parent f6bb16e5fb
commit 50a1da1f2b
8 changed files with 186 additions and 17 deletions

View file

@ -1841,7 +1841,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_UNIMPLEMENTED",
"U_REGEX_MISMATCHED_PAREN",
"U_REGEX_NUMBER_TOO_BIG",
"U_REGEX_BAD_INTERVAL"
"U_REGEX_BAD_INTERVAL",
"U_REGEX_MAX_LT_MIN"
};
U_CAPI const char * U_EXPORT2

View file

@ -617,16 +617,17 @@ typedef enum UErrorCode {
/*
* The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
*/
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */

View file

@ -549,8 +549,31 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doOpenAtomicParen:
// Open Atomic Paren.
error(U_REGEX_UNIMPLEMENTED);
// Open Atomic Paren. (?>
// Compile to a
// - NOP, which later may be replaced if the parenthesized group
// has a quantifier, followed by
// - STO_SP save state stack position, so it can be restored at the ")"
// - NOP, which may later be replaced by a save-state if there
// is an '|' alternation within the parens.
{
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
fRXPat->fDataSize += 1; // state stack ptr.
int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
// On the Parentheses stack, start a new frame and add the positions
// of the two NOPs. Depending on what follows in the pattern, the
// NOPs may be changed to SAVE_STATE or JMP ops, with a target
// address of the end of the parenthesized group.
fParenStack.push(-3, *fStatus); // Begin a new frame.
fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
}
break;
break;
case doOpenLookAhead:
@ -1218,6 +1241,19 @@ void RegexCompile::handleCloseParen() {
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
}
break;
case -3:
// Atomic Parenthesis.
// Insert a LD_SP operation to restore the state stack to the position
// it was when the atomic parens were entered.
{
int32_t stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
int32_t stoLoc = URX_VAL(stoOp);
int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
}
break;
default:
U_ASSERT(FALSE);
}
@ -1324,6 +1360,9 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
op = URX_BUILD(LoopOp, topOfBlock);
fRXPat->fCompiledPat->addElement(op, *fStatus);
if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
error(U_REGEX_MAX_LT_MIN);
}
}

View file

@ -87,9 +87,14 @@ enum {
URX_CTR_LOOP_NG = 29, // Also in three flavors.
URX_CTR_LOOP_P = 30,
URX_RELOC_OPRND = 31 // Operand value in multi-operand ops that refers
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
// back into compiled pattern code, and thus must
// be relocated when inserting/deleting ops in code.
URX_STO_SP = 32, // Store the stack ptr. Operand is location within
// matcher data (not stack data) to store it.
URX_LD_SP = 33 // Load the stack pointer. Operand is location
// to load from.
};
// Keep this list of opcode names in sync with the above enum
@ -126,7 +131,9 @@ enum {
"CTR_LOOP", \
"CTR_LOOP_NG", \
"CTR_LOOP_P", \
"RELOC_OPRND"
"RELOC_OPRND", \
"STO_SP", \
"LD_SP"
//
// Convenience macros for assembling and disassembling a compiled operation.

View file

@ -19,6 +19,7 @@
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "uassert.h"
#include "cmemory.h"
#include "uvector.h"
#include "uvectr32.h"
#include "regeximp.h"
@ -39,6 +40,11 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
fInputLength = 0;
UErrorCode status = U_ZERO_ERROR;
fStack = new UVector32(status); // TODO: do something with status.
fData = fSmallData;
if (pat->fDataSize > sizeof(fSmallData)/sizeof(fSmallData[0])) {
fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); // TODO: null check
}
reset();
}
@ -46,6 +52,9 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
// Destructor.
//   Releases the state stack object and, when fData does not point at the
//   in-object fSmallData array, the separately allocated data block.
RegexMatcher::~RegexMatcher() {
    delete fStack;
    if (fData != fSmallData) {
        // A non-inline fData block is obtained with uprv_malloc() in the
        // constructor, so it must be returned with uprv_free(). Passing
        // malloc'ed memory to operator delete is a mismatched deallocation.
        uprv_free(fData);
    }
}
@ -417,6 +426,7 @@ REStackFrame *RegexMatcher::resetStack() {
// they indicate that a group has not yet matched anything.
fStack->removeAllElements();
UErrorCode status = U_ZERO_ERROR; // TODO: do something with status
int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, status);
int i;
for (i=0; i<fPattern->fFrameSize; i++) {
@ -574,7 +584,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
printf("\n");
printf("\n");
printf("PatLoc inputIdx char\n");
printf(" PatLoc inputIdx char\n");
}
#endif
@ -607,7 +617,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
opType = URX_TYPE(op);
opValue = URX_VAL(op);
#ifdef REGEX_RUN_DEBUG
printf("inputIdx=%d inputChar=%c ", fp->fInputIdx, fInput->char32At(fp->fInputIdx));
printf("inputIdx=%d inputChar=%c sp=%d ", fp->fInputIdx,
fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
fPattern->dumpOp(fp->fPatIdx);
#endif
fp->fPatIdx++;
@ -972,6 +983,87 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
break;
case URX_CTR_INIT_NG:
{
U_ASSERT(opValue >= 0 && opValue < frameSize-2);
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
// Pick up the three extra operands that CTR_INIT has, and
// skip the pattern location counter past
int32_t instrOperandLoc = fp->fPatIdx;
fp->fPatIdx += 3;
int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
int32_t minCount = pat[instrOperandLoc+1];
int32_t maxCount = pat[instrOperandLoc+2];
U_ASSERT(minCount>=0);
U_ASSERT(maxCount>=minCount || maxCount==-1);
U_ASSERT(loopLoc>fp->fPatIdx);
if (minCount == 0) {
if (maxCount != 0) {
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
}
fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
}
}
break;
case URX_CTR_LOOP_NG:
{
U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
int32_t initOp = pat[opValue];
U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
int32_t minCount = pat[opValue+2];
int32_t maxCount = pat[opValue+3];
// Increment the counter. Note: we're not worrying about counter
// overflow, since the data comes from UnicodeStrings, which
// stores its length in an int32_t.
(*pCounter)++;
U_ASSERT(*pCounter > 0);
if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
// The loop has matched the maximum permitted number of times.
// Break out of here with no action. Matching will
// continue with the following pattern.
U_ASSERT(*pCounter == maxCount || maxCount == -1);
break;
}
if (*pCounter < minCount) {
// We haven't met the minimum number of matches yet.
// Loop back for another one.
fp->fPatIdx = opValue + 4; // Loop back.
} else {
// We do have the minimum number of matches.
// Fall into the following pattern, but first do
// a state save to the top of the loop, so that a failure
// in the following pattern will try another iteration of the loop.
fp = StateSave(fp, opValue + 4, frameSize, status);
}
}
break;
case URX_STO_SP:
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
fData[opValue] = fStack->size();
break;
case URX_LD_SP:
    // Load the state stack size previously saved in fData[opValue], shrinking
    // the stack back to that size. The contents of the current (top) frame
    // are moved down so that they become the top frame of the shortened
    // stack, and fp is re-pointed at the relocated frame.
    {
        U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
        int32_t newStackSize = fData[opValue];
        U_ASSERT(newStackSize <= fStack->size());

        // frameSize and newStackSize are counts of int32_t stack slots (they
        // are used directly as offsets into the UVector32 buffer), so the
        // frame must be copied slot by slot through int32_t pointers.
        // Indexing REStackFrame pointers with the same counter would stride
        // by sizeof(REStackFrame) and copy / overrun the wrong memory.
        int32_t *dstFrame = fStack->getBuffer() + newStackSize - frameSize;
        int32_t *srcFrame = (int32_t *)fp;
        if (dstFrame != srcFrame) {
            // Forward copy; the destination is expected to sit at or below
            // the source (see the assert above), so an overlapping move in
            // this direction does not clobber unread slots.
            int32_t i;
            for (i=0; i<frameSize; i++) {
                dstFrame[i] = srcFrame[i];
            }
            fp = (REStackFrame *)dstFrame;
        }
        fStack->setSize(newStackSize);
    }
    break;
default:
// Trouble. The compiled pattern contains an entry with an

View file

@ -115,6 +115,7 @@ void RegexPattern::init() {
fStaticSets = NULL;
fMatcher = NULL;
fFrameSize = 0;
fDataSize = 0;
UErrorCode status=U_ZERO_ERROR;
// Init of a completely new RegexPattern.
@ -468,6 +469,8 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
case URX_RELOC_OPRND:
case URX_STO_SP:
case URX_LD_SP:
// types with an integer operand field.
REGEX_DUMP_DEBUG_PRINTF("%d", val);

View file

@ -316,6 +316,10 @@ private:
int32_t fFrameSize; // Size of a state stack frame in the
// execution engine.
int32_t fDataSize; // The size of the data needed by the pattern that
// does not go on the state stack, but has just
// a single copy per matcher.
UVector32 *fGroupMap; // Map from capture group number to position of
// the group's variables in the matcher stack frame.
@ -687,6 +691,9 @@ private:
// frame, which will contain the capture group results.
// NOT valid while match engine is running.
int32_t *fData; // Data area for use by the compiled pattern.
int32_t fSmallData[8]; // Use this for data if it's enough.
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".

View file

@ -368,7 +368,7 @@ void RegexTest::Basic() {
//
#if 0
{
REGEX_TESTLM("A{3}BC", "AAABC", TRUE, TRUE);
REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
}
exit(1);
@ -1223,6 +1223,19 @@ void RegexTest::Extended() {
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "<0>Hello<1>!!!</1> there</0>");
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "Hello!!!! there");
// Nongreedy {min,max}? intervals
REGEX_FIND("(ABC){2,3}?AB", "no matchAB");
REGEX_FIND("(ABC){2,3}?AB", "ABCAB");
REGEX_FIND("(ABC){2,3}?AB", "<0>ABC<1>ABC</1>AB</0>");
REGEX_FIND("(ABC){2,3}?AB", "<0>ABC<1>ABC</1>AB</0>CAB");
REGEX_FIND("(ABC){2,3}?AB", "<0>ABC<1>ABC</1>AB</0>CABCAB");
REGEX_FIND("(ABC){2,3}?AX", "<0>ABCABC<1>ABC</1>AX</0>");
REGEX_FIND("(ABC){2,3}?AX", "ABC<0>ABCABC<1>ABC</1>AX</0>");
// Atomic Grouping
REGEX_FIND("(?>.*)abc", "abcabcabc"); // no match. .* consumed entire string.
//REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
}
@ -1259,9 +1272,6 @@ void RegexTest::Errors() {
REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // negated look-behind
REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
// Atomic Grouping
REGEX_ERR("abc(?>xyz)", 1, 6, U_REGEX_UNIMPLEMENTED);
// Possessive Quantifiers
REGEX_ERR("abc++d", 1, 5, U_REGEX_UNIMPLEMENTED);
REGEX_ERR("abc*+d", 1, 5, U_REGEX_UNIMPLEMENTED);
@ -1284,6 +1294,15 @@ void RegexTest::Errors() {
REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
// Mal-formed {min,max} quantifiers
REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */