mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-2422 regexp, atomic parentheses added, but still buggy
X-SVN-Rev: 10865
This commit is contained in:
parent
f6bb16e5fb
commit
50a1da1f2b
8 changed files with 186 additions and 17 deletions
|
@ -1841,7 +1841,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
|||
"U_REGEX_UNIMPLEMENTED",
|
||||
"U_REGEX_MISMATCHED_PAREN",
|
||||
"U_REGEX_NUMBER_TOO_BIG",
|
||||
"U_REGEX_BAD_INTERVAL"
|
||||
"U_REGEX_BAD_INTERVAL",
|
||||
"U_REGEX_MAX_LT_MIN"
|
||||
};
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
|
|
|
@ -617,16 +617,17 @@ typedef enum UErrorCode {
|
|||
/*
|
||||
* The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
|
||||
*/
|
||||
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
|
||||
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
|
||||
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
|
||||
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
|
||||
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
|
||||
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
|
||||
U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
|
||||
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
|
||||
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
|
||||
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
|
||||
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
|
||||
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
|
||||
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
|
||||
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
|
||||
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
|
||||
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
|
||||
U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
|
||||
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||
|
||||
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
|
|
|
@ -549,8 +549,31 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
|
||||
case doOpenAtomicParen:
|
||||
// Open Atomic Paren.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
// Open Atomic Paren. (?>
|
||||
// Compile to a
|
||||
// - NOP, which later may be replaced if the parenthesized group
|
||||
// has a quantifier, followed by
|
||||
// - STO_SP save state stack position, so it can be restored at the ")"
|
||||
// - NOP, which may later be replaced by a save-state if there
|
||||
// is an '|' alternation within the parens.
|
||||
{
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
|
||||
fRXPat->fDataSize += 1; // state stack ptr.
|
||||
int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
|
||||
fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
|
||||
// On the Parentheses stack, start a new frame and add the positions
|
||||
// of the two NOPs. Depending on what follows in the pattern, the
|
||||
// NOPs may be changed to SAVE_STATE or JMP ops, with a target
|
||||
// address of the end of the parenthesized group.
|
||||
fParenStack.push(-3, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
}
|
||||
break;
|
||||
|
||||
break;
|
||||
|
||||
case doOpenLookAhead:
|
||||
|
@ -1218,6 +1241,19 @@ void RegexCompile::handleCloseParen() {
|
|||
fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
|
||||
}
|
||||
break;
|
||||
case -3:
|
||||
// Atomic Parenthesis.
|
||||
// Insert a LD_SP operation to restore the state stack to the position
|
||||
// it was when the atomic parens were entered.
|
||||
{
|
||||
int32_t stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
|
||||
U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
|
||||
int32_t stoLoc = URX_VAL(stoOp);
|
||||
int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
|
||||
fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
U_ASSERT(FALSE);
|
||||
}
|
||||
|
@ -1324,6 +1360,9 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
|
|||
op = URX_BUILD(LoopOp, topOfBlock);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
|
||||
if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
|
||||
error(U_REGEX_MAX_LT_MIN);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -87,9 +87,14 @@ enum {
|
|||
URX_CTR_LOOP_NG = 29, // Also in three flavors.
|
||||
URX_CTR_LOOP_P = 30,
|
||||
|
||||
URX_RELOC_OPRND = 31 // Operand value in multi-operand ops that refers
|
||||
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
|
||||
// back into compiled pattern code, and thus must
|
||||
// be relocated when inserting/deleting ops in code.
|
||||
|
||||
URX_STO_SP = 32, // Store the stack ptr. Operand is location within
|
||||
// matcher data (not stack data) to store it.
|
||||
URX_LD_SP = 33 // Load the stack pointer. Operand is location
|
||||
// to load from.
|
||||
};
|
||||
|
||||
// Keep this list of opcode names in sync with the above enum
|
||||
|
@ -126,7 +131,9 @@ enum {
|
|||
"CTR_LOOP", \
|
||||
"CTR_LOOP_NG", \
|
||||
"CTR_LOOP_P", \
|
||||
"RELOC_OPRND"
|
||||
"RELOC_OPRND", \
|
||||
"STO_SP", \
|
||||
"LD_SP"
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include "unicode/uchar.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
#include "uvector.h"
|
||||
#include "uvectr32.h"
|
||||
#include "regeximp.h"
|
||||
|
@ -39,6 +40,11 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
|
|||
fInputLength = 0;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fStack = new UVector32(status); // TODO: do something with status.
|
||||
fData = fSmallData;
|
||||
if (pat->fDataSize > sizeof(fSmallData)/sizeof(fSmallData[0])) {
|
||||
fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); // TODO: null check
|
||||
}
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
|
@ -46,6 +52,9 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
|
|||
|
||||
// Destructor.
//   Releases the state stack vector and, when the matcher's data area was
//   heap allocated rather than using the in-object fSmallData array, the
//   data area as well.
RegexMatcher::~RegexMatcher() {
    delete fStack;
    if (fData != fSmallData) {
        // The constructor obtains an oversized data area from uprv_malloc(),
        // so it must be returned with uprv_free(). Releasing malloc-style
        // memory with operator delete is an allocator mismatch.
        uprv_free(fData);
    }
}
|
||||
|
||||
|
||||
|
@ -417,6 +426,7 @@ REStackFrame *RegexMatcher::resetStack() {
|
|||
// they indicate that a group has not yet matched anything.
|
||||
fStack->removeAllElements();
|
||||
UErrorCode status = U_ZERO_ERROR; // TODO: do something with status
|
||||
|
||||
int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, status);
|
||||
int i;
|
||||
for (i=0; i<fPattern->fFrameSize; i++) {
|
||||
|
@ -574,7 +584,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
}
|
||||
printf("\n");
|
||||
printf("\n");
|
||||
printf("PatLoc inputIdx char\n");
|
||||
printf(" PatLoc inputIdx char\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -607,7 +617,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
opType = URX_TYPE(op);
|
||||
opValue = URX_VAL(op);
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
printf("inputIdx=%d inputChar=%c ", fp->fInputIdx, fInput->char32At(fp->fInputIdx));
|
||||
printf("inputIdx=%d inputChar=%c sp=%d ", fp->fInputIdx,
|
||||
fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
|
||||
fPattern->dumpOp(fp->fPatIdx);
|
||||
#endif
|
||||
fp->fPatIdx++;
|
||||
|
@ -972,6 +983,87 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||
}
|
||||
break;
|
||||
|
||||
case URX_CTR_INIT_NG:
|
||||
{
|
||||
U_ASSERT(opValue >= 0 && opValue < frameSize-2);
|
||||
fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
|
||||
|
||||
// Pick up the three extra operands that CTR_INIT has, and
|
||||
// skip the pattern location counter past
|
||||
int32_t instrOperandLoc = fp->fPatIdx;
|
||||
fp->fPatIdx += 3;
|
||||
int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
|
||||
int32_t minCount = pat[instrOperandLoc+1];
|
||||
int32_t maxCount = pat[instrOperandLoc+2];
|
||||
U_ASSERT(minCount>=0);
|
||||
U_ASSERT(maxCount>=minCount || maxCount==-1);
|
||||
U_ASSERT(loopLoc>fp->fPatIdx);
|
||||
|
||||
if (minCount == 0) {
|
||||
if (maxCount != 0) {
|
||||
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
|
||||
}
|
||||
fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_CTR_LOOP_NG:
|
||||
{
|
||||
U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
|
||||
int32_t initOp = pat[opValue];
|
||||
U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
|
||||
int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
|
||||
int32_t minCount = pat[opValue+2];
|
||||
int32_t maxCount = pat[opValue+3];
|
||||
// Increment the counter. Note: we're not worrying about counter
|
||||
// overflow, since the data comes from UnicodeStrings, which
|
||||
// stores its length in an int32_t.
|
||||
(*pCounter)++;
|
||||
U_ASSERT(*pCounter > 0);
|
||||
|
||||
if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
|
||||
// The loop has matched the maximum permitted number of times.
|
||||
// Break out of here with no action. Matching will
|
||||
// continue with the following pattern.
|
||||
U_ASSERT(*pCounter == maxCount || maxCount == -1);
|
||||
break;
|
||||
}
|
||||
|
||||
if (*pCounter < minCount) {
|
||||
// We haven't met the minimum number of matches yet.
|
||||
// Loop back for another one.
|
||||
fp->fPatIdx = opValue + 4; // Loop back.
|
||||
} else {
|
||||
// We do have the minimum number of matches.
|
||||
// Fall into the following pattern, but first do
|
||||
// a state save to the top of the loop, so that a failure
|
||||
// in the following pattern will try another iteration of the loop.
|
||||
fp = StateSave(fp, opValue + 4, frameSize, status);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_STO_SP:
|
||||
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
|
||||
fData[opValue] = fStack->size();
|
||||
break;
|
||||
|
||||
case URX_LD_SP:
|
||||
{
|
||||
U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
|
||||
int32_t newStackSize = fData[opValue];
|
||||
U_ASSERT(newStackSize <= fStack->size());
|
||||
REStackFrame *newFP = (REStackFrame *)(fStack->getBuffer() + newStackSize - frameSize);
|
||||
int32_t i;
|
||||
for (i=0; i<frameSize; i++) {
|
||||
newFP[i] = fp[i];
|
||||
}
|
||||
fp = newFP;
|
||||
fStack->setSize(newStackSize);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
// Trouble. The compiled pattern contains an entry with an
|
||||
|
|
|
@ -115,6 +115,7 @@ void RegexPattern::init() {
|
|||
fStaticSets = NULL;
|
||||
fMatcher = NULL;
|
||||
fFrameSize = 0;
|
||||
fDataSize = 0;
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
// Init of a completely new RegexPattern.
|
||||
|
@ -468,6 +469,8 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
case URX_RELOC_OPRND:
|
||||
case URX_STO_SP:
|
||||
case URX_LD_SP:
|
||||
|
||||
// types with an integer operand field.
|
||||
REGEX_DUMP_DEBUG_PRINTF("%d", val);
|
||||
|
|
|
@ -316,6 +316,10 @@ private:
|
|||
int32_t fFrameSize; // Size of a state stack frame in the
|
||||
// execution engine.
|
||||
|
||||
int32_t fDataSize; // The size of the data needed by the pattern that
|
||||
// does not go on the state stack, but has just
|
||||
// a single copy per matcher.
|
||||
|
||||
UVector32 *fGroupMap; // Map from capture group number to position of
|
||||
// the group's variables in the matcher stack frame.
|
||||
|
||||
|
@ -687,6 +691,9 @@ private:
|
|||
// frame, which will contain the capture group results.
|
||||
// NOT valid while match engine is running.
|
||||
|
||||
int32_t *fData; // Data area for use by the compiled pattern.
|
||||
int32_t fSmallData[8]; // Use this for data if it's enough.
|
||||
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
* for ICU "poor man's RTTI".
|
||||
|
|
|
@ -368,7 +368,7 @@ void RegexTest::Basic() {
|
|||
//
|
||||
#if 0
|
||||
{
|
||||
REGEX_TESTLM("A{3}BC", "AAABC", TRUE, TRUE);
|
||||
REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
|
||||
// REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
|
||||
}
|
||||
exit(1);
|
||||
|
@ -1223,6 +1223,19 @@ void RegexTest::Extended() {
|
|||
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "<0>Hello<1>!!!</1> there</0>");
|
||||
REGEX_FIND("(?:Hello(!{1,3}) there){1}", "Hello!!!! there");
|
||||
|
||||
// Nongreedy {min,max}? intervals
|
||||
REGEX_FIND("(ABC){2,3}?AB", "no matchAB");
|
||||
REGEX_FIND("(ABC){2,3}?AB", "ABCAB");
|
||||
REGEX_FIND("(ABC){2,3}?AB", "<0>ABC<1>ABC</1>AB</0>");
|
||||
REGEX_FIND("(ABC){2,3}?AB", "<0>ABC<1>ABC</1>AB</0>CAB");
|
||||
REGEX_FIND("(ABC){2,3}?AB", "<0>ABC<1>ABC</1>AB</0>CABCAB");
|
||||
REGEX_FIND("(ABC){2,3}?AX", "<0>ABCABC<1>ABC</1>AX</0>");
|
||||
REGEX_FIND("(ABC){2,3}?AX", "ABC<0>ABCABC<1>ABC</1>AX</0>");
|
||||
|
||||
// Atomic Grouping
|
||||
REGEX_FIND("(?>.*)abc", "abcabcabc"); // no match. .* consumed entire string.
|
||||
//REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -1259,9 +1272,6 @@ void RegexTest::Errors() {
|
|||
REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // negated look-behind
|
||||
REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
|
||||
|
||||
// Atomic Grouping
|
||||
REGEX_ERR("abc(?>xyz)", 1, 6, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// Possessive Quantifiers
|
||||
REGEX_ERR("abc++d", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("abc*+d", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
|
@ -1284,6 +1294,15 @@ void RegexTest::Errors() {
|
|||
REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
|
||||
REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
|
||||
REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
|
||||
|
||||
// Mal-formed {min,max} quantifiers
|
||||
REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
|
||||
REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
|
||||
REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
|
||||
REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
|
||||
REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
|
||||
REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
|
||||
|
||||
}
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
|
||||
|
|
Loading…
Add table
Reference in a new issue