diff --git a/icu4c/source/common/putil.c b/icu4c/source/common/putil.c
index 94fe3931720..0617e3c2d15 100644
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@@ -1841,7 +1841,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
     "U_REGEX_UNIMPLEMENTED",
     "U_REGEX_MISMATCHED_PAREN",
     "U_REGEX_NUMBER_TOO_BIG",
-    "U_REGEX_BAD_INTERVAL"
+    "U_REGEX_BAD_INTERVAL",
+    "U_REGEX_MAX_LT_MIN"
 };
 U_CAPI const char * U_EXPORT2
diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h
index c6916136f60..d77e6caa7c8 100644
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@@ -617,16 +617,17 @@ typedef enum UErrorCode {
     /*
      * The error codes in the range 0x10300-0x103ff are reserved for regular expression related errrs
      */
-    U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
-    U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
-    U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
+    U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
+    U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
+    U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
     U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
     U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
-    U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
+    U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
     U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
     U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
-    U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
-    U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
+    U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large.
*/
+    U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
+    U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
     U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
     U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp
index c7eb06fc514..c25a038434f 100644
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@@ -549,8 +549,29 @@ UBool RegexCompile::doParseActions(EParseAction action)
     case doOpenAtomicParen:
-        // Open Atomic Paren.
-        error(U_REGEX_UNIMPLEMENTED);
+        // Open Atomic Paren. (?>
+        // Compile to a
+        // - NOP, which later may be replaced if the parenthesized group
+        // has a quantifier, followed by
+        // - STO_SP save state stack position, so it can be restored at the ")"
+        // - NOP, which may later be replaced by a save-state if there
+        // is an '|' alternation within the parens.
+        {
+            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
+            int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
+            fRXPat->fDataSize += 1; // state stack ptr.
+            int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
+            fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
+            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
+
+            // On the Parentheses stack, start a new frame and add the positions
+            // of the two NOPs. Depending on what follows in the pattern, the
+            // NOPs may be changed to SAVE_STATE or JMP ops, with a target
+            // address of the end of the parenthesized group.
+            fParenStack.push(-3, *fStatus); // Begin a new frame; -3 tags it as an atomic group for handleCloseParen().
+            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
+            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
+        }
         break;
     case doOpenLookAhead:
@@ -1218,6 +1239,19 @@ void RegexCompile::handleCloseParen() {
             fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
         }
         break;
+    case -3:
+        // Atomic Parenthesis.
+        // Insert a LD_SP operation to restore the state stack to the position
+        // it was when the atomic parens were entered.
+        {
+            int32_t stoOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
+            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
+            int32_t stoLoc = URX_VAL(stoOp);
+            int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
+            fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
+        }
+        break;
+
     default:
         U_ASSERT(FALSE);
     }
@@ -1324,6 +1358,9 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
     op = URX_BUILD(LoopOp, topOfBlock);
     fRXPat->fCompiledPat->addElement(op, *fStatus);
+    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
+        error(U_REGEX_MAX_LT_MIN);
+    }
 }
diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h
index 820b9d5aef1..950a0f9ee78 100644
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@@ -87,9 +87,14 @@ enum {
     URX_CTR_LOOP_NG = 29, // Also in three flavors.
     URX_CTR_LOOP_P = 30,
-    URX_RELOC_OPRND = 31 // Operand value in multi-operand ops that refers
+    URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
                           // back into compiled pattern code, and thus must
                           // be relocated when inserting/deleting ops in code.
+
+    URX_STO_SP = 32, // Store the stack ptr. Operand is location within
+                     // matcher data (not stack data) to store it.
+    URX_LD_SP = 33 // Load the stack pointer. Operand is location
+                   // to load from.
};
 // Keep this list of opcode names in sync with the above enum
@@ -126,7 +131,9 @@ enum {
     "CTR_LOOP", \
     "CTR_LOOP_NG", \
     "CTR_LOOP_P", \
-    "RELOC_OPRND"
+    "RELOC_OPRND", \
+    "STO_SP", \
+    "LD_SP"
 //
 // Convenience macros for assembling and disassembling a compiled operation.
diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp
index 446858d3357..3c17845ea9c 100644
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@@ -19,6 +19,7 @@
 #include "unicode/uchar.h"
 #include "unicode/ustring.h"
 #include "uassert.h"
+#include "cmemory.h"
 #include "uvector.h"
 #include "uvectr32.h"
 #include "regeximp.h"
@@ -39,6 +40,11 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
     fInputLength = 0;
     UErrorCode status = U_ZERO_ERROR;
     fStack = new UVector32(status); // TODO: do something with status.
+    fData = fSmallData; // Use the in-object buffer unless the pattern needs more slots.
+    if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
+        fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); // TODO: null check
+    }
+
     reset();
 }
@@ -46,6 +52,9 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
 RegexMatcher::~RegexMatcher() {
     delete fStack;
+    if (fData != fSmallData) {
+        uprv_free(fData); // Allocated with uprv_malloc in the constructor, so it must not go through delete.
+    }
 }
@@ -417,6 +426,7 @@ REStackFrame *RegexMatcher::resetStack() {
     // they indicate that a group has not yet matched anything.
fStack->removeAllElements();
     UErrorCode status = U_ZERO_ERROR; // TODO: do something with status
+
     int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, status);
     int i;
     for (i=0; i<fPattern->fFrameSize; i++) {
@@ -574,7 +584,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
         }
         printf("\n");
         printf("\n");
-        printf("PatLoc inputIdx char\n");
+        printf(" PatLoc inputIdx char\n");
     }
 #endif
@@ -607,7 +617,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
         opType = URX_TYPE(op);
         opValue = URX_VAL(op);
 #ifdef REGEX_RUN_DEBUG
-        printf("inputIdx=%d inputChar=%c ", fp->fInputIdx, fInput->char32At(fp->fInputIdx));
+        printf("inputIdx=%d inputChar=%c sp=%d ", fp->fInputIdx,
+            fInput->char32At(fp->fInputIdx), (int)((int32_t *)fp-fStack->getBuffer())); // pointer difference narrowed to match %d
         fPattern->dumpOp(fp->fPatIdx);
 #endif
         fp->fPatIdx++;
@@ -972,6 +983,87 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
             }
             break;
+        case URX_CTR_INIT_NG:
+            {
+                U_ASSERT(opValue >= 0 && opValue < frameSize-2);
+                fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
+
+                // Pick up the three extra operands that CTR_INIT has, and
+                // skip the pattern location counter past
+                int32_t instrOperandLoc = fp->fPatIdx;
+                fp->fPatIdx += 3;
+                int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
+                int32_t minCount = pat[instrOperandLoc+1];
+                int32_t maxCount = pat[instrOperandLoc+2];
+                U_ASSERT(minCount>=0);
+                U_ASSERT(maxCount>=minCount || maxCount==-1);
+                U_ASSERT(loopLoc>fp->fPatIdx);
+
+                if (minCount == 0) {
+                    if (maxCount != 0) {
+                        fp = StateSave(fp, fp->fPatIdx, frameSize, status);
+                    }
+                    fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
+                }
+            }
+            break;
+
+        case URX_CTR_LOOP_NG:
+            {
+                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
+                int32_t initOp = pat[opValue];
+                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
+                int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
+                int32_t minCount = pat[opValue+2];
+                int32_t maxCount = pat[opValue+3];
+                // Increment the counter.
Note: we're not worrying about counter
+                // overflow, since the data comes from UnicodeStrings, which
+                // stores its length in an int32_t.
+                (*pCounter)++;
+                U_ASSERT(*pCounter > 0);
+
+                if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
+                    // The loop has matched the maximum permitted number of times.
+                    // Break out of here with no action. Matching will
+                    // continue with the following pattern.
+                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
+                    break;
+                }
+
+                if (*pCounter < minCount) {
+                    // We haven't met the minimum number of matches yet.
+                    // Loop back for another one.
+                    fp->fPatIdx = opValue + 4; // Loop back.
+                } else {
+                    // We do have the minimum number of matches.
+                    // Fall into the following pattern, but first do
+                    // a state save to the top of the loop, so that a failure
+                    // in the following pattern will try another iteration of the loop.
+                    fp = StateSave(fp, opValue + 4, frameSize, status);
+                }
+            }
+            break;
+
+        case URX_STO_SP:
+            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
+            fData[opValue] = fStack->size();
+            break;
+
+        case URX_LD_SP:
+            {
+                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
+                int32_t newStackSize = fData[opValue];
+                U_ASSERT(newStackSize <= fStack->size());
+                REStackFrame *newFP = (REStackFrame *)(fStack->getBuffer() + newStackSize - frameSize);
+                int32_t i;
+                for (i=0; i<frameSize; i++) {                     // NOTE(review): this loop header's tail, the next two
+                    ((int32_t *)newFP)[i] = ((int32_t *)fp)[i];   // lines and the fp assignment were garbled in this copy
+                }                                                 // of the patch and are reconstructed -- confirm upstream.
+                fp = newFP;
+                fStack->setSize(newStackSize);
+            }
+            break;
+
         default:
             // Trouble. The compiled pattern contains an entry with an
diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp
index b325853f6f5..dc9f8bd7422 100644
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@@ -115,6 +115,7 @@ void RegexPattern::init() {
     fStaticSets = NULL;
     fMatcher = NULL;
     fFrameSize = 0;
+    fDataSize = 0;
     UErrorCode status=U_ZERO_ERROR;
     // Init of a completely new RegexPattern.
@@ -468,6 +469,8 @@ void RegexPattern::dumpOp(int32_t index) const {
     case URX_CTR_LOOP_NG:
     case URX_CTR_LOOP_P:
     case URX_RELOC_OPRND:
+    case URX_STO_SP:
+    case URX_LD_SP:
         // types with an integer operand field.
         REGEX_DUMP_DEBUG_PRINTF("%d", val);
diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h
index 53f5260be8d..0a551db39ed 100644
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@@ -316,6 +316,10 @@ private:
     int32_t fFrameSize; // Size of a state stack frame in the
                         // execution engine.
+    int32_t fDataSize; // The size of the data needed by the pattern that
+                       // does not go on the state stack, but has just
+                       // a single copy per matcher.
+
     UVector32 *fGroupMap; // Map from capture group number to position of
                           // the group's variables in the matcher stack frame.
@@ -687,6 +691,9 @@ private:
                           // frame, which will contain the capture group results.
                           // NOT valid while match engine is running.
+    int32_t *fData; // Data area for use by the compiled pattern.
+    int32_t fSmallData[8]; // Use this for data if it's enough.
+
     /**
      * The address of this static class variable serves as this class's ID
      * for ICU "poor man's RTTI".
diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp
index 912225821b4..ff06abf36d0 100644
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@@ -368,7 +368,7 @@ void RegexTest::Basic() {
 //
 #if 0
     {
-        REGEX_TESTLM("A{3}BC", "AAABC", TRUE, TRUE);
+        REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc<2>cccddd");
         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
     }
     exit(1);
@@ -1223,6 +1223,19 @@ void RegexTest::Extended() {
     REGEX_FIND("(?:Hello(!{1,3}) there){1}", "<0>Hello<1>!!! there");
     REGEX_FIND("(?:Hello(!{1,3}) there){1}", "Hello!!!! there");
+    // Nongreedy {min,max}?
intervals
+    REGEX_FIND("(ABC){2,3}?AB", "no matchAB");
+    REGEX_FIND("(ABC){2,3}?AB", "ABCAB");
+    REGEX_FIND("(ABC){2,3}?AB", "<0>ABC<1>ABCAB");
+    REGEX_FIND("(ABC){2,3}?AB", "<0>ABC<1>ABCABCAB");
+    REGEX_FIND("(ABC){2,3}?AB", "<0>ABC<1>ABCABCABCAB");
+    REGEX_FIND("(ABC){2,3}?AX", "<0>ABCABC<1>ABCAX");
+    REGEX_FIND("(ABC){2,3}?AX", "ABC<0>ABCABC<1>ABCAX");
+
+    // Atomic Grouping
+    REGEX_FIND("(?>.*)abc", "abcabcabc"); // no match. .* consumed entire string.
+    //REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc<2>cccddd");
+
 }
@@ -1259,9 +1272,6 @@ void RegexTest::Errors() {
     REGEX_ERR("abc(?xyz)", 1, 6, U_REGEX_UNIMPLEMENTED);
-
     // Possessive Quantifiers
     REGEX_ERR("abc++d", 1, 5, U_REGEX_UNIMPLEMENTED);
     REGEX_ERR("abc*+d", 1, 5, U_REGEX_UNIMPLEMENTED);
@@ -1284,6 +1294,15 @@ void RegexTest::Errors() {
     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
+
+    // Mal-formed {min,max} quantifiers
+    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
+    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
+    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
+    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
+    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
+    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
+
 }
 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */