From 2e7a2dd624b46ad330529d8655791fc88a1d635b Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Fri, 28 Mar 2003 02:31:17 +0000 Subject: [PATCH] ICU-2422 Regexp, more speed optimizations X-SVN-Rev: 11412 --- icu4c/source/i18n/regexcmp.cpp | 101 +++++++++++++++++++++++- icu4c/source/i18n/regexcmp.h | 1 + icu4c/source/i18n/regeximp.h | 23 +++++- icu4c/source/i18n/rematch.cpp | 81 +++++++++++++++++++ icu4c/source/i18n/repattrn.cpp | 15 ++-- icu4c/source/test/testdata/regextst.txt | 1 + 6 files changed, 209 insertions(+), 13 deletions(-) diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index a5d7870703e..fefd18a9c39 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -1023,6 +1023,11 @@ UBool RegexCompile::doParseActions(EParseAction action) // 3. JMP_SAV 2 // 4. ... // + // Or, if the body is a simple [Set] or single char literal, + // 1. LOOP_SR_I set number + // 2. LOOP_C stack location + // ... + // // Or, if the body can match a zero-length string, to inhibit infinite loops, // 1. STATE_SAVE 6 // 2. STO_INP_LOC data-loc @@ -1032,9 +1037,26 @@ UBool RegexCompile::doParseActions(EParseAction action) // 6. ... { // location of item #1, the STATE_SAVE - int32_t saveStateLoc = blockTopLoc(TRUE); + int32_t topLoc = blockTopLoc(FALSE); int32_t dataLoc = -1; + // Check for simple [set]*, which get special optimized code. + if (topLoc == fRXPat->fCompiledPat->size() - 1) { + int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc); + if (URX_TYPE(repeatedOp) == URX_SETREF) { + int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); + fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); + dataLoc = fRXPat->fFrameSize; + fRXPat->fFrameSize++; + int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); + fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); + break; + } + } + + // Check for minimum match length of zero, which requires + // extra loop-breaking code. 
+ int32_t saveStateLoc = blockTopLoc(TRUE); if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { insertOp(saveStateLoc); dataLoc = fRXPat->fFrameSize; @@ -1128,7 +1150,9 @@ UBool RegexCompile::doParseActions(EParseAction action) case doInterval: // Finished scanning a normal {lower,upper} interval. Generate the code for it. - compileInterval(URX_CTR_INIT, URX_CTR_LOOP); + if (compileInlineInterval() == FALSE) { + compileInterval(URX_CTR_INIT, URX_CTR_LOOP); + } break; case doPossesiveInterval: @@ -2119,6 +2143,61 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) +UBool RegexCompile::compileInlineInterval() { + if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { + // Too big to inline. Fail, which will cause looping code to be generated. + // (Upper < Lower picks up unbounded upper and errors, both.) + return FALSE; + } + + int32_t topOfBlock = blockTopLoc(FALSE); + if (fIntervalUpper == 0) { + // Pathological case. Attempt no matches, as if the block doesn't exist. + fRXPat->fCompiledPat->setSize(topOfBlock); + return TRUE; + } + + if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { + // The thing being repeated is not a single op, but some + // more complex block. Do it as a loop, not inlines. + // Note that things "repeated" a max of once are handled as inline, because + // the one copy of the code already generated is just fine. + return FALSE; + } + + // Pick up the opcode that is to be repeated + // + int32_t op = fRXPat->fCompiledPat->elementAti(topOfBlock); + + // Compute the pattern location where the inline sequence + // will end, and set up the state save op that will be needed. 
+ // + int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 + + fIntervalUpper + (fIntervalUpper-fIntervalLow); + int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); + if (fIntervalLow == 0) { + insertOp(topOfBlock); + fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); + } + + + + // Loop, emitting the op for the thing being repeated each time. + // Loop starts at 1 because one instance of the op already exists in the pattern, + // it was put there when it was originally encountered. + int32_t i; + for (i=1; i<fIntervalUpper; i++) { + if (i == fIntervalLow) { + fRXPat->fCompiledPat->addElement(saveOp, *fStatus); + } + if (i > fIntervalLow) { + fRXPat->fCompiledPat->addElement(saveOp, *fStatus); + } + fRXPat->fCompiledPat->addElement(op, *fStatus); + } + return TRUE; +} + //---------------------------------------------------------------------------------------- @@ -2451,6 +2530,12 @@ void RegexCompile::matchStartType() { atStart = FALSE; break; + case URX_LOOP_SR_I: + case URX_LOOP_C: + // More loop ops. These state-save to themselves. + // don't change the minimum match + atStart = FALSE; + break; case URX_LA_START: @@ -2735,6 +2820,11 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { // The jump is conditional, backwards only. break; + case URX_LOOP_SR_I: + case URX_LOOP_C: + // More loop ops. These state-save to themselves. + // don't change the minimum match + break; case URX_LA_START: @@ -2966,8 +3056,9 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { case URX_CTR_INIT_NG: case URX_CTR_LOOP: case URX_CTR_LOOP_NG: + case URX_LOOP_SR_I: + case URX_LOOP_C: // For anything to do with loops, make the match length unbounded. - // TODO, possibly later, special case short loops like {0,1}. // Note: INIT instructions are multi-word. Can ignore because // INT32_MAX length will stop the per-instruction loop. 
currentLen = INT32_MAX; @@ -3129,6 +3220,8 @@ void RegexCompile::stripNOPs() { case URX_LB_END: case URX_LBN_CONT: case URX_LBN_END: + case URX_LOOP_SR_I: + case URX_LOOP_C: // These instructions are unaltered by the relocation. fRXPat->fCompiledPat->setElementAt(op, dst); dst++; @@ -3207,6 +3300,8 @@ void RegexCompile::OptEndingLoop() { case URX_LD_SP: case URX_END_CAPTURE: case URX_START_CAPTURE: + case URX_LOOP_SR_I: + case URX_LOOP_C: // These ops do a state save. // Can not do the optimization. return; diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index 21c9686f491..5d3991bc1f6 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -99,6 +99,7 @@ private: // a reference to a UnicodeSet. void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. int32_t LoopOp); + UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier void literalChar(UChar32 c); // Compile a literal char void fixLiterals(UBool split=FALSE); // Fix literal strings. void insertOp(int32_t where); // Open up a slot for a new op in the diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index 0d8109e5b1b..d282a220831 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -157,7 +157,13 @@ enum { URX_LBN_END = 48, // Negative LookBehind end // Parameter is the data location. // Check that the match ended at the right spot. - URX_STAT_SETREF_N = 49 // Operand is index of set in array of sets. + URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated + // Operand is index of set in array of sets. + URX_LOOP_SR_I = 50, // Init a [set]* loop. + // Operand is the sets index in array of user sets. + URX_LOOP_C = 51 // Continue a [set]* or OneChar* loop. + // Operand is a matcher static data location. + // Must always immediately follow LOOP_x_I instruction. 
}; @@ -213,7 +219,9 @@ enum { "LB_END", \ "LBN_CONT", \ "LBN_END", \ - "STAT_SETREF_N" \ + "STAT_SETREF_N", \ + "LOOP_SR_I", \ + "LOOP_C" // @@ -287,12 +295,18 @@ enum StartOfMatch { // 8 bit set, to fast-path latin-1 set membership tests. // struct Regex8BitSet { + inline Regex8BitSet(); + inline void operator = (const Regex8BitSet &s); inline void init(const UnicodeSet *src); inline UBool contains(UChar32 c); inline void add(UChar32 c); int8_t d[32]; }; +inline Regex8BitSet::Regex8BitSet() { + uprv_memset(d, 0, sizeof(d)); +} + inline UBool Regex8BitSet::contains(UChar32 c) { // No bounds checking! This is deliberate. return ((d[c>>3] & 1 <<(c&7)) != 0); @@ -303,7 +317,6 @@ inline void Regex8BitSet::add(UChar32 c) { }; inline void Regex8BitSet::init(const UnicodeSet *s) { - uprv_memset(d, 0, sizeof(d)); if (s != NULL) { for (int i=0; i<255; i++) { if (s->contains(i)) { @@ -313,6 +326,10 @@ inline void Regex8BitSet::init(const UnicodeSet *s) { } } +inline void Regex8BitSet::operator = (const Regex8BitSet &s) { + uprv_memcpy(d, s.d, sizeof(d)); +} + U_NAMESPACE_END #endif diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index 2df54b35e99..7b186357430 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -1922,6 +1922,87 @@ GC_Done: break; + case URX_LOOP_SR_I: + // Loop Initialization for the optimized implementation of + // [some character set]* + // This op scans through all matching input. + // The following LOOP_C op emulates stack unwinding if the following pattern fails. + { + U_ASSERT(opValue > 0 && opValue < sets->size()); + Regex8BitSet *s8 = &fPattern->fSets8[opValue]; + UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); + + // Loop through input, until either the input is exhausted or + // we reach a character that is not a member of the set. 
+ int32_t ix = fp->fInputIdx; + for (;;) { + if (ix >= inputLen) { + break; + } + UChar32 c; + U16_NEXT(inputBuf, ix, inputLen, c); + if (c<256) { + if (s8->contains(c) == FALSE) { + U16_BACK_1(inputBuf, 0, ix); + break; + } + } else { + if (s->contains(c) == FALSE) { + U16_BACK_1(inputBuf, 0, ix); + break; + } + } + } + + // If there were no matching characters, skip over the loop altogether. + // The loop doesn't run at all, a * op always succeeds. + if (ix == fp->fInputIdx) { + fp->fPatIdx++; // skip the URX_LOOP_C op. + break; + } + + // Peek ahead in the compiled pattern, to the URX_LOOP_C that + // must follow. Its operand is the stack location + // that holds the starting input index for the match of this [set]* + int32_t loopcOp = pat[fp->fPatIdx]; + U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); + int32_t stackLoc = URX_VAL(loopcOp); + U_ASSERT(stackLoc >= 0 && stackLoc < frameSize); + fp->fExtra[stackLoc] = fp->fInputIdx; + fp->fInputIdx = ix; + + // Save State to the URX_LOOP_C op that follows this one, + // so that match failures in the following code will return to there. + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. + fp = StateSave(fp, fp->fPatIdx, frameSize, status); + fp->fPatIdx++; + } + break; + + + case URX_LOOP_C: + { + U_ASSERT(opValue>=0 && opValue<frameSize); + int32_t terminalIdx = fp->fExtra[opValue]; + U_ASSERT(terminalIdx <= fp->fInputIdx); + if (terminalIdx == fp->fInputIdx) { + // We've backed up the input idx to the point that the loop started. + // The loop is done. Leave here without saving state. + // Subsequent failures won't come back here. + break; + } + // Set up for the next iteration of the loop, with input index + // backed up by one from the last time through, + // and a state save to this instruction in case the following code fails again. + // (We're going backwards because this loop emulates stack unwinding, not + // the initial scan forward.) 
+ U_ASSERT(fp->fInputIdx > 0); + U16_BACK_1(inputBuf, 0, fp->fInputIdx); + fp = StateSave(fp, fp->fPatIdx-1, frameSize, status); + } + break; + + default: // Trouble. The compiled pattern contains an entry with an diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index 9886cf74a1c..4e1d937decf 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -73,9 +73,8 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { fStartType = other.fStartType; fInitialStringIdx = other.fInitialStringIdx; fInitialStringLen = other.fInitialStringLen; - fInitialChars = new UnicodeSet(*other.fInitialChars); - fInitialChars8 = new Regex8BitSet; - uprv_memcpy(fInitialChars8, other.fInitialChars8, sizeof(Regex8BitSet)); + *fInitialChars = *other.fInitialChars; + *fInitialChars8 = *other.fInitialChars8; fInitialChar = other.fInitialChar; // Copy the pattern. It's just values, nothing deep to copy. @@ -87,7 +86,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { // but I doubt that pattern copying will be particularly common. // Note: init() already added an empty element zero to fSets int32_t i; - for (i=1; isize(); i++) { + int32_t numSets = other.fSets->size(); + fSets8 = new Regex8BitSet[numSets]; + for (i=1; iaddElement(newSet, fDeferredStatus); + fSets8[i] = other.fSets8[i]; } - int32_t numSets = other.fSets->size(); - fSets8 = new Regex8BitSet[numSets]; - uprv_memcpy(fSets8, other.fSets8, numSets*sizeof(Regex8BitSet)); // TODO: give Regex8BitSet some constructors return *this; } @@ -459,6 +458,7 @@ void RegexPattern::dumpOp(int32_t index) const { case URX_LB_END: case URX_LBN_CONT: case URX_LBN_END: + case URX_LOOP_C: // types with an integer operand field. 
REGEX_DUMP_DEBUG_PRINTF("%d", val); break; @@ -484,6 +484,7 @@ void RegexPattern::dumpOp(int32_t index) const { break; case URX_SETREF: + case URX_LOOP_SR_I: { UnicodeString s; UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt index 93d5415c62f..ad73727c3bc 100644 --- a/icu4c/source/test/testdata/regextst.txt +++ b/icu4c/source/test/testdata/regextst.txt @@ -21,6 +21,7 @@ # White space must be present between the flags and the match string. # + # Capturing parens ".(..)." "<0>a<1>bcd" ".*\A( +hello)" "<0><1> hello"