ICU-2422 Regexp, more speed optimizations

X-SVN-Rev: 11412
This commit is contained in:
Andy Heninger 2003-03-28 02:31:17 +00:00
parent a4a223f056
commit 2e7a2dd624
6 changed files with 209 additions and 13 deletions

View file

@ -1023,6 +1023,11 @@ UBool RegexCompile::doParseActions(EParseAction action)
// 3. JMP_SAV 2
// 4. ...
//
// Or, if the body is a simple [Set] or single char literal,
// 1. LOOP_SR_I set number
// 2. LOOP_C stack location
// ...
//
// Or, if the body can match a zero-length string, to inhibit infinite loops,
// 1. STATE_SAVE 6
// 2. STO_INP_LOC data-loc
@ -1032,9 +1037,26 @@ UBool RegexCompile::doParseActions(EParseAction action)
// 6. ...
{
// location of item #1, the STATE_SAVE
int32_t saveStateLoc = blockTopLoc(TRUE);
int32_t topLoc = blockTopLoc(FALSE);
int32_t dataLoc = -1;
// Check for simple [set]*, which get special optimized code.
if (topLoc == fRXPat->fCompiledPat->size() - 1) {
int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
if (URX_TYPE(repeatedOp) == URX_SETREF) {
int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
dataLoc = fRXPat->fFrameSize;
fRXPat->fFrameSize++;
int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
break;
}
}
// Check for minimum match length of zero, which requires
// extra loop-breaking code.
int32_t saveStateLoc = blockTopLoc(TRUE);
if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
insertOp(saveStateLoc);
dataLoc = fRXPat->fFrameSize;
@ -1128,7 +1150,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doInterval:
// Finished scanning a normal {lower,upper} interval. Generate the code for it.
compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
if (compileInlineInterval() == FALSE) {
compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
}
break;
case doPossesiveInterval:
@ -2119,6 +2143,61 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
// compileInlineInterval
//
//   Try to compile a {low,upper} interval as straight-line code: the op being
//   quantified is simply emitted "upper" times, with a STATE_SAVE placed in
//   front of each copy that is optional, instead of generating a counted loop.
//
//   Returns TRUE if the interval was handled here.
//   Returns FALSE if it was not; the doInterval parse action then falls back
//   to compileInterval() to generate the looping form.
//
UBool RegexCompile::compileInlineInterval() {
    // Reject intervals that are too big to be worth unrolling.
    // (Upper < Lower picks up unbounded upper and errors, both.)
    if (fIntervalUpper < fIntervalLow || fIntervalUpper > 10) {
        return FALSE;
    }

    const int32_t blockStart = blockTopLoc(FALSE);

    // Pathological {0} or {0,0}: no match is ever attempted, so drop the code
    // already generated for the block by truncating the pattern at its start.
    if (fIntervalUpper == 0) {
        fRXPat->fCompiledPat->setSize(blockStart);
        return TRUE;
    }

    // Only a block consisting of one single op is unrolled.  The exception is
    // an upper bound of one, where the single copy of the block that already
    // exists in the pattern is all that is needed, whatever its size.
    const UBool isSingleOp = (blockStart == fRXPat->fCompiledPat->size()-1);
    if (!isSingleOp && fIntervalUpper != 1) {
        return FALSE;
    }

    // The op to replicate.  It must be fetched before anything is inserted
    // at blockStart, which would move it.
    const int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(blockStart);

    // Where the finished sequence will end.  Starting from the current last
    // index, the pattern grows to hold fIntervalUpper copies of the op plus
    // one STATE_SAVE for each of the (upper - low) optional copies.  Every
    // one of those STATE_SAVEs targets this same location.
    const int32_t optionalCopies = fIntervalUpper - fIntervalLow;
    const int32_t exitLoc = fRXPat->fCompiledPat->size()-1
                                + fIntervalUpper + optionalCopies;
    const int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, exitLoc);

    // With a lower bound of zero even the already-present first copy is
    // optional, so open a slot in front of it for a STATE_SAVE.
    if (fIntervalLow == 0) {
        insertOp(blockStart);
        fRXPat->fCompiledPat->setElementAt(stateSaveOp, blockStart);
    }

    // Emit the remaining copies.  Counting starts at 1 because the first
    // instance of the op was put in the pattern when it was originally
    // encountered.  Copies at or beyond the lower bound are optional and are
    // each preceded by the STATE_SAVE.
    for (int32_t copy = 1; copy < fIntervalUpper; ++copy) {
        if (copy >= fIntervalLow) {
            fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
        }
        fRXPat->fCompiledPat->addElement(repeatedOp, *fStatus);
    }
    return TRUE;
}
//----------------------------------------------------------------------------------------
@ -2451,6 +2530,12 @@ void RegexCompile::matchStartType() {
atStart = FALSE;
break;
case URX_LOOP_SR_I:
case URX_LOOP_C:
// More loop ops. These state-save to themselves.
// don't change the minimum match
atStart = FALSE;
break;
case URX_LA_START:
@ -2735,6 +2820,11 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
// The jump is conditional, backwards only.
break;
case URX_LOOP_SR_I:
case URX_LOOP_C:
// More loop ops. These state-save to themselves.
// don't change the minimum match
break;
case URX_LA_START:
@ -2966,8 +3056,9 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_CTR_INIT_NG:
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_LOOP_SR_I:
case URX_LOOP_C:
// For anything to do with loops, make the match length unbounded.
// TODO, possibly later, special case short loops like {0,1}.
// Note: INIT instructions are multi-word. Can ignore because
// INT32_MAX length will stop the per-instruction loop.
currentLen = INT32_MAX;
@ -3129,6 +3220,8 @@ void RegexCompile::stripNOPs() {
case URX_LB_END:
case URX_LBN_CONT:
case URX_LBN_END:
case URX_LOOP_SR_I:
case URX_LOOP_C:
// These instructions are unaltered by the relocation.
fRXPat->fCompiledPat->setElementAt(op, dst);
dst++;
@ -3207,6 +3300,8 @@ void RegexCompile::OptEndingLoop() {
case URX_LD_SP:
case URX_END_CAPTURE:
case URX_START_CAPTURE:
case URX_LOOP_SR_I:
case URX_LOOP_C:
// These ops do a state save.
// Can not do the optimization.
return;

View file

@ -99,6 +99,7 @@ private:
// a reference to a UnicodeSet.
void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier.
int32_t LoopOp);
UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier
void literalChar(UChar32 c); // Compile a literal char
void fixLiterals(UBool split=FALSE); // Fix literal strings.
void insertOp(int32_t where); // Open up a slot for a new op in the

View file

@ -157,7 +157,13 @@ enum {
URX_LBN_END = 48, // Negative LookBehind end
// Parameter is the data location.
// Check that the match ended at the right spot.
URX_STAT_SETREF_N = 49 // Operand is index of set in array of sets.
URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated
// Operand is index of set in array of sets.
URX_LOOP_SR_I = 50, // Init a [set]* loop.
// Operand is the set's index in array of user sets.
URX_LOOP_C = 51 // Continue a [set]* or OneChar* loop.
// Operand is a matcher static data location.
// Must always immediately follow LOOP_x_I instruction.
};
@ -213,7 +219,9 @@ enum {
"LB_END", \
"LBN_CONT", \
"LBN_END", \
"STAT_SETREF_N" \
"STAT_SETREF_N", \
"LOOP_SR_I", \
"LOOP_C"
//
@ -287,12 +295,18 @@ enum StartOfMatch {
// 8 bit set, to fast-path latin-1 set membership tests.
//
struct Regex8BitSet {
    // Bit storage: 32 bytes * 8 bits, one membership bit per value 0..255.
    // contains() indexes it as d[c>>3], bit (c&7).
    int8_t d[32];

    // Construction and copying.
    inline Regex8BitSet();
    inline void operator = (const Regex8BitSet &s);

    // Fill in the set from a UnicodeSet.
    inline void init(const UnicodeSet *src);

    // Membership test and single-character insertion.
    // contains() does no bounds checking on c.
    inline UBool contains(UChar32 c);
    inline void add(UChar32 c);
};
// Default constructor: start out as the empty set, with every bit of the
// storage array cleared.
inline Regex8BitSet::Regex8BitSet() {
    uprv_memset(&d[0], 0, sizeof d);
}
inline UBool Regex8BitSet::contains(UChar32 c) {
// No bounds checking! This is deliberate.
return ((d[c>>3] & 1 <<(c&7)) != 0);
@ -303,7 +317,6 @@ inline void Regex8BitSet::add(UChar32 c) {
};
inline void Regex8BitSet::init(const UnicodeSet *s) {
uprv_memset(d, 0, sizeof(d));
if (s != NULL) {
for (int i=0; i<255; i++) {
if (s->contains(i)) {
@ -313,6 +326,10 @@ inline void Regex8BitSet::init(const UnicodeSet *s) {
}
}
// Assignment: make this set an exact copy of s by copying its bit array.
// Returns nothing, so assignments can not be chained.
inline void Regex8BitSet::operator = (const Regex8BitSet &s) {
    // Self-assignment would pass the same array as both source and
    // destination of the copy.  Overlapping operands are undefined for
    // memcpy (which uprv_memcpy presumably wraps), and there is nothing
    // to copy in that case anyway, so skip it.
    if (this != &s) {
        uprv_memcpy(d, s.d, sizeof(d));
    }
}
U_NAMESPACE_END
#endif

View file

@ -1922,6 +1922,87 @@ GC_Done:
break;
case URX_LOOP_SR_I:
// Loop Initialization for the optimized implementation of
// [some character set]*
// This op scans through all matching input.
// The following LOOP_C op emulates stack unwinding if the following pattern fails.
{
U_ASSERT(opValue > 0 && opValue < sets->size());
Regex8BitSet *s8 = &fPattern->fSets8[opValue];
UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
// Loop through input, until either the input is exhausted or
// we reach a character that is not a member of the set.
int32_t ix = fp->fInputIdx;
for (;;) {
if (ix >= inputLen) {
break;
}
UChar32 c;
U16_NEXT(inputBuf, ix, inputLen, c);
if (c<256) {
if (s8->contains(c) == FALSE) {
U16_BACK_1(inputBuf, 0, ix);
break;
}
} else {
if (s->contains(c) == FALSE) {
U16_BACK_1(inputBuf, 0, ix);
break;
}
}
}
// If there were no matching characters, skip over the loop altogether.
// The loop doesn't run at all, a * op always succeeds.
if (ix == fp->fInputIdx) {
fp->fPatIdx++; // skip the URX_LOOP_C op.
break;
}
// Peek ahead in the compiled pattern, to the URX_LOOP_C that
// must follow. Its operand is the stack location
// that holds the starting input index for the match of this [set]*
int32_t loopcOp = pat[fp->fPatIdx];
U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
int32_t stackLoc = URX_VAL(loopcOp);
U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
fp->fExtra[stackLoc] = fp->fInputIdx;
fp->fInputIdx = ix;
// Save State to the URX_LOOP_C op that follows this one,
// so that match failures in the following code will return to there.
// Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
fp->fPatIdx++;
}
break;
case URX_LOOP_C:
{
U_ASSERT(opValue>=0 && opValue<frameSize);
int32_t terminalIdx = fp->fExtra[opValue];
U_ASSERT(terminalIdx <= fp->fInputIdx);
if (terminalIdx == fp->fInputIdx) {
// We've backed up the input idx to the point that the loop started.
// The loop is done. Leave here without saving state.
// Subsequent failures won't come back here.
break;
}
// Set up for the next iteration of the loop, with input index
// backed up by one from the last time through,
// and a state save to this instruction in case the following code fails again.
// (We're going backwards because this loop emulates stack unwinding, not
// the initial scan forward.)
U_ASSERT(fp->fInputIdx > 0);
U16_BACK_1(inputBuf, 0, fp->fInputIdx);
fp = StateSave(fp, fp->fPatIdx-1, frameSize, status);
}
break;
default:
// Trouble. The compiled pattern contains an entry with an

View file

@ -73,9 +73,8 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fStartType = other.fStartType;
fInitialStringIdx = other.fInitialStringIdx;
fInitialStringLen = other.fInitialStringLen;
fInitialChars = new UnicodeSet(*other.fInitialChars);
fInitialChars8 = new Regex8BitSet;
uprv_memcpy(fInitialChars8, other.fInitialChars8, sizeof(Regex8BitSet));
*fInitialChars = *other.fInitialChars;
*fInitialChars8 = *other.fInitialChars8;
fInitialChar = other.fInitialChar;
// Copy the pattern. It's just values, nothing deep to copy.
@ -87,7 +86,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
// but I doubt that pattern copying will be particularly common.
// Note: init() already added an empty element zero to fSets
int32_t i;
for (i=1; i<other.fSets->size(); i++) {
int32_t numSets = other.fSets->size();
fSets8 = new Regex8BitSet[numSets];
for (i=1; i<numSets; i++) {
if (U_FAILURE(fDeferredStatus)) {
return *this;
}
@ -98,11 +99,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
break;
}
fSets->addElement(newSet, fDeferredStatus);
fSets8[i] = other.fSets8[i];
}
int32_t numSets = other.fSets->size();
fSets8 = new Regex8BitSet[numSets];
uprv_memcpy(fSets8, other.fSets8, numSets*sizeof(Regex8BitSet)); // TODO: give Regex8BitSet some constructors
return *this;
}
@ -459,6 +458,7 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_LB_END:
case URX_LBN_CONT:
case URX_LBN_END:
case URX_LOOP_C:
// types with an integer operand field.
REGEX_DUMP_DEBUG_PRINTF("%d", val);
break;
@ -484,6 +484,7 @@ void RegexPattern::dumpOp(int32_t index) const {
break;
case URX_SETREF:
case URX_LOOP_SR_I:
{
UnicodeString s;
UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);

View file

@ -21,6 +21,7 @@
# White space must be present between the flags and the match string.
#
# Capturing parens
".(..)." "<0>a<1>bc</1>d</0>"
".*\A( +hello)" "<0><1> hello</1></0>"