mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-2422 Regexp, more speed optimizations
X-SVN-Rev: 11412
This commit is contained in:
parent
a4a223f056
commit
2e7a2dd624
6 changed files with 209 additions and 13 deletions
|
@ -1023,6 +1023,11 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// 3. JMP_SAV 2
|
||||
// 4. ...
|
||||
//
|
||||
// Or, if the body is a simple [Set] or single char literal,
|
||||
// 1. LOOP_SR_I set number
|
||||
// 2. LOOP_C stack location
|
||||
// ...
|
||||
//
|
||||
// Or, if the body can match a zero-length string, to inhibit infinite loops,
|
||||
// 1. STATE_SAVE 6
|
||||
// 2. STO_INP_LOC data-loc
|
||||
|
@ -1032,9 +1037,26 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// 6. ...
|
||||
{
|
||||
// location of item #1, the STATE_SAVE
|
||||
int32_t saveStateLoc = blockTopLoc(TRUE);
|
||||
int32_t topLoc = blockTopLoc(FALSE);
|
||||
int32_t dataLoc = -1;
|
||||
|
||||
// Check for simple [set]*, which get special optimized code.
|
||||
if (topLoc == fRXPat->fCompiledPat->size() - 1) {
|
||||
int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
|
||||
if (URX_TYPE(repeatedOp) == URX_SETREF) {
|
||||
int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
|
||||
fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
|
||||
dataLoc = fRXPat->fFrameSize;
|
||||
fRXPat->fFrameSize++;
|
||||
int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
|
||||
fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for minimum match length of zero, which requires
|
||||
// extra loop-breaking code.
|
||||
int32_t saveStateLoc = blockTopLoc(TRUE);
|
||||
if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
|
||||
insertOp(saveStateLoc);
|
||||
dataLoc = fRXPat->fFrameSize;
|
||||
|
@ -1128,7 +1150,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
case doInterval:
|
||||
// Finished scanning a normal {lower,upper} interval. Generate the code for it.
|
||||
compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
|
||||
if (compileInlineInterval() == FALSE) {
|
||||
compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
|
||||
}
|
||||
break;
|
||||
|
||||
case doPossesiveInterval:
|
||||
|
@ -2119,6 +2143,61 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
|
|||
|
||||
|
||||
|
||||
//
//  compileInlineInterval    Attempt to compile a {low,upper} interval as straight-line
//                           code: the repeated op is emitted once per allowed match,
//                           with a STATE_SAVE ahead of every copy beyond the required
//                           minimum.  Each STATE_SAVE targets the first location past
//                           the whole inline sequence.
//
//                           Returns TRUE when the interval has been fully handled here,
//                           FALSE when the caller must generate looping code instead.
//
UBool RegexCompile::compileInlineInterval() {
    // Refuse intervals that are too big to inline.  The lower > upper test
    // picks up both an unbounded upper limit and erroneous intervals.
    if (fIntervalLow > fIntervalUpper || fIntervalUpper > 10) {
        return FALSE;
    }

    const int32_t blockStart = blockTopLoc(FALSE);

    // Pathological {0,0}: discard the block, as if it had never been there.
    if (fIntervalUpper == 0) {
        fRXPat->fCompiledPat->setSize(blockStart);
        return TRUE;
    }

    // Only a block consisting of a single op is replicated inline.
    // An upper limit of one is the exception: the one copy of the block
    // that already exists in the pattern is all that is required, however
    // complex that block happens to be.
    const int32_t lastOpLoc = fRXPat->fCompiledPat->size() - 1;
    if (blockStart != lastOpLoc && fIntervalUpper != 1) {
        return FALSE;
    }

    // The op that will be replicated.
    const int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(blockStart);

    // Work out where the finished inline sequence will end, and build the
    // STATE_SAVE that jumps there.
    const int32_t optionalCount = fIntervalUpper - fIntervalLow;
    const int32_t sequenceEnd   = lastOpLoc + fIntervalUpper + optionalCount;
    const int32_t stateSaveOp   = URX_BUILD(URX_STATE_SAVE, sequenceEnd);

    // With no required matches, even the already-present first copy is optional:
    // open up a slot in front of it for a STATE_SAVE.
    if (fIntervalLow == 0) {
        insertOp(blockStart);
        fRXPat->fCompiledPat->setElementAt(stateSaveOp, blockStart);
    }

    // Emit the remaining copies.  Counting begins at one because the first
    // instance of the op was placed in the pattern when it was originally scanned.
    for (int32_t copy = 1; copy < fIntervalUpper; ++copy) {
        if (copy >= fIntervalLow) {
            // This copy is optional; failing to match it is not an error.
            fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
        }
        fRXPat->fCompiledPat->addElement(repeatedOp, *fStatus);
    }
    return TRUE;
}
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
|
@ -2451,6 +2530,12 @@ void RegexCompile::matchStartType() {
|
|||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
case URX_LOOP_SR_I:
|
||||
case URX_LOOP_C:
|
||||
// More loop ops. These state-save to themselves.
|
||||
// don't change the minimum match
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
case URX_LA_START:
|
||||
|
@ -2735,6 +2820,11 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
|||
// The jump is conditional, backwards only.
|
||||
break;
|
||||
|
||||
case URX_LOOP_SR_I:
|
||||
case URX_LOOP_C:
|
||||
// More loop ops. These state-save to themselves.
|
||||
// don't change the minimum match
|
||||
break;
|
||||
|
||||
|
||||
case URX_LA_START:
|
||||
|
@ -2966,8 +3056,9 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
|||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_LOOP_SR_I:
|
||||
case URX_LOOP_C:
|
||||
// For anything to do with loops, make the match length unbounded.
|
||||
// TODO, possibly later, special case short loops like {0,1}.
|
||||
// Note: INIT instructions are multi-word. Can ignore because
|
||||
// INT32_MAX length will stop the per-instruction loop.
|
||||
currentLen = INT32_MAX;
|
||||
|
@ -3129,6 +3220,8 @@ void RegexCompile::stripNOPs() {
|
|||
case URX_LB_END:
|
||||
case URX_LBN_CONT:
|
||||
case URX_LBN_END:
|
||||
case URX_LOOP_SR_I:
|
||||
case URX_LOOP_C:
|
||||
// These instructions are unaltered by the relocation.
|
||||
fRXPat->fCompiledPat->setElementAt(op, dst);
|
||||
dst++;
|
||||
|
@ -3207,6 +3300,8 @@ void RegexCompile::OptEndingLoop() {
|
|||
case URX_LD_SP:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_START_CAPTURE:
|
||||
case URX_LOOP_SR_I:
|
||||
case URX_LOOP_C:
|
||||
// These ops do a state save.
|
||||
// Can not do the optimization.
|
||||
return;
|
||||
|
|
|
@ -99,6 +99,7 @@ private:
|
|||
// a reference to a UnicodeSet.
|
||||
void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier.
|
||||
int32_t LoopOp);
|
||||
UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier
|
||||
void literalChar(UChar32 c); // Compile a literal char
|
||||
void fixLiterals(UBool split=FALSE); // Fix literal strings.
|
||||
void insertOp(int32_t where); // Open up a slot for a new op in the
|
||||
|
|
|
@ -157,7 +157,13 @@ enum {
|
|||
URX_LBN_END = 48, // Negative LookBehind end
|
||||
// Parameter is the data location.
|
||||
// Check that the match ended at the right spot.
|
||||
URX_STAT_SETREF_N = 49 // Operand is index of set in array of sets.
|
||||
URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated
|
||||
// Operand is index of set in array of sets.
|
||||
URX_LOOP_SR_I = 50, // Init a [set]* loop.
|
||||
// Operand is the set's index in array of user sets.
|
||||
URX_LOOP_C = 51 // Continue a [set]* or OneChar* loop.
|
||||
// Operand is a matcher static data location.
|
||||
// Must always immediately follow LOOP_x_I instruction.
|
||||
|
||||
};
|
||||
|
||||
|
@ -213,7 +219,9 @@ enum {
|
|||
"LB_END", \
|
||||
"LBN_CONT", \
|
||||
"LBN_END", \
|
||||
"STAT_SETREF_N" \
|
||||
"STAT_SETREF_N", \
|
||||
"LOOP_SR_I", \
|
||||
"LOOP_C"
|
||||
|
||||
|
||||
//
|
||||
|
@ -287,12 +295,18 @@ enum StartOfMatch {
|
|||
// 8 bit set, to fast-path latin-1 set membership tests.
|
||||
//
|
||||
struct Regex8BitSet {
|
||||
inline Regex8BitSet();
|
||||
inline void operator = (const Regex8BitSet &s);
|
||||
inline void init(const UnicodeSet *src);
|
||||
inline UBool contains(UChar32 c);
|
||||
inline void add(UChar32 c);
|
||||
int8_t d[32];
|
||||
};
|
||||
|
||||
// Default constructor.  Zero-fills the whole bitmap, so a newly created
// set has no members.
inline Regex8BitSet::Regex8BitSet() {
    uprv_memset(d, 0, sizeof d);
}
|
||||
|
||||
inline UBool Regex8BitSet::contains(UChar32 c) {
|
||||
// No bounds checking! This is deliberate.
|
||||
return ((d[c>>3] & 1 <<(c&7)) != 0);
|
||||
|
@ -303,7 +317,6 @@ inline void Regex8BitSet::add(UChar32 c) {
|
|||
};
|
||||
|
||||
inline void Regex8BitSet::init(const UnicodeSet *s) {
|
||||
uprv_memset(d, 0, sizeof(d));
|
||||
if (s != NULL) {
|
||||
for (int i=0; i<255; i++) {
|
||||
if (s->contains(i)) {
|
||||
|
@ -313,6 +326,10 @@ inline void Regex8BitSet::init(const UnicodeSet *s) {
|
|||
}
|
||||
}
|
||||
|
||||
// Assignment.  Copies the complete bitmap from s into this set.
inline void Regex8BitSet::operator = (const Regex8BitSet &s) {
    // Skip the copy on self-assignment: source and destination would be the
    // same buffer, and (assuming uprv_memcpy forwards to the C memcpy) copying
    // between overlapping regions is undefined behavior.
    if (this != &s) {
        uprv_memcpy(d, s.d, sizeof(d));
    }
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
|
|
@ -1922,6 +1922,87 @@ GC_Done:
|
|||
break;
|
||||
|
||||
|
||||
case URX_LOOP_SR_I:
|
||||
// Loop Initialization for the optimized implementation of
|
||||
// [some character set]*
|
||||
// This op scans through all matching input.
|
||||
// The following LOOP_C op emulates stack unwinding if the following pattern fails.
|
||||
{
|
||||
U_ASSERT(opValue > 0 && opValue < sets->size());
|
||||
Regex8BitSet *s8 = &fPattern->fSets8[opValue];
|
||||
UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
|
||||
|
||||
// Loop through input, until either the input is exhausted or
|
||||
// we reach a character that is not a member of the set.
|
||||
int32_t ix = fp->fInputIdx;
|
||||
for (;;) {
|
||||
if (ix >= inputLen) {
|
||||
break;
|
||||
}
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, ix, inputLen, c);
|
||||
if (c<256) {
|
||||
if (s8->contains(c) == FALSE) {
|
||||
U16_BACK_1(inputBuf, 0, ix);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (s->contains(c) == FALSE) {
|
||||
U16_BACK_1(inputBuf, 0, ix);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there were no matching characters, skip over the loop altogether.
|
||||
// The loop doesn't run at all, a * op always succeeds.
|
||||
if (ix == fp->fInputIdx) {
|
||||
fp->fPatIdx++; // skip the URX_LOOP_C op.
|
||||
break;
|
||||
}
|
||||
|
||||
// Peek ahead in the compiled pattern, to the URX_LOOP_C that
|
||||
// must follow. Its operand is the stack location
|
||||
// that holds the starting input index for the match of this [set]*
|
||||
int32_t loopcOp = pat[fp->fPatIdx];
|
||||
U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
|
||||
int32_t stackLoc = URX_VAL(loopcOp);
|
||||
U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
|
||||
fp->fExtra[stackLoc] = fp->fInputIdx;
|
||||
fp->fInputIdx = ix;
|
||||
|
||||
// Save State to the URX_LOOP_C op that follows this one,
|
||||
// so that match failures in the following code will return to there.
|
||||
// Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
|
||||
fp = StateSave(fp, fp->fPatIdx, frameSize, status);
|
||||
fp->fPatIdx++;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_LOOP_C:
|
||||
{
|
||||
U_ASSERT(opValue>=0 && opValue<frameSize);
|
||||
int32_t terminalIdx = fp->fExtra[opValue];
|
||||
U_ASSERT(terminalIdx <= fp->fInputIdx);
|
||||
if (terminalIdx == fp->fInputIdx) {
|
||||
// We've backed up the input idx to the point that the loop started.
|
||||
// The loop is done. Leave here without saving state.
|
||||
// Subsequent failures won't come back here.
|
||||
break;
|
||||
}
|
||||
// Set up for the next iteration of the loop, with input index
|
||||
// backed up by one from the last time through,
|
||||
// and a state save to this instruction in case the following code fails again.
|
||||
// (We're going backwards because this loop emulates stack unwinding, not
|
||||
// the initial scan forward.)
|
||||
U_ASSERT(fp->fInputIdx > 0);
|
||||
U16_BACK_1(inputBuf, 0, fp->fInputIdx);
|
||||
fp = StateSave(fp, fp->fPatIdx-1, frameSize, status);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
default:
|
||||
// Trouble. The compiled pattern contains an entry with an
|
||||
|
|
|
@ -73,9 +73,8 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fStartType = other.fStartType;
|
||||
fInitialStringIdx = other.fInitialStringIdx;
|
||||
fInitialStringLen = other.fInitialStringLen;
|
||||
fInitialChars = new UnicodeSet(*other.fInitialChars);
|
||||
fInitialChars8 = new Regex8BitSet;
|
||||
uprv_memcpy(fInitialChars8, other.fInitialChars8, sizeof(Regex8BitSet));
|
||||
*fInitialChars = *other.fInitialChars;
|
||||
*fInitialChars8 = *other.fInitialChars8;
|
||||
fInitialChar = other.fInitialChar;
|
||||
|
||||
// Copy the pattern. It's just values, nothing deep to copy.
|
||||
|
@ -87,7 +86,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
// but I doubt that pattern copying will be particularly common.
|
||||
// Note: init() already added an empty element zero to fSets
|
||||
int32_t i;
|
||||
for (i=1; i<other.fSets->size(); i++) {
|
||||
int32_t numSets = other.fSets->size();
|
||||
fSets8 = new Regex8BitSet[numSets];
|
||||
for (i=1; i<numSets; i++) {
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
return *this;
|
||||
}
|
||||
|
@ -98,11 +99,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
break;
|
||||
}
|
||||
fSets->addElement(newSet, fDeferredStatus);
|
||||
fSets8[i] = other.fSets8[i];
|
||||
}
|
||||
|
||||
int32_t numSets = other.fSets->size();
|
||||
fSets8 = new Regex8BitSet[numSets];
|
||||
uprv_memcpy(fSets8, other.fSets8, numSets*sizeof(Regex8BitSet)); // TODO: give Regex8BitSet some constructors
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -459,6 +458,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
case URX_LB_END:
|
||||
case URX_LBN_CONT:
|
||||
case URX_LBN_END:
|
||||
case URX_LOOP_C:
|
||||
// types with an integer operand field.
|
||||
REGEX_DUMP_DEBUG_PRINTF("%d", val);
|
||||
break;
|
||||
|
@ -484,6 +484,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
break;
|
||||
|
||||
case URX_SETREF:
|
||||
case URX_LOOP_SR_I:
|
||||
{
|
||||
UnicodeString s;
|
||||
UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
|
||||
|
|
1
icu4c/source/test/testdata/regextst.txt
vendored
1
icu4c/source/test/testdata/regextst.txt
vendored
|
@ -21,6 +21,7 @@
|
|||
# White space must be present between the flags and the match string.
|
||||
#
|
||||
|
||||
|
||||
# Capturing parens
|
||||
".(..)." "<0>a<1>bc</1>d</0>"
|
||||
".*\A( +hello)" "<0><1> hello</1></0>"
|
||||
|
|
Loading…
Add table
Reference in a new issue