ICU-2422 Regexp, more speed optimizations

X-SVN-Rev: 11402
This commit is contained in:
Andy Heninger 2003-03-27 01:25:20 +00:00
parent 4b469843ee
commit a640031201
6 changed files with 166 additions and 26 deletions

View file

@ -137,7 +137,8 @@ static const UChar gIsWordPattern[] = {
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x38, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x66, 0x39, 0x5d, 0};
static UnicodeSet *gPropSets[URX_LAST_SET];
static UnicodeSet *gPropSets[URX_LAST_SET];
static Regex8BitSet gPropSets8[URX_LAST_SET];
//----------------------------------------------------------------------------------------
@ -229,6 +230,8 @@ static void InitGraphemeClusterSets(UErrorCode &status) {
delete LV;
delete LVT;
}
}
}
@ -281,6 +284,12 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status);
InitGraphemeClusterSets(status);
int32_t i;
for (i=0; i<URX_LAST_SET; i++) {
gPropSets8[i].init(gPropSets[i]);
}
}
@ -346,6 +355,7 @@ void RegexCompile::compile(
// Prepare the RegexPattern object to receive the compiled pattern.
fRXPat->fPattern = pat;
fRXPat->fStaticSets = gPropSets;
fRXPat->fStaticSets8 = gPropSets8;
// Initialize the pattern scanning state machine
@ -529,6 +539,17 @@ void RegexCompile::compile(
stripNOPs();
OptEndingLoop();
//
// Set up fast latin-1 range sets
//
int32_t numSets = fRXPat->fSets->size();
fRXPat->fSets8 = new Regex8BitSet[numSets];
int32_t i;
for (i=0; i<numSets; i++) {
UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
fRXPat->fSets8[i].init(s);
}
//
// A stupid bit of non-sense to prevent code coverage testing from complaining
// about the pattern.dump() debug function. Go through the motions of dumping,
@ -1214,7 +1235,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doBackslashS:
fRXPat->fCompiledPat->addElement(
URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET | URX_NEG_SET), *fStatus);
URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
break;
case doBackslashs:
@ -1224,7 +1245,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doBackslashW:
fRXPat->fCompiledPat->addElement(
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
break;
case doBackslashw:
@ -2215,17 +2236,24 @@ void RegexCompile::matchStartType() {
case URX_STATIC_SETREF:
if (currentLen == 0) {
int32_t sn = URX_VAL(op);
UBool negated = ((sn & URX_NEG_SET) == URX_NEG_SET);
sn &= ~URX_NEG_SET;
U_ASSERT(sn>0 && sn<URX_LAST_SET);
const UnicodeSet *s = fRXPat->fStaticSets[sn];
if (negated) {
UnicodeSet sc(*s);
sc.complement();
fRXPat->fInitialChars->addAll(sc);
} else {
fRXPat->fInitialChars->addAll(*s);
}
fRXPat->fInitialChars->addAll(*s);
numInitialStrings += 2;
}
currentLen++;
atStart = FALSE;
break;
case URX_STAT_SETREF_N:
if (currentLen == 0) {
int32_t sn = URX_VAL(op);
const UnicodeSet *s = fRXPat->fStaticSets[sn];
UnicodeSet sc(*s);
sc.complement();
fRXPat->fInitialChars->addAll(sc);
numInitialStrings += 2;
}
currentLen++;
@ -2482,6 +2510,9 @@ void RegexCompile::matchStartType() {
}
fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
// Sort out what we should check for when looking for candidate match start positions.
// In order of preference,
// 1. Start of input text buffer.
@ -2611,6 +2642,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
//
case URX_ONECHAR:
case URX_STATIC_SETREF:
case URX_STAT_SETREF_N:
case URX_SETREF:
case URX_BACKSLASH_D:
case URX_ONECHAR_I:
@ -2854,6 +2886,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
// Ops that match a max of one character (possibly two 16 bit code units.)
//
case URX_STATIC_SETREF:
case URX_STAT_SETREF_N:
case URX_SETREF:
case URX_BACKSLASH_D:
case URX_ONECHAR_I:
@ -3064,6 +3097,7 @@ void RegexCompile::stripNOPs() {
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_STATIC_SETREF:
case URX_STAT_SETREF_N:
case URX_SETREF:
case URX_DOTANY:
case URX_FAIL:

View file

@ -12,6 +12,8 @@
#ifndef _REGEXIMP_H
#define _REGEXIMP_H
#include "cmemory.h"
U_NAMESPACE_BEGIN
//
@ -152,9 +154,10 @@ enum {
// Param 1: The minimum length of the look-behind match
// Param 2: The max length of the look-behind match
// Param 3: The pattern loc following the look-behind block.
URX_LBN_END = 48 // Negative LookBehind end
URX_LBN_END = 48, // Negative LookBehind end
// Parameter is the data location.
// Check that the match ended at the right spot.
URX_STAT_SETREF_N = 49 // Operand is index of set in array of sets.
};
@ -209,7 +212,9 @@ enum {
"LB_CONT", \
"LB_END", \
"LBN_CONT", \
"LBN_END"
"LBN_END", \
"STAT_SETREF_N" \
//
// Convenience macros for assembling and disassembling a compiled operation.
@ -277,6 +282,38 @@ enum StartOfMatch {
(v)==START_STRING? "START_STRING" : \
"ILLEGAL")
//
// 8 bit set, to fast-path latin-1 set membership tests.
//
struct Regex8BitSet {
    // Rebuild this set from the members of src that lie in the range
    // 0..255.  A NULL src produces an empty set.
    inline void init(const UnicodeSet *src);

    // Return TRUE if c is a member.  c must be in the range 0..255;
    // the value is used directly as an array index with no range check.
    inline UBool contains(UChar32 c) const;

    // Add c to the set.  c must be in the range 0..255; no range check.
    inline void add(UChar32 c);

    // 32 bytes * 8 bits = 256 membership flags.
    // Bit (c & 7) of d[c >> 3] is set when c is a member.
    int8_t d[32];
};

inline UBool Regex8BitSet::contains(UChar32 c) const {
    // No bounds checking!  This is deliberate.
    // Callers are expected to have tested c < 256 before getting here.
    return ((d[c>>3] & (1 << (c&7))) != 0);
}

inline void Regex8BitSet::add(UChar32 c) {
    d[c>>3] |= 1 << (c&7);
}

inline void Regex8BitSet::init(const UnicodeSet *s) {
    uprv_memset(d, 0, sizeof(d));
    if (s != NULL) {
        // Cover the whole 8 bit range, including the last code point 0xff.
        // The loop bound must be inclusive; stopping at 254 would leave 0xff
        // permanently reported as a non-member by contains().
        for (int32_t i=0; i<=255; i++) {
            if (s->contains(i)) {
                this->add(i);
            }
        }
    }
}
U_NAMESPACE_END
#endif

View file

@ -355,7 +355,8 @@ UBool RegexMatcher::find() {
for (;;) {
int32_t pos = startPos;
U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
if (fPattern->fInitialChars->contains(c)) {
if (c<256 && fPattern->fInitialChars8->contains(c) ||
c>=256 && fPattern->fInitialChars->contains(c)) {
MatchAt(pos, fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return FALSE;
@ -1271,9 +1272,16 @@ GC_Done:
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
const UnicodeSet *s = fPattern->fStaticSets[opValue];
if (s->contains(c)) {
success = !success;
if (c < 256) {
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
if (s8->contains(c)) {
success = !success;
}
} else {
const UnicodeSet *s = fPattern->fStaticSets[opValue];
if (s->contains(c)) {
success = !success;
}
}
if (!success) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
@ -1282,16 +1290,53 @@ GC_Done:
break;
case URX_STAT_SETREF_N:
{
// Test input character for NOT being a member of one of
// the predefined sets (Word Characters, for example)
if (fp->fInputIdx >= inputLen) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
if (c < 256) {
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
if (s8->contains(c) == FALSE) {
break;
}
} else {
const UnicodeSet *s = fPattern->fStaticSets[opValue];
if (s->contains(c) == FALSE) {
break;
}
}
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
break;
case URX_SETREF:
if (fp->fInputIdx < inputLen) {
// There is input left. Pick up one char and test it for set membership.
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
U_ASSERT(opValue > 0 && opValue < sets->size());
UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
if (s->contains(c)) {
// The character is in the set. A Match.
break;
if (c<256) {
Regex8BitSet *s8 = &fPattern->fSets8[opValue];
if (s8->contains(c)) {
break;
}
} else {
UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
if (s->contains(c)) {
// The character is in the set. A Match.
break;
}
}
}
// Either at end of input, or the character wasn't in the set.

View file

@ -74,6 +74,8 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fInitialStringIdx = other.fInitialStringIdx;
fInitialStringLen = other.fInitialStringLen;
fInitialChars = new UnicodeSet(*other.fInitialChars);
fInitialChars8 = new Regex8BitSet;
uprv_memcpy(fInitialChars8, other.fInitialChars8, sizeof(Regex8BitSet));
fInitialChar = other.fInitialChar;
// Copy the pattern. It's just values, nothing deep to copy.
@ -97,6 +99,10 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
}
fSets->addElement(newSet, fDeferredStatus);
}
int32_t numSets = other.fSets->size();
fSets8 = new Regex8BitSet[numSets];
uprv_memcpy(fSets8, other.fSets8, numSets*sizeof(Regex8BitSet)); // TODO: give Regex8BitSet some constructors
return *this;
}
@ -119,16 +125,20 @@ void RegexPattern::init() {
fInitialStringIdx = 0;
fInitialStringLen = 0;
fInitialChars = NULL;
fInitialChars8 = NULL;
fInitialChar = 0;
fSets8 = NULL;
fCompiledPat = new UVector32(fDeferredStatus);
fGroupMap = new UVector32(fDeferredStatus);
fSets = new UVector(fDeferredStatus);
fInitialChars = new UnicodeSet;
fInitialChars8 = new Regex8BitSet;
if (U_FAILURE(fDeferredStatus)) {
return;
}
if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || fInitialChars == NULL) {
if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
fInitialChars == NULL || fInitialChars8 == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
@ -160,6 +170,10 @@ void RegexPattern::zap() {
fGroupMap = NULL;
delete fInitialChars;
fInitialChars = NULL;
delete fInitialChars8;
fInitialChars8 = NULL;
delete[] fSets8;
fSets8 = NULL;
}
@ -481,6 +495,7 @@ void RegexPattern::dumpOp(int32_t index) const {
break;
case URX_STATIC_SETREF:
case URX_STAT_SETREF_N:
{
UnicodeString s;
if (val & URX_NEG_SET) {

View file

@ -54,6 +54,7 @@ class UVector;
class UVector32;
class UnicodeSet;
struct REStackFrame;
struct Regex8BitSet;
/**
@ -358,6 +359,8 @@ private:
// after un-escaping, for use during the match.
UVector *fSets; // Any UnicodeSets referenced from the pattern.
Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
UErrorCode fDeferredStatus; // status if some prior error has left this
// RegexPattern in an unusable state.
@ -382,11 +385,15 @@ private:
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
// regex character classes, e.g. Word.
Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
// sets for predefined regex classes.
int32_t fStartType; // Info on how a match must start.
int32_t fInitialStringIdx; //
int32_t fInitialStringLen;
UnicodeSet *fInitialChars;
UChar32 fInitialChar;
Regex8BitSet *fInitialChars8;
/**
* The address of this static class variable serves as this class's ID

View file

@ -299,12 +299,14 @@
"abc*$" "<0>abccc</0>"
"ab(?:ab[xyz]\s)*" "<0>ababy abx </0>abc"
"(?:abc|a)(?:bc)+" "<0>abc</0>"
"(?:(abc)|a)(?:bc)+" "<0>abc</0>"
"(?:(abc)|a)(?:bc)*" "<0><1>abc</1></0>"
"^[+\-]?[0-9]*\.?[0-9]*" "<0>123.456</0>"
#
# Random debugging, Temporary
#
#"^(?:a?b?)*$" d "a--"
#"^(?:a?b?)*$" "a--"
"^(?:a?b?)*$" "a--"
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"