mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-2422 Regexp, more speed optimizations
X-SVN-Rev: 11402
This commit is contained in:
parent
4b469843ee
commit
a640031201
6 changed files with 166 additions and 26 deletions
|
@ -137,7 +137,8 @@ static const UChar gIsWordPattern[] = {
|
|||
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x38, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x66, 0x39, 0x5d, 0};
|
||||
|
||||
|
||||
static UnicodeSet *gPropSets[URX_LAST_SET];
|
||||
static UnicodeSet *gPropSets[URX_LAST_SET];
|
||||
static Regex8BitSet gPropSets8[URX_LAST_SET];
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
|
@ -229,6 +230,8 @@ static void InitGraphemeClusterSets(UErrorCode &status) {
|
|||
delete LV;
|
||||
delete LVT;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -281,6 +284,12 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
|
|||
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status);
|
||||
|
||||
InitGraphemeClusterSets(status);
|
||||
|
||||
int32_t i;
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
gPropSets8[i].init(gPropSets[i]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -346,6 +355,7 @@ void RegexCompile::compile(
|
|||
// Prepare the RegexPattern object to receive the compiled pattern.
|
||||
fRXPat->fPattern = pat;
|
||||
fRXPat->fStaticSets = gPropSets;
|
||||
fRXPat->fStaticSets8 = gPropSets8;
|
||||
|
||||
|
||||
// Initialize the pattern scanning state machine
|
||||
|
@ -529,6 +539,17 @@ void RegexCompile::compile(
|
|||
stripNOPs();
|
||||
OptEndingLoop();
|
||||
|
||||
//
|
||||
// Set up fast latin-1 range sets
|
||||
//
|
||||
int32_t numSets = fRXPat->fSets->size();
|
||||
fRXPat->fSets8 = new Regex8BitSet[numSets];
|
||||
int32_t i;
|
||||
for (i=0; i<numSets; i++) {
|
||||
UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
|
||||
fRXPat->fSets8[i].init(s);
|
||||
}
|
||||
|
||||
//
|
||||
// A stupid bit of nonsense to prevent code coverage testing from complaining
|
||||
// about the pattern.dump() debug function. Go through the motions of dumping,
|
||||
|
@ -1214,7 +1235,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
case doBackslashS:
|
||||
fRXPat->fCompiledPat->addElement(
|
||||
URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET | URX_NEG_SET), *fStatus);
|
||||
URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashs:
|
||||
|
@ -1224,7 +1245,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
case doBackslashW:
|
||||
fRXPat->fCompiledPat->addElement(
|
||||
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
|
||||
URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
|
||||
break;
|
||||
|
||||
case doBackslashw:
|
||||
|
@ -2215,17 +2236,24 @@ void RegexCompile::matchStartType() {
|
|||
case URX_STATIC_SETREF:
|
||||
if (currentLen == 0) {
|
||||
int32_t sn = URX_VAL(op);
|
||||
UBool negated = ((sn & URX_NEG_SET) == URX_NEG_SET);
|
||||
sn &= ~URX_NEG_SET;
|
||||
|
||||
U_ASSERT(sn>0 && sn<URX_LAST_SET);
|
||||
const UnicodeSet *s = fRXPat->fStaticSets[sn];
|
||||
if (negated) {
|
||||
UnicodeSet sc(*s);
|
||||
sc.complement();
|
||||
fRXPat->fInitialChars->addAll(sc);
|
||||
} else {
|
||||
fRXPat->fInitialChars->addAll(*s);
|
||||
}
|
||||
fRXPat->fInitialChars->addAll(*s);
|
||||
numInitialStrings += 2;
|
||||
}
|
||||
currentLen++;
|
||||
atStart = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_STAT_SETREF_N:
|
||||
if (currentLen == 0) {
|
||||
int32_t sn = URX_VAL(op);
|
||||
const UnicodeSet *s = fRXPat->fStaticSets[sn];
|
||||
UnicodeSet sc(*s);
|
||||
sc.complement();
|
||||
fRXPat->fInitialChars->addAll(sc);
|
||||
numInitialStrings += 2;
|
||||
}
|
||||
currentLen++;
|
||||
|
@ -2482,6 +2510,9 @@ void RegexCompile::matchStartType() {
|
|||
}
|
||||
|
||||
|
||||
fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
|
||||
|
||||
|
||||
// Sort out what we should check for when looking for candidate match start positions.
|
||||
// In order of preference,
|
||||
// 1. Start of input text buffer.
|
||||
|
@ -2611,6 +2642,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
|||
//
|
||||
case URX_ONECHAR:
|
||||
case URX_STATIC_SETREF:
|
||||
case URX_STAT_SETREF_N:
|
||||
case URX_SETREF:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_ONECHAR_I:
|
||||
|
@ -2854,6 +2886,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
|||
// Ops that match a max of one character (possibly two 16 bit code units.)
|
||||
//
|
||||
case URX_STATIC_SETREF:
|
||||
case URX_STAT_SETREF_N:
|
||||
case URX_SETREF:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_ONECHAR_I:
|
||||
|
@ -3064,6 +3097,7 @@ void RegexCompile::stripNOPs() {
|
|||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_STATIC_SETREF:
|
||||
case URX_STAT_SETREF_N:
|
||||
case URX_SETREF:
|
||||
case URX_DOTANY:
|
||||
case URX_FAIL:
|
||||
|
|
|
@ -12,6 +12,8 @@
|
|||
#ifndef _REGEXIMP_H
|
||||
#define _REGEXIMP_H
|
||||
|
||||
#include "cmemory.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//
|
||||
|
@ -152,9 +154,10 @@ enum {
|
|||
// Param 1: The minimum length of the look-behind match
|
||||
// Param 2: The max length of the look-behind match
|
||||
// Param 3: The pattern loc following the look-behind block.
|
||||
URX_LBN_END = 48 // Negative LookBehind end
|
||||
URX_LBN_END = 48, // Negative LookBehind end
|
||||
// Parameter is the data location.
|
||||
// Check that the match ended at the right spot.
|
||||
URX_STAT_SETREF_N = 49 // Operand is index of set in array of sets.
|
||||
|
||||
};
|
||||
|
||||
|
@ -209,7 +212,9 @@ enum {
|
|||
"LB_CONT", \
|
||||
"LB_END", \
|
||||
"LBN_CONT", \
|
||||
"LBN_END"
|
||||
"LBN_END", \
|
||||
"STAT_SETREF_N" \
|
||||
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
|
@ -277,6 +282,38 @@ enum StartOfMatch {
|
|||
(v)==START_STRING? "START_STRING" : \
|
||||
"ILLEGAL")
|
||||
|
||||
|
||||
//
|
||||
// 8 bit set, to fast-path latin-1 set membership tests.
|
||||
//
|
||||
struct Regex8BitSet {
|
||||
inline void init(const UnicodeSet *src);
|
||||
inline UBool contains(UChar32 c);
|
||||
inline void add(UChar32 c);
|
||||
int8_t d[32];
|
||||
};
|
||||
|
||||
// Report whether c has its bit set in the bitmap.
//
// No bounds checking!  This is deliberate: the storage is only 32 bytes, so
// the caller is responsible for passing a value in the range 0..255.
inline UBool Regex8BitSet::contains(UChar32 c) {
    const int slot = c >> 3;         // byte that holds the bit for c
    const int mask = 1 << (c & 7);   // position of the bit inside that byte
    return (d[slot] & mask) != 0;
}
|
||||
|
||||
// Set the bit for c in the bitmap.
// No range check is made on c; it must index within the 32-byte table.
inline void Regex8BitSet::add(UChar32 c) {
    const int slot = c >> 3;         // byte that holds the bit for c
    const int mask = 1 << (c & 7);   // position of the bit inside that byte
    d[slot] |= mask;
}
|
||||
|
||||
inline void Regex8BitSet::init(const UnicodeSet *s) {
|
||||
uprv_memset(d, 0, sizeof(d));
|
||||
if (s != NULL) {
|
||||
for (int i=0; i<255; i++) {
|
||||
if (s->contains(i)) {
|
||||
this->add(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
||||
|
|
|
@ -355,7 +355,8 @@ UBool RegexMatcher::find() {
|
|||
for (;;) {
|
||||
int32_t pos = startPos;
|
||||
U16_NEXT(inputBuf, startPos, inputLen, c); // like c = inputBuf[startPos++];
|
||||
if (fPattern->fInitialChars->contains(c)) {
|
||||
if (c<256 && fPattern->fInitialChars8->contains(c) ||
|
||||
c>=256 && fPattern->fInitialChars->contains(c)) {
|
||||
MatchAt(pos, fDeferredStatus);
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
return FALSE;
|
||||
|
@ -1271,9 +1272,16 @@ GC_Done:
|
|||
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c)) {
|
||||
success = !success;
|
||||
if (c < 256) {
|
||||
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
|
||||
if (s8->contains(c)) {
|
||||
success = !success;
|
||||
}
|
||||
} else {
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c)) {
|
||||
success = !success;
|
||||
}
|
||||
}
|
||||
if (!success) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
|
@ -1282,16 +1290,53 @@ GC_Done:
|
|||
break;
|
||||
|
||||
|
||||
case URX_STAT_SETREF_N:
|
||||
{
|
||||
// Test input character for NOT being a member of one of
|
||||
// the predefined sets (Word Characters, for example)
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
|
||||
U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
if (c < 256) {
|
||||
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
|
||||
if (s8->contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_SETREF:
|
||||
if (fp->fInputIdx < inputLen) {
|
||||
// There is input left. Pick up one char and test it for set membership.
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
U_ASSERT(opValue > 0 && opValue < sets->size());
|
||||
UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
|
||||
if (s->contains(c)) {
|
||||
// The character is in the set. A Match.
|
||||
break;
|
||||
if (c<256) {
|
||||
Regex8BitSet *s8 = &fPattern->fSets8[opValue];
|
||||
if (s8->contains(c)) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
|
||||
UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
|
||||
if (s->contains(c)) {
|
||||
// The character is in the set. A Match.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Either at end of input, or the character wasn't in the set.
|
||||
|
|
|
@ -74,6 +74,8 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fInitialStringIdx = other.fInitialStringIdx;
|
||||
fInitialStringLen = other.fInitialStringLen;
|
||||
fInitialChars = new UnicodeSet(*other.fInitialChars);
|
||||
fInitialChars8 = new Regex8BitSet;
|
||||
uprv_memcpy(fInitialChars8, other.fInitialChars8, sizeof(Regex8BitSet));
|
||||
fInitialChar = other.fInitialChar;
|
||||
|
||||
// Copy the pattern. It's just values, nothing deep to copy.
|
||||
|
@ -97,6 +99,10 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
}
|
||||
fSets->addElement(newSet, fDeferredStatus);
|
||||
}
|
||||
|
||||
int32_t numSets = other.fSets->size();
|
||||
fSets8 = new Regex8BitSet[numSets];
|
||||
uprv_memcpy(fSets8, other.fSets8, numSets*sizeof(Regex8BitSet)); // TODO: give Regex8BitSet some constructors
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -119,16 +125,20 @@ void RegexPattern::init() {
|
|||
fInitialStringIdx = 0;
|
||||
fInitialStringLen = 0;
|
||||
fInitialChars = NULL;
|
||||
fInitialChars8 = NULL;
|
||||
fInitialChar = 0;
|
||||
fSets8 = NULL;
|
||||
|
||||
fCompiledPat = new UVector32(fDeferredStatus);
|
||||
fGroupMap = new UVector32(fDeferredStatus);
|
||||
fSets = new UVector(fDeferredStatus);
|
||||
fInitialChars = new UnicodeSet;
|
||||
fInitialChars8 = new Regex8BitSet;
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
return;
|
||||
}
|
||||
if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || fInitialChars == NULL) {
|
||||
if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
|
||||
fInitialChars == NULL || fInitialChars8 == NULL) {
|
||||
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
@ -160,6 +170,10 @@ void RegexPattern::zap() {
|
|||
fGroupMap = NULL;
|
||||
delete fInitialChars;
|
||||
fInitialChars = NULL;
|
||||
delete fInitialChars8;
|
||||
fInitialChars8 = NULL;
|
||||
delete[] fSets8;
|
||||
fSets8 = NULL;
|
||||
}
|
||||
|
||||
|
||||
|
@ -481,6 +495,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
break;
|
||||
|
||||
case URX_STATIC_SETREF:
|
||||
case URX_STAT_SETREF_N:
|
||||
{
|
||||
UnicodeString s;
|
||||
if (val & URX_NEG_SET) {
|
||||
|
|
|
@ -54,6 +54,7 @@ class UVector;
|
|||
class UVector32;
|
||||
class UnicodeSet;
|
||||
struct REStackFrame;
|
||||
struct Regex8BitSet;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -358,6 +359,8 @@ private:
|
|||
// after un-escaping, for use during the match.
|
||||
|
||||
UVector *fSets; // Any UnicodeSets referenced from the pattern.
|
||||
Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
|
||||
|
||||
|
||||
UErrorCode fDeferredStatus; // status if some prior error has left this
|
||||
// RegexPattern in an unusable state.
|
||||
|
@ -382,11 +385,15 @@ private:
|
|||
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
||||
// regex character classes, e.g. Word.
|
||||
|
||||
Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
|
||||
// sets for predefined regex classes.
|
||||
|
||||
int32_t fStartType; // Info on how a match must start.
|
||||
int32_t fInitialStringIdx; //
|
||||
int32_t fInitialStringLen;
|
||||
UnicodeSet *fInitialChars;
|
||||
UChar32 fInitialChar;
|
||||
Regex8BitSet *fInitialChars8;
|
||||
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
|
|
6
icu4c/source/test/testdata/regextst.txt
vendored
6
icu4c/source/test/testdata/regextst.txt
vendored
|
@ -299,12 +299,14 @@
|
|||
"abc*$" "<0>abccc</0>"
|
||||
"ab(?:ab[xyz]\s)*" "<0>ababy abx </0>abc"
|
||||
|
||||
"(?:abc|a)(?:bc)+" "<0>abc</0>"
|
||||
"(?:(abc)|a)(?:bc)+" "<0>abc</0>"
|
||||
"(?:(abc)|a)(?:bc)*" "<0><1>abc</1></0>"
|
||||
"^[+\-]?[0-9]*\.?[0-9]*" "<0>123.456</0>"
|
||||
|
||||
#
|
||||
# Random debugging, Temporary
|
||||
#
|
||||
#"^(?:a?b?)*$" d "a--"
|
||||
#"^(?:a?b?)*$" "a--"
|
||||
"^(?:a?b?)*$" "a--"
|
||||
|
||||
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"
|
||||
|
|
Loading…
Add table
Reference in a new issue