ICU-2422 Regexp, more speed optimizations

X-SVN-Rev: 11402
2025-04-07 22:44:49 +00:00 · 2003-03-27 01:25:20 +00:00 · 2003-03-27 01:25:20 +00:00 · a640031201
commit a640031201
parent 4b469843ee
6 changed files with 166 additions and 26 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -137,7 +137,8 @@ static const UChar gIsWordPattern[] = {
    0x5b, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x38, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x66, 0x39, 0x5d, 0}; 


-static UnicodeSet *gPropSets[URX_LAST_SET];
+static UnicodeSet  *gPropSets[URX_LAST_SET];
+static Regex8BitSet gPropSets8[URX_LAST_SET];


 //----------------------------------------------------------------------------------------
@ -229,6 +230,8 @@ static void InitGraphemeClusterSets(UErrorCode &status) {
            delete LV;
            delete LVT;
        }
+
+        
    }
 }

@ -281,6 +284,12 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
    ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET],          gIsSpacePattern,             status);    

    InitGraphemeClusterSets(status);
+
+    int32_t i;
+    for (i=0; i<URX_LAST_SET; i++) {
+        gPropSets8[i].init(gPropSets[i]);
+    }
+  
 }


@ -346,6 +355,7 @@ void    RegexCompile::compile(
    // Prepare the RegexPattern object to receive the compiled pattern.
    fRXPat->fPattern        = pat;
    fRXPat->fStaticSets     = gPropSets;
+    fRXPat->fStaticSets8    = gPropSets8;


    // Initialize the pattern scanning state machine
@ -529,6 +539,17 @@ void    RegexCompile::compile(
    stripNOPs();
    OptEndingLoop();

+    //
+    // Set up fast latin-1 range sets
+    //
+    int32_t numSets = fRXPat->fSets->size();
+    fRXPat->fSets8 = new Regex8BitSet[numSets];
+    int32_t i;
+    for (i=0; i<numSets; i++) {
+        UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
+        fRXPat->fSets8[i].init(s);
+    }
+
    //
    // A stupid bit of non-sense to prevent code coverage testing from complaining
    //   about the pattern.dump() debug function.  Go through the motions of dumping,
@ -1214,7 +1235,7 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doBackslashS:
        fRXPat->fCompiledPat->addElement(
-            URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET | URX_NEG_SET), *fStatus);
+            URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
        break;

    case doBackslashs:
@ -1224,7 +1245,7 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doBackslashW:
        fRXPat->fCompiledPat->addElement(
-            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
+            URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
        break;

    case doBackslashw:
@ -2215,17 +2236,24 @@ void   RegexCompile::matchStartType() {
        case URX_STATIC_SETREF:    
            if (currentLen == 0) {
                int32_t  sn = URX_VAL(op);
-                UBool negated = ((sn & URX_NEG_SET) == URX_NEG_SET);  
-                sn &= ~URX_NEG_SET;
-
+                U_ASSERT(sn>0 && sn<URX_LAST_SET);
                const UnicodeSet *s = fRXPat->fStaticSets[sn];
-                if (negated) {
-                    UnicodeSet sc(*s);
-                    sc.complement();
-                    fRXPat->fInitialChars->addAll(sc);
-                } else {
-                    fRXPat->fInitialChars->addAll(*s);
-                }
+                fRXPat->fInitialChars->addAll(*s);
+                numInitialStrings += 2;
+            }
+            currentLen++;
+            atStart = FALSE;
+            break;
+
+
+
+        case URX_STAT_SETREF_N:    
+            if (currentLen == 0) {
+                int32_t  sn = URX_VAL(op);
+                const UnicodeSet *s = fRXPat->fStaticSets[sn];
+                UnicodeSet sc(*s);
+                sc.complement();
+                fRXPat->fInitialChars->addAll(sc);
                numInitialStrings += 2;
            }
            currentLen++;
@ -2482,6 +2510,9 @@ void   RegexCompile::matchStartType() {
    }


+    fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
+
+
    // Sort out what we should check for when looking for candidate match start positions.
    // In order of preference,
    //     1.   Start of input text buffer.
@ -2611,6 +2642,7 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
            //   
        case URX_ONECHAR:
        case URX_STATIC_SETREF:
+        case URX_STAT_SETREF_N:
        case URX_SETREF:
        case URX_BACKSLASH_D:
        case URX_ONECHAR_I:
@ -2854,6 +2886,7 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
            // Ops that match a max of one character (possibly two 16 bit code units.)
            //   
        case URX_STATIC_SETREF:
+        case URX_STAT_SETREF_N:
        case URX_SETREF:
        case URX_BACKSLASH_D:
        case URX_ONECHAR_I:
@ -3064,6 +3097,7 @@ void RegexCompile::stripNOPs() {
        case URX_START_CAPTURE:
        case URX_END_CAPTURE:
        case URX_STATIC_SETREF:
+        case URX_STAT_SETREF_N:
        case URX_SETREF:
        case URX_DOTANY:
        case URX_FAIL:
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -12,6 +12,8 @@
 #ifndef _REGEXIMP_H
 #define _REGEXIMP_H

+#include "cmemory.h"
+
 U_NAMESPACE_BEGIN

 //
@ -152,9 +154,10 @@ enum {
                               //   Param 1:  The minimum length of the look-behind match
                               //   Param 2:  The max     length of the look-behind match
                               //   Param 3:  The pattern loc following the look-behind block.
-     URX_LBN_END       = 48    // Negative LookBehind end
+     URX_LBN_END       = 48,   // Negative LookBehind end
                               //   Parameter is the data location.
                               //   Check that the match ended at the right spot.
+     URX_STAT_SETREF_N = 49    // Operand is index of set in array of sets.   

 };           

@ -209,7 +212,9 @@ enum {
        "LB_CONT",             \
        "LB_END",              \
        "LBN_CONT",            \
-        "LBN_END"
+        "LBN_END",             \
+        "STAT_SETREF_N"        \
+

 //
 //  Convenience macros for assembling and disassembling a compiled operation.
@ -277,6 +282,38 @@ enum StartOfMatch {
                               (v)==START_STRING?  "START_STRING"  : \
                                                   "ILLEGAL")
    
+
+//
+//  8 bit set, to fast-path latin-1 set membership tests.
+//  One bit per code point 0x00..0xFF: bit (c&7) of byte d[c>>3].
+//
+struct Regex8BitSet {
+    inline void init(const UnicodeSet *src);   // rebuild from src; NULL gives an empty set
+    inline UBool contains(UChar32 c);          // c MUST be in 0..255, it is not checked
+    inline void  add(UChar32 c);               // c MUST be in 0..255, it is not checked
+    int8_t d[32];                              // 32 bytes * 8 bits = 256 membership flags
+};
+
+inline UBool Regex8BitSet::contains(UChar32 c) {
+    // No bounds checking!  This is deliberate; the matcher tests c<256 before calling.
+    return (d[c>>3] & (1 << (c&7))) != 0;
+}
+
+inline void  Regex8BitSet::add(UChar32 c) {
+    d[c>>3] |= 1 << (c&7);
+}
+
+inline void Regex8BitSet::init(const UnicodeSet *s) {
+    uprv_memset(d, 0, sizeof(d));
+    if (s != NULL) {
+        // Cover 0x00..0xFF inclusive: the matcher uses this set for every c<256, U+00FF too.
+        for (int i=0; i<256; i++) {
+            if (s->contains(i)) { this->add(i); }
+        }
+    }
+}
+
+
 U_NAMESPACE_END
 #endif

--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -355,7 +355,8 @@ UBool RegexMatcher::find() {
            for (;;) {
                int32_t pos = startPos;
                U16_NEXT(inputBuf, startPos, inputLen, c);  // like c = inputBuf[startPos++];
-                if (fPattern->fInitialChars->contains(c)) {
+                if (c<256 && fPattern->fInitialChars8->contains(c) ||
+                    c>=256 && fPattern->fInitialChars->contains(c)) {
                    MatchAt(pos, fDeferredStatus);
                    if (U_FAILURE(fDeferredStatus)) {
                        return FALSE;
@ -1271,9 +1272,16 @@ GC_Done:
                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
                UChar32  c;
                U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
-                const UnicodeSet *s = fPattern->fStaticSets[opValue];
-                if (s->contains(c)) {
-                    success = !success;
+                if (c < 256) {
+                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
+                    if (s8->contains(c)) {
+                        success = !success;
+                    }
+                } else {
+                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
+                    if (s->contains(c)) {
+                        success = !success;
+                    }
                }
                if (!success) {
                    fp = (REStackFrame *)fStack->popFrame(frameSize);
@ -1282,16 +1290,53 @@ GC_Done:
            break;
            

+        case URX_STAT_SETREF_N:
+            {
+                // Test input character for NOT being a member of  one of 
+                //    the predefined sets (Word Characters, for example)
+                if (fp->fInputIdx >= inputLen) {
+                    fp = (REStackFrame *)fStack->popFrame(frameSize);
+                    break;
+                }
+
+                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
+                UChar32  c;
+                U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
+                if (c < 256) {
+                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
+                    if (s8->contains(c) == FALSE) {
+                        break;
+                    }
+                } else {
+                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
+                    if (s->contains(c) == FALSE) {
+                        break;
+                    }
+                }
+
+                fp = (REStackFrame *)fStack->popFrame(frameSize);
+            }
+            break;
+            
+
        case URX_SETREF:
            if (fp->fInputIdx < inputLen) {
                // There is input left.  Pick up one char and test it for set membership.
                UChar32   c;
                U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
                U_ASSERT(opValue > 0 && opValue < sets->size());
-                UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
-                if (s->contains(c)) {
-                    // The character is in the set.  A Match.
-                    break;
+                if (c<256) {
+                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
+                    if (s8->contains(c)) {
+                        break;
+                    }
+                } else {
+                    
+                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
+                    if (s->contains(c)) {
+                        // The character is in the set.  A Match.
+                        break;
+                    }
                }
            }
            // Either at end of input, or the character wasn't in the set.
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -74,6 +74,8 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    fInitialStringIdx = other.fInitialStringIdx;
    fInitialStringLen = other.fInitialStringLen;
    fInitialChars     = new UnicodeSet(*other.fInitialChars);
+    fInitialChars8    = new Regex8BitSet;
+    uprv_memcpy(fInitialChars8, other.fInitialChars8, sizeof(Regex8BitSet));
    fInitialChar      = other.fInitialChar;

    //  Copy the pattern.  It's just values, nothing deep to copy.
@ -97,6 +99,10 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
        }
        fSets->addElement(newSet, fDeferredStatus);
    }
+
+    int32_t numSets = other.fSets->size();
+    fSets8 = new Regex8BitSet[numSets];
+    uprv_memcpy(fSets8, other.fSets8, numSets*sizeof(Regex8BitSet));  // TODO: give Regex8BitSet some constructors
    return *this;
 }

@ -119,16 +125,20 @@ void RegexPattern::init() {
    fInitialStringIdx = 0;
    fInitialStringLen = 0;
    fInitialChars     = NULL;
+    fInitialChars8    = NULL;
    fInitialChar      = 0;
+    fSets8            = NULL;
    
    fCompiledPat      = new UVector32(fDeferredStatus);
    fGroupMap         = new UVector32(fDeferredStatus);
    fSets             = new UVector(fDeferredStatus);
    fInitialChars     = new UnicodeSet;
+    fInitialChars8    = new Regex8BitSet;
    if (U_FAILURE(fDeferredStatus)) {
        return;
    }
-    if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || fInitialChars == NULL) {
+    if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
+        fInitialChars == NULL || fInitialChars8 == NULL) {
        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
@ -160,6 +170,10 @@ void RegexPattern::zap() {
    fGroupMap = NULL;
    delete fInitialChars;
    fInitialChars = NULL;
+    delete fInitialChars8;
+    fInitialChars8 = NULL;
+    delete[] fSets8;
+    fSets8 = NULL;
 }


@ -481,6 +495,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
        break;

    case URX_STATIC_SETREF:
+    case URX_STAT_SETREF_N:
        {
            UnicodeString s;
            if (val & URX_NEG_SET) {
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -54,6 +54,7 @@ class UVector;
 class UVector32;
 class UnicodeSet;
 struct REStackFrame;
+struct Regex8BitSet;


 /**
@ -358,6 +359,8 @@ private:
                                   //   after un-escaping, for use during the match.

    UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
+    Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
+

    UErrorCode      fDeferredStatus; // status if some prior error has left this
                                   //  RegexPattern in an unusable state.
@ -382,11 +385,15 @@ private:
    UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
                                   //   regex character classes, e.g. Word.

+    Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
+                                   //  sets for predefined regex classes.
+
    int32_t         fStartType;    // Info on how a match must start.
    int32_t         fInitialStringIdx;     //  
    int32_t         fInitialStringLen;
    UnicodeSet     *fInitialChars;  
    UChar32         fInitialChar;
+    Regex8BitSet   *fInitialChars8;

    /**
     * The address of this static class variable serves as this class's ID
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -299,12 +299,14 @@
 "abc*$"                          "<0>abccc</0>"
 "ab(?:ab[xyz]\s)*"               "<0>ababy abx </0>abc"

-"(?:abc|a)(?:bc)+"               "<0>abc</0>"
+"(?:(abc)|a)(?:bc)+"             "<0>abc</0>"
+"(?:(abc)|a)(?:bc)*"             "<0><1>abc</1></0>"
+"^[+\-]?[0-9]*\.?[0-9]*"         "<0>123.456</0>"

 #
 #  Random debugging, Temporary
 #
-#"^(?:a?b?)*$"	        d         "a--"	
+#"^(?:a?b?)*$"	                  "a--"	
 "^(?:a?b?)*$"	                  "a--"

 "This is a string with (?:one |two |three )endings"   "<0>This is a string with two endings</0>"