ICU-2422 regexp, case insensitive matching, work in progress

X-SVN-Rev: 10989
2025-04-21 12:40:02 +00:00 · 2003-02-07 02:04:14 +00:00 · 2003-02-07 02:04:14 +00:00 · 7f44eb19cf
commit 7f44eb19cf
parent 6c56af4be2
8 changed files with 97 additions and 31 deletions
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@ -1843,7 +1843,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
    "U_REGEX_NUMBER_TOO_BIG",
    "U_REGEX_BAD_INTERVAL",
    "U_REGEX_MAX_LT_MIN",
-    "U_REGEX_INVALID_BACK_REF"
+    "U_REGEX_INVALID_BACK_REF",
+    "U_REGEX_INVALID_FLAG"
 };

 U_CAPI const char * U_EXPORT2
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -629,6 +629,7 @@ typedef enum UErrorCode {
     U_REGEX_BAD_INTERVAL,                 /**< Error in {min,max} interval                        */
     U_REGEX_MAX_LT_MIN,                   /**< In {min,max}, max is less than min.                */
     U_REGEX_INVALID_BACK_REF,             /**< Back-reference to a non-existent capture group.    */
+     U_REGEX_INVALID_FLAG,                 /**< Invalid value for match mode flags.                */
     U_REGEX_ERROR_LIMIT,                  /**< This must always be the last value to indicate the limit for regexp errors */

    U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT      /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -153,6 +153,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
    fCharNum        = 0;
    fQuoteMode      = FALSE;
    fFreeForm       = FALSE;
+    fCaseI          = FALSE;

    fMatchOpenParen  = -1;
    fMatchCloseParen = -1;
@ -1223,8 +1224,7 @@ void RegexCompile::literalChar()  {
    if (fStringOpStart == -1) {
        // First char of a string in the pattern.
        // Emit a OneChar op into the compiled pattern.
-        op = URX_BUILD(URX_ONECHAR, fC.fChar);
-        fRXPat->fCompiledPat->addElement(op, *fStatus);
+        emitONE_CHAR(fC.fChar);

        // Also add it to the string pool, in case we get a second adjacent literal
        //   and want to change from ONE_CHAR to STRING
@ -1239,9 +1239,13 @@ void RegexCompile::literalChar()  {
    // If the most recently emitted op is a URX_ONECHAR, change it to a string op.
    op     = fRXPat->fCompiledPat->lastElementi();
    opType = URX_TYPE(op);
-    U_ASSERT(opType == URX_ONECHAR || opType == URX_STRING_LEN);
-    if (opType == URX_ONECHAR) {
-        op         = URX_BUILD(URX_STRING, fStringOpStart);
+    U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
+    if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
+        if (fCaseI) {
+            op     = URX_BUILD(URX_STRING_I, fStringOpStart);
+        } else {
+            op     = URX_BUILD(URX_STRING, fStringOpStart);
+        }
        patternLoc = fRXPat->fCompiledPat->size() - 1;
        fRXPat->fCompiledPat->setElementAt(op, patternLoc);
        op         = URX_BUILD(URX_STRING_LEN, 0);
@ -1258,6 +1262,29 @@ void RegexCompile::literalChar()  {



+//------------------------------------------------------------------------------
+//
+//    emitONE_CHAR         Append a single-character match op for c to the
+//                         compiled pattern: URX_ONECHAR_I (with c case folded) if
+//                         fCaseI is set and c is cased, otherwise URX_ONECHAR.
+//
+//------------------------------------------------------------------------------
+void RegexCompile::emitONE_CHAR(UChar32  c) {
+    int32_t op;
+    if (fCaseI && (u_tolower(c) != u_toupper(c))) {
+        // Case insensitive mode and c is cased (lower/upper mappings differ): fold c.
+        // TODO: replace with a better test.  See Alan L.'s mail of 2/6
+        c  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+        op = URX_BUILD(URX_ONECHAR_I, c);
+    } else {
+        // Uncased char, or case sensitive match mode.
+        //  Either way, the operand is c exactly as it appeared in the pattern.
+        op = URX_BUILD(URX_ONECHAR, c);
+    }
+    fRXPat->fCompiledPat->addElement(op, *fStatus);
+}
+
+
 //------------------------------------------------------------------------------
 //
 //    fixLiterals           When compiling something that can follow a literal
@ -1269,7 +1296,7 @@ void RegexCompile::literalChar()  {
 //                          Optionally, split the last char of the string off into
 //                          a single "ONE_CHAR" operation, so that quantifiers can
 //                          apply to that char alone.  Example:   abc*
-//                          The * needs to apply to the 'c' only.
+//                          The * must apply to the 'c' only.
 //
 //------------------------------------------------------------------------------
 void    RegexCompile::fixLiterals(UBool split) {
@ -1321,16 +1348,14 @@ void    RegexCompile::fixLiterals(UBool split) {
        stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
        op = URX_BUILD(URX_STRING_LEN, stringLen);
        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
-        op = URX_BUILD(URX_ONECHAR, lastChar);
-        fRXPat->fCompiledPat->addElement(op, *fStatus);
+        emitONE_CHAR(lastChar);
    } else {
        // The original string consisted of exactly two characters.  Replace
        // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
        // of URX_ONECHARs.
-        op = URX_BUILD(URX_ONECHAR, nextToLastChar);
-        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -2);
-        op = URX_BUILD(URX_ONECHAR, lastChar);
-        fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
+        fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2);
+        emitONE_CHAR(nextToLastChar);
+        emitONE_CHAR(lastChar);
    }
 }

--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -100,6 +100,8 @@ private:
    void        fixLiterals(UBool split=FALSE);      // Fix literal strings.
    void        insertOp(int32_t where);             // Open up a slot for a new op in the
                                                     //   generated code at the specified location.
+    void        emitONE_CHAR(UChar32 c);             // Emit a ONE_CHAR op into the compiled code,
+                                                     //   taking case mode into account.
    UBool       possibleNullMatch(int32_t start,     // Test a range of compiled pattern for
                                  int32_t end);      //   possibly matching an empty string.

@ -127,10 +129,9 @@ private:
    RegexPatternChar              fC;                // Current char for parse state machine
                                                     //   processing.

-    int32_t                       fStringOpStart;    // While a literal string is being scanned
-                                                     //   holds the start index within RegexPattern.
-                                                     //   fLiteralText where the string is being stored.
-
+    //
+    //   Data for the state machine that parses the regular expression.
+    //
    RegexTableEl                  **fStateTable;     // State Transition Table for regex Rule
                                                     //   parsing.  index by p[state][char-class]

@ -138,6 +139,15 @@ private:
    int                           fStackPtr;           //  and pops as specified in the state
                                                       //  transition rules.

+    //
+    //  Data associated with the generation of the pcode for the match engine
+    //
+    UBool                         fCaseI;            // Case Insensitive Match Mode is on.
+
+    int32_t                       fStringOpStart;    // While a literal string is being scanned
+                                                     //   holds the start index within RegexPattern.
+                                                     //   fLiteralText where the string is being stored.
+
    int32_t                       fPatternLength;    // Length of the input pattern string.

    UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -108,10 +108,16 @@ enum {
     URX_LA_START      = 37,   // Starting a LookAround expression.
                               //   Save InputPos and SP in static data.
                               //   Operand:  Static data offset for the save
-     URX_LA_END        = 38    // Ending a Lookaround expression.
+     URX_LA_END        = 38,   // Ending a Lookaround expression.
                               //   Restore InputPos and Stack to saved values.
                               //   Operand:  Static data offset for saved data.
-};
+     URX_ONECHAR_I     = 39,   // Test for case-insensitive match of a literal character.
+                               //   Operand:  the literal char.
+     URX_STRING_I      = 40    // Case insensitive string compare.
+                               //   First Operand:  Index of start of string in string literals
+                               //   Second Operand (next word in compiled code):
+                               //     the length of the string.
+};           

 // Keep this list of opcode names in sync with the above enum
 //   Used for debug printing only.
@ -154,7 +160,9 @@ enum {
        "STO_INP_LOC",         \
        "JMPX",                \
        "LA_START",            \
-        "LA_END"
+        "LA_END",              \
+        "ONECHAR_I",           \
+        "STRING_I"

 //
 //  Convenience macros for assembling and disassembling a compiled operation.
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -206,6 +206,15 @@ RegexPattern  *RegexPattern::compile(
    if (U_FAILURE(status)) {
        return NULL;
    }
+
+    const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
+                              UREGEX_DOTALL   | UREGEX_MULTILINE;
+
+    if ((flags & ~allFlags) != 0) {
+        status = U_REGEX_INVALID_FLAG;
+        return NULL;
+    }
+
    if (flags != 0) {
        status = U_REGEX_UNIMPLEMENTED;
        return NULL;
@ -474,16 +483,19 @@ void   RegexPattern::dumpOp(int32_t index) const {
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
-
+    case URX_LA_START:
+    case URX_LA_END:
        // types with an integer operand field.
        REGEX_DUMP_DEBUG_PRINTF("%d", val);
        break;
        
    case URX_ONECHAR:
+    case URX_ONECHAR_I:
        REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
        break;
        
    case URX_STRING:
+    case URX_STRING_I:
        {
            int32_t lengthOp       = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -290,14 +290,14 @@ public:
     *
     * @draft ICU 2.4
     */
-    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
+    virtual inline UClassID getDynamicClassID() const; 

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.4
     */
-    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
+    static inline UClassID getStaticClassID(); 

 private:
    //
@ -668,24 +668,25 @@ public:

   /**
     *   setTrace   Debug function, enable/disable tracing of the matching engine.
+     *              For internal ICU development use only.  DO NOT USE!!!!
     *   @internal
     */
    void setTrace(UBool state);


-    /**
-     * ICU "poor man's RTTI", returns a UClassID for the actual class.
-     *
-     * @draft ICU 2.2
-     */
-    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
-
    /**
    * ICU "poor man's RTTI", returns a UClassID for this class.
    *
    * @draft ICU 2.2
    */
-    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
+    static inline UClassID getStaticClassID();
+
+    /**
+     * ICU "poor man's RTTI", returns a UClassID for the actual class.
+     *
+     * @draft ICU 2.2
+     */
+    virtual inline UClassID getDynamicClassID() const;

 private:
    // Constructors and other object boilerplate are private.
@ -737,6 +738,13 @@ private:

 };

+inline UClassID RegexPattern::getStaticClassID() { return (UClassID)&fgClassID; }  // class ID is the address of RegexPattern's fgClassID
+inline UClassID RegexPattern::getDynamicClassID() const { return getStaticClassID(); }  // forwards to the static ID
+
+inline UClassID RegexMatcher::getStaticClassID() { return (UClassID)&fgClassID; }  // class ID is the address of RegexMatcher's fgClassID
+inline UClassID RegexMatcher::getDynamicClassID() const { return getStaticClassID(); }  // forwards to the static ID
+
+
 U_NAMESPACE_END
 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
 #endif
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -145,6 +145,7 @@
 "(?:ABC)+"                     "<0>ABCABCABC</0>D"
 "(?:ABC)DEF+"                  "<0>ABCDEFFF</0>D"
 "AB\.C\eD\u0666E"              "<0>AB.C\u001BD\u0666E</0>F"
+"ab\Bde"                        "<0>abde</0>"


 # {min,max} iteration qualifier