ICU-2422 Regexp, general cleanup

X-SVN-Rev: 11366
2025-04-14 17:24:01 +00:00 · 2003-03-20 01:15:10 +00:00 · 2003-03-20 01:15:10 +00:00 · f6d9573913
commit f6d9573913
parent dde478d82e
7 changed files with 63 additions and 55 deletions
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@ -1763,7 +1763,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
    "U_REGEX_MAX_LT_MIN",
    "U_REGEX_INVALID_BACK_REF",
    "U_REGEX_INVALID_FLAG",
-    "U_REGEX_LOOK_BEHIND_LIMIT"
+    "U_REGEX_LOOK_BEHIND_LIMIT",
+    "U_REGEX_SET_CONTAINS_STRING"
 };

 static const char * const
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -631,6 +631,7 @@ typedef enum UErrorCode {
     U_REGEX_INVALID_BACK_REF,             /**< Back-reference to a non-existent capture group.    */
     U_REGEX_INVALID_FLAG,                 /**< Invalid value for match mode flags.                */
     U_REGEX_LOOK_BEHIND_LIMIT,            /**< Look-Behind pattern matches must have a bounded maximum length.    */
+     U_REGEX_SET_CONTAINS_STRING,          /**< Regexps cannot have UnicodeSets containing strings.*/
     U_REGEX_ERROR_LIMIT,                  /**< This must always be the last value to indicate the limit for regexp errors */

     /*
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -2,7 +2,7 @@
 //
 //  file:  regexcmp.cpp
 //
-//  Copyright (C) 2002, International Business Machines Corporation and others.
+//  Copyright (C) 2002-2003 International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains the ICU regular expression compiler, which is responsible
@ -172,14 +172,16 @@ static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UE
 //                             determination of Grapheme Cluster boundaries.
 //
 //----------------------------------------------------------------------------------------
-static void InitGraphemeClusterSets() {
-    UErrorCode status = U_ZERO_ERROR;     // TODO:  some sort of error handling needed.
+static void InitGraphemeClusterSets(UErrorCode &status) {
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_EXTEND],       gGC_ExtendPattern,           status);    
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_CONTROL],      gGC_ControlPattern,          status);    
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_L],            gGC_LPattern,                status);    
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_V],            gGC_VPattern,                status);    
    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_T],            gGC_TPattern,                status);   
-    
+    if (U_FAILURE(status)) {
+        return;
+    }
+
    if (gPropSets[URX_GC_NORMAL] == NULL) {

        //
@ -278,7 +280,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
    ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET],           gIsWordPattern,              status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET],          gIsSpacePattern,             status);    

-    InitGraphemeClusterSets();
+    InitGraphemeClusterSets(status);
 }


@ -922,13 +924,11 @@ UBool RegexCompile::doParseActions(EParseAction action)
    case doBadOpenParenType:
    case doRuleError:
        error(U_REGEX_RULE_SYNTAX);
-        returnVal = FALSE;
        break;


    case doMismatchedParenErr:
        error(U_REGEX_MISMATCHED_PAREN);
-        returnVal = FALSE;
        break;

    case doPlus:
@ -1210,7 +1210,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
        break;

    case doBackslashx:              // \x{abcd}   alternate hex format
-        //  TODO:  implement
+        //  TODO:  this is waiting for a decision on adding \x to unescape.
        error(U_REGEX_UNIMPLEMENTED);
        break;

@ -1477,9 +1477,13 @@ UBool RegexCompile::doParseActions(EParseAction action)
    default:
        U_ASSERT(FALSE);
        error(U_REGEX_INTERNAL_ERROR);
-        returnVal = FALSE;
        break;
    }
+
+    if (U_FAILURE(*fStatus)) {
+        returnVal = FALSE;
+    }
+
    return returnVal;
 };

@ -1560,9 +1564,9 @@ void RegexCompile::literalChar(UChar32 c)  {
 //------------------------------------------------------------------------------
 void RegexCompile::emitONE_CHAR(UChar32  c) {
    int32_t op;
-    if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && (u_tolower(c) != u_toupper(c))) {
+    if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
+        u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
        // We have a cased character, and are in case insensitive matching mode.
-        // TODO: replace with a better test.  See Alan L.'s mail of 2/6
        c  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
        op = URX_BUILD(URX_ONECHAR_I, c);
    } else {
@ -1963,7 +1967,8 @@ void        RegexCompile::compileSet(UnicodeSet *theSet)
    UChar32  firstSetChar = theSet->charAt(0);
    if (firstSetChar == -1) {
        // Sets that contain only strings, but no individual chars,
-        // will end up here.   TODO:  figure out what to with sets containing strings.
+        // will end up here.
+        error(U_REGEX_SET_CONTAINS_STRING);
        setSize = 0;
    }

@ -2050,21 +2055,6 @@ void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)



-//----------------------------------------------------------------------------------------
-//
-//  possibleNullMatch    Test a range of compiled pattern for the possibility that it
-//                       might match an empty string.  Used to control the generation
-//                       of extra checking code to prevent infinite loops in the match
-//                       engine on repeated empty matches, such as might happen with
-//                            (x?)*
-//                       when the input string is not at an x.
-//
-//----------------------------------------------------------------------------------------
-UBool  RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
-    // for now, just return true.  TODO:  make a real implementation
-    return TRUE;
-}
-

 //----------------------------------------------------------------------------------------
 //
@ -3038,16 +3028,21 @@ void RegexCompile::stripNOPs() {
 void RegexCompile::error(UErrorCode e) {
    if (U_SUCCESS(*fStatus)) {
        *fStatus = e;
-        fParseErr->line  = fLineNum;
+        fParseErr->line   = fLineNum;
        fParseErr->offset = fCharNum;
-        fParseErr->preContext[0] = 0;    // TODO:  copy in some input pattern text
-        fParseErr->preContext[0] = 0;
+
+        // Fill in the context.
+        //   Note: extractBetween() pins supplied indices to the string bounds.
+        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
+        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
+        fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
+            fParseErr->preContext,  0);
+        fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
+            fParseErr->postContext, 0);
    }
 }


-
-
 //
 //  Assorted Unicode character constants.
 //     Numeric because there is no portable way to enter them as literals.
@ -3186,7 +3181,6 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
                    }
                }
                if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
-                    //  TODO:  is RuleWhiteSpace the right thing to use here?
                    break;
                }
                c.fChar = nextCharLL();
@ -3218,13 +3212,13 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
            {
                // We are in a '\' escape that will be handled by the state table scanner.
                // Just return the backslash, but remember that the following char is to
-                //  be taken literally.  TODO:  this is awkward
+                //  be taken literally.  TODO:  this is awkward, think about alternatives.
                fInBackslashQuote = TRUE;
            }
        }
    }

-    // re-enable # to end-of-line comments, in case they were disabled..
+    // re-enable # to end-of-line comments, in case they were disabled.
    // They are disabled by the parser upon seeing '(?', but this lasts for
    //  the fetching of the next character only.
    fEOLComments = TRUE;
@ -3325,7 +3319,7 @@ UnicodeSet *RegexCompile::scanProp() {
        nextChar(fC);
        if (fC.fChar == -1) {
            // Hit the end of the input string without finding the closing '}'
-            *fStatus = U_REGEX_PROPERTY_SYNTAX;
+            error(U_REGEX_PROPERTY_SYNTAX);
            return NULL;
        }
    }
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -1,7 +1,7 @@
 //
 //  regexcmp.h
 //
-//  Copyright (C) 2002, International Business Machines Corporation and others.
+//  Copyright (C) 2002-2003, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains declarations for the class RegexCompile
@ -105,8 +105,6 @@ private:
                                                     //   generated code at the specified location.
    void        emitONE_CHAR(UChar32 c);             // EMit a ONE_CHAR op into the compiled code,
                                                     //   taking case mode into account.
-    UBool       possibleNullMatch(int32_t start,     // Test a range of compiled pattern for
-                                  int32_t end);      //   for possibly matching an empty string.
    int32_t     minMatchLength(int32_t start,
                               int32_t end);
    int32_t     maxMatchLength(int32_t start,
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -5,10 +5,10 @@
 //         which is one of the main API classes for the ICU regular expression package.
 //
 /*
-**********************************************************************
-*   Copyright (C) 2002 International Business Machines Corporation   *
-*   and others. All rights reserved.                                 *
-**********************************************************************
+**************************************************************************
+*   Copyright (C) 2002-2003 International Business Machines Corporation  *
+*   and others. All rights reserved.                                     *
+**************************************************************************
 */

 #include "unicode/utypes.h"
@ -37,11 +37,14 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
    fPatternOwned      = FALSE;
    fInput             = NULL;
    fTraceDebug        = FALSE;
-    UErrorCode  status = U_ZERO_ERROR;
-    fStack             = new UVector32(status);   // TODO:  do something with status.
+    fDeferredStatus    = U_ZERO_ERROR;
+    fStack             = new UVector32(fDeferredStatus); 
    fData              = fSmallData;
    if (pat->fDataSize > sizeof(fSmallData)/sizeof(int32_t)) {
-        fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t));      // TODO:  null check
+        fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); 
+    }
+    if (fStack == NULL || fData == NULL) {
+        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    }
        
    reset();
@ -55,10 +58,14 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp
    fPattern           = RegexPattern::compile(regexp, flags, pe, status);
    fPatternOwned      = TRUE;
    fTraceDebug        = FALSE;
+    fDeferredStatus    = U_ZERO_ERROR;
    fStack             = new UVector32(status); 
    fData              = fSmallData;
    if (fPattern->fDataSize > sizeof(fSmallData)/sizeof(int32_t)) {
-        fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));      // TODO:  null check
+        fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); 
+    }
+    if (fStack == NULL || fData == NULL) {
+        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    }
    reset(input);
 }
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -753,6 +753,9 @@ private:

    UBool               fTraceDebug;       // Set true for debug tracing of match engine.

+    UErrorCode          fDeferredStatus;   // Saved error state that cannot be immediately
+                                           //   reported, or that permanently disables this matcher.
+
    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -123,7 +123,8 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
    UnicodeString patString(pat);
    REPattern = RegexPattern::compile(patString, 0, pe, status);
    if (U_FAILURE(status)) {
-        errln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %d\n", line, status);
+        errln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s\n",
+            line, u_errorName(status));
        return FALSE;
    }
    if (line==376) { REPattern->dump();}
@ -132,14 +133,16 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
    UnicodeString unEscapedInput = inputString.unescape();
    REMatcher = REPattern->matcher(unEscapedInput, status);
    if (U_FAILURE(status)) {
-        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %d\n", line, status);
+        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
+            line, u_errorName(status));
        return FALSE;
    }
  
    UBool actualmatch;
    actualmatch = REMatcher->lookingAt(status);
    if (U_FAILURE(status)) {
-        errln("RegexTest failure in lookingAt() at line %d.  Status = %d\n", line, status);
+        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
+            line, u_errorName(status));
        retVal =  FALSE;
    }
    if (actualmatch != looking) {
@ -150,7 +153,8 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
    status = U_ZERO_ERROR;
    actualmatch = REMatcher->matches(status);
    if (U_FAILURE(status)) {
-        errln("RegexTest failure in matches() at line %d.  Status = %d\n", line, status);
+        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
+            line, u_errorName(status));
        retVal = FALSE;
    }
    if (actualmatch != match) {
@ -479,9 +483,6 @@ void RegexTest::Basic() {
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

-    // Set contains only a string, no individual chars.
-    REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE);
-
    //
    //   OR operator in patterns
    //
@ -1273,6 +1274,9 @@ void RegexTest::Errors() {
    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);

+    // UnicodeSet containing a string
+    REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING);
+
 }