ICU-2422 Regexp, general cleanup

X-SVN-Rev: 11366
This commit is contained in:
Andy Heninger 2003-03-20 01:15:10 +00:00
parent dde478d82e
commit f6d9573913
7 changed files with 63 additions and 55 deletions

View file

@ -1763,7 +1763,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_MAX_LT_MIN",
"U_REGEX_INVALID_BACK_REF",
"U_REGEX_INVALID_FLAG",
"U_REGEX_LOOK_BEHIND_LIMIT"
"U_REGEX_LOOK_BEHIND_LIMIT",
"U_REGEX_SET_CONTAINS_STRING"
};
static const char * const

View file

@ -631,6 +631,7 @@ typedef enum UErrorCode {
U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */
U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */
U_REGEX_LOOK_BEHIND_LIMIT, /**< Look-Behind pattern matches must have a bounded maximum length. */
U_REGEX_SET_CONTAINS_STRING, /**< Regexps cannot have UnicodeSets containing strings.*/
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
/*

View file

@ -2,7 +2,7 @@
//
// file: regexcmp.cpp
//
// Copyright (C) 2002, International Business Machines Corporation and others.
// Copyright (C) 2002-2003 International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the ICU regular expression compiler, which is responsible
@ -172,14 +172,16 @@ static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UE
// determination of Grapheme Cluster boundaries.
//
//----------------------------------------------------------------------------------------
static void InitGraphemeClusterSets() {
UErrorCode status = U_ZERO_ERROR; // TODO: some sort of error handling needed.
static void InitGraphemeClusterSets(UErrorCode &status) {
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_EXTEND], gGC_ExtendPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_CONTROL], gGC_ControlPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_L], gGC_LPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_V], gGC_VPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_T], gGC_TPattern, status);
if (U_FAILURE(status)) {
return;
}
if (gPropSets[URX_GC_NORMAL] == NULL) {
//
@ -278,7 +280,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET], gIsWordPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status);
InitGraphemeClusterSets();
InitGraphemeClusterSets(status);
}
@ -922,13 +924,11 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doBadOpenParenType:
case doRuleError:
error(U_REGEX_RULE_SYNTAX);
returnVal = FALSE;
break;
case doMismatchedParenErr:
error(U_REGEX_MISMATCHED_PAREN);
returnVal = FALSE;
break;
case doPlus:
@ -1210,7 +1210,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
break;
case doBackslashx: // \x{abcd} alternate hex format
// TODO: implement
// TODO: this is waiting for a decision on adding \x to unescape.
error(U_REGEX_UNIMPLEMENTED);
break;
@ -1477,9 +1477,13 @@ UBool RegexCompile::doParseActions(EParseAction action)
default:
U_ASSERT(FALSE);
error(U_REGEX_INTERNAL_ERROR);
returnVal = FALSE;
break;
}
if (U_FAILURE(*fStatus)) {
returnVal = FALSE;
}
return returnVal;
};
@ -1560,9 +1564,9 @@ void RegexCompile::literalChar(UChar32 c) {
//------------------------------------------------------------------------------
void RegexCompile::emitONE_CHAR(UChar32 c) {
int32_t op;
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && (u_tolower(c) != u_toupper(c))) {
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
// We have a cased character, and are in case insensitive matching mode.
// TODO: replace with a better test. See Alan L.'s mail of 2/6
c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
op = URX_BUILD(URX_ONECHAR_I, c);
} else {
@ -1963,7 +1967,8 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
UChar32 firstSetChar = theSet->charAt(0);
if (firstSetChar == -1) {
// Sets that contain only strings, but no individual chars,
// will end up here. TODO: figure out what to with sets containing strings.
// will end up here.
error(U_REGEX_SET_CONTAINS_STRING);
setSize = 0;
}
@ -2050,21 +2055,6 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
//----------------------------------------------------------------------------------------
//
// possibleNullMatch Test a range of compiled pattern for the possibility that it
// might match an empty string. Used to control the generation
// of extra checking code to prevent infinite loops in the match
// engine on repeated empty matches, such as might happen with
// (x?)*
// when the input string is not at an x.
//
//----------------------------------------------------------------------------------------
UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
// for now, just return true. TODO: make a real implementation
return TRUE;
}
//----------------------------------------------------------------------------------------
//
@ -3038,16 +3028,21 @@ void RegexCompile::stripNOPs() {
void RegexCompile::error(UErrorCode e) {
if (U_SUCCESS(*fStatus)) {
*fStatus = e;
fParseErr->line = fLineNum;
fParseErr->line = fLineNum;
fParseErr->offset = fCharNum;
fParseErr->preContext[0] = 0; // TODO: copy in some input pattern text
fParseErr->preContext[0] = 0;
// Fill in the context.
// Note: extractBetween() pins supplied indices to the string bounds.
uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
fParseErr->preContext, 0);
fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
fParseErr->postContext, 0);
}
}
//
// Assorted Unicode character constants.
// Numeric because there is no portable way to enter them as literals.
@ -3186,7 +3181,6 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
}
}
if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
// TODO: is RuleWhiteSpace the right thing to use here?
break;
}
c.fChar = nextCharLL();
@ -3218,13 +3212,13 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
{
// We are in a '\' escape that will be handled by the state table scanner.
// Just return the backslash, but remember that the following char is to
// be taken literally. TODO: this is awkward
// be taken literally. TODO: this is awkward, think about alternatives.
fInBackslashQuote = TRUE;
}
}
}
// re-enable # to end-of-line comments, in case they were disabled..
// re-enable # to end-of-line comments, in case they were disabled.
// They are disabled by the parser upon seeing '(?', but this lasts for
// the fetching of the next character only.
fEOLComments = TRUE;
@ -3325,7 +3319,7 @@ UnicodeSet *RegexCompile::scanProp() {
nextChar(fC);
if (fC.fChar == -1) {
// Hit the end of the input string without finding the closing '}'
*fStatus = U_REGEX_PROPERTY_SYNTAX;
error(U_REGEX_PROPERTY_SYNTAX);
return NULL;
}
}

View file

@ -1,7 +1,7 @@
//
// regexcmp.h
//
// Copyright (C) 2002, International Business Machines Corporation and others.
// Copyright (C) 2002-2003, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for the class RegexCompile
@ -105,8 +105,6 @@ private:
// generated code at the specified location.
void emitONE_CHAR(UChar32 c); // EMit a ONE_CHAR op into the compiled code,
// taking case mode into account.
UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern for
int32_t end); // for possibly matching an empty string.
int32_t minMatchLength(int32_t start,
int32_t end);
int32_t maxMatchLength(int32_t start,

View file

@ -5,10 +5,10 @@
// which is one of the main API classes for the ICU regular expression package.
//
/*
**********************************************************************
* Copyright (C) 2002 International Business Machines Corporation *
* and others. All rights reserved. *
**********************************************************************
**************************************************************************
* Copyright (C) 2002-2003 International Business Machines Corporation *
* and others. All rights reserved. *
**************************************************************************
*/
#include "unicode/utypes.h"
@ -37,11 +37,14 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
fPatternOwned = FALSE;
fInput = NULL;
fTraceDebug = FALSE;
UErrorCode status = U_ZERO_ERROR;
fStack = new UVector32(status); // TODO: do something with status.
fDeferredStatus = U_ZERO_ERROR;
fStack = new UVector32(fDeferredStatus);
fData = fSmallData;
if (pat->fDataSize > sizeof(fSmallData)/sizeof(int32_t)) {
fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); // TODO: null check
fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t));
}
if (fStack == NULL || fData == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
}
reset();
@ -55,10 +58,14 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp
fPattern = RegexPattern::compile(regexp, flags, pe, status);
fPatternOwned = TRUE;
fTraceDebug = FALSE;
fDeferredStatus = U_ZERO_ERROR;
fStack = new UVector32(status);
fData = fSmallData;
if (fPattern->fDataSize > sizeof(fSmallData)/sizeof(int32_t)) {
fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); // TODO: null check
fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
}
if (fStack == NULL || fData == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
}
reset(input);
}

View file

@ -753,6 +753,9 @@ private:
UBool fTraceDebug; // Set true for debug tracing of match engine.
UErrorCode fDeferredStatus; // Save error state if that cannot be immediately
// reported, or that permanently disables this matcher.
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".

View file

@ -123,7 +123,8 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
UnicodeString patString(pat);
REPattern = RegexPattern::compile(patString, 0, pe, status);
if (U_FAILURE(status)) {
errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %d\n", line, status);
errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
line, u_errorName(status));
return FALSE;
}
if (line==376) { REPattern->dump();}
@ -132,14 +133,16 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
UnicodeString unEscapedInput = inputString.unescape();
REMatcher = REPattern->matcher(unEscapedInput, status);
if (U_FAILURE(status)) {
errln("RegexTest failure in REPattern::matcher() at line %d. Status = %d\n", line, status);
errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
line, u_errorName(status));
return FALSE;
}
UBool actualmatch;
actualmatch = REMatcher->lookingAt(status);
if (U_FAILURE(status)) {
errln("RegexTest failure in lookingAt() at line %d. Status = %d\n", line, status);
errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
line, u_errorName(status));
retVal = FALSE;
}
if (actualmatch != looking) {
@ -150,7 +153,8 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
status = U_ZERO_ERROR;
actualmatch = REMatcher->matches(status);
if (U_FAILURE(status)) {
errln("RegexTest failure in matches() at line %d. Status = %d\n", line, status);
errln("RegexTest failure in matches() at line %d. Status = %s\n",
line, u_errorName(status));
retVal = FALSE;
}
if (actualmatch != match) {
@ -479,9 +483,6 @@ void RegexTest::Basic() {
REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
// Set contains only a string, no individual chars.
REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE);
//
// OR operator in patterns
//
@ -1273,6 +1274,9 @@ void RegexTest::Errors() {
REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
// UnicodeSet containing a string
REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING);
}