mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-2422 Regexp, general cleanup
X-SVN-Rev: 11366
This commit is contained in:
parent
dde478d82e
commit
f6d9573913
7 changed files with 63 additions and 55 deletions
|
@ -1763,7 +1763,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
|||
"U_REGEX_MAX_LT_MIN",
|
||||
"U_REGEX_INVALID_BACK_REF",
|
||||
"U_REGEX_INVALID_FLAG",
|
||||
"U_REGEX_LOOK_BEHIND_LIMIT"
|
||||
"U_REGEX_LOOK_BEHIND_LIMIT",
|
||||
"U_REGEX_SET_CONTAINS_STRING"
|
||||
};
|
||||
|
||||
static const char * const
|
||||
|
|
|
@ -631,6 +631,7 @@ typedef enum UErrorCode {
|
|||
U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */
|
||||
U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */
|
||||
U_REGEX_LOOK_BEHIND_LIMIT, /**< Look-Behind pattern matches must have a bounded maximum length. */
|
||||
U_REGEX_SET_CONTAINS_STRING, /**< Regexps cannot have UnicodeSets containing strings.*/
|
||||
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||
|
||||
/*
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
//
|
||||
// file: regexcmp.cpp
|
||||
//
|
||||
// Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2002-2003 International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains the ICU regular expression compiler, which is responsible
|
||||
|
@ -172,14 +172,16 @@ static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UE
|
|||
// determination of Grapheme Cluster boundaries.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
static void InitGraphemeClusterSets() {
|
||||
UErrorCode status = U_ZERO_ERROR; // TODO: some sort of error handling needed.
|
||||
static void InitGraphemeClusterSets(UErrorCode &status) {
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_EXTEND], gGC_ExtendPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_CONTROL], gGC_ControlPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_L], gGC_LPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_V], gGC_VPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_T], gGC_TPattern, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (gPropSets[URX_GC_NORMAL] == NULL) {
|
||||
|
||||
//
|
||||
|
@ -278,7 +280,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
|
|||
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET], gIsWordPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status);
|
||||
|
||||
InitGraphemeClusterSets();
|
||||
InitGraphemeClusterSets(status);
|
||||
}
|
||||
|
||||
|
||||
|
@ -922,13 +924,11 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
case doBadOpenParenType:
|
||||
case doRuleError:
|
||||
error(U_REGEX_RULE_SYNTAX);
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
case doMismatchedParenErr:
|
||||
error(U_REGEX_MISMATCHED_PAREN);
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
|
||||
case doPlus:
|
||||
|
@ -1210,7 +1210,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
break;
|
||||
|
||||
case doBackslashx: // \x{abcd} alternate hex format
|
||||
// TODO: implement
|
||||
// TODO: this is waiting for a decision on adding \x to unescape.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
|
@ -1477,9 +1477,13 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
default:
|
||||
U_ASSERT(FALSE);
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
}
|
||||
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
returnVal = FALSE;
|
||||
}
|
||||
|
||||
return returnVal;
|
||||
};
|
||||
|
||||
|
@ -1560,9 +1564,9 @@ void RegexCompile::literalChar(UChar32 c) {
|
|||
//------------------------------------------------------------------------------
|
||||
void RegexCompile::emitONE_CHAR(UChar32 c) {
|
||||
int32_t op;
|
||||
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && (u_tolower(c) != u_toupper(c))) {
|
||||
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
|
||||
u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
|
||||
// We have a cased character, and are in case insensitive matching mode.
|
||||
// TODO: replace with a better test. See Alan L.'s mail of 2/6
|
||||
c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||||
op = URX_BUILD(URX_ONECHAR_I, c);
|
||||
} else {
|
||||
|
@ -1963,7 +1967,8 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
|
|||
UChar32 firstSetChar = theSet->charAt(0);
|
||||
if (firstSetChar == -1) {
|
||||
// Sets that contain only strings, but no individual chars,
|
||||
// will end up here. TODO: figure out what to do with sets containing strings.
|
||||
// will end up here.
|
||||
error(U_REGEX_SET_CONTAINS_STRING);
|
||||
setSize = 0;
|
||||
}
|
||||
|
||||
|
@ -2050,21 +2055,6 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
|
|||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// possibleNullMatch Test a range of compiled pattern for the possibility that it
|
||||
// might match an empty string. Used to control the generation
|
||||
// of extra checking code to prevent infinite loops in the match
|
||||
// engine on repeated empty matches, such as might happen with
|
||||
// (x?)*
|
||||
// when the input string is not at an x.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
|
||||
// for now, just return true. TODO: make a real implementation
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -3038,16 +3028,21 @@ void RegexCompile::stripNOPs() {
|
|||
void RegexCompile::error(UErrorCode e) {
|
||||
if (U_SUCCESS(*fStatus)) {
|
||||
*fStatus = e;
|
||||
fParseErr->line = fLineNum;
|
||||
fParseErr->line = fLineNum;
|
||||
fParseErr->offset = fCharNum;
|
||||
fParseErr->preContext[0] = 0; // TODO: copy in some input pattern text
|
||||
fParseErr->preContext[0] = 0;
|
||||
|
||||
// Fill in the context.
|
||||
// Note: extractBetween() pins supplied indices to the string bounds.
|
||||
uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
|
||||
uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
|
||||
fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
|
||||
fParseErr->preContext, 0);
|
||||
fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
|
||||
fParseErr->postContext, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Assorted Unicode character constants.
|
||||
// Numeric because there is no portable way to enter them as literals.
|
||||
|
@ -3186,7 +3181,6 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
|||
}
|
||||
}
|
||||
if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
|
||||
// TODO: is RuleWhiteSpace the right thing to use here?
|
||||
break;
|
||||
}
|
||||
c.fChar = nextCharLL();
|
||||
|
@ -3218,13 +3212,13 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
|||
{
|
||||
// We are in a '\' escape that will be handled by the state table scanner.
|
||||
// Just return the backslash, but remember that the following char is to
|
||||
// be taken literally. TODO: this is awkward
|
||||
// be taken literally. TODO: this is awkward, think about alternatives.
|
||||
fInBackslashQuote = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// re-enable # to end-of-line comments, in case they were disabled..
|
||||
// re-enable # to end-of-line comments, in case they were disabled.
|
||||
// They are disabled by the parser upon seeing '(?', but this lasts for
|
||||
// the fetching of the next character only.
|
||||
fEOLComments = TRUE;
|
||||
|
@ -3325,7 +3319,7 @@ UnicodeSet *RegexCompile::scanProp() {
|
|||
nextChar(fC);
|
||||
if (fC.fChar == -1) {
|
||||
// Hit the end of the input string without finding the closing '}'
|
||||
*fStatus = U_REGEX_PROPERTY_SYNTAX;
|
||||
error(U_REGEX_PROPERTY_SYNTAX);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
//
|
||||
// regexcmp.h
|
||||
//
|
||||
// Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains declarations for the class RegexCompile
|
||||
|
@ -105,8 +105,6 @@ private:
|
|||
// generated code at the specified location.
|
||||
void emitONE_CHAR(UChar32 c); // Emit a ONE_CHAR op into the compiled code,
|
||||
// taking case mode into account.
|
||||
UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern for
|
||||
int32_t end); // for possibly matching an empty string.
|
||||
int32_t minMatchLength(int32_t start,
|
||||
int32_t end);
|
||||
int32_t maxMatchLength(int32_t start,
|
||||
|
|
|
@ -5,10 +5,10 @@
|
|||
// which is one of the main API classes for the ICU regular expression package.
|
||||
//
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
**********************************************************************
|
||||
**************************************************************************
|
||||
* Copyright (C) 2002-2003 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
**************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
@ -37,11 +37,14 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
|
|||
fPatternOwned = FALSE;
|
||||
fInput = NULL;
|
||||
fTraceDebug = FALSE;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fStack = new UVector32(status); // TODO: do something with status.
|
||||
fDeferredStatus = U_ZERO_ERROR;
|
||||
fStack = new UVector32(fDeferredStatus);
|
||||
fData = fSmallData;
|
||||
if (pat->fDataSize > sizeof(fSmallData)/sizeof(int32_t)) {
|
||||
fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); // TODO: null check
|
||||
fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t));
|
||||
}
|
||||
if (fStack == NULL || fData == NULL) {
|
||||
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
|
||||
reset();
|
||||
|
@ -55,10 +58,14 @@ RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &inp
|
|||
fPattern = RegexPattern::compile(regexp, flags, pe, status);
|
||||
fPatternOwned = TRUE;
|
||||
fTraceDebug = FALSE;
|
||||
fDeferredStatus = U_ZERO_ERROR;
|
||||
fStack = new UVector32(status);
|
||||
fData = fSmallData;
|
||||
if (fPattern->fDataSize > sizeof(fSmallData)/sizeof(int32_t)) {
|
||||
fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); // TODO: null check
|
||||
fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
|
||||
}
|
||||
if (fStack == NULL || fData == NULL) {
|
||||
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
reset(input);
|
||||
}
|
||||
|
|
|
@ -753,6 +753,9 @@ private:
|
|||
|
||||
UBool fTraceDebug; // Set true for debug tracing of match engine.
|
||||
|
||||
UErrorCode fDeferredStatus; // Save error state that cannot be immediately
|
||||
// reported, or that permanently disables this matcher.
|
||||
|
||||
/**
|
||||
* The address of this static class variable serves as this class's ID
|
||||
* for ICU "poor man's RTTI".
|
||||
|
|
|
@ -123,7 +123,8 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
|
|||
UnicodeString patString(pat);
|
||||
REPattern = RegexPattern::compile(patString, 0, pe, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %d\n", line, status);
|
||||
errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
|
||||
line, u_errorName(status));
|
||||
return FALSE;
|
||||
}
|
||||
if (line==376) { REPattern->dump();}
|
||||
|
@ -132,14 +133,16 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
|
|||
UnicodeString unEscapedInput = inputString.unescape();
|
||||
REMatcher = REPattern->matcher(unEscapedInput, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("RegexTest failure in REPattern::matcher() at line %d. Status = %d\n", line, status);
|
||||
errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
|
||||
line, u_errorName(status));
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
UBool actualmatch;
|
||||
actualmatch = REMatcher->lookingAt(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("RegexTest failure in lookingAt() at line %d. Status = %d\n", line, status);
|
||||
errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
|
||||
line, u_errorName(status));
|
||||
retVal = FALSE;
|
||||
}
|
||||
if (actualmatch != looking) {
|
||||
|
@ -150,7 +153,8 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
|
|||
status = U_ZERO_ERROR;
|
||||
actualmatch = REMatcher->matches(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("RegexTest failure in matches() at line %d. Status = %d\n", line, status);
|
||||
errln("RegexTest failure in matches() at line %d. Status = %s\n",
|
||||
line, u_errorName(status));
|
||||
retVal = FALSE;
|
||||
}
|
||||
if (actualmatch != match) {
|
||||
|
@ -479,9 +483,6 @@ void RegexTest::Basic() {
|
|||
REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
|
||||
REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
|
||||
|
||||
// Set contains only a string, no individual chars.
|
||||
REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE);
|
||||
|
||||
//
|
||||
// OR operator in patterns
|
||||
//
|
||||
|
@ -1273,6 +1274,9 @@ void RegexTest::Errors() {
|
|||
REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
|
||||
REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
|
||||
|
||||
// UnicodeSet containing a string
|
||||
REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue