ICU-2422 regexp, case insensitive matching, work in progress

X-SVN-Rev: 10989
This commit is contained in:
Andy Heninger 2003-02-07 02:04:14 +00:00
parent 6c56af4be2
commit 7f44eb19cf
8 changed files with 97 additions and 31 deletions

View file

@ -1843,7 +1843,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_NUMBER_TOO_BIG",
"U_REGEX_BAD_INTERVAL",
"U_REGEX_MAX_LT_MIN",
"U_REGEX_INVALID_BACK_REF"
"U_REGEX_INVALID_BACK_REF",
"U_REGEX_INVALID_FLAG"
};
U_CAPI const char * U_EXPORT2

View file

@ -629,6 +629,7 @@ typedef enum UErrorCode {
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */
U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */

View file

@ -153,6 +153,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
fCharNum = 0;
fQuoteMode = FALSE;
fFreeForm = FALSE;
fCaseI = FALSE;
fMatchOpenParen = -1;
fMatchCloseParen = -1;
@ -1223,8 +1224,7 @@ void RegexCompile::literalChar() {
if (fStringOpStart == -1) {
// First char of a string in the pattern.
// Emit a OneChar op into the compiled pattern.
op = URX_BUILD(URX_ONECHAR, fC.fChar);
fRXPat->fCompiledPat->addElement(op, *fStatus);
emitONE_CHAR(fC.fChar);
// Also add it to the string pool, in case we get a second adjacent literal
// and want to change from ONE_CHAR to STRING
@ -1239,9 +1239,13 @@ void RegexCompile::literalChar() {
// If the most recently emitted op is a URX_ONECHAR, change it to a string op.
op = fRXPat->fCompiledPat->lastElementi();
opType = URX_TYPE(op);
U_ASSERT(opType == URX_ONECHAR || opType == URX_STRING_LEN);
if (opType == URX_ONECHAR) {
op = URX_BUILD(URX_STRING, fStringOpStart);
U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
if (fCaseI) {
op = URX_BUILD(URX_STRING_I, fStringOpStart);
} else {
op = URX_BUILD(URX_STRING, fStringOpStart);
}
patternLoc = fRXPat->fCompiledPat->size() - 1;
fRXPat->fCompiledPat->setElementAt(op, patternLoc);
op = URX_BUILD(URX_STRING_LEN, 0);
@ -1258,6 +1262,29 @@ void RegexCompile::literalChar() {
//------------------------------------------------------------------------------
//
//   emitONE_CHAR    Append a single-character match op to the compiled pattern.
//                   When case insensitive mode (fCaseI) is on and the character
//                   has distinct lower and upper case forms, the case-folded
//                   character is emitted with URX_ONECHAR_I.  In every other
//                   situation the character is emitted unchanged with URX_ONECHAR.
//
//------------------------------------------------------------------------------
void RegexCompile::emitONE_CHAR(UChar32 c) {
    // Start from the plain literal compare; this is what gets emitted for an
    // uncased char, or whenever matching is case sensitive.
    int32_t  opType  = URX_ONECHAR;
    UChar32  operand = c;

    // TODO: replace with a better test.  See Alan L.'s mail of 2/6
    if (fCaseI && (u_tolower(c) != u_toupper(c))) {
        // Cased character while in case insensitive matching mode:
        // store the folded form and switch to the case insensitive op.
        operand = u_foldCase(c, U_FOLD_CASE_DEFAULT);
        opType  = URX_ONECHAR_I;
    }

    fRXPat->fCompiledPat->addElement(URX_BUILD(opType, operand), *fStatus);
}
//------------------------------------------------------------------------------
//
// fixLiterals When compiling something that can follow a literal
@ -1269,7 +1296,7 @@ void RegexCompile::literalChar() {
// Optionally, split the last char of the string off into
// a single "ONE_CHAR" operation, so that quantifiers can
// apply to that char alone. Example: abc*
// The * needs to apply to the 'c' only.
// The * must apply to the 'c' only.
//
//------------------------------------------------------------------------------
void RegexCompile::fixLiterals(UBool split) {
@ -1321,16 +1348,14 @@ void RegexCompile::fixLiterals(UBool split) {
stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
op = URX_BUILD(URX_STRING_LEN, stringLen);
fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
op = URX_BUILD(URX_ONECHAR, lastChar);
fRXPat->fCompiledPat->addElement(op, *fStatus);
emitONE_CHAR(lastChar);
} else {
// The original string consisted of exactly two characters. Replace
// the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
// of URX_ONECHARs.
op = URX_BUILD(URX_ONECHAR, nextToLastChar);
fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -2);
op = URX_BUILD(URX_ONECHAR, lastChar);
fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2);
emitONE_CHAR(nextToLastChar);
emitONE_CHAR(lastChar);
}
}

View file

@ -100,6 +100,8 @@ private:
void fixLiterals(UBool split=FALSE); // Fix literal strings.
void insertOp(int32_t where); // Open up a slot for a new op in the
// generated code at the specified location.
void emitONE_CHAR(UChar32 c); // Emit a ONE_CHAR op into the compiled code,
// taking case mode into account.
UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern
int32_t end); // for possibly matching an empty string.
@ -127,10 +129,9 @@ private:
RegexPatternChar fC; // Current char for parse state machine
// processing.
int32_t fStringOpStart; // While a literal string is being scanned
// holds the start index within RegexPattern.
// fLiteralText where the string is being stored.
//
// Data for the state machine that parses the regular expression.
//
RegexTableEl **fStateTable; // State Transition Table for regex Rule
// parsing. index by p[state][char-class]
@ -138,6 +139,15 @@ private:
int fStackPtr; // and pops as specified in the state
// transition rules.
//
// Data associated with the generation of the pcode for the match engine
//
UBool fCaseI; // Case Insensitive Match Mode is on.
int32_t fStringOpStart; // While a literal string is being scanned
// holds the start index within RegexPattern.
// fLiteralText where the string is being stored.
int32_t fPatternLength; // Length of the input pattern string.
UVector32 fParenStack; // parentheses stack. Each frame consists of

View file

@ -108,10 +108,16 @@ enum {
URX_LA_START = 37, // Starting a LookAround expression.
// Save InputPos and SP in static data.
// Operand: Static data offset for the save
URX_LA_END = 38 // Ending a Lookaround expression.
URX_LA_END = 38, // Ending a Lookaround expression.
// Restore InputPos and Stack to saved values.
// Operand: Static data offset for saved data.
};
URX_ONECHAR_I = 39, // Test for case-insensitive match of a literal character.
// Operand: the literal char.
URX_STRING_I = 40 // Case insensitive string compare.
// First Operand: Index of start of string in string literals
// Second Operand (next word in compiled code):
// the length of the string.
};
// Keep this list of opcode names in sync with the above enum
// Used for debug printing only.
@ -154,7 +160,9 @@ enum {
"STO_INP_LOC", \
"JMPX", \
"LA_START", \
"LA_END"
"LA_END", \
"ONECHAR_I", \
"STRING_I"
//
// Convenience macros for assembling and disassembling a compiled operation.

View file

@ -206,6 +206,15 @@ RegexPattern *RegexPattern::compile(
if (U_FAILURE(status)) {
return NULL;
}
const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
UREGEX_DOTALL | UREGEX_MULTILINE;
if ((flags & ~allFlags) != 0) {
status = U_REGEX_INVALID_FLAG;
return NULL;
}
if (flags != 0) {
status = U_REGEX_UNIMPLEMENTED;
return NULL;
@ -474,16 +483,19 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_BACKREF:
case URX_STO_INP_LOC:
case URX_JMPX:
case URX_LA_START:
case URX_LA_END:
// types with an integer operand field.
REGEX_DUMP_DEBUG_PRINTF("%d", val);
break;
case URX_ONECHAR:
case URX_ONECHAR_I:
REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
break;
case URX_STRING:
case URX_STRING_I:
{
int32_t lengthOp = fCompiledPat->elementAti(index+1);
U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);

View file

@ -290,14 +290,14 @@ public:
*
* @draft ICU 2.4
*/
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
virtual inline UClassID getDynamicClassID() const;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @draft ICU 2.4
*/
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
static inline UClassID getStaticClassID();
private:
//
@ -668,24 +668,25 @@ public:
/**
* setTrace Debug function, enable/disable tracing of the matching engine.
* For internal ICU development use only. DO NOT USE!!!!
* @internal
*/
void setTrace(UBool state);
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @draft ICU 2.2
*/
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @draft ICU 2.2
*/
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
static inline UClassID getStaticClassID();
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @draft ICU 2.2
*/
virtual inline UClassID getDynamicClassID() const;
private:
// Constructors and other object boilerplate are private.
@ -737,6 +738,13 @@ private:
};
// Inline bodies for the "poor man's RTTI" accessors of RegexPattern and RegexMatcher.
// getStaticClassID() returns the address of fgClassID cast to UClassID;
// getDynamicClassID() simply forwards to getStaticClassID().
inline UClassID RegexPattern::getStaticClassID() { return (UClassID)&fgClassID; }
inline UClassID RegexPattern::getDynamicClassID() const { return getStaticClassID(); }
inline UClassID RegexMatcher::getStaticClassID() { return (UClassID)&fgClassID; }
inline UClassID RegexMatcher::getDynamicClassID() const { return getStaticClassID(); }
U_NAMESPACE_END
#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
#endif

View file

@ -145,6 +145,7 @@
"(?:ABC)+" "<0>ABCABCABC</0>D"
"(?:ABC)DEF+" "<0>ABCDEFFF</0>D"
"AB\.C\eD\u0666E" "<0>AB.C\u001BD\u0666E</0>F"
"ab\Bde" "<0>abde</0>"
# {min,max} iteration qualifier