mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 12:40:02 +00:00
ICU-2422 regexp, case insensitive matching, work in progress
X-SVN-Rev: 10989
This commit is contained in:
parent
6c56af4be2
commit
7f44eb19cf
8 changed files with 97 additions and 31 deletions
|
@ -1843,7 +1843,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
|||
"U_REGEX_NUMBER_TOO_BIG",
|
||||
"U_REGEX_BAD_INTERVAL",
|
||||
"U_REGEX_MAX_LT_MIN",
|
||||
"U_REGEX_INVALID_BACK_REF"
|
||||
"U_REGEX_INVALID_BACK_REF",
|
||||
"U_REGEX_INVALID_FLAG"
|
||||
};
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
|
|
|
@ -629,6 +629,7 @@ typedef enum UErrorCode {
|
|||
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
|
||||
U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
|
||||
U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */
|
||||
U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */
|
||||
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||
|
||||
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
|
|
|
@ -153,6 +153,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
|||
fCharNum = 0;
|
||||
fQuoteMode = FALSE;
|
||||
fFreeForm = FALSE;
|
||||
fCaseI = FALSE;
|
||||
|
||||
fMatchOpenParen = -1;
|
||||
fMatchCloseParen = -1;
|
||||
|
@ -1223,8 +1224,7 @@ void RegexCompile::literalChar() {
|
|||
if (fStringOpStart == -1) {
|
||||
// First char of a string in the pattern.
|
||||
// Emit a OneChar op into the compiled pattern.
|
||||
op = URX_BUILD(URX_ONECHAR, fC.fChar);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
emitONE_CHAR(fC.fChar);
|
||||
|
||||
// Also add it to the string pool, in case we get a second adjacent literal
|
||||
// and want to change from ONE_CHAR to STRING
|
||||
|
@ -1239,9 +1239,13 @@ void RegexCompile::literalChar() {
|
|||
// If the most recently emitted op is a URX_ONECHAR, change it to a string op.
|
||||
op = fRXPat->fCompiledPat->lastElementi();
|
||||
opType = URX_TYPE(op);
|
||||
U_ASSERT(opType == URX_ONECHAR || opType == URX_STRING_LEN);
|
||||
if (opType == URX_ONECHAR) {
|
||||
op = URX_BUILD(URX_STRING, fStringOpStart);
|
||||
U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
|
||||
if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
|
||||
if (fCaseI) {
|
||||
op = URX_BUILD(URX_STRING_I, fStringOpStart);
|
||||
} else {
|
||||
op = URX_BUILD(URX_STRING, fStringOpStart);
|
||||
}
|
||||
patternLoc = fRXPat->fCompiledPat->size() - 1;
|
||||
fRXPat->fCompiledPat->setElementAt(op, patternLoc);
|
||||
op = URX_BUILD(URX_STRING_LEN, 0);
|
||||
|
@ -1258,6 +1262,29 @@ void RegexCompile::literalChar() {
|
|||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// emitONE_CHAR emit a ONE_CHAR op into the generated code.
|
||||
// Choose cased or uncased version, depending on the
|
||||
// match mode and whether the character itself is cased.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Appends exactly one op for the code point c to fRXPat->fCompiledPat:
// URX_ONECHAR_I (operand = case-folded c) when case-insensitive mode is on
// and c is cased, otherwise URX_ONECHAR (operand = c unchanged).
// Any failure from the append is reported through *fStatus.
void RegexCompile::emitONE_CHAR(UChar32 c) {
|
||||
int32_t op;
|
||||
// "Cased" is approximated here as: the lower- and upper-case mappings differ.
if (fCaseI && (u_tolower(c) != u_toupper(c))) {
|
||||
// We have a cased character, and are in case-insensitive matching mode.
|
||||
// TODO: replace with a better test. See Alan L.'s mail of 2/6
|
||||
// Store the default case folding of the char as the operand.
// NOTE(review): presumably the match engine folds input the same way
// before comparing -- confirm against the URX_ONECHAR_I handler.
c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||||
op = URX_BUILD(URX_ONECHAR_I, c);
|
||||
} else {
|
||||
// Uncased char, or case-sensitive match mode.
|
||||
// Either way, just generate a literal compare of the char.
|
||||
op = URX_BUILD(URX_ONECHAR, c);
|
||||
}
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// fixLiterals When compiling something that can follow a literal
|
||||
|
@ -1269,7 +1296,7 @@ void RegexCompile::literalChar() {
|
|||
// Optionally, split the last char of the string off into
|
||||
// a single "ONE_CHAR" operation, so that quantifiers can
|
||||
// apply to that char alone. Example: abc*
|
||||
// The * needs to apply to the 'c' only.
|
||||
// The * must apply to the 'c' only.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
void RegexCompile::fixLiterals(UBool split) {
|
||||
|
@ -1321,16 +1348,14 @@ void RegexCompile::fixLiterals(UBool split) {
|
|||
stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx);
|
||||
op = URX_BUILD(URX_STRING_LEN, stringLen);
|
||||
fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
|
||||
op = URX_BUILD(URX_ONECHAR, lastChar);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
emitONE_CHAR(lastChar);
|
||||
} else {
|
||||
// The original string consisted of exactly two characters. Replace
|
||||
// the existing compiled URX_STRING/URX_STRING_LEN ops with a pair
|
||||
// of URX_ONECHARs.
|
||||
op = URX_BUILD(URX_ONECHAR, nextToLastChar);
|
||||
fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -2);
|
||||
op = URX_BUILD(URX_ONECHAR, lastChar);
|
||||
fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1);
|
||||
fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2);
|
||||
emitONE_CHAR(nextToLastChar);
|
||||
emitONE_CHAR(lastChar);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -100,6 +100,8 @@ private:
|
|||
void fixLiterals(UBool split=FALSE); // Fix literal strings.
|
||||
void insertOp(int32_t where); // Open up a slot for a new op in the
|
||||
// generated code at the specified location.
|
||||
void emitONE_CHAR(UChar32 c); // Emit a ONE_CHAR op into the compiled code,
|
||||
// taking case mode into account.
|
||||
UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern for
|
||||
int32_t end); // possibly matching an empty string.
|
||||
|
||||
|
@ -127,10 +129,9 @@ private:
|
|||
RegexPatternChar fC; // Current char for parse state machine
|
||||
// processing.
|
||||
|
||||
int32_t fStringOpStart; // While a literal string is being scanned
|
||||
// holds the start index within RegexPattern.
|
||||
// fLiteralText where the string is being stored.
|
||||
|
||||
//
|
||||
// Data for the state machine that parses the regular expression.
|
||||
//
|
||||
RegexTableEl **fStateTable; // State Transition Table for regex Rule
|
||||
// parsing. index by p[state][char-class]
|
||||
|
||||
|
@ -138,6 +139,15 @@ private:
|
|||
int fStackPtr; // and pops as specified in the state
|
||||
// transition rules.
|
||||
|
||||
//
|
||||
// Data associated with the generation of the pcode for the match engine
|
||||
//
|
||||
UBool fCaseI; // Case Insensitive Match Mode is on.
|
||||
|
||||
int32_t fStringOpStart; // While a literal string is being scanned
|
||||
// holds the start index within RegexPattern.
|
||||
// fLiteralText where the string is being stored.
|
||||
|
||||
int32_t fPatternLength; // Length of the input pattern string.
|
||||
|
||||
UVector32 fParenStack; // parentheses stack. Each frame consists of
|
||||
|
|
|
@ -108,10 +108,16 @@ enum {
|
|||
URX_LA_START = 37, // Starting a LookAround expression.
|
||||
// Save InputPos and SP in static data.
|
||||
// Operand: Static data offset for the save
|
||||
URX_LA_END = 38 // Ending a Lookaround expression.
|
||||
URX_LA_END = 38, // Ending a Lookaround expression.
|
||||
// Restore InputPos and Stack to saved values.
|
||||
// Operand: Static data offset for saved data.
|
||||
};
|
||||
URX_ONECHAR_I = 39, // Test for case-insensitive match of a literal character.
|
||||
// Operand: the literal char.
|
||||
URX_STRING_I = 40 // Case insensitive string compare.
|
||||
// First Operand: Index of start of string in string literals
|
||||
// Second Operand (next word in compiled code):
|
||||
// the length of the string.
|
||||
};
|
||||
|
||||
// Keep this list of opcode names in sync with the above enum
|
||||
// Used for debug printing only.
|
||||
|
@ -154,7 +160,9 @@ enum {
|
|||
"STO_INP_LOC", \
|
||||
"JMPX", \
|
||||
"LA_START", \
|
||||
"LA_END"
|
||||
"LA_END", \
|
||||
"ONECHAR_I", \
|
||||
"STRING_I"
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
|
|
|
@ -206,6 +206,15 @@ RegexPattern *RegexPattern::compile(
|
|||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
|
||||
UREGEX_DOTALL | UREGEX_MULTILINE;
|
||||
|
||||
if ((flags & ~allFlags) != 0) {
|
||||
status = U_REGEX_INVALID_FLAG;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (flags != 0) {
|
||||
status = U_REGEX_UNIMPLEMENTED;
|
||||
return NULL;
|
||||
|
@ -474,16 +483,19 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
case URX_BACKREF:
|
||||
case URX_STO_INP_LOC:
|
||||
case URX_JMPX:
|
||||
|
||||
case URX_LA_START:
|
||||
case URX_LA_END:
|
||||
// types with an integer operand field.
|
||||
REGEX_DUMP_DEBUG_PRINTF("%d", val);
|
||||
break;
|
||||
|
||||
case URX_ONECHAR:
|
||||
case URX_ONECHAR_I:
|
||||
REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
|
||||
break;
|
||||
|
||||
case URX_STRING:
|
||||
case URX_STRING_I:
|
||||
{
|
||||
int32_t lengthOp = fCompiledPat->elementAti(index+1);
|
||||
U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
|
||||
|
|
|
@ -290,14 +290,14 @@ public:
|
|||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
|
||||
virtual inline UClassID getDynamicClassID() const;
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||
static inline UClassID getStaticClassID();
|
||||
|
||||
private:
|
||||
//
|
||||
|
@ -668,24 +668,25 @@ public:
|
|||
|
||||
/**
|
||||
* setTrace Debug function, enable/disable tracing of the matching engine.
|
||||
* For internal ICU development use only. DO NOT USE!!!!
|
||||
* @internal
|
||||
*/
|
||||
void setTrace(UBool state);
|
||||
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
*
|
||||
* @draft ICU 2.2
|
||||
*/
|
||||
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
*
|
||||
* @draft ICU 2.2
|
||||
*/
|
||||
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||
static inline UClassID getStaticClassID();
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
*
|
||||
* @draft ICU 2.2
|
||||
*/
|
||||
virtual inline UClassID getDynamicClassID() const;
|
||||
|
||||
private:
|
||||
// Constructors and other object boilerplate are private.
|
||||
|
@ -737,6 +738,13 @@ private:
|
|||
|
||||
};
|
||||
|
||||
// Out-of-line inline definitions of the class-ID accessors for RegexPattern.
// getStaticClassID() returns the address of the class's fgClassID member as
// a UClassID; getDynamicClassID() forwards to getStaticClassID().
inline UClassID RegexPattern::getStaticClassID() { return (UClassID)&fgClassID; }
|
||||
inline UClassID RegexPattern::getDynamicClassID() const { return getStaticClassID(); }
|
||||
|
||||
// Same pair of accessors for RegexMatcher.
inline UClassID RegexMatcher::getStaticClassID() { return (UClassID)&fgClassID; }
|
||||
inline UClassID RegexMatcher::getDynamicClassID() const { return getStaticClassID(); }
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
#endif
|
||||
|
|
1
icu4c/source/test/testdata/regextst.txt
vendored
1
icu4c/source/test/testdata/regextst.txt
vendored
|
@ -145,6 +145,7 @@
|
|||
"(?:ABC)+" "<0>ABCABCABC</0>D"
|
||||
"(?:ABC)DEF+" "<0>ABCDEFFF</0>D"
|
||||
"AB\.C\eD\u0666E" "<0>AB.C\u001BD\u0666E</0>F"
|
||||
"ab\Bde" "<0>abde</0>"
|
||||
|
||||
|
||||
# {min,max} iteration qualifier
|
||||
|
|
Loading…
Add table
Reference in a new issue