ICU-2422 Regexp general cleanup

X-SVN-Rev: 11387
This commit is contained in:
Andy Heninger 2003-03-24 05:23:07 +00:00
parent 86f44db9be
commit 4575efb175
6 changed files with 315 additions and 132 deletions

View file

@ -520,7 +520,6 @@ void RegexCompile::compile(
// are too short.
//
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
fRXPat->fMaxMatchLen = maxMatchLength(3, fRXPat->fCompiledPat->size()-1);
//
// Optimization pass: Categorize how a match can start, for use by find()
@ -1114,7 +1113,36 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doPossesiveInterval:
// Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
compileInterval(URX_CTR_INIT_P, URX_CTR_LOOP_P);
{
// Remember the loc for the top of the block being looped over.
// (Can not reserve a slot in the compiled pattern at this time, because
// compileInterval needs to reserve also, and blockTopLoc can only reserve
// once per block.)
int32_t topLoc = blockTopLoc(FALSE);
// Produce normal looping code.
compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
// Surround the just-emitted normal looping code with a STO_SP ... LD_SP
// just as if the loop was enclosed in atomic parentheses.
// First the STO_SP before the start of the loop
insertOp(topLoc);
int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
fRXPat->fDataSize += 1; // state stack ptr.
int32_t op = URX_BUILD(URX_STO_SP, varLoc);
fRXPat->fCompiledPat->setElementAt(op, topLoc);
int32_t loopOp = fRXPat->fCompiledPat->popi();
U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
loopOp++; // point LoopOp after the just-inserted STO_SP
fRXPat->fCompiledPat->push(loopOp, *fStatus);
// Then the LD_SP after the end of the loop
op = URX_BUILD(URX_LD_SP, varLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
break;
case doNGInterval:
@ -1685,7 +1713,6 @@ void RegexCompile::insertOp(int32_t where) {
opType == URX_STATE_SAVE ||
opType == URX_CTR_LOOP ||
opType == URX_CTR_LOOP_NG ||
opType == URX_CTR_LOOP_P ||
opType == URX_JMP_SAV ||
opType == URX_RELOC_OPRND) && opValue > where) {
// Target location for this opcode is after the insertion point and
@ -1705,6 +1732,13 @@ void RegexCompile::insertOp(int32_t where) {
fParenStack.setElementAt(x, loc);
}
}
if (fMatchCloseParen > where) {
fMatchCloseParen++;
}
if (fMatchOpenParen > where) {
fMatchOpenParen++;
}
}
@ -2011,6 +2045,15 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
// for all three types (greedy, non-greedy, possessive) of
// intervals. The opcodes are supplied as parameters.
//
// The code for interval loops has this form:
// 0 CTR_INIT counter loc (in stack frame)
// 1 5 patt address of CTR_LOOP at bottom of block
// 2 min count
// 3 max count (-1 for unbounded)
// 4 ... block to be iterated over
// 5 CTR_LOOP
//
// In
//----------------------------------------------------------------------------------------
void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
{
@ -2050,6 +2093,8 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
error(U_REGEX_MAX_LT_MIN);
}
}
@ -2348,7 +2393,6 @@ void RegexCompile::matchStartType() {
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_INIT_P:
{
// Loop Init Ops. These don't change the min length, but they are 4 word ops
// so location must be updated accordingly.
@ -2372,7 +2416,6 @@ void RegexCompile::matchStartType() {
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
// Loop ops.
// The jump is conditional, backwards only.
atStart = FALSE;
@ -2631,7 +2674,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_INIT_P:
{
// Loop Init Ops.
// If the min loop count == 0
@ -2652,7 +2694,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
// Loop ops.
// The jump is conditional, backwards only.
break;
@ -2882,10 +2923,8 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_INIT_P:
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
// For anything to do with loops, make the match length unbounded.
// TODO, possibly later, special case short loops like {0,1}.
// Note: INIT instructions are multi-word. Can ignore because
@ -2992,7 +3031,6 @@ void RegexCompile::stripNOPs() {
case URX_JMP:
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
case URX_RELOC_OPRND:
case URX_JMPX:
case URX_JMP_SAV:
@ -3007,14 +3045,59 @@ void RegexCompile::stripNOPs() {
break;
}
default:
// The remaining instructions are unaltered by the relocation.
case URX_RESERVED_OP:
case URX_RESERVED_OP_N:
case URX_BACKTRACK:
case URX_END:
case URX_ONECHAR:
case URX_STRING:
case URX_STRING_LEN:
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_STATIC_SETREF:
case URX_SETREF:
case URX_DOTANY:
case URX_FAIL:
case URX_BACKSLASH_B:
case URX_BACKSLASH_G:
case URX_UNUSED_1:
case URX_BACKSLASH_X:
case URX_BACKSLASH_Z:
case URX_DOTANY_ALL:
case URX_BACKSLASH_D:
case URX_CARET:
case URX_DOLLAR:
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_STO_SP:
case URX_LD_SP:
case URX_BACKREF:
case URX_STO_INP_LOC:
case URX_LA_START:
case URX_LA_END:
case URX_ONECHAR_I:
case URX_STRING_I:
case URX_BACKREF_I:
case URX_DOLLAR_M:
case URX_CARET_M:
case URX_LB_START:
case URX_LB_CONT:
case URX_LB_END:
case URX_LBN_CONT:
case URX_LBN_END:
// These instructions are unaltered by the relocation.
fRXPat->fCompiledPat->setElementAt(op, dst);
dst++;
break;
default:
// Some op is unaccounted for.
U_ASSERT(FALSE);
error(U_REGEX_INTERNAL_ERROR);
}
}
fRXPat->fCompiledPat->setSize(dst);
}

View file

@ -54,6 +54,7 @@ U_NAMESPACE_BEGIN
//
enum {
URX_RESERVED_OP = 0, // For multi-operand ops, most non-first words.
URX_RESERVED_OP_N = 255, // For multi-operand ops, negative operand values.
URX_BACKTRACK = 1,
URX_END = 2,
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
@ -84,7 +85,7 @@ enum {
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possesive.
URX_CTR_INIT_P = 27, // These are 4 word opcodes. See description.
URX_UNUSED_2 = 27, // These are 4 word opcodes. See description.
// First Operand: Data loc of counter variable
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
// at the end of the loop.
@ -92,7 +93,7 @@ enum {
// 4th Operand: Max count, -1 for unbounded.
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
URX_CTR_LOOP_NG = 29, // Also in three flavors.
URX_CTR_LOOP_P = 30, // Operand is loc of corresponding CTR_INIT.
URX_UNUSED_3 = 30, // Operand is loc of corresponding CTR_INIT.
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
// back into compiled pattern code, and thus must
@ -180,10 +181,10 @@ enum {
"URX_DOLLAR", \
"CTR_INIT", \
"CTR_INIT_NG", \
"CTR_INIT_P", \
"CTR_UNUSED_2", \
"CTR_LOOP", \
"CTR_LOOP_NG", \
"CTR_LOOP_P", \
"CTR_UNUSED_3", \
"RELOC_OPRND", \
"STO_SP", \
"LD_SP", \
@ -207,7 +208,7 @@ enum {
// Convenience macros for assembling and disassembling a compiled operation.
//
#define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
#define URX_TYPE(x) ((x) >> 24)
#define URX_TYPE(x) ((uint32_t)(x) >> 24)
#define URX_VAL(x) ((x) & 0xffffff)

View file

@ -171,7 +171,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
dest.append(escapedChar);
replIdx += (c==0x55? 9: 5);
// TODO: Report errors for mal-formed \u escapes?
// As this is, the original sequence is output.
// As this is, the original sequence is output, which may be OK.
continue;
}
}
@ -655,6 +655,94 @@ void RegexMatcher::setTrace(UBool state) {
//---------------------------------------------------------------------
//
// split
//
//---------------------------------------------------------------------
//
// Splits 'input' into fields, treating each match of this matcher's pattern
// as a field delimiter, and stores the fields in dest[0 .. destCapacity-1].
//
// input         The text to split; this matcher is reset to it.
// dest          Caller-supplied array of UnicodeString that receives the fields.
// destCapacity  Number of elements in dest. Must be at least 1.
// status        In/out error code. If it already indicates failure on entry,
//               nothing is done. Set to U_ILLEGAL_ARGUMENT_ERROR when
//               destCapacity < 1.
//
// Returns the number of dest entries that were filled. Returns 0 on an
// argument error and also for an empty input string (dest is left untouched
// in both cases).
//
// When there are more fields than destination strings, the last destination
// string receives all of the remaining input, delimiters included.
//
int32_t RegexMatcher::split(const UnicodeString &input,
UnicodeString dest[],
int32_t destCapacity,
UErrorCode &status)
{
//
// Check arguments for validity
//
if (U_FAILURE(status)) {
return 0;
};
if (destCapacity < 1) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
//
// Reset for the input text
//
reset(input);
int32_t inputLen = input.length();
// Index in 'input' at which the next field begins (just past the previous delimiter).
int32_t nextOutputStringStart = 0;
if (inputLen == 0) {
// Empty input yields zero fields; no destination string is written.
return 0;
}
//
// Loop through the input text, searching for the delimiter pattern
//
// i is the index of the destination string currently being filled. It is
// advanced by the loop itself and also, by hand, inside the capture group
// loop below.
int i;
// Presumably the number of capture groups in the pattern (size of its group map).
int32_t numCaptureGroups = fPattern->fGroupMap->size();
for (i=0; ; i++) {
if (i==destCapacity-1) {
// There is only one output string left.
// Fill it with whatever is left from the input, then exit the loop.
dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
break;
}
// NOTE(review): find() is assumed to locate the next match and to update
// fMatchStart / fMatchEnd, which are read below -- confirm.
if (find()) {
// We found another delimiter. Move everything from where we started looking
// up until the start of the delimiter into the next output string.
int32_t fieldLen = fMatchStart - nextOutputStringStart;
dest[i].setTo(input, nextOutputStringStart, fieldLen);
nextOutputStringStart = fMatchEnd;
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
if (i==destCapacity-1) {
// No destination strings left; the remaining capture groups are dropped.
break;
}
i++;
dest[i] = group(groupNum, status);
}
if (nextOutputStringStart == inputLen) {
// The delimiter was at the end of the string. We're done.
break;
}
if (i==destCapacity-1) {
// We've filled up the last output string with capture group data.
// Give back the last string, to be used for the remainder of the input.
// (The loop's i++ brings i back to destCapacity-1, and the check at the
// top of the loop then overwrites that string with the remaining input.)
i--;
}
}
else
{
// We ran off the end of the input while looking for the next delimiter.
// All the remaining text goes into the current output string.
dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
break;
}
}
// i is the index of the last string written, so the count is one more.
return i+1;
}
//--------------------------------------------------------------------------------
//
// start

View file

@ -67,7 +67,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fLiteralText = other.fLiteralText;
fDeferredStatus = other.fDeferredStatus;
fMinMatchLen = other.fMinMatchLen;
fMaxMatchLen = other.fMaxMatchLen;
fMaxCaptureDigits = other.fMaxCaptureDigits;
fStaticSets = other.fStaticSets;
@ -81,10 +80,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
// Note: do not copy fMatcher. It'll be created on first use if the
// destination needs one.
// TODO: thread safety
// Copy the Unicode Sets.
// Could be made more efficient if the sets were reference counted and shared,
// but I doubt that pattern copying will be particularly common.
@ -116,10 +111,8 @@ void RegexPattern::init() {
fFlags = 0;
fDeferredStatus = U_ZERO_ERROR;
fMinMatchLen = 0;
fMaxMatchLen = -1;
fMaxCaptureDigits = 1;
fStaticSets = NULL;
fMatcher = NULL;
fFrameSize = 0;
fDataSize = 0;
fStartType = START_NO_INFO;
@ -151,8 +144,6 @@ void RegexPattern::init() {
//
//--------------------------------------------------------------------------
void RegexPattern::zap() {
delete fMatcher;
fMatcher = NULL;
delete fCompiledPat;
fCompiledPat = NULL;
int i;
@ -263,6 +254,19 @@ RegexPattern *RegexPattern::compile( const UnicodeString &regex,
//
// compile with no UParseErr parameter.
//
// Convenience overload of compile() for callers that do not want parse error
// details. Forwards to the overload taking a UParseError, supplying a local
// one whose contents are discarded when this function returns.
RegexPattern *RegexPattern::compile(const UnicodeString &regex, uint32_t flags, UErrorCode &err)
{
    UParseError discardedParseError;
    RegexPattern *compiledPattern = compile(regex, flags, discardedParseError, err);
    return compiledPattern;
}
//---------------------------------------------------------------------
//
// flags
@ -280,6 +284,21 @@ uint32_t RegexPattern::flags() const {
//---------------------------------------------------------------------
// Creates a matcher for this pattern via matcher(status) and, when one was
// produced, resets it to the supplied input text. A NULL result from
// matcher(status) is passed straight back to the caller.
RegexMatcher *RegexPattern::matcher(const UnicodeString &input, UErrorCode &status) const
{
    RegexMatcher *newMatcher = matcher(status);
    if (newMatcher == NULL) {
        return NULL;
    }
    newMatcher->reset(input);
    return newMatcher;
}
//---------------------------------------------------------------------
//
// matcher(status)
//
//---------------------------------------------------------------------
RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
RegexMatcher *retMatcher = NULL;
if (U_FAILURE(status)) {
@ -295,7 +314,6 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
retMatcher->reset(input);
return retMatcher;
};
@ -352,99 +370,17 @@ int32_t RegexPattern::split(const UnicodeString &input,
int32_t destCapacity,
UErrorCode &status) const
{
//
// Check arguements for validity
//
if (U_FAILURE(status)) {
return 0;
};
if (destCapacity < 1) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
//
// If we don't already have a cached matcher object from a previous call
// to split(), create one now.
// TODO: NOT THREAD SAFE. FIX.
//
if (fMatcher == NULL) {
RegexMatcher *m = matcher(input, status);
if (U_FAILURE(status)) {
return 0;
}
// Need to cast off const to cache the matcher
RegexPattern *nonConstThis = (RegexPattern *)this;
nonConstThis->fMatcher = m;
}
//
// Set our input text into the matcher
//
fMatcher->reset(input);
int32_t inputLen = input.length();
int32_t nextOutputStringStart = 0;
if (inputLen == 0) {
return 0;
}
//
// Loop through the input text, searching for the delimiter pattern
//
int i;
int32_t numCaptureGroups = fGroupMap->size();
for (i=0; ; i++) {
if (i==destCapacity-1) {
// There is only one output string left.
// Fill it with whatever is left from the input, then exit the loop.
dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
break;
}
if (fMatcher->find()) {
// We found another delimiter. Move everything from where we started looking
// up until the start of the delimiter into the next output string.
int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
dest[i].setTo(input, nextOutputStringStart, fieldLen);
nextOutputStringStart = fMatcher->fMatchEnd;
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
if (i==destCapacity-1) {
break;
}
i++;
dest[i] = fMatcher->group(groupNum, status);
}
if (nextOutputStringStart == inputLen) {
// The delimiter was at the end of the string. We're done.
break;
}
if (i==destCapacity-1) {
// We've filled up the last output string with capture group data.
// Give back the last string, to be used for the remainder of the input.
i--;
}
}
else
{
// We ran off the end of the input while looking for the next delimiter.
// All the remaining text goes into the current output string.
dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
break;
}
}
return i+1;
RegexMatcher m(this);
int32_t r = m.split(input, dest, destCapacity, status);
return r;
}
//---------------------------------------------------------------------
//
// dump Output the compiled form of the pattern.
@ -489,10 +425,8 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_STRING_LEN:
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_INIT_P:
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
case URX_RELOC_OPRND:
case URX_STO_SP:
case URX_LD_SP:
@ -567,9 +501,6 @@ void RegexPattern::dumpOp(int32_t index) const {
// TODO: get rid of max match length
void RegexPattern::dump() const {
int index;
int i;
@ -580,7 +511,6 @@ void RegexPattern::dump() const {
}
REGEX_DUMP_DEBUG_PRINTF("\n");
REGEX_DUMP_DEBUG_PRINTF(" Min Match Length: %d\n", fMinMatchLen);
REGEX_DUMP_DEBUG_PRINTF(" Max Match Length: %d\n", fMaxMatchLen);
REGEX_DUMP_DEBUG_PRINTF(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
if (fStartType == START_STRING) {
REGEX_DUMP_DEBUG_PRINTF(" Initial match sting: \"");

View file

@ -58,7 +58,6 @@ struct REStackFrame;
/**
* Constants for Regular Expression Match Modes.
* <p>Note that non-default match modes will not be supported until ICU 2.6</p>
* @draft ICU 2.4
*/
enum {
@ -208,6 +207,29 @@ public:
UErrorCode &status);
/**
* Compiles the regular expression in string form into a RegexPattern
* object using the specified match mode flags. These compile methods,
* rather than the constructors, are the usual way that RegexPattern objects
* are created.
*
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
* objects created from the pattern are active. RegexMatchers keep a pointer
* back to their pattern, so premature deletion of the pattern is a
* catastrophic error.</p>
*
* @param regex The regular expression to be compiled.
* @param flags The match mode flags to be used.
* @param status A reference to a UErrorCode to receive any errors.
* @return A regexPattern object for the compiled pattern.
*
* @draft ICU 2.6
*/
static RegexPattern *compile( const UnicodeString &regex,
uint32_t flags,
UErrorCode &status);
/**
* Get the match mode flags that were used when compiling this pattern.
* @return the match mode flags
@ -231,6 +253,20 @@ public:
UErrorCode &status) const;
/**
* Creates a RegexMatcher that will match against this pattern. The
* RegexMatcher can be used to perform match, find or replace operations.
* Note that a RegexPattern object must not be deleted while
* RegexMatchers created from it still exist and might possibly be used again.
*
* @param status A reference to a UErrorCode to receive any errors.
* @return A RegexMatcher object for this pattern.
*
* @draft ICU 2.6
*/
virtual RegexMatcher *matcher(UErrorCode &status) const;
/**
* Test whether a string matches a regular expression. This convenience function
* both compiles the regular expression and applies it in a single operation.
@ -259,7 +295,14 @@ public:
/**
* Split a string around matches of the pattern. Somewhat like split() from Perl.
* Split a string into fields. Somewhat like split() from Perl.
* The pattern matches identify delimiters that separate the input
* into fields. The input data between the matches becomes the
* fields themselves.
* <p>
* For the best performance on split() operations,
* <code>RegexMatcher::split</code> is preferable to this function.
*
* @param input The string to be split into fields. The field delimiters
* match the pattern (in the "this" object)
* @param dest An array of UnicodeStrings to receive the results of the split.
@ -324,17 +367,6 @@ private:
// value may be less than the true shortest
// possible match.
int32_t fMaxMatchLen; // Maximum Match Length. All matches will have length
// <= this value. For some patterns, this calculated
// value may be greater than the true longest
// possible match. For patterns with unbounded
// match length, value = -1.
RegexMatcher *fMatcher; // A cached matcher for this pattern, used for
// split(), to avoid having to
// make new ones on each call.
// TODO: fix thread safety problems.
int32_t fFrameSize; // Size of a state stack frame in the
// execution engine.
@ -402,6 +434,12 @@ public:
* created for the same expression, it will be more efficient to
* separately create and cache a RegexPattern object, and use
* its matcher() method to create the RegexMatcher objects.
*
* @param regexp The Regular Expression to be compiled.
* @param flags Regular expression options, such as case insensitive matching.
* @see UREGEX_CASE_INSENSITIVE
* @param status Any errors are reported by setting this UErrorCode variable.
* @draft ICU 2.6
*/
RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
@ -412,6 +450,12 @@ public:
* created for the same expression, it will be more efficient to
* separately create and cache a RegexPattern object, and use
* its matcher() method to create the RegexMatcher objects.
*
* @param regexp The Regular Expression to be compiled.
* @param input The string to which the regular expression will be applied.
* @param flags Regular expression options, such as case insensitive matching.
* @see UREGEX_CASE_INSENSITIVE
* @param status Any errors are reported by setting this UErrorCode variable.
* @draft ICU 2.6
*/
RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
uint32_t flags, UErrorCode &status);
@ -690,6 +734,37 @@ public:
/**
* Split a string into fields. Somewhat like split() from Perl.
* The pattern matches identify delimiters that separate the input
* into fields. The input data between the matches becomes the
* fields themselves.
* <p>
*
* @param input The string to be split into fields. The field delimiters
* match the pattern (in the "this" object). This matcher
* will be reset to this input string.
* @param dest An array of UnicodeStrings to receive the results of the split.
* This is an array of actual UnicodeString objects, not an
* array of pointers to strings. Local (stack based) arrays can
* work well here.
* @param destCapacity The number of elements in the destination array.
* If the number of fields found is less than destCapacity, the
* extra strings in the destination array are not altered.
* If the number of destination strings is less than the number
* of fields, the trailing part of the input string, including any
* field delimiters, is placed in the last destination string.
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
* @draft ICU 2.4
*/
virtual int32_t split(const UnicodeString &input,
UnicodeString dest[],
int32_t destCapacity,
UErrorCode &status);
/**
* setTrace Debug function, enable/disable tracing of the matching engine.
* For internal ICU development use only. DO NOT USE!!!!

View file

@ -188,6 +188,12 @@
"(ABC){2,3}?AX" "<0>ABCABC<1>ABC</1>AX</0>"
"(ABC){2,3}?AX" "ABC<0>ABCABC<1>ABC</1>AX</0>"
# Possessive {min,max}+ intervals
"(ABC){2,3}+ABC" "ABCABCABC"
"(ABC){1,2}+ABC" "<0>ABC<1>ABC</1>ABC</0>"
"(?:(.)\1){2,5}+." "<0>aabbcc<1>d</1>de</0>x"
# Atomic Grouping
"(?>.*)abc" "abcabcabc" # no match. .* consumed entire string.
"(?>(abc{2,4}?))(c*)" "<0><1>abcc</1><2>ccc</2></0>ddd"