mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-2422 Regexp general cleanup
X-SVN-Rev: 11387
This commit is contained in:
parent
86f44db9be
commit
4575efb175
6 changed files with 315 additions and 132 deletions
icu4c/source
|
@ -520,7 +520,6 @@ void RegexCompile::compile(
|
|||
// are too short.
|
||||
//
|
||||
fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
||||
fRXPat->fMaxMatchLen = maxMatchLength(3, fRXPat->fCompiledPat->size()-1);
|
||||
|
||||
//
|
||||
// Optimization pass: Categorize how a match can start, for use by find()
|
||||
|
@ -1114,7 +1113,36 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
case doPossesiveInterval:
|
||||
// Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
|
||||
compileInterval(URX_CTR_INIT_P, URX_CTR_LOOP_P);
|
||||
{
|
||||
// Remember the loc for the top of the block being looped over.
|
||||
// (Can not reserve a slot in the compiled pattern at this time, because
|
||||
// compileInterval needs to reserve also, and blockTopLoc can only reserve
|
||||
// once per block.)
|
||||
int32_t topLoc = blockTopLoc(FALSE);
|
||||
|
||||
// Produce normal looping code.
|
||||
compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
|
||||
|
||||
// Surround the just-emitted normal looping code with a STO_SP ... LD_SP
|
||||
// just as if the loop was enclosed in atomic parentheses.
|
||||
|
||||
// First the STO_SP before the start of the loop
|
||||
insertOp(topLoc);
|
||||
int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
|
||||
fRXPat->fDataSize += 1; // state stack ptr.
|
||||
int32_t op = URX_BUILD(URX_STO_SP, varLoc);
|
||||
fRXPat->fCompiledPat->setElementAt(op, topLoc);
|
||||
|
||||
int32_t loopOp = fRXPat->fCompiledPat->popi();
|
||||
U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
|
||||
loopOp++; // point LoopOp after the just-inserted STO_SP
|
||||
fRXPat->fCompiledPat->push(loopOp, *fStatus);
|
||||
|
||||
// Then the LD_SP after the end of the loop
|
||||
op = URX_BUILD(URX_LD_SP, varLoc);
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case doNGInterval:
|
||||
|
@ -1685,7 +1713,6 @@ void RegexCompile::insertOp(int32_t where) {
|
|||
opType == URX_STATE_SAVE ||
|
||||
opType == URX_CTR_LOOP ||
|
||||
opType == URX_CTR_LOOP_NG ||
|
||||
opType == URX_CTR_LOOP_P ||
|
||||
opType == URX_JMP_SAV ||
|
||||
opType == URX_RELOC_OPRND) && opValue > where) {
|
||||
// Target location for this opcode is after the insertion point and
|
||||
|
@ -1705,6 +1732,13 @@ void RegexCompile::insertOp(int32_t where) {
|
|||
fParenStack.setElementAt(x, loc);
|
||||
}
|
||||
}
|
||||
|
||||
if (fMatchCloseParen > where) {
|
||||
fMatchCloseParen++;
|
||||
}
|
||||
if (fMatchOpenParen > where) {
|
||||
fMatchOpenParen++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -2011,6 +2045,15 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
|
|||
// for all three types (greedy, non-greedy, possessive) of
|
||||
// intervals. The opcodes are supplied as parameters.
|
||||
//
|
||||
// The code for interval loops has this form:
|
||||
// 0 CTR_INIT counter loc (in stack frame)
|
||||
// 1 5 patt address of CTR_LOOP at bottom of block
|
||||
// 2 min count
|
||||
// 3 max count (-1 for unbounded)
|
||||
// 4 ... block to be iterated over
|
||||
// 5 CTR_LOOP
|
||||
//
|
||||
// In
|
||||
//----------------------------------------------------------------------------------------
|
||||
void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
|
||||
{
|
||||
|
@ -2050,6 +2093,8 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
|
|||
error(U_REGEX_MAX_LT_MIN);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -2348,7 +2393,6 @@ void RegexCompile::matchStartType() {
|
|||
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_INIT_P:
|
||||
{
|
||||
// Loop Init Ops. These don't change the min length, but they are 4 word ops
|
||||
// so location must be updated accordingly.
|
||||
|
@ -2372,7 +2416,6 @@ void RegexCompile::matchStartType() {
|
|||
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
// Loop ops.
|
||||
// The jump is conditional, backwards only.
|
||||
atStart = FALSE;
|
||||
|
@ -2631,7 +2674,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
|||
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_INIT_P:
|
||||
{
|
||||
// Loop Init Ops.
|
||||
// If the min loop count == 0
|
||||
|
@ -2652,7 +2694,6 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
|||
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
// Loop ops.
|
||||
// The jump is conditional, backwards only.
|
||||
break;
|
||||
|
@ -2882,10 +2923,8 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
|||
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_INIT_P:
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
// For anything to do with loops, make the match length unbounded.
|
||||
// TODO, possibly later, special case short loops like {0,1}.
|
||||
// Note: INIT instructions are multi-word. Can ignore because
|
||||
|
@ -2992,7 +3031,6 @@ void RegexCompile::stripNOPs() {
|
|||
case URX_JMP:
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
case URX_RELOC_OPRND:
|
||||
case URX_JMPX:
|
||||
case URX_JMP_SAV:
|
||||
|
@ -3007,14 +3045,59 @@ void RegexCompile::stripNOPs() {
|
|||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
// The remaining instructions are unaltered by the relocation.
|
||||
case URX_RESERVED_OP:
|
||||
case URX_RESERVED_OP_N:
|
||||
case URX_BACKTRACK:
|
||||
case URX_END:
|
||||
case URX_ONECHAR:
|
||||
case URX_STRING:
|
||||
case URX_STRING_LEN:
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_STATIC_SETREF:
|
||||
case URX_SETREF:
|
||||
case URX_DOTANY:
|
||||
case URX_FAIL:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_UNUSED_1:
|
||||
case URX_BACKSLASH_X:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_DOTANY_ALL:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_STO_SP:
|
||||
case URX_LD_SP:
|
||||
case URX_BACKREF:
|
||||
case URX_STO_INP_LOC:
|
||||
case URX_LA_START:
|
||||
case URX_LA_END:
|
||||
case URX_ONECHAR_I:
|
||||
case URX_STRING_I:
|
||||
case URX_BACKREF_I:
|
||||
case URX_DOLLAR_M:
|
||||
case URX_CARET_M:
|
||||
case URX_LB_START:
|
||||
case URX_LB_CONT:
|
||||
case URX_LB_END:
|
||||
case URX_LBN_CONT:
|
||||
case URX_LBN_END:
|
||||
// These instructions are unaltered by the relocation.
|
||||
fRXPat->fCompiledPat->setElementAt(op, dst);
|
||||
dst++;
|
||||
break;
|
||||
|
||||
default:
|
||||
// Some op is unaccounted for.
|
||||
U_ASSERT(FALSE);
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
fRXPat->fCompiledPat->setSize(dst);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -54,6 +54,7 @@ U_NAMESPACE_BEGIN
|
|||
//
|
||||
enum {
|
||||
URX_RESERVED_OP = 0, // For multi-operand ops, most non-first words.
|
||||
URX_RESERVED_OP_N = 255, // For multi-operand ops, negative operand values.
|
||||
URX_BACKTRACK = 1,
|
||||
URX_END = 2,
|
||||
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
|
||||
|
@ -84,7 +85,7 @@ enum {
|
|||
|
||||
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
|
||||
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possesive.
|
||||
URX_CTR_INIT_P = 27, // These are 4 word opcodes. See description.
|
||||
URX_UNUSED_2 = 27, // These are 4 word opcodes. See description.
|
||||
// First Operand: Data loc of counter variable
|
||||
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
|
||||
// at the end of the loop.
|
||||
|
@ -92,7 +93,7 @@ enum {
|
|||
// 4th Operand: Max count, -1 for unbounded.
|
||||
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
|
||||
URX_CTR_LOOP_NG = 29, // Also in three flavors.
|
||||
URX_CTR_LOOP_P = 30, // Operand is loc of corresponding CTR_INIT.
|
||||
URX_UNUSED_3 = 30, // Operand is loc of corresponding CTR_INIT.
|
||||
|
||||
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
|
||||
// back into compiled pattern code, and thus must
|
||||
|
@ -180,10 +181,10 @@ enum {
|
|||
"URX_DOLLAR", \
|
||||
"CTR_INIT", \
|
||||
"CTR_INIT_NG", \
|
||||
"CTR_INIT_P", \
|
||||
"CTR_UNUSED_2", \
|
||||
"CTR_LOOP", \
|
||||
"CTR_LOOP_NG", \
|
||||
"CTR_LOOP_P", \
|
||||
"CTR_UNUSED_3", \
|
||||
"RELOC_OPRND", \
|
||||
"STO_SP", \
|
||||
"LD_SP", \
|
||||
|
@ -207,7 +208,7 @@ enum {
|
|||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
//
|
||||
// URX_BUILD packs an opcode type (upper 8 bits) and an operand value (lower 24 bits,
//   see URX_VAL) into a single int32_t compiled-pattern word.
//   Both arguments are parenthesized so that expression arguments bind correctly,
//   and the shift is done on an unsigned value so opcode types >= 128 do not
//   overflow a signed int.
#define URX_BUILD(type, val) (int32_t)(((uint32_t)(type) << 24) | (uint32_t)(val))
|
||||
#define URX_TYPE(x) ((x) >> 24)
|
||||
#define URX_TYPE(x) ((uint32_t)(x) >> 24)
|
||||
#define URX_VAL(x) ((x) & 0xffffff)
|
||||
|
||||
|
||||
|
|
|
@ -171,7 +171,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
|
|||
dest.append(escapedChar);
|
||||
replIdx += (c==0x55? 9: 5);
|
||||
// TODO: Report errors for mal-formed \u escapes?
|
||||
// As this is, the original sequence is output.
|
||||
// As this is, the original sequence is output, which may be OK.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -655,6 +655,94 @@ void RegexMatcher::setTrace(UBool state) {
|
|||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// split
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
//
//  Splits the input text into fields, treating each match of this matcher's
//  pattern as a field delimiter.
//
//  @param input         The text to split.  This matcher is reset to this text.
//  @param dest          Caller-supplied array that receives the fields and, when the
//                       pattern has capture groups, the captured delimiter text.
//  @param destCapacity  Number of elements in dest.  Must be at least 1.
//  @param status        Set to U_ILLEGAL_ARGUMENT_ERROR if destCapacity < 1.
//                       If it already indicates a failure on entry, nothing is done.
//  @return              The number of elements of dest that were filled in;
//                       0 for empty input or when an argument check fails.
//
int32_t  RegexMatcher::split(const UnicodeString &input,
        UnicodeString    dest[],
        int32_t          destCapacity,
        UErrorCode      &status)
{
    //
    // Check arguments for validity
    //
    if (U_FAILURE(status)) {
        return 0;
    }

    if (destCapacity < 1) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    //
    // Reset for the input text
    //
    reset(input);
    int32_t   inputLen = input.length();
    int32_t   nextOutputStringStart = 0;
    if (inputLen == 0) {
        return 0;
    }

    //
    // Loop through the input text, searching for the delimiter pattern
    //
    int32_t   i;      // Index of the dest string currently being filled.
    int32_t   numCaptureGroups = fPattern->fGroupMap->size();
    for (i=0; ; i++) {
        if (i==destCapacity-1) {
            // There is only one output string left.
            // Fill it with whatever is left from the input, then exit the loop.
            dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
            break;
        }
        if (find()) {
            // We found another delimiter.  Move everything from where we started looking
            //  up until the start of the delimiter into the next output string.
            int32_t  fieldLen = fMatchStart - nextOutputStringStart;
            dest[i].setTo(input, nextOutputStringStart, fieldLen);
            nextOutputStringStart = fMatchEnd;

            // If the delimiter pattern has capturing parentheses, the captured
            //  text goes out into the next n destination strings.
            int32_t groupNum;
            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
                if (i==destCapacity-1) {
                    break;
                }
                i++;
                dest[i] = group(groupNum, status);
            }

            if (nextOutputStringStart == inputLen) {
                // The delimiter was at the end of the string.  We're done.
                break;
            }

            if (i==destCapacity-1) {
                // We've filled up the last output string with capture group data.
                //  Give back the last string, to be used for the remainder of the input.
                //  (The loop increment brings i back to destCapacity-1, and the check at
                //   the top of the loop then stores the remaining input there.)
                i--;
            }
        }
        else
        {
            // We ran off the end of the input while looking for the next delimiter.
            // All the remaining text goes into the current output string.
            dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
            break;
        }
    }
    return i+1;
}
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// start
|
||||
|
|
|
@ -67,7 +67,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fLiteralText = other.fLiteralText;
|
||||
fDeferredStatus = other.fDeferredStatus;
|
||||
fMinMatchLen = other.fMinMatchLen;
|
||||
fMaxMatchLen = other.fMaxMatchLen;
|
||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||
fStaticSets = other.fStaticSets;
|
||||
|
||||
|
@ -81,10 +80,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
|
||||
fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
|
||||
|
||||
// Note: do not copy fMatcher. It'll be created on first use if the
|
||||
// destination needs one.
|
||||
// TODO: thread safety
|
||||
|
||||
// Copy the Unicode Sets.
|
||||
// Could be made more efficient if the sets were reference counted and shared,
|
||||
// but I doubt that pattern copying will be particularly common.
|
||||
|
@ -116,10 +111,8 @@ void RegexPattern::init() {
|
|||
fFlags = 0;
|
||||
fDeferredStatus = U_ZERO_ERROR;
|
||||
fMinMatchLen = 0;
|
||||
fMaxMatchLen = -1;
|
||||
fMaxCaptureDigits = 1;
|
||||
fStaticSets = NULL;
|
||||
fMatcher = NULL;
|
||||
fFrameSize = 0;
|
||||
fDataSize = 0;
|
||||
fStartType = START_NO_INFO;
|
||||
|
@ -151,8 +144,6 @@ void RegexPattern::init() {
|
|||
//
|
||||
//--------------------------------------------------------------------------
|
||||
void RegexPattern::zap() {
|
||||
delete fMatcher;
|
||||
fMatcher = NULL;
|
||||
delete fCompiledPat;
|
||||
fCompiledPat = NULL;
|
||||
int i;
|
||||
|
@ -263,6 +254,19 @@ RegexPattern *RegexPattern::compile( const UnicodeString &regex,
|
|||
|
||||
|
||||
|
||||
//
|
||||
// compile with no UParseErr parameter.
|
||||
//
|
||||
//
//   Convenience overload for callers that do not want parse-error details:
//   a local UParseError is handed to the full compile() and then discarded.
//
RegexPattern *RegexPattern::compile( const UnicodeString &regex,
                             uint32_t             flags,
                             UErrorCode           &err)
{
    UParseError pe;
    return compile(regex, flags, pe, err);
}
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// flags
|
||||
|
@ -280,6 +284,21 @@ uint32_t RegexPattern::flags() const {
|
|||
//---------------------------------------------------------------------
|
||||
RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
                                    UErrorCode          &status)  const {
    // Creation is delegated to matcher(status); this overload only attaches the
    //   input text.  A NULL result means no matcher was created, so there is
    //   nothing to reset.
    RegexMatcher    *retMatcher = matcher(status);
    if (retMatcher != NULL) {
        retMatcher->reset(input);
    }
    return retMatcher;
}
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// matcher(status)
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
|
||||
RegexMatcher *retMatcher = NULL;
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -295,7 +314,6 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
|
|||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
retMatcher->reset(input);
|
||||
return retMatcher;
|
||||
};
|
||||
|
||||
|
@ -352,99 +370,17 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
|||
int32_t destCapacity,
|
||||
UErrorCode &status) const
|
||||
{
|
||||
//
|
||||
// Check arguements for validity
|
||||
//
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
};
|
||||
|
||||
if (destCapacity < 1) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// If we don't already have a cached matcher object from a previous call
|
||||
// to split(), create one now.
|
||||
// TODO: NOT THREAD SAFE. FIX.
|
||||
//
|
||||
if (fMatcher == NULL) {
|
||||
RegexMatcher *m = matcher(input, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
// Need to cast off const to cache the matcher
|
||||
RegexPattern *nonConstThis = (RegexPattern *)this;
|
||||
nonConstThis->fMatcher = m;
|
||||
}
|
||||
|
||||
//
|
||||
// Set our input text into the matcher
|
||||
//
|
||||
fMatcher->reset(input);
|
||||
int32_t inputLen = input.length();
|
||||
int32_t nextOutputStringStart = 0;
|
||||
if (inputLen == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Loop through the input text, searching for the delimiter pattern
|
||||
//
|
||||
int i;
|
||||
int32_t numCaptureGroups = fGroupMap->size();
|
||||
for (i=0; ; i++) {
|
||||
if (i==destCapacity-1) {
|
||||
// There is only one output string left.
|
||||
// Fill it with whatever is left from the input, then exit the loop.
|
||||
dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
|
||||
break;
|
||||
}
|
||||
if (fMatcher->find()) {
|
||||
// We found another delimiter. Move everything from where we started looking
|
||||
// up until the start of the delimiter into the next output string.
|
||||
int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
|
||||
dest[i].setTo(input, nextOutputStringStart, fieldLen);
|
||||
nextOutputStringStart = fMatcher->fMatchEnd;
|
||||
|
||||
// If the delimiter pattern has capturing parentheses, the captured
|
||||
// text goes out into the next n destination strings.
|
||||
int32_t groupNum;
|
||||
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
|
||||
if (i==destCapacity-1) {
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
dest[i] = fMatcher->group(groupNum, status);
|
||||
}
|
||||
|
||||
if (nextOutputStringStart == inputLen) {
|
||||
// The delimiter was at the end of the string. We're done.
|
||||
break;
|
||||
}
|
||||
|
||||
if (i==destCapacity-1) {
|
||||
// We've filled up the last output string with capture group data.
|
||||
// Give back the last string, to be used for the remainder of the input.
|
||||
i--;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// We ran off the end of the input while looking for the next delimiter.
|
||||
// All the remaining text goes into the current output string.
|
||||
dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return i+1;
|
||||
RegexMatcher m(this);
|
||||
int32_t r = m.split(input, dest, destCapacity, status);
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// dump Output the compiled form of the pattern.
|
||||
|
@ -489,10 +425,8 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
case URX_STRING_LEN:
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_INIT_P:
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
case URX_RELOC_OPRND:
|
||||
case URX_STO_SP:
|
||||
case URX_LD_SP:
|
||||
|
@ -567,9 +501,6 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
|
||||
|
||||
|
||||
// TODO: get rid of max match length
|
||||
|
||||
|
||||
void RegexPattern::dump() const {
|
||||
int index;
|
||||
int i;
|
||||
|
@ -580,7 +511,6 @@ void RegexPattern::dump() const {
|
|||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF("\n");
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Min Match Length: %d\n", fMinMatchLen);
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Max Match Length: %d\n", fMaxMatchLen);
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
|
||||
if (fStartType == START_STRING) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(" Initial match sting: \"");
|
||||
|
|
|
@ -58,7 +58,6 @@ struct REStackFrame;
|
|||
|
||||
/**
|
||||
* Constants for Regular Expression Match Modes.
|
||||
* <p>Note that non-default match modes will not be supported until ICU 2.6</p>
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
enum {
|
||||
|
@ -208,6 +207,29 @@ public:
|
|||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Compiles the regular expression in string form into a RegexPattern
|
||||
* object using the specified match mode flags. These compile methods,
|
||||
* rather than the constructors, are the usual way that RegexPattern objects
|
||||
* are created.
|
||||
*
|
||||
* <p>Note that RegexPattern objects must not be deleted while RegexMatcher
|
||||
* objects created from the pattern are active. RegexMatchers keep a pointer
|
||||
* back to their pattern, so premature deletion of the pattern is a
|
||||
* catastrophic error.</p>
|
||||
*
|
||||
* @param regex The regular expression to be compiled.
|
||||
* @param flags The match mode flags to be used.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return A regexPattern object for the compiled pattern.
|
||||
*
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
static RegexPattern *compile( const UnicodeString ®ex,
|
||||
uint32_t flags,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Get the match mode flags that were used when compiling this pattern.
|
||||
* @return the match mode flags
|
||||
|
@ -231,6 +253,20 @@ public:
|
|||
UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Creates a RegexMatcher that will match against this pattern. The
|
||||
* RegexMatcher can be used to perform match, find or replace operations.
|
||||
* Note that a RegexPattern object must not be deleted while
|
||||
* RegexMatchers created from it still exist and might possibly be used again.
|
||||
*
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return A RegexMatcher object for this pattern. No input text is set on it.
|
||||
*
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
virtual RegexMatcher *matcher(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
* Test whether a string matches a regular expression. This convenience function
|
||||
* both compiles the regular expression and applies it in a single operation.
|
||||
|
@ -259,7 +295,14 @@ public:
|
|||
|
||||
|
||||
/**
|
||||
* Split a string around matches of the pattern. Somewhat like split() from Perl.
|
||||
* Split a string into fields. Somewhat like split() from Perl.
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the matches becomes the
|
||||
* fields themselves.
|
||||
* <p>
|
||||
* For the best performance on split() operations,
|
||||
* <code>RegexMatcher::split</code> is preferable to this function.
|
||||
*
|
||||
* @param input The string to be split into fields. The field delimiters
|
||||
* match the pattern (in the "this" object)
|
||||
* @param dest An array of UnicodeStrings to receive the results of the split.
|
||||
|
@ -324,17 +367,6 @@ private:
|
|||
// value may be less than the true shortest
|
||||
// possible match.
|
||||
|
||||
int32_t fMaxMatchLen; // Maximum Match Length. All matches will have length
|
||||
// <= this value. For some patterns, this calculated
|
||||
// value may be greater than the true longest
|
||||
// possible match. For patterns with unbounded
|
||||
// match length, value = -1.
|
||||
|
||||
RegexMatcher *fMatcher; // A cached matcher for this pattern, used for
|
||||
// split(), to avoid having to
|
||||
// make new ones on each call.
|
||||
// TODO: fix thread safety problems.
|
||||
|
||||
int32_t fFrameSize; // Size of a state stack frame in the
|
||||
// execution engine.
|
||||
|
||||
|
@ -402,6 +434,12 @@ public:
|
|||
* created for the same expression, it will be more efficient to
|
||||
* separately create and cache a RegexPattern object, and use
|
||||
* its matcher() method to create the RegexMatcher objects.
|
||||
*
|
||||
* @param regexp The Regular Expression to be compiled.
|
||||
* @param flags Regular expression options, such as case insensitive matching.
|
||||
* @see UREGEX_CASE_INSENSITIVE
|
||||
* @param status Any errors are reported by setting this UErrorCode variable.
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
|
||||
|
||||
|
@ -412,6 +450,12 @@ public:
|
|||
* created for the same expression, it will be more efficient to
|
||||
* separately create and cache a RegexPattern object, and use
|
||||
* its matcher() method to create the RegexMatcher objects.
|
||||
*
|
||||
* @param regexp The Regular Expression to be compiled.
|
||||
* @param flags Regular expression options, such as case insensitive matching.
|
||||
* @see UREGEX_CASE_INSENSITIVE
|
||||
* @param status Any errors are reported by setting this UErrorCode variable.
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
|
||||
uint32_t flags, UErrorCode &status);
|
||||
|
@ -690,6 +734,37 @@ public:
|
|||
|
||||
|
||||
|
||||
/**
|
||||
* Split a string into fields. Somewhat like split() from Perl.
|
||||
* The pattern matches identify delimiters that separate the input
|
||||
* into fields. The input data between the matches becomes the
|
||||
* fields themselves.
|
||||
* <p>
|
||||
*
|
||||
* @param input The string to be split into fields. The field delimiters
|
||||
* match the pattern (in the "this" object). This matcher
|
||||
* will be reset to this input string.
|
||||
* @param dest An array of UnicodeStrings to receive the results of the split.
|
||||
* This is an array of actual UnicodeString objects, not an
|
||||
* array of pointers to strings. Local (stack based) arrays can
|
||||
* work well here.
|
||||
* @param destCapacity The number of elements in the destination array.
|
||||
* If the number of fields found is less than destCapacity, the
|
||||
* extra strings in the destination array are not altered.
|
||||
* If the number of destination strings is less than the number
|
||||
* of fields, the trailing part of the input string, including any
|
||||
* field delimiters, is placed in the last destination string.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return The number of fields into which the input string was split.
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual int32_t split(const UnicodeString &input,
|
||||
UnicodeString dest[],
|
||||
int32_t destCapacity,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* setTrace Debug function, enable/disable tracing of the matching engine.
|
||||
* For internal ICU development use only. DO NOT USE!!!!
|
||||
|
|
6
icu4c/source/test/testdata/regextst.txt
vendored
6
icu4c/source/test/testdata/regextst.txt
vendored
|
@ -188,6 +188,12 @@
|
|||
"(ABC){2,3}?AX" "<0>ABCABC<1>ABC</1>AX</0>"
|
||||
"(ABC){2,3}?AX" "ABC<0>ABCABC<1>ABC</1>AX</0>"
|
||||
|
||||
# Possessive {min,max}+ intervals
|
||||
"(ABC){2,3}+ABC" "ABCABCABC"
|
||||
"(ABC){1,2}+ABC" "<0>ABC<1>ABC</1>ABC</0>"
|
||||
"(?:(.)\1){2,5}+." "<0>aabbcc<1>d</1>de</0>x"
|
||||
|
||||
|
||||
# Atomic Grouping
|
||||
"(?>.*)abc" "abcabcabc" # no match. .* consumed entire string.
|
||||
"(?>(abc{2,4}?))(c*)" "<0><1>abcc</1><2>ccc</2></0>ddd"
|
||||
|
|
Loading…
Add table
Reference in a new issue