ICU-2422 look-behind op, plus some optimizations, work-in-progress.

X-SVN-Rev: 11145
This commit is contained in:
Andy Heninger 2003-02-24 05:12:29 +00:00
parent a0d1a524df
commit ddf59cf344
3 changed files with 179 additions and 2 deletions

View file

@ -20,8 +20,6 @@ U_NAMESPACE_BEGIN
* or a pointer. If a hint bit is zero, then the associated
* token is assumed to be an integer. This is needed for iSeries
*/
// Values of a single hint bit: 1 marks the associated token as a pointer,
// 0 marks it as an integer.
#define HINT_KEY_POINTER (1)
#define HINT_KEY_INTEGER (0)
// Storage for the UVector32::fgClassID static member.
const char UVector32::fgClassID=0;

View file

@ -1894,6 +1894,183 @@ UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
}
//----------------------------------------------------------------------------------------
//
// minMatchLength
//
//----------------------------------------------------------------------------------------
int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
U_ASSERT(start <= end);
U_ASSERT(end < fRXPat->fCompiledPat->size());
int32_t patSegLen = end - start + 1;
int32_t loc;
int32_t op;
int32_t currentLen = 0;
UVector32 lengthSoFar(fRXPat->fCompiledPat->size(), *fStatus);
lengthSoFar.setSize(fRXPat->fCompiledPat->size());
for (loc=start; loc<=end; loc++) {
lengthSoFar.setElementAt(INT32_MAX, loc);
}
loc = start-1;
for (loc = start; loc<=end; loc++) {
op = fRXPat->fCompiledPat->elementAti(loc);
if (lengthSoFar.elementAti(loc) < currentLen) {
currentLen = lengthSoFar.elementAti(loc);
}
switch (op) {
// Ops that don't change the total length matched
case URX_RESERVED_OP:
case URX_END:
case URX_STRING_LEN:
case URX_NOP:
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_BACKSLASH_A:
case URX_BACKSLASH_B:
case URX_BACKSLASH_G:
case URX_BACKSLASH_Z:
case URX_CARET:
case URX_DOLLAR:
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
case URX_CTR_INIT_P:
case URX_RELOC_OPRND:
case URX_STO_INP_LOC:
case URX_DOLLAR_M:
case URX_CARET_M:
case URX_BACKTRACK:
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
case URX_BACKREF_I:
case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
case URX_LD_SP:
break;
// Ops that match a minimum of one character
// (and, ususally, exactly one character.)
case URX_ONECHAR:
case URX_STATIC_SETREF:
case URX_SETREF:
case URX_BACKSLASH_D:
case URX_ONECHAR_I:
case URX_BACKSLASH_W:
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
case URX_DOTANY_ALL: // . matches one or two.
case URX_DOTANY:
currentLen++;
break;
case URX_JMP:
case URX_JMPX:
{
int32_t jmpDest = URX_VAL(op);
if (jmpDest < loc) {
// Loop of some kind. Can safely ignore, the worst that will happen
// is that we understate the true minimum length
currentLen = lengthSoFar.elementAti(loc+1);
} else {
// Forward jump. Propagate the current min length to the target loc of the jump.
if (lengthSoFar.elementAti(jmpDest) > currentLen) {
lengthSoFar.setElementAt(currentLen, jmpDest);
}
}
}
break;
case URX_FAIL:
// Fails are kind of like a branch, except that the min length was
// propagated already, by the state save.
currentLen = lengthSoFar.elementAti(loc+1);
break;
case URX_STATE_SAVE:
{
// State Save, for forward jumps, propagate the current minimum.
// of the state save.
int32_t jmpDest = URX_VAL(op);
if (jmpDest > loc) {
if (currentLen < lengthSoFar.elementAti(jmpDest)) {
lengthSoFar.setElementAt(currentLen, jmpDest);
}
}
}
break;
case URX_STRING:
case URX_STRING_I:
{
loc++;
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
currentLen += URX_VAL(stringLenOp);
}
break;
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
case URX_CTR_LOOP_P:
{
// Loop ops. These are four word instructions.
// The jump is conditional, backwards only.
loc+=3;
}
break;
case URX_LA_START:
{
// Look-ahead. Scan forward until the matching look-ahead end,
// without processing the look-ahead block. This is overly pessimistic.
// TODO: Positive lookahead could recursively do the block, then continue
// with the longer of the block or the value coming in.
int32_t depth = 0;
for (;;) {
loc++;
op = fRXPat->fCompiledPat->elementAti(loc);
if (URX_VAL(op) == URX_LA_START) {
depth++;
}
if (URX_VAL(op) == URX_LA_END) {
if (depth == 0) {
break;
}
depth--;
}
U_ASSERT(loc < end);
}
}
break;
case URX_LA_END:
// End of look-ahead ops should always be consumed by the processing at
// the URX_LA_START op.
U_ASSERT(FALSE);
break;
default:
U_ASSERT(FALSE);
}
}
return currentLen;
}
//----------------------------------------------------------------------------------------
//
// Error Report a rule parse error.

View file

@ -105,6 +105,8 @@ private:
// taking case mode into account.
UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern
int32_t end); // for possibly matching an empty string.
int32_t minMatchLength(int32_t start, // Calculate the minimum length of input that a
int32_t end); // range of the compiled pattern can match.
UErrorCode *fStatus; // Error code handed to objects the compiler creates (e.g. UVector32).