mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
ICU-2422 look-behind op, plus some optimizations, work-in-progress.
X-SVN-Rev: 11145
This commit is contained in:
parent
a0d1a524df
commit
ddf59cf344
3 changed files with 179 additions and 2 deletions
|
@ -20,8 +20,6 @@ U_NAMESPACE_BEGIN
|
|||
* or a pointer. If a hint bit is zero, then the associated
|
||||
* token is assumed to be an integer. This is needed for iSeries
|
||||
*/
|
||||
#define HINT_KEY_POINTER (1)
|
||||
#define HINT_KEY_INTEGER (0)
|
||||
|
||||
const char UVector32::fgClassID=0;
|
||||
|
||||
|
|
|
@ -1894,6 +1894,183 @@ UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
|
|||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// minMatchLength
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
U_ASSERT(start <= end);
|
||||
U_ASSERT(end < fRXPat->fCompiledPat->size());
|
||||
|
||||
|
||||
int32_t patSegLen = end - start + 1;
|
||||
int32_t loc;
|
||||
int32_t op;
|
||||
int32_t currentLen = 0;
|
||||
UVector32 lengthSoFar(fRXPat->fCompiledPat->size(), *fStatus);
|
||||
lengthSoFar.setSize(fRXPat->fCompiledPat->size());
|
||||
|
||||
for (loc=start; loc<=end; loc++) {
|
||||
lengthSoFar.setElementAt(INT32_MAX, loc);
|
||||
}
|
||||
|
||||
loc = start-1;
|
||||
for (loc = start; loc<=end; loc++) {
|
||||
op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
if (lengthSoFar.elementAti(loc) < currentLen) {
|
||||
currentLen = lengthSoFar.elementAti(loc);
|
||||
}
|
||||
|
||||
switch (op) {
|
||||
// Ops that don't change the total length matched
|
||||
case URX_RESERVED_OP:
|
||||
case URX_END:
|
||||
case URX_STRING_LEN:
|
||||
case URX_NOP:
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_BACKSLASH_A:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
case URX_CTR_INIT_P:
|
||||
case URX_RELOC_OPRND:
|
||||
case URX_STO_INP_LOC:
|
||||
case URX_DOLLAR_M:
|
||||
case URX_CARET_M:
|
||||
case URX_BACKTRACK:
|
||||
case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
|
||||
case URX_BACKREF_I:
|
||||
|
||||
case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
|
||||
case URX_LD_SP:
|
||||
break;
|
||||
|
||||
|
||||
// Ops that match a minimum of one character
|
||||
// (and, ususally, exactly one character.)
|
||||
case URX_ONECHAR:
|
||||
case URX_STATIC_SETREF:
|
||||
case URX_SETREF:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_ONECHAR_I:
|
||||
case URX_BACKSLASH_W:
|
||||
case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
|
||||
case URX_DOTANY_ALL: // . matches one or two.
|
||||
case URX_DOTANY:
|
||||
currentLen++;
|
||||
break;
|
||||
|
||||
|
||||
case URX_JMP:
|
||||
case URX_JMPX:
|
||||
{
|
||||
int32_t jmpDest = URX_VAL(op);
|
||||
if (jmpDest < loc) {
|
||||
// Loop of some kind. Can safely ignore, the worst that will happen
|
||||
// is that we understate the true minimum length
|
||||
currentLen = lengthSoFar.elementAti(loc+1);
|
||||
|
||||
} else {
|
||||
// Forward jump. Propagate the current min length to the target loc of the jump.
|
||||
if (lengthSoFar.elementAti(jmpDest) > currentLen) {
|
||||
lengthSoFar.setElementAt(currentLen, jmpDest);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_FAIL:
|
||||
// Fails are kind of like a branch, except that the min length was
|
||||
// propagated already, by the state save.
|
||||
currentLen = lengthSoFar.elementAti(loc+1);
|
||||
break;
|
||||
|
||||
|
||||
case URX_STATE_SAVE:
|
||||
{
|
||||
// State Save, for forward jumps, propagate the current minimum.
|
||||
// of the state save.
|
||||
int32_t jmpDest = URX_VAL(op);
|
||||
if (jmpDest > loc) {
|
||||
if (currentLen < lengthSoFar.elementAti(jmpDest)) {
|
||||
lengthSoFar.setElementAt(currentLen, jmpDest);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
|
||||
case URX_STRING:
|
||||
case URX_STRING_I:
|
||||
{
|
||||
loc++;
|
||||
int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
|
||||
currentLen += URX_VAL(stringLenOp);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_CTR_LOOP:
|
||||
case URX_CTR_LOOP_NG:
|
||||
case URX_CTR_LOOP_P:
|
||||
{
|
||||
// Loop ops. These are four word instructions.
|
||||
// The jump is conditional, backwards only.
|
||||
loc+=3;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case URX_LA_START:
|
||||
{
|
||||
// Look-ahead. Scan forward until the matching look-ahead end,
|
||||
// without processing the look-ahead block. This is overly pessimistic.
|
||||
// TODO: Positive lookahead could recursively do the block, then continue
|
||||
// with the longer of the block or the value coming in.
|
||||
int32_t depth = 0;
|
||||
for (;;) {
|
||||
loc++;
|
||||
op = fRXPat->fCompiledPat->elementAti(loc);
|
||||
if (URX_VAL(op) == URX_LA_START) {
|
||||
depth++;
|
||||
}
|
||||
if (URX_VAL(op) == URX_LA_END) {
|
||||
if (depth == 0) {
|
||||
break;
|
||||
}
|
||||
depth--;
|
||||
}
|
||||
U_ASSERT(loc < end);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_LA_END:
|
||||
// End of look-ahead ops should always be consumed by the processing at
|
||||
// the URX_LA_START op.
|
||||
U_ASSERT(FALSE);
|
||||
break;
|
||||
|
||||
default:
|
||||
U_ASSERT(FALSE);
|
||||
}
|
||||
|
||||
}
|
||||
return currentLen;
|
||||
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// Error Report a rule parse error.
|
||||
|
|
|
@ -105,6 +105,8 @@ private:
|
|||
// taking case mode into account.
|
||||
UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern for
|
||||
int32_t end); // for possibly matching an empty string.
|
||||
int32_t minMatchLength(int32_t start,
|
||||
int32_t end);
|
||||
|
||||
|
||||
UErrorCode *fStatus;
|
||||
|
|
Loading…
Add table
Reference in a new issue