From ddf59cf3442ff02a042eb58e478769e6d68f7a0f Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Mon, 24 Feb 2003 05:12:29 +0000 Subject: [PATCH] ICU-2422 look-behind op, plus some optimizations, work-in-progress. X-SVN-Rev: 11145 --- icu4c/source/common/uvectr32.cpp | 2 - icu4c/source/i18n/regexcmp.cpp | 177 +++++++++++++++++++++++++++++++ icu4c/source/i18n/regexcmp.h | 2 + 3 files changed, 179 insertions(+), 2 deletions(-) diff --git a/icu4c/source/common/uvectr32.cpp b/icu4c/source/common/uvectr32.cpp index e6d541d9f5a..fcf5c9a74a3 100644 --- a/icu4c/source/common/uvectr32.cpp +++ b/icu4c/source/common/uvectr32.cpp @@ -20,8 +20,6 @@ U_NAMESPACE_BEGIN * or a pointer. If a hint bit is zero, then the associated * token is assumed to be an integer. This is needed for iSeries */ -#define HINT_KEY_POINTER (1) -#define HINT_KEY_INTEGER (0) const char UVector32::fgClassID=0; diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index f7db0588215..1f8efce94a1 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -1894,6 +1894,183 @@ UBool RegexCompile::possibleNullMatch(int32_t start, int32_t end) { } +//---------------------------------------------------------------------------------------- +// +// minMatchLength +// +//---------------------------------------------------------------------------------------- +int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { + U_ASSERT(start <= end); + U_ASSERT(end < fRXPat->fCompiledPat->size()); + + + int32_t patSegLen = end - start + 1; + int32_t loc; + int32_t op; + int32_t currentLen = 0; + UVector32 lengthSoFar(fRXPat->fCompiledPat->size(), *fStatus); + lengthSoFar.setSize(fRXPat->fCompiledPat->size()); + + for (loc=start; loc<=end; loc++) { + lengthSoFar.setElementAt(INT32_MAX, loc); + } + + loc = start-1; + for (loc = start; loc<=end; loc++) { + op = fRXPat->fCompiledPat->elementAti(loc); + if (lengthSoFar.elementAti(loc) < currentLen) { + currentLen = lengthSoFar.elementAti(loc); + } + + switch (op) { + // Ops that don't change the total length matched + case URX_RESERVED_OP: + case URX_END: + case URX_STRING_LEN: + case URX_NOP: + case URX_START_CAPTURE: + case URX_END_CAPTURE: + case URX_BACKSLASH_A: + case URX_BACKSLASH_B: + case URX_BACKSLASH_G: + case URX_BACKSLASH_Z: + case URX_CARET: + case URX_DOLLAR: + case URX_CTR_INIT: + case URX_CTR_INIT_NG: + case URX_CTR_INIT_P: + case URX_RELOC_OPRND: + case URX_STO_INP_LOC: + case URX_DOLLAR_M: + case URX_CARET_M: + case URX_BACKTRACK: + case URX_BACKREF: // BackRef. Must assume that it might be a zero length match + case URX_BACKREF_I: + + case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. + case URX_LD_SP: + break; + + + // Ops that match a minimum of one character + // (and, ususally, exactly one character.) + case URX_ONECHAR: + case URX_STATIC_SETREF: + case URX_SETREF: + case URX_BACKSLASH_D: + case URX_ONECHAR_I: + case URX_BACKSLASH_W: + case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. + case URX_DOTANY_ALL: // . matches one or two. + case URX_DOTANY: + currentLen++; + break; + + + case URX_JMP: + case URX_JMPX: + { + int32_t jmpDest = URX_VAL(op); + if (jmpDest < loc) { + // Loop of some kind. Can safely ignore, the worst that will happen + // is that we understate the true minimum length + currentLen = lengthSoFar.elementAti(loc+1); + + } else { + // Forward jump. Propagate the current min length to the target loc of the jump. + if (lengthSoFar.elementAti(jmpDest) > currentLen) { + lengthSoFar.setElementAt(currentLen, jmpDest); + } + } + } + break; + + case URX_FAIL: + // Fails are kind of like a branch, except that the min length was + // propagated already, by the state save. + currentLen = lengthSoFar.elementAti(loc+1); + break; + + + case URX_STATE_SAVE: + { + // State Save, for forward jumps, propagate the current minimum. + // of the state save. + int32_t jmpDest = URX_VAL(op); + if (jmpDest > loc) { + if (currentLen < lengthSoFar.elementAti(jmpDest)) { + lengthSoFar.setElementAt(currentLen, jmpDest); + } + } + } + break; + + + + + case URX_STRING: + case URX_STRING_I: + { + loc++; + int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc); + currentLen += URX_VAL(stringLenOp); + } + break; + + + + case URX_CTR_LOOP: + case URX_CTR_LOOP_NG: + case URX_CTR_LOOP_P: + { + // Loop ops. These are four word instructions. + // The jump is conditional, backwards only. + loc+=3; + } + break; + + + + case URX_LA_START: + { + // Look-ahead. Scan forward until the matching look-ahead end, + // without processing the look-ahead block. This is overly pessimistic. + // TODO: Positive lookahead could recursively do the block, then continue + // with the longer of the block or the value coming in. + int32_t depth = 0; + for (;;) { + loc++; + op = fRXPat->fCompiledPat->elementAti(loc); + if (URX_VAL(op) == URX_LA_START) { + depth++; + } + if (URX_VAL(op) == URX_LA_END) { + if (depth == 0) { + break; + } + depth--; + } + U_ASSERT(loc < end); + } + } + break; + + case URX_LA_END: + // End of look-ahead ops should always be consumed by the processing at + // the URX_LA_START op. + U_ASSERT(FALSE); + break; + + default: + U_ASSERT(FALSE); + } + + } + return currentLen; + +} + + //---------------------------------------------------------------------------------------- // // Error Report a rule parse error. diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index 4840849bacd..666db20dd7e 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -105,6 +105,8 @@ private: // taking case mode into account. UBool possibleNullMatch(int32_t start, // Test a range of compiled pattern for int32_t end); // for possibly matching an empty string. + int32_t minMatchLength(int32_t start, + int32_t end); UErrorCode *fStatus;