ICU-11554 Fix regex bug with look-behind matching & UTF-8 input.

X-SVN-Rev: 38056
This commit is contained in:
Andy Heninger 2015-10-09 20:01:46 +00:00
parent 5f297b7ad2
commit 8dba7301b7
4 changed files with 141 additions and 146 deletions

View file

@ -23,6 +23,7 @@
#include "unicode/utf16.h"
#include "uassert.h"
#include "cmemory.h"
#include "cstr.h"
#include "uvector.h"
#include "uvectr32.h"
#include "uvectr64.h"
@ -33,6 +34,7 @@
// #include <malloc.h> // Needed for heapcheck testing
U_NAMESPACE_BEGIN
// Default limit for the size of the back track stack, to avoid system
@ -782,7 +784,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
if (fMatch) {
return TRUE;
}
UTEXT_SETNATIVEINDEX(fInputText, pos);
UTEXT_SETNATIVEINDEX(fInputText, startPos);
}
if (startPos > testStartLimit) {
fMatch = FALSE;
@ -2723,6 +2725,18 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
return (REStackFrame *)newFP;
}
#if defined(REGEX_DEBUG)
namespace {
// Debug-only helper: gathers the contents of a UText into a UnicodeString so
// that the trace output can print it in one piece.
//
// The scan starts at native index 0 and appends one code point at a time until
// the UText reports U_SENTINEL. Iteration goes through the UText's own
// iterator, so the text's current position is changed by this call.
UnicodeString StringFromUText(UText *ut) {
    UnicodeString text;
    UChar32 cp = utext_next32From(ut, 0);
    while (cp != U_SENTINEL) {
        text.append(cp);
        cp = UTEXT_NEXT32(ut);
    }
    return text;
}
}
#endif // REGEX_DEBUG
//--------------------------------------------------------------------------------
//
@ -2742,32 +2756,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
int32_t opValue; // and the operand value.
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug)
{
if (fTraceDebug) {
printf("MatchAt(startIdx=%ld)\n", startIdx);
printf("Original Pattern: ");
UChar32 c = utext_next32From(fPattern->fPattern, 0);
while (c != U_SENTINEL) {
if (c<32 || c>256) {
c = '.';
}
printf("%c", c);
c = UTEXT_NEXT32(fPattern->fPattern);
}
printf("\n");
printf("Input String: ");
c = utext_next32From(fInputText, 0);
while (c != U_SENTINEL) {
if (c<32 || c>256) {
c = '.';
}
printf("%c", c);
c = UTEXT_NEXT32(fInputText);
}
printf("\n");
printf("\n");
printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
}
#endif
@ -3936,28 +3928,38 @@ GC_Done:
// of this op in the pattern.
int32_t minML = (int32_t)pat[fp->fPatIdx++];
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
if (!UTEXT_USES_U16(fInputText)) {
// utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
// The max length need not be exact; it just needs to be >= actual maximum.
maxML *= 3;
}
U_ASSERT(minML <= maxML);
U_ASSERT(minML >= 0);
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
int64_t *lbStartIdx = &fData[opValue+2];
if (*lbStartIdx < 0) {
int64_t &lbStartIdx = fData[opValue+2];
if (lbStartIdx < 0) {
// First time through loop.
*lbStartIdx = fp->fInputIdx - minML;
lbStartIdx = fp->fInputIdx - minML;
if (lbStartIdx > 0) {
// move index to a code point boundary, if it's not on one already.
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
if (*lbStartIdx == 0) {
(*lbStartIdx)--;
if (lbStartIdx == 0) {
(lbStartIdx)--;
} else {
UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
(void)UTEXT_PREVIOUS32(fInputText);
*lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match. Backtrack out, and out of the
// Look Behind altogether.
@ -3972,7 +3974,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will fall off the end of the loop.)
fp = StateSave(fp, fp->fPatIdx-3, status);
fp->fInputIdx = *lbStartIdx;
fp->fInputIdx = lbStartIdx;
}
break;
@ -4009,6 +4011,11 @@ GC_Done:
// Fetch the extra parameters of this op.
int32_t minML = (int32_t)pat[fp->fPatIdx++];
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
if (!UTEXT_USES_U16(fInputText)) {
// utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
// The max length need not be exact; it just needs to be >= actual maximum.
maxML *= 3;
}
int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
continueLoc = URX_VAL(continueLoc);
U_ASSERT(minML <= maxML);
@ -4017,23 +4024,28 @@ GC_Done:
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
int64_t *lbStartIdx = &fData[opValue+2];
if (*lbStartIdx < 0) {
int64_t &lbStartIdx = fData[opValue+2];
if (lbStartIdx < 0) {
// First time through loop.
*lbStartIdx = fp->fInputIdx - minML;
lbStartIdx = fp->fInputIdx - minML;
if (lbStartIdx > 0) {
// move index to a code point boundary, if it's not on one already.
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
if (*lbStartIdx == 0) {
(*lbStartIdx)--;
if (lbStartIdx == 0) {
(lbStartIdx)--;
} else {
UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
(void)UTEXT_PREVIOUS32(fInputText);
*lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match, which means that the negative lookbehind as
// a whole has succeeded. Jump forward to the continue location
@ -4048,7 +4060,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will cause a FAIL out of the loop altogether.)
fp = StateSave(fp, fp->fPatIdx-4, status);
fp->fInputIdx = *lbStartIdx;
fp->fInputIdx = lbStartIdx;
}
break;
@ -4310,29 +4322,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
printf("MatchAt(startIdx=%d)\n", startIdx);
printf("Original Pattern: ");
UChar32 c = utext_next32From(fPattern->fPattern, 0);
while (c != U_SENTINEL) {
if (c<32 || c>256) {
c = '.';
}
printf("%c", c);
c = UTEXT_NEXT32(fPattern->fPattern);
}
printf("\n");
printf("Input String: ");
c = utext_next32From(fInputText, 0);
while (c != U_SENTINEL) {
if (c<32 || c>256) {
c = '.';
}
printf("%c", c);
c = UTEXT_NEXT32(fInputText);
}
printf("\n");
printf("\n");
printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
}
#endif
@ -5450,21 +5441,24 @@ GC_Done:
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
int64_t *lbStartIdx = &fData[opValue+2];
if (*lbStartIdx < 0) {
int64_t &lbStartIdx = fData[opValue+2];
if (lbStartIdx < 0) {
// First time through loop.
*lbStartIdx = fp->fInputIdx - minML;
lbStartIdx = fp->fInputIdx - minML;
if (lbStartIdx > 0) {
U16_SET_CP_START(inputBuf, 0, lbStartIdx);
}
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
if (*lbStartIdx == 0) {
(*lbStartIdx)--;
if (lbStartIdx == 0) {
lbStartIdx--;
} else {
U16_BACK_1(inputBuf, 0, *lbStartIdx);
U16_BACK_1(inputBuf, 0, lbStartIdx);
}
}
if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match. Backtrack out, and out of the
// Look Behind altogether.
@ -5479,7 +5473,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will fall off the end of the loop.)
fp = StateSave(fp, fp->fPatIdx-3, status);
fp->fInputIdx = *lbStartIdx;
fp->fInputIdx = lbStartIdx;
}
break;
@ -5524,21 +5518,24 @@ GC_Done:
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
int64_t *lbStartIdx = &fData[opValue+2];
if (*lbStartIdx < 0) {
int64_t &lbStartIdx = fData[opValue+2];
if (lbStartIdx < 0) {
// First time through loop.
*lbStartIdx = fp->fInputIdx - minML;
lbStartIdx = fp->fInputIdx - minML;
if (lbStartIdx > 0) {
U16_SET_CP_START(inputBuf, 0, lbStartIdx);
}
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
if (*lbStartIdx == 0) {
(*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
if (lbStartIdx == 0) {
lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
} else {
U16_BACK_1(inputBuf, 0, *lbStartIdx);
U16_BACK_1(inputBuf, 0, lbStartIdx);
}
}
if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match, which means that the negative lookbehind as
// a whole has succeeded. Jump forward to the continue location
@ -5553,7 +5550,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will cause a FAIL out of the loop altogether.)
fp = StateSave(fp, fp->fPatIdx-4, status);
fp->fInputIdx = *lbStartIdx;
fp->fInputIdx = lbStartIdx;
}
break;

View file

@ -14,6 +14,7 @@
#include "unicode/regex.h"
#include "unicode/uclean.h"
#include "cstr.h"
#include "uassert.h"
#include "uhash.h"
#include "uvector.h"
@ -675,7 +676,6 @@ int32_t RegexPattern::split(UText *input,
}
//---------------------------------------------------------------------
//
// dump Output the compiled form of the pattern.
@ -751,7 +751,11 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_ONECHAR:
case URX_ONECHAR_I:
printf("%c", val<256?val:'?');
if (val < 0x20) {
printf("%#x", val);
} else {
printf("'%s'", CStr(UnicodeString(val))());
}
break;
case URX_STRING:
@ -760,12 +764,8 @@ void RegexPattern::dumpOp(int32_t index) const {
int32_t lengthOp = fCompiledPat->elementAti(index+1);
U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
int32_t length = URX_VAL(lengthOp);
int32_t i;
for (i=val; i<val+length; i++) {
UChar c = fLiteralText[i];
if (c < 32 || c >= 256) {c = '.';}
printf("%c", c);
}
UnicodeString str(fLiteralText, val, length);
printf("%s", CStr(str)());
}
break;
@ -775,9 +775,7 @@ void RegexPattern::dumpOp(int32_t index) const {
UnicodeString s;
UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
set->toPattern(s, TRUE);
for (int32_t i=0; i<s.length(); i++) {
printf("%c", s.charAt(i));
}
printf("%s", CStr(s)());
}
break;
@ -791,9 +789,7 @@ void RegexPattern::dumpOp(int32_t index) const {
}
UnicodeSet *set = fStaticSets[val];
set->toPattern(s, TRUE);
for (int32_t i=0; i<s.length(); i++) {
printf("%c", s.charAt(i));
}
printf("%s", CStr(s)());
}
break;
@ -809,53 +805,27 @@ void RegexPattern::dumpOp(int32_t index) const {
void RegexPattern::dumpPattern() const {
#if defined(REGEX_DEBUG)
// TODO: This function assumes an ASCII based charset.
int index;
int i;
printf("Original Pattern: ");
UChar32 c = utext_next32From(fPattern, 0);
while (c != U_SENTINEL) {
if (c<32 || c>256) {
c = '.';
}
printf("%c", c);
c = UTEXT_NEXT32(fPattern);
UnicodeString patStr;
for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) {
patStr.append(c);
}
printf("\n");
printf("Original Pattern: \"%s\"\n", CStr(patStr)());
printf(" Min Match Length: %d\n", fMinMatchLen);
printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
if (fStartType == START_STRING) {
printf(" Initial match string: \"");
for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
printf("%c", fLiteralText[i]); // TODO: non-printables, surrogates.
}
printf("\"\n");
UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen);
printf(" Initial match string: \"%s\"\n", CStr(initialString)());
} else if (fStartType == START_SET) {
int32_t numSetChars = fInitialChars->size();
if (numSetChars > 20) {
numSetChars = 20;
}
printf(" Match First Chars : ");
for (i=0; i<numSetChars; i++) {
UChar32 c = fInitialChars->charAt(i);
if (0x20<c && c <0x7e) {
printf("%c ", c);
} else {
printf("%#x ", c);
}
}
if (numSetChars < fInitialChars->size()) {
printf(" ...");
}
printf("\n");
UnicodeString s;
fInitialChars->toPattern(s, TRUE);
printf(" Match First Chars: %s\n", CStr(s)());
} else if (fStartType == START_CHAR) {
printf(" First char of Match : ");
if (0x20 < fInitialChar && fInitialChar<0x7e) {
printf("%c\n", fInitialChar);
printf(" First char of Match: ");
if (fInitialChar > 0x20) {
printf("'%s'\n", CStr(UnicodeString(fInitialChar))());
} else {
printf("%#x\n", fInitialChar);
}
@ -869,10 +839,8 @@ void RegexPattern::dumpPattern() const {
const UHashElement *el = NULL;
while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
const UnicodeString *name = (const UnicodeString *)el->key.pointer;
char s[100];
name->extract(0, 99, s, sizeof(s), US_INV); // capture group names are invariant.
int32_t number = el->value.integer;
printf(" %d\t%s\n", number, s);
printf(" %d\t%s\n", number, CStr(*name)());
}
}

View file

@ -3583,7 +3583,7 @@ void RegexTest::regex_find(const UnicodeString &pattern,
if (UTF8Matcher == NULL) {
// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
status = U_ZERO_ERROR;
}
}
@ -3592,6 +3592,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
// Generate native indices for UTF8 versions of region and capture group info
//
if (UTF8Matcher != NULL) {
if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
UTF8Matcher->setTrace(TRUE);
}
if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
@ -3671,6 +3674,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
}
}
matcher->setTrace(FALSE);
if (UTF8Matcher) {
UTF8Matcher->setTrace(FALSE);
}
if (U_FAILURE(status)) {
errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
}
@ -3692,16 +3698,17 @@ void RegexTest::regex_find(const UnicodeString &pattern,
failed = TRUE;
goto cleanupAndReturn;
}
if (isMatch && groupStarts.size() == 0) {
errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
failed = TRUE;
}
if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
failed = TRUE;
}
if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
// Only check for match / no match. Don't check capture groups.
if (isMatch && groupStarts.size() == 0) {
errln("Error at line %d: No match expected, but one found.", line);
failed = TRUE;
} else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
errln("Error at line %d: No match expected, but one found. (UTF8)", line);
failed = TRUE;
}
goto cleanupAndReturn;
}

View file

@ -1321,6 +1321,29 @@
"pre(.)post\1" i "pre\ud800post\ud800\udc00" # case insensitive backrefs take a different code path
"pre(.)post\1" i "<0>pre<1>\ud800</1>post\ud800</0> fin"
# Bug 11554
#
# Maximum match length computation was assuming UTF-16.
# Used in look-behind matches to constrain how far back to look.
"(?<=a\x{100000})spam" "***a\x{100000}<0>spam</0>**"
"(?<=aą)spam" "**aą<0>spam</0>**"
"(?<=ąabc)spam" "**ąabc<0>spam</0>**"
"(?<=a\x{100000})spam" "***a\x{100001}spam**"
"(?<=aą)spam" "**bąspam**"
"(?<=ąabc)spam" "**ąabxspam**"
# with negative look-behind
"(?<!a\x{100000})spam" "***a\x{100000}spam**"
"(?<!aą)spam" "**aąspam**"
"(?<!ąabc)spam" "**ąabcspam**"
"(?<!a\x{100000})spam" "***a\x{100001}<0>spam</0>**"
"(?<!aą)spam" "**bą<0>spam</0>**"
"(?<!ąabc)spam" "**ąabx<0>spam</0>**"
# Random debugging, Temporary
#