mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-11554 Fix regex bug with look-behind matching & UTF-8 input.
X-SVN-Rev: 38056
This commit is contained in:
parent
5f297b7ad2
commit
8dba7301b7
4 changed files with 141 additions and 146 deletions
|
@ -23,6 +23,7 @@
|
|||
#include "unicode/utf16.h"
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstr.h"
|
||||
#include "uvector.h"
|
||||
#include "uvectr32.h"
|
||||
#include "uvectr64.h"
|
||||
|
@ -33,6 +34,7 @@
|
|||
|
||||
// #include <malloc.h> // Needed for heapcheck testing
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Default limit for the size of the back track stack, to avoid system
|
||||
|
@ -782,7 +784,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
|
|||
if (fMatch) {
|
||||
return TRUE;
|
||||
}
|
||||
UTEXT_SETNATIVEINDEX(fInputText, pos);
|
||||
UTEXT_SETNATIVEINDEX(fInputText, startPos);
|
||||
}
|
||||
if (startPos > testStartLimit) {
|
||||
fMatch = FALSE;
|
||||
|
@ -2723,6 +2725,18 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
|
|||
return (REStackFrame *)newFP;
|
||||
}
|
||||
|
||||
#if defined(REGEX_DEBUG)
|
||||
namespace {
|
||||
// Debug-only helper: gathers the full contents of a UText into a UnicodeString
// so it can be printed in trace output.
//   ut       the text to copy; it is read from native index 0 up to the point
//            where the iterator reports U_SENTINEL.
//   returns  a UnicodeString holding every code point that was read, in order.
// Note: the text is read through the UText iterator (utext_next32From /
// UTEXT_NEXT32), so the iteration position of 'ut' is not preserved.
UnicodeString StringFromUText(UText *ut) {
    UnicodeString text;
    UChar32 cp = utext_next32From(ut, 0);
    while (cp != U_SENTINEL) {
        text.append(cp);
        cp = UTEXT_NEXT32(ut);
    }
    return text;
}
|
||||
}
|
||||
#endif // REGEX_DEBUG
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -2742,32 +2756,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
|||
int32_t opValue; // and the operand value.
|
||||
|
||||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug)
|
||||
{
|
||||
if (fTraceDebug) {
|
||||
printf("MatchAt(startIdx=%ld)\n", startIdx);
|
||||
printf("Original Pattern: ");
|
||||
UChar32 c = utext_next32From(fPattern->fPattern, 0);
|
||||
while (c != U_SENTINEL) {
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
printf("%c", c);
|
||||
|
||||
c = UTEXT_NEXT32(fPattern->fPattern);
|
||||
}
|
||||
printf("\n");
|
||||
printf("Input String: ");
|
||||
c = utext_next32From(fInputText, 0);
|
||||
while (c != U_SENTINEL) {
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
printf("%c", c);
|
||||
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
}
|
||||
printf("\n");
|
||||
printf("\n");
|
||||
printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
|
||||
printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -3936,28 +3928,38 @@ GC_Done:
|
|||
// of this op in the pattern.
|
||||
int32_t minML = (int32_t)pat[fp->fPatIdx++];
|
||||
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
|
||||
if (!UTEXT_USES_U16(fInputText)) {
|
||||
// utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
|
||||
// The max length need not be exact; it just needs to be >= actual maximum.
|
||||
maxML *= 3;
|
||||
}
|
||||
U_ASSERT(minML <= maxML);
|
||||
U_ASSERT(minML >= 0);
|
||||
|
||||
// Fetch (from data) the last input index where a match was attempted.
|
||||
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
|
||||
int64_t *lbStartIdx = &fData[opValue+2];
|
||||
if (*lbStartIdx < 0) {
|
||||
int64_t &lbStartIdx = fData[opValue+2];
|
||||
if (lbStartIdx < 0) {
|
||||
// First time through loop.
|
||||
*lbStartIdx = fp->fInputIdx - minML;
|
||||
lbStartIdx = fp->fInputIdx - minML;
|
||||
if (lbStartIdx > 0) {
|
||||
// move index to a code point boundary, if it's not on one already.
|
||||
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
|
||||
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
}
|
||||
} else {
|
||||
// 2nd through nth time through the loop.
|
||||
// Back up start position for match by one.
|
||||
if (*lbStartIdx == 0) {
|
||||
(*lbStartIdx)--;
|
||||
if (lbStartIdx == 0) {
|
||||
(lbStartIdx)--;
|
||||
} else {
|
||||
UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
|
||||
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
|
||||
(void)UTEXT_PREVIOUS32(fInputText);
|
||||
*lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
}
|
||||
}
|
||||
|
||||
if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
|
||||
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
|
||||
// We have tried all potential match starting points without
|
||||
// getting a match. Backtrack out, and out of the
|
||||
// Look Behind altogether.
|
||||
|
@ -3972,7 +3974,7 @@ GC_Done:
|
|||
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
|
||||
// (successful match will fall off the end of the loop.)
|
||||
fp = StateSave(fp, fp->fPatIdx-3, status);
|
||||
fp->fInputIdx = *lbStartIdx;
|
||||
fp->fInputIdx = lbStartIdx;
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -4009,6 +4011,11 @@ GC_Done:
|
|||
// Fetch the extra parameters of this op.
|
||||
int32_t minML = (int32_t)pat[fp->fPatIdx++];
|
||||
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
|
||||
if (!UTEXT_USES_U16(fInputText)) {
|
||||
// utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
|
||||
// The max length need not be exact; it just needs to be >= actual maximum.
|
||||
maxML *= 3;
|
||||
}
|
||||
int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
|
||||
continueLoc = URX_VAL(continueLoc);
|
||||
U_ASSERT(minML <= maxML);
|
||||
|
@ -4017,23 +4024,28 @@ GC_Done:
|
|||
|
||||
// Fetch (from data) the last input index where a match was attempted.
|
||||
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
|
||||
int64_t *lbStartIdx = &fData[opValue+2];
|
||||
if (*lbStartIdx < 0) {
|
||||
int64_t &lbStartIdx = fData[opValue+2];
|
||||
if (lbStartIdx < 0) {
|
||||
// First time through loop.
|
||||
*lbStartIdx = fp->fInputIdx - minML;
|
||||
lbStartIdx = fp->fInputIdx - minML;
|
||||
if (lbStartIdx > 0) {
|
||||
// move index to a code point boundary, if it's not on one already.
|
||||
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
|
||||
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
}
|
||||
} else {
|
||||
// 2nd through nth time through the loop.
|
||||
// Back up start position for match by one.
|
||||
if (*lbStartIdx == 0) {
|
||||
(*lbStartIdx)--;
|
||||
if (lbStartIdx == 0) {
|
||||
(lbStartIdx)--;
|
||||
} else {
|
||||
UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
|
||||
UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
|
||||
(void)UTEXT_PREVIOUS32(fInputText);
|
||||
*lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
}
|
||||
}
|
||||
|
||||
if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
|
||||
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
|
||||
// We have tried all potential match starting points without
|
||||
// getting a match, which means that the negative lookbehind as
|
||||
// a whole has succeeded. Jump forward to the continue location
|
||||
|
@ -4048,7 +4060,7 @@ GC_Done:
|
|||
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
|
||||
// (successful match will cause a FAIL out of the loop altogether.)
|
||||
fp = StateSave(fp, fp->fPatIdx-4, status);
|
||||
fp->fInputIdx = *lbStartIdx;
|
||||
fp->fInputIdx = lbStartIdx;
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -4310,29 +4322,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
|||
#ifdef REGEX_RUN_DEBUG
|
||||
if (fTraceDebug) {
|
||||
printf("MatchAt(startIdx=%d)\n", startIdx);
|
||||
printf("Original Pattern: ");
|
||||
UChar32 c = utext_next32From(fPattern->fPattern, 0);
|
||||
while (c != U_SENTINEL) {
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
printf("%c", c);
|
||||
|
||||
c = UTEXT_NEXT32(fPattern->fPattern);
|
||||
}
|
||||
printf("\n");
|
||||
printf("Input String: ");
|
||||
c = utext_next32From(fInputText, 0);
|
||||
while (c != U_SENTINEL) {
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
printf("%c", c);
|
||||
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
}
|
||||
printf("\n");
|
||||
printf("\n");
|
||||
printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
|
||||
printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -5450,21 +5441,24 @@ GC_Done:
|
|||
|
||||
// Fetch (from data) the last input index where a match was attempted.
|
||||
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
|
||||
int64_t *lbStartIdx = &fData[opValue+2];
|
||||
if (*lbStartIdx < 0) {
|
||||
int64_t &lbStartIdx = fData[opValue+2];
|
||||
if (lbStartIdx < 0) {
|
||||
// First time through loop.
|
||||
*lbStartIdx = fp->fInputIdx - minML;
|
||||
lbStartIdx = fp->fInputIdx - minML;
|
||||
if (lbStartIdx > 0) {
|
||||
U16_SET_CP_START(inputBuf, 0, lbStartIdx);
|
||||
}
|
||||
} else {
|
||||
// 2nd through nth time through the loop.
|
||||
// Back up start position for match by one.
|
||||
if (*lbStartIdx == 0) {
|
||||
(*lbStartIdx)--;
|
||||
if (lbStartIdx == 0) {
|
||||
lbStartIdx--;
|
||||
} else {
|
||||
U16_BACK_1(inputBuf, 0, *lbStartIdx);
|
||||
U16_BACK_1(inputBuf, 0, lbStartIdx);
|
||||
}
|
||||
}
|
||||
|
||||
if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
|
||||
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
|
||||
// We have tried all potential match starting points without
|
||||
// getting a match. Backtrack out, and out of the
|
||||
// Look Behind altogether.
|
||||
|
@ -5479,7 +5473,7 @@ GC_Done:
|
|||
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
|
||||
// (successful match will fall off the end of the loop.)
|
||||
fp = StateSave(fp, fp->fPatIdx-3, status);
|
||||
fp->fInputIdx = *lbStartIdx;
|
||||
fp->fInputIdx = lbStartIdx;
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -5524,21 +5518,24 @@ GC_Done:
|
|||
|
||||
// Fetch (from data) the last input index where a match was attempted.
|
||||
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
|
||||
int64_t *lbStartIdx = &fData[opValue+2];
|
||||
if (*lbStartIdx < 0) {
|
||||
int64_t &lbStartIdx = fData[opValue+2];
|
||||
if (lbStartIdx < 0) {
|
||||
// First time through loop.
|
||||
*lbStartIdx = fp->fInputIdx - minML;
|
||||
lbStartIdx = fp->fInputIdx - minML;
|
||||
if (lbStartIdx > 0) {
|
||||
U16_SET_CP_START(inputBuf, 0, lbStartIdx);
|
||||
}
|
||||
} else {
|
||||
// 2nd through nth time through the loop.
|
||||
// Back up start position for match by one.
|
||||
if (*lbStartIdx == 0) {
|
||||
(*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
|
||||
if (lbStartIdx == 0) {
|
||||
lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
|
||||
} else {
|
||||
U16_BACK_1(inputBuf, 0, *lbStartIdx);
|
||||
U16_BACK_1(inputBuf, 0, lbStartIdx);
|
||||
}
|
||||
}
|
||||
|
||||
if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
|
||||
if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
|
||||
// We have tried all potential match starting points without
|
||||
// getting a match, which means that the negative lookbehind as
|
||||
// a whole has succeeded. Jump forward to the continue location
|
||||
|
@ -5553,7 +5550,7 @@ GC_Done:
|
|||
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
|
||||
// (successful match will cause a FAIL out of the loop altogether.)
|
||||
fp = StateSave(fp, fp->fPatIdx-4, status);
|
||||
fp->fInputIdx = *lbStartIdx;
|
||||
fp->fInputIdx = lbStartIdx;
|
||||
}
|
||||
break;
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/uclean.h"
|
||||
#include "cstr.h"
|
||||
#include "uassert.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
|
@ -675,7 +676,6 @@ int32_t RegexPattern::split(UText *input,
|
|||
}
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// dump Output the compiled form of the pattern.
|
||||
|
@ -751,7 +751,11 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
|
||||
case URX_ONECHAR:
|
||||
case URX_ONECHAR_I:
|
||||
printf("%c", val<256?val:'?');
|
||||
if (val < 0x20) {
|
||||
printf("%#x", val);
|
||||
} else {
|
||||
printf("'%s'", CStr(UnicodeString(val))());
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_STRING:
|
||||
|
@ -760,12 +764,8 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
int32_t lengthOp = fCompiledPat->elementAti(index+1);
|
||||
U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
|
||||
int32_t length = URX_VAL(lengthOp);
|
||||
int32_t i;
|
||||
for (i=val; i<val+length; i++) {
|
||||
UChar c = fLiteralText[i];
|
||||
if (c < 32 || c >= 256) {c = '.';}
|
||||
printf("%c", c);
|
||||
}
|
||||
UnicodeString str(fLiteralText, val, length);
|
||||
printf("%s", CStr(str)());
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -775,9 +775,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
UnicodeString s;
|
||||
UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
|
||||
set->toPattern(s, TRUE);
|
||||
for (int32_t i=0; i<s.length(); i++) {
|
||||
printf("%c", s.charAt(i));
|
||||
}
|
||||
printf("%s", CStr(s)());
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -791,9 +789,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
}
|
||||
UnicodeSet *set = fStaticSets[val];
|
||||
set->toPattern(s, TRUE);
|
||||
for (int32_t i=0; i<s.length(); i++) {
|
||||
printf("%c", s.charAt(i));
|
||||
}
|
||||
printf("%s", CStr(s)());
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -809,53 +805,27 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
|
||||
void RegexPattern::dumpPattern() const {
|
||||
#if defined(REGEX_DEBUG)
|
||||
// TODO: This function assumes an ASCII based charset.
|
||||
int index;
|
||||
int i;
|
||||
|
||||
printf("Original Pattern: ");
|
||||
UChar32 c = utext_next32From(fPattern, 0);
|
||||
while (c != U_SENTINEL) {
|
||||
if (c<32 || c>256) {
|
||||
c = '.';
|
||||
}
|
||||
printf("%c", c);
|
||||
|
||||
c = UTEXT_NEXT32(fPattern);
|
||||
UnicodeString patStr;
|
||||
for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) {
|
||||
patStr.append(c);
|
||||
}
|
||||
printf("\n");
|
||||
printf("Original Pattern: \"%s\"\n", CStr(patStr)());
|
||||
printf(" Min Match Length: %d\n", fMinMatchLen);
|
||||
printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
|
||||
if (fStartType == START_STRING) {
|
||||
printf(" Initial match string: \"");
|
||||
for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
|
||||
printf("%c", fLiteralText[i]); // TODO: non-printables, surrogates.
|
||||
}
|
||||
printf("\"\n");
|
||||
|
||||
UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen);
|
||||
printf(" Initial match string: \"%s\"\n", CStr(initialString)());
|
||||
} else if (fStartType == START_SET) {
|
||||
int32_t numSetChars = fInitialChars->size();
|
||||
if (numSetChars > 20) {
|
||||
numSetChars = 20;
|
||||
}
|
||||
printf(" Match First Chars : ");
|
||||
for (i=0; i<numSetChars; i++) {
|
||||
UChar32 c = fInitialChars->charAt(i);
|
||||
if (0x20<c && c <0x7e) {
|
||||
printf("%c ", c);
|
||||
} else {
|
||||
printf("%#x ", c);
|
||||
}
|
||||
}
|
||||
if (numSetChars < fInitialChars->size()) {
|
||||
printf(" ...");
|
||||
}
|
||||
printf("\n");
|
||||
UnicodeString s;
|
||||
fInitialChars->toPattern(s, TRUE);
|
||||
printf(" Match First Chars: %s\n", CStr(s)());
|
||||
|
||||
} else if (fStartType == START_CHAR) {
|
||||
printf(" First char of Match : ");
|
||||
if (0x20 < fInitialChar && fInitialChar<0x7e) {
|
||||
printf("%c\n", fInitialChar);
|
||||
printf(" First char of Match: ");
|
||||
if (fInitialChar > 0x20) {
|
||||
printf("'%s'\n", CStr(UnicodeString(fInitialChar))());
|
||||
} else {
|
||||
printf("%#x\n", fInitialChar);
|
||||
}
|
||||
|
@ -869,10 +839,8 @@ void RegexPattern::dumpPattern() const {
|
|||
const UHashElement *el = NULL;
|
||||
while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
|
||||
const UnicodeString *name = (const UnicodeString *)el->key.pointer;
|
||||
char s[100];
|
||||
name->extract(0, 99, s, sizeof(s), US_INV); // capture group names are invariant.
|
||||
int32_t number = el->value.integer;
|
||||
printf(" %d\t%s\n", number, s);
|
||||
printf(" %d\t%s\n", number, CStr(*name)());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -3583,7 +3583,7 @@ void RegexTest::regex_find(const UnicodeString &pattern,
|
|||
|
||||
if (UTF8Matcher == NULL) {
|
||||
// UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
|
||||
logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
|
||||
logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
|
||||
status = U_ZERO_ERROR;
|
||||
}
|
||||
}
|
||||
|
@ -3592,6 +3592,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
|
|||
// Generate native indices for UTF8 versions of region and capture group info
|
||||
//
|
||||
if (UTF8Matcher != NULL) {
|
||||
if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
|
||||
UTF8Matcher->setTrace(TRUE);
|
||||
}
|
||||
if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
|
||||
if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
|
||||
|
||||
|
@ -3671,6 +3674,9 @@ void RegexTest::regex_find(const UnicodeString &pattern,
|
|||
}
|
||||
}
|
||||
matcher->setTrace(FALSE);
|
||||
if (UTF8Matcher) {
|
||||
UTF8Matcher->setTrace(FALSE);
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
|
||||
}
|
||||
|
@ -3692,16 +3698,17 @@ void RegexTest::regex_find(const UnicodeString &pattern,
|
|||
failed = TRUE;
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
if (isMatch && groupStarts.size() == 0) {
|
||||
errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
|
||||
failed = TRUE;
|
||||
}
|
||||
if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
|
||||
errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
|
||||
failed = TRUE;
|
||||
}
|
||||
|
||||
if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
|
||||
// Only check for match / no match. Don't check capture groups.
|
||||
if (isMatch && groupStarts.size() == 0) {
|
||||
errln("Error at line %d: No match expected, but one found.", line);
|
||||
failed = TRUE;
|
||||
} else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
|
||||
errln("Error at line %d: No match expected, but one found. (UTF8)", line);
|
||||
failed = TRUE;
|
||||
}
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
|
|
23
icu4c/source/test/testdata/regextst.txt
vendored
23
icu4c/source/test/testdata/regextst.txt
vendored
|
@ -1321,6 +1321,29 @@
|
|||
"pre(.)post\1" i "pre\ud800post\ud800\udc00" # case insensiteve backrefs take a different code path
|
||||
"pre(.)post\1" i "<0>pre<1>\ud800</1>post\ud800</0> fin"
|
||||
|
||||
# Bug 11554
|
||||
#
|
||||
# Maximum match length computation was assuming UTF-16.
|
||||
# Used in look-behind matches to constrain how far back to look.
|
||||
|
||||
"(?<=a\x{100000})spam" "***a\x{100000}<0>spam</0>**"
|
||||
"(?<=aą)spam" "**aą<0>spam</0>**"
|
||||
"(?<=ąabc)spam" "**ąabc<0>spam</0>**"
|
||||
|
||||
"(?<=a\x{100000})spam" "***a\x{100001}spam**"
|
||||
"(?<=aą)spam" "**bąspam**"
|
||||
"(?<=ąabc)spam" "**ąabxspam**"
|
||||
|
||||
# with negative look-behind
|
||||
|
||||
"(?<!a\x{100000})spam" "***a\x{100000}spam**"
|
||||
"(?<!aą)spam" "**aąspam**"
|
||||
"(?<!ąabc)spam" "**ąabcspam**"
|
||||
|
||||
"(?<!a\x{100000})spam" "***a\x{100001}<0>spam</0>**"
|
||||
"(?<!aą)spam" "**bą<0>spam</0>**"
|
||||
"(?<!ąabc)spam" "**ąabx<0>spam</0>**"
|
||||
|
||||
# Random debugging, Temporary
|
||||
#
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue