mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-11049 regular expressions, use same logic in UText and (UChar *) code paths when checking limit of potential match start positions.
X-SVN-Rev: 36161
This commit is contained in:
parent
b0a0f67e21
commit
1ba1ec3b83
3 changed files with 55 additions and 39 deletions
|
@ -640,9 +640,9 @@ UBool RegexMatcher::find() {
|
|||
return FALSE;
|
||||
}
|
||||
} else {
|
||||
// For now, let the matcher discover that it can't match on its own
|
||||
// We don't know how long the match len is in native characters
|
||||
testStartLimit = fActiveLimit;
|
||||
// We don't know exactly how long the minimum match length is in native characters.
|
||||
// Treat anything > 0 as 1.
|
||||
testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
|
||||
}
|
||||
|
||||
UChar32 c;
|
||||
|
@ -693,17 +693,17 @@ UBool RegexMatcher::find() {
|
|||
{
|
||||
// Match may start on any char from a pre-computed set.
|
||||
U_ASSERT(fPattern->fMinMatchLen > 0);
|
||||
int64_t pos;
|
||||
UTEXT_SETNATIVEINDEX(fInputText, startPos);
|
||||
for (;;) {
|
||||
int64_t pos = startPos;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
pos = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
startPos = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
// c will be -1 (U_SENTINEL) at end of text, in which case we
|
||||
// skip this next block (so we don't have a negative array index)
|
||||
// and handle end of text in the following block.
|
||||
if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
|
||||
(c>=256 && fPattern->fInitialChars->contains(c)))) {
|
||||
MatchAt(startPos, FALSE, fDeferredStatus);
|
||||
MatchAt(pos, FALSE, fDeferredStatus);
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -712,12 +712,11 @@ UBool RegexMatcher::find() {
|
|||
}
|
||||
UTEXT_SETNATIVEINDEX(fInputText, pos);
|
||||
}
|
||||
if (startPos >= testStartLimit) {
|
||||
if (startPos > testStartLimit) {
|
||||
fMatch = FALSE;
|
||||
fHitEnd = TRUE;
|
||||
return FALSE;
|
||||
}
|
||||
startPos = pos;
|
||||
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -730,13 +729,13 @@ UBool RegexMatcher::find() {
|
|||
// Match starts on exactly one char.
|
||||
U_ASSERT(fPattern->fMinMatchLen > 0);
|
||||
UChar32 theChar = fPattern->fInitialChar;
|
||||
int64_t pos;
|
||||
UTEXT_SETNATIVEINDEX(fInputText, startPos);
|
||||
for (;;) {
|
||||
int64_t pos = startPos;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
pos = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
startPos = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
if (c == theChar) {
|
||||
MatchAt(startPos, FALSE, fDeferredStatus);
|
||||
MatchAt(pos, FALSE, fDeferredStatus);
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -745,12 +744,11 @@ UBool RegexMatcher::find() {
|
|||
}
|
||||
UTEXT_SETNATIVEINDEX(fInputText, pos);
|
||||
}
|
||||
if (startPos >= testStartLimit) {
|
||||
if (startPos > testStartLimit) {
|
||||
fMatch = FALSE;
|
||||
fHitEnd = TRUE;
|
||||
return FALSE;
|
||||
}
|
||||
startPos = pos;
|
||||
if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -917,6 +915,7 @@ UBool RegexMatcher::findUsingChunk() {
|
|||
// the minimum length match would extend past the end of the input.
|
||||
// Note: some patterns that cannot match anything will have fMinMatchLen==Max Int.
|
||||
// Be aware of possible overflows if making changes here.
|
||||
// Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
|
||||
int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
|
||||
if (startPos > testLen) {
|
||||
fMatch = FALSE;
|
||||
|
@ -1012,7 +1011,7 @@ UBool RegexMatcher::findUsingChunk() {
|
|||
return TRUE;
|
||||
}
|
||||
}
|
||||
if (pos >= testLen) {
|
||||
if (startPos > testLen) {
|
||||
fMatch = FALSE;
|
||||
fHitEnd = TRUE;
|
||||
return FALSE;
|
||||
|
@ -1021,7 +1020,7 @@ UBool RegexMatcher::findUsingChunk() {
|
|||
return FALSE;
|
||||
}
|
||||
}
|
||||
U_ASSERT(FALSE);
|
||||
U_ASSERT(FALSE);
|
||||
|
||||
case START_LINE:
|
||||
{
|
||||
|
|
|
@ -5314,43 +5314,58 @@ void RegexTest::TestBug11049() {
|
|||
// To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
|
||||
// detect the bad read.
|
||||
|
||||
UnicodeString patternString("A|B|C");
|
||||
UnicodeString txtString = UnicodeString("a string \\ud800\\udc00").unescape();
|
||||
UChar *exactBuffer = new UChar[txtString.length()];
|
||||
TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
|
||||
TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
|
||||
|
||||
// Test again with a pattern starting with a single character,
|
||||
// which takes a different code path than starting with an OR expression,
|
||||
// but with similar logic.
|
||||
TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
|
||||
TestCase11049("C", "string matches at end C", TRUE, __LINE__);
|
||||
}
|
||||
|
||||
// Run a single test case from TestBug11049(). Internal function.
|
||||
void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
txtString.extract(exactBuffer, txtString.length(), status);
|
||||
UText *ut = utext_openUChars(NULL, exactBuffer, txtString.length(), &status);
|
||||
UnicodeString patternString = UnicodeString(pattern).unescape();
|
||||
LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
|
||||
|
||||
LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, 0, status));
|
||||
UnicodeString dataString = UnicodeString(data).unescape();
|
||||
UChar *exactBuffer = new UChar[dataString.length()];
|
||||
dataString.extract(exactBuffer, dataString.length(), status);
|
||||
UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
|
||||
|
||||
LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
|
||||
REGEX_CHECK_STATUS;
|
||||
LocalPointer<RegexMatcher> matcher(pattern->matcher(status));
|
||||
matcher->reset(ut);
|
||||
REGEX_CHECK_STATUS;
|
||||
UBool result = matcher->find();
|
||||
REGEX_ASSERT(result == FALSE);
|
||||
if (result != expectMatch) {
|
||||
errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
|
||||
__FILE__, lineNumber, expectMatch, result, pattern, data);
|
||||
}
|
||||
|
||||
// Verify that match starting on the last char in input will be found.
|
||||
txtString = UnicodeString("string matches at end C");
|
||||
matcher->reset(txtString);
|
||||
result = matcher->find();
|
||||
REGEX_ASSERT(result == TRUE);
|
||||
|
||||
// Put an unpaired surrogate at the end of the input text,
|
||||
// let valgrind verify that find() doesn't look off the end.
|
||||
txtString = UnicodeString("a string \\ud800").unescape();
|
||||
delete [] exactBuffer;
|
||||
exactBuffer = new UChar[txtString.length()];
|
||||
txtString.extract(exactBuffer, txtString.length(), status);
|
||||
utext_openUChars(ut, exactBuffer, txtString.length(), &status);
|
||||
// Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
|
||||
// off-by-one on find() with match at the last code point.
|
||||
// Size of the original char * data (invariant charset) will be >= the size of the equivalent UTF-8
|
||||
// because string.unescape() will only shrink it.
|
||||
char * utf8Buffer = new char[uprv_strlen(data)+1];
|
||||
u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
|
||||
REGEX_CHECK_STATUS;
|
||||
matcher->reset(ut);
|
||||
result = matcher->find();
|
||||
REGEX_ASSERT(result == FALSE);
|
||||
REGEX_CHECK_STATUS;
|
||||
if (result != expectMatch) {
|
||||
errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
|
||||
__FILE__, lineNumber, expectMatch, result, pattern, data);
|
||||
}
|
||||
delete [] utf8Buffer;
|
||||
|
||||
utext_close(ut);
|
||||
delete [] exactBuffer;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
|
||||
|
||||
|
|
|
@ -63,6 +63,8 @@ public:
|
|||
virtual UChar *ReadAndConvertFile(const char *fileName, int32_t &len, const char *charset, UErrorCode &status);
|
||||
virtual const char *getPath(char buffer[2048], const char *filename);
|
||||
|
||||
virtual void TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber);
|
||||
|
||||
static const char* extractToAssertBuf(const UnicodeString& message);
|
||||
|
||||
};
|
||||
|
|
Loading…
Add table
Reference in a new issue