From e03585d7cf96e272286103a91822b78da5fb058c Mon Sep 17 00:00:00 2001
From: Andy Heninger
Date: Wed, 6 Aug 2014 21:49:08 +0000
Subject: [PATCH] ICU-11049 fix regex find() memory overrun.

X-SVN-Rev: 36124
---
 icu4c/source/i18n/rematch.cpp           |  4 +-
 icu4c/source/test/intltest/regextst.cpp | 51 ++++++++++++++++++++++++-
 icu4c/source/test/intltest/regextst.h   |  1 +
 icu4c/source/test/testdata/regextst.txt |  9 +++++
 4 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp
index 6ffe61058e7..d3b8f8344eb 100644
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@@ -1,6 +1,6 @@
 /*
 **************************************************************************
-* Copyright (C) 2002-2013 International Business Machines Corporation *
+* Copyright (C) 2002-2014 International Business Machines Corporation *
 * and others. All rights reserved. *
 **************************************************************************
 */
@@ -983,7 +983,7 @@ UBool RegexMatcher::findUsingChunk() {
                         return TRUE;
                     }
                 }
-                if (pos >= testLen) {
+                if (startPos > testLen) {
                     fMatch = FALSE;
                     fHitEnd = TRUE;
                     return FALSE;
diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp
index 18155e90323..5b697c5d390 100644
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@@ -23,6 +23,7 @@
 #include "intltest.h"
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 
+#include "unicode/localpointer.h"
 #include "unicode/regex.h"
 #include "unicode/uchar.h"
 #include "unicode/ucnv.h"
@@ -140,7 +141,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
         case 23: name = "TestCaseInsensitiveStarters";
             if (exec) TestCaseInsensitiveStarters();
             break;
-
+        case 24: name = "TestBug11049";
+            if (exec) TestBug11049();
+            break;
         default: name = "";
             break; //needed to end loop
     }
@@ -5303,5 +5306,51 @@ void RegexTest::TestCaseInsensitiveStarters() {
 }
 
 
+void RegexTest::TestBug11049() {
+    // Original bug report: pattern with match start consisting of one of several individual characters,
+    // and the text being matched ending with a supplementary character. find() would read past the
+    // end of the input text when searching for potential match starting points.
+
+    // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
+    // detect the bad read.
+
+    UnicodeString patternString("A|B|C");
+    UnicodeString txtString = UnicodeString("a string \\ud800\\udc00").unescape();
+    UChar *exactBuffer = new UChar[txtString.length()];
+    UErrorCode status = U_ZERO_ERROR;
+    txtString.extract(exactBuffer, txtString.length(), status);
+    UText *ut = utext_openUChars(NULL, exactBuffer, txtString.length(), &status);
+
+    LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, 0, status));
+    REGEX_CHECK_STATUS;
+    LocalPointer<RegexMatcher> matcher(pattern->matcher(status));
+    matcher->reset(ut);
+    REGEX_CHECK_STATUS;
+    UBool result = matcher->find();
+    REGEX_ASSERT(result == FALSE);
+
+    // Verify that match starting on the last char in input will be found.
+    txtString = UnicodeString("string matches at end C");
+    matcher->reset(txtString);
+    result = matcher->find();
+    REGEX_ASSERT(result == TRUE);
+
+    // Put an unpaired surrogate at the end of the input text,
+    // let valgrind verify that find() doesn't look off the end.
+    txtString = UnicodeString("a string \\ud800").unescape();
+    delete [] exactBuffer;
+    exactBuffer = new UChar[txtString.length()];
+    txtString.extract(exactBuffer, txtString.length(), status);
+    utext_openUChars(ut, exactBuffer, txtString.length(), &status);
+    matcher->reset(ut);
+    result = matcher->find();
+    REGEX_ASSERT(result == FALSE);
+    REGEX_CHECK_STATUS;
+
+    utext_close(ut);
+    delete [] exactBuffer;
+}
+
+
 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
 
diff --git a/icu4c/source/test/intltest/regextst.h b/icu4c/source/test/intltest/regextst.h
index 974e1355ae5..bb8777a338d 100644
--- a/icu4c/source/test/intltest/regextst.h
+++ b/icu4c/source/test/intltest/regextst.h
@@ -49,6 +49,7 @@ public:
     virtual void CheckInvBufSize();
     virtual void Bug10459();
     virtual void TestCaseInsensitiveStarters();
+    virtual void TestBug11049();
 
     // The following functions are internal to the regexp tests.
     virtual void assertUText(const char *expected, UText *actual, const char *file, int line);
diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt
index f70cd0256bd..4d2e7f6dcd3 100644
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@@ -1192,6 +1192,15 @@
 "^(\w+\d\w+:\w+)$" "<0><1>DiesIst1Beispiel:text</1></0>"
 "^(\w+\d\w+:\w+)$" i "<0><1>DiesIst1Beispiel:text</1></0>"
 
+# Bug 11049
+# Edge cases in find() when pattern match begins with set of code points
+# and the match begins at the end of the string.
+
+"A|B|C" "hello <0>A</0>"
+"A|B|C" "hello \U00011234"
+"A|B|\U00012345" "hello <0>\U00012345</0>"
+"A|B|\U00010000" "hello \ud800"
+
 # Random debugging, Temporary
 #
 