mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 12:40:02 +00:00
parent
dfc342e11a
commit
4fbfe4a646
4 changed files with 64 additions and 5 deletions
|
@ -3306,10 +3306,31 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
|||
}
|
||||
|
||||
case URX_STRING_I:
|
||||
// TODO: Is the case-folded string the longest?
|
||||
// If so we can optimize this the same as URX_STRING.
|
||||
loc++;
|
||||
currentLen = INT32_MAX;
|
||||
// TODO: This code assumes that any user string that matches will be no longer
|
||||
// than our compiled string, with case insensitive matching.
|
||||
// Our compiled string has been case-folded already.
|
||||
//
|
||||
// Any matching user string will have no more code points than our
|
||||
// compiled (folded) string. Folding may add code points, but
|
||||
// not remove them.
|
||||
//
|
||||
// There is a potential problem if a supplemental code point
|
||||
// case-folds to a BMP code point. In this case our compiled string
|
||||
// could be shorter (in code units) than a matching user string.
|
||||
//
|
||||
// At this time (Unicode 6.1) there are no such characters, and this case
|
||||
// is not being handled. A test, intltest regex/Bug9283, will fail if
|
||||
// any problematic characters are added to Unicode.
|
||||
//
|
||||
// If this happens, we can make a set of the BMP chars that the
|
||||
// troublesome supplementals fold to, scan our string, and bump the
|
||||
// currentLen one extra for each that is found.
|
||||
//
|
||||
{
|
||||
loc++;
|
||||
int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
|
||||
currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
|
||||
}
|
||||
break;
|
||||
|
||||
case URX_CTR_INIT:
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "unicode/regex.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "regextst.h"
|
||||
#include "uvector.h"
|
||||
|
@ -127,6 +128,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
|
|||
case 20: name = "CheckInvBufSize";
|
||||
if (exec) CheckInvBufSize();
|
||||
break;
|
||||
case 21: name = "Bug 9283";
|
||||
if (exec) Bug9283();
|
||||
break;
|
||||
|
||||
default: name = "";
|
||||
break; //needed to end loop
|
||||
|
@ -5184,6 +5188,34 @@ void RegexTest::Bug7029() {
|
|||
delete pMatcher;
|
||||
}
|
||||
|
||||
// Bug 9283
|
||||
// This test is checking for the existence of any supplemental characters that case-fold
|
||||
// to a BMP character.
|
||||
//
|
||||
// At the time of this writing there are none. If any should appear in a subsequent release
|
||||
// of Unicode, the code in regular expressions compilation that determines the longest
|
||||
// possible match for a literal string will need to be enhanced.
|
||||
//
|
||||
// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
|
||||
// for details on what to do in case of a failure of this test.
|
||||
//
|
||||
void RegexTest::Bug9283() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// Build the set of characters that have the CWCF (Changes_When_Casefolded) property
// and also lie in the range U+10000..U+10FFFF.
UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
|
||||
// NOTE(review): REGEX_CHECK_STATUS is a macro defined elsewhere; presumably it reports
// a test failure when 'status' holds an error from the set construction -- confirm.
REGEX_CHECK_STATUS;
|
||||
int32_t index;
|
||||
UChar32 c;
|
||||
// Walk the set by element index; there is no upper bound in the loop header, the
// -1 check below is the only exit.
for (index=0; ; index++) {
|
||||
c = supplementalsWithCaseFolding.charAt(index);
|
||||
// A result of -1 from charAt() is treated as "no more elements" and ends the loop.
if (c == -1) {
|
||||
break;
|
||||
}
|
||||
// Case-fold the single character on its own.
UnicodeString cf = UnicodeString(c).foldCase();
|
||||
// UnicodeString::length() counts UTF-16 code units, so a length of 1 would mean the
// character folded to a single BMP code unit -- the situation this test guards against.
REGEX_ASSERT(cf.length() >= 2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void RegexTest::CheckInvBufSize() {
|
||||
if(inv_next>=INV_BUFSIZ) {
|
||||
errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2002-2011, International Business Machines Corporation and
|
||||
* Copyright (c) 2002-2012, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -45,6 +45,7 @@ public:
|
|||
virtual void Bug7740();
|
||||
virtual void Bug8479();
|
||||
virtual void Bug7029();
|
||||
virtual void Bug9283();
|
||||
virtual void CheckInvBufSize();
|
||||
|
||||
// The following functions are internal to the regexp tests.
|
||||
|
|
5
source/test/testdata/regextst.txt
vendored
5
source/test/testdata/regextst.txt
vendored
|
@ -1141,6 +1141,11 @@
|
|||
"[\w]+" " <0>abc\u200cdef\u200dghi</0> "
|
||||
"[\w]+" i " <0>abc\u200cdef\u200dghi</0> "
|
||||
|
||||
# Bug 9283
|
||||
# uregex_open fails for look-behind assertion + case-insensitive
|
||||
|
||||
"(ab)?(?<=ab)cd|ef" i "<0><1>ab</1>cd</0>"
|
||||
|
||||
# Random debugging, Temporary
|
||||
#
|
||||
#"^(?:a?b?)*$" "a--"
|
||||
|
|
Loading…
Add table
Reference in a new issue