ICU-10835 Improve performance of case insensitive find operations.

X-SVN-Rev: 35683
This commit is contained in:
Andy Heninger 2014-05-02 22:51:53 +00:00
parent 8807332753
commit a45f7faf63
5 changed files with 165 additions and 22 deletions

View file

@ -2375,6 +2375,105 @@ UBool RegexCompile::compileInlineInterval() {
//------------------------------------------------------------------------------
//
// caseInsensitiveStart given a single code point from a pattern string, determine the
// set of characters that could potentially begin a case-insensitive
// match of a string beginning with that character, using full Unicode
// case insensitive matching.
//
// This is used in optimizing find().
//
// closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
// misses cases like this:
// A string from the pattern begins with 'ss' (although all we know
// in this context is that it begins with 's')
// The pattern could match a string beginning with a German sharp-s
//
// To the ordinary case closure for a character c, we add all other
// characters cx where the case closure of cx incudes a string form that begins
// with the original character c.
//
// This function could be made smarter. The full pattern string is available
// and it would be possible to verify that the extra characters being added
// to the starting set fully match, rather than having just a first-char of the
// folded form match.
//
//------------------------------------------------------------------------------
void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
// Machine Generated below.
// It may need updating with new versions of Unicode.
// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
// Machine Generated Data. Do not hand edit.
static const UChar32 RECaseFixCodePoints[] = {
0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
static const int16_t RECaseFixStringOffsets[] = {
0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
static const int16_t RECaseFixCounts[] = {
0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
static const UChar RECaseFixData[] = {
0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
// End of machine generated data.
if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
starterChars->set(caseFoldedC, caseFoldedC);
int32_t i;
for (i=0; RECaseFixCodePoints[i]<c ; i++) {
// Simple linear search through the sorted list of interesting code points.
}
if (RECaseFixCodePoints[i] == c) {
int32_t dataIndex = RECaseFixStringOffsets[i];
int32_t numCharsToAdd = RECaseFixCounts[i];
UChar32 cpToAdd = 0;
for (int32_t j=0; j<numCharsToAdd; j++) {
U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
starterChars->add(cpToAdd);
}
}
starterChars->closeOver(USET_CASE_INSENSITIVE);
starterChars->removeAllStrings();
} else {
// Not a cased character. Just return it alone.
starterChars->set(c, c);
}
}
//------------------------------------------------------------------------------
//
// matchStartType Determine how a match can start.
@ -2565,17 +2664,12 @@ void RegexCompile::matchStartType() {
if (currentLen == 0) {
UChar32 c = URX_VAL(op);
if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
// Disable optimizations on first char of match.
// TODO: Compute the set of chars that case fold to this char, or to
// a string that begins with this char.
// For simple case folding, this code worked:
// UnicodeSet s(c, c);
// s.closeOver(USET_CASE_INSENSITIVE);
// fRXPat->fInitialChars->addAll(s);
fRXPat->fInitialChars->clear();
fRXPat->fInitialChars->complement();
UnicodeSet starters(c, c);
starters.closeOver(USET_CASE_INSENSITIVE);
// findCaseInsensitiveStarters(c, &starters);
// For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
// The expanded folding can't match the pattern.
fRXPat->fInitialChars->addAll(starters);
} else {
// Char has no case variants. Just add it as-is to the
// set of possible starting chars.
@ -2698,14 +2792,8 @@ void RegexCompile::matchStartType() {
// characters for this pattern.
int32_t stringStartIdx = URX_VAL(op);
UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
UnicodeSet s(c, c);
// TODO: compute correct set of starting chars for full case folding.
// For the moment, say any char can start.
// s.closeOver(USET_CASE_INSENSITIVE);
s.clear();
s.complement();
UnicodeSet s;
findCaseInsensitiveStarters(c, &s);
fRXPat->fInitialChars->addAll(s);
numInitialStrings += 2; // Matching on an initial string not possible.
}

View file

@ -1,7 +1,7 @@
//
// regexcmp.h
//
// Copyright (C) 2002-2012, International Business Machines Corporation and others.
// Copyright (C) 2002-2014, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for the class RegexCompile
@ -22,6 +22,7 @@
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"
#include "uvectr32.h"
@ -115,6 +116,10 @@ private:
UChar32 scanNamedChar();
UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
public: // Public for testing only.
static void findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
private:
UErrorCode *fStatus;
RegexPattern *fRXPat;

View file

@ -28,8 +28,10 @@
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/uregex.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"
#include "regextst.h"
#include "regexcmp.h"
#include "uvector.h"
#include "util.h"
#include <stdlib.h>
@ -135,6 +137,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
case 22: name = "Bug10459";
if (exec) Bug10459();
break;
case 23: name = "TestCaseInsensitiveStarters";
if (exec) TestCaseInsensitiveStarters();
break;
default: name = "";
break; //needed to end loop
@ -5267,5 +5272,36 @@ void RegexTest::Bug10459() {
utext_close(utext_txt);
}
void RegexTest::TestCaseInsensitiveStarters() {
// Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
// become stale because of new Unicode characters.
// If it is stale, rerun the generation tool
// svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
// and replace the embedded data in i18n/regexcmp.cpp
for (UChar32 cp=0; cp<=0x10ffff; cp++) {
if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
continue;
}
UnicodeSet s(cp, cp);
s.closeOver(USET_CASE_INSENSITIVE);
UnicodeSetIterator setIter(s);
while (setIter.next()) {
if (!setIter.isString()) {
continue;
}
const UnicodeString &str = setIter.getString();
UChar32 firstChar = str.char32At(0);
UnicodeSet starters;
RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
if (!starters.contains(cp)) {
errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
return;
}
}
}
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2002-2013, International Business Machines Corporation and
* Copyright (c) 2002-2014, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -48,6 +48,7 @@ public:
virtual void Bug9283();
virtual void CheckInvBufSize();
virtual void Bug10459();
virtual void TestCaseInsensitiveStarters();
// The following functions are internal to the regexp tests.
virtual void assertUText(const char *expected, UText *actual, const char *file, int line);

View file

@ -519,9 +519,15 @@
'((((((((((a))))))))))\10' i "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
"(?:(?i)a)b" "<0>Ab</0>"
"ab(?i)cd" "<0>abCd</0>"
"ab(?i)cd" "<0>abCd</0>"
"ab$cd" "abcd"
"ssl" i "abc<0>ßl</0>xyz"
"ssl" i "abc<0>ẞl</0>xyz"
"FIND" i "can <0>find</0> ?" # fi ligature, \ufb01
"find" i "can <0>FIND</0> ?"
"ῧ" i "xxx<0>ῧ</0>xxx" # Composed char (match string) decomposes when case-folded (pattern)
# White space handling
"a b" "ab"
"abc " "abc"
@ -1172,6 +1178,13 @@
"(?<=a{1,})bc" E "aaaa<0>bc</0>def" # U_REGEX_LOOK_BEHIND_LIMIT error.
"(?<=(?:){11})bc" "<0>bc</0>" # Empty (?:) expression.
# Bug 10835
# Match Start Set not being correctly computed for case insensitive patterns.
# (Test here is to dump the compiled pattern & manually check the start set.)
"(private|secret|confidential|classified|restricted)" i "hmm, <0><1>Classified</1></0> stuff"
"(private|secret|confidential|classified|restricted)" "hmm, Classified stuff"
# Bug 10844
"^([\w\d:]+)$" "<0><1>DiesIst1Beispiel:text</1></0>"