mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-20863 Regex, lazy creation and reduced size of map from capture group names to numbers.
This commit is contained in:
parent
b77e868690
commit
03937347fb
7 changed files with 143 additions and 26 deletions
|
@ -490,6 +490,12 @@ UBool RegexCompile::doParseActions(int32_t action)
|
|||
|
||||
// If this is a named capture group, add the name->group number mapping.
|
||||
if (fCaptureName != NULL) {
|
||||
if (!fRXPat->initNamedCaptureMap()) {
|
||||
if (U_SUCCESS(*fStatus)) {
|
||||
error(fRXPat->fDeferredStatus);
|
||||
}
|
||||
break;
|
||||
}
|
||||
int32_t groupNumber = fRXPat->fGroupMap->size();
|
||||
int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
|
||||
fCaptureName = NULL; // hash table takes ownership of the name (key) string.
|
||||
|
@ -1345,7 +1351,8 @@ UBool RegexCompile::doParseActions(int32_t action)
|
|||
|
||||
case doCompleteNamedBackRef:
|
||||
{
|
||||
int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
|
||||
int32_t groupNumber =
|
||||
fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
|
||||
if (groupNumber == 0) {
|
||||
// Group name has not been defined.
|
||||
// Could be a forward reference. If we choose to support them at some
|
||||
|
|
|
@ -429,7 +429,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
|
|||
(nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
|
||||
groupName.append(nextChar);
|
||||
} else if (nextChar == RIGHTBRACKET) {
|
||||
groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
|
||||
groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
|
||||
if (groupNum == 0) {
|
||||
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
}
|
||||
|
|
|
@ -138,18 +138,20 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
}
|
||||
|
||||
// Copy the named capture group hash map.
|
||||
int32_t hashPos = UHASH_FIRST;
|
||||
while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
break;
|
||||
}
|
||||
const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
|
||||
UnicodeString *key = new UnicodeString(*name);
|
||||
int32_t val = hashEl->value.integer;
|
||||
if (key == NULL) {
|
||||
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
} else {
|
||||
uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
|
||||
if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) {
|
||||
int32_t hashPos = UHASH_FIRST;
|
||||
while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
break;
|
||||
}
|
||||
const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
|
||||
UnicodeString *key = new UnicodeString(*name);
|
||||
int32_t val = hashEl->value.integer;
|
||||
if (key == NULL) {
|
||||
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
} else {
|
||||
uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
|
||||
}
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
|
@ -191,27 +193,38 @@ void RegexPattern::init() {
|
|||
fSets = new UVector(fDeferredStatus);
|
||||
fInitialChars = new UnicodeSet;
|
||||
fInitialChars8 = new Regex8BitSet;
|
||||
fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function
|
||||
uhash_compareUnicodeString, // Key comparator function
|
||||
uhash_compareLong, // Value comparator function
|
||||
&fDeferredStatus);
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
return;
|
||||
}
|
||||
if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
|
||||
fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
|
||||
fInitialChars == NULL || fInitialChars8 == NULL) {
|
||||
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// Slot zero of the vector of sets is reserved. Fill it here.
|
||||
fSets->addElement((int32_t)0, fDeferredStatus);
|
||||
}
|
||||
|
||||
|
||||
bool RegexPattern::initNamedCaptureMap() {
|
||||
if (fNamedCaptureMap) {
|
||||
return true;
|
||||
}
|
||||
fNamedCaptureMap = uhash_openSize(uhash_hashUnicodeString, // Key hash function
|
||||
uhash_compareUnicodeString, // Key comparator function
|
||||
uhash_compareLong, // Value comparator function
|
||||
7, // Initial table capacity
|
||||
&fDeferredStatus);
|
||||
if (U_FAILURE(fDeferredStatus)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// fNamedCaptureMap owns its key strings, type (UnicodeString *)
|
||||
uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
//
|
||||
// zap Delete everything owned by this RegexPattern.
|
||||
|
@ -246,8 +259,10 @@ void RegexPattern::zap() {
|
|||
delete fPatternString;
|
||||
fPatternString = NULL;
|
||||
}
|
||||
uhash_close(fNamedCaptureMap);
|
||||
fNamedCaptureMap = NULL;
|
||||
if (fNamedCaptureMap != NULL) {
|
||||
uhash_close(fNamedCaptureMap);
|
||||
fNamedCaptureMap = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -618,7 +633,7 @@ int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UError
|
|||
// No need to explicitly check for syntactically valid names.
|
||||
// Invalid ones will never be in the map, and the lookup will fail.
|
||||
|
||||
int32_t number = uhash_geti(fNamedCaptureMap, &groupName);
|
||||
int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0;
|
||||
if (number == 0) {
|
||||
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
}
|
||||
|
|
|
@ -635,8 +635,9 @@ private:
|
|||
//
|
||||
// Implementation Methods
|
||||
//
|
||||
void init(); // Common initialization, for use by constructors.
|
||||
void zap(); // Common cleanup
|
||||
void init(); // Common initialization, for use by constructors.
|
||||
bool initNamedCaptureMap(); // Lazy init for fNamedCaptureMap.
|
||||
void zap(); // Common cleanup
|
||||
|
||||
void dumpOp(int32_t index) const;
|
||||
|
||||
|
|
|
@ -1508,7 +1508,8 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
|
|||
(c32 >= 0x31 && c32 <= 0x39)) { // 0..9
|
||||
groupName.append(c32);
|
||||
} else if (c32 == RIGHTBRACKET) {
|
||||
groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
|
||||
groupNum = regexp->fPat->fNamedCaptureMap ?
|
||||
uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName) : 0;
|
||||
if (groupNum == 0) {
|
||||
// Name not defined by pattern.
|
||||
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
|
||||
|
|
|
@ -105,6 +105,7 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
|
|||
TESTCASE_AUTO(TestBug13631);
|
||||
TESTCASE_AUTO(TestBug13632);
|
||||
TESTCASE_AUTO(TestBug20359);
|
||||
TESTCASE_AUTO(TestBug20863);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -5913,4 +5914,95 @@ void RegexTest::TestBug20359() {
|
|||
assertSuccess(WHERE, status);
|
||||
}
|
||||
|
||||
|
||||
void RegexTest::TestBug20863() {
|
||||
// Test that patterns with a large number of named capture groups work correctly.
|
||||
//
|
||||
// The ticket was not for a bug per se, but to reduce memory usage by using lazy
|
||||
// construction of the map from capture names to numbers, and decreasing the
|
||||
// default size of the map.
|
||||
|
||||
constexpr int GROUP_COUNT = 2000;
|
||||
std::vector<UnicodeString> groupNames;
|
||||
for (int32_t i=0; i<GROUP_COUNT; ++i) {
|
||||
UnicodeString name;
|
||||
name.append(u"name");
|
||||
name.append(Int64ToUnicodeString(i));
|
||||
groupNames.push_back(name);
|
||||
}
|
||||
|
||||
UnicodeString patternString;
|
||||
for (UnicodeString name: groupNames) {
|
||||
patternString.append(u"(?<");
|
||||
patternString.append(name);
|
||||
patternString.append(u">.)");
|
||||
}
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError pe;
|
||||
LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
|
||||
if (!assertSuccess(WHERE, status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int32_t i=0; i<GROUP_COUNT; ++i) {
|
||||
int32_t group = pattern->groupNumberFromName(groupNames[i], status);
|
||||
if (!assertSuccess(WHERE, status)) {
|
||||
return;
|
||||
}
|
||||
assertEquals(WHERE, i+1, group);
|
||||
// Note: group 0 is the overall match; group 1 is the first separate capture group.
|
||||
}
|
||||
|
||||
// Verify that assignment of patterns with various combinations of named capture work.
|
||||
// Lazy creation of the internal named capture map changed the implementation logic here.
|
||||
{
|
||||
LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
|
||||
LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
|
||||
assertSuccess(WHERE, status);
|
||||
assertFalse(WHERE, *pat1 == *pat2);
|
||||
*pat1 = *pat2;
|
||||
assertTrue(WHERE, *pat1 == *pat2);
|
||||
assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
|
||||
assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
|
||||
assertSuccess(WHERE, status);
|
||||
}
|
||||
|
||||
{
|
||||
LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
|
||||
LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
|
||||
assertSuccess(WHERE, status);
|
||||
assertFalse(WHERE, *pat1 == *pat2);
|
||||
*pat2 = *pat1;
|
||||
assertTrue(WHERE, *pat1 == *pat2);
|
||||
assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
|
||||
assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
|
||||
status = U_ZERO_ERROR;
|
||||
assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
|
||||
assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
|
||||
status = U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
{
|
||||
LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
|
||||
LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
|
||||
assertSuccess(WHERE, status);
|
||||
assertFalse(WHERE, *pat1 == *pat2);
|
||||
*pat2 = *pat1;
|
||||
assertTrue(WHERE, *pat1 == *pat2);
|
||||
assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
|
||||
assertSuccess(WHERE, status);
|
||||
assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
|
||||
assertSuccess(WHERE, status);
|
||||
assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
|
||||
assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
|
||||
status = U_ZERO_ERROR;
|
||||
assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
|
||||
assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
|
||||
status = U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
|
||||
|
|
|
@ -60,6 +60,7 @@ public:
|
|||
virtual void TestBug13631();
|
||||
virtual void TestBug13632();
|
||||
virtual void TestBug20359();
|
||||
virtual void TestBug20863();
|
||||
|
||||
// The following functions are internal to the regexp tests.
|
||||
virtual void assertUText(const char *expected, UText *actual, const char *file, int line);
|
||||
|
|
Loading…
Add table
Reference in a new issue