ICU-20863 Regex, lazy creation and reduced size of map from capture group names to numbers.

This commit is contained in:
Andy Heninger 2019-10-18 19:00:32 -07:00
parent b77e868690
commit 03937347fb
7 changed files with 143 additions and 26 deletions

View file

@ -490,6 +490,12 @@ UBool RegexCompile::doParseActions(int32_t action)
// If this is a named capture group, add the name->group number mapping.
if (fCaptureName != NULL) {
if (!fRXPat->initNamedCaptureMap()) {
if (U_SUCCESS(*fStatus)) {
error(fRXPat->fDeferredStatus);
}
break;
}
int32_t groupNumber = fRXPat->fGroupMap->size();
int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
fCaptureName = NULL; // hash table takes ownership of the name (key) string.
@ -1345,7 +1351,8 @@ UBool RegexCompile::doParseActions(int32_t action)
case doCompleteNamedBackRef:
{
int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
int32_t groupNumber =
fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
if (groupNumber == 0) {
// Group name has not been defined.
// Could be a forward reference. If we choose to support them at some

View file

@ -429,7 +429,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
(nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
groupName.append(nextChar);
} else if (nextChar == RIGHTBRACKET) {
groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
if (groupNum == 0) {
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}

View file

@ -138,18 +138,20 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
}
// Copy the named capture group hash map.
int32_t hashPos = UHASH_FIRST;
while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
if (U_FAILURE(fDeferredStatus)) {
break;
}
const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
UnicodeString *key = new UnicodeString(*name);
int32_t val = hashEl->value.integer;
if (key == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
} else {
uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) {
int32_t hashPos = UHASH_FIRST;
while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
if (U_FAILURE(fDeferredStatus)) {
break;
}
const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
UnicodeString *key = new UnicodeString(*name);
int32_t val = hashEl->value.integer;
if (key == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
} else {
uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
}
}
}
return *this;
@ -191,27 +193,38 @@ void RegexPattern::init() {
fSets = new UVector(fDeferredStatus);
fInitialChars = new UnicodeSet;
fInitialChars8 = new Regex8BitSet;
fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function
uhash_compareUnicodeString, // Key comparator function
uhash_compareLong, // Value comparator function
&fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return;
}
if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
fInitialChars == NULL || fInitialChars8 == NULL) {
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
// Slot zero of the vector of sets is reserved. Fill it here.
fSets->addElement((int32_t)0, fDeferredStatus);
}
bool RegexPattern::initNamedCaptureMap() {
if (fNamedCaptureMap) {
return true;
}
fNamedCaptureMap = uhash_openSize(uhash_hashUnicodeString, // Key hash function
uhash_compareUnicodeString, // Key comparator function
uhash_compareLong, // Value comparator function
7, // Initial table capacity
&fDeferredStatus);
if (U_FAILURE(fDeferredStatus)) {
return false;
}
// fNamedCaptureMap owns its key strings, type (UnicodeString *)
uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
return true;
}
//--------------------------------------------------------------------------
//
// zap Delete everything owned by this RegexPattern.
@ -246,8 +259,10 @@ void RegexPattern::zap() {
delete fPatternString;
fPatternString = NULL;
}
uhash_close(fNamedCaptureMap);
fNamedCaptureMap = NULL;
if (fNamedCaptureMap != NULL) {
uhash_close(fNamedCaptureMap);
fNamedCaptureMap = NULL;
}
}
@ -618,7 +633,7 @@ int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UError
// No need to explicitly check for syntactically valid names.
// Invalid ones will never be in the map, and the lookup will fail.
int32_t number = uhash_geti(fNamedCaptureMap, &groupName);
int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0;
if (number == 0) {
status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}

View file

@ -635,8 +635,9 @@ private:
//
// Implementation Methods
//
void init(); // Common initialization, for use by constructors.
void zap(); // Common cleanup
void init(); // Common initialization, for use by constructors.
bool initNamedCaptureMap(); // Lazy init for fNamedCaptureMap.
void zap(); // Common cleanup
void dumpOp(int32_t index) const;

View file

@ -1508,7 +1508,8 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
(c32 >= 0x31 && c32 <= 0x39)) { // 0..9
groupName.append(c32);
} else if (c32 == RIGHTBRACKET) {
groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
groupNum = regexp->fPat->fNamedCaptureMap ?
uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName) : 0;
if (groupNum == 0) {
// Name not defined by pattern.
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;

View file

@ -105,6 +105,7 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
TESTCASE_AUTO(TestBug13631);
TESTCASE_AUTO(TestBug13632);
TESTCASE_AUTO(TestBug20359);
TESTCASE_AUTO(TestBug20863);
TESTCASE_AUTO_END;
}
@ -5913,4 +5914,95 @@ void RegexTest::TestBug20359() {
assertSuccess(WHERE, status);
}
void RegexTest::TestBug20863() {
// Test that patterns with a large number of named capture groups work correctly.
//
// The ticket was not for a bug per se, but to reduce memory usage by using lazy
// construction of the map from capture names to numbers, and decreasing the
// default size of the map.
constexpr int GROUP_COUNT = 2000;
std::vector<UnicodeString> groupNames;
for (int32_t i=0; i<GROUP_COUNT; ++i) {
UnicodeString name;
name.append(u"name");
name.append(Int64ToUnicodeString(i));
groupNames.push_back(name);
}
UnicodeString patternString;
for (UnicodeString name: groupNames) {
patternString.append(u"(?<");
patternString.append(name);
patternString.append(u">.)");
}
UErrorCode status = U_ZERO_ERROR;
UParseError pe;
LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
if (!assertSuccess(WHERE, status)) {
return;
}
for (int32_t i=0; i<GROUP_COUNT; ++i) {
int32_t group = pattern->groupNumberFromName(groupNames[i], status);
if (!assertSuccess(WHERE, status)) {
return;
}
assertEquals(WHERE, i+1, group);
// Note: group 0 is the overall match; group 1 is the first separate capture group.
}
// Verify that assignment of patterns with various combinations of named capture work.
// Lazy creation of the internal named capture map changed the implementation logic here.
{
LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
assertSuccess(WHERE, status);
assertFalse(WHERE, *pat1 == *pat2);
*pat1 = *pat2;
assertTrue(WHERE, *pat1 == *pat2);
assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
assertSuccess(WHERE, status);
}
{
LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
assertSuccess(WHERE, status);
assertFalse(WHERE, *pat1 == *pat2);
*pat2 = *pat1;
assertTrue(WHERE, *pat1 == *pat2);
assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
status = U_ZERO_ERROR;
assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
status = U_ZERO_ERROR;
}
{
LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
assertSuccess(WHERE, status);
assertFalse(WHERE, *pat1 == *pat2);
*pat2 = *pat1;
assertTrue(WHERE, *pat1 == *pat2);
assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
assertSuccess(WHERE, status);
assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
assertSuccess(WHERE, status);
assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
status = U_ZERO_ERROR;
assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
status = U_ZERO_ERROR;
}
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

View file

@ -60,6 +60,7 @@ public:
virtual void TestBug13631();
virtual void TestBug13632();
virtual void TestBug20359();
virtual void TestBug20863();
// The following functions are internal to the regexp tests.
virtual void assertUText(const char *expected, UText *actual, const char *file, int line);