From ac329166b492dfcd123d8bf9b360e449aff8007a Mon Sep 17 00:00:00 2001 From: George Rhoten Date: Fri, 22 Jun 2007 01:06:31 +0000 Subject: [PATCH] ICU-5427 Reduce regex static memory consumption by 35% X-SVN-Rev: 21805 --- icu4c/source/i18n/regexcmp.cpp | 4 +- icu4c/source/i18n/regexst.cpp | 93 +++++++++++++++++----------------- icu4c/source/i18n/regexst.h | 15 +++--- icu4c/source/i18n/rematch.cpp | 4 +- 4 files changed, 57 insertions(+), 59 deletions(-) diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index d74124d63ef..558f80b6c48 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -1175,7 +1175,7 @@ UBool RegexCompile::doParseActions(int32_t action) break; } c = peekCharLL(); - if (RegexStaticSets::gStaticSets->fRuleDigits->contains(c) == FALSE) { + if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) { break; } nextCharLL(); @@ -3375,7 +3375,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { int32_t startX = fNextIndex; // start and end positions of the int32_t endX = fNextIndex; // sequence following the '\' if (c.fChar == chBackSlash) { - if (RegexStaticSets::gStaticSets->fUnescapeCharSet->contains(peekCharLL())) { + if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { // // A '\' sequence that is handled by ICU's standard unescapeAt function. // Includes \uxxxx, \n, \r, many others. diff --git a/icu4c/source/i18n/regexst.cpp b/icu4c/source/i18n/regexst.cpp index 6a4bb305a98..41014365dff 100644 --- a/icu4c/source/i18n/regexst.cpp +++ b/icu4c/source/i18n/regexst.cpp @@ -1,7 +1,7 @@ // // regexst.h // -// Copyright (C) 2004-2006, International Business Machines Corporation and others. +// Copyright (C) 2004-2007, International Business Machines Corporation and others. // All Rights Reserved. 
// // This file contains class RegexStaticSets @@ -38,13 +38,13 @@ U_NAMESPACE_BEGIN -//---------------------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // // Unicode Set pattern strings for all of the required constant sets. // Initialized with hex values for portability to EBCDIC based machines. // Really ugly, but there's no good way to avoid it. // -//---------------------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // "Rule Char" Characters are those with no special meaning, and therefore do not // need to be escaped to appear as literals in a regexp. Expressed @@ -99,7 +99,7 @@ static const UChar gIsWordPattern[] = { // // Unicode Set Definitions for Regular Expression \s // - static const UChar gIsSpacePattern[] = { +static const UChar gIsSpacePattern[] = { // [ \ p { W h i t e S p a c e } ] 0x5b, 0x5c, 0x70, 0x7b, 0x57, 0x68, 0x69, 0x74, 0x65, 0x53, 0x70, 0x61, 0x63, 0x65, 0x7d, 0x5d, 0}; @@ -107,7 +107,7 @@ static const UChar gIsWordPattern[] = { // // UnicodeSets used in implementation of Grapheme Cluster detection, \X // - static const UChar gGC_ControlPattern[] = { +static const UChar gGC_ControlPattern[] = { // [ [ : Z l : ] [ : Z p : ] 0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d, // [ : C c : ] [ : C f : ] - @@ -117,37 +117,37 @@ static const UChar gIsWordPattern[] = { // E x t e n d : ] ] 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x3a, 0x5d, 0x5d, 0}; - static const UChar gGC_ExtendPattern[] = { +static const UChar gGC_ExtendPattern[] = { // [ \ p { G r a p h e m e _ 0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f, // E x t e n d } ] 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0}; - static const UChar gGC_LPattern[] = { +static const UChar gGC_LPattern[] = { // [ \ p { H a n g u l _ S 
y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l e _ T y p e = L } ] 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0}; - static const UChar gGC_VPattern[] = { +static const UChar gGC_VPattern[] = { // [ \ p { H a n g u l _ S y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l e _ T y p e = V } ] 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0}; - static const UChar gGC_TPattern[] = { +static const UChar gGC_TPattern[] = { // [ \ p { H a n g u l _ S y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l e _ T y p e = T } ] 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0}; - static const UChar gGC_LVPattern[] = { +static const UChar gGC_LVPattern[] = { // [ \ p { H a n g u l _ S y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l e _ T y p e = L V } ] 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0}; - static const UChar gGC_LVTPattern[] = { +static const UChar gGC_LVTPattern[] = { // [ \ p { H a n g u l _ S y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l e _ T y p e = L V T } ] @@ -155,29 +155,30 @@ static const UChar gIsWordPattern[] = { RegexStaticSets *RegexStaticSets::gStaticSets = NULL; -RegexStaticSets::RegexStaticSets(UErrorCode *status) { +RegexStaticSets::RegexStaticSets(UErrorCode *status) +: +fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status), +fRuleDigitsAlias(NULL) +{ // First zero out everything int i; for (i=0; icomplement(); + fPropSets[URX_GC_NORMAL] = new UnicodeSet(0, UnicodeSet::MAX_VALUE); fPropSets[URX_GC_NORMAL]->remove(0xac00, 0xd7a4); fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_CONTROL]); 
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]); @@ -206,47 +206,46 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) { // Initialize the 8-bit fast bit sets from the parallel full // UnicodeSets. for (i=0; icompact(); + fPropSets8[i].init(fPropSets[i]); + } } // Sets used while parsing rules, but not referenced from the parse state table - fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(gRuleSet_rule_char_pattern, *status); - fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(gRuleWhiteSpacePattern, *status); - fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, *status); - fRuleDigits = new UnicodeSet(gRuleSet_digit_char_pattern, *status); - fUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, *status); - - // Empty UnicodeString, for use by matchers with NULL input. - fEmptyString = new UnicodeString; + fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status); + fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodeString(TRUE, gRuleWhiteSpacePattern, -1), *status); + fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status); + fRuleDigitsAlias = fRuleSets[kRuleSet_digit_char-128]; + for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) { + if (fRuleSets[i]) { + fRuleSets[i]->compact(); + } + } } RegexStaticSets::~RegexStaticSets() { - int i; + int32_t i; for (i=0; ifEmptyString); + reset(RegexStaticSets::gStaticSets->fEmptyString); } @@ -103,7 +103,7 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp, if (fStack == NULL || fData == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } - reset(*RegexStaticSets::gStaticSets->fEmptyString); + reset(RegexStaticSets::gStaticSets->fEmptyString); }