From ac329166b492dfcd123d8bf9b360e449aff8007a Mon Sep 17 00:00:00 2001
From: George Rhoten <grhoten@users.noreply.github.com>
Date: Fri, 22 Jun 2007 01:06:31 +0000
Subject: [PATCH] ICU-5427 Reduce regex static memory consumption by 35%

X-SVN-Rev: 21805
---
 icu4c/source/i18n/regexcmp.cpp |  4 +-
 icu4c/source/i18n/regexst.cpp  | 93 +++++++++++++++++-----------------
 icu4c/source/i18n/regexst.h    | 15 +++---
 icu4c/source/i18n/rematch.cpp  |  4 +-
 4 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp
index d74124d63ef..558f80b6c48 100644
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@@ -1175,7 +1175,7 @@ UBool RegexCompile::doParseActions(int32_t action)
                     break;
                 }
                 c = peekCharLL();
-                if (RegexStaticSets::gStaticSets->fRuleDigits->contains(c) == FALSE) {
+                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
                     break;
                 }
                 nextCharLL();
@@ -3375,7 +3375,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
                 int32_t startX = fNextIndex;  // start and end positions of the
                 int32_t endX   = fNextIndex;  //   sequence following the '\'
         if (c.fChar == chBackSlash) {
-            if (RegexStaticSets::gStaticSets->fUnescapeCharSet->contains(peekCharLL())) {
+            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
                 //
                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
                 //   Includes \uxxxx, \n, \r, many others.
diff --git a/icu4c/source/i18n/regexst.cpp b/icu4c/source/i18n/regexst.cpp
index 6a4bb305a98..41014365dff 100644
--- a/icu4c/source/i18n/regexst.cpp
+++ b/icu4c/source/i18n/regexst.cpp
@@ -1,7 +1,7 @@
 //
 //  regexst.h
 //
-//  Copyright (C) 2004-2006, International Business Machines Corporation and others.
+//  Copyright (C) 2004-2007, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains class RegexStaticSets
@@ -38,13 +38,13 @@
 U_NAMESPACE_BEGIN
 
 
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //
 // Unicode Set pattern strings for all of the required constant sets.
 //               Initialized with hex values for portability to EBCDIC based machines.
 //                Really ugly, but there's no good way to avoid it.
 //
-//----------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 // "Rule Char" Characters are those with no special meaning, and therefore do not
 //    need to be escaped to appear as literals in a regexp.  Expressed
@@ -99,7 +99,7 @@ static const UChar gIsWordPattern[] = {
 //
 //  Unicode Set Definitions for Regular Expression  \s
 //
-    static const UChar gIsSpacePattern[] = {
+static const UChar gIsSpacePattern[] = {
 //        [     \     p     {     W     h     i     t     e     S     p     a     c     e     }     ]
         0x5b, 0x5c, 0x70, 0x7b, 0x57, 0x68, 0x69, 0x74, 0x65, 0x53, 0x70, 0x61, 0x63, 0x65, 0x7d, 0x5d, 0};
 
@@ -107,7 +107,7 @@ static const UChar gIsWordPattern[] = {
 //
 //  UnicodeSets used in implementation of Grapheme Cluster detection, \X
 //
-    static const UChar gGC_ControlPattern[] = {
+static const UChar gGC_ControlPattern[] = {
 //    [     [     :     Z     l     :     ]     [     :     Z     p     :     ]    
     0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d, 
 //    [     :     C     c     :     ]     [     :     C     f     :     ]     -
@@ -117,37 +117,37 @@ static const UChar gIsWordPattern[] = {
 //    E     x     t     e     n     d     :     ]     ]
     0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x3a, 0x5d, 0x5d, 0};
 
-    static const UChar gGC_ExtendPattern[] = {
+static const UChar gGC_ExtendPattern[] = {
 //    [     \     p     {     G     r     a     p     h     e     m     e     _
     0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
 //    E     x     t     e     n     d     }     ]
     0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
 
-    static const UChar gGC_LPattern[] = {
+static const UChar gGC_LPattern[] = {
 //    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
     0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     L     }     ]
     0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d,  0x5d, 0}; 
 
-    static const UChar gGC_VPattern[] = {
+static const UChar gGC_VPattern[] = {
 //    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
     0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     V     }     ]
     0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d,  0x5d, 0}; 
 
-    static const UChar gGC_TPattern[] = {
+static const UChar gGC_TPattern[] = {
 //    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
     0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     T     }    ]
     0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0}; 
 
-    static const UChar gGC_LVPattern[] = {
+static const UChar gGC_LVPattern[] = {
 //    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
     0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     L     V     }     ]
     0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0}; 
 
-    static const UChar gGC_LVTPattern[] = {
+static const UChar gGC_LVTPattern[] = {
 //    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
     0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     L     V     T     }     ]
@@ -155,29 +155,30 @@ static const UChar gIsWordPattern[] = {
 
 RegexStaticSets *RegexStaticSets::gStaticSets = NULL;
 
-RegexStaticSets::RegexStaticSets(UErrorCode *status) {
+RegexStaticSets::RegexStaticSets(UErrorCode *status)
+:
+fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status),
+fRuleDigitsAlias(NULL)
+{
     // First zero out everything  
     int i;
     for (i=0; i<URX_LAST_SET; i++) {
         fPropSets[i] = NULL;
     }
-    for (i=0; i<10; i++) {
+    for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
         fRuleSets[i] = NULL;
     }
-    fUnescapeCharSet = NULL;
-    fRuleDigits      = NULL;
-    fEmptyString     = NULL;
 
     // Then init the sets to their correct values.
-    fPropSets[URX_ISWORD_SET]  = new UnicodeSet(gIsWordPattern,     *status);
-    fPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern,    *status);    
-    fPropSets[URX_GC_EXTEND]   = new UnicodeSet(gGC_ExtendPattern,  *status);
-    fPropSets[URX_GC_CONTROL]  = new UnicodeSet(gGC_ControlPattern, *status);
-    fPropSets[URX_GC_L]        = new UnicodeSet(gGC_LPattern,       *status);
-    fPropSets[URX_GC_V]        = new UnicodeSet(gGC_VPattern,       *status);
-    fPropSets[URX_GC_T]        = new UnicodeSet(gGC_TPattern,       *status);
-    fPropSets[URX_GC_LV]       = new UnicodeSet(gGC_LVPattern,      *status);
-    fPropSets[URX_GC_LVT]      = new UnicodeSet(gGC_LVTPattern,     *status);
+    fPropSets[URX_ISWORD_SET]  = new UnicodeSet(UnicodeString(TRUE, gIsWordPattern, -1),     *status);
+    fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1),    *status);    
+    fPropSets[URX_GC_EXTEND]   = new UnicodeSet(UnicodeString(TRUE, gGC_ExtendPattern, -1),  *status);
+    fPropSets[URX_GC_CONTROL]  = new UnicodeSet(UnicodeString(TRUE, gGC_ControlPattern, -1), *status);
+    fPropSets[URX_GC_L]        = new UnicodeSet(UnicodeString(TRUE, gGC_LPattern, -1),       *status);
+    fPropSets[URX_GC_V]        = new UnicodeSet(UnicodeString(TRUE, gGC_VPattern, -1),       *status);
+    fPropSets[URX_GC_T]        = new UnicodeSet(UnicodeString(TRUE, gGC_TPattern, -1),       *status);
+    fPropSets[URX_GC_LV]       = new UnicodeSet(UnicodeString(TRUE, gGC_LVPattern, -1),      *status);
+    fPropSets[URX_GC_LVT]      = new UnicodeSet(UnicodeString(TRUE, gGC_LVTPattern, -1),     *status);
     if (U_FAILURE(*status)) {
         // Bail out if we were unable to create the above sets.
         // The rest of the initialization needs them, so we cannot proceed.
@@ -187,7 +188,7 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
     
     //
     // The following sets  are dynamically constructed, because their
-    //   intialization strings would be unreasonable.
+    //   initialization strings would be unreasonable.
     //
     
     
@@ -195,8 +196,7 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
     //  "Normal" is the set of characters that don't need special handling
     //            when finding grapheme cluster boundaries.
     //
-    fPropSets[URX_GC_NORMAL] = new UnicodeSet;
-    fPropSets[URX_GC_NORMAL]->complement();
+    fPropSets[URX_GC_NORMAL] = new UnicodeSet(0, UnicodeSet::MAX_VALUE);
     fPropSets[URX_GC_NORMAL]->remove(0xac00, 0xd7a4);
     fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_CONTROL]);
     fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]);
@@ -206,47 +206,46 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
     // Initialize the 8-bit fast bit sets from the parallel full
     //   UnicodeSets.
     for (i=0; i<URX_LAST_SET; i++) {
-        fPropSets8[i].init(fPropSets[i]);
+        if (fPropSets[i]) {
+            fPropSets[i]->compact();
+            fPropSets8[i].init(fPropSets[i]);
+        }
     }
 
     // Sets used while parsing rules, but not referenced from the parse state table
-    fRuleSets[kRuleSet_rule_char-128]   = new UnicodeSet(gRuleSet_rule_char_pattern,  *status);
-    fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(gRuleWhiteSpacePattern,      *status);
-    fRuleSets[kRuleSet_digit_char-128]  = new UnicodeSet(gRuleSet_digit_char_pattern, *status);
-    fRuleDigits                         = new UnicodeSet(gRuleSet_digit_char_pattern, *status);
-    fUnescapeCharSet                    = new UnicodeSet(gUnescapeCharPattern,        *status);
-
-    // Empty UnicodeString, for use by matchers with NULL input.
-    fEmptyString = new UnicodeString;
+    fRuleSets[kRuleSet_rule_char-128]   = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1),  *status);
+    fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodeString(TRUE, gRuleWhiteSpacePattern, -1),      *status);
+    fRuleSets[kRuleSet_digit_char-128]  = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
+    fRuleDigitsAlias = fRuleSets[kRuleSet_digit_char-128];   // Alias only, not a second set; the storage stays owned by fRuleSets[].
+    for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
+        if (fRuleSets[i]) {
+            fRuleSets[i]->compact();
+        }
+    }
 }
 
 
 RegexStaticSets::~RegexStaticSets() {
-    int i;
+    int32_t i;
 
     for (i=0; i<URX_LAST_SET; i++) {
         delete fPropSets[i];
         fPropSets[i] = NULL;
     }
-    for (i=0; i<10; i++) {
+    for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
         delete fRuleSets[i];
         fRuleSets[i] = NULL;
     }
-    delete fUnescapeCharSet;
-    fUnescapeCharSet = NULL;
-    delete fRuleDigits;
-    fRuleDigits = NULL;
-    delete fEmptyString;
-    fEmptyString = NULL;
+    fRuleDigitsAlias = NULL;   // Alias into fRuleSets[], whose entries were deleted in the loop above; nothing to delete here.
 }
 
 
-//----------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //
 //   regex_cleanup      Memory cleanup function, free/delete all
 //                      cached memory.  Called by ICU's u_cleanup() function.
 //
-//----------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 UBool
 RegexStaticSets::cleanup(void) {
     delete RegexStaticSets::gStaticSets;
diff --git a/icu4c/source/i18n/regexst.h b/icu4c/source/i18n/regexst.h
index aaba34d5e9e..2fd9c32d773 100644
--- a/icu4c/source/i18n/regexst.h
+++ b/icu4c/source/i18n/regexst.h
@@ -1,7 +1,7 @@
 //
 //  regexst.h
 //
-//  Copyright (C) 2003-2004, International Business Machines Corporation and others.
+//  Copyright (C) 2003-2007, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains declarations for the class RegexStaticSets
@@ -42,17 +42,16 @@ public:
     Regex8BitSet   fPropSets8[URX_LAST_SET];    // Fast bitmap sets for latin-1 range for above.
 
     UnicodeSet    *fRuleSets[10];               // Sets used while parsing regexp patterns.
-    UnicodeSet    *fUnescapeCharSet;            // Set of chars handled by unescape when
-                                                //   encountered with a \ in a pattern.
-    UnicodeSet    *fRuleDigits;
-    UnicodeString *fEmptyString;                // An empty string, to be used when a matcher
-                                                //   is created with no input.
+    UnicodeSet    fUnescapeCharSet;            // Set of chars handled by unescape when
+                                               //   encountered with a \ in a pattern.
+    UnicodeSet    *fRuleDigitsAlias;           // Alias of fRuleSets[kRuleSet_digit_char-128]; not separately owned or deleted.
+    UnicodeString fEmptyString;                // An empty string, to be used when a matcher
+                                               //   is created with no input.
 
 };
 
 
-
-
 U_NAMESPACE_END
 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
 #endif   // REGEXST_H
+
diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp
index d3aab9f7189..080c1ade900 100644
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@@ -55,7 +55,7 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
         fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
     }
         
-    reset(*RegexStaticSets::gStaticSets->fEmptyString);
+    reset(RegexStaticSets::gStaticSets->fEmptyString);
 }
 
 
@@ -103,7 +103,7 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp,
     if (fStack == NULL || fData == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
     }
-    reset(*RegexStaticSets::gStaticSets->fEmptyString);
+    reset(RegexStaticSets::gStaticSets->fEmptyString);
 }