From 7ce42e2f314c0660c5aa893e85f3269c14cfc506 Mon Sep 17 00:00:00 2001
From: Alan Liu <alansliu@gmail.com>
Date: Tue, 8 Feb 2000 02:49:15 +0000
Subject: [PATCH] ICU-265 map char to set with array instead of hash for better
 performance

X-SVN-Rev: 728
---
 icu4c/source/i18n/rbt_data.cpp     | 38 ++-----------
 icu4c/source/i18n/rbt_data.h       | 45 +++++++++------
 icu4c/source/i18n/rbt_pars.cpp     | 89 +++++++++++++++++++++++++++---
 icu4c/source/i18n/rbt_pars.h       | 19 +++++++
 icu4c/source/i18n/rbt_rule.cpp     |  4 +-
 icu4c/source/i18n/symtable.h       | 29 ++++++++++
 icu4c/source/i18n/unicode/uniset.h |  6 +-
 icu4c/source/i18n/uniset.cpp       | 17 +++---
 8 files changed, 177 insertions(+), 70 deletions(-)
 create mode 100644 icu4c/source/i18n/symtable.h

diff --git a/icu4c/source/i18n/rbt_data.cpp b/icu4c/source/i18n/rbt_data.cpp
index 25f9a38ec20..de2d4212cbd 100644
--- a/icu4c/source/i18n/rbt_data.cpp
+++ b/icu4c/source/i18n/rbt_data.cpp
@@ -17,16 +17,15 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
         return;
     }
     variableNames = uhash_open((UHashFunction)uhash_hashUString, &status);
-    setVariables = uhash_open(0, &status);
+    setVariables = 0;
+    setVariablesLength = setVariablesBase = 0; // base is read by lookupSet
 }
 
 TransliterationRuleData::~TransliterationRuleData() {
     if (variableNames != 0) {
         uhash_close(variableNames);
     }
-    if (setVariables != 0) {
-        uhash_close(setVariables);
-    }
+    delete[] setVariables;
 }
 
 void
@@ -38,31 +37,6 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
                  &status);
 }
 
-void
-TransliterationRuleData::defineVariable(const UnicodeString& name,
-                                        UChar standIn,
-                                        UnicodeSet* adoptedSet,
-                                        UErrorCode& status) {
-    defineVariable(name, standIn, status);
-    defineSet(standIn, adoptedSet, status);
-}
-
-void
-TransliterationRuleData::defineSet(UChar standIn,
-                                   UnicodeSet* adoptedSet,
-                                   UErrorCode& status) {
-    if (U_FAILURE(status)) {
-        return;
-    }
-    if (adoptedSet == 0) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        return;
-    }
-    uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF),
-                 adoptedSet,
-                 &status);
-}
-
 UChar
 TransliterationRuleData::lookupVariable(const UnicodeString& name,
                                         UErrorCode& status) const {
@@ -76,10 +50,10 @@ TransliterationRuleData::lookupVariable(const UnicodeString& name,
     return (UChar) (int32_t) value;
 }
 
-UnicodeSet*
+const UnicodeSet*
 TransliterationRuleData::lookupSet(UChar standIn) const {
-    void* value = uhash_get(setVariables, (int32_t) (standIn & 0x7FFFFFFF));
-    return (UnicodeSet*) value;
+    int32_t i = standIn - setVariablesBase;
+    return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
 }
 
 bool_t
diff --git a/icu4c/source/i18n/rbt_data.h b/icu4c/source/i18n/rbt_data.h
index 7ba43fcb3e1..aa99ed8d546 100644
--- a/icu4c/source/i18n/rbt_data.h
+++ b/icu4c/source/i18n/rbt_data.h
@@ -20,6 +20,16 @@ struct UHashtable;
  * are essentially the parsed rules in compact, usable form.  The
  * TRD objects themselves are held for the life of the process in
  * a static cache owned by Transliterator.
+ *
+ * This class' API is a little asymmetric.  There is a method to
+ * define a variable, but no way to define a set.  This is because the
+ * sets are defined by the parser in a UVector, and the vector is
+ * copied into a fixed-size array here.  Once this is done, no new
+ * sets may be defined.  In practice, there is no need to do so, since
+ * generating the data and using it are discrete phases.  When there
+ * is a need to access the set data during the parse phase, another
+ * data structure handles this.  See the parsing code for more
+ * details.
  */
 class TransliterationRuleData {
 
@@ -47,18 +57,28 @@ public:
     UHashtable* variableNames;
     
     /**
-     * Map category variable (UChar) to set (UnicodeSet).
+     * Map category variable (UChar) to set (UnicodeSet).
      * Variables that correspond to a set of characters are mapped
-     * from variable name to a stand-in character in
-     * data.variableNames.  The stand-in then serves as a key in
-     * this hash to lookup the actual UnicodeSet object.  In
-     * addition, the stand-in is stored in the rule text to
-     * represent the set of characters.
+     * from variable name to a stand-in character in data.variableNames.
+     * The stand-in, less setVariablesBase, then serves as an index into
+     * this array to look up the actual UnicodeSet object.  In addition,
+     * the stand-in is stored in the rule text to represent the set of
+     * characters.  setVariables[i] represents (setVariablesBase + i).
      *
      * PUBLIC DATA MEMBER for internal use by RBT
      */
-    UHashtable* setVariables;
+    UnicodeSet** setVariables;
     
+    /**
+     * The character represented by setVariables[0].
+     */
+    UChar setVariablesBase;
+
+    /**
+     * The length of setVariables.
+     */
+    int32_t setVariablesLength;
+
     TransliterationRuleData(UErrorCode& status);
 
     ~TransliterationRuleData();
@@ -67,19 +87,10 @@ public:
                         UChar value,
                         UErrorCode& status);
         
-    void defineVariable(const UnicodeString& name,
-                        UChar standIn,
-                        UnicodeSet* adoptedSet,
-                        UErrorCode& status);
-
-    void defineSet(UChar standIn,
-                   UnicodeSet* adoptedSet,
-                   UErrorCode& status);
-
     UChar lookupVariable(const UnicodeString& name,
                          UErrorCode& status) const;
     
-	UnicodeSet* lookupSet(UChar standIn) const;
+	const UnicodeSet* lookupSet(UChar standIn) const;
 
     bool_t isVariableDefined(const UnicodeString& name) const;
 };
diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp
index ef20b261822..29306b329fa 100644
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@@ -15,6 +15,7 @@
 #include "unicode/uniset.h"
 #include "cstring.h"
 #include "unicode/parsepos.h"
+#include "symtable.h"
 
 // Operators
 const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
@@ -37,6 +38,56 @@ const UChar TransliterationRuleParser::SET_OPEN = '[';
 const UChar TransliterationRuleParser::SET_CLOSE = ']';
 const UChar TransliterationRuleParser::CURSOR_POS = '|';
 
+//----------------------------------------------------------------------
+// BEGIN ParseData
+//----------------------------------------------------------------------
+
+/**
+ * This class implements the SymbolTable interface.  It is used
+ * during parsing to give UnicodeSet access to variables that
+ * have been defined so far.  Note that it reads sets from the
+ * parser's setVariablesVector, _not_ from data.setVariables.
+ */
+class ParseData : public SymbolTable {
+public:
+    const TransliterationRuleData* data; // alias; supplies the name map
+
+    const UVector* setVariablesVector; // alias; element i is a UnicodeSet*
+
+    ParseData(const TransliterationRuleData* data = 0,
+              const UVector* setVariablesVector = 0);
+
+    /**
+     * Look up name; c receives the character that the name maps to.
+     * Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
+     * exist.  Return a non-NULL set if the name is mapped to a set;
+     * otherwise return a NULL set.
+     */
+    virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
+                        UErrorCode& status) const;
+};
+
+ParseData::ParseData(const TransliterationRuleData* ruleData,
+                     const UVector* setVector) :
+    data(ruleData), setVariablesVector(setVector) {}
+
+/**
+ * Implement SymbolTable API.  Look up the stand-in for a variable, and
+ * its set if it has one; a stand-in outside the set range has none.
+ */
+void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
+                       UErrorCode& status) const {
+    c = data->lookupVariable(name, status);
+    if (U_SUCCESS(status)) {
+        int32_t i = c - data->setVariablesBase;
+        set = (i >= 0 && i < setVariablesVector->size()) ?
+            (UnicodeSet*) setVariablesVector->elementAt(i) : 0;
+    }
+}
+
+//----------------------------------------------------------------------
+// END ParseData
+//----------------------------------------------------------------------
 
 TransliterationRuleData*
 TransliterationRuleParser::parse(const UnicodeString& rules,
@@ -58,7 +109,16 @@ TransliterationRuleParser::parse(const UnicodeString& rules,
 TransliterationRuleParser::TransliterationRuleParser(
                                      const UnicodeString& theRules,
                                      RuleBasedTransliterator::Direction theDirection) :
-    rules(theRules), direction(theDirection), data(0) {}
+    rules(theRules), direction(theDirection), data(0) {
+    parseData = new ParseData(0, &setVariablesVector);
+}
+
+/**
+ * Destructor.  Deletes the ParseData allocated by the constructor.
+ */
+TransliterationRuleParser::~TransliterationRuleParser() {
+    delete parseData;
+}
 
 /**
  * Parse the given string as a sequence of rules, separated by newline
@@ -76,7 +136,9 @@ void TransliterationRuleParser::parseRules(void) {
     if (U_FAILURE(status)) {
         return;
     }
-    
+
+    parseData->data = data;
+    setVariablesVector.removeAllElements();
     determineVariableRange();
 
     int32_t pos = 0;
@@ -103,6 +165,18 @@ void TransliterationRuleParser::parseRules(void) {
         pos = parseRule(--pos, limit);                    
     }
     
+    // Convert the set vector to an array
+    data->setVariablesLength = setVariablesVector.size();
+    data->setVariables = new UnicodeSet*[data->setVariablesLength];
+    // orphanElement removes the given element and shifts all other
+    // elements down.  For performance (and code clarity) we work from
+    // the end back to index 0.
+    for (int32_t i=data->setVariablesLength; i>0; ) {
+        --i;
+        data->setVariables[i] =
+            (UnicodeSet*) setVariablesVector.orphanElementAt(i);
+    }
+
     // Index the rules
     if (U_SUCCESS(status)) {
         data->ruleSet.freeze(*data, status);
@@ -272,7 +346,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
             break;
         case SET_OPEN: {
             ParsePosition pp(pos-1); // Backup to opening '['
-            buf.append(registerSet(new UnicodeSet(rules, pp, data, status)));
+            buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
             if (U_FAILURE(status)) {
                 return syntaxError("Invalid set", rules, start);
             }
@@ -407,9 +481,8 @@ UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
         return 0;
     }
-    UChar c = variableNext++;
-    data->defineSet(c, adoptedSet, status);
-    return c;
+    setVariablesVector.addElement(adoptedSet);
+    return variableNext++;
 }
 
 /**
@@ -425,10 +498,10 @@ void TransliterationRuleParser::determineVariableRange(void) {
 
     UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
 
-    variableNext = variableLimit = (UChar) 0;
+    data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
     
     if (r != 0) {
-        variableNext = r->start;
+        data->setVariablesBase = variableNext = r->start;
         variableLimit = (UChar) (r->start + r->length);
         delete r;
     }
diff --git a/icu4c/source/i18n/rbt_pars.h b/icu4c/source/i18n/rbt_pars.h
index be6ad0b0ddf..1af299106fd 100644
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@@ -9,9 +9,11 @@
 #define RBT_PARS_H
 
 #include "unicode/rbt.h"
+#include "uvector.h"
 
 class TransliterationRuleData;
 class UnicodeSet;
+class ParseData;
 
 class TransliterationRuleParser {
 
@@ -31,6 +33,18 @@ class TransliterationRuleParser {
      */
     UErrorCode status;
 
+    /**
+     * Temporary symbol table used during parsing.
+     */
+    ParseData* parseData;
+
+    /**
+     * Temporary vector of set variables.  When parsing is complete, this
+     * is copied into the array data.setVariables.  As with data.setVariables,
+     * element 0 corresponds to character data.setVariablesBase.
+     */
+    UVector setVariablesVector;
+
     /**
      * The next available stand-in for variables.  This starts at some point in
      * the private use area (discovered dynamically) and increments up toward
@@ -82,6 +96,11 @@ private:
     TransliterationRuleParser(const UnicodeString& rules,
                               RuleBasedTransliterator::Direction direction);
 
+    /**
+     * Destructor.
+     */
+    ~TransliterationRuleParser();
+
     /**
      * Parse the given string as a sequence of rules, separated by newline
      * characters ('\n'), and cause this object to implement those rules.  Any
diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp
index cb8ea4a9610..d1758c8eca0 100644
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@@ -143,7 +143,7 @@ bool_t TransliterationRule::matchesIndexValue(uint8_t v,
         return TRUE;
     }
     UChar c = pattern.charAt(anteContextLength);
-    UnicodeSet* set = data.lookupSet(c);
+    const UnicodeSet* set = data.lookupSet(c);
     return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
 }
 
@@ -314,7 +314,7 @@ int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text,
 bool_t TransliterationRule::charMatches(UChar keyChar, UChar textChar,
                                         const TransliterationRuleData& data,
                                         const UnicodeFilter* filter) const {
-    UnicodeSet* set = 0;
+    const UnicodeSet* set = 0;
     return (filter == 0 || filter->contains(textChar)) &&
         (((set = data.lookupSet(keyChar)) == 0) ?
          keyChar == textChar : set->contains(textChar));
diff --git a/icu4c/source/i18n/symtable.h b/icu4c/source/i18n/symtable.h
new file mode 100644
index 00000000000..b9462edfde9
--- /dev/null
+++ b/icu4c/source/i18n/symtable.h
@@ -0,0 +1,29 @@
+/*
+**********************************************************************
+*   Copyright (c) 2000, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   02/04/00    aliu        Creation.
+**********************************************************************
+*/
+#ifndef SYMTABLE_H
+#define SYMTABLE_H
+
+/**
+ * An abstract class that maps variable names to characters and sets.
+ */
+class SymbolTable {
+public:
+    virtual ~SymbolTable() {} // polymorphic base: keep deletion safe
+
+    /**
+     * Look up the variable called name.  Return U_ILLEGAL_ARGUMENT_ERROR
+     * status if the name does not exist.  Otherwise c receives its
+     * character and set receives its set, or NULL if it has none.
+     */
+    virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
+                        UErrorCode& status) const = 0;
+};
+
+#endif
diff --git a/icu4c/source/i18n/unicode/uniset.h b/icu4c/source/i18n/unicode/uniset.h
index f5b2914a596..e0a102653a3 100644
--- a/icu4c/source/i18n/unicode/uniset.h
+++ b/icu4c/source/i18n/unicode/uniset.h
@@ -15,7 +15,7 @@
 #include "unicode/unistr.h"
 
 class ParsePosition;
-class TransliterationRuleData;
+class SymbolTable;
 class TransliterationRuleParser;
 class TransliterationRule;
 
@@ -557,7 +557,7 @@ private:
      * contains a syntax error.
      */
     UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
-               const TransliterationRuleData* data,
+               const SymbolTable& symbols,
                UErrorCode& status);
 
     /**
@@ -600,7 +600,7 @@ private:
     static UnicodeString& parse(UnicodeString& pairsBuf /*result*/,
                                 const UnicodeString& pattern,
                                 ParsePosition& pos,
-                                const TransliterationRuleData* data,
+                                const SymbolTable* symbols,
                                 UErrorCode& status);
 
     //----------------------------------------------------------------
diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp
index 4be900298e0..cb92ab833eb 100644
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@@ -10,7 +10,7 @@
 
 #include "unicode/uniset.h"
 #include "unicode/parsepos.h"
-#include "rbt_data.h"
+#include "symtable.h"
 
 // N.B.: This mapping is different in ICU and Java
 const UnicodeString UnicodeSet::CATEGORY_NAMES(
@@ -77,10 +77,11 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern,
     applyPattern(pattern, status);
 }
 
+// For internal use by RuleBasedTransliterator
 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
-                       const TransliterationRuleData* data,
+                       const SymbolTable& symbols,
                        UErrorCode& status) {
-    parse(pairs, pattern, pos, data, status);
+    parse(pairs, pattern, pos, &symbols, status);
 }
 
 /**
@@ -452,7 +453,7 @@ void UnicodeSet::clear(void) {
 UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
                                  const UnicodeString& pattern,
                                  ParsePosition& pos,
-                                 const TransliterationRuleData* data,
+                                 const SymbolTable* symbols,
                                  UErrorCode& status) {
     if (U_FAILURE(status)) {
         return pairsBuf;
@@ -583,9 +584,10 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
          * Variable names are only parsed if varNameToChar is not null.
          * Set variables are only looked up if varCharToSet is not null.
          */
-        else if (data != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
+        else if (symbols != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
             ++i;
             int32_t j = pattern.indexOf(VARIABLE_REF_CLOSE, i);
+            UnicodeSet* set = NULL;
             if (i == j || j < 0) { // empty or unterminated
                 // throw new IllegalArgumentException("Illegal variable reference");
                 status = U_ILLEGAL_ARGUMENT_ERROR;
@@ -593,7 +595,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
                 scratch.truncate(0);
                 pattern.extractBetween(i, j, scratch);
                 ++j;
-                c = data->lookupVariable(scratch, status);
+                symbols->lookup(scratch, c, set, status);
             }
             if (U_FAILURE(status)) {
                 // Either the reference was ill-formed (empty name, or no
@@ -602,7 +604,6 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
             }
             isLiteral = TRUE;
 
-            UnicodeSet* set = data->lookupSet(c);
             if (set != NULL) {
                 nestedPairs = &set->pairs;
             }
@@ -638,7 +639,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
             } else {
                 // Recurse to get the pairs for this nested set.
                 pos.setIndex(i);
-                nestedPairs = &parse(nestedAux, pattern, pos, data, status);
+                nestedPairs = &parse(nestedAux, pattern, pos, symbols, status);
                 if (U_FAILURE(status)) {
                     return pairsBuf;
                 }