ICU-265 map char to set with array instead of hash for better performance

X-SVN-Rev: 728
2025-04-13 08:53:20 +00:00 · 2000-02-08 02:49:15 +00:00 · 2000-02-08 02:49:15 +00:00 · 7ce42e2f31
commit 7ce42e2f31
parent bf89e792e3
8 changed files with 177 additions and 70 deletions
--- a/icu4c/source/i18n/rbt_data.cpp
+++ b/icu4c/source/i18n/rbt_data.cpp
@ -17,16 +17,15 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
        return;
    }
    variableNames = uhash_open((UHashFunction)uhash_hashUString, &status);
-    setVariables = uhash_open(0, &status);
+    setVariables = 0;
+    setVariablesLength = 0;
 }

 TransliterationRuleData::~TransliterationRuleData() {
    if (variableNames != 0) {
        uhash_close(variableNames);
    }
-    if (setVariables != 0) {
-        uhash_close(setVariables);
-    }
+    delete[] setVariables;
 }

 void
@ -38,31 +37,6 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
                 &status);
 }

-void
-TransliterationRuleData::defineVariable(const UnicodeString& name,
-                                        UChar standIn,
-                                        UnicodeSet* adoptedSet,
-                                        UErrorCode& status) {
-    defineVariable(name, standIn, status);
-    defineSet(standIn, adoptedSet, status);
-}
-
-void
-TransliterationRuleData::defineSet(UChar standIn,
-                                   UnicodeSet* adoptedSet,
-                                   UErrorCode& status) {
-    if (U_FAILURE(status)) {
-        return;
-    }
-    if (adoptedSet == 0) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        return;
-    }
-    uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF),
-                 adoptedSet,
-                 &status);
-}
-
 UChar
 TransliterationRuleData::lookupVariable(const UnicodeString& name,
                                        UErrorCode& status) const {
@ -76,10 +50,10 @@ TransliterationRuleData::lookupVariable(const UnicodeString& name,
    return (UChar) (int32_t) value;
 }

-UnicodeSet*
+const UnicodeSet*
 TransliterationRuleData::lookupSet(UChar standIn) const {
-    void* value = uhash_get(setVariables, (int32_t) (standIn & 0x7FFFFFFF));
-    return (UnicodeSet*) value;
+    int32_t i = standIn - setVariablesBase;
+    return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
 }

 bool_t
--- a/icu4c/source/i18n/rbt_data.h
+++ b/icu4c/source/i18n/rbt_data.h
@ -20,6 +20,16 @@ struct UHashtable;
 * are essentially the parsed rules in compact, usable form.  The
 * TRD objects themselves are held for the life of the process in
 * a static cache owned by Transliterator.
+ *
+ * This class' API is a little asymmetric.  There is a method to
+ * define a variable, but no way to define a set.  This is because the
+ * sets are defined by the parser in a UVector, and the vector is
+ * copied into a fixed-size array here.  Once this is done, no new
+ * sets may be defined.  In practice, there is no need to do so, since
+ * generating the data and using it are discrete phases.  When there
+ * is a need to access the set data during the parse phase, another
+ * data structure handles this.  See the parsing code for more
+ * details.
 */
 class TransliterationRuleData {

@ -47,18 +57,28 @@ public:
    UHashtable* variableNames;
    
    /**
-     * Map category variable (UChar) to set (UnicodeSet).
+     * Map category variable (UChar) to set (UnicodeSet).
     * Variables that correspond to a set of characters are mapped
-     * from variable name to a stand-in character in
-     * data.variableNames.  The stand-in then serves as a key in
-     * this hash to lookup the actual UnicodeSet object.  In
-     * addition, the stand-in is stored in the rule text to
-     * represent the set of characters.
+     * from variable name to a stand-in character in data.variableNames.
+     * The stand-in, minus setVariablesBase, then serves as an index into
+     * this array to look up the actual UnicodeSet object.  In addition, the
+     * stand-in is stored in the rule text to represent the set of characters.
+     * setVariables[i] represents character (setVariablesBase + i).
     *
     * PUBLIC DATA MEMBER for internal use by RBT
     */
-    UHashtable* setVariables;
+    UnicodeSet** setVariables;
    
+    /**
+     * The character represented by setVariables[0].
+     */
+    UChar setVariablesBase;
+
+    /**
+     * The length of setVariables.
+     */
+    int32_t setVariablesLength;
+
    TransliterationRuleData(UErrorCode& status);

    ~TransliterationRuleData();
@ -67,19 +87,10 @@ public:
                        UChar value,
                        UErrorCode& status);
        
-    void defineVariable(const UnicodeString& name,
-                        UChar standIn,
-                        UnicodeSet* adoptedSet,
-                        UErrorCode& status);
-
-    void defineSet(UChar standIn,
-                   UnicodeSet* adoptedSet,
-                   UErrorCode& status);
-
    UChar lookupVariable(const UnicodeString& name,
                         UErrorCode& status) const;
    
-	UnicodeSet* lookupSet(UChar standIn) const;
+	const UnicodeSet* lookupSet(UChar standIn) const;

    bool_t isVariableDefined(const UnicodeString& name) const;
 };
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -15,6 +15,7 @@
 #include "unicode/uniset.h"
 #include "cstring.h"
 #include "unicode/parsepos.h"
+#include "symtable.h"

 // Operators
 const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
@ -37,6 +38,56 @@ const UChar TransliterationRuleParser::SET_OPEN = '[';
 const UChar TransliterationRuleParser::SET_CLOSE = ']';
 const UChar TransliterationRuleParser::CURSOR_POS = '|';

+//----------------------------------------------------------------------
+// BEGIN ParseData
+//----------------------------------------------------------------------
+
+/**
+ * SymbolTable implementation used while rules are being parsed, so
+ * that UnicodeSet can resolve the variables defined so far.  Sets are
+ * read from setVariablesVector, _not_ data.setVariables, because the
+ * array in data is only filled in at the end of parseRules().
+ */
+class ParseData : public SymbolTable {
+public:
+    const TransliterationRuleData* data; // alias; maps names to characters
+
+    const UVector* setVariablesVector; // alias; sets registered so far
+
+    ParseData(const TransliterationRuleData* data = 0,
+              const UVector* setVariablesVector = 0);
+
+    /**
+     * Look up the variable with the given name.  The status receives
+     * whatever data->lookupVariable() reports for the name.  On
+     * success, c is the character the name maps to and set is the set
+     * registered for that character, or NULL if there is none.
+     */
+    virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
+                        UErrorCode& status) const;
+};
+
+ParseData::ParseData(const TransliterationRuleData* d,
+                     const UVector* sets) :
+    data(d), setVariablesVector(sets) {}
+
+/**
+ * Implement SymbolTable API.  On success, c is the variable's character
+ * and set is its UnicodeSet, or NULL if c is not a set stand-in.
+ */
+void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
+                       UErrorCode& status) const {
+    c = data->lookupVariable(name, status);
+    // c below setVariablesBase (i < 0) is an ordinary character, not a set.
+    // set is always assigned, so callers never see it unset on failure.
+    int32_t i = c - data->setVariablesBase;
+    set = (U_SUCCESS(status) && i >= 0 && i < setVariablesVector->size()) ?
+        (UnicodeSet*) setVariablesVector->elementAt(i) : 0;
+}
+
+//----------------------------------------------------------------------
+// END ParseData
+//----------------------------------------------------------------------

 TransliterationRuleData*
 TransliterationRuleParser::parse(const UnicodeString& rules,
@ -58,7 +109,16 @@ TransliterationRuleParser::parse(const UnicodeString& rules,
 TransliterationRuleParser::TransliterationRuleParser(
                                     const UnicodeString& theRules,
                                     RuleBasedTransliterator::Direction theDirection) :
-    rules(theRules), direction(theDirection), data(0) {}
+    rules(theRules), direction(theDirection), data(0) {
+    parseData = new ParseData(0, &setVariablesVector);
+}
+
+/**
+ * Destructor.  Frees the ParseData allocated by the constructor.
+ */
+TransliterationRuleParser::~TransliterationRuleParser() {
+    delete parseData;
+}

 /**
 * Parse the given string as a sequence of rules, separated by newline
@ -76,7 +136,9 @@ void TransliterationRuleParser::parseRules(void) {
    if (U_FAILURE(status)) {
        return;
    }
-    
+
+    parseData->data = data;
+    setVariablesVector.removeAllElements();
    determineVariableRange();

    int32_t pos = 0;
@ -103,6 +165,18 @@ void TransliterationRuleParser::parseRules(void) {
        pos = parseRule(--pos, limit);                    
    }
    
+    // Convert the set vector to an array
+    data->setVariablesLength = setVariablesVector.size();
+    data->setVariables = new UnicodeSet*[data->setVariablesLength];
+    // orphanElementAt removes the given element and shifts all other
+    // elements down.  For performance (and code clarity) we work from
+    // the end back to index 0.
+    for (int32_t i=data->setVariablesLength; i>0; ) {
+        --i;
+        data->setVariables[i] =
+            (UnicodeSet*) setVariablesVector.orphanElementAt(i);
+    }
+
    // Index the rules
    if (U_SUCCESS(status)) {
        data->ruleSet.freeze(*data, status);
@ -272,7 +346,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
            break;
        case SET_OPEN: {
            ParsePosition pp(pos-1); // Backup to opening '['
-            buf.append(registerSet(new UnicodeSet(rules, pp, data, status)));
+            buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
            if (U_FAILURE(status)) {
                return syntaxError("Invalid set", rules, start);
            }
@ -407,9 +481,8 @@ UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
-    UChar c = variableNext++;
-    data->defineSet(c, adoptedSet, status);
-    return c;
+    setVariablesVector.addElement(adoptedSet);
+    return variableNext++;
 }

 /**
@ -425,10 +498,10 @@ void TransliterationRuleParser::determineVariableRange(void) {

    UnicodeRange* r = privateUse.largestUnusedSubrange(rules);

-    variableNext = variableLimit = (UChar) 0;
+    data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
    
    if (r != 0) {
-        variableNext = r->start;
+        data->setVariablesBase = variableNext = r->start;
        variableLimit = (UChar) (r->start + r->length);
        delete r;
    }
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@ -9,9 +9,11 @@
 #define RBT_PARS_H

 #include "unicode/rbt.h"
+#include "uvector.h"

 class TransliterationRuleData;
 class UnicodeSet;
+class ParseData;

 class TransliterationRuleParser {

@ -31,6 +33,18 @@ class TransliterationRuleParser {
     */
    UErrorCode status;

+    /**
+     * Temporary symbol table used during parsing.
+     */
+    ParseData* parseData;
+
+    /**
+     * Temporary vector of set variables.  When parsing is complete, this
+     * is copied into the array data.setVariables.  As with data.setVariables,
+     * element 0 corresponds to character data.setVariablesBase.
+     */
+    UVector setVariablesVector;
+
    /**
     * The next available stand-in for variables.  This starts at some point in
     * the private use area (discovered dynamically) and increments up toward
@ -82,6 +96,11 @@ private:
    TransliterationRuleParser(const UnicodeString& rules,
                              RuleBasedTransliterator::Direction direction);

+    /**
+     * Destructor.
+     */
+    ~TransliterationRuleParser();
+
    /**
     * Parse the given string as a sequence of rules, separated by newline
     * characters ('\n'), and cause this object to implement those rules.  Any
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@ -143,7 +143,7 @@ bool_t TransliterationRule::matchesIndexValue(uint8_t v,
        return TRUE;
    }
    UChar c = pattern.charAt(anteContextLength);
-    UnicodeSet* set = data.lookupSet(c);
+    const UnicodeSet* set = data.lookupSet(c);
    return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
 }

@ -314,7 +314,7 @@ int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text,
 bool_t TransliterationRule::charMatches(UChar keyChar, UChar textChar,
                                        const TransliterationRuleData& data,
                                        const UnicodeFilter* filter) const {
-    UnicodeSet* set = 0;
+    const UnicodeSet* set = 0;
    return (filter == 0 || filter->contains(textChar)) &&
        (((set = data.lookupSet(keyChar)) == 0) ?
         keyChar == textChar : set->contains(textChar));
--- a/icu4c/source/i18n/symtable.h
+++ b/icu4c/source/i18n/symtable.h
@ -0,0 +1,29 @@
+/*
+**********************************************************************
+*   Copyright (c) 2000, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   02/04/00    aliu        Creation.
+**********************************************************************
+*/
+#ifndef SYMTABLE_H
+#define SYMTABLE_H
+
+/**
+ * An abstract class that maps strings to objects.
+ */
+class SymbolTable {
+public:
+    virtual ~SymbolTable() {} // allows safe deletion through a SymbolTable*
+
+    /**
+     * Look up name.  Sets status to U_ILLEGAL_ARGUMENT_ERROR if the name does
+     * not exist.  Otherwise c receives the character the name maps to, and
+     * set receives the mapped set, or NULL if the name is not mapped to a set.
+     */
+    virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
+                        UErrorCode& status) const = 0;
+};
+
+#endif
--- a/icu4c/source/i18n/unicode/uniset.h
+++ b/icu4c/source/i18n/unicode/uniset.h
@ -15,7 +15,7 @@
 #include "unicode/unistr.h"

 class ParsePosition;
-class TransliterationRuleData;
+class SymbolTable;
 class TransliterationRuleParser;
 class TransliterationRule;

@ -557,7 +557,7 @@ private:
     * contains a syntax error.
     */
    UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
-               const TransliterationRuleData* data,
+               const SymbolTable& symbols,
               UErrorCode& status);

    /**
@ -600,7 +600,7 @@ private:
    static UnicodeString& parse(UnicodeString& pairsBuf /*result*/,
                                const UnicodeString& pattern,
                                ParsePosition& pos,
-                                const TransliterationRuleData* data,
+                                const SymbolTable* symbols,
                                UErrorCode& status);

    //----------------------------------------------------------------
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@ -10,7 +10,7 @@

 #include "unicode/uniset.h"
 #include "unicode/parsepos.h"
-#include "rbt_data.h"
+#include "symtable.h"

 // N.B.: This mapping is different in ICU and Java
 const UnicodeString UnicodeSet::CATEGORY_NAMES(
@ -77,10 +77,11 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern,
    applyPattern(pattern, status);
 }

+// For internal use by RuleBasedTransliterator
 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
-                       const TransliterationRuleData* data,
+                       const SymbolTable& symbols,
                       UErrorCode& status) {
-    parse(pairs, pattern, pos, data, status);
+    parse(pairs, pattern, pos, &symbols, status);
 }

 /**
@ -452,7 +453,7 @@ void UnicodeSet::clear(void) {
 UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
                                 const UnicodeString& pattern,
                                 ParsePosition& pos,
-                                 const TransliterationRuleData* data,
+                                 const SymbolTable* symbols,
                                 UErrorCode& status) {
    if (U_FAILURE(status)) {
        return pairsBuf;
@ -583,9 +584,10 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
         * Variable names are only parsed if varNameToChar is not null.
         * Set variables are only looked up if varCharToSet is not null.
         */
-        else if (data != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
+        else if (symbols != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
            ++i;
            int32_t j = pattern.indexOf(VARIABLE_REF_CLOSE, i);
+            UnicodeSet* set = NULL;
            if (i == j || j < 0) { // empty or unterminated
                // throw new IllegalArgumentException("Illegal variable reference");
                status = U_ILLEGAL_ARGUMENT_ERROR;
@ -593,7 +595,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
                scratch.truncate(0);
                pattern.extractBetween(i, j, scratch);
                ++j;
-                c = data->lookupVariable(scratch, status);
+                symbols->lookup(scratch, c, set, status);
            }
            if (U_FAILURE(status)) {
                // Either the reference was ill-formed (empty name, or no
@ -602,7 +604,6 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
            }
            isLiteral = TRUE;

-            UnicodeSet* set = data->lookupSet(c);
            if (set != NULL) {
                nestedPairs = &set->pairs;
            }
@ -638,7 +639,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
            } else {
                // Recurse to get the pairs for this nested set.
                pos.setIndex(i);
-                nestedPairs = &parse(nestedAux, pattern, pos, data, status);
+                nestedPairs = &parse(nestedAux, pattern, pos, symbols, status);
                if (U_FAILURE(status)) {
                    return pairsBuf;
                }