From 7ce42e2f314c0660c5aa893e85f3269c14cfc506 Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Tue, 8 Feb 2000 02:49:15 +0000 Subject: [PATCH] ICU-265 map char to set with array instead of hash for better performance X-SVN-Rev: 728 --- icu4c/source/i18n/rbt_data.cpp | 38 ++----------- icu4c/source/i18n/rbt_data.h | 45 +++++++++------ icu4c/source/i18n/rbt_pars.cpp | 89 +++++++++++++++++++++++++++--- icu4c/source/i18n/rbt_pars.h | 19 +++++++ icu4c/source/i18n/rbt_rule.cpp | 4 +- icu4c/source/i18n/symtable.h | 29 ++++++++++ icu4c/source/i18n/unicode/uniset.h | 6 +- icu4c/source/i18n/uniset.cpp | 17 +++--- 8 files changed, 177 insertions(+), 70 deletions(-) create mode 100644 icu4c/source/i18n/symtable.h diff --git a/icu4c/source/i18n/rbt_data.cpp b/icu4c/source/i18n/rbt_data.cpp index 25f9a38ec20..de2d4212cbd 100644 --- a/icu4c/source/i18n/rbt_data.cpp +++ b/icu4c/source/i18n/rbt_data.cpp @@ -17,16 +17,15 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status) : return; } variableNames = uhash_open((UHashFunction)uhash_hashUString, &status); - setVariables = uhash_open(0, &status); + setVariables = 0; + setVariablesLength = 0; } TransliterationRuleData::~TransliterationRuleData() { if (variableNames != 0) { uhash_close(variableNames); } - if (setVariables != 0) { - uhash_close(setVariables); - } + delete[] setVariables; } void @@ -38,31 +37,6 @@ TransliterationRuleData::defineVariable(const UnicodeString& name, &status); } -void -TransliterationRuleData::defineVariable(const UnicodeString& name, - UChar standIn, - UnicodeSet* adoptedSet, - UErrorCode& status) { - defineVariable(name, standIn, status); - defineSet(standIn, adoptedSet, status); -} - -void -TransliterationRuleData::defineSet(UChar standIn, - UnicodeSet* adoptedSet, - UErrorCode& status) { - if (U_FAILURE(status)) { - return; - } - if (adoptedSet == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF), - adoptedSet, - 
&status); -} - UChar TransliterationRuleData::lookupVariable(const UnicodeString& name, UErrorCode& status) const { @@ -76,10 +50,10 @@ TransliterationRuleData::lookupVariable(const UnicodeString& name, return (UChar) (int32_t) value; } -UnicodeSet* +const UnicodeSet* TransliterationRuleData::lookupSet(UChar standIn) const { - void* value = uhash_get(setVariables, (int32_t) (standIn & 0x7FFFFFFF)); - return (UnicodeSet*) value; + int32_t i = standIn - setVariablesBase; + return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0; } bool_t diff --git a/icu4c/source/i18n/rbt_data.h b/icu4c/source/i18n/rbt_data.h index 7ba43fcb3e1..aa99ed8d546 100644 --- a/icu4c/source/i18n/rbt_data.h +++ b/icu4c/source/i18n/rbt_data.h @@ -20,6 +20,16 @@ struct UHashtable; * are essentially the parsed rules in compact, usable form. The * TRD objects themselves are held for the life of the process in * a static cache owned by Transliterator. + * + * This class' API is a little asymmetric. There is a method to + * define a variable, but no way to define a set. This is because the + * sets are defined by the parser in a UVector, and the vector is + * copied into a fixed-size array here. Once this is done, no new + * sets may be defined. In practice, there is no need to do so, since + * generating the data and using it are discrete phases. When there + * is a need to access the set data during the parse phase, another + * data structure handles this. See the parsing code for more + * details. */ class TransliterationRuleData { @@ -47,18 +57,28 @@ public: UHashtable* variableNames; /** - * Map category variable (UChar) to set (UnicodeSet). + * Map category variable (Character) to set (UnicodeSet). * Variables that correspond to a set of characters are mapped - * from variable name to a stand-in character in - * data.variableNames. The stand-in then serves as a key in - * this hash to lookup the actual UnicodeSet object. 
In - * addition, the stand-in is stored in the rule text to - * represent the set of characters. + * from variable name to a stand-in character in data.variableNames. + * The stand-in (minus setVariablesBase) is the index into this array + * of the actual UnicodeSet object. In addition, the stand-in is + * stored in the rule text to represent the set of characters. + * setVariables[i] represents character (setVariablesBase + i). * * PUBLIC DATA MEMBER for internal use by RBT */ - UHashtable* setVariables; + UnicodeSet** setVariables; + /** + * The character represented by setVariables[0]. + */ + UChar setVariablesBase; + + /** + * The length of setVariables. + */ + int32_t setVariablesLength; + TransliterationRuleData(UErrorCode& status); ~TransliterationRuleData(); @@ -67,19 +87,10 @@ public: UChar value, UErrorCode& status); - void defineVariable(const UnicodeString& name, - UChar standIn, - UnicodeSet* adoptedSet, - UErrorCode& status); - - void defineSet(UChar standIn, - UnicodeSet* adoptedSet, - UErrorCode& status); - UChar lookupVariable(const UnicodeString& name, UErrorCode& status) const; - UnicodeSet* lookupSet(UChar standIn) const; + const UnicodeSet* lookupSet(UChar standIn) const; bool_t isVariableDefined(const UnicodeString& name) const; }; diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp index ef20b261822..29306b329fa 100644 --- a/icu4c/source/i18n/rbt_pars.cpp +++ b/icu4c/source/i18n/rbt_pars.cpp @@ -15,6 +15,7 @@ #include "unicode/uniset.h" #include "cstring.h" #include "unicode/parsepos.h" +#include "symtable.h" // Operators const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '='; @@ -37,6 +38,56 @@ const UChar TransliterationRuleParser::SET_OPEN = '['; const UChar TransliterationRuleParser::SET_CLOSE = ']'; const UChar TransliterationRuleParser::CURSOR_POS = '|'; +//---------------------------------------------------------------------- +// BEGIN ParseData +//---------------------------------------------------------------------- 
+ +/** + * This class implements the SymbolTable interface. It is used + * during parsing to give UnicodeSet access to variables that + * have been defined so far. Note that it uses setVariablesVector, + * _not_ data.setVariables. + */ +class ParseData : public SymbolTable { +public: + const TransliterationRuleData* data; // alias + + const UVector* setVariablesVector; // alias + + ParseData(const TransliterationRuleData* data = 0, + const UVector* setVariablesVector = 0); + + /** + * Lookup the object associated with this string and return it. + * Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not + * exist. Return a non-NULL set if the name is mapped to a set; + * otherwise return a NULL set. + */ + virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set, + UErrorCode& status) const; +}; + +ParseData::ParseData(const TransliterationRuleData* d, + const UVector* sets) : + data(d), setVariablesVector(sets) {} + +/** + * Implement SymbolTable API. Look up a variable, storing its + * stand-in UChar in c and its UnicodeSet (or NULL) in set. + */ +void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set, + UErrorCode& status) const { + c = data->lookupVariable(name, status); + if (U_SUCCESS(status)) { + int32_t i = c - data->setVariablesBase; + set = (i < setVariablesVector->size()) ? 
+ (UnicodeSet*) setVariablesVector->elementAt(i) : 0; + } +} + +//---------------------------------------------------------------------- +// END ParseData +//---------------------------------------------------------------------- TransliterationRuleData* TransliterationRuleParser::parse(const UnicodeString& rules, @@ -58,7 +109,16 @@ TransliterationRuleParser::parse(const UnicodeString& rules, TransliterationRuleParser::TransliterationRuleParser( const UnicodeString& theRules, RuleBasedTransliterator::Direction theDirection) : - rules(theRules), direction(theDirection), data(0) {} + rules(theRules), direction(theDirection), data(0) { + parseData = new ParseData(0, &setVariablesVector); +} + +/** + * Destructor. + */ +TransliterationRuleParser::~TransliterationRuleParser() { + delete parseData; +} /** * Parse the given string as a sequence of rules, separated by newline @@ -76,7 +136,9 @@ void TransliterationRuleParser::parseRules(void) { if (U_FAILURE(status)) { return; } - + + parseData->data = data; + setVariablesVector.removeAllElements(); determineVariableRange(); int32_t pos = 0; @@ -103,6 +165,18 @@ void TransliterationRuleParser::parseRules(void) { pos = parseRule(--pos, limit); } + // Convert the set vector to an array + data->setVariablesLength = setVariablesVector.size(); + data->setVariables = new UnicodeSet*[data->setVariablesLength]; + // orphanElement removes the given element and shifts all other + // elements down. For performance (and code clarity) we work from + // the end back to index 0. 
+ for (int32_t i=data->setVariablesLength; i>0; ) { + --i; + data->setVariables[i] = + (UnicodeSet*) setVariablesVector.orphanElementAt(i); + } + // Index the rules if (U_SUCCESS(status)) { data->ruleSet.freeze(*data, status); @@ -272,7 +346,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) { break; case SET_OPEN: { ParsePosition pp(pos-1); // Backup to opening '[' - buf.append(registerSet(new UnicodeSet(rules, pp, data, status))); + buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status))); if (U_FAILURE(status)) { return syntaxError("Invalid set", rules, start); } @@ -407,9 +481,8 @@ UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) { status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } - UChar c = variableNext++; - data->defineSet(c, adoptedSet, status); - return c; + setVariablesVector.addElement(adoptedSet); + return variableNext++; } /** @@ -425,10 +498,10 @@ void TransliterationRuleParser::determineVariableRange(void) { UnicodeRange* r = privateUse.largestUnusedSubrange(rules); - variableNext = variableLimit = (UChar) 0; + data->setVariablesBase = variableNext = variableLimit = (UChar) 0; if (r != 0) { - variableNext = r->start; + data->setVariablesBase = variableNext = r->start; variableLimit = (UChar) (r->start + r->length); delete r; } diff --git a/icu4c/source/i18n/rbt_pars.h b/icu4c/source/i18n/rbt_pars.h index be6ad0b0ddf..1af299106fd 100644 --- a/icu4c/source/i18n/rbt_pars.h +++ b/icu4c/source/i18n/rbt_pars.h @@ -9,9 +9,11 @@ #define RBT_PARS_H #include "unicode/rbt.h" +#include "uvector.h" class TransliterationRuleData; class UnicodeSet; +class ParseData; class TransliterationRuleParser { @@ -31,6 +33,18 @@ class TransliterationRuleParser { */ UErrorCode status; + /** + * Temporary symbol table used during parsing. + */ + ParseData* parseData; + + /** + * Temporary vector of set variables. When parsing is complete, this + * is copied into the array data.setVariables. 
As with data.setVariables, + * element 0 corresponds to character data.setVariablesBase. + */ + UVector setVariablesVector; + /** * The next available stand-in for variables. This starts at some point in * the private use area (discovered dynamically) and increments up toward @@ -82,6 +96,11 @@ private: TransliterationRuleParser(const UnicodeString& rules, RuleBasedTransliterator::Direction direction); + /** + * Destructor. + */ + ~TransliterationRuleParser(); + /** * Parse the given string as a sequence of rules, separated by newline * characters ('\n'), and cause this object to implement those rules. Any diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp index cb8ea4a9610..d1758c8eca0 100644 --- a/icu4c/source/i18n/rbt_rule.cpp +++ b/icu4c/source/i18n/rbt_rule.cpp @@ -143,7 +143,7 @@ bool_t TransliterationRule::matchesIndexValue(uint8_t v, return TRUE; } UChar c = pattern.charAt(anteContextLength); - UnicodeSet* set = data.lookupSet(c); + const UnicodeSet* set = data.lookupSet(c); return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v); } @@ -314,7 +314,7 @@ int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text, bool_t TransliterationRule::charMatches(UChar keyChar, UChar textChar, const TransliterationRuleData& data, const UnicodeFilter* filter) const { - UnicodeSet* set = 0; + const UnicodeSet* set = 0; return (filter == 0 || filter->contains(textChar)) && (((set = data.lookupSet(keyChar)) == 0) ? keyChar == textChar : set->contains(textChar)); diff --git a/icu4c/source/i18n/symtable.h b/icu4c/source/i18n/symtable.h new file mode 100644 index 00000000000..b9462edfde9 --- /dev/null +++ b/icu4c/source/i18n/symtable.h @@ -0,0 +1,29 @@ +/* +********************************************************************** +* Copyright (c) 2000, International Business Machines +* Corporation and others. All Rights Reserved. 
+********************************************************************** +* Date Name Description +* 02/04/00 aliu Creation. +********************************************************************** +*/ +#ifndef SYMTABLE_H +#define SYMTABLE_H + +/** + * An abstract class that maps strings to objects. + */ +class SymbolTable { +public: + + /** + * Lookup the object associated with this string and return it. + * Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not + * exist. Return a non-NULL set if the name is mapped to a set; + * otherwise return a NULL set. + */ + virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set, + UErrorCode& status) const = 0; +}; + +#endif diff --git a/icu4c/source/i18n/unicode/uniset.h b/icu4c/source/i18n/unicode/uniset.h index f5b2914a596..e0a102653a3 100644 --- a/icu4c/source/i18n/unicode/uniset.h +++ b/icu4c/source/i18n/unicode/uniset.h @@ -15,7 +15,7 @@ #include "unicode/unistr.h" class ParsePosition; -class TransliterationRuleData; +class SymbolTable; class TransliterationRuleParser; class TransliterationRule; @@ -557,7 +557,7 @@ private: * contains a syntax error. 
*/ UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, - const TransliterationRuleData* data, + const SymbolTable& symbols, UErrorCode& status); /** @@ -600,7 +600,7 @@ private: static UnicodeString& parse(UnicodeString& pairsBuf /*result*/, const UnicodeString& pattern, ParsePosition& pos, - const TransliterationRuleData* data, + const SymbolTable* symbols, UErrorCode& status); //---------------------------------------------------------------- diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp index 4be900298e0..cb92ab833eb 100644 --- a/icu4c/source/i18n/uniset.cpp +++ b/icu4c/source/i18n/uniset.cpp @@ -10,7 +10,7 @@ #include "unicode/uniset.h" #include "unicode/parsepos.h" -#include "rbt_data.h" +#include "symtable.h" // N.B.: This mapping is different in ICU and Java const UnicodeString UnicodeSet::CATEGORY_NAMES( @@ -77,10 +77,11 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, applyPattern(pattern, status); } +// For internal use by RuleBasedTransliterator UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, - const TransliterationRuleData* data, + const SymbolTable& symbols, UErrorCode& status) { - parse(pairs, pattern, pos, data, status); + parse(pairs, pattern, pos, &symbols, status); } /** @@ -452,7 +453,7 @@ void UnicodeSet::clear(void) { UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/, const UnicodeString& pattern, ParsePosition& pos, - const TransliterationRuleData* data, + const SymbolTable* symbols, UErrorCode& status) { if (U_FAILURE(status)) { return pairsBuf; @@ -583,9 +584,10 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/, * Variable names are only parsed if varNameToChar is not null. * Set variables are only looked up if varCharToSet is not null. 
*/ - else if (data != NULL && !isLiteral && c == VARIABLE_REF_OPEN) { + else if (symbols != NULL && !isLiteral && c == VARIABLE_REF_OPEN) { ++i; int32_t j = pattern.indexOf(VARIABLE_REF_CLOSE, i); + UnicodeSet* set = NULL; if (i == j || j < 0) { // empty or unterminated // throw new IllegalArgumentException("Illegal variable reference"); status = U_ILLEGAL_ARGUMENT_ERROR; @@ -593,7 +595,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/, scratch.truncate(0); pattern.extractBetween(i, j, scratch); ++j; - c = data->lookupVariable(scratch, status); + symbols->lookup(scratch, c, set, status); } if (U_FAILURE(status)) { // Either the reference was ill-formed (empty name, or no @@ -602,7 +604,6 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/, } isLiteral = TRUE; - UnicodeSet* set = data->lookupSet(c); if (set != NULL) { nestedPairs = &set->pairs; } @@ -638,7 +639,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/, } else { // Recurse to get the pairs for this nested set. pos.setIndex(i); - nestedPairs = &parse(nestedAux, pattern, pos, data, status); + nestedPairs = &parse(nestedAux, pattern, pos, symbols, status); if (U_FAILURE(status)) { return pairsBuf; }