ICU-265 map char to set with array instead of hash for better performance

X-SVN-Rev: 728
This commit is contained in:
Alan Liu 2000-02-08 02:49:15 +00:00
parent bf89e792e3
commit 7ce42e2f31
8 changed files with 177 additions and 70 deletions

View file

@ -17,16 +17,15 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
return;
}
variableNames = uhash_open((UHashFunction)uhash_hashUString, &status);
setVariables = uhash_open(0, &status);
setVariables = 0;
setVariablesLength = 0;
}
TransliterationRuleData::~TransliterationRuleData() {
if (variableNames != 0) {
uhash_close(variableNames);
}
if (setVariables != 0) {
uhash_close(setVariables);
}
delete[] setVariables;
}
void
@ -38,31 +37,6 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
&status);
}
void
TransliterationRuleData::defineVariable(const UnicodeString& name,
UChar standIn,
UnicodeSet* adoptedSet,
UErrorCode& status) {
defineVariable(name, standIn, status);
defineSet(standIn, adoptedSet, status);
}
void
TransliterationRuleData::defineSet(UChar standIn,
UnicodeSet* adoptedSet,
UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
if (adoptedSet == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF),
adoptedSet,
&status);
}
UChar
TransliterationRuleData::lookupVariable(const UnicodeString& name,
UErrorCode& status) const {
@ -76,10 +50,10 @@ TransliterationRuleData::lookupVariable(const UnicodeString& name,
return (UChar) (int32_t) value;
}
UnicodeSet*
const UnicodeSet*
TransliterationRuleData::lookupSet(UChar standIn) const {
void* value = uhash_get(setVariables, (int32_t) (standIn & 0x7FFFFFFF));
return (UnicodeSet*) value;
int32_t i = standIn - setVariablesBase;
return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
}
bool_t

View file

@ -20,6 +20,16 @@ struct UHashtable;
* are essentially the parsed rules in compact, usable form. The
* TRD objects themselves are held for the life of the process in
* a static cache owned by Transliterator.
*
* This class' API is a little asymmetric. There is a method to
* define a variable, but no way to define a set. This is because the
* sets are defined by the parser in a UVector, and the vector is
* copied into a fixed-size array here. Once this is done, no new
* sets may be defined. In practice, there is no need to do so, since
* generating the data and using it are discrete phases. When there
* is a need to access the set data during the parse phase, another
* data structure handles this. See the parsing code for more
* details.
*/
class TransliterationRuleData {
@ -47,18 +57,28 @@ public:
UHashtable* variableNames;
/**
* Map category variable (UChar) to set (UnicodeSet).
* Map category variable (Character) to set (UnicodeSet).
* Variables that correspond to a set of characters are mapped
* from variable name to a stand-in character in
* data.variableNames. The stand-in then serves as a key in
* this hash to lookup the actual UnicodeSet object. In
* addition, the stand-in is stored in the rule text to
* represent the set of characters.
* from variable name to a stand-in character in data.variableNames.
* The stand-in, less setVariablesBase, then serves as an index into
* this array to look up the actual UnicodeSet object. In addition,
* the stand-in is stored in the rule text to represent the set of
* characters.
* setVariables[i] represents character (setVariablesBase + i).
*
* PUBLIC DATA MEMBER for internal use by RBT
*/
UHashtable* setVariables;
UnicodeSet** setVariables;
/**
* The character represented by setVariables[0].
*/
UChar setVariablesBase;
/**
* The length of setVariables.
*/
int32_t setVariablesLength;
TransliterationRuleData(UErrorCode& status);
~TransliterationRuleData();
@ -67,19 +87,10 @@ public:
UChar value,
UErrorCode& status);
void defineVariable(const UnicodeString& name,
UChar standIn,
UnicodeSet* adoptedSet,
UErrorCode& status);
void defineSet(UChar standIn,
UnicodeSet* adoptedSet,
UErrorCode& status);
UChar lookupVariable(const UnicodeString& name,
UErrorCode& status) const;
UnicodeSet* lookupSet(UChar standIn) const;
const UnicodeSet* lookupSet(UChar standIn) const;
bool_t isVariableDefined(const UnicodeString& name) const;
};

View file

@ -15,6 +15,7 @@
#include "unicode/uniset.h"
#include "cstring.h"
#include "unicode/parsepos.h"
#include "symtable.h"
// Operators
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
@ -37,6 +38,56 @@ const UChar TransliterationRuleParser::SET_OPEN = '[';
const UChar TransliterationRuleParser::SET_CLOSE = ']';
const UChar TransliterationRuleParser::CURSOR_POS = '|';
//----------------------------------------------------------------------
// BEGIN ParseData
//----------------------------------------------------------------------
/**
 * This class implements the SymbolTable interface.  It is used
 * during parsing to give UnicodeSet access to variables that
 * have been defined so far.  Note that it uses setVariablesVector,
 * _not_ data.setVariables.
 *
 * Both pointer members are aliases: this class declares no
 * destructor and never deletes them.  lookup() dereferences both
 * without a null check, so they must be non-null before lookup()
 * is called.
 */
class ParseData : public SymbolTable {
public:
    /** Rule data used to map a variable name to its character. */
    const TransliterationRuleData* data; // alias

    /** Sets registered so far; consulted instead of data.setVariables. */
    const UVector* setVariablesVector; // alias

    /**
     * Construct a symbol table over the given rule data and set
     * vector.  Either pointer may be left null and assigned later
     * through the public members.
     *
     * Declared explicit so that a bare TransliterationRuleData* is
     * never silently converted into a temporary ParseData.
     */
    explicit ParseData(const TransliterationRuleData* data = 0,
                       const UVector* setVariablesVector = 0);

    /**
     * Lookup the object associated with this string and return it.
     * Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
     * exist.  Return a non-NULL set if the name is mapped to a set;
     * otherwise return a NULL set.
     */
    virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
                        UErrorCode& status) const;
};
// Record both aliases as given; nothing is copied or adopted.
ParseData::ParseData(const TransliterationRuleData* d,
                     const UVector* sets)
    : data(d),
      setVariablesVector(sets)
{
}
/**
 * Implement SymbolTable API.  Look up a variable by name.
 *
 * @param name   the variable name to look up
 * @param c      receives the value returned by
 *               data->lookupVariable(name, status)
 * @param set    receives the element of setVariablesVector that
 *               corresponds to c, or NULL if the lookup failed or c
 *               does not index an element of that vector
 * @param status passed through to data->lookupVariable(); the set is
 *               only looked up when it reports success
 */
void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
                       UErrorCode& status) const {
    // Give the caller a defined (NULL) set on every path, including
    // failure, as the SymbolTable contract describes.
    set = 0;

    c = data->lookupVariable(name, status);
    if (U_SUCCESS(status)) {
        // Element i of the vector corresponds to the character
        // (setVariablesBase + i).  A variable whose value lies below
        // the base yields a negative i, so check both bounds, as
        // TransliterationRuleData::lookupSet() does.
        int32_t i = c - data->setVariablesBase;
        if (i >= 0 && i < setVariablesVector->size()) {
            set = (UnicodeSet*) setVariablesVector->elementAt(i);
        }
    }
}
//----------------------------------------------------------------------
// END ParseData
//----------------------------------------------------------------------
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
@ -58,7 +109,16 @@ TransliterationRuleParser::parse(const UnicodeString& rules,
TransliterationRuleParser::TransliterationRuleParser(
const UnicodeString& theRules,
RuleBasedTransliterator::Direction theDirection) :
rules(theRules), direction(theDirection), data(0) {}
rules(theRules), direction(theDirection), data(0) {
parseData = new ParseData(0, &setVariablesVector);
}
/**
 * Destructor.  Frees the temporary symbol table held in parseData.
 */
TransliterationRuleParser::~TransliterationRuleParser()
{
    delete parseData;
}
/**
* Parse the given string as a sequence of rules, separated by newline
@ -76,7 +136,9 @@ void TransliterationRuleParser::parseRules(void) {
if (U_FAILURE(status)) {
return;
}
parseData->data = data;
setVariablesVector.removeAllElements();
determineVariableRange();
int32_t pos = 0;
@ -103,6 +165,18 @@ void TransliterationRuleParser::parseRules(void) {
pos = parseRule(--pos, limit);
}
// Convert the set vector to an array
data->setVariablesLength = setVariablesVector.size();
data->setVariables = new UnicodeSet*[data->setVariablesLength];
// orphanElementAt removes the element at the given index and shifts
// the remaining elements down. For performance (and code clarity) we
// work from the end back to index 0.
for (int32_t i=data->setVariablesLength; i>0; ) {
--i;
data->setVariables[i] =
(UnicodeSet*) setVariablesVector.orphanElementAt(i);
}
// Index the rules
if (U_SUCCESS(status)) {
data->ruleSet.freeze(*data, status);
@ -272,7 +346,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
break;
case SET_OPEN: {
ParsePosition pp(pos-1); // Backup to opening '['
buf.append(registerSet(new UnicodeSet(rules, pp, data, status)));
buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
if (U_FAILURE(status)) {
return syntaxError("Invalid set", rules, start);
}
@ -407,9 +481,8 @@ UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
UChar c = variableNext++;
data->defineSet(c, adoptedSet, status);
return c;
setVariablesVector.addElement(adoptedSet);
return variableNext++;
}
/**
@ -425,10 +498,10 @@ void TransliterationRuleParser::determineVariableRange(void) {
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
variableNext = variableLimit = (UChar) 0;
data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
if (r != 0) {
variableNext = r->start;
data->setVariablesBase = variableNext = r->start;
variableLimit = (UChar) (r->start + r->length);
delete r;
}

View file

@ -9,9 +9,11 @@
#define RBT_PARS_H
#include "unicode/rbt.h"
#include "uvector.h"
class TransliterationRuleData;
class UnicodeSet;
class ParseData;
class TransliterationRuleParser {
@ -31,6 +33,18 @@ class TransliterationRuleParser {
*/
UErrorCode status;
/**
* Temporary symbol table used during parsing.
*/
ParseData* parseData;
/**
* Temporary vector of set variables. When parsing is complete, this
* is copied into the array data.setVariables. As with data.setVariables,
* element 0 corresponds to character data.setVariablesBase.
*/
UVector setVariablesVector;
/**
* The next available stand-in for variables. This starts at some point in
* the private use area (discovered dynamically) and increments up toward
@ -82,6 +96,11 @@ private:
TransliterationRuleParser(const UnicodeString& rules,
RuleBasedTransliterator::Direction direction);
/**
* Destructor.
*/
~TransliterationRuleParser();
/**
* Parse the given string as a sequence of rules, separated by newline
* characters ('\n'), and cause this object to implement those rules. Any

View file

@ -143,7 +143,7 @@ bool_t TransliterationRule::matchesIndexValue(uint8_t v,
return TRUE;
}
UChar c = pattern.charAt(anteContextLength);
UnicodeSet* set = data.lookupSet(c);
const UnicodeSet* set = data.lookupSet(c);
return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
}
@ -314,7 +314,7 @@ int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text,
bool_t TransliterationRule::charMatches(UChar keyChar, UChar textChar,
const TransliterationRuleData& data,
const UnicodeFilter* filter) const {
UnicodeSet* set = 0;
const UnicodeSet* set = 0;
return (filter == 0 || filter->contains(textChar)) &&
(((set = data.lookupSet(keyChar)) == 0) ?
keyChar == textChar : set->contains(textChar));

View file

@ -0,0 +1,29 @@
/*
**********************************************************************
* Copyright (c) 2000, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 02/04/00 aliu Creation.
**********************************************************************
*/
#ifndef SYMTABLE_H
#define SYMTABLE_H
/**
 * An abstract class that maps strings to objects.
 */
class SymbolTable {
public:
    /**
     * Virtual destructor, so that an implementation can be deleted
     * safely through a SymbolTable pointer.
     */
    virtual ~SymbolTable() {}

    /**
     * Lookup the object associated with this string and return it.
     * Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
     * exist.  Return a non-NULL set if the name is mapped to a set;
     * otherwise return a NULL set.
     *
     * @param name   the string to look up
     * @param c      receives the character associated with name
     * @param set    receives the set associated with name, or NULL
     * @param status receives the error code, as described above
     */
    virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
                        UErrorCode& status) const = 0;
};
#endif

View file

@ -15,7 +15,7 @@
#include "unicode/unistr.h"
class ParsePosition;
class TransliterationRuleData;
class SymbolTable;
class TransliterationRuleParser;
class TransliterationRule;
@ -557,7 +557,7 @@ private:
* contains a syntax error.
*/
UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
const TransliterationRuleData* data,
const SymbolTable& symbols,
UErrorCode& status);
/**
@ -600,7 +600,7 @@ private:
static UnicodeString& parse(UnicodeString& pairsBuf /*result*/,
const UnicodeString& pattern,
ParsePosition& pos,
const TransliterationRuleData* data,
const SymbolTable* symbols,
UErrorCode& status);
//----------------------------------------------------------------

View file

@ -10,7 +10,7 @@
#include "unicode/uniset.h"
#include "unicode/parsepos.h"
#include "rbt_data.h"
#include "symtable.h"
// N.B.: This mapping is different in ICU and Java
const UnicodeString UnicodeSet::CATEGORY_NAMES(
@ -77,10 +77,11 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern,
applyPattern(pattern, status);
}
// For internal use by RuleBasedTransliterator
UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
const TransliterationRuleData* data,
const SymbolTable& symbols,
UErrorCode& status) {
parse(pairs, pattern, pos, data, status);
parse(pairs, pattern, pos, &symbols, status);
}
/**
@ -452,7 +453,7 @@ void UnicodeSet::clear(void) {
UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
const UnicodeString& pattern,
ParsePosition& pos,
const TransliterationRuleData* data,
const SymbolTable* symbols,
UErrorCode& status) {
if (U_FAILURE(status)) {
return pairsBuf;
@ -583,9 +584,10 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
* Variable names are only parsed, and set variables only looked
* up, if symbols is not null.
*/
else if (data != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
else if (symbols != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
++i;
int32_t j = pattern.indexOf(VARIABLE_REF_CLOSE, i);
UnicodeSet* set = NULL;
if (i == j || j < 0) { // empty or unterminated
// throw new IllegalArgumentException("Illegal variable reference");
status = U_ILLEGAL_ARGUMENT_ERROR;
@ -593,7 +595,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
scratch.truncate(0);
pattern.extractBetween(i, j, scratch);
++j;
c = data->lookupVariable(scratch, status);
symbols->lookup(scratch, c, set, status);
}
if (U_FAILURE(status)) {
// Either the reference was ill-formed (empty name, or no
@ -602,7 +604,6 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
}
isLiteral = TRUE;
UnicodeSet* set = data->lookupSet(c);
if (set != NULL) {
nestedPairs = &set->pairs;
}
@ -638,7 +639,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
} else {
// Recurse to get the pairs for this nested set.
pos.setIndex(i);
nestedPairs = &parse(nestedAux, pattern, pos, data, status);
nestedPairs = &parse(nestedAux, pattern, pos, symbols, status);
if (U_FAILURE(status)) {
return pairsBuf;
}