mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-265 map char to set with array instead of hash for better performance
X-SVN-Rev: 728
This commit is contained in:
parent
bf89e792e3
commit
7ce42e2f31
8 changed files with 177 additions and 70 deletions
|
@ -17,16 +17,15 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
|
|||
return;
|
||||
}
|
||||
variableNames = uhash_open((UHashFunction)uhash_hashUString, &status);
|
||||
setVariables = uhash_open(0, &status);
|
||||
setVariables = 0;
|
||||
setVariablesLength = 0;
|
||||
}
|
||||
|
||||
TransliterationRuleData::~TransliterationRuleData() {
|
||||
if (variableNames != 0) {
|
||||
uhash_close(variableNames);
|
||||
}
|
||||
if (setVariables != 0) {
|
||||
uhash_close(setVariables);
|
||||
}
|
||||
delete[] setVariables;
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -38,31 +37,6 @@ TransliterationRuleData::defineVariable(const UnicodeString& name,
|
|||
&status);
|
||||
}
|
||||
|
||||
void
|
||||
TransliterationRuleData::defineVariable(const UnicodeString& name,
|
||||
UChar standIn,
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status) {
|
||||
defineVariable(name, standIn, status);
|
||||
defineSet(standIn, adoptedSet, status);
|
||||
}
|
||||
|
||||
void
|
||||
TransliterationRuleData::defineSet(UChar standIn,
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (adoptedSet == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF),
|
||||
adoptedSet,
|
||||
&status);
|
||||
}
|
||||
|
||||
UChar
|
||||
TransliterationRuleData::lookupVariable(const UnicodeString& name,
|
||||
UErrorCode& status) const {
|
||||
|
@ -76,10 +50,10 @@ TransliterationRuleData::lookupVariable(const UnicodeString& name,
|
|||
return (UChar) (int32_t) value;
|
||||
}
|
||||
|
||||
UnicodeSet*
|
||||
const UnicodeSet*
|
||||
TransliterationRuleData::lookupSet(UChar standIn) const {
|
||||
void* value = uhash_get(setVariables, (int32_t) (standIn & 0x7FFFFFFF));
|
||||
return (UnicodeSet*) value;
|
||||
int32_t i = standIn - setVariablesBase;
|
||||
return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
|
||||
}
|
||||
|
||||
bool_t
|
||||
|
|
|
@ -20,6 +20,16 @@ struct UHashtable;
|
|||
* are essentially the parsed rules in compact, usable form. The
|
||||
* TRD objects themselves are held for the life of the process in
|
||||
* a static cache owned by Transliterator.
|
||||
*
|
||||
* This class' API is a little asymmetric. There is a method to
|
||||
* define a variable, but no way to define a set. This is because the
|
||||
* sets are defined by the parser in a UVector, and the vector is
|
||||
* copied into a fixed-size array here. Once this is done, no new
|
||||
* sets may be defined. In practice, there is no need to do so, since
|
||||
* generating the data and using it are discrete phases. When there
|
||||
* is a need to access the set data during the parse phase, another
|
||||
* data structure handles this. See the parsing code for more
|
||||
* details.
|
||||
*/
|
||||
class TransliterationRuleData {
|
||||
|
||||
|
@ -47,18 +57,28 @@ public:
|
|||
UHashtable* variableNames;
|
||||
|
||||
/**
|
||||
* Map category variable (UChar) to set (UnicodeSet).
|
||||
* Map category variable (UChar) to set (UnicodeSet).
|
||||
* Variables that correspond to a set of characters are mapped
|
||||
* from variable name to a stand-in character in
|
||||
* data.variableNames. The stand-in then serves as a key in
|
||||
* this hash to lookup the actual UnicodeSet object. In
|
||||
* addition, the stand-in is stored in the rule text to
|
||||
* represent the set of characters.
|
||||
* from variable name to a stand-in character in data.variableNames.
|
||||
* The stand-in then serves as an index into this array to look up the
|
||||
* actual UnicodeSet object. In addition, the stand-in is
|
||||
* stored in the rule text to represent the set of characters.
|
||||
* setVariables[i] represents character (setVariablesBase + i).
|
||||
*
|
||||
* PUBLIC DATA MEMBER for internal use by RBT
|
||||
*/
|
||||
UHashtable* setVariables;
|
||||
UnicodeSet** setVariables;
|
||||
|
||||
/**
|
||||
* The character represented by setVariables[0].
|
||||
*/
|
||||
UChar setVariablesBase;
|
||||
|
||||
/**
|
||||
* The length of setVariables.
|
||||
*/
|
||||
int32_t setVariablesLength;
|
||||
|
||||
TransliterationRuleData(UErrorCode& status);
|
||||
|
||||
~TransliterationRuleData();
|
||||
|
@ -67,19 +87,10 @@ public:
|
|||
UChar value,
|
||||
UErrorCode& status);
|
||||
|
||||
void defineVariable(const UnicodeString& name,
|
||||
UChar standIn,
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status);
|
||||
|
||||
void defineSet(UChar standIn,
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status);
|
||||
|
||||
UChar lookupVariable(const UnicodeString& name,
|
||||
UErrorCode& status) const;
|
||||
|
||||
UnicodeSet* lookupSet(UChar standIn) const;
|
||||
const UnicodeSet* lookupSet(UChar standIn) const;
|
||||
|
||||
bool_t isVariableDefined(const UnicodeString& name) const;
|
||||
};
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "unicode/uniset.h"
|
||||
#include "cstring.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "symtable.h"
|
||||
|
||||
// Operators
|
||||
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
|
||||
|
@ -37,6 +38,56 @@ const UChar TransliterationRuleParser::SET_OPEN = '[';
|
|||
const UChar TransliterationRuleParser::SET_CLOSE = ']';
|
||||
const UChar TransliterationRuleParser::CURSOR_POS = '|';
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// BEGIN ParseData
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* This class implements the SymbolTable interface. It is used
|
||||
* during parsing to give UnicodeSet access to variables that
|
||||
* have been defined so far. Note that it uses setVariablesVector,
|
||||
* _not_ data.setVariables.
|
||||
*/
|
||||
class ParseData : public SymbolTable {
|
||||
public:
|
||||
const TransliterationRuleData* data; // alias
|
||||
|
||||
const UVector* setVariablesVector; // alias
|
||||
|
||||
ParseData(const TransliterationRuleData* data = 0,
|
||||
const UVector* setVariablesVector = 0);
|
||||
|
||||
/**
|
||||
* Lookup the object associated with this string and return it.
|
||||
* Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
|
||||
* exist. Return a non-NULL set if the name is mapped to a set;
|
||||
* otherwise return a NULL set.
|
||||
*/
|
||||
virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
||||
UErrorCode& status) const;
|
||||
};
|
||||
|
||||
ParseData::ParseData(const TransliterationRuleData* d,
|
||||
const UVector* sets) :
|
||||
data(d), setVariablesVector(sets) {}
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API. Lookup a variable, returning
|
||||
* either a Character, a UnicodeSet, or null.
|
||||
*/
|
||||
void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
||||
UErrorCode& status) const {
|
||||
c = data->lookupVariable(name, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
int32_t i = c - data->setVariablesBase;
|
||||
set = (i < setVariablesVector->size()) ?
|
||||
(UnicodeSet*) setVariablesVector->elementAt(i) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// END ParseData
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
TransliterationRuleData*
|
||||
TransliterationRuleParser::parse(const UnicodeString& rules,
|
||||
|
@ -58,7 +109,16 @@ TransliterationRuleParser::parse(const UnicodeString& rules,
|
|||
TransliterationRuleParser::TransliterationRuleParser(
|
||||
const UnicodeString& theRules,
|
||||
RuleBasedTransliterator::Direction theDirection) :
|
||||
rules(theRules), direction(theDirection), data(0) {}
|
||||
rules(theRules), direction(theDirection), data(0) {
|
||||
parseData = new ParseData(0, &setVariablesVector);
|
||||
}
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
TransliterationRuleParser::~TransliterationRuleParser() {
|
||||
delete parseData;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the given string as a sequence of rules, separated by newline
|
||||
|
@ -76,7 +136,9 @@ void TransliterationRuleParser::parseRules(void) {
|
|||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
parseData->data = data;
|
||||
setVariablesVector.removeAllElements();
|
||||
determineVariableRange();
|
||||
|
||||
int32_t pos = 0;
|
||||
|
@ -103,6 +165,18 @@ void TransliterationRuleParser::parseRules(void) {
|
|||
pos = parseRule(--pos, limit);
|
||||
}
|
||||
|
||||
// Convert the set vector to an array
|
||||
data->setVariablesLength = setVariablesVector.size();
|
||||
data->setVariables = new UnicodeSet*[data->setVariablesLength];
|
||||
// orphanElement removes the given element and shifts all other
|
||||
// elements down. For performance (and code clarity) we work from
|
||||
// the end back to index 0.
|
||||
for (int32_t i=data->setVariablesLength; i>0; ) {
|
||||
--i;
|
||||
data->setVariables[i] =
|
||||
(UnicodeSet*) setVariablesVector.orphanElementAt(i);
|
||||
}
|
||||
|
||||
// Index the rules
|
||||
if (U_SUCCESS(status)) {
|
||||
data->ruleSet.freeze(*data, status);
|
||||
|
@ -272,7 +346,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
|
|||
break;
|
||||
case SET_OPEN: {
|
||||
ParsePosition pp(pos-1); // Backup to opening '['
|
||||
buf.append(registerSet(new UnicodeSet(rules, pp, data, status)));
|
||||
buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
|
||||
if (U_FAILURE(status)) {
|
||||
return syntaxError("Invalid set", rules, start);
|
||||
}
|
||||
|
@ -407,9 +481,8 @@ UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
|
|||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
UChar c = variableNext++;
|
||||
data->defineSet(c, adoptedSet, status);
|
||||
return c;
|
||||
setVariablesVector.addElement(adoptedSet);
|
||||
return variableNext++;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -425,10 +498,10 @@ void TransliterationRuleParser::determineVariableRange(void) {
|
|||
|
||||
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
|
||||
|
||||
variableNext = variableLimit = (UChar) 0;
|
||||
data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
|
||||
|
||||
if (r != 0) {
|
||||
variableNext = r->start;
|
||||
data->setVariablesBase = variableNext = r->start;
|
||||
variableLimit = (UChar) (r->start + r->length);
|
||||
delete r;
|
||||
}
|
||||
|
|
|
@ -9,9 +9,11 @@
|
|||
#define RBT_PARS_H
|
||||
|
||||
#include "unicode/rbt.h"
|
||||
#include "uvector.h"
|
||||
|
||||
class TransliterationRuleData;
|
||||
class UnicodeSet;
|
||||
class ParseData;
|
||||
|
||||
class TransliterationRuleParser {
|
||||
|
||||
|
@ -31,6 +33,18 @@ class TransliterationRuleParser {
|
|||
*/
|
||||
UErrorCode status;
|
||||
|
||||
/**
|
||||
* Temporary symbol table used during parsing.
|
||||
*/
|
||||
ParseData* parseData;
|
||||
|
||||
/**
|
||||
* Temporary vector of set variables. When parsing is complete, this
|
||||
* is copied into the array data.setVariables. As with data.setVariables,
|
||||
* element 0 corresponds to character data.setVariablesBase.
|
||||
*/
|
||||
UVector setVariablesVector;
|
||||
|
||||
/**
|
||||
* The next available stand-in for variables. This starts at some point in
|
||||
* the private use area (discovered dynamically) and increments up toward
|
||||
|
@ -82,6 +96,11 @@ private:
|
|||
TransliterationRuleParser(const UnicodeString& rules,
|
||||
RuleBasedTransliterator::Direction direction);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~TransliterationRuleParser();
|
||||
|
||||
/**
|
||||
* Parse the given string as a sequence of rules, separated by newline
|
||||
* characters ('\n'), and cause this object to implement those rules. Any
|
||||
|
|
|
@ -143,7 +143,7 @@ bool_t TransliterationRule::matchesIndexValue(uint8_t v,
|
|||
return TRUE;
|
||||
}
|
||||
UChar c = pattern.charAt(anteContextLength);
|
||||
UnicodeSet* set = data.lookupSet(c);
|
||||
const UnicodeSet* set = data.lookupSet(c);
|
||||
return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
|
||||
}
|
||||
|
||||
|
@ -314,7 +314,7 @@ int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text,
|
|||
bool_t TransliterationRule::charMatches(UChar keyChar, UChar textChar,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
UnicodeSet* set = 0;
|
||||
const UnicodeSet* set = 0;
|
||||
return (filter == 0 || filter->contains(textChar)) &&
|
||||
(((set = data.lookupSet(keyChar)) == 0) ?
|
||||
keyChar == textChar : set->contains(textChar));
|
||||
|
|
29
icu4c/source/i18n/symtable.h
Normal file
29
icu4c/source/i18n/symtable.h
Normal file
|
@ -0,0 +1,29 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2000, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 02/04/00 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef SYMTABLE_H
|
||||
#define SYMTABLE_H
|
||||
|
||||
/**
|
||||
* An abstract class that maps strings to objects.
|
||||
*/
|
||||
class SymbolTable {
|
||||
public:
|
||||
|
||||
/**
|
||||
* Lookup the object associated with this string and return it.
|
||||
* Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
|
||||
* exist. Return a non-NULL set if the name is mapped to a set;
|
||||
* otherwise return a NULL set.
|
||||
*/
|
||||
virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
||||
UErrorCode& status) const = 0;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -15,7 +15,7 @@
|
|||
#include "unicode/unistr.h"
|
||||
|
||||
class ParsePosition;
|
||||
class TransliterationRuleData;
|
||||
class SymbolTable;
|
||||
class TransliterationRuleParser;
|
||||
class TransliterationRule;
|
||||
|
||||
|
@ -557,7 +557,7 @@ private:
|
|||
* contains a syntax error.
|
||||
*/
|
||||
UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
|
||||
const TransliterationRuleData* data,
|
||||
const SymbolTable& symbols,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
|
@ -600,7 +600,7 @@ private:
|
|||
static UnicodeString& parse(UnicodeString& pairsBuf /*result*/,
|
||||
const UnicodeString& pattern,
|
||||
ParsePosition& pos,
|
||||
const TransliterationRuleData* data,
|
||||
const SymbolTable* symbols,
|
||||
UErrorCode& status);
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "rbt_data.h"
|
||||
#include "symtable.h"
|
||||
|
||||
// N.B.: This mapping is different in ICU and Java
|
||||
const UnicodeString UnicodeSet::CATEGORY_NAMES(
|
||||
|
@ -77,10 +77,11 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern,
|
|||
applyPattern(pattern, status);
|
||||
}
|
||||
|
||||
// For internal use by RuleBasedTransliterator
|
||||
UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
|
||||
const TransliterationRuleData* data,
|
||||
const SymbolTable& symbols,
|
||||
UErrorCode& status) {
|
||||
parse(pairs, pattern, pos, data, status);
|
||||
parse(pairs, pattern, pos, &symbols, status);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -452,7 +453,7 @@ void UnicodeSet::clear(void) {
|
|||
UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
||||
const UnicodeString& pattern,
|
||||
ParsePosition& pos,
|
||||
const TransliterationRuleData* data,
|
||||
const SymbolTable* symbols,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return pairsBuf;
|
||||
|
@ -583,9 +584,10 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
|||
* Variable names are only parsed if symbols is not null.
|
||||
* Set variables are only looked up if symbols is not null.
|
||||
*/
|
||||
else if (data != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
|
||||
else if (symbols != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
|
||||
++i;
|
||||
int32_t j = pattern.indexOf(VARIABLE_REF_CLOSE, i);
|
||||
UnicodeSet* set = NULL;
|
||||
if (i == j || j < 0) { // empty or unterminated
|
||||
// throw new IllegalArgumentException("Illegal variable reference");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -593,7 +595,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
|||
scratch.truncate(0);
|
||||
pattern.extractBetween(i, j, scratch);
|
||||
++j;
|
||||
c = data->lookupVariable(scratch, status);
|
||||
symbols->lookup(scratch, c, set, status);
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
// Either the reference was ill-formed (empty name, or no
|
||||
|
@ -602,7 +604,6 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
|||
}
|
||||
isLiteral = TRUE;
|
||||
|
||||
UnicodeSet* set = data->lookupSet(c);
|
||||
if (set != NULL) {
|
||||
nestedPairs = &set->pairs;
|
||||
}
|
||||
|
@ -638,7 +639,7 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
|||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
pos.setIndex(i);
|
||||
nestedPairs = &parse(nestedAux, pattern, pos, data, status);
|
||||
nestedPairs = &parse(nestedAux, pattern, pos, symbols, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return pairsBuf;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue