ICU-1261 initial implementation of compound filters in IDs and ::ID blocks

X-SVN-Rev: 6154
This commit is contained in:
Alan Liu 2001-10-10 19:29:45 +00:00
parent c968b1ea77
commit 267a914bc3
7 changed files with 413 additions and 302 deletions

View file

@ -10,6 +10,7 @@
#include "unicode/cpdtrans.h"
#include "unicode/unifilt.h"
#include "unicode/unifltlg.h"
#include "unicode/uniset.h"
#include "uvector.h"
// keep in sync with Transliterator
@ -129,13 +130,18 @@ void CompoundTransliterator::init(const UnicodeString& id,
}
UVector list(status);
UnicodeSet* compoundFilter = NULL;
UnicodeString regenID;
Transliterator::parseCompoundID(id, regenID, direction,
idSplitPoint, adoptedSplitTrans,
list, compoundRBTIndex,
list, compoundRBTIndex, compoundFilter,
parseError, status);
init(list, direction, fixReverseID, status);
if (compoundFilter != NULL) {
adoptFilter(compoundFilter);
}
}
/**

View file

@ -23,12 +23,24 @@ void RuleBasedTransliterator::_construct(const UnicodeString& rules,
UErrorCode& status) {
data = 0;
isDataOwned = TRUE;
if (U_SUCCESS(status)) {
data = TransliteratorParser::parse(rules, direction, parseError,status);
if (U_SUCCESS(status)) {
setMaximumContextLength(data->ruleSet.getMaximumContextLength());
}
if (U_FAILURE(status)) {
return;
}
TransliteratorParser parser;
parser.parse(rules, direction, parseError, status);
if (U_FAILURE(status)) {
return;
}
if (parser.idBlock.length() != 0 ||
parser.compoundFilter != NULL) {
status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
return;
}
data = parser.orphanData();
setMaximumContextLength(data->ruleSet.getMaximumContextLength());
}
RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,

View file

@ -16,6 +16,7 @@
#include "strmatch.h"
#include "symtable.h"
#include "unirange.h"
#include "uvector.h"
#include "unicode/parseerr.h"
#include "unicode/parsepos.h"
#include "unicode/putil.h"
@ -795,89 +796,63 @@ int32_t* RuleHalf::createSegments(UErrorCode& status) const {
}
//----------------------------------------------------------------------
// END RuleHalf
// PUBLIC API
//----------------------------------------------------------------------
// Static convenience entry point: parse `rules` in the given direction and
// return the resulting rule data object.
//
// A temporary parser is constructed over `rules`, parseRules() is run, and
// the parser's `data` pointer is returned.  If parsing failed, or if the
// rules contained any ::ID lines (idBlock is non-empty), the data object is
// deleted, NULL is returned, and `ec` is set -- to the parser's own status
// on failure, or to U_ILLEGAL_ARGUMENT_ERROR when the only problem is the
// presence of an ::ID block.  On success `ec` is not written.  The incoming
// value of `ec` is not examined.
//
// NOTE(review): this looks like the pre-change API that the instance method
// parse(rules, direction, pe, ec) supersedes -- confirm which one is live,
// and confirm that the parser's destructor does not free the returned data.
TransliterationRuleData*
TransliteratorParser::parse(const UnicodeString& rules,
UTransDirection direction,
UParseError& parseError,
UErrorCode& ec) {
TransliteratorParser parser(rules, direction, parseError);
// Scratch outputs; a pure rule-based transliterator must leave idBlock empty.
UnicodeString idBlock;
int32_t idSplitPoint, count;
parser.parseRules(idBlock, idSplitPoint, count);
if (U_FAILURE(parser.status) || idBlock.length() != 0) {
// Discard partial results and report why.
delete parser.data;
parser.data = 0;
ec = U_FAILURE(parser.status) ? parser.status : U_ILLEGAL_ARGUMENT_ERROR;
}
return parser.data;
}
/**
* Parse a given set of rules. Return up to three pieces of
* parsed data. These are the header ::id block, the rule block,
* and the footer ::id block. Any or all of these may be empty.
* If the ::id blocks are empty, their corresponding parameters
* are returned as the empty string. If there are no rules, the
* TransliterationRuleData result is 0.
* @param ruleDataResult caller owns the pointer stored here.
* May be NULL.
* @param headerRule string including semicolons for the header
* ::id block. May be empty.
* @param footerRule string including semicolons for the footer
* ::id block. May be empty.
* Constructor.
*/
// Static entry point that parses `rules` and hands back the pieces through
// output parameters:
//   ruleDataResult      - the parsed rule data, or 0 if parsing failed or
//                         the input contained no rules (count == 0)
//   idBlockResult       - filled in by parseRules()
//   idSplitPointResult  - filled in by parseRules()
//   ec                  - if already a failure on entry, ruleDataResult is
//                         zeroed and nothing else happens; otherwise it is
//                         overwritten with the parser's final status
//
// NOTE(review): this looks like the pre-change API that the instance method
// parse(rules, direction, pe, ec) supersedes -- confirm which one is live.
void TransliteratorParser::parse(const UnicodeString& rules,
UTransDirection direction,
TransliterationRuleData*& ruleDataResult,
UnicodeString& idBlockResult,
int32_t& idSplitPointResult,
UParseError& parseError,
UErrorCode& ec) {
if (U_FAILURE(ec)) {
ruleDataResult = 0;
return;
}
TransliteratorParser parser(rules, direction, parseError);
int32_t count;
parser.parseRules(idBlockResult, idSplitPointResult, count);
if (U_FAILURE(parser.status) || count == 0) {
// Either an error or an ID-only rule set: there is no data to return.
delete parser.data;
parser.data = 0;
}
ruleDataResult = parser.data;
ec = parser.status;
}
/**
* @param rules list of rules, separated by newline characters
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
/* Ram: Reordered member initializers to match declaration order and make GCC happy */
TransliteratorParser::TransliteratorParser(
const UnicodeString& theRules,
UTransDirection theDirection,
UParseError& theParseError)
:
rules(theRules), direction(theDirection),data(0),parseError(theParseError), variablesVector(status)
{
parseData = new ParseData(0, &variablesVector);
if (parseData == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
// Default constructor.  Every pointer member starts out null; the objects
// they refer to are created later, when parse() is called.
TransliteratorParser::TransliteratorParser() :
    data(NULL),
    compoundFilter(NULL),
    parseData(NULL),
    variablesVector(NULL) {
}
/**
* Destructor.  Deletes whatever the parser still holds: the rule data and
* the compound filter (both are null if the caller already took them with
* orphanData() / orphanCompoundFilter()), the ParseData helper, and the
* variables vector.  Deleting a null pointer is a no-op, so a parser that
* was never used is destroyed safely.
*/
TransliteratorParser::~TransliteratorParser() {
delete data;
delete compoundFilter;
delete parseData;
delete variablesVector;
}
void
TransliteratorParser::parse(const UnicodeString& rules,
UTransDirection direction,
UParseError& pe,
UErrorCode& ec) {
if (U_SUCCESS(ec)) {
parseRules(rules, direction);
pe = parseError;
ec = status;
}
}
/**
 * Hand the compound filter produced by parse() over to the caller.
 * The parser forgets the pointer, so the caller becomes responsible for
 * deleting the returned set.  Returns NULL when the parser holds no
 * compound filter.
 */
UnicodeSet* TransliteratorParser::orphanCompoundFilter() {
    UnicodeSet* const released = compoundFilter;
    compoundFilter = NULL;
    return released;
}
/**
 * Hand the rule data object produced by parse() over to the caller.
 * The parser forgets the pointer, so the caller becomes responsible for
 * deleting the returned object.  Returns NULL when the parser holds no
 * data object.
 */
TransliterationRuleData* TransliteratorParser::orphanData() {
    TransliterationRuleData* const released = data;
    data = NULL;
    return released;
}
//----------------------------------------------------------------------
// Private implementation
//----------------------------------------------------------------------
/**
* Parse the given string as a sequence of rules, separated by newline
* characters ('\n'), and cause this object to implement those rules. Any
@ -886,18 +861,12 @@ TransliteratorParser::~TransliteratorParser() {
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
int32_t& idSplitPointResult,
int32_t& ruleCount) {
status = U_ZERO_ERROR;
ruleCount = 0;
void TransliteratorParser::parseRules(const UnicodeString& rules,
UTransDirection theDirection) {
// Clear error struct
//if (parseError != 0) {
//parseError->code = parseError->line = 0;
parseError.offset = 0;
parseError.preContext[0] = parseError.postContext[0] = (UChar)0;
//}
parseError.line = parseError.offset = 0;
parseError.preContext[0] = parseError.postContext[0] = (UChar)0;
status = U_ZERO_ERROR;
delete data;
data = new TransliterationRuleData(status);
@ -905,17 +874,28 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
return;
}
parseData->data = data;
variablesVector.removeAllElements();
/* if (parseError != 0) {
parseError->code = 0;
direction = theDirection;
ruleCount = 0;
delete compoundFilter;
compoundFilter = NULL;
if (variablesVector == NULL) {
variablesVector = new UVector(status);
} else {
variablesVector->removeAllElements();
}
*/
determineVariableRange();
parseData = new ParseData(0, variablesVector);
if (parseData == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
parseData->data = data;
determineVariableRange(rules);
UnicodeString str; // scratch
idBlockResult.truncate(0);
idSplitPointResult = -1;
idBlock.truncate(0);
idSplitPoint = -1;
int32_t pos = 0;
int32_t limit = rules.length();
// The mode marks whether we are in the header ::id block, the
@ -924,6 +904,15 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
// mode == 1: in rules: rule->1, ::id->2
// mode == 2: in footer rule block: rule->ERROR, ::id->2
int32_t mode = 0;
// The compound filter offset is an index into idBlock.
// If it is 0, then the compound filter occurred at the start,
// and it is the offset to the _start_ of the compound filter
// pattern. Otherwise it is the offset to the _limit_ of the
// compound filter pattern within idBlock.
compoundFilter = NULL;
int32_t compoundFilterOffset = -1;
while (pos < limit && U_SUCCESS(status)) {
UChar c = rules.charAt(pos++);
if (u_isWhitespace(c)) {
@ -954,25 +943,39 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
int32_t p = pos;
UBool sawDelim;
UnicodeString regenID;
Transliterator::parseID(rules, regenID, p, sawDelim, direction,parseError, FALSE,status);
UnicodeSet* cpdFilter = NULL;
Transliterator::parseID(rules, regenID, p, sawDelim, cpdFilter, direction,parseError, FALSE,status);
if (p == pos || !sawDelim) {
// Invalid ::id
delete cpdFilter;
syntaxError(U_ILLEGAL_ARGUMENT_ERROR, rules, pos);
} else {
if (mode == 1) {
mode = 2;
idSplitPointResult = idBlockResult.length();
idSplitPoint = idBlock.length();
}
if (cpdFilter != NULL) {
if (compoundFilter != NULL) {
syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rules, pos);
}
compoundFilter = cpdFilter;
if (idBlock.length() == 0) {
compoundFilterOffset = 0;
}
}
rules.extractBetween(pos, p, str);
idBlockResult.append(str);
idBlock.append(str);
if (!sawDelim) {
idBlockResult.append((UChar)0x003B /*;*/);
idBlock.append((UChar)0x003B /*;*/);
}
if (cpdFilter != NULL && compoundFilterOffset < 0) {
compoundFilterOffset = idBlock.length();
}
pos = p;
}
} else {
// Parse a rule
pos = parseRule(pos, limit);
pos = parseRule(rules, pos, limit);
if (U_SUCCESS(status)) {
++ruleCount;
if (mode == 2) {
@ -988,7 +991,7 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
}
// Convert the set vector to an array
data->variablesLength = variablesVector.size();
data->variablesLength = variablesVector->size();
data->variables = data->variablesLength == 0 ? 0 : new UnicodeMatcher*[data->variablesLength];
// orphanElement removes the given element and shifts all other
// elements down. For performance (and code clarity) we work from
@ -997,14 +1000,29 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
for (i=data->variablesLength; i>0; ) {
--i;
data->variables[i] =
(UnicodeSet*) variablesVector.orphanElementAt(i);
(UnicodeSet*) variablesVector->orphanElementAt(i);
}
// Index the rules
if (U_SUCCESS(status)) {
if (compoundFilter != NULL) {
if ((direction == UTRANS_FORWARD &&
compoundFilterOffset != 0) ||
(direction == UTRANS_REVERSE &&
compoundFilterOffset != idBlock.length())) {
status = U_MISPLACED_COMPOUND_FILTER;
}
}
data->ruleSet.freeze(parseError,status);
if (idSplitPointResult < 0) {
idSplitPointResult = idBlockResult.length();
if (idSplitPoint < 0) {
idSplitPoint = idBlock.length();
}
if (ruleCount == 0) {
delete data;
data = NULL;
}
}
}
@ -1022,11 +1040,10 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
* indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list.
*/
int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit) {
// Locate the left side, operator, and right side
int32_t start = pos;
UChar op = 0;
const UnicodeString& rule = rules; // TEMPORARY: FIX LATER
// Use pointers to automatics to make swapping possible.
RuleHalf _left(*this), _right(*this);
@ -1188,41 +1205,26 @@ int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode,
const UnicodeString& rule,
int32_t pos) {
// if (parseError != 0) {
/* parseError->line = 0; // We don't return a line #
parseError->offset = start; // Character offset from rule start
int32_t end = quotedIndexOf(rule, start, rule.length(), END_OF_RULE);
if (end < 0) {
end = rule.length();
}
int32_t len = uprv_min(end - start, U_PARSE_CONTEXT_LEN-1);
// Extract everything into the preContext and leave the postContext
// blank, since we don't have precise error position.
// TODO: Fix this.
rule.extract(start, len, parseError->preContext); // Current rule
parseError->preContext[len] = 0;
parseError->postContext[0] = 0;
*/
parseError.offset = pos;
parseError.line = 0 ; /* we are not using line numbers */
parseError.offset = pos;
parseError.line = 0 ; /* we are not using line numbers */
// for pre-context
int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
int32_t stop = pos;
// for pre-context
int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
int32_t stop = pos;
rule.extract(start,stop-start,parseError.preContext);
//null terminate the buffer
parseError.preContext[stop-start] = 0;
rule.extract(start,stop-start,parseError.preContext);
//null terminate the buffer
parseError.preContext[stop-start] = 0;
//for post-context
start = pos+1;
stop = ((pos+U_PARSE_CONTEXT_LEN)<= rule.length() )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
rule.length();
//for post-context
start = pos+1;
stop = ((pos+U_PARSE_CONTEXT_LEN)<= rule.length() )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
rule.length();
rule.extract(start,stop-start,parseError.postContext);
//null terminate the buffer
parseError.postContext[stop-start]= 0;
rule.extract(start,stop-start,parseError.postContext);
//null terminate the buffer
parseError.postContext[stop-start]= 0;
// }
status = (UErrorCode)parseErrorCode;
return pos;
@ -1251,7 +1253,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
variablesVector.addElement(adopted, status);
variablesVector->addElement(adopted, status);
return variableNext++;
}
@ -1306,7 +1308,7 @@ UChar TransliteratorParser::getSegmentStandin(int32_t r) {
* When done, everything not in the hash is available for use. In practice,
* this method may employ some other algorithm for improved speed.
*/
void TransliteratorParser::determineVariableRange(void) {
void TransliteratorParser::determineVariableRange(const UnicodeString& rules) {
UnicodeRange privateUse(0xE000, 0x1900); // Private use area
UnicodeRange* r = privateUse.largestUnusedSubrange(rules, status);

View file

@ -9,7 +9,6 @@
#define RBT_PARS_H
#include "unicode/rbt.h"
#include "uvector.h"
#include "unicode/parseerr.h"
U_NAMESPACE_BEGIN
@ -19,19 +18,49 @@ class UnicodeMatcher;
class ParseData;
class RuleHalf;
class ParsePosition;
class UVector;
class TransliteratorParser {
public:
/**
* This is a reference to external data we don't own. This works because
* we only hold this for the duration of the call to parse().
* PUBLIC data member containing the parsed data object, or null if
* there were no rules.
*/
const UnicodeString& rules;
TransliterationRuleData* data;
/**
* PUBLIC data member.
* The block of ::IDs, both at the top and at the bottom.
* Inserted into these may be additional rules at the
* idSplitPoint.
*/
UnicodeString idBlock;
/**
* PUBLIC data member.
* In a compound RBT, the index at which the RBT rules are
* inserted into the ID block. Index 0 means before any IDs
* in the block. Index idBlock.length() means after all IDs
* in the block. Index is a string index.
*/
int32_t idSplitPoint;
/**
* PUBLIC data member containing the parsed compound filter, if any.
*/
UnicodeSet* compoundFilter;
private:
// The number of rules parsed. This tells us if there were
// any actual transliterator rules, or if there were just ::ID
// block IDs.
int32_t ruleCount;
UTransDirection direction;
TransliterationRuleData* data;
/**
* We use a single error code during parsing. Rather than pass it
* through each API, we keep it here.
@ -39,10 +68,9 @@ class TransliteratorParser {
UErrorCode status;
/**
* Pointer to user structure in which to return parse error information.
* May be NULL.
* Parse error information.
*/
UParseError& parseError;
UParseError parseError;
/**
* Temporary symbol table used during parsing.
@ -54,7 +82,7 @@ class TransliteratorParser {
* is copied into the array data.variables. As with data.variables,
* element 0 corresponds to character data.variablesBase.
*/
UVector variablesVector;
UVector* variablesVector;
/**
* The next available stand-in for variables. This starts at some point in
@ -82,44 +110,10 @@ class TransliteratorParser {
public:
static TransliterationRuleData*
parse(const UnicodeString& rules,
UTransDirection direction,
UParseError& parseError,
UErrorCode& ec);
/**
* Parse a given set of rules. Return up to three pieces of
* parsed data. These are the header ::id block, the rule block,
* and the footer ::id block. Any or all of these may be empty.
* If the ::id blocks are empty, their corresponding parameters
* are returned as the empty string. If there are no rules, the
* TransliterationRuleData result is 0.
* @param ruleDataResult caller owns the pointer stored here.
* May be NULL.
* @param headerRule string including semicolons for the header
* ::id block. May be empty.
* @param footerRule string including semicolons for the footer
* ::id block. May be empty.
* Constructor.
*/
static void parse(const UnicodeString& rules,
UTransDirection direction,
TransliterationRuleData*& ruleDataResult,
UnicodeString& idBlockResult,
int32_t& idSplitPointResult,
UParseError& parseError,
UErrorCode& ec);
private:
/**
* @param rules list of rules, separated by newline characters
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
TransliteratorParser(const UnicodeString& rules,
UTransDirection direction,
UParseError& parseError);
TransliteratorParser();
/**
* Destructor.
@ -130,12 +124,32 @@ private:
* Parse the given string as a sequence of rules, separated by newline
* characters ('\n'), and cause this object to implement those rules. Any
* previous rules are discarded. Typically this method is called exactly
* once, during construction.
* @exception IllegalArgumentException if there is a syntax error in the
* rules
* once after construction.
*
* Parse the given rules, in the given direction. After this call
* returns, query the public data members for results. The caller
* owns the 'data' and 'compoundFilter' data members after this
* call returns.
*/
void parseRules(UnicodeString& idBlockResult, int32_t& idSplitPointResult,
int32_t& ruleCount);
void parse(const UnicodeString& rules,
UTransDirection direction,
UParseError& pe,
UErrorCode& ec);
/**
* Return the compound filter parsed by parse(). Caller owns result.
*/
UnicodeSet* orphanCompoundFilter();
/**
* Return the data object parsed by parse(). Caller owns result.
*/
TransliterationRuleData* orphanData();
private:
void parseRules(const UnicodeString& rules,
UTransDirection direction);
/**
* MAIN PARSER. Parse the next rule in the given rule string, starting
@ -150,7 +164,7 @@ private:
* indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list.
*/
int32_t parseRule(int32_t pos, int32_t limit);
int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);
/**
* Called by main parser upon syntax error. Search the rule string
@ -198,7 +212,7 @@ private:
* When done, everything not in the hash is available for use. In practice,
* this method may employ some other algorithm for improved speed.
*/
void determineVariableRange(void);
void determineVariableRange(const UnicodeString&);
/**
* Returns the index of a character, ignoring quoted text.

View file

@ -43,6 +43,7 @@ static const UChar ID_DELIM = 0x003B; /*;*/
static const UChar VARIANT_SEP = 0x002F; // '/'
static const UChar OPEN_PAREN = 40;
static const UChar CLOSE_PAREN = 41;
/**
* Prefix for resource bundle key for the display name for a
* transliterator. The ID is appended to this to form the key.
@ -688,8 +689,9 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID,
UVector list(status);
int32_t ignored;
UnicodeString regenID;
UnicodeSet* compoundFilter = 0;
parseCompoundID(ID, regenID, dir, idSplitPoint, adoptedSplitTrans,
list, ignored, parseError, status);
list, ignored, compoundFilter, parseError, status);
if (U_FAILURE(status)) {
return 0;
@ -708,6 +710,9 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID,
break;
}
t->setID(regenID);
if (compoundFilter != NULL) {
t->adoptFilter(compoundFilter);
}
return t;
}
@ -724,52 +729,52 @@ Transliterator* Transliterator::createFromRules(const UnicodeString& ID,
UTransDirection dir,
UParseError& parseError,
UErrorCode& status) {
UnicodeString idBlock;
int32_t idSplitPoint = -1;
TransliterationRuleData *data = 0;
Transliterator* t = NULL;
TransliteratorParser::parse(rules, dir, data,
idBlock, idSplitPoint,
parseError, status);
TransliteratorParser parser;
parser.parse(rules, dir, parseError, status);
if (U_FAILURE(status)) {
delete data;
return 0;
}
// NOTE: The logic here matches that in TransliteratorRegistry.
if (idBlock.length() == 0) {
if (data == 0) {
if (parser.idBlock.length() == 0) {
if (parser.data == NULL) {
// No idBlock, no data -- this is just an
// alias for Null
return new NullTransliterator();
t = new NullTransliterator();
} else {
// No idBlock, data != 0 -- this is an
// ordinary RBT_DATA.
return new RuleBasedTransliterator(ID, data, TRUE); // TRUE == adopt data object
t = new RuleBasedTransliterator(ID, parser.orphanData(), TRUE); // TRUE == adopt data object
}
} else {
if (data == 0) {
if (parser.data == NULL) {
// idBlock, no data -- this is an alias
Transliterator *t = createInstance(idBlock, dir, parseError,status);
if (t != 0) {
t = createInstance(parser.idBlock, dir, parseError, status);
if (t != NULL) {
t->setID(ID);
}
return t;
} else {
// idBlock and data -- this is a compound
// RBT
UnicodeString id("_", "");
Transliterator *t = new RuleBasedTransliterator(id, data, TRUE); // TRUE == adopt data object
t = new CompoundTransliterator(ID, idBlock, idSplitPoint,
t,parseError,status);
t = new RuleBasedTransliterator(id, parser.orphanData(), TRUE); // TRUE == adopt data object
t = new CompoundTransliterator(ID, parser.idBlock, parser.idSplitPoint,
t, parseError, status);
if (U_FAILURE(status)) {
delete t;
t = 0;
}
if (parser.compoundFilter != NULL) {
t->adoptFilter(parser.orphanCompoundFilter());
}
return t;
}
}
return t;
}
UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
@ -806,6 +811,7 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
Transliterator *adoptedSplitTrans,
UVector& result,
int32_t& splitTransIndex,
UnicodeSet*& compoundFilter,
UParseError& parseError,
UErrorCode& status) {
if (U_FAILURE(status)) {
@ -816,6 +822,15 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
splitTransIndex = -1;
int32_t pos = 0;
int32_t i;
// A compound filter is a filter on an entire compound
// transliterator. It is indicated by the syntax [abc]; A-B;
// B-C or in the reverse direction A-B; B-C; ([abc]). We
// record the filter and its index (in terms of the result
// vector).
compoundFilter = NULL;
int32_t compoundFilterIndex = -1;
while (pos < id.length()) {
// We compare (pos >= split), not (pos == split), so we can
// skip over whitespace (see below).
@ -826,13 +841,25 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
}
int32_t p = pos;
UBool sawDelimiter; // We ignore this
UnicodeSet* cpdFilter = NULL;
Transliterator *t =
parseID(id, regenID, p, sawDelimiter, dir, parseError, TRUE,status);
parseID(id, regenID, p, sawDelimiter, cpdFilter, dir, parseError, TRUE,status);
if(U_FAILURE(status)){
delete t;
delete cpdFilter;
break;
}
if (cpdFilter != NULL) {
if (compoundFilter != NULL) {
status = U_MULTIPLE_COMPOUND_FILTERS;
delete t;
delete cpdFilter;
break;
}
compoundFilter = cpdFilter;
compoundFilterIndex = result.size();
}
if (p == pos || (p < id.length() && !sawDelimiter)) {
delete t;
@ -848,18 +875,28 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
}
// Handle case of idSplitPoint == id.length()
if (pos >= idSplitPoint && adoptedSplitTrans != 0) {
if (U_SUCCESS(status) && pos >= idSplitPoint && adoptedSplitTrans != 0) {
splitTransIndex = result.size();
result.addElement(adoptedSplitTrans, status);
adoptedSplitTrans = 0;
}
// Check validity of compound filter position
if (compoundFilter != NULL) {
if ((dir == UTRANS_FORWARD && compoundFilterIndex != 0) ||
(dir == UTRANS_REVERSE && compoundFilterIndex != result.size())) {
status = U_MISPLACED_COMPOUND_FILTER;
}
}
if (U_FAILURE(status)) {
for (i=0; i<result.size(); ++i) {
delete (Transliterator*)result.elementAt(i);
}
result.removeAllElements();
delete adoptedSplitTrans;
delete compoundFilter;
compoundFilter = NULL;
}
}
@ -885,6 +922,9 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
* first character to parse. On output, the position after the last
* character parsed. This will be a semicolon or ID.length(). In the
* case of an error this value will be unchanged.
* @param compoundFilter OUTPUT parameter to receive a compound
* filter, if one is parsed. When a non-null compound filter is
* returned then a null Transliterator pointer is returned.
* @param create if TRUE, create and return the result. If FALSE,
* only scan the ID, and return NULL.
* @return a newly created transliterator, or NULL. NULL is returned
@ -898,6 +938,7 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
UnicodeString& regenID,
int32_t& pos,
UBool& sawDelimiter,
UnicodeSet*& compoundFilter,
UTransDirection dir,
UParseError& parseError,
UBool create,
@ -907,19 +948,22 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
idStart, idLimit,
setStart, setLimit;
UnicodeSet* fwdFilter = NULL;
UnicodeSet* revFilter = NULL;
UnicodeSet* filter = 0;
if (!parseIDBounds(ID, pos, FALSE, limit,
setStart, setLimit, revStart, filter)) {
delete filter;
setStart, setLimit, revStart, fwdFilter)) {
delete fwdFilter;
return 0;
}
filter = fwdFilter;
idStart = pos;
idLimit = limit;
if (revStart >= 0 && revStart < limit) {
int32_t revSetStart, revSetLimit, dummy;
UnicodeSet* revFilter = 0;
if (!parseIDBounds(ID, revStart+1, TRUE, revLimit,
revSetStart, revSetLimit, dummy, revFilter)) {
delete filter;
@ -981,83 +1025,103 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
}
}
// Fix the id, if necessary, by reversing it (A-B => B-A). This
// is only done if the id is NOT of the form Foo(Bar). Record the
// position of the separator.
//
// For both A-B and Foo(Bar) ids, detect the special case of Null,
// whose inverse is itself. Given an ID with no separator "Foo",
// an abbreviation for "Any-Foo", consider the inverse to be
// "Foo-Any".
int32_t sep = id.indexOf(ID_SEP);
if (sep < 0 && id.caseCompare(NullTransliterator::SHORT_ID,
U_FOLD_CASE_DEFAULT) == 0) {
// Handle "Null"
sep = id.length();
} else if (dir == UTRANS_REVERSE &&
id.caseCompare(NullTransliterator::ID,
U_FOLD_CASE_DEFAULT) == 0) {
// Reverse of "Any-Null" => "Null"
id.removeBetween(0, sep+1);
sep = id.length();
} else if (dir == UTRANS_REVERSE && revStart < 0) {
if (sep >= 0) {
id.extractBetween(0, sep, str);
id.removeBetween(0, sep+1);
Transliterator* t = NULL;
int32_t sep = 0; // index of the separator ('-') in id
// If id is empty, then we have either an empty specifier,
// which is illegal, or a compound filter, which is legal
// as long as it's in the right place -- we let the caller
// decide that.
UBool isCompoundFilter = (id.length() == 0 && filter != NULL);
if (isCompoundFilter) {
if (dir == UTRANS_FORWARD) {
compoundFilter = fwdFilter;
delete revFilter;
revFilter = NULL;
} else {
str = UnicodeString("Any", "");
compoundFilter = revFilter;
delete fwdFilter;
fwdFilter = NULL;
}
sep = id.length();
id.append(ID_SEP).append(str);
} else if (sep < 0 && id.length() > 0) {
// Don't do anything for empty IDs -- we handle these specially below
str = UnicodeString("Any-", "");
sep = str.length() - 1;
id.insert(0, str);
}
Transliterator *t = 0;
// If we have a reverse part of the ID, e.g., Foo(Bar), then we
// need to check for an empty part, which represents a Null
// transliterator. We return 0 (not a NullTransliterator). If we
// are not of the form Foo(Bar) then an empty string is illegal.
if (revStart >= 0 && id.length() == 0) {
// Ignore any filters; filters on Null are meaningless (and we
// can't attach them to 0 anyway)
delete filter;
}
else {
// Create the actual transliterator from the registry
if (registry == 0) {
initializeRegistry();
}
parseError.line = parseError.offset = 0;
parseError.preContext[0] = parseError.postContext[0] = 0;
TransliteratorAlias* alias = 0;
{
Mutex lock(&registryMutex);
t = registry->get(id, alias, parseError,status);
// Need to enclose this in a block to prevent deadlock when
// instantiating aliases (below).
}
if (alias != 0) {
// assert(t==0);
// Instantiate an alias
t = alias->create(parseError, status);
delete alias;
// Fix the id, if necessary, by reversing it (A-B => B-A). This
// is only done if the id is NOT of the form Foo(Bar). Record the
// position of the separator.
//
// For both A-B and Foo(Bar) ids, detect the special case of Null,
// whose inverse is itself. Given an ID with no separator "Foo",
// an abbreviation for "Any-Foo", consider the inverse to be
// "Foo-Any".
sep = id.indexOf(ID_SEP);
if (sep < 0 && id.caseCompare(NullTransliterator::SHORT_ID,
U_FOLD_CASE_DEFAULT) == 0) {
// Handle "Null"
sep = id.length();
} else if (dir == UTRANS_REVERSE &&
id.caseCompare(NullTransliterator::ID,
U_FOLD_CASE_DEFAULT) == 0) {
// Reverse of "Any-Null" => "Null"
id.removeBetween(0, sep+1);
sep = id.length();
} else if (dir == UTRANS_REVERSE && revStart < 0) {
if (sep >= 0) {
id.extractBetween(0, sep, str);
id.removeBetween(0, sep+1);
} else {
str = UnicodeString("Any", "");
}
sep = id.length();
id.append(ID_SEP).append(str);
} else if (sep < 0 && id.length() > 0) {
// Don't do anything for empty IDs -- we handle these specially below
str = UnicodeString("Any-", "");
sep = str.length() - 1;
id.insert(0, str);
}
if (t == 0) {
// Creation failed; the ID is invalid
// If we have a reverse part of the ID, e.g., Foo(Bar), then we
// need to check for an empty part, which represents a Null
// transliterator. We return 0 (not a NullTransliterator). If we
// are not of the form Foo(Bar) then an empty string is illegal.
if (revStart >= 0 && id.length() == 0) {
// Ignore any filters; filters on Null are meaningless (and we
// can't attach them to 0 anyway)
delete filter;
return 0;
}
// Set the filter, if any
t->adoptFilter(filter);
else {
// Create the actual transliterator from the registry
if (registry == 0) {
initializeRegistry();
}
parseError.line = parseError.offset = 0;
parseError.preContext[0] = parseError.postContext[0] = 0;
TransliteratorAlias* alias = 0;
{
Mutex lock(&registryMutex);
t = registry->get(id, alias, parseError,status);
// Need to enclose this in a block to prevent deadlock when
// instantiating aliases (below).
}
if (alias != 0) {
// assert(t==0);
// Instantiate an alias
t = alias->create(parseError, status);
delete alias;
}
if (t == 0) {
// Creation failed; the ID is invalid
delete filter;
return 0;
}
// Set the filter, if any
t->adoptFilter(filter);
}
}
// Set the ID. This is normally just a substring of the input

View file

@ -16,6 +16,7 @@
#include "unicode/rbt.h"
#include "unicode/resbund.h"
#include "unicode/translit.h"
#include "unicode/uniset.h"
#include "unicode/uscript.h"
// UChar constants
@ -77,11 +78,13 @@ TransliteratorAlias::TransliteratorAlias(const UnicodeString& theAliasID) :
TransliteratorAlias::TransliteratorAlias(const UnicodeString& theID,
const UnicodeString& idBlock,
Transliterator* adopted,
int32_t theIDSplitPoint) :
int32_t theIDSplitPoint,
const UnicodeSet* cpdFilter) :
ID(theID),
aliasID(idBlock),
trans(adopted),
idSplitPoint(theIDSplitPoint) {
idSplitPoint(theIDSplitPoint),
compoundFilter(cpdFilter) {
}
TransliteratorAlias::~TransliteratorAlias() {
@ -90,16 +93,19 @@ TransliteratorAlias::~TransliteratorAlias() {
Transliterator* TransliteratorAlias::create(UParseError& pe,
UErrorCode& ec) {
UErrorCode& ec) {
Transliterator *t;
if (trans == 0) {
return Transliterator::createInstance(aliasID, UTRANS_FORWARD, pe, ec);
t = Transliterator::createInstance(aliasID, UTRANS_FORWARD, pe, ec);
} else {
Transliterator *t = trans;
t = new CompoundTransliterator(ID, aliasID, idSplitPoint,
trans, pe, ec);
trans = 0; // so we don't delete it later
return new CompoundTransliterator(ID, aliasID, idSplitPoint,
t, pe, ec);
if (compoundFilter) {
t->adoptFilter((UnicodeSet*) compoundFilter->clone());
}
}
return t;
}
//----------------------------------------------------------------------
@ -277,6 +283,7 @@ public:
// it has a copy constructor
UnicodeString stringArg; // For RULES_*, ALIAS, COMPOUND_RBT
int32_t intArg; // For COMPOUND_RBT
UnicodeSet* compoundFilter; // For COMPOUND_RBT
union {
Transliterator* prototype; // For PROTOTYPE
TransliterationRuleData* data; // For RBT_DATA, COMPOUND_RBT
@ -290,6 +297,7 @@ public:
// Construct an empty entry: type NONE, no compound filter, and a null
// payload pointer in the union.
Entry::Entry() {
    entryType = NONE;
    compoundFilter = NULL;
    u.prototype = 0;
}
@ -303,6 +311,7 @@ Entry::~Entry() {
// invalidates any RBTs that the user has instantiated.
delete u.data;
}
delete compoundFilter;
}
void Entry::adoptPrototype(Transliterator* adopted) {
@ -906,7 +915,7 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
} else if (entry->entryType == Entry::COMPOUND_RBT) {
UnicodeString id("_", "");
Transliterator *t = new RuleBasedTransliterator(id, entry->u.data);
aliasReturn = new TransliteratorAlias(ID, entry->stringArg, t, entry->intArg);
aliasReturn = new TransliteratorAlias(ID, entry->stringArg, t, entry->intArg, entry->compoundFilter);
return 0;
}
@ -935,13 +944,9 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
// transliterators; if it lists something that's not
// installed, we'll get an error from ResourceBundle.
TransliteratorParser::parse(rules, isReverse ?
UTRANS_REVERSE : UTRANS_FORWARD,
entry->u.data,
entry->stringArg,
entry->intArg,
parseError,
status);
TransliteratorParser parser;
parser.parse(rules, isReverse ? UTRANS_REVERSE : UTRANS_FORWARD,
parseError, status);
if (U_FAILURE(status)) {
// We have a failure of some kind. Remove the ID from the
@ -954,6 +959,11 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
break;
}
entry->u.data = parser.orphanData();
entry->stringArg = parser.idBlock;
entry->intArg = parser.idSplitPoint;
entry->compoundFilter = parser.orphanCompoundFilter();
// Reset entry->entryType to something that we process at the
// top of the loop, then loop back to the top. As long as we
// do this, we only loop through twice at most.

View file

@ -45,7 +45,8 @@ class TransliteratorAlias {
* Construct a compound RBT alias.
*/
TransliteratorAlias(const UnicodeString& ID, const UnicodeString& idBlock,
Transliterator* adopted, int32_t idSplitPoint);
Transliterator* adopted, int32_t idSplitPoint,
const UnicodeSet* compoundFilter);
~TransliteratorAlias();
@ -64,10 +65,12 @@ class TransliteratorAlias {
// 2. CompoundRBT
// Here ID is the ID, aliasID is the idBlock, trans is the
// contained RBT, and idSplitPoint is the offset in aliasID
// where the contained RBT goes.
// where the contained RBT goes. compoundFilter is the
// compound filter, and it is _not_ owned.
UnicodeString ID;
UnicodeString aliasID;
Transliterator* trans; // owned
const UnicodeSet* compoundFilter; // alias
int32_t idSplitPoint;
};