diff --git a/icu4c/source/i18n/cpdtrans.cpp b/icu4c/source/i18n/cpdtrans.cpp index f840cf2f85e..5fbfc16658a 100644 --- a/icu4c/source/i18n/cpdtrans.cpp +++ b/icu4c/source/i18n/cpdtrans.cpp @@ -10,6 +10,7 @@ #include "unicode/cpdtrans.h" #include "unicode/unifilt.h" #include "unicode/unifltlg.h" +#include "uvector.h" /** * Constructs a new compound transliterator given an array of @@ -30,7 +31,7 @@ CompoundTransliterator::CompoundTransliterator( int32_t transliteratorCount, UnicodeFilter* adoptedFilter) : Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter), - trans(0), filters(0), count(0) { + trans(0), filters(0), count(0), compoundRBTIndex(-1) { setTransliterators(transliterators, transliteratorCount); } @@ -46,44 +47,142 @@ CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, UnicodeFilter* adoptedFilter, UErrorCode& status) : Transliterator(id, 0), // set filter to 0 here! - trans(0), filters(0) { - init(id, direction, adoptedFilter, status); + trans(0), filters(0), compoundRBTIndex(-1) { + init(id, direction, adoptedFilter, -1, 0, TRUE, status); } CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, UErrorCode& status) : Transliterator(id, 0), // set filter to 0 here! - trans(0), filters(0) { - init(id, UTRANS_FORWARD, 0, status); + trans(0), filters(0), compoundRBTIndex(-1) { + init(id, UTRANS_FORWARD, 0, -1, 0, TRUE, status); } +/** + * Private constructor for compound RBTs. Construct a compound + * transliterator using the given idBlock, with the adoptedTrans + * inserted at the idSplitPoint. 
+ */ +CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID, + const UnicodeString& idBlock, + int32_t idSplitPoint, + Transliterator *adoptedTrans, + UErrorCode& status) : + Transliterator(ID, 0), + trans(0), filters(0), compoundRBTIndex(-1) { + init(idBlock, UTRANS_FORWARD, 0, idSplitPoint, adoptedTrans, FALSE, status); +} + +/** + * Private constructor for Transliterator from a vector of + * transliterators. The vector order is FORWARD, so if dir is REVERSE + * then the vector order will be reversed. + */ +CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID, + UTransDirection dir, + UVector& list, + UErrorCode& status) : + Transliterator(ID, 0), + trans(0), filters(0), compoundRBTIndex(-1) { + init(list, dir, 0, TRUE, status); +} + +/** + * Finish constructing a transliterator: only to be called by + * constructors. Before calling init(), set trans and filter to NULL. + * @param id the id containing ';'-separated entries + * @param direction either FORWARD or REVERSE + * @param adoptedFilter a filter object to be owned by this transliterator. + * May be NULL. + * @param idSplitPoint the index into id at which the + * adoptedSplitTransliterator should be inserted, if there is one, or + * -1 if there is none. + * @param adoptedSplitTransliterator a transliterator to be inserted + * before the entry at offset idSplitPoint in the id string. May be + * NULL to insert no entry. + * @param fixReverseID if TRUE, then reconstruct the ID of reverse + * entries by calling getID() of component entries. Some constructors + * do not require this because they apply a facade ID anyway. 
+ * @param status the error code indicating success or failure + */ void CompoundTransliterator::init(const UnicodeString& id, UTransDirection direction, UnicodeFilter* adoptedFilter, + int32_t idSplitPoint, + Transliterator *adoptedSplitTrans, + UBool fixReverseID, UErrorCode& status) { - if (U_FAILURE(status)) - return; - UnicodeString* list = split(id, ID_DELIM, &count); - trans = new Transliterator*[count]; - for (int32_t i = 0; i < count; ++i) { - trans[i] = createInstance(list[direction==UTRANS_FORWARD ? i : (count-1-i)], - direction); - if (trans[i] == NULL) { - while (++i < count) - trans[i] = 0; - status = U_ILLEGAL_ARGUMENT_ERROR; - delete[] list; - delete adoptedFilter; - return; - } - } - delete[] list; + // assert(trans == 0); + // assert(filters == 0); - // If the direction is UTRANS_REVERSE then we need to fix - // the ID. - if (direction == UTRANS_REVERSE) { + if (U_FAILURE(status)) { + delete adoptedFilter; + delete adoptedSplitTrans; + return; + } + + UVector list; + Transliterator::parseCompoundID(id, direction, + idSplitPoint, adoptedSplitTrans, + list, compoundRBTIndex, + NULL, status); + + init(list, direction, adoptedFilter, fixReverseID, status); +} + +/** + * Finish constructing a transliterator: only to be called by + * constructors. Before calling init(), set trans and filter to NULL. + * @param list a vector of transliterator objects to be adopted. It + * should NOT be empty. The list should be in declared order. That + * is, it should be in the FORWARD order; if direction is REVERSE then + * the list order will be reversed. + * @param direction either FORWARD or REVERSE + * @param adoptedFilter a filter object to be owned by this transliterator. + * May be NULL. + * @param fixReverseID if TRUE, then reconstruct the ID of reverse + * entries by calling getID() of component entries. Some constructors + * do not require this because they apply a facade ID anyway. 
+ * @param status the error code indicating success or failure + */ +void CompoundTransliterator::init(UVector& list, + UTransDirection direction, + UnicodeFilter* adoptedFilter, + UBool fixReverseID, + UErrorCode& status) { + // assert(trans == 0); + // assert(filters == 0); + + // Allocate array + if (U_SUCCESS(status)) { + count = list.size(); + trans = new Transliterator*[count]; + } + + if (U_FAILURE(status) || trans == 0) { + delete adoptedFilter; + // assert(trans == 0); + return; + } + + // Move the transliterators from the vector into an array. + // Reverse the order if necessary. + int32_t i; + for (i=0; i= 0 && direction == UTRANS_REVERSE) { + compoundRBTIndex = count - 1 - compoundRBTIndex; + } + + // If the direction is UTRANS_REVERSE then we may need to fix the + // ID. + if (direction == UTRANS_REVERSE && fixReverseID) { UnicodeString newID; - for (int32_t i=0; i 0) { newID.append(ID_DELIM); } @@ -113,35 +212,35 @@ UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterat return id; // Return temporary } -/** - * Splits a string, as in JavaScript - */ -UnicodeString* CompoundTransliterator::split(const UnicodeString& s, - UChar divider, - int32_t* countPtr) { - // changed MED - // see how many there are - *countPtr = 1; - int32_t i; - for (i = 0; i < s.length(); ++i) { - if (s.charAt(i) == divider) - ++(*countPtr); - } - - // make an array with them - UnicodeString* result = new UnicodeString[*countPtr]; - int32_t last = 0; - int32_t current = 0; - - for (i = 0; i < s.length(); ++i) { - if (s.charAt(i) == divider) { - s.extractBetween(last, i, result[current++]); - last = i+1; - } - } - s.extractBetween(last, i, result[current]); - return result; -} +///** +// * Splits a string, as in JavaScript +// */ +//UnicodeString* CompoundTransliterator::split(const UnicodeString& s, +// UChar divider, +// int32_t* countPtr) { +// // changed MED +// // see how many there are +// *countPtr = 1; +// int32_t i; +// for (i = 0; i < 
s.length(); ++i) { +// if (s.charAt(i) == divider) +// ++(*countPtr); +// } +// +// // make an array with them +// UnicodeString* result = new UnicodeString[*countPtr]; +// int32_t last = 0; +// int32_t current = 0; +// +// for (i = 0; i < s.length(); ++i) { +// if (s.charAt(i) == divider) { +// s.extractBetween(last, i, result[current++]); +// last = i+1; +// } +// } +// s.extractBetween(last, i, result[current]); +// return result; +//} /** * Copy constructor. @@ -301,73 +400,102 @@ void CompoundTransliterator::adoptFilter(UnicodeFilter* f) { Transliterator::adoptFilter(f); } +UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource, + UBool escapeUnprintable) const { + // We do NOT call toRules() on our component transliterators, in + // general. If we have several rule-based transliterators, this + // yields a concatenation of the rules -- not what we want. We do + // handle compound RBT transliterators specially -- those for which + // compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex, + // we do call toRules() recursively. + rulesSource.truncate(0); + for (int32_t i=0; itoRules(rule, escapeUnprintable); + } else { + trans[i]->Transliterator::toRules(rule, escapeUnprintable); + } + if (rulesSource.length() && + rulesSource.charAt(rulesSource.length() - 1) != 10) { + rulesSource.append((UChar)10); + } + rulesSource.append(rule); + if (rulesSource.length() && + rulesSource.charAt(rulesSource.length() - 1) != ID_DELIM) { + rulesSource.append(ID_DELIM); + } + } + return rulesSource; +} + /** * Implements {@link Transliterator#handleTransliterate}. */ void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, UBool incremental) const { - /* Call each transliterator with the same start value and - * initial cursor index, but with the limit index as modified - * by preceding transliterators. 
The cursor index must be + /* Call each transliterator with the same contextStart and + * start, but with the limit as modified + * by preceding transliterators. The start index must be * reset for each transliterator to give each a chance to - * transliterate the text. The initial cursor index is known + * transliterate the text. The initial contextStart index is known * to still point to the same place after each transliterator * is called because each transliterator will not change the - * text between start and the initial value of cursor. + * text between contextStart and the initial start index. * * IMPORTANT: After the first transliterator, each subsequent * transliterator only gets to transliterate text committed by - * preceding transliterators; that is, the cursor (output + * preceding transliterators; that is, the start (output * value) of transliterator i becomes the limit (input value) * of transliterator i+1. Finally, the overall limit is fixed * up before we return. * * Assumptions we make here: - * (1) start <= cursor <= limit ;cursor valid on entry - * (2) cursor <= cursor' <= limit' ;cursor doesn't move back - * (3) cursor <= limit' ;text before cursor unchanged - * - cursor' is the value of cursor after calling handleKT + * (1) contextStart <= start <= limit ;cursor valid on entry + * (2) start <= start' <= limit' ;cursor doesn't move back + * (3) start <= limit' ;text before start unchanged + * - start' is the value of start after calling handleKT * - limit' is the value of limit after calling handleKT */ /** * Example: 3 transliterators. This example illustrates the - * mechanics we need to implement. S, C, and L are the start, - * cursor, and limit. gl is the globalLimit. + * mechanics we need to implement. C, S, and L are the contextStart, + * start, and limit. gl is the globalLimit. * * 1. h-u, changes hex to Unicode * * 4 7 a d 0 4 7 a * abc/u0061/u => abca/u - * S C L S C L gl=f->a + * C S L C S L gl=f->a * * 2. 
upup, changes "x" to "XX" * * 4 7 a 4 7 a * abca/u => abcAA/u - * S CL S C + * C SL C S * L gl=a->b * 3. u-h, changes Unicode to hex * * 4 7 a 4 7 a d 0 3 * abcAA/u => abc/u0041/u0041/u - * S C L S C + * C S L C S * L gl=b->15 * 4. return * * 4 7 a d 0 3 * abc/u0041/u0041/u - * S C L + * C S L */ if (count < 1) { + index.start = index.limit; return; // Short circuit for empty compound transliterators } int32_t i; - int32_t cursor = index.start; - int32_t limit = index.limit; - int32_t globalLimit = limit; + int32_t start = index.start; + int32_t globalLimit = index.limit; /* globalLimit is the overall limit. We keep track of this * since we overwrite index.limit with the previous * index.start. After each transliteration, we update @@ -375,16 +503,16 @@ void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPositi */ for (i=0; ihandleTransliterate(text, index, incremental); // Adjust overall limit for insertions/deletions globalLimit += index.limit - limit; - limit = index.start; // Move limit to end of committed text + index.limit = index.start; // Move limit to end of committed text } - // Cursor is good where it is -- where the last + // Start is good where it is -- where the last // transliterator left it. Limit needs to be put back // where it was, modulo adjustments for deletions/insertions. 
index.limit = globalLimit; diff --git a/icu4c/source/i18n/nortrans.cpp b/icu4c/source/i18n/nortrans.cpp index 45a865f32cd..de6057ad3cb 100644 --- a/icu4c/source/i18n/nortrans.cpp +++ b/icu4c/source/i18n/nortrans.cpp @@ -15,10 +15,10 @@ */ void NormalizationTransliterator::registerIDs() { UErrorCode status = U_ZERO_ERROR; - Transliterator::_registerFactory(UnicodeString("NFC", ""), _createNFC, status); - Transliterator::_registerFactory(UnicodeString("NFKC", ""), _createNFKC, status); - Transliterator::_registerFactory(UnicodeString("NFD", ""), _createNFD, status); - Transliterator::_registerFactory(UnicodeString("NFKD", ""), _createNFKD, status); + Transliterator::_registerFactory(UnicodeString("Any-NFC", ""), _createNFC, status); + Transliterator::_registerFactory(UnicodeString("Any-NFKC", ""), _createNFKC, status); + Transliterator::_registerFactory(UnicodeString("Any-NFD", ""), _createNFD, status); + Transliterator::_registerFactory(UnicodeString("Any-NFKD", ""), _createNFKD, status); } /** diff --git a/icu4c/source/i18n/rbt.cpp b/icu4c/source/i18n/rbt.cpp index be60176a6f8..97eef0373d8 100644 --- a/icu4c/source/i18n/rbt.cpp +++ b/icu4c/source/i18n/rbt.cpp @@ -22,7 +22,7 @@ void RuleBasedTransliterator::_construct(const UnicodeString& rules, data = 0; isDataOwned = TRUE; if (U_SUCCESS(status)) { - data = TransliterationRuleParser::parse(rules, direction, parseError); + data = TransliteratorParser::parse(rules, direction, parseError); if (data == 0) { status = U_ILLEGAL_ARGUMENT_ERROR; } else { @@ -40,6 +40,18 @@ RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, setMaximumContextLength(data->ruleSet.getMaximumContextLength()); } +/** + * Internal constructor. 
+ */ +RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, + TransliterationRuleData* theData, + UBool isDataAdopted) : + Transliterator(id, 0), + data(theData), + isDataOwned(isDataAdopted) { + setMaximumContextLength(data->ruleSet.getMaximumContextLength()); +} + /** * Copy constructor. Since the data object is immutable, we can share * it with other objects -- no need to clone it. diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp index 24a22ef9810..bb1032533b9 100644 --- a/icu4c/source/i18n/rbt_pars.cpp +++ b/icu4c/source/i18n/rbt_pars.cpp @@ -48,7 +48,11 @@ // trailing SymbolTable.SYMBOL_REF character. // private static final char ANCHOR_END = '$'; -const UnicodeString TransliterationRuleParser::gOPERATORS = OPERATORS; +const UnicodeString TransliteratorParser::gOPERATORS = OPERATORS; + +// These are also used in Transliterator::toRules() +static const int32_t ID_TOKEN_LEN = 2; +static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':' //---------------------------------------------------------------------- // BEGIN ParseData @@ -167,14 +171,14 @@ public: UBool anchorStart; UBool anchorEnd; - TransliterationRuleParser& parser; + TransliteratorParser& parser; static const UnicodeString gOperators; //-------------------------------------------------- // Methods - RuleHalf(TransliterationRuleParser& parser); + RuleHalf(TransliteratorParser& parser); ~RuleHalf(); /** @@ -220,7 +224,7 @@ inline int32_t _voidPtr_to_int32(void* x) { const UnicodeString RuleHalf::gOperators = OPERATORS; -RuleHalf::RuleHalf(TransliterationRuleParser& p) : parser(p) { +RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) { cursor = -1; ante = -1; post = -1; @@ -487,24 +491,62 @@ int32_t* RuleHalf::createSegments() const { //---------------------------------------------------------------------- TransliterationRuleData* -TransliterationRuleParser::parse(const UnicodeString& rules, - UTransDirection direction, - UParseError* parseError) { 
- TransliterationRuleParser parser(rules, direction, parseError); - parser.parseRules(); - if (U_FAILURE(parser.status)) { +TransliteratorParser::parse(const UnicodeString& rules, + UTransDirection direction, + UParseError* parseError) { + TransliteratorParser parser(rules, direction, parseError); + UnicodeString idBlock; + int32_t idSplitPoint, count; + parser.parseRules(idBlock, idSplitPoint, count); + if (U_FAILURE(parser.status) || idBlock.length() != 0) { delete parser.data; parser.data = 0; } return parser.data; } +/** + * Parse a given set of rules. Return up to three pieces of + * parsed data. These are the header ::id block, the rule block, + * and the footer ::id block. Any or all of these may be empty. + * If both ::id blocks are empty, idBlockResult is returned as + * the empty string. If there are no rules, the + * TransliterationRuleData result is 0. + * @param ruleDataResult caller owns the pointer stored here. + * May be NULL. + * @param idBlockResult string including semicolons holding the header + * and footer ::id blocks, concatenated. May be empty. + * @param idSplitPointResult offset into idBlockResult at which the + * rule block occurs, i.e. where the footer ::id block begins.
+ */ +void TransliteratorParser::parse(const UnicodeString& rules, + UTransDirection direction, + TransliterationRuleData*& ruleDataResult, + UnicodeString& idBlockResult, + int32_t& idSplitPointResult, + UParseError* parseError, + UErrorCode& ec) { + if (U_FAILURE(ec)) { + ruleDataResult = 0; + return; + } + TransliteratorParser parser(rules, direction, parseError); + int32_t count; + parser.parseRules(idBlockResult, idSplitPointResult, count); + if (U_FAILURE(parser.status) || count == 0) { + delete parser.data; + parser.data = 0; + } + ruleDataResult = parser.data; + ec = parser.status; +} + /** * @param rules list of rules, separated by newline characters * @exception IllegalArgumentException if there is a syntax error in the * rules */ -TransliterationRuleParser::TransliterationRuleParser( +TransliteratorParser::TransliteratorParser( const UnicodeString& theRules, UTransDirection theDirection, UParseError* theParseError) : @@ -515,7 +557,7 @@ TransliterationRuleParser::TransliterationRuleParser( /** * Destructor. */ -TransliterationRuleParser::~TransliterationRuleParser() { +TransliteratorParser::~TransliteratorParser() { delete parseData; } @@ -527,8 +569,11 @@ TransliterationRuleParser::~TransliterationRuleParser() { * @exception IllegalArgumentException if there is a syntax error in the * rules */ -void TransliterationRuleParser::parseRules(void) { +void TransliteratorParser::parseRules(UnicodeString& idBlockResult, + int32_t& idSplitPointResult, + int32_t& ruleCount) { status = U_ZERO_ERROR; + ruleCount = 0; delete data; data = new TransliterationRuleData(status); @@ -543,14 +588,21 @@ void TransliterationRuleParser::parseRules(void) { } determineVariableRange(); + UnicodeString str; // scratch + idBlockResult.truncate(0); + idSplitPointResult = -1; int32_t pos = 0; int32_t limit = rules.length(); + // The mode marks whether we are in the header ::id block, the + // rule block, or the footer ::id block. 
+ // mode == 0: start: rule->1, ::id->0 + // mode == 1: in rules: rule->1, ::id->2 + // mode == 2: in footer ::id block: rule->ERROR, ::id->2 + int32_t mode = 0; while (pos < limit && U_SUCCESS(status)) { UChar c = rules.charAt(pos++); if (Unicode::isWhitespace(c)) { - // Ignore leading whitespace. Note that this is not - // Unicode spaces, but Java spaces -- a subset, - // representing whitespace likely to be seen in code. + // Ignore leading whitespace. continue; } // Skip lines starting with the comment character @@ -561,10 +613,50 @@ void TransliterationRuleParser::parseRules(void) { } continue; // Either fall out or restart with next line } - // We've found the start of a rule. c is its first - // character, and pos points past c. Lexically parse the - // rule into component pieces. - pos = parseRule(--pos, limit); + // We've found the start of a rule or ID. c is its first + // character, and pos points past c. + --pos; + // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 + // chars left.
+ if ((pos + ID_TOKEN_LEN + 1) <= limit && + rules.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) { + pos += ID_TOKEN_LEN; + c = rules.charAt(pos); + while (Unicode::isWhitespace(c) && pos < limit) { + ++pos; + c = rules.charAt(pos); + } + int32_t p = pos; + UBool sawDelim; + Transliterator::parseID(rules, p, sawDelim, direction, NULL, FALSE); + if (p == pos) { + // Invalid ::id + status = U_ILLEGAL_ARGUMENT_ERROR; + } else { + if (mode == 1) { + mode = 2; + idSplitPointResult = idBlockResult.length(); + } + rules.extractBetween(pos, p, str); + idBlockResult.append(str); + if (!sawDelim) { + idBlockResult.append((UChar)0x003B /*;*/); + } + pos = p; + } + } else { + // Parse a rule + pos = parseRule(pos, limit); + if (U_SUCCESS(status)) { + ++ruleCount; + if (mode == 2) { + // ::id in illegal position (because a rule + // occurred after the ::id footer block) + status = U_ILLEGAL_ARGUMENT_ERROR; + } + } + mode = 1; + } } // Convert the set vector to an array @@ -573,7 +665,8 @@ void TransliterationRuleParser::parseRules(void) { // orphanElement removes the given element and shifts all other // elements down. For performance (and code clarity) we work from // the end back to index 0. - for (int32_t i=data->setVariablesLength; i>0; ) { + int32_t i; + for (i=data->setVariablesLength; i>0; ) { --i; data->setVariables[i] = (UnicodeSet*) setVariablesVector.orphanElementAt(i); @@ -582,6 +675,9 @@ void TransliterationRuleParser::parseRules(void) { // Index the rules if (U_SUCCESS(status)) { data->ruleSet.freeze(*data, status); + if (idSplitPointResult < 0) { + idSplitPointResult = idBlockResult.length(); + } } } @@ -598,7 +694,7 @@ void TransliterationRuleParser::parseRules(void) { * indicators. Once it does a lexical breakdown of the rule at pos, it * creates a rule object and adds it to our rule list. 
*/ -int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) { +int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) { // Locate the left side, operator, and right side int32_t start = pos; UChar op = 0; @@ -759,7 +855,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) { * @param rule pattern string * @param start position of first character of current rule */ -int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode, +int32_t TransliteratorParser::syntaxError(int32_t parseErrorCode, const UnicodeString& rule, int32_t start) { if (parseError != 0) { @@ -786,7 +882,7 @@ int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode, * Parse a UnicodeSet out, store it, and return the stand-in character * used to represent it. */ -UChar TransliterationRuleParser::parseSet(const UnicodeString& rule, +UChar TransliteratorParser::parseSet(const UnicodeString& rule, ParsePosition& pos) { UnicodeSet* set = new UnicodeSet(rule, pos, *parseData, status); if (variableNext >= variableLimit) { @@ -804,7 +900,7 @@ UChar TransliterationRuleParser::parseSet(const UnicodeString& rule, * Append the value of the given variable name to the given * UnicodeString. */ -void TransliterationRuleParser::appendVariableDef(const UnicodeString& name, +void TransliteratorParser::appendVariableDef(const UnicodeString& name, UnicodeString& buf) { const UnicodeString* s = (const UnicodeString*) data->variableNames->get(name); if (s == NULL) { @@ -839,7 +935,7 @@ void TransliterationRuleParser::appendVariableDef(const UnicodeString& name, * When done, everything not in the hash is available for use. In practice, * this method may employ some other algorithm for improved speed. 
+ */ -void TransliterationRuleParser::determineVariableRange(void) { +void TransliteratorParser::determineVariableRange(void) { UnicodeRange privateUse(0xE000, 0x1900); // Private use area UnicodeRange* r = privateUse.largestUnusedSubrange(rules); @@ -864,7 +960,7 @@ void TransliterationRuleParser::determineVariableRange(void) { * For example, in the string "abc'hide'h", the 'h' in "hide" will not be * found by a search for 'h'. */ -int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text, +int32_t TransliteratorParser::quotedIndexOf(const UnicodeString& text, int32_t start, int32_t limit, UChar charToFind) { for (int32_t i=start; i= 0) { - UErrorCode status = U_ZERO_ERROR; - t = new CompoundTransliterator(ID, dir, 0, status); - if (U_FAILURE(status)) { - delete t; - t = 0; + UErrorCode status = U_ZERO_ERROR; + return createInstance(ID, dir, -1, NULL, parseError, status); +} + +/** + * Create a transliterator given a compound ID (possibly degenerate, + * with no ID_DELIM). If idSplitPoint >= 0 and adoptedSplitTrans != + * 0, then insert adoptedSplitTrans in the compound ID at offset + * idSplitPoint. Otherwise idSplitPoint should be -1 and + * adoptedSplitTrans should be 0. The resultant transliterator will + * be an atomic (non-compound) transliterator if this is indicated by + * ID. Otherwise it will be a compound transliterator.
+ */ +Transliterator* Transliterator::createInstance(const UnicodeString& ID, + UTransDirection dir, + int32_t idSplitPoint, + Transliterator *adoptedSplitTrans, + UParseError* parseError, + UErrorCode& status) { + if (U_FAILURE(status)) { + return 0; + } + + UVector list; + int32_t ignored; + parseCompoundID(ID, dir, idSplitPoint, adoptedSplitTrans, + list, ignored, parseError, status); + + if (U_FAILURE(status)) { + return 0; + } + + switch (list.size()) { + case 0: + return new NullTransliterator(); + + case 1: + return (Transliterator*) list.elementAt(0); + + default: + return new CompoundTransliterator(ID, dir, list, status); + } +} + +/** + * Returns a Transliterator object constructed from + * the given rule string. This will be a RuleBasedTransliterator, + * if the rule string contains only rules, or a + * CompoundTransliterator, if it contains ID blocks, or a + * NullTransliterator, if it contains ID blocks which parse as + * empty for the given direction. + */ +Transliterator* Transliterator::createFromRules(const UnicodeString& ID, + const UnicodeString& rules, + UTransDirection dir, + UParseError* parseError) { + UnicodeString idBlock; + int32_t idSplitPoint = -1; + TransliterationRuleData *data = 0; + UErrorCode status = U_ZERO_ERROR; + + TransliteratorParser::parse(rules, dir, data, + idBlock, idSplitPoint, + parseError, status); + + if (U_FAILURE(status)) { + delete data; + return 0; + } + + // NOTE: The logic here matches that in _createInstance(). + if (idBlock.length() == 0) { + if (data == 0) { + // No idBlock, no data -- this is just an + // alias for Null + return new NullTransliterator(); + } else { + // No idBlock, data != 0 -- this is an + // ordinary RBT_DATA. + return new RuleBasedTransliterator(ID, data, TRUE); // TRUE == adopt data object } } else { - // 'id' is the ID with the filter pattern removed and with - // whitespace deleted. 
- UnicodeString id(ID); - - // Look for embedded filter pattern - UnicodeSet *filter = 0; - int32_t setStart = id.indexOf((UChar)0x005B /*[*/); - int32_t setLimit; - if (setStart >= 0) { - UErrorCode status = U_ZERO_ERROR; - ParsePosition pos(setStart); - filter = new UnicodeSet(); - filter->applyPattern(id, pos, 0, status); + if (data == 0) { + // idBlock, no data -- this is an alias + Transliterator *t = createInstance(idBlock, dir, parseError); + if (t != 0) { + t->setID(ID); + } + return t; + } else { + // idBlock and data -- this is a compound + // RBT + UnicodeString id("_", ""); + Transliterator *t = new RuleBasedTransliterator(id, data, TRUE); // TRUE == adopt data object + t = new CompoundTransliterator(ID, idBlock, idSplitPoint, + t, status); if (U_FAILURE(status)) { - // There was a parse failure in the filter pattern - delete filter; - return 0; + delete t; + t = 0; } - setLimit = pos.getIndex(); - id.removeBetween(setStart, setLimit); - } - - // Delete whitespace - int32_t i; - for (i=0; i B-A). - // Record the position of the separator. Detect the special - // case of Null, whose inverse is itself. Given an ID with no - // separator "Foo", an abbreviation for "Any-Foo", consider - // the inverse to be "Foo-Any". - int32_t sep = id.indexOf(ID_SEP); - if (id.caseCompare(NullTransliterator::ID, - U_FOLD_CASE_DEFAULT) == 0) { - sep = id.length(); - } else if (dir == UTRANS_REVERSE) { - UnicodeString left; - if (sep >= 0) { - id.extractBetween(0, sep, left); - id.removeBetween(0, sep+1); - } else { - left = UnicodeString("Any", ""); - } - sep = id.length(); - id.append(ID_SEP).append(left); - } else if (sep < 0) { - sep = id.length(); - } - - // The 'alias' parameter is non-empty if _createInstance() - // finds that the given ID refers to an alias. The reason - // _createInstance() doesn't call createInstance() (this - // method) directly is to avoid deadlock. There are other - // ways to do this but this is one of the more efficient ways. 
- UnicodeString alias; - t = _createInstance(id, alias, parseError); - - if (alias.length() > 0) { // assert(t==0) - t = createInstance(alias); - } - - if (t != 0) { - if (filter != 0) { - t->adoptFilter(filter); - id.insert(sep, ID, setStart, setLimit-setStart); - } - t->setID(id); + return t; } } +} + +UnicodeString& Transliterator::toRules(UnicodeString& rulesSource, + UBool escapeUnprintable) const { + // The base class implementation of toRules munges the ID into + // the correct format. That is: foo => ::foo + rulesSource = getID(); + // KEEP in sync with rbt_pars + rulesSource.insert(0, UnicodeString("::", "")); + return rulesSource; +} + +/** + * Parse a compound ID (possibly a degenerate one, containing no + * ID_DELIM). If idSplitPoint >= 0 and adoptedSplitTrans != 0, then + * insert adoptedSplitTrans in the compound ID at offset idSplitPoint. + * Otherwise idSplitPoint should be -1 and adoptedSplitTrans should be + * 0. Return in the result vector the instantiated transliterator + * objects (one of these will be adoptedSplitTrans, if the latter was + * specified). These will be in order of id, so if dir is REVERSE, + * then the caller will have to reverse the order. + * + * @param splitTransIndex output parameter to receive the index in + * 'result' at which the adoptedSplitTrans is stored, or -1 if + * adoptedSplitTrans == 0 + */ +void Transliterator::parseCompoundID(const UnicodeString& id, + UTransDirection dir, + int32_t idSplitPoint, + Transliterator *adoptedSplitTrans, + UVector& result, + int32_t& splitTransIndex, + UParseError* parseError, + UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + + splitTransIndex = -1; + int32_t pos = 0; + int32_t i; + while (pos < id.length()) { + // We compare (pos >= split), not (pos == split), so we can + // skip over whitespace (see below). 
+ if (pos >= idSplitPoint && adoptedSplitTrans != 0) { + splitTransIndex = result.size(); + result.addElement(adoptedSplitTrans); + adoptedSplitTrans = 0; + } + int32_t p = pos; + UBool sawDelimiter; // We ignore this + Transliterator *t = + parseID(id, p, sawDelimiter, dir, parseError, TRUE); + if (p == pos) { + delete t; + status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + pos = p; + // The return value may be NULL when, for instance, creating a + // REVERSE transliterator of ID "Latin-Greek()". + if (t != 0) { + result.addElement(t); + } + } + + // Handle case of idSplitPoint == id.length() + if (pos >= idSplitPoint && adoptedSplitTrans != 0) { + splitTransIndex = result.size(); + result.addElement(adoptedSplitTrans); + adoptedSplitTrans = 0; + } + + if (U_FAILURE(status)) { + for (i=0; i= 0; + if (!sawDelimiter) { + limit = ID.length(); + } + int32_t setStart = ID.indexOf((UChar)0x005B /*[*/, pos); + int32_t setLimit; + if (setStart >= 0 && setStart < limit) { + UErrorCode status = U_ZERO_ERROR; + ParsePosition ppos(setStart); + filter = new UnicodeSet(); + filter->applyPattern(ID, ppos, 0, status); + if (U_FAILURE(status)) { + // There was a parse failure in the filter pattern + delete filter; + return 0; + } + setLimit = ppos.getIndex(); + if (limit < setLimit) { + limit = ID.indexOf(ID_DELIM, setLimit); + sawDelimiter = limit >= 0; + if (!sawDelimiter) { + limit = ID.length(); + } + } + } else { + setStart = setLimit = pos; + } + + // Advance limit past /;?\s*/ + int32_t idLimit = limit; // limit before separator + if (sawDelimiter) { + // assert(limit < ID.length() && ID.charAt(limit) == ID_DELIM); + ++limit; + } + while (limit < ID.length() && u_isspace(ID.charAt(limit))) { + ++limit; + } + + if (!create) { + // TODO Improve performance by scanning the UnicodeSet pattern + // without actually constructing it, if create is FALSE. That + // is, create a method like this one for UnicodeSet. 
+ delete filter; + pos = limit; + return 0; + } + + // 'id' is the ID with the filter pattern removed and with + // whitespace deleted. + UnicodeString id; + ID.extractBetween(pos, setStart, id); + ID.extractBetween(setLimit, idLimit, str); + id.append(str); + + // Delete whitespace + int32_t i; + for (i=0; i B-A). + // Record the position of the separator. Detect the special + // case of Null, whose inverse is itself. Given an ID with no + // separator "Foo", an abbreviation for "Any-Foo", consider + // the inverse to be "Foo-Any". + int32_t sep = id.indexOf(ID_SEP); + if (sep < 0 && id.caseCompare(NullTransliterator::ID, + U_FOLD_CASE_DEFAULT) == 0) { + sep = id.length(); + } else if (dir == UTRANS_REVERSE) { + if (sep >= 0) { + id.extractBetween(0, sep, str); + id.removeBetween(0, sep+1); + } else { + str = UnicodeString("Any", ""); + } + sep = id.length(); + id.append(ID_SEP).append(str); + } else if (sep < 0) { + str = UnicodeString("Any-", ""); + sep = str.length(); + id.insert(0, str); + } + + // The 'alias' parameter is non-empty if _createInstance() + // finds that the given ID refers to an alias. The reason + // _createInstance() doesn't call createInstance() (this + // method) directly is to avoid deadlock. There are other + // ways to do this but this is one of the more efficient ways. + str.truncate(0); + t = _createInstance(id, str /*alias*/, parseError); + + if (str.length() > 0) { + // assert(t==0); + t = createInstance(str, UTRANS_FORWARD, parseError); + } + + if (t != 0) { + if (filter != 0) { + t->adoptFilter(filter); + id.insert(sep, ID, setStart, setLimit-setStart); + } + t->setID(id); + pos = limit; + } + return t; } + /** * Returns a transliterator object given its ID. 
Unlike getInstance(), @@ -661,8 +917,6 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID, Transliterator* Transliterator::_createInstance(const UnicodeString& ID, UnicodeString& aliasReturn, UParseError* parseError) { - UErrorCode status = U_ZERO_ERROR; - if (!cacheInitialized) { initializeCache(); } @@ -672,46 +926,55 @@ Transliterator* Transliterator::_createInstance(const UnicodeString& ID, CacheEntry* entry = (CacheEntry*) cache->get(ID); if (entry == 0) { entry = (CacheEntry*) internalCache->get(ID); + if (entry == 0) { + return 0; // out of memory + } } - TransliterationRuleData* data = 0; + UErrorCode status = U_ZERO_ERROR; - if (entry == 0) { - return 0; - } + for (;;) { + if (entry->entryType == CacheEntry::RBT_DATA) { + return new RuleBasedTransliterator(ID, entry->u.data); + } else if (entry->entryType == CacheEntry::PROTOTYPE) { + return entry->u.prototype->clone(); + } else if (entry->entryType == CacheEntry::ALIAS) { + // We can't call createInstance() here because of deadlock. + aliasReturn = entry->stringArg; + return 0; + } else if (entry->entryType == CacheEntry::FACTORY) { + return entry->u.factory(); + } else if (entry->entryType == CacheEntry::COMPOUND_RBT) { + UnicodeString id("_", ""); + Transliterator *t = new RuleBasedTransliterator(id, entry->u.data); + t = new CompoundTransliterator(ID, entry->stringArg, + entry->intArg, t, status); + if (U_FAILURE(status)) { + delete t; + t = 0; + _unregister(ID); + } + return t; + } - if (entry->entryType == CacheEntry::RBT_DATA) { - data = entry->u.data; - // Fall through to construct transliterator from cached Data object. - } else if (entry->entryType == CacheEntry::PROTOTYPE) { - return entry->u.prototype->clone(); - } else if (entry->entryType == CacheEntry::ALIAS) { - // We can't call createInstance() here because of deadlock. 
- aliasReturn = entry->stringArg; - return 0; - } else if (entry->entryType == CacheEntry::FACTORY) { - return entry->u.factory(); - } else { - // At this point entry type must be either RULES_FORWARD - // or RULES_REVERSE + // At this point entry type must be either RULES_FORWARD or + // RULES_REVERSE. We process the rule data into a + // TransliteratorRuleData object, and possibly also into an + // ::id header and/or footer. Then we modify the cache with + // the parsed data and retry. UBool isReverse = (entry->entryType == CacheEntry::RULES_REVERSE); - + // We use the file name, taken from another resource bundle // 2-d array at static init time, as a locale language. We're // just using the locale mechanism to map through to a file // name; this in no way represents an actual locale. - char *ch; - ch = new char[entry->stringArg.length() + 1]; + char *ch = new char[entry->stringArg.length() + 1]; ch[entry->stringArg.extract(0, 0x7fffffff, ch, "")] = 0; Locale fakeLocale(ch); delete [] ch; - ResourceBundle bundle((char *)0, - fakeLocale, status); - - // Call RBT to parse the rules from the resource bundle - + ResourceBundle bundle((char *)0, fakeLocale, status); UnicodeString rules = bundle.getStringEx(RB_RULE, status); // If the status indicates a failure, then we don't have any @@ -719,42 +982,54 @@ Transliterator* Transliterator::_createInstance(const UnicodeString& ID, // in the root locale should correspond to all the installed // transliterators; if it lists something that's not // installed, we'll get an error from ResourceBundle. - if (U_SUCCESS(status)) { - data = TransliterationRuleParser::parse(rules, isReverse - ? UTRANS_REVERSE - : UTRANS_FORWARD, - parseError); + TransliteratorParser::parse(rules, isReverse ? + UTRANS_REVERSE : UTRANS_FORWARD, + entry->u.data, + entry->stringArg, + entry->intArg, + parseError, + status); - // Double check to see if someone has modified the entry - // since we last looked at it. 
- if (entry->entryType != CacheEntry::RBT_DATA) { - entry->entryType = CacheEntry::RBT_DATA; - entry->u.data = data; + if (U_FAILURE(status)) { + // We have a failure of some kind. Remove the ID from the + // cache so we don't keep trying. NOTE: This will throw off + // anyone who is, at the moment, trying to iterate over the + // available IDs. That's acceptable since we should never + // really get here except under installation, configuration, + // or unrecoverable run time memory failures. + _unregister(ID); + break; + } + + // Reset entry->entryType to something that we process at the + // top of the loop, then loop back to the top. As long as we + // do this, we only loop through twice at most. + // NOTE: The logic here matches that in createFromRules(). + if (entry->stringArg.length() == 0) { + if (entry->u.data == 0) { + // No idBlock, no data -- this is just an + // alias for Null + entry->entryType = CacheEntry::ALIAS; + entry->stringArg = NullTransliterator::ID; } else { - // Oops! Another thread has updated this cache entry - // already to point to a data object. Discard the - // one we just created and use the one in the cache - // instead. - delete data; - data = entry->u.data; + // No idBlock, data != 0 -- this is an + // ordinary RBT_DATA + entry->entryType = CacheEntry::RBT_DATA; + } + } else { + if (entry->u.data == 0) { + // idBlock, no data -- this is an alias + entry->entryType = CacheEntry::ALIAS; + } else { + // idBlock and data -- this is a compound + // RBT + entry->entryType = CacheEntry::COMPOUND_RBT; } } } - if (data != 0) { - return new RuleBasedTransliterator(ID, data); - } else { - // We have a failure of some kind. Remove the ID from the - // cache so we don't keep trying. NOTE: This will throw off - // anyone who is, at the moment, trying to iterate over the - // available IDs. That's acceptable since we should never - // really get here except under installation, configuration, - // or unrecoverable run time memory failures. 
- _unregister(ID); - } - - return 0; + return 0; // failed } // For public consumption @@ -907,10 +1182,11 @@ UChar Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const { (localFilter->contains(c = text.charAt(i)) ? c : (UChar)0xFFFE); } -// TODO Move this into the class -// NO This should remain a C function for os/390 and Solaris Workshop [grhoten] /** * Comparison function for UVector. + * + * Do not make this a class static: This should remain a C function + * for os/390 and Solaris Workshop [grhoten] */ U_CDECL_BEGIN static UBool U_CALLCONV diff --git a/icu4c/source/test/intltest/transtst.cpp b/icu4c/source/test/intltest/transtst.cpp index 02f4be81b3b..6137c8f57b7 100644 --- a/icu4c/source/test/intltest/transtst.cpp +++ b/icu4c/source/test/intltest/transtst.cpp @@ -17,6 +17,7 @@ #include "unicode/hextouni.h" #include "unicode/unitohex.h" #include "unicode/unicode.h" +#include "unicode/uniset.h" #include "unicode/ucnv.h" #include "unicode/ucnv_err.h" @@ -61,6 +62,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec, TESTCASE(26,TestLiberalizedID); TESTCASE(27,TestCreateInstance); TESTCASE(28,TestNormalizationTransliterator); + TESTCASE(29,TestCompoundRBT); default: name = ""; break; } } @@ -1053,93 +1055,6 @@ void TransliteratorTest::TestLiberalizedID(void) { } } -//====================================================================== -// Support methods -//====================================================================== -void TransliteratorTest::expect(const UnicodeString& rules, - const UnicodeString& source, - const UnicodeString& expectedResult) { - UErrorCode status = U_ZERO_ERROR; - Transliterator *t = new RuleBasedTransliterator("", rules, status); - if (U_FAILURE(status)) { - errln("FAIL: Transliterator constructor failed"); - } else { - expect(*t, source, expectedResult); - } - delete t; -} - -void TransliteratorTest::expect(const Transliterator& t, - const UnicodeString& source, - const UnicodeString& 
expectedResult, - const Transliterator& reverseTransliterator) { - expect(t, source, expectedResult); - expect(reverseTransliterator, expectedResult, source); -} - -void TransliteratorTest::expect(const Transliterator& t, - const UnicodeString& source, - const UnicodeString& expectedResult) { - UnicodeString result(source); - t.transliterate(result); - expectAux(t.getID() + ":String", source, result, expectedResult); - - UnicodeString rsource(source); - t.transliterate(rsource); - expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult); - - // Test keyboard (incremental) transliteration -- this result - // must be the same after we finalize (see below). - rsource.remove(); - UTransPosition index={0, 0, 0, 0}; - UnicodeString log; - - for (int32_t i=0; i "); - UErrorCode status = U_ZERO_ERROR; - t.transliterate(rsource, index, source.charAt(i), status); - // Append the string buffer with a vertical bar '|' where - // the committed index is. - UnicodeString left, right; - rsource.extractBetween(0, index.start, left); - rsource.extractBetween(index.start, rsource.length(), right); - log.append(left).append((UChar)PIPE).append(right); - } - - // As a final step in keyboard transliteration, we must call - // transliterate to finish off any pending partial matches that - // were waiting for more input. 
- t.finishTransliteration(rsource, index); - log.append(" => ").append(rsource); - - expectAux(t.getID() + ":Keyboard", log, - rsource == expectedResult, - expectedResult); -} - -void TransliteratorTest::expectAux(const UnicodeString& tag, - const UnicodeString& source, - const UnicodeString& result, - const UnicodeString& expectedResult) { - expectAux(tag, source + " -> " + result, - result == expectedResult, - expectedResult); -} - -void TransliteratorTest::expectAux(const UnicodeString& tag, - const UnicodeString& summary, UBool pass, - const UnicodeString& expectedResult) { - if (pass) { - logln(UnicodeString("(")+tag+") " + prettify(summary)); - } else { - errln(UnicodeString("FAIL: (")+tag+") " - + prettify(summary) - + ", expected " + prettify(expectedResult)); - } -} /* test for Jitterbug 912 */ void TransliteratorTest::TestCreateInstance(){ UParseError *err = 0; @@ -1248,3 +1163,157 @@ void TransliteratorTest::TestNormalizationTransliterator() { delete NFKD; delete NFKC; } + +/** + * Test compound RBT rules. + */ +void TransliteratorTest::TestCompoundRBT(void) { + // Careful with spacing and ';' here: Phrase this exactly + // as toRules() is going to return it. If toRules() changes + // with regard to spacing or ';', then adjust this string. 
+ UnicodeString rule("::Hex-Unicode;\n" + "::Any-Lower;\n" + "a > '.A.';\n" + "b > '.B.';\n" + "::Any[^t]-Upper;", ""); + Transliterator *t = Transliterator::createFromRules("Test", rule); + if (t == 0) { + errln("FAIL: createFromRules failed"); + return; + } + expect(*t, "\\u0043at in the hat, bat on the mat", + "C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t"); + UnicodeString r; + t->toRules(r, TRUE); + if (r == rule) { + logln((UnicodeString)"OK: toRules() => " + r); + } else { + errln((UnicodeString)"FAIL: toRules() => " + r + + ", expected " + rule); + } + delete t; + + // Now test toRules + t = Transliterator::createInstance("Greek-Latin; Latin-Cyrillic"); + if (t == 0) { + errln("FAIL: createInstance failed"); + return; + } + UnicodeString exp("::Greek-Latin;\n::Latin-Cyrillic;"); + t->toRules(r, TRUE); + if (r != exp) { + errln((UnicodeString)"FAIL: toRules() => " + r + + ", expected " + exp); + } else { + logln((UnicodeString)"OK: toRules() => " + r); + } + delete t; + + // Round trip the result of toRules + t = Transliterator::createFromRules("Test", r); + if (t == 0) { + errln("FAIL: createFromRules #2 failed"); + return; + } else { + logln((UnicodeString)"OK: createFromRules(" + r + ") succeeded"); + } + + // Test toRules again + t->toRules(r, TRUE); + if (r != exp) { + errln((UnicodeString)"FAIL: toRules() => " + r + + ", expected " + exp); + } else { + logln((UnicodeString)"OK: toRules() => " + r); + } + + delete t; +} + +//====================================================================== +// Support methods +//====================================================================== +void TransliteratorTest::expect(const UnicodeString& rules, + const UnicodeString& source, + const UnicodeString& expectedResult) { + UErrorCode status = U_ZERO_ERROR; + Transliterator *t = new RuleBasedTransliterator("", rules, status); + if (U_FAILURE(status)) { + errln("FAIL: Transliterator constructor failed"); + } else { + expect(*t, source, expectedResult); + } + 
delete t; +} + +void TransliteratorTest::expect(const Transliterator& t, + const UnicodeString& source, + const UnicodeString& expectedResult, + const Transliterator& reverseTransliterator) { + expect(t, source, expectedResult); + expect(reverseTransliterator, expectedResult, source); +} + +void TransliteratorTest::expect(const Transliterator& t, + const UnicodeString& source, + const UnicodeString& expectedResult) { + UnicodeString result(source); + t.transliterate(result); + expectAux(t.getID() + ":String", source, result, expectedResult); + + UnicodeString rsource(source); + t.transliterate(rsource); + expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult); + + // Test keyboard (incremental) transliteration -- this result + // must be the same after we finalize (see below). + rsource.remove(); + UTransPosition index={0, 0, 0, 0}; + UnicodeString log; + + for (int32_t i=0; i "); + UErrorCode status = U_ZERO_ERROR; + t.transliterate(rsource, index, source.charAt(i), status); + // Append the string buffer with a vertical bar '|' where + // the committed index is. + UnicodeString left, right; + rsource.extractBetween(0, index.start, left); + rsource.extractBetween(index.start, rsource.length(), right); + log.append(left).append((UChar)PIPE).append(right); + } + + // As a final step in keyboard transliteration, we must call + // transliterate to finish off any pending partial matches that + // were waiting for more input. 
+ t.finishTransliteration(rsource, index); + log.append(" => ").append(rsource); + + expectAux(t.getID() + ":Keyboard", log, + rsource == expectedResult, + expectedResult); +} + +void TransliteratorTest::expectAux(const UnicodeString& tag, + const UnicodeString& source, + const UnicodeString& result, + const UnicodeString& expectedResult) { + expectAux(tag, source + " -> " + result, + result == expectedResult, + expectedResult); +} + +void TransliteratorTest::expectAux(const UnicodeString& tag, + const UnicodeString& summary, UBool pass, + const UnicodeString& expectedResult) { + if (pass) { + logln(UnicodeString("(")+tag+") " + prettify(summary)); + } else { + errln(UnicodeString("FAIL: (")+tag+") " + + prettify(summary) + + ", expected " + prettify(expectedResult)); + } +} diff --git a/icu4c/source/test/intltest/transtst.h b/icu4c/source/test/intltest/transtst.h index bf066d34525..1a48be4c17d 100644 --- a/icu4c/source/test/intltest/transtst.h +++ b/icu4c/source/test/intltest/transtst.h @@ -159,6 +159,8 @@ class TransliteratorTest : public IntlTest { void TestNormalizationTransliterator(void); + void TestCompoundRBT(void); + //====================================================================== // Support methods //======================================================================