diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp index 20d38a90026..c25b49da48b 100644 --- a/icu4c/source/i18n/rbt_rule.cpp +++ b/icu4c/source/i18n/rbt_rule.cpp @@ -12,6 +12,7 @@ #include "rbt_data.h" #include "unicode/unifilt.h" #include "unicode/uniset.h" +#include "unicode/unicode.h" #include "cmemory.h" const UChar TransliterationRule::ETHER = 0xFFFF; @@ -484,24 +485,190 @@ UBool TransliterationRule::charMatches(UChar keyChar, const Replaceable& text, } /** - * Return true if the given key matches the given text. This method - * accounts for the fact that the key character may represent a character - * set. Note that the key and text characters may not be interchanged - * without altering the results. - * @param keyChar a character in the match key - * @param textChar a character in the text being transliterated - * @param data a dictionary of variables mapping Character - * to UnicodeSet - * @param filter the filter. Any character for which - * filter.contains() returns false will not be - * altered by this transliterator. If filter is - * null then no filtering is applied. + * Append a character to a rule that is being built up. + * @param rule the string to append the character to + * @param c the character to append + * @param isLiteral if true, then the given character should not be + * quoted or escaped. Usually this means it is a syntactic element + * such as > or $ + * @param escapeUnprintable if true, then unprintable characters + * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will + * appear outside of quotes. + * @param quoteBuf a buffer which is used to build up quoted + * substrings. The caller should initially supply an empty buffer, + * and thereafter should not modify the buffer. At the end, the + * caller should flush the buffer by calling this method with a + * literal character. 
*/ -//[ANCHOR]UBool TransliterationRule::charMatches(UChar keyChar, UChar textChar, -//[ANCHOR] const TransliterationRuleData& data, -//[ANCHOR] const UnicodeFilter* filter) const { -//[ANCHOR] const UnicodeSet* set = 0; -//[ANCHOR] return (filter == 0 || filter->contains(textChar)) && -//[ANCHOR] (((set = data.lookupSet(keyChar)) == 0) ? -//[ANCHOR] keyChar == textChar : set->contains(textChar)); -//[ANCHOR]} +void TransliterationRule::_appendToRule(UnicodeString& rule, + UChar32 c, + UBool isLiteral, + UBool escapeUnprintable, + UnicodeString& quoteBuf) { + // If we are escaping unprintables, then escape them outside + // quotes. \u and \U are not recognized within quotes. The same + // logic applies to literals, but literals are never escaped. + if (isLiteral || + (escapeUnprintable && UnicodeSet::_isUnprintable(c))) { + if (quoteBuf.length() > 0) { + rule.append((UChar) 0x0027 /*'*/); + rule.append(quoteBuf); + rule.append((UChar) 0x0027 /*'*/); + quoteBuf.truncate(0); + } + if (!UnicodeSet::_escapeUnprintable(rule, c)) { + // Literals should be printable and should get appended + // here. + rule.append(c); + } + } + + // Double ' and '\' and don't begin a quote just for them + else if (quoteBuf.length() == 0 && + (c == (UChar) 0x0027 /*'*/ || + c == (UChar) 0x005C /*\*/)) { + rule.append(c); + rule.append(c); + } + + // Specials (printable ascii that isn't [0-9a-zA-Z]) and + // whitespace need quoting. Also append stuff to quotes if we are + // building up a quoted substring already. 
+ else if ((c >= 0x0021 && c <= 0x007E && + !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || + (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || + (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) || + Unicode::isWhitespace(c) || + quoteBuf.length() > 0) { + quoteBuf.append(c); + // Double ' within a quote + if (c == (UChar) 0x0027 /*'*/) { + quoteBuf.append(c); + } + } + + // Otherwise just append + else { + rule.append(c); + } +} + +void TransliterationRule::_appendToRule(UnicodeString& rule, + const UnicodeString& text, + UBool isLiteral, + UBool escapeUnprintable, + UnicodeString& quoteBuf) { + for (int32_t i=0; itoPattern(str, escapeUnprintable), + TRUE, escapeUnprintable, quoteBuf); + } + } + + if (i == nextSeg) { + // assert((iseg % 2) == 0); + _appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf); + } + + if (emitBraces && i == (anteContextLength + keyLength)) { + _appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); + } + + _appendToRule(rule, UnicodeString(" > ", ""), TRUE, escapeUnprintable, quoteBuf); + + // Emit the output pattern + + // Handle a cursor preceding the output + int32_t cursor = cursorPos; + if (cursor < 0) { + while (cursor++ < 0) { + _appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); + } + // Fall through and append '|' below + } + + for (i=0; i rather than >= because + // if cursor == output.length() it is at the end of the output, + // which is the default position, so we need not emit it. 
+ if (cursor > output.length()) { + cursor -= output.length(); + while (cursor-- > 0) { + _appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); + } + _appendToRule(rule, (UChar) 0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); + } + + _appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); + + return rule; +} diff --git a/icu4c/source/i18n/rbt_rule.h b/icu4c/source/i18n/rbt_rule.h index f96f8df1778..517825f5446 100644 --- a/icu4c/source/i18n/rbt_rule.h +++ b/icu4c/source/i18n/rbt_rule.h @@ -362,23 +362,12 @@ public: const UnicodeFilter* filter) const; /** - * Return true if the given key matches the given text. This method - * accounts for the fact that the key character may represent a character - * set. Note that the key and text characters may not be interchanged - * without altering the results. - * @param keyChar a character in the match key - * @param textChar a character in the text being transliterated - * @param data a dictionary of variables mapping Character - * to UnicodeSet - * @param filter the filter. Any character for which - * filter.isIn() returns false will not be - * altered by this transliterator. If filter is - * null then no filtering is applied. + * Create a rule string that represents this rule object. Append + * it to the given string. 
*/ -//[ANCHOR] virtual UBool charMatches(UChar keyChar, UChar textChar, -//[ANCHOR] const TransliterationRuleData& data, -//[ANCHOR] const UnicodeFilter* filter) const; - + virtual UnicodeString& toRule(UnicodeString& pat, + const TransliterationRuleData& data, + UBool escapeUnprintable) const; private: void init(const UnicodeString& input, @@ -389,6 +378,17 @@ private: UBool anchorStart, UBool anchorEnd, UErrorCode& status); + static void _appendToRule(UnicodeString& rule, + UChar32 c, + UBool isLiteral, + UBool escapeUnprintable, + UnicodeString& quoteBuf); + + static void _appendToRule(UnicodeString& rule, + const UnicodeString& text, + UBool isLiteral, + UBool escapeUnprintable, + UnicodeString& quoteBuf); }; #endif diff --git a/icu4c/source/i18n/rbt_set.cpp b/icu4c/source/i18n/rbt_set.cpp index 7ce5dfddf26..619cefdeb44 100644 --- a/icu4c/source/i18n/rbt_set.cpp +++ b/icu4c/source/i18n/rbt_set.cpp @@ -277,3 +277,21 @@ TransliterationRuleSet::findIncrementalMatch(const Replaceable& text, } return NULL; } + +/** + * Create rule strings that represent this rule set. + */ +UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource, + const TransliterationRuleData& data, + UBool escapeUnprintable) const { + int32_t i; + int32_t count = index[256]; + ruleSource.truncate(0); + for (i=0; itoRule(ruleSource, data, escapeUnprintable); + } + return ruleSource; +} diff --git a/icu4c/source/i18n/rbt_set.h b/icu4c/source/i18n/rbt_set.h index d302bc84042..48054f830a6 100644 --- a/icu4c/source/i18n/rbt_set.h +++ b/icu4c/source/i18n/rbt_set.h @@ -156,5 +156,14 @@ public: const TransliterationRuleData& data, UBool& isPartial, const UnicodeFilter* filter) const; + + /** + * Create rule strings that represent this rule set. + * @param result string to receive the rule strings. Current + * contents will be deleted. + */ + virtual UnicodeString& toRules(UnicodeString& result, + const TransliterationRuleData& data, + UBool escapeUnprintable) const; }; #endif