diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp
index 20d38a90026..c25b49da48b 100644
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@@ -12,6 +12,7 @@
#include "rbt_data.h"
#include "unicode/unifilt.h"
#include "unicode/uniset.h"
+#include "unicode/unicode.h"
#include "cmemory.h"
const UChar TransliterationRule::ETHER = 0xFFFF;
@@ -484,24 +485,190 @@ UBool TransliterationRule::charMatches(UChar keyChar, const Replaceable& text,
}
/**
- * Return true if the given key matches the given text. This method
- * accounts for the fact that the key character may represent a character
- * set. Note that the key and text characters may not be interchanged
- * without altering the results.
- * @param keyChar a character in the match key
- * @param textChar a character in the text being transliterated
- * @param data a dictionary of variables mapping Character
- * to UnicodeSet
- * @param filter the filter. Any character for which
- * filter.contains() returns false will not be
- * altered by this transliterator. If filter is
- * null then no filtering is applied.
+ * Append a character to a rule that is being built up.
+ * @param rule the string to append the character to
+ * @param c the character to append
+ * @param isLiteral if true, then the given character should not be
+ * quoted or escaped. Usually this means it is a syntactic element
+ * such as > or $
+ * @param escapeUnprintable if true, then unprintable characters
+ * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will
+ * appear outside of quotes.
+ * @param quoteBuf a buffer which is used to build up quoted
+ * substrings. The caller should initially supply an empty buffer,
+ * and thereafter should not modify the buffer. At the end, the
+ * caller should flush the buffer by calling this method with a
+ * literal character.
*/
-//[ANCHOR]UBool TransliterationRule::charMatches(UChar keyChar, UChar textChar,
-//[ANCHOR] const TransliterationRuleData& data,
-//[ANCHOR] const UnicodeFilter* filter) const {
-//[ANCHOR] const UnicodeSet* set = 0;
-//[ANCHOR] return (filter == 0 || filter->contains(textChar)) &&
-//[ANCHOR] (((set = data.lookupSet(keyChar)) == 0) ?
-//[ANCHOR] keyChar == textChar : set->contains(textChar));
-//[ANCHOR]}
+void TransliterationRule::_appendToRule(UnicodeString& rule,
+                                        UChar32 c,
+                                        UBool isLiteral,
+                                        UBool escapeUnprintable,
+                                        UnicodeString& quoteBuf) {
+    // Literals, and unprintables that are to be escaped, must be
+    // emitted outside quotes (\u and \U are not recognized within
+    // quotes), so flush any pending quoted run before appending c.
+    if (isLiteral ||
+        (escapeUnprintable && UnicodeSet::_isUnprintable(c))) {
+        if (quoteBuf.length() > 0) {
+            rule.append((UChar) 0x0027 /*'*/);
+            rule.append(quoteBuf);
+            rule.append((UChar) 0x0027 /*'*/);
+            quoteBuf.truncate(0);
+        }
+        // Escape only when the caller asked for it; a literal passed
+        // with escapeUnprintable false is appended exactly as given.
+        if (!escapeUnprintable || !UnicodeSet::_escapeUnprintable(rule, c)) {
+            rule.append(c);
+        }
+    }
+
+    // Double ' and \ outside quotes; don't begin a quote just for them
+    else if (quoteBuf.length() == 0 &&
+             (c == (UChar) 0x0027 /*'*/ ||
+              c == (UChar) 0x005C /*\*/)) {
+        rule.append(c);
+        rule.append(c);
+    }
+
+    // Specials (printable ascii that isn't [0-9a-zA-Z]) and
+    // whitespace need quoting.  Also append stuff to quotes if we are
+    // building up a quoted substring already.
+    else if ((c >= 0x0021 && c <= 0x007E &&
+              !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
+                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
+                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
+             Unicode::isWhitespace(c) ||
+             quoteBuf.length() > 0) {
+        quoteBuf.append(c);
+        // Double ' within a quote
+        if (c == (UChar) 0x0027 /*'*/) {
+            quoteBuf.append(c);
+        }
+    }
+
+    // Otherwise just append
+    else {
+        rule.append(c);
+    }
+}
+
+void TransliterationRule::_appendToRule(UnicodeString& rule,
+ const UnicodeString& text,
+ UBool isLiteral,
+ UBool escapeUnprintable,
+ UnicodeString& quoteBuf) {
+ for (int32_t i=0; itoPattern(str, escapeUnprintable),
+ TRUE, escapeUnprintable, quoteBuf);
+ }
+ }
+
+ if (i == nextSeg) {
+ // assert((iseg % 2) == 0);
+ _appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf);
+ }
+
+ if (emitBraces && i == (anteContextLength + keyLength)) {
+ _appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
+ }
+
+ _appendToRule(rule, UnicodeString(" > ", ""), TRUE, escapeUnprintable, quoteBuf);
+
+ // Emit the output pattern
+
+ // Handle a cursor preceding the output
+ int32_t cursor = cursorPos;
+ if (cursor < 0) {
+ while (cursor++ < 0) {
+ _appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
+ }
+ // Fall through and append '|' below
+ }
+
+ for (i=0; i rather than >= because
+ // if cursor == output.length() it is at the end of the output,
+ // which is the default position, so we need not emit it.
+ if (cursor > output.length()) {
+ cursor -= output.length();
+ while (cursor-- > 0) {
+ _appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
+ }
+ _appendToRule(rule, (UChar) 0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
+ }
+
+ _appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
+
+ return rule;
+}
diff --git a/icu4c/source/i18n/rbt_rule.h b/icu4c/source/i18n/rbt_rule.h
index f96f8df1778..517825f5446 100644
--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@@ -362,23 +362,12 @@ public:
const UnicodeFilter* filter) const;
/**
- * Return true if the given key matches the given text. This method
- * accounts for the fact that the key character may represent a character
- * set. Note that the key and text characters may not be interchanged
- * without altering the results.
- * @param keyChar a character in the match key
- * @param textChar a character in the text being transliterated
- * @param data a dictionary of variables mapping Character
- * to UnicodeSet
- * @param filter the filter. Any character for which
- * filter.isIn() returns false will not be
- * altered by this transliterator. If filter is
- * null then no filtering is applied.
+ * Create a rule string that represents this rule object. Append
+ * it to the given string.
*/
-//[ANCHOR] virtual UBool charMatches(UChar keyChar, UChar textChar,
-//[ANCHOR] const TransliterationRuleData& data,
-//[ANCHOR] const UnicodeFilter* filter) const;
-
+ // NOTE(review): the visible end of the definition appends " > ", the
+ // output pattern and a closing ';', then returns the string it appended
+ // to -- presumably pat; confirm against the full definition.
+ virtual UnicodeString& toRule(UnicodeString& pat,
+ const TransliterationRuleData& data,
+ UBool escapeUnprintable) const;
private:
void init(const UnicodeString& input,
@@ -389,6 +378,17 @@ private:
UBool anchorStart, UBool anchorEnd,
UErrorCode& status);
+ /**
+ * Append one character to a rule string that is being built up.
+ * A literal (isLiteral true) is written to rule outside quotes, as
+ * is an unprintable character when escapeUnprintable is true; any
+ * pending contents of quoteBuf are flushed to rule, wrapped in
+ * single quotes, first. Other characters are either appended to
+ * rule directly or accumulated in quoteBuf.
+ */
+ static void _appendToRule(UnicodeString& rule,
+ UChar32 c,
+ UBool isLiteral,
+ UBool escapeUnprintable,
+ UnicodeString& quoteBuf);
+
+ /**
+ * String overload of _appendToRule.
+ * NOTE(review): presumably applies the single-character overload
+ * to each character of text in order -- confirm against the
+ * definition.
+ */
+ static void _appendToRule(UnicodeString& rule,
+ const UnicodeString& text,
+ UBool isLiteral,
+ UBool escapeUnprintable,
+ UnicodeString& quoteBuf);
};
#endif
diff --git a/icu4c/source/i18n/rbt_set.cpp b/icu4c/source/i18n/rbt_set.cpp
index 7ce5dfddf26..619cefdeb44 100644
--- a/icu4c/source/i18n/rbt_set.cpp
+++ b/icu4c/source/i18n/rbt_set.cpp
@@ -277,3 +277,21 @@ TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
}
return NULL;
}
+
+/**
+ * Create rule strings that represent this rule set.
+ */
+UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
+ const TransliterationRuleData& data,
+ UBool escapeUnprintable) const {
+ int32_t i;
+ int32_t count = index[256];
+ ruleSource.truncate(0);
+ for (i=0; itoRule(ruleSource, data, escapeUnprintable);
+ }
+ return ruleSource;
+}
diff --git a/icu4c/source/i18n/rbt_set.h b/icu4c/source/i18n/rbt_set.h
index d302bc84042..48054f830a6 100644
--- a/icu4c/source/i18n/rbt_set.h
+++ b/icu4c/source/i18n/rbt_set.h
@@ -156,5 +156,14 @@ public:
const TransliterationRuleData& data,
UBool& isPartial,
const UnicodeFilter* filter) const;
+
+ /**
+ * Create rule strings that represent this rule set.
+ * @param result string to receive the rule strings. Current
+ * contents will be deleted.
+ * @param data passed on to toRule() for the rules in this set
+ * @param escapeUnprintable passed on to toRule() for the rules
+ * in this set
+ * @return a reference to result
+ */
+ virtual UnicodeString& toRules(UnicodeString& result,
+ const TransliterationRuleData& data,
+ UBool escapeUnprintable) const;
};
#endif