diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp index 4a9564da5e4..0a54e2ed806 100644 --- a/icu4c/source/i18n/rbt_rule.cpp +++ b/icu4c/source/i18n/rbt_rule.cpp @@ -15,10 +15,10 @@ #include "unicode/unicode.h" #include "cmemory.h" #include "strmatch.h" +#include "strrepl.h" #include "util.h" -static const UChar APOSTROPHE = 0x0027; // '\'' -static const UChar BACKSLASH = 0x005C; // '\' +static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " U_NAMESPACE_BEGIN @@ -40,7 +40,7 @@ const UChar TransliterationRule::ETHER = 0xFFFF; * output; that is, -1 is equivalent to * output.length(). If greater than * output.length() then an exception is thrown. - * @param segs array of UnicodeMatcher corresponding to input pattern + * @param segs array of UnicodeFunctors corresponding to input pattern * segments, or null if there are none. The array itself is adopted, * but the pointers within it are not. * @param segsCount number of elements in segs[] @@ -53,7 +53,7 @@ TransliterationRule::TransliterationRule(const UnicodeString& input, int32_t anteContextPos, int32_t postContextPos, const UnicodeString& outputStr, int32_t cursorPosition, int32_t cursorOffset, - UnicodeMatcher** segs, + UnicodeFunctor** segs, int32_t segsCount, UBool anchorStart, UBool anchorEnd, const TransliterationRuleData* theData, @@ -93,8 +93,6 @@ TransliterationRule::TransliterationRule(const UnicodeString& input, status = U_ILLEGAL_ARGUMENT_ERROR; return; } - this->cursorPos = cursorPosition + cursorOffset; - this->output = outputStr; // We don't validate the segments array. The caller must // guarantee that the segments are well-formed (that is, that // all $n references in the output refer to indices of this @@ -129,6 +127,8 @@ TransliterationRule::TransliterationRule(const UnicodeString& input, postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), FALSE, *data); } + + this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); } /** @@ -139,17 +139,15 @@ TransliterationRule::TransliterationRule(TransliterationRule& other) : key(NULL), postContext(NULL), pattern(other.pattern), - output(other.output), anteContextLength(other.anteContextLength), keyLength(other.keyLength), - cursorPos(other.cursorPos), flags(other.flags), data(other.data) { segments = NULL; segmentsCount = 0; if (other.segmentsCount > 0) { - segments = new UnicodeMatcher*[other.segmentsCount]; + segments = new UnicodeFunctor*[other.segmentsCount]; uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0])); } @@ -162,6 +160,7 @@ TransliterationRule::TransliterationRule(TransliterationRule& other) : if (other.postContext != NULL) { postContext = (StringMatcher*) other.postContext->clone(); } + output = other.output->clone(); } TransliterationRule::~TransliterationRule() { @@ -169,14 +168,7 @@ TransliterationRule::~TransliterationRule() { delete anteContext; delete key; delete postContext; -} - -/** - * Return the position of the cursor within the output string. - * @return a value from 0 to getOutput().length(), inclusive. - */ -int32_t TransliterationRule::getCursorPos(void) const { - return cursorPos; + delete output; } /** @@ -205,7 +197,7 @@ int16_t TransliterationRule::getIndexValue() const { return -1; } UChar32 c = pattern.char32At(anteContextLength); - return (int16_t)(data->lookup(c) == NULL ? (c & 0xFF) : -1); + return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); } /** @@ -346,7 +338,8 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, } } - int32_t lenDelta, keyLimit; +// int32_t lenDelta, keyLimit; + int32_t keyLimit; // ------------------------ Ante Context ------------------------ @@ -354,7 +347,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, // is an outright U_MISMATCH regardless of whether we are // incremental or not. int32_t oText; // offset into 'text' - int32_t newStart = 0; +// int32_t newStart = 0; int32_t minOText; // Note (1): We process text in 16-bit code units, rather than @@ -428,102 +421,10 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, // We have a full match. The key is between pos.start and // keyLimit. - if (segments == NULL) { - text.handleReplaceBetween(pos.start, keyLimit, output); - lenDelta = output.length() - (keyLimit - pos.start); - if (cursorPos >= 0 && cursorPos <= output.length()) { - // Within the output string, the cursor refers to 16-bit code units - newStart = pos.start + cursorPos; - } else { - newStart = pos.start; - int32_t n = cursorPos; - // Outside the output string, cursorPos counts code points - while (n > 0) { - newStart += UTF_CHAR_LENGTH(text.char32At(newStart)); - --n; - } - while (n < 0) { - newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1)); - ++n; - } - } - } else { - /* When there are segments to be copied, use the Replaceable.copy() - * API in order to retain out-of-band data. Copy everything to the - * point after the key, then delete the key. That is, copy things - * into offset + keyLength, then replace offset .. offset + - * keyLength with the empty string. - * - * Minimize the number of calls to Replaceable.replace() and - * Replaceable.copy(). - */ - int32_t dest = keyLimit; // copy new text to here - UnicodeString buf; - int oOutput; // offset into 'output' - for (oOutput=0; oOutputlookupSegmentReference(c); - if (b < 0) { - // Accumulate straight (non-segment) text. - buf.append(c); - } else { - // Insert any accumulated straight text. - if (buf.length() > 0) { - text.handleReplaceBetween(dest, dest, buf); - dest += buf.length(); - buf.remove(); - } - // Copy segment with out-of-band data - StringMatcher* m = (StringMatcher*) segments[b]; - int32_t start = m->getMatchStart(); - int32_t limit = m->getMatchLimit(); - // If there was no match, that means that a quantifier - // matched zero-length. E.g., x (a)* y matched "xy". - if (start >= 0) { - if (start != limit) { - // Adjust indices for segments in post context - // for any inserted text between the key and - // the post context. - if (start >= keyLimit) { - start += dest - keyLimit; - limit += dest - keyLimit; - } - text.copy(start, limit, dest); - dest += limit - start; - } - } - } - oOutput += UTF_CHAR_LENGTH(c); - } - // Insert any accumulated straight text. - if (buf.length() > 0) { - text.handleReplaceBetween(dest, dest, buf); - dest += buf.length(); - } - if (oOutput == cursorPos) { - // Record the position of the cursor - newStart = dest - (keyLimit - pos.start); - } - // Delete the key - buf.remove(); - text.handleReplaceBetween(pos.start, keyLimit, buf); - lenDelta = dest - keyLimit - (keyLimit - pos.start); - // Handle cursor in postContext - if (cursorPos > output.length()) { - newStart = pos.start + (dest - keyLimit); - int32_t n = cursorPos - output.length(); - // cursorPos counts code points - while (n > 0) { - newStart += UTF_CHAR_LENGTH(text.char32At(newStart)); - n--; - } - } - } - + int32_t newStart; + int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); + int32_t lenDelta = newLength - (keyLimit - pos.start); + oText += lenDelta; pos.limit += lenDelta; pos.contextLimit += lenDelta; @@ -532,135 +433,12 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, return U_MATCH; } -/** - * Append a character to a rule that is being built up. To flush - * the quoteBuf to rule, make one final call with isLiteral == TRUE. - * If there is no final character, pass in (UChar32)-1 as c. - * @param rule the string to append the character to - * @param c the character to append, or (UChar32)-1 if none. - * @param isLiteral if true, then the given character should not be - * quoted or escaped. Usually this means it is a syntactic element - * such as > or $ - * @param escapeUnprintable if true, then unprintable characters - * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will - * appear outside of quotes. - * @param quoteBuf a buffer which is used to build up quoted - * substrings. The caller should initially supply an empty buffer, - * and thereafter should not modify the buffer. The buffer should be - * cleared out by, at the end, calling this method with a literal - * character. - */ -void TransliterationRule::appendToRule(UnicodeString& rule, - UChar32 c, - UBool isLiteral, - UBool escapeUnprintable, - UnicodeString& quoteBuf) { - // If we are escaping unprintables, then escape them outside - // quotes. \u and \U are not recognized within quotes. The same - // logic applies to literals, but literals are never escaped. - if (isLiteral || - (escapeUnprintable && ICU_Utility::isUnprintable(c))) { - if (quoteBuf.length() > 0) { - // We prefer backslash APOSTROPHE to double APOSTROPHE - // (more readable, less similar to ") so if there are - // double APOSTROPHEs at the ends, we pull them outside - // of the quote. - - // If the first thing in the quoteBuf is APOSTROPHE - // (doubled) then pull it out. - while (quoteBuf.length() >= 2 && - quoteBuf.charAt(0) == APOSTROPHE && - quoteBuf.charAt(1) == APOSTROPHE) { - rule.append(BACKSLASH).append(APOSTROPHE); - quoteBuf.remove(0, 2); - } - // If the last thing in the quoteBuf is APOSTROPHE - // (doubled) then remove and count it and add it after. - int32_t trailingCount = 0; - while (quoteBuf.length() >= 2 && - quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE && - quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) { - quoteBuf.truncate(quoteBuf.length()-2); - ++trailingCount; - } - if (quoteBuf.length() > 0) { - rule.append(APOSTROPHE); - rule.append(quoteBuf); - rule.append(APOSTROPHE); - quoteBuf.truncate(0); - } - while (trailingCount-- > 0) { - rule.append(BACKSLASH).append(APOSTROPHE); - } - } - if (c != (UChar32)-1) { - if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) { - rule.append(c); - } - } - } - - // Escape ' and '\' and don't begin a quote just for them - else if (quoteBuf.length() == 0 && - (c == APOSTROPHE || c == BACKSLASH)) { - rule.append(BACKSLASH); - rule.append(c); - } - - // Specials (printable ascii that isn't [0-9a-zA-Z]) and - // whitespace need quoting. Also append stuff to quotes if we are - // building up a quoted substring already. - else if (quoteBuf.length() > 0 || - (c >= 0x0021 && c <= 0x007E && - !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || - (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || - (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) || - Unicode::isWhitespace(c)) { - quoteBuf.append(c); - // Double ' within a quote - if (c == APOSTROPHE) { - quoteBuf.append(c); - } - } - - // Otherwise just append - else { - rule.append(c); - } -} - -void TransliterationRule::appendToRule(UnicodeString& rule, - const UnicodeString& text, - UBool isLiteral, - UBool escapeUnprintable, - UnicodeString& quoteBuf) { - for (int32_t i=0; itoPattern(pat, escapeUnprintable), - TRUE, escapeUnprintable, quoteBuf); - } -} - /** * Create a source string that represents this rule. Append it to the * given string. */ UnicodeString& TransliterationRule::toRule(UnicodeString& rule, UBool escapeUnprintable) const { - int32_t i; // Accumulate special characters (and non-specials following them) // into quoteBuf. Append quoteBuf, within single quotes, when @@ -678,67 +456,33 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule, } // Emit the input pattern - appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); + ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); if (emitBraces) { - appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); + ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); } - appendToRule(rule, key, escapeUnprintable, quoteBuf); + ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); if (emitBraces) { - appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); + ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); } - appendToRule(rule, postContext, escapeUnprintable, quoteBuf); + ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); // Emit end anchor if ((flags & ANCHOR_END) != 0) { rule.append((UChar)36/*$*/); } - appendToRule(rule, UNICODE_STRING_SIMPLE(" > "), TRUE, escapeUnprintable, quoteBuf); + ICU_Utility::appendToRule(rule, FORWARD_OP, TRUE, escapeUnprintable, quoteBuf); // Emit the output pattern - // Handle a cursor preceding the output - int32_t cursor = cursorPos; - if (cursor < 0) { - while (cursor++ < 0) { - appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); - } - // Fall through and append '|' below - } + ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), + TRUE, escapeUnprintable, quoteBuf); - for (i=0; ilookupSegmentReference(c); - if (seg < 0) { - appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); - } else { - ++seg; // make 1-based - appendToRule(rule, (UChar)0x20, TRUE, escapeUnprintable, quoteBuf); - rule.append((UChar)0x24 /*$*/); - ICU_Utility::appendNumber(rule, seg, 10, 1); - rule.append((UChar)0x20); - } - } - - // Handle a cursor after the output. Use > rather than >= because - // if cursor == output.length() it is at the end of the output, - // which is the default position, so we need not emit it. - if (cursor > output.length()) { - cursor -= output.length(); - while (cursor-- > 0) { - appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); - } - appendToRule(rule, (UChar) 0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); - } - - appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); + ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); return rule; }