diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp index 21cc093a84c..6c6aedbc6a1 100644 --- a/icu4c/source/i18n/rbt_pars.cpp +++ b/icu4c/source/i18n/rbt_pars.cpp @@ -1091,8 +1091,12 @@ int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) { if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 || right->segments != NULL || left->maxRef >= 0 || (right->cursorOffset != 0 && right->cursor < 0) || - (right->cursorOffset > (left->text.length() - left->post)) || - (-right->cursorOffset > left->ante) || + // - The following two checks were used to ensure that the + // - cursor offset stayed within the ante- or postcontext. + // - However, with the addition of quantifiers, we have to + // - allow arbitrary cursor offsets and do runtime checking. + //(right->cursorOffset > (left->text.length() - left->post)) || + //(-right->cursorOffset > left->ante) || right->anchorStart || right->anchorEnd) { return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rule, start); diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp index 7db91044170..bd9fce1c323 100644 --- a/icu4c/source/i18n/rbt_rule.cpp +++ b/icu4c/source/i18n/rbt_rule.cpp @@ -68,67 +68,7 @@ TransliterationRule::TransliterationRule(const UnicodeString& input, const TransliterationRuleData& theData, UErrorCode& status) : data(theData) { - init(input, anteContextPos, postContextPos, - outputStr, cursorPosition, cursorOffset, adoptedSegs, - anchorStart, anchorEnd, status); -} -/** - * Construct a new rule with the given input, output text, and other - * attributes. A cursor position may be specified for the output text. - * @param input input string, including key and optional ante and - * post context - * @param anteContextPos offset into input to end of ante context, or -1 if - * none. Must be <= input.length() if not -1. - * @param postContextPos offset into input to start of post context, or -1 - * if none. 
Must be <= input.length() if not -1, and must be >= - * anteContextPos. - * @param output output string - * @param cursorPosition offset into output at which cursor is located, or -1 if - * none. If less than zero, then the cursor is placed after the - * output; that is, -1 is equivalent to - * output.length(). If greater than - * output.length() then an exception is thrown. - */ -TransliterationRule::TransliterationRule(const UnicodeString& input, - int32_t anteContextPos, int32_t postContextPos, - const UnicodeString& outputStr, - int32_t cursorPosition, - const TransliterationRuleData& theData, - UErrorCode& status) : - data(theData) { - init(input, anteContextPos, postContextPos, - outputStr, cursorPosition, 0, NULL, FALSE, FALSE, status); -} - -/** - * Copy constructor. - */ -TransliterationRule::TransliterationRule(TransliterationRule& other) : - pattern(other.pattern), - output(other.output), - anteContextLength(other.anteContextLength), - keyLength(other.keyLength), - cursorPos(other.cursorPos), - flags(other.flags), - firstKeySeg(other.firstKeySeg), - data(other.data) { - - segments = 0; - if (other.segments != 0) { - int32_t len = SEGMENTS_LEN; - segments = new int32_t[len]; - uprv_memcpy(segments, other.segments, len*sizeof(segments[0])); - } -} - -void TransliterationRule::init(const UnicodeString& input, - int32_t anteContextPos, int32_t postContextPos, - const UnicodeString& outputStr, - int32_t cursorPosition, int32_t cursorOffset, - int32_t* adoptedSegs, - UBool anchorStart, UBool anchorEnd, - UErrorCode& status) { if (U_FAILURE(status)) { return; } @@ -193,6 +133,27 @@ void TransliterationRule::init(const UnicodeString& input, } } +/** + * Copy constructor. 
+ */ +TransliterationRule::TransliterationRule(TransliterationRule& other) : + pattern(other.pattern), + output(other.output), + anteContextLength(other.anteContextLength), + keyLength(other.keyLength), + cursorPos(other.cursorPos), + flags(other.flags), + firstKeySeg(other.firstKeySeg), + data(other.data) { + + segments = 0; + if (other.segments != 0) { + int32_t len = SEGMENTS_LEN; + segments = new int32_t[len]; + uprv_memcpy(segments, other.segments, len*sizeof(segments[0])); + } +} + TransliterationRule::~TransliterationRule() { delete[] segments; } @@ -326,6 +287,18 @@ UBool TransliterationRule::masks(const TransliterationRule& r2) const { 0 == r2.pattern.compare(left2 - left, len, pattern); } +inline int32_t posBefore(const Replaceable& str, int32_t pos) { + return (pos > 0) ? + pos - UTF_CHAR_LENGTH(str.char32At(pos-1)) : + pos - 1; +} + +inline int32_t posAfter(const Replaceable& str, int32_t pos) { + return (pos < str.length()) ? + pos + UTF_CHAR_LENGTH(str.char32At(pos)) : + pos + 1; +} + /** * Attempt a match and replacement at the given position. Return * the degree of match between this rule and the given text. The @@ -385,16 +358,13 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, // A mismatch in the ante context, or with the start anchor, // is an outright U_MISMATCH regardless of whether we are // incremental or not. 
- int32_t cursor = pos.start; + int32_t cursor; int32_t newStart = 0; + int32_t minCursor; int32_t i; // Backup cursor by one - if (cursor > 0) { - cursor -= UTF_CHAR_LENGTH(text.char32At(cursor-1)); - } else { - --cursor; - } + cursor = posBefore(text, pos.start); for (i=anteContextLength-1; i>=0; --i) { UChar keyChar = pattern.charAt(i); @@ -415,10 +385,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, goto exit; } } - if (cursorPos == (i - anteContextLength)) { - // Record the position of the cursor - newStart = cursor; - } while (nextSegPos == i) { segPos[iSeg] = cursor; if (cursor >= 0) { @@ -430,9 +396,11 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, } } + minCursor = posAfter(text, cursor); + // ------------------------ Start Anchor ------------------------ - if ((flags & ANCHOR_START) && cursor != (pos.contextStart-1)) { + if ((flags & ANCHOR_START) && cursor != posBefore(text, pos.contextStart)) { m = U_MISMATCH; goto exit; } @@ -513,8 +481,18 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, if (segments == NULL) { text.handleReplaceBetween(pos.start, keyLimit, output); lenDelta = output.length() - (keyLimit - pos.start); - if (cursorPos >= 0) { - newStart = pos.start + cursorPos; + newStart = pos.start; + int32_t n = cursorPos; + // cursorPos counts 16-bit code units + while (n > 0) { + int32_t l = UTF_CHAR_LENGTH(text.char32At(newStart)); + n -= l; + newStart += l; + } + while (n < 0) { + int32_t l = UTF_CHAR_LENGTH(text.char32At(newStart-1)); + n += l; + newStart -= l; } } else { /* When there are segments to be copied, use the Replaceable.copy() @@ -567,11 +545,23 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, buf.remove(); text.handleReplaceBetween(pos.start, keyLimit, buf); lenDelta = dest - keyLimit - (keyLimit - pos.start); + // Handle cursor in postContext + if (cursorPos > output.length()) { + newStart = pos.start + (dest - keyLimit); + int32_t n = 
cursorPos - output.length(); + // cursorPos counts 16-bit code units + while (n > 0) { + int32_t l = UTF_CHAR_LENGTH(text.char32At(newStart)); + n -= l; + newStart += l; + } + } } pos.limit += lenDelta; pos.contextLimit += lenDelta; - pos.start = newStart; + // Restrict new value of start to [minCursor, pos.limit]. + pos.start = uprv_max(minCursor, uprv_min(pos.limit, newStart)); m = U_MATCH; exit: diff --git a/icu4c/source/i18n/rbt_rule.h b/icu4c/source/i18n/rbt_rule.h index bf04b31a0b2..e97f6f525b0 100644 --- a/icu4c/source/i18n/rbt_rule.h +++ b/icu4c/source/i18n/rbt_rule.h @@ -158,30 +158,6 @@ public: const TransliterationRuleData& data, UErrorCode& status); - /** - * Construct a new rule with the given input, output text, and other - * attributes. A cursor position may be specified for the output text. - * @param input input string, including key and optional ante and - * post context - * @param anteContextPos offset into input to end of ante context, or -1 if - * none. Must be <= input.length() if not -1. - * @param postContextPos offset into input to start of post context, or -1 - * if none. Must be <= input.length() if not -1, and must be >= - * anteContextPos. - * @param output output string - * @param cursorPosition offset into output at which cursor is located, or -1 if - * none. If less than zero, then the cursor is placed after the - * output; that is, -1 is equivalent to - * output.length(). If greater than - * output.length() then an exception is thrown. - */ - TransliterationRule(const UnicodeString& input, - int32_t anteContextPos, int32_t postContextPos, - const UnicodeString& outputStr, - int32_t cursorPosition, - const TransliterationRuleData& data, - UErrorCode& status); - /** * Copy constructor. 
*/ @@ -268,16 +244,6 @@ public: */ virtual UnicodeString& toRule(UnicodeString& pat, UBool escapeUnprintable) const; - private: - - void init(const UnicodeString& input, - int32_t anteContextPos, int32_t postContextPos, - const UnicodeString& output, - int32_t cursorPos, int32_t cursorOffset, - int32_t* adoptedSegs, - UBool anchorStart, UBool anchorEnd, - UErrorCode& status); - private: friend class StringMatcher;