diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp
index 21cc093a84c..6c6aedbc6a1 100644
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@@ -1091,8 +1091,12 @@ int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
right->segments != NULL || left->maxRef >= 0 ||
(right->cursorOffset != 0 && right->cursor < 0) ||
- (right->cursorOffset > (left->text.length() - left->post)) ||
- (-right->cursorOffset > left->ante) ||
+ // - The following two checks were used to ensure that
+ // - the cursor offset stayed within the ante- or postcontext.
+ // - However, with the addition of quantifiers, we have to
+ // - allow arbitrary cursor offsets and do runtime checking.
+ //(right->cursorOffset > (left->text.length() - left->post)) ||
+ //(-right->cursorOffset > left->ante) ||
right->anchorStart || right->anchorEnd) {
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rule, start);
diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp
index 7db91044170..bd9fce1c323 100644
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@@ -68,67 +68,7 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
const TransliterationRuleData& theData,
UErrorCode& status) :
data(theData) {
- init(input, anteContextPos, postContextPos,
- outputStr, cursorPosition, cursorOffset, adoptedSegs,
- anchorStart, anchorEnd, status);
-}
-/**
- * Construct a new rule with the given input, output text, and other
- * attributes. A cursor position may be specified for the output text.
- * @param input input string, including key and optional ante and
- * post context
- * @param anteContextPos offset into input to end of ante context, or -1 if
- * none. Must be <= input.length() if not -1.
- * @param postContextPos offset into input to start of post context, or -1
- * if none. Must be <= input.length() if not -1, and must be >=
- * anteContextPos.
- * @param output output string
- * @param cursorPosition offset into output at which cursor is located, or -1 if
- * none. If less than zero, then the cursor is placed after the
- * <code>output</code>; that is, -1 is equivalent to
- * <code>output.length()</code>. If greater than
- * <code>output.length()</code> then an exception is thrown.
- */
-TransliterationRule::TransliterationRule(const UnicodeString& input,
- int32_t anteContextPos, int32_t postContextPos,
- const UnicodeString& outputStr,
- int32_t cursorPosition,
- const TransliterationRuleData& theData,
- UErrorCode& status) :
- data(theData) {
- init(input, anteContextPos, postContextPos,
- outputStr, cursorPosition, 0, NULL, FALSE, FALSE, status);
-}
-
-/**
- * Copy constructor.
- */
-TransliterationRule::TransliterationRule(TransliterationRule& other) :
- pattern(other.pattern),
- output(other.output),
- anteContextLength(other.anteContextLength),
- keyLength(other.keyLength),
- cursorPos(other.cursorPos),
- flags(other.flags),
- firstKeySeg(other.firstKeySeg),
- data(other.data) {
-
- segments = 0;
- if (other.segments != 0) {
- int32_t len = SEGMENTS_LEN;
- segments = new int32_t[len];
- uprv_memcpy(segments, other.segments, len*sizeof(segments[0]));
- }
-}
-
-void TransliterationRule::init(const UnicodeString& input,
- int32_t anteContextPos, int32_t postContextPos,
- const UnicodeString& outputStr,
- int32_t cursorPosition, int32_t cursorOffset,
- int32_t* adoptedSegs,
- UBool anchorStart, UBool anchorEnd,
- UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
@@ -193,6 +133,27 @@ void TransliterationRule::init(const UnicodeString& input,
}
}
+/**
+ * Copy constructor.
+ *
+ * Initializes pattern, output, anteContextLength, keyLength, cursorPos,
+ * flags, firstKeySeg and data from the corresponding members of other.
+ * The segments array is not shared with other: when other.segments is
+ * non-null, a fresh int32_t array of SEGMENTS_LEN elements is allocated
+ * and filled with uprv_memcpy; otherwise segments stays 0.  The
+ * destructor releases the array with delete[].
+ * @param other the rule to copy
+ */
+TransliterationRule::TransliterationRule(TransliterationRule& other) :
+ pattern(other.pattern),
+ output(other.output),
+ anteContextLength(other.anteContextLength),
+ keyLength(other.keyLength),
+ cursorPos(other.cursorPos),
+ flags(other.flags),
+ firstKeySeg(other.firstKeySeg),
+ data(other.data) {
+
+ segments = 0; // default: no segments array owned by this copy
+ if (other.segments != 0) {
+ int32_t len = SEGMENTS_LEN; // NOTE(review): SEGMENTS_LEN is defined outside this view; if it reads this->segments it sees the 0 set above -- confirm
+ segments = new int32_t[len];
+ uprv_memcpy(segments, other.segments, len*sizeof(segments[0])); // byte count = element count * element size
+ }
+}
+
TransliterationRule::~TransliterationRule() {
delete[] segments;
}
@@ -326,6 +287,18 @@ UBool TransliterationRule::masks(const TransliterationRule& r2) const {
0 == r2.pattern.compare(left2 - left, len, pattern);
}
+/**
+ * Return the index that precedes pos in str, stepping back over one
+ * whole code point.  For pos > 0 the step is UTF_CHAR_LENGTH of the
+ * code point reported by str.char32At(pos-1).  For pos <= 0 the text is
+ * not consulted and pos - 1 is returned, so a pos of 0 yields -1.
+ * @param str the text being indexed
+ * @param pos the index to step back from
+ * @return the index before pos
+ */
+inline int32_t posBefore(const Replaceable& str, int32_t pos) {
+    if (pos <= 0) {
+        // Nothing precedes the start of the text; just step back by one.
+        return pos - 1;
+    }
+    return pos - UTF_CHAR_LENGTH(str.char32At(pos-1));
+}
+
+/**
+ * Return the index that follows pos in str, stepping forward over one
+ * whole code point.  For an index inside the text, i.e.
+ * 0 <= pos < str.length(), the step is UTF_CHAR_LENGTH of the code point
+ * reported by str.char32At(pos).  For any other index the text is not
+ * consulted and pos + 1 is returned.
+ * @param str the text being indexed
+ * @param pos the index to step forward from; may be negative
+ * @return the index after pos
+ */
+inline int32_t posAfter(const Replaceable& str, int32_t pos) {
+    // matchAndReplace() passes in a cursor obtained from posBefore(),
+    // which is -1 when matching starts at index 0.  Guard the lower bound
+    // as well as the upper one so char32At() is never asked for a
+    // negative index.
+    if (pos >= 0 && pos < str.length()) {
+        return pos + UTF_CHAR_LENGTH(str.char32At(pos));
+    }
+    return pos + 1;
+}
+
/**
* Attempt a match and replacement at the given position. Return
* the degree of match between this rule and the given text. The
@@ -385,16 +358,13 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
// A mismatch in the ante context, or with the start anchor,
// is an outright U_MISMATCH regardless of whether we are
// incremental or not.
- int32_t cursor = pos.start;
+ int32_t cursor;
int32_t newStart = 0;
+ int32_t minCursor;
int32_t i;
// Backup cursor by one
- if (cursor > 0) {
- cursor -= UTF_CHAR_LENGTH(text.char32At(cursor-1));
- } else {
- --cursor;
- }
+ cursor = posBefore(text, pos.start);
for (i=anteContextLength-1; i>=0; --i) {
UChar keyChar = pattern.charAt(i);
@@ -415,10 +385,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
goto exit;
}
}
- if (cursorPos == (i - anteContextLength)) {
- // Record the position of the cursor
- newStart = cursor;
- }
while (nextSegPos == i) {
segPos[iSeg] = cursor;
if (cursor >= 0) {
@@ -430,9 +396,11 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
}
}
+ minCursor = posAfter(text, cursor);
+
// ------------------------ Start Anchor ------------------------
- if ((flags & ANCHOR_START) && cursor != (pos.contextStart-1)) {
+ if ((flags & ANCHOR_START) && cursor != posBefore(text, pos.contextStart)) {
m = U_MISMATCH;
goto exit;
}
@@ -513,8 +481,18 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
if (segments == NULL) {
text.handleReplaceBetween(pos.start, keyLimit, output);
lenDelta = output.length() - (keyLimit - pos.start);
- if (cursorPos >= 0) {
- newStart = pos.start + cursorPos;
+ newStart = pos.start;
+ int32_t n = cursorPos;
+ // cursorPos counts 16-bit code units
+ while (n > 0) {
+ int32_t l = UTF_CHAR_LENGTH(text.char32At(newStart));
+ n -= l;
+ newStart += l;
+ }
+ while (n < 0) {
+ int32_t l = UTF_CHAR_LENGTH(text.char32At(newStart-1));
+ n += l;
+ newStart -= l;
}
} else {
/* When there are segments to be copied, use the Replaceable.copy()
@@ -567,11 +545,23 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
buf.remove();
text.handleReplaceBetween(pos.start, keyLimit, buf);
lenDelta = dest - keyLimit - (keyLimit - pos.start);
+ // Handle cursor in postContext
+ if (cursorPos > output.length()) {
+ newStart = pos.start + (dest - keyLimit);
+ int32_t n = cursorPos - output.length();
+ // cursorPos counts 16-bit code units
+ while (n > 0) {
+ int32_t l = UTF_CHAR_LENGTH(text.char32At(newStart));
+ n -= l;
+ newStart += l;
+ }
+ }
}
pos.limit += lenDelta;
pos.contextLimit += lenDelta;
- pos.start = newStart;
+ // Restrict new value of start to [minCursor, pos.limit].
+ pos.start = uprv_max(minCursor, uprv_min(pos.limit, newStart));
m = U_MATCH;
exit:
diff --git a/icu4c/source/i18n/rbt_rule.h b/icu4c/source/i18n/rbt_rule.h
index bf04b31a0b2..e97f6f525b0 100644
--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@@ -158,30 +158,6 @@ public:
const TransliterationRuleData& data,
UErrorCode& status);
- /**
- * Construct a new rule with the given input, output text, and other
- * attributes. A cursor position may be specified for the output text.
- * @param input input string, including key and optional ante and
- * post context
- * @param anteContextPos offset into input to end of ante context, or -1 if
- * none. Must be <= input.length() if not -1.
- * @param postContextPos offset into input to start of post context, or -1
- * if none. Must be <= input.length() if not -1, and must be >=
- * anteContextPos.
- * @param output output string
- * @param cursorPosition offset into output at which cursor is located, or -1 if
- * none. If less than zero, then the cursor is placed after the
- * <code>output</code>; that is, -1 is equivalent to
- * <code>output.length()</code>. If greater than
- * <code>output.length()</code> then an exception is thrown.
- */
- TransliterationRule(const UnicodeString& input,
- int32_t anteContextPos, int32_t postContextPos,
- const UnicodeString& outputStr,
- int32_t cursorPosition,
- const TransliterationRuleData& data,
- UErrorCode& status);
-
/**
* Copy constructor.
*/
@@ -268,16 +244,6 @@ public:
*/
virtual UnicodeString& toRule(UnicodeString& pat,
UBool escapeUnprintable) const;
- private:
-
- void init(const UnicodeString& input,
- int32_t anteContextPos, int32_t postContextPos,
- const UnicodeString& output,
- int32_t cursorPos, int32_t cursorOffset,
- int32_t* adoptedSegs,
- UBool anchorStart, UBool anchorEnd,
- UErrorCode& status);
-
private:
friend class StringMatcher;