Transliterator
method
- * getMaximumContextLength()
.
+ * <code>getMaximumContextLength()</code>. Internally, this is
+ * implemented as the anteContextLength, optionally plus one if
+ * there is a start anchor. The one character anchor gap is
+ * needed to make repeated incremental transliteration with
+ * anchors work.
*/
-int32_t TransliterationRule::getAnteContextLength(void) const {
- return anteContextLength;
+int32_t TransliterationRule::getContextLength(void) const {
+ return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);
}
/**
@@ -209,81 +214,16 @@ int32_t TransliterationRule::getAnteContextLength(void) const {
* unless the first character of the key is a set. If it's a
* set, or otherwise can match multiple keys, the index value is -1.
*/
-int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) const {
+int16_t TransliterationRule::getIndexValue() const {
if (anteContextLength == pattern.length()) {
// A pattern with just ante context {such as foo)>bar} can
// match any key.
return -1;
}
- UChar c = pattern.charAt(anteContextLength);
+ UChar32 c = pattern.char32At(anteContextLength);
return (int16_t)(data.lookupSet(c) == NULL ? (c & 0xFF) : -1);
}
-/**
- * Do a replacement of the input pattern with the output text in
- * the given string, at the given offset. This method assumes
- * that a match has already been found in the given text at the
- * given position.
- * @param text the text containing the substring to be replaced
- * @param offset the offset into the text at which the pattern
- * matches. This is the offset to the point after the ante
- * context, if any, and before the match string and any post
- * context.
- * @param data the RuleBasedTransliterator.Data object specifying
- * context for this transliterator.
- * @return the change in the length of the text
- */
-int32_t TransliterationRule::replace(Replaceable& text, int32_t offset,
- const TransliterationRuleData& data) const {
- if (segments == NULL) {
- text.handleReplaceBetween(offset, offset + keyLength, output);
- return output.length() - keyLength;
- } else {
- /* When there are segments to be copied, use the Replaceable.copy()
- * API in order to retain out-of-band data. Copy everything to the
- * point after the key, then delete the key. That is, copy things
- * into offset + keyLength, then replace offset .. offset +
- * keyLength with the empty string.
- *
- * Minimize the number of calls to Replaceable.replace() and
- * Replaceable.copy().
- */
- int32_t textStart = offset - anteContextLength;
- int32_t dest = offset + keyLength; // copy new text to here
- UnicodeString buf;
- for (int32_t i=0; i0 <= start
- * <= limit
.
- * @param limit the ending index, exclusive; start <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
+ * Attempt a match and replacement at the given position. Return
+ * the degree of match between this rule and the given text. The
+ * degree of match may be mismatch, a partial match, or a full
+ * match. A mismatch means at least one character of the text
+ * does not match the context or key. A partial match means some
+ * context and key characters match, but the text is not long
+ * enough to match all of them. A full match means all context
+ * and key characters match.
+ *
+ * If a full match is obtained, perform a replacement, update pos,
+ * and return U_MATCH. Otherwise both text and pos are unchanged.
+ *
+ * @param text the text
+ * @param pos the position indices
+ * @param incremental if TRUE, test for partial matches that may
+ * be completed by additional text inserted at pos.limit.
+ * @return one of <code>U_MISMATCH</code>,
+ * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
+ * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
*/
-UBool TransliterationRule::matches(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const {
- // Match anteContext, key, and postContext
- int32_t cursor = pos.start - anteContextLength;
- // Quick length check; this is a performance win for long rules.
- // Widen by one (on both sides) to allow anchor matching.
- if (cursor < (pos.contextStart - 1) ||
- (cursor + pattern.length()) > (pos.contextLimit + 1)) {
- return FALSE;
- }
- for (int32_t i=0; istart <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
- * @return one of MISMATCH
, PARTIAL_MATCH
, or
- * FULL_MATCH
.
- * @see #MISMATCH
- * @see #PARTIAL_MATCH
- * @see #FULL_MATCH
- */
-int32_t TransliterationRule::getMatchDegree(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const {
- int len = getRegionMatchLength(text, pos, data);
- return len < anteContextLength ? MISMATCH :
- (len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH);
-}
+ // ============================ MATCH ===========================
-/**
- * Return the number of characters of the text that match this rule. If
- * there is a mismatch, return -1. If the text is not long enough to match
- * any characters, return 0.
- * @param text the text, both translated and untranslated
- * @param start the beginning index, inclusive; 0 <= start
- * <= limit
.
- * @param limit the ending index, exclusive; start <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
- * @param data a dictionary of variables mapping Character
- * to UnicodeSet
- * @return -1 if there is a mismatch, 0 if the text is not long enough to
- * match any characters, otherwise the number of characters of text that
- * match this rule.
- */
-int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const {
- int32_t cursor = pos.start - anteContextLength;
- // Quick length check; this is a performance win for long rules.
- // Widen by one to allow anchor matching.
- if (cursor < (pos.contextStart - 1)) {
- return -1;
- }
+ // Record the positions of segments. We assume the following:
+ // - The maximum number of segments is 9.
+ // - The segment indices occur in ascending order. That is,
+ // segment 1 start <= segment 1 limit <= segment 2 start...
+ // - The segments have been validated such that there are no
+ // references to nonexistent segments.
+ // - The end of the segment array is marked by a start of -1.
+ // Currently, the parser enforces all of these constraints.
+ // In the future, the first two constraints may be lifted,
+ // in which case this method will have to be modified.
+
+ int32_t segPos[18];
+ int32_t iSeg = firstKeySeg - 1;
+ int32_t nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
+
+ // ------------------------ Ante Context ------------------------
+
+ // A mismatch in the ante context, or with the start anchor,
+ // is an outright U_MISMATCH regardless of whether we are
+ // incremental or not.
+ int32_t cursor = pos.start - 1;
int32_t i;
- for (i=0; iCharacter
- * to UnicodeSet
- */
-UBool TransliterationRule::charMatches(UChar keyChar, const Replaceable& text,
- int32_t index,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const {
- const UnicodeSet* set = 0;
- UChar textChar = (index >= pos.contextStart && index < pos.contextLimit)
- ? text.charAt(index) : ETHER;
- return ((set = data.lookupSet(keyChar)) == 0) ?
- keyChar == textChar : set->contains(textChar);
+ // ------------------------ Start Anchor ------------------------
+
+ if ((flags & ANCHOR_START) && cursor != (pos.contextStart-1)) {
+ return U_MISMATCH;
+ }
+
+ // -------------------- Key and Post Context --------------------
+
+ iSeg = firstKeySeg;
+ nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
+
+ i = 0;
+ cursor = pos.start;
+ int32_t keyLimit = 0;
+ while (i < (pattern.length() - anteContextLength)) {
+ if (incremental && cursor == pos.contextLimit) {
+ // We've reached the context limit without a mismatch and
+ // without completing our match.
+ return U_PARTIAL_MATCH;
+ }
+ if (cursor == pos.limit && i < keyLength) {
+ // We're still in the pattern key but we're entering the
+ // post context.
+ return U_MISMATCH;
+ }
+ while (i == nextSegPos) {
+ segPos[iSeg] = cursor;
+ nextSegPos = segments[++iSeg];
+ }
+ if (i == keyLength) {
+ keyLimit = cursor;
+ }
+ UChar keyChar = pattern.charAt(anteContextLength + i++);
+ const UnicodeSet* set = data.lookupSet(keyChar);
+ if (set == 0) {
+ // Don't need the cursor < pos.contextLimit check if
+ // incremental is TRUE (because it's done above); do need
+ // it otherwise.
+ if (cursor < pos.contextLimit &&
+ keyChar == text.charAt(cursor)) {
+ ++cursor;
+ } else {
+ return U_MISMATCH;
+ }
+ } else {
+ UMatchDegree m =
+ set->matches(text, cursor, pos.contextLimit, incremental);
+ if (m != U_MATCH) {
+ return m;
+ }
+ }
+ }
+ while (i == nextSegPos) {
+ segPos[iSeg] = cursor;
+ nextSegPos = segments[++iSeg];
+ }
+ if (i == keyLength) {
+ keyLimit = cursor;
+ }
+
+ // ------------------------- Stop Anchor ------------------------
+
+ if ((flags & ANCHOR_END) != 0) {
+ if (cursor != pos.contextLimit) {
+ return U_MISMATCH;
+ }
+ if (incremental) {
+ return U_PARTIAL_MATCH;
+ }
+ }
+
+ // =========================== REPLACE ==========================
+
+ // We have a full match. The key is between pos.start and
+ // keyLimit. Segment indices have been recorded in segPos[].
+ // Perform a replacement.
+
+ int32_t lenDelta = 0;
+
+ if (segments == NULL) {
+ text.handleReplaceBetween(pos.start, keyLimit, output);
+ lenDelta = output.length() - (keyLimit - pos.start);
+ pos.start += cursorPos;
+ } else {
+ /* When there are segments to be copied, use the Replaceable.copy()
+ * API in order to retain out-of-band data. Copy everything to the
+ * point after the key, then delete the key. That is, copy things
+ * into keyLimit, then replace pos.start .. keyLimit with the
+ * empty string.
+ *
+ * Minimize the number of calls to Replaceable.replace() and
+ * Replaceable.copy().
+ */
+ int32_t dest = keyLimit; // copy new text to here
+ UnicodeString buf;
+ for (i=0; igetMatchDegree()
indicating
- * the degree of match between the text and this rule.
- * @see #getMatchDegree
- */
- enum {
- /**
- * Constant returned by getMatchDegree()
- * indicating a mismatch between the text and this rule. One
- * or more characters of the context or key do not match the
- * text.
- */
- MISMATCH,
-
- /**
- * Constant returned by getMatchDegree()
- * indicating a partial match between the text and this rule.
- * All characters of the text match the corresponding context
- * or key, but more characters are required for a complete
- * match. There are some key or context characters at the end
- * of the pattern that remain unmatched because the text isn't
- * long enough.
- */
- PARTIAL_MATCH,
-
- /**
- * Constant returned by getMatchDegree()
- * indicating a complete match between the text and this rule.
- * The text matches all context and key characters.
- */
- FULL_MATCH
- };
-
/**
* The character at index i, where i < contextStart || i >= contextLimit,
* is ETHER. This allows explicit matching by rules and UnicodeSets
@@ -109,6 +77,14 @@ private:
*/
int32_t* segments;
+ /**
+ * A value we compute from segments. The first index into segments[]
+ * that is >= anteContextLength. That is, the first one that is within
+ * the forward scanned part of the pattern -- the key or the postContext.
+ * If there are no segments, this has the value -1.
+ */
+ int32_t firstKeySeg;
+
/**
* The length of the string that must match before the key. If
* zero, then there is no matching requirement before the key.
@@ -130,6 +106,25 @@ private:
*/
int32_t cursorPos;
+ /**
+ * Miscellaneous attributes.
+ */
+ int8_t flags;
+
+ /**
+ * Flag attributes.
+ */
+ enum {
+ ANCHOR_START = 1,
+ ANCHOR_END = 2,
+ };
+
+ /**
+ * A reference to the data for this rule. The data provides
+ * lookup services for matchers and segments.
+ */
+ const TransliterationRuleData& data;
+
public:
/**
@@ -169,6 +164,7 @@ public:
int32_t cursorPosition, int32_t cursorOffset,
int32_t* adoptedSegs,
UBool anchorStart, UBool anchorEnd,
+ const TransliterationRuleData& data,
UErrorCode& status);
/**
@@ -192,6 +188,7 @@ public:
int32_t anteContextPos, int32_t postContextPos,
const UnicodeString& outputStr,
int32_t cursorPosition,
+ const TransliterationRuleData& data,
UErrorCode& status);
/**
@@ -213,9 +210,13 @@ public:
/**
* Return the preceding context length. This method is needed to
* support the Transliterator
method
- * getMaximumContextLength()
.
+ * <code>getMaximumContextLength()</code>. Internally, this is
+ * implemented as the anteContextLength, optionally plus one if
+ * there is a start anchor. The one character anchor gap is
+ * needed to make repeated incremental transliteration with
+ * anchors work.
*/
- virtual int32_t getAnteContextLength(void) const;
+ virtual int32_t getContextLength(void) const;
/**
* Internal method. Returns 8-bit index value for this rule.
@@ -223,24 +224,7 @@ public:
* unless the first character of the key is a set. If it's a
* set, or otherwise can match multiple keys, the index value is -1.
*/
- int16_t getIndexValue(const TransliterationRuleData& data) const;
-
- /**
- * Do a replacement of the input pattern with the output text in
- * the given string, at the given offset. This method assumes
- * that a match has already been found in the given text at the
- * given position.
- * @param text the text containing the substring to be replaced
- * @param offset the offset into the text at which the pattern
- * matches. This is the offset to the point after the ante
- * context, if any, and before the match string and any post
- * context.
- * @param data the RuleBasedTransliterator.Data object specifying
- * context for this transliterator.
- * @return the change in the length of the text
- */
- int32_t replace(Replaceable& text, int32_t offset,
- const TransliterationRuleData& data) const;
+ int16_t getIndexValue() const;
/**
* Internal method. Returns true if this rule matches the given
@@ -252,8 +236,7 @@ public:
* value. If the rule contains only ante context, as in foo)>bar,
* then it will match any key.
*/
- UBool matchesIndexValue(uint8_t v,
- const TransliterationRuleData& data) const;
+ UBool matchesIndexValue(uint8_t v) const;
/**
* Return true if this rule masks another rule. If r1 masks r2 then
@@ -264,88 +247,35 @@ public:
virtual UBool masks(const TransliterationRule& r2) const;
/**
- * Return true if this rule matches the given text.
- * @param text the text, both translated and untranslated
- * @param start the beginning index, inclusive; 0 <= start
- * <= limit
.
- * @param limit the ending index, exclusive; start <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
+ * Attempt a match and replacement at the given position. Return
+ * the degree of match between this rule and the given text. The
+ * degree of match may be mismatch, a partial match, or a full
+ * match. A mismatch means at least one character of the text
+ * does not match the context or key. A partial match means some
+ * context and key characters match, but the text is not long
+ * enough to match all of them. A full match means all context
+ * and key characters match.
+ *
+ * If a full match is obtained, perform a replacement, update pos,
+ * and return U_MATCH. Otherwise both text and pos are unchanged.
+ *
+ * @param text the text
+ * @param pos the position indices
+ * @param incremental if TRUE, test for partial matches that may
+ * be completed by additional text inserted at pos.limit.
+ * @return one of <code>U_MISMATCH</code>,
+ * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
+ * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
*/
- virtual UBool matches(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const;
-
- /**
- * Return the degree of match between this rule and the given text. The
- * degree of match may be mismatch, a partial match, or a full match. A
- * mismatch means at least one character of the text does not match the
- * context or key. A partial match means some context and key characters
- * match, but the text is not long enough to match all of them. A full
- * match means all context and key characters match.
- * @param text the text, both translated and untranslated
- * @param start the beginning index, inclusive; 0 <= start
- * <= limit
.
- * @param limit the ending index, exclusive; start <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
- * @return one of MISMATCH
, PARTIAL_MATCH
, or
- * FULL_MATCH
.
- * @see #MISMATCH
- * @see #PARTIAL_MATCH
- * @see #FULL_MATCH
- */
- virtual int32_t getMatchDegree(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const;
-
- /**
- * Return the number of characters of the text that match this rule. If
- * there is a mismatch, return -1. If the text is not long enough to match
- * any characters, return 0.
- * @param text the text, both translated and untranslated
- * @param start the beginning index, inclusive; 0 <= start
- * <= limit
.
- * @param limit the ending index, exclusive; start <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
- * @param data a dictionary of variables mapping Character
- * to UnicodeSet
- * @return -1 if there is a mismatch, 0 if the text is not long enough to
- * match any characters, otherwise the number of characters of text that
- * match this rule.
- */
- virtual int32_t getRegionMatchLength(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const;
-
- /**
- * Return true if the given key matches the given text. This method
- * accounts for the fact that the key character may represent a character
- * set. Note that the key and text characters may not be interchanged
- * without altering the results.
- * @param keyChar a character in the match key
- * @param textChar a character in the text being transliterated
- * @param data a dictionary of variables mapping Character
- * to UnicodeSet
- */
- virtual UBool charMatches(UChar keyChar, const Replaceable& textChar,
- int32_t index,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const;
+ UMatchDegree matchAndReplace(Replaceable& text,
+ UTransPosition& pos,
+ UBool incremental) const;
/**
* Create a rule string that represents this rule object. Append
* it to the given string.
*/
virtual UnicodeString& toRule(UnicodeString& pat,
- const TransliterationRuleData& data,
UBool escapeUnprintable) const;
private:
diff --git a/icu4c/source/i18n/rbt_set.cpp b/icu4c/source/i18n/rbt_set.cpp
index ecf3663e189..7e737a0f928 100644
--- a/icu4c/source/i18n/rbt_set.cpp
+++ b/icu4c/source/i18n/rbt_set.cpp
@@ -74,7 +74,7 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
ruleVector->addElement(adoptedRule);
int32_t len;
- if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) {
+ if ((len = adoptedRule->getContextLength()) > maxContextLength) {
maxContextLength = len;
}
@@ -92,8 +92,7 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
* That is, freeze()
may be called multiple times,
* although for optimal performance it shouldn't be.
*/
-void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
- UErrorCode& status) {
+void TransliterationRuleSet::freeze(UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
@@ -124,7 +123,7 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
int16_t* indexValue = new int16_t[n];
for (j=0; j0 <= start
- * <= limit
.
- * @param limit the ending index, exclusive; start <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
- * @param data a dictionary mapping variables to the sets they
- * represent (maps Character
to UnicodeSet
)
- * @return the matching rule, or null if none found.
+ * Transliterate the given text with the given UTransPosition
+ * indices. Return TRUE if the transliteration should continue
+ * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
+ * Note that FALSE is only ever returned if incremental is TRUE.
+ * @param text the text to be transliterated
+ * @param pos the position indices, which will be updated
+ * @param incremental if TRUE, assume new text may be inserted
+ * at pos.limit, and return FALSE if there is a partial match.
+ * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
+ * indicating that transliteration should stop until more text
+ * arrives.
*/
-TransliterationRule*
-TransliterationRuleSet::findMatch(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const {
- /* We only need to check our indexed bin of the rule table,
- * based on the low byte of the first key character.
- */
- int16_t x = (int16_t) (text.charAt(pos.start) & 0xFF);
- for (int32_t i=index[x]; ifindMatch()
, this method does an incremental match.
- * An incremental match requires that there be no partial matches that might
- * pre-empt the full match that is found. If there are partial matches,
- * then null is returned. A non-null result indicates that a full match has
- * been found, and that it cannot be pre-empted by a partial match
- * regardless of what additional text is added to the translation buffer.
- * @param text the text, both translated and untranslated
- * @param start the beginning index, inclusive; 0 <= start
- * <= limit
.
- * @param limit the ending index, exclusive; start <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
- * @param data a dictionary mapping variables to the sets they
- * represent (maps Character
to UnicodeSet
)
- * @param partial output parameter. partial[0]
is set to
- * true if a partial match is returned.
- * @return the matching rule, or null if none found, or if the text buffer
- * does not have enough text yet to unambiguously match a rule.
- */
-TransliterationRule*
-TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data,
- UBool& isPartial) const {
-
- /* We only need to check our indexed bin of the rule table,
- * based on the low byte of the first key character.
- */
- isPartial = FALSE;
- int16_t x = (int16_t) (text.charAt(pos.start) & 0xFF);
- for (int32_t i=index[x]; iRuleBasedTransliterator
. This set encodes
- * the transliteration in one direction from one set of characters or short
- * strings to another. A RuleBasedTransliterator
consists of up to
- * two such sets, one for the forward direction, and one for the reverse.
- *
- * A TransliterationRuleSet
has one important operation, that of
- * finding a matching rule at a given point in the text. This is accomplished
- * by the findMatch()
method.
- *
+ * A set of rules for a <code>RuleBasedTransliterator</code>.
* @author Alan Liu
*/
class TransliterationRuleSet {
@@ -98,59 +90,24 @@ public:
* That is, freeze()
may be called multiple times,
* although for optimal performance it shouldn't be.
*/
- virtual void freeze(const TransliterationRuleData& data,
- UErrorCode& status);
-
+ virtual void freeze(UErrorCode& status);
+
/**
- * Attempt to find a matching rule at the specified point in the text.
- * @param text the text, both translated and untranslated
- * @param start the beginning index, inclusive; 0 <= start
- * <= limit
.
- * @param limit the ending index, exclusive; start <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
- * @param data a dictionary mapping variables to the sets they
- * represent (maps Character
to UnicodeSet
)
- * null then no filtering is applied.
- * @return the matching rule, or null if none found.
+ * Transliterate the given text with the given UTransPosition
+ * indices. Return TRUE if the transliteration should continue
+ * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
+ * Note that FALSE is only ever returned if isIncremental is TRUE.
+ * @param text the text to be transliterated
+ * @param index the position indices, which will be updated
+ * @param isIncremental if TRUE, assume new text may be inserted
+ * at index.limit, and return FALSE if there is a partial match.
+ * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
+ * indicating that transliteration should stop until more text
+ * arrives.
*/
- virtual TransliterationRule* findMatch(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data) const;
-
- /**
- * Attempt to find a matching rule at the specified point in the text.
- * Unlike findMatch()
, this method does an incremental match.
- * An incremental match requires that there be no partial matches that might
- * pre-empt the full match that is found. If there are partial matches,
- * then null is returned. A non-null result indicates that a full match has
- * been found, and that it cannot be pre-empted by a partial match
- * regardless of what additional text is added to the translation buffer.
- * @param text the text, both translated and untranslated
- * @param start the beginning index, inclusive; 0 <= start
- * <= limit
.
- * @param limit the ending index, exclusive; start <= limit
- * <= text.length()
.
- * @param cursor position at which to translate next, representing offset
- * into text. This value must be between start
and
- * limit
.
- * @param data a dictionary mapping variables to the sets they
- * represent (maps Character
to UnicodeSet
)
- * @param partial output parameter. partial[0]
is set to
- * true if a partial match is returned.
- * @param filter the filter. Any character for which
- * filter.isIn() returns false will not be
- * altered by this transliterator. If filter is
- * null then no filtering is applied.
- * @return the matching rule, or null if none found, or if the text buffer
- * does not have enough text yet to unambiguously match a rule.
- */
- virtual TransliterationRule* findIncrementalMatch(const Replaceable& text,
- const UTransPosition& pos,
- const TransliterationRuleData& data,
- UBool& isPartial) const;
+ UBool transliterate(Replaceable& text,
+ UTransPosition& index,
+ UBool isIncremental);
/**
* Create rule strings that represents this rule set.
@@ -158,7 +115,6 @@ public:
* contents will be deleted.
*/
virtual UnicodeString& toRules(UnicodeString& result,
- const TransliterationRuleData& data,
UBool escapeUnprintable) const;
};
#endif
diff --git a/icu4c/source/i18n/symtable.h b/icu4c/source/i18n/symtable.h
index c600468289a..1e87a5a4eec 100644
--- a/icu4c/source/i18n/symtable.h
+++ b/icu4c/source/i18n/symtable.h
@@ -44,7 +44,7 @@ public:
* Lookup the UnicodeSet associated with the given character, and
* return it. Return null if not found.
*/
- virtual const UnicodeSet* lookupSet(UChar ch) const = 0;
+ virtual const UnicodeSet* lookupSet(UChar32 ch) const = 0;
/**
* Parse a symbol reference name from the given string, starting
diff --git a/icu4c/source/i18n/translit.cpp b/icu4c/source/i18n/translit.cpp
index dbb5d147598..03a2f91f9c5 100644
--- a/icu4c/source/i18n/translit.cpp
+++ b/icu4c/source/i18n/translit.cpp
@@ -281,10 +281,20 @@ void Transliterator::transliterate(Replaceable& text,
*/
void Transliterator::transliterate(Replaceable& text,
UTransPosition& index,
- UChar insertion,
+ UChar32 insertion,
UErrorCode& status) const {
UnicodeString str(insertion);
- _transliterate(text, index, &str, status);
+ if (UTF_IS_LEAD(insertion)) {
+ // Oops, the caller passed us a single lead surrogate. In
+ // general, we don't support this, but we'll do the caller a
+ // favor in the special case of LEAD followed by TRAIL
+ // insertion. Anything else won't work.
+ text.handleReplaceBetween(index.limit, index.limit, str);
+ ++index.limit;
+ ++index.contextLimit;
+ } else {
+ _transliterate(text, index, &str, status);
+ }
}
/**
@@ -351,8 +361,18 @@ void Transliterator::_transliterate(Replaceable& text,
filteredTransliterate(text, index, TRUE);
- index.contextStart = uprv_max(index.start - getMaximumContextLength(),
- originalStart);
+ // The purpose of the code below is to keep the context small
+ // while doing incremental transliteration. When part of the left
+ // context (between contextStart and start) is no longer needed,
+ // we try to advance contextStart past that portion. We use the
+ // maximum context length to do so.
+ int32_t newCS = index.start;
+ int32_t n = getMaximumContextLength();
+ while (newCS > originalStart && n-- > 0) {
+ --newCS;
+ newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
+ }
+ index.contextStart = uprv_max(newCS, originalStart);
}
/**
diff --git a/icu4c/source/i18n/unifltlg.cpp b/icu4c/source/i18n/unifltlg.cpp
index 3f8bf42efbc..bfd689e65c4 100644
--- a/icu4c/source/i18n/unifltlg.cpp
+++ b/icu4c/source/i18n/unifltlg.cpp
@@ -21,7 +21,7 @@ public:
NullFilter(UBool r) { result = r; }
NullFilter(const NullFilter& f) : UnicodeFilter(f) { result = f.result; }
virtual ~NullFilter() {}
- virtual UBool contains(UChar /*c*/) const { return result; }
+ virtual UBool contains(UChar32 /*c*/) const { return result; }
virtual UnicodeFilter* clone() const { return new NullFilter(*this); }
};
@@ -31,7 +31,7 @@ public:
UnicodeNotFilter(UnicodeFilter* adopted);
UnicodeNotFilter(const UnicodeNotFilter&);
virtual ~UnicodeNotFilter();
- virtual UBool contains(UChar c) const;
+ virtual UBool contains(UChar32 c) const;
virtual UnicodeFilter* clone() const;
};
@@ -39,7 +39,7 @@ UnicodeNotFilter::UnicodeNotFilter(UnicodeFilter* adopted) : filt(adopted) {}
UnicodeNotFilter::UnicodeNotFilter(const UnicodeNotFilter& f)
: UnicodeFilter(f), filt(f.filt->clone()) {}
UnicodeNotFilter::~UnicodeNotFilter() { delete filt; }
-UBool UnicodeNotFilter::contains(UChar c) const { return !filt->contains(c); }
+UBool UnicodeNotFilter::contains(UChar32 c) const { return !filt->contains(c); }
UnicodeFilter* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
/**
@@ -61,7 +61,7 @@ public:
UnicodeAndFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
UnicodeAndFilter(const UnicodeAndFilter&);
virtual ~UnicodeAndFilter();
- virtual UBool contains(UChar c) const;
+ virtual UBool contains(UChar32 c) const;
virtual UnicodeFilter* clone() const;
};
@@ -69,7 +69,7 @@ UnicodeAndFilter::UnicodeAndFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1
UnicodeAndFilter::UnicodeAndFilter(const UnicodeAndFilter& f)
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
UnicodeAndFilter::~UnicodeAndFilter() { delete filt1; delete filt2; }
-UBool UnicodeAndFilter::contains(UChar c) const { return filt1->contains(c) && filt2->contains(c); }
+UBool UnicodeAndFilter::contains(UChar32 c) const { return filt1->contains(c) && filt2->contains(c); }
UnicodeFilter* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
/**
@@ -99,7 +99,7 @@ public:
UnicodeOrFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
UnicodeOrFilter(const UnicodeOrFilter&);
virtual ~UnicodeOrFilter();
- virtual UBool contains(UChar c) const;
+ virtual UBool contains(UChar32 c) const;
virtual UnicodeFilter* clone() const;
};
@@ -107,7 +107,7 @@ UnicodeOrFilter::UnicodeOrFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f
UnicodeOrFilter::UnicodeOrFilter(const UnicodeOrFilter& f)
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
UnicodeOrFilter::~UnicodeOrFilter() { delete filt1; delete filt2; }
-UBool UnicodeOrFilter::contains(UChar c) const { return filt1->contains(c) || filt2->contains(c); }
+UBool UnicodeOrFilter::contains(UChar32 c) const { return filt1->contains(c) || filt2->contains(c); }
UnicodeFilter* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
/**
diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp
index ce46ac24249..d8aaa9971ae 100644
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@@ -542,17 +542,6 @@ UBool UnicodeSet::contains(UChar32 c) const {
return ((i & 1) != 0); // return true if odd
}
-/**
- * Implement UnicodeFilter:
- * Returns true if this set contains the specified char.
- *
- * @return true if this set contains the specified char.
- * @draft
- */
-UBool UnicodeSet::contains(UChar c) const {
- return contains((UChar32) c);
-}
-
/**
* Returns true if this set contains any character whose low byte
* is the given value. This is used by RuleBasedTransliterator for
@@ -581,6 +570,24 @@ UBool UnicodeSet::containsIndexValue(uint8_t v) const {
return FALSE;
}
+/**
+ * Implementation of UnicodeMatcher::matches().
+ */
+UMatchDegree UnicodeSet::matches(const Replaceable& text,
+ int32_t& offset,
+ int32_t limit,
+ UBool incremental) const {
+ if (offset == limit) {
+ if (contains(TransliterationRule::ETHER)) {
+ return incremental ? U_PARTIAL_MATCH : U_MATCH;
+ } else {
+ return U_MISMATCH;
+ }
+ } else {
+ return UnicodeFilter::matches(text, offset, limit, incremental);
+ }
+}
+
/**
* Adds the specified range to this set if it is not already
* present. If this set already contains the specified range,
@@ -895,7 +902,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
UBool invert = FALSE;
clear();
- int32_t lastChar = -1; // This is either a char (0..FFFF) or -1
+ const UChar32 NONE = (UChar32) -1;
+ UChar32 lastChar = NONE; // This is either a char (0..10FFFF) or NONE
UChar lastOp = 0;
/* This loop iterates over the characters in the pattern. We start at
@@ -916,8 +924,9 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
// mode 1: '[' seen; if next is '^' or ':' then special
// mode 2: '[' '^'? seen; parse pattern and close with ']'
// mode 3: '[:' seen; parse category and close with ':]'
+ // mode 4: Pattern closed cleanly
int8_t mode = 0;
- int32_t openPos = 0; // offset to opening '['
+ int32_t colonPos = 0; // Expected pos of ':' in '[:'
int32_t i = pos.getIndex();
int32_t limit = pattern.length();
UnicodeSet nestedAux;
@@ -930,7 +939,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
const UnicodeString* varValueBuffer = NULL;
int32_t ivarValueBuffer = 0;
int32_t anchor = 0;
- for (; i