diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index b23f61306f6..f80f4a4e3c4 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -71,7 +71,8 @@ udat.o umsg.o \ unifltlg.o unirange.o uniset.o unitohex.o unum.o \ dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o brkdict.o nultrans.o jamohang.o hangjamo.o \ remtrans.o utrans.o \ -titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o +titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o \ +unifilt.o STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) diff --git a/icu4c/source/i18n/i18n.dsp b/icu4c/source/i18n/i18n.dsp index ac1682dd392..43443d55481 100644 --- a/icu4c/source/i18n/i18n.dsp +++ b/icu4c/source/i18n/i18n.dsp @@ -318,6 +318,10 @@ SOURCE=.\uni2name.cpp # End Source File # Begin Source File +SOURCE=.\unifilt.cpp +# End Source File +# Begin Source File + SOURCE=.\unifltlg.cpp # End Source File # Begin Source File @@ -1548,6 +1552,25 @@ InputPath=.\unicode\unifltlg.h # End Source File # Begin Source File +SOURCE=.\unicode\unimatch.h + +!IF "$(CFG)" == "i18n - Win32 Release" + +!ELSEIF "$(CFG)" == "i18n - Win32 Debug" + +# Begin Custom Build +InputPath=.\unicode\unimatch.h + +"..\..\include\unicode\unimatch.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy unicode\unimatch.h ..\..\include\unicode + +# End Custom Build + +!ENDIF + +# End Source File +# Begin Source File + SOURCE=.\unirange.h # End Source File # Begin Source File diff --git a/icu4c/source/i18n/rbt.cpp b/icu4c/source/i18n/rbt.cpp index 0882b5db375..f1c36064110 100644 --- a/icu4c/source/i18n/rbt.cpp +++ b/icu4c/source/i18n/rbt.cpp @@ -89,18 +89,18 @@ RuleBasedTransliterator::clone(void) const { void RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, UBool isIncremental) const { - /* We keep start and limit fixed the entire time, - * relative to the text -- limit may move numerically if text is - * inserted or removed. 
The cursor moves from start to limit, with - * replacements happening under it. + /* We keep contextStart and contextLimit fixed the entire time, + * relative to the text -- contextLimit may move numerically if + * text is inserted or removed. The start offset moves toward + * limit, with replacements happening under it. * * Example: rules 1. ab>x|y * 2. yc>z * - * |eabcd start - no match, advance cursor - * e|abcd match rule 1 - change text & adjust cursor - * ex|ycd match rule 2 - change text & adjust cursor - * exz|d no match, advance cursor + * |eabcd begin - no match, advance start + * e|abcd match rule 1 - change text & adjust start + * ex|ycd match rule 2 - change text & adjust start + * exz|d no match, advance start * exzd| done */ @@ -121,39 +121,14 @@ RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& loopLimit <<= 4; } - UBool isPartial = FALSE; - - while (index.start < index.limit && loopCount <= loopLimit) { - TransliterationRule* r = isIncremental ? - data->ruleSet.findIncrementalMatch(text, index, *data, isPartial) : - data->ruleSet.findMatch(text, index, *data); - - /* If we match a rule then apply it by replacing the key - * with the rule output and repositioning the cursor - * appropriately. If we get a partial match, then we - * can't do anything without more text; return with the - * cursor at the current position. If we get null, then - * there is no match at this position, and we can advance - * the cursor. 
- */ - if (r == 0) { - if (isPartial) { // always FALSE unless isIncremental - break; - } else { - ++index.start; - } - } else { - // Delegate replacement to TransliterationRule object - int32_t lenDelta = r->replace(text, index.start, *data); - index.limit += lenDelta; - index.contextLimit += lenDelta; - index.start += r->getCursorPos(); - ++loopCount; - } + while (index.start < index.limit && + loopCount <= loopLimit && + data->ruleSet.transliterate(text, index, isIncremental)) { + ++loopCount; } } UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, UBool escapeUnprintable) const { - return data->ruleSet.toRules(rulesSource, *data, escapeUnprintable); + return data->ruleSet.toRules(rulesSource, escapeUnprintable); } diff --git a/icu4c/source/i18n/rbt_data.cpp b/icu4c/source/i18n/rbt_data.cpp index 626532f7b74..87323237a1c 100644 --- a/icu4c/source/i18n/rbt_data.cpp +++ b/icu4c/source/i18n/rbt_data.cpp @@ -64,13 +64,13 @@ TransliterationRuleData::~TransliterationRuleData() { } const UnicodeSet* -TransliterationRuleData::lookupSet(UChar standIn) const { +TransliterationRuleData::lookupSet(UChar32 standIn) const { int32_t i = standIn - setVariablesBase; return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0; } int32_t -TransliterationRuleData::lookupSegmentReference(UChar c) const { +TransliterationRuleData::lookupSegmentReference(UChar32 c) const { int32_t i = c - segmentBase; return (i >= 0 && i < 9) ? i : -1; } diff --git a/icu4c/source/i18n/rbt_data.h b/icu4c/source/i18n/rbt_data.h index 737df3d9fc6..c81ac9ceda0 100644 --- a/icu4c/source/i18n/rbt_data.h +++ b/icu4c/source/i18n/rbt_data.h @@ -90,14 +90,14 @@ public: ~TransliterationRuleData(); - const UnicodeSet* lookupSet(UChar standIn) const; + const UnicodeSet* lookupSet(UChar32 standIn) const; /** * Return the zero-based index of the segment represented by the given * character, or -1 if none. 
Repeat: This is a zero-based return value, * 0..8, even though these are notated "$1".."$9". */ - int32_t lookupSegmentReference(UChar c) const; + int32_t lookupSegmentReference(UChar32 c) const; /** * Return the character used to stand for the given segment reference. diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp index 1bdd0e1d2a7..61c3a926a23 100644 --- a/icu4c/source/i18n/rbt_pars.cpp +++ b/icu4c/source/i18n/rbt_pars.cpp @@ -75,7 +75,7 @@ public: virtual const UnicodeString* lookup(const UnicodeString& s) const; - virtual const UnicodeSet* lookupSet(UChar ch) const; + virtual const UnicodeSet* lookupSet(UChar32 ch) const; virtual UnicodeString parseReference(const UnicodeString& text, ParsePosition& pos, int32_t limit) const; @@ -95,7 +95,7 @@ const UnicodeString* ParseData::lookup(const UnicodeString& name) const { /** * Implement SymbolTable API. */ -const UnicodeSet* ParseData::lookupSet(UChar ch) const { +const UnicodeSet* ParseData::lookupSet(UChar32 ch) const { // Note that we cannot use data.lookupSet() because the // set array has not been constructed yet. 
const UnicodeSet* set = NULL; @@ -682,7 +682,7 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult, // Index the rules if (U_SUCCESS(status)) { - data->ruleSet.freeze(*data, status); + data->ruleSet.freeze(status); if (idSplitPointResult < 0) { idSplitPointResult = idBlockResult.length(); } @@ -849,6 +849,7 @@ int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) { right->text, right->cursor, right->cursorOffset, left->createSegments(), left->anchorStart, left->anchorEnd, + *data, status), status); return pos; diff --git a/icu4c/source/i18n/rbt_rule.cpp b/icu4c/source/i18n/rbt_rule.cpp index 0709f4ea997..45008a150b0 100644 --- a/icu4c/source/i18n/rbt_rule.cpp +++ b/icu4c/source/i18n/rbt_rule.cpp @@ -52,7 +52,9 @@ TransliterationRule::TransliterationRule(const UnicodeString& input, int32_t cursorPosition, int32_t cursorOffset, int32_t* adoptedSegs, UBool anchorStart, UBool anchorEnd, - UErrorCode& status) { + const TransliterationRuleData& theData, + UErrorCode& status) : + data(theData) { init(input, anteContextPos, postContextPos, outputStr, cursorPosition, cursorOffset, adoptedSegs, anchorStart, anchorEnd, status); @@ -79,7 +81,9 @@ TransliterationRule::TransliterationRule(const UnicodeString& input, int32_t anteContextPos, int32_t postContextPos, const UnicodeString& outputStr, int32_t cursorPosition, - UErrorCode& status) { + const TransliterationRuleData& theData, + UErrorCode& status) : + data(theData) { init(input, anteContextPos, postContextPos, outputStr, cursorPosition, 0, NULL, FALSE, FALSE, status); } @@ -92,7 +96,9 @@ TransliterationRule::TransliterationRule(TransliterationRule& other) : output(other.output), anteContextLength(other.anteContextLength), keyLength(other.keyLength), - cursorPos(other.cursorPos) { + cursorPos(other.cursorPos), + flags(other.flags), + data(other.data) { segments = 0; if (other.segments != 0) { @@ -153,32 +159,27 @@ void TransliterationRule::init(const UnicodeString& input, // We don't 
validate the segments array. The caller must // guarantee that the segments are well-formed. this->segments = adoptedSegs; + // Find the position of the first segment index that is after the + // anteContext (in the key). Note that this may be a start or a + // limit index. + firstKeySeg = -1; + if (segments != 0) { + do { + ++firstKeySeg; + } while (segments[firstKeySeg] >= 0 && + segments[firstKeySeg] < anteContextLength); + if (segments[firstKeySeg] < 0) { + firstKeySeg = -1; + } + } - // Implement anchors by inserting an ETHER character on the - // left or right. If on the left, then the indices must be - // incremented. If on the right, no index change is - // necessary. - if (anchorStart || anchorEnd) { - pattern.truncate(0); - if (anchorStart) { - pattern.append(ETHER); - ++anteContextLength; - // Adjust segment offsets - if (segments != 0) { - int32_t *p = segments; - // The end marker is a -1. - while (*p != -1) { - ++(*p); - ++p; - } - } - } - pattern.append(input); - if (anchorEnd) { - pattern.append(ETHER); - } - } else { - pattern = input; + pattern = input; + flags = 0; + if (anchorStart) { + flags |= ANCHOR_START; + } + if (anchorEnd) { + flags |= ANCHOR_END; } } @@ -197,10 +198,14 @@ int32_t TransliterationRule::getCursorPos(void) const { /** * Return the preceding context length. This method is needed to * support the Transliterator method - * getMaximumContextLength(). + * getMaximumContextLength(). Internally, this is + * implemented as the anteContextLength, optionally plus one if + * there is a start anchor. The one character anchor gap is + * needed to make repeated incremental transliteration with + * anchors work. */ -int32_t TransliterationRule::getAnteContextLength(void) const { - return anteContextLength; +int32_t TransliterationRule::getContextLength(void) const { + return anteContextLength + ((flags & ANCHOR_START) ? 
1 : 0); } /** @@ -209,81 +214,16 @@ int32_t TransliterationRule::getAnteContextLength(void) const { * unless the first character of the key is a set. If it's a * set, or otherwise can match multiple keys, the index value is -1. */ -int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) const { +int16_t TransliterationRule::getIndexValue() const { if (anteContextLength == pattern.length()) { // A pattern with just ante context {such as foo)>bar} can // match any key. return -1; } - UChar c = pattern.charAt(anteContextLength); + UChar32 c = pattern.char32At(anteContextLength); return (int16_t)(data.lookupSet(c) == NULL ? (c & 0xFF) : -1); } -/** - * Do a replacement of the input pattern with the output text in - * the given string, at the given offset. This method assumes - * that a match has already been found in the given text at the - * given position. - * @param text the text containing the substring to be replaced - * @param offset the offset into the text at which the pattern - * matches. This is the offset to the point after the ante - * context, if any, and before the match string and any post - * context. - * @param data the RuleBasedTransliterator.Data object specifying - * context for this transliterator. - * @return the change in the length of the text - */ -int32_t TransliterationRule::replace(Replaceable& text, int32_t offset, - const TransliterationRuleData& data) const { - if (segments == NULL) { - text.handleReplaceBetween(offset, offset + keyLength, output); - return output.length() - keyLength; - } else { - /* When there are segments to be copied, use the Replaceable.copy() - * API in order to retain out-of-band data. Copy everything to the - * point after the key, then delete the key. That is, copy things - * into offset + keyLength, then replace offset .. offset + - * keyLength with the empty string. - * - * Minimize the number of calls to Replaceable.replace() and - * Replaceable.copy(). 
- */ - int32_t textStart = offset - anteContextLength; - int32_t dest = offset + keyLength; // copy new text to here - UnicodeString buf; - for (int32_t i=0; i 0) { - text.handleReplaceBetween(dest, dest, buf); - dest += buf.length(); - buf.remove(); - } - // Copy segment with out-of-band data - b *= 2; - text.copy(textStart + segments[b], - textStart + segments[b+1], dest); - dest += segments[b+1] - segments[b]; - } - - } - // Insert any accumulated straight text. - if (buf.length() > 0) { - text.handleReplaceBetween(dest, dest, buf); - dest += buf.length(); - } - // Delete the key - buf.remove(); - text.handleReplaceBetween(offset, offset + keyLength, buf); - return dest - (offset + keyLength) - keyLength; - } -} - /** * Internal method. Returns true if this rule matches the given * index value. The index value is an 8-bit integer, 0..255, @@ -294,14 +234,13 @@ int32_t TransliterationRule::replace(Replaceable& text, int32_t offset, * value. If the rule contains only ante context, as in foo)>bar, * then it will match any key. */ -UBool TransliterationRule::matchesIndexValue(uint8_t v, - const TransliterationRuleData& data) const { +UBool TransliterationRule::matchesIndexValue(uint8_t v) const { if (anteContextLength == pattern.length()) { // A pattern with just ante context {such as foo)>bar} can // match any key. return TRUE; } - UChar c = pattern.charAt(anteContextLength); + UChar32 c = pattern.char32At(anteContextLength); const UnicodeSet* set = data.lookupSet(c); return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v); } @@ -328,6 +267,22 @@ UBool TransliterationRule::masks(const TransliterationRule& r2) const { * of) the corresponding characters of r2. The superset * operation should be performed to check for UnicodeSet * masking. + * + * Anchors: Two patterns that differ only in anchors only + * mask one another if they are exactly equal, and r2 has + * all the anchors r1 has (optionally, plus some). 
Here Y + * means the row masks the column, N means it doesn't. + * + * ab ^ab ab$ ^ab$ + * ab Y Y Y Y + * ^ab N Y N Y + * ab$ N N Y Y + * ^ab$ N N N Y + * + * Post context: {a}b masks ab, but not vice versa, since {a}b + * matches everything ab matches, and {a}b matches {|a|}b but ab + * does not. Pre context is different (a{b} does not align with + * ab). */ /* LIMITATION of the current mask algorithm: Some rule @@ -340,126 +295,242 @@ UBool TransliterationRule::masks(const TransliterationRule& r2) const { int32_t left2 = r2.anteContextLength; int32_t right = len - left; int32_t right2 = r2.pattern.length() - left2; - return left <= left2 && right <= right2 && + + // TODO Clean this up -- some logic might be combinable with the + // next statement. + + // Test for anchor masking + if (left == left2 && right == right2 && + keyLength <= r2.keyLength && + 0 == r2.pattern.compare(0, len, pattern)) { + // The following boolean logic implements the table above + return (flags == r2.flags) || + (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || + ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); + } + + return left <= left2 && + (right < right2 || + (right == right2 && keyLength <= r2.keyLength)) && 0 == r2.pattern.compare(left2 - left, len, pattern); } /** - * Return true if this rule matches the given text. - * @param text the text, both translated and untranslated - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. + * Attempt a match and replacement at the given position. Return + * the degree of match between this rule and the given text. The + * degree of match may be mismatch, a partial match, or a full + * match. A mismatch means at least one character of the text + * does not match the context or key. 
A partial match means some + * context and key characters match, but the text is not long + * enough to match all of them. A full match means all context + * and key characters match. + * + * If a full match is obtained, perform a replacement, update pos, + * and return U_MATCH. Otherwise both text and pos are unchanged. + * + * @param text the text + * @param pos the position indices + * @param incremental if TRUE, test for partial matches that may + * be completed by additional text inserted at pos.limit. + * @return one of U_MISMATCH, + * U_PARTIAL_MATCH, or U_MATCH. If + * incremental is FALSE then U_PARTIAL_MATCH will not be returned. */ -UBool TransliterationRule::matches(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data) const { - // Match anteContext, key, and postContext - int32_t cursor = pos.start - anteContextLength; - // Quick length check; this is a performance win for long rules. - // Widen by one (on both sides) to allow anchor matching. - if (cursor < (pos.contextStart - 1) || - (cursor + pattern.length()) > (pos.contextLimit + 1)) { - return FALSE; - } - for (int32_t i=0; i0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. - * @return one of MISMATCH, PARTIAL_MATCH, or - * FULL_MATCH. - * @see #MISMATCH - * @see #PARTIAL_MATCH - * @see #FULL_MATCH - */ -int32_t TransliterationRule::getMatchDegree(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data) const { - int len = getRegionMatchLength(text, pos, data); - return len < anteContextLength ? MISMATCH : - (len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH); -} + // ============================ MATCH =========================== -/** - * Return the number of characters of the text that match this rule. 
If - * there is a mismatch, return -1. If the text is not long enough to match - * any characters, return 0. - * @param text the text, both translated and untranslated - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. - * @param data a dictionary of variables mapping Character - * to UnicodeSet - * @return -1 if there is a mismatch, 0 if the text is not long enough to - * match any characters, otherwise the number of characters of text that - * match this rule. - */ -int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data) const { - int32_t cursor = pos.start - anteContextLength; - // Quick length check; this is a performance win for long rules. - // Widen by one to allow anchor matching. - if (cursor < (pos.contextStart - 1)) { - return -1; - } + // Record the positions of segments. We assume the following: + // - The maximum number of segments is 9. + // - The segment indices occur in ascending order. That is, + // segment 1 start <= segment 1 limit <= segment 2 start... + // - The segments have been validated such that there are no + // references to nonexistent segments. + // - The end of the segment array is marked by a start of -1. + // Currently, the parser enforces all of these constraints. + // In the future, the first two constraints may be lifted, + // in which case this method will have to be modified. + + int32_t segPos[18]; + int32_t iSeg = firstKeySeg - 1; + int32_t nextSegPos = (iSeg >= 0) ? 
segments[iSeg] : -1; + + // ------------------------ Ante Context ------------------------ + + // A mismatch in the ante context, or with the start anchor, + // is an outright U_MISMATCH regardless of whether we are + // incremental or not. + int32_t cursor = pos.start - 1; int32_t i; - for (i=0; i=0; --i) { + while (i == nextSegPos) { + segPos[iSeg] = cursor; + nextSegPos = (--iSeg >= 0) ? segments[iSeg] : -1; // assign, not compare: step back to the previous segment boundary + } + UChar keyChar = pattern.charAt(i); + const UnicodeSet* set = data.lookupSet(keyChar); + if (set == 0) { + if (cursor >= pos.contextStart && + keyChar == text.charAt(cursor)) { + --cursor; + } else { + return U_MISMATCH; + } + } else { + // Subtract 1 from contextStart to make it a reverse limit + if (set->matches(text, cursor, pos.contextStart-1, FALSE) + != U_MATCH) { + return U_MISMATCH; + } } } - return i; -} -/** - * Return true if the given key matches the given text. This method - * accounts for the fact that the key character may represent a character - * set. Note that the key and text characters may not be interchanged - * without altering the results. - * @param keyChar a character in the match key - * @param textChar a character in the text being transliterated - * @param data a dictionary of variables mapping Character - * to UnicodeSet - */ -UBool TransliterationRule::charMatches(UChar keyChar, const Replaceable& text, - int32_t index, - const UTransPosition& pos, - const TransliterationRuleData& data) const { - const UnicodeSet* set = 0; - UChar textChar = (index >= pos.contextStart && index < pos.contextLimit) - ? text.charAt(index) : ETHER; - return ((set = data.lookupSet(keyChar)) == 0) ? - keyChar == textChar : set->contains(textChar); + // ------------------------ Start Anchor ------------------------ + + if ((flags & ANCHOR_START) && cursor != (pos.contextStart-1)) { + return U_MISMATCH; + } + + // -------------------- Key and Post Context -------------------- + + iSeg = firstKeySeg; + nextSegPos = (iSeg >= 0) ? 
segments[iSeg] : -1; + + i = 0; + cursor = pos.start; + int32_t keyLimit = 0; + while (i < (pattern.length() - anteContextLength)) { + if (incremental && cursor == pos.contextLimit) { + // We've reached the context limit without a mismatch and + // without completing our match. + return U_PARTIAL_MATCH; + } + if (cursor == pos.limit && i < keyLength) { + // We're still in the pattern key but we're entering the + // post context. + return U_MISMATCH; + } + while (i == nextSegPos) { + segPos[iSeg] = cursor; + nextSegPos = segments[++iSeg]; + } + if (i == keyLength) { + keyLimit = cursor; + } + UChar keyChar = pattern.charAt(anteContextLength + i++); + const UnicodeSet* set = data.lookupSet(keyChar); + if (set == 0) { + // Don't need the cursor < pos.contextLimit check if + // incremental is TRUE (because it's done above); do need + // it otherwise. + if (cursor < pos.contextLimit && + keyChar == text.charAt(cursor)) { + ++cursor; + } else { + return U_MISMATCH; + } + } else { + UMatchDegree m = + set->matches(text, cursor, pos.contextLimit, incremental); + if (m != U_MATCH) { + return m; + } + } + } + while (i == nextSegPos) { + segPos[iSeg] = cursor; + nextSegPos = segments[++iSeg]; + } + if (i == keyLength) { + keyLimit = cursor; + } + + // ------------------------- Stop Anchor ------------------------ + + if ((flags & ANCHOR_END) != 0) { + if (cursor != pos.contextLimit) { + return U_MISMATCH; + } + if (incremental) { + return U_PARTIAL_MATCH; + } + } + + // =========================== REPLACE ========================== + + // We have a full match. The key is between pos.start and + // keyLimit. Segment indices have been recorded in segPos[]. + // Perform a replacement. 
+ + int32_t lenDelta = 0; + + if (segments == NULL) { + text.handleReplaceBetween(pos.start, keyLimit, output); + lenDelta = output.length() - (keyLimit - pos.start); + pos.start += cursorPos; + } else { + /* When there are segments to be copied, use the Replaceable.copy() + * API in order to retain out-of-band data. Copy everything to the + * point after the key, then delete the key. That is, copy things + * into offset + keyLength, then replace offset .. offset + + * keyLength with the empty string. + * + * Minimize the number of calls to Replaceable.replace() and + * Replaceable.copy(). + */ + int32_t dest = keyLimit; // copy new text to here + UnicodeString buf; + for (i=0; i 0) { + text.handleReplaceBetween(dest, dest, buf); + dest += buf.length(); + buf.remove(); + } + // Copy segment with out-of-band data + b *= 2; + text.copy(segPos[b], segPos[b+1], dest); + dest += segPos[b+1] - segPos[b]; + } + i += UTF_CHAR_LENGTH(c); + } + // Insert any accumulated straight text. + if (buf.length() > 0) { + text.handleReplaceBetween(dest, dest, buf); + dest += buf.length(); + } + if (i == cursorPos) { + // Record the position of the cursor + cursor = dest; + } + // Delete the key + buf.remove(); + text.handleReplaceBetween(pos.start, keyLimit, buf); + lenDelta = dest - keyLimit - (keyLimit - pos.start); + pos.start = cursor - (keyLimit - pos.start); + } + + pos.limit += lenDelta; + pos.contextLimit += lenDelta; + + return U_MATCH; } /** @@ -570,7 +641,6 @@ void TransliterationRule::_appendToRule(UnicodeString& rule, * given string. 
*/ UnicodeString& TransliterationRule::toRule(UnicodeString& rule, - const TransliterationRuleData& data, UBool escapeUnprintable) const { int32_t i; @@ -674,3 +744,5 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule, return rule; } + +//eof diff --git a/icu4c/source/i18n/rbt_rule.h b/icu4c/source/i18n/rbt_rule.h index e6dd4cdcee2..f2779ccc7d6 100644 --- a/icu4c/source/i18n/rbt_rule.h +++ b/icu4c/source/i18n/rbt_rule.h @@ -10,6 +10,7 @@ #include "unicode/unistr.h" #include "unicode/utrans.h" +#include "unicode/unimatch.h" class Replaceable; class TransliterationRuleData; @@ -36,39 +37,6 @@ class TransliterationRule { public: - /** - * Constants returned by getMatchDegree() indicating - * the degree of match between the text and this rule. - * @see #getMatchDegree - */ - enum { - /** - * Constant returned by getMatchDegree() - * indicating a mismatch between the text and this rule. One - * or more characters of the context or key do not match the - * text. - */ - MISMATCH, - - /** - * Constant returned by getMatchDegree() - * indicating a partial match between the text and this rule. - * All characters of the text match the corresponding context - * or key, but more characters are required for a complete - * match. There are some key or context characters at the end - * of the pattern that remain unmatched because the text isn't - * long enough. - */ - PARTIAL_MATCH, - - /** - * Constant returned by getMatchDegree() - * indicating a complete match between the text and this rule. - * The text matches all context and key characters. - */ - FULL_MATCH - }; - /** * The character at index i, where i < contextStart || i >= contextLimit, * is ETHER. This allows explicit matching by rules and UnicodeSets @@ -109,6 +77,14 @@ private: */ int32_t* segments; + /** + * A value we compute from segments. The first index into segments[] + * that is >= anteContextLength. 
That is, the first one that is within + * the forward scanned part of the pattern -- the key or the postContext. + * If there are no segments, this has the value -1. + */ + int32_t firstKeySeg; + /** * The length of the string that must match before the key. If * zero, then there is no matching requirement before the key. @@ -130,6 +106,25 @@ private: */ int32_t cursorPos; + /** + * Miscellaneous attributes. + */ + int8_t flags; + + /** + * Flag attributes: bit masks that are OR-ed into flags. + */ + enum { + ANCHOR_START = 1, + ANCHOR_END = 2 + }; + + /** + * A reference to the data for this rule. The data provides + * lookup services for matchers and segments. + */ + const TransliterationRuleData& data; + public: /** @@ -169,6 +164,7 @@ public: int32_t cursorPosition, int32_t cursorOffset, int32_t* adoptedSegs, UBool anchorStart, UBool anchorEnd, + const TransliterationRuleData& data, UErrorCode& status); /** @@ -192,6 +188,7 @@ public: int32_t anteContextPos, int32_t postContextPos, const UnicodeString& outputStr, int32_t cursorPosition, + const TransliterationRuleData& data, UErrorCode& status); /** @@ -213,9 +210,13 @@ public: /** * Return the preceding context length. This method is needed to * support the Transliterator method - * getMaximumContextLength(). + * getMaximumContextLength(). Internally, this is + * implemented as the anteContextLength, optionally plus one if + * there is a start anchor. The one character anchor gap is + * needed to make repeated incremental transliteration with + * anchors work. */ - virtual int32_t getAnteContextLength(void) const; + virtual int32_t getContextLength(void) const; /** * Internal method. Returns 8-bit index value for this rule. @@ -223,24 +224,7 @@ public: * unless the first character of the key is a set. If it's a * set, or otherwise can match multiple keys, the index value is -1. 
*/ - int16_t getIndexValue(const TransliterationRuleData& data) const; - - /** - * Do a replacement of the input pattern with the output text in - * the given string, at the given offset. This method assumes - * that a match has already been found in the given text at the - * given position. - * @param text the text containing the substring to be replaced - * @param offset the offset into the text at which the pattern - * matches. This is the offset to the point after the ante - * context, if any, and before the match string and any post - * context. - * @param data the RuleBasedTransliterator.Data object specifying - * context for this transliterator. - * @return the change in the length of the text - */ - int32_t replace(Replaceable& text, int32_t offset, - const TransliterationRuleData& data) const; + int16_t getIndexValue() const; /** * Internal method. Returns true if this rule matches the given @@ -252,8 +236,7 @@ public: * value. If the rule contains only ante context, as in foo)>bar, * then it will match any key. */ - UBool matchesIndexValue(uint8_t v, - const TransliterationRuleData& data) const; + UBool matchesIndexValue(uint8_t v) const; /** * Return true if this rule masks another rule. If r1 masks r2 then @@ -264,88 +247,35 @@ public: virtual UBool masks(const TransliterationRule& r2) const; /** - * Return true if this rule matches the given text. - * @param text the text, both translated and untranslated - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. + * Attempt a match and replacement at the given position. Return + * the degree of match between this rule and the given text. The + * degree of match may be mismatch, a partial match, or a full + * match. 
A mismatch means at least one character of the text + * does not match the context or key. A partial match means some + * context and key characters match, but the text is not long + * enough to match all of them. A full match means all context + * and key characters match. + * + * If a full match is obtained, perform a replacement, update pos, + * and return U_MATCH. Otherwise both text and pos are unchanged. + * + * @param text the text + * @param pos the position indices + * @param incremental if TRUE, test for partial matches that may + * be completed by additional text inserted at pos.limit. + * @return one of U_MISMATCH, + * U_PARTIAL_MATCH, or U_MATCH. If + * incremental is FALSE then U_PARTIAL_MATCH will not be returned. */ - virtual UBool matches(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data) const; - - /** - * Return the degree of match between this rule and the given text. The - * degree of match may be mismatch, a partial match, or a full match. A - * mismatch means at least one character of the text does not match the - * context or key. A partial match means some context and key characters - * match, but the text is not long enough to match all of them. A full - * match means all context and key characters match. - * @param text the text, both translated and untranslated - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. - * @return one of MISMATCH, PARTIAL_MATCH, or - * FULL_MATCH. - * @see #MISMATCH - * @see #PARTIAL_MATCH - * @see #FULL_MATCH - */ - virtual int32_t getMatchDegree(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data) const; - - /** - * Return the number of characters of the text that match this rule. 
If - * there is a mismatch, return -1. If the text is not long enough to match - * any characters, return 0. - * @param text the text, both translated and untranslated - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. - * @param data a dictionary of variables mapping Character - * to UnicodeSet - * @return -1 if there is a mismatch, 0 if the text is not long enough to - * match any characters, otherwise the number of characters of text that - * match this rule. - */ - virtual int32_t getRegionMatchLength(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data) const; - - /** - * Return true if the given key matches the given text. This method - * accounts for the fact that the key character may represent a character - * set. Note that the key and text characters may not be interchanged - * without altering the results. - * @param keyChar a character in the match key - * @param textChar a character in the text being transliterated - * @param data a dictionary of variables mapping Character - * to UnicodeSet - */ - virtual UBool charMatches(UChar keyChar, const Replaceable& textChar, - int32_t index, - const UTransPosition& pos, - const TransliterationRuleData& data) const; + UMatchDegree matchAndReplace(Replaceable& text, + UTransPosition& pos, + UBool incremental) const; /** * Create a rule string that represents this rule object. Append * it to the given string. 
*/ virtual UnicodeString& toRule(UnicodeString& pat, - const TransliterationRuleData& data, UBool escapeUnprintable) const; private: diff --git a/icu4c/source/i18n/rbt_set.cpp b/icu4c/source/i18n/rbt_set.cpp index ecf3663e189..7e737a0f928 100644 --- a/icu4c/source/i18n/rbt_set.cpp +++ b/icu4c/source/i18n/rbt_set.cpp @@ -74,7 +74,7 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule, ruleVector->addElement(adoptedRule); int32_t len; - if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) { + if ((len = adoptedRule->getContextLength()) > maxContextLength) { maxContextLength = len; } @@ -92,8 +92,7 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule, * That is, freeze() may be called multiple times, * although for optimal performance it shouldn't be. */ -void TransliterationRuleSet::freeze(const TransliterationRuleData& data, - UErrorCode& status) { +void TransliterationRuleSet::freeze(UErrorCode& status) { if (U_FAILURE(status)) { return; } @@ -124,7 +123,7 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data, int16_t* indexValue = new int16_t[n]; for (j=0; jelementAt(j); - indexValue[j] = r->getIndexValue(data); + indexValue[j] = r->getIndexValue(); } for (x=0; x<256; ++x) { index[x] = v.size(); @@ -139,7 +138,7 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data, // matchesIndexValue check. In practice this happens // rarely, so we seldom tread this code path. TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j); - if (r->matchesIndexValue((uint8_t)x, data)) { + if (r->matchesIndexValue((uint8_t)x)) { v.addElement(r); } } @@ -192,87 +191,40 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data, } /** - * Attempt to find a matching rule at the specified point in the text. - * @param text the text, both translated and untranslated - * @param start the beginning index, inclusive; 0 <= start - * <= limit. 
- * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. - * @param data a dictionary mapping variables to the sets they - * represent (maps Character to UnicodeSet) - * @return the matching rule, or null if none found. + * Transliterate the given text with the given UTransPosition + * indices. Return TRUE if the transliteration should continue + * or FALSE if it should halt (because of a U_PARTIAL_MATCH match). + * Note that FALSE is only ever returned if incremental is TRUE. + * @param text the text to be transliterated + * @param pos the position indices, which will be updated + * @param incremental if TRUE, assume new text may be inserted + * at pos.limit, and return FALSE if there is a partial match. + * @return TRUE unless a U_PARTIAL_MATCH has been obtained, + * indicating that transliteration should stop until more text + * arrives. */ -TransliterationRule* -TransliterationRuleSet::findMatch(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data) const { - /* We only need to check our indexed bin of the rule table, - * based on the low byte of the first key character. - */ - int16_t x = (int16_t) (text.charAt(pos.start) & 0xFF); - for (int32_t i=index[x]; imatches(text, pos, data)) { - return rules[i]; +UBool TransliterationRuleSet::transliterate(Replaceable& text, + UTransPosition& pos, + UBool incremental) { + int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF); + for (int32_t i=index[indexByte]; imatchAndReplace(text, pos, incremental); + switch (m) { + case U_MATCH: + return TRUE; + case U_PARTIAL_MATCH: + return FALSE; } } - return NULL; -} - -/** - * Attempt to find a matching rule at the specified point in the text. - * Unlike findMatch(), this method does an incremental match.
- * An incremental match requires that there be no partial matches that might - * pre-empt the full match that is found. If there are partial matches, - * then null is returned. A non-null result indicates that a full match has - * been found, and that it cannot be pre-empted by a partial match - * regardless of what additional text is added to the translation buffer. - * @param text the text, both translated and untranslated - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. - * @param data a dictionary mapping variables to the sets they - * represent (maps Character to UnicodeSet) - * @param partial output parameter. partial[0] is set to - * true if a partial match is returned. - * @return the matching rule, or null if none found, or if the text buffer - * does not have enough text yet to unambiguously match a rule. - */ -TransliterationRule* -TransliterationRuleSet::findIncrementalMatch(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data, - UBool& isPartial) const { - - /* We only need to check our indexed bin of the rule table, - * based on the low byte of the first key character. - */ - isPartial = FALSE; - int16_t x = (int16_t) (text.charAt(pos.start) & 0xFF); - for (int32_t i=index[x]; igetMatchDegree(text, pos, data); - switch (match) { - case TransliterationRule::FULL_MATCH: - return rules[i]; - case TransliterationRule::PARTIAL_MATCH: - isPartial = TRUE; - return NULL; - } - } - return NULL; + // No match or partial match from any rule + ++pos.start; + return TRUE; } /** * Create rule strings that represents this rule set. 
*/ UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource, - const TransliterationRuleData& data, UBool escapeUnprintable) const { int32_t i; int32_t count = index[256]; @@ -281,7 +233,7 @@ UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource, if (i != 0) { ruleSource.append((UChar) 0x000A /*\n*/); } - rules[i]->toRule(ruleSource, data, escapeUnprintable); + rules[i]->toRule(ruleSource, escapeUnprintable); } return ruleSource; } diff --git a/icu4c/source/i18n/rbt_set.h b/icu4c/source/i18n/rbt_set.h index d473a6768e5..7666037bb52 100644 --- a/icu4c/source/i18n/rbt_set.h +++ b/icu4c/source/i18n/rbt_set.h @@ -18,15 +18,7 @@ class UnicodeFilter; class UnicodeString; /** - * A set of rules for a RuleBasedTransliterator. This set encodes - * the transliteration in one direction from one set of characters or short - * strings to another. A RuleBasedTransliterator consists of up to - * two such sets, one for the forward direction, and one for the reverse. - * - *

A TransliterationRuleSet has one important operation, that of - * finding a matching rule at a given point in the text. This is accomplished - * by the findMatch() method. - * + * A set of rules for a RuleBasedTransliterator. * @author Alan Liu */ class TransliterationRuleSet { @@ -98,59 +90,24 @@ public: * That is, freeze() may be called multiple times, * although for optimal performance it shouldn't be. */ - virtual void freeze(const TransliterationRuleData& data, - UErrorCode& status); - + virtual void freeze(UErrorCode& status); + /** - * Attempt to find a matching rule at the specified point in the text. - * @param text the text, both translated and untranslated - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. - * @param data a dictionary mapping variables to the sets they - * represent (maps Character to UnicodeSet) - * null then no filtering is applied. - * @return the matching rule, or null if none found. + * Transliterate the given text with the given UTransPosition + * indices. Return TRUE if the transliteration should continue + * or FALSE if it should halt (because of a U_PARTIAL_MATCH match). + * Note that FALSE is only ever returned if isIncremental is TRUE. + * @param text the text to be transliterated + * @param index the position indices, which will be updated + * @param isIncremental if TRUE, assume new text may be inserted + * at index.limit, and return FALSE if there is a partial match. + * @return TRUE unless a U_PARTIAL_MATCH has been obtained, + * indicating that transliteration should stop until more text + * arrives.
*/ - virtual TransliterationRule* findMatch(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data) const; - - /** - * Attempt to find a matching rule at the specified point in the text. - * Unlike findMatch(), this method does an incremental match. - * An incremental match requires that there be no partial matches that might - * pre-empt the full match that is found. If there are partial matches, - * then null is returned. A non-null result indicates that a full match has - * been found, and that it cannot be pre-empted by a partial match - * regardless of what additional text is added to the translation buffer. - * @param text the text, both translated and untranslated - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= text.length(). - * @param cursor position at which to translate next, representing offset - * into text. This value must be between start and - * limit. - * @param data a dictionary mapping variables to the sets they - * represent (maps Character to UnicodeSet) - * @param partial output parameter. partial[0] is set to - * true if a partial match is returned. - * @param filter the filter. Any character for which - * filter.isIn() returns false will not be - * altered by this transliterator. If filter is - * null then no filtering is applied. - * @return the matching rule, or null if none found, or if the text buffer - * does not have enough text yet to unambiguously match a rule. - */ - virtual TransliterationRule* findIncrementalMatch(const Replaceable& text, - const UTransPosition& pos, - const TransliterationRuleData& data, - UBool& isPartial) const; + UBool transliterate(Replaceable& text, + UTransPosition& index, + UBool isIncremental); /** * Create rule strings that represents this rule set. @@ -158,7 +115,6 @@ public: * contents will be deleted. 
*/ virtual UnicodeString& toRules(UnicodeString& result, - const TransliterationRuleData& data, UBool escapeUnprintable) const; }; #endif diff --git a/icu4c/source/i18n/symtable.h b/icu4c/source/i18n/symtable.h index c600468289a..1e87a5a4eec 100644 --- a/icu4c/source/i18n/symtable.h +++ b/icu4c/source/i18n/symtable.h @@ -44,7 +44,7 @@ public: * Lookup the UnicodeSet associated with the given character, and * return it. Return null if not found. */ - virtual const UnicodeSet* lookupSet(UChar ch) const = 0; + virtual const UnicodeSet* lookupSet(UChar32 ch) const = 0; /** * Parse a symbol reference name from the given string, starting diff --git a/icu4c/source/i18n/translit.cpp b/icu4c/source/i18n/translit.cpp index dbb5d147598..03a2f91f9c5 100644 --- a/icu4c/source/i18n/translit.cpp +++ b/icu4c/source/i18n/translit.cpp @@ -281,10 +281,20 @@ void Transliterator::transliterate(Replaceable& text, */ void Transliterator::transliterate(Replaceable& text, UTransPosition& index, - UChar insertion, + UChar32 insertion, UErrorCode& status) const { UnicodeString str(insertion); - _transliterate(text, index, &str, status); + if (UTF_IS_LEAD(insertion)) { + // Oops, the caller passed us a single lead surrogate. In + // general, we don't support this, but we'll do the caller a + // favor in the special case of LEAD followed by TRAIL + // insertion. Anything else won't work. + text.handleReplaceBetween(index.limit, index.limit, str); + ++index.limit; + ++index.contextLimit; + } else { + _transliterate(text, index, &str, status); + } } /** @@ -351,8 +361,18 @@ void Transliterator::_transliterate(Replaceable& text, filteredTransliterate(text, index, TRUE); - index.contextStart = uprv_max(index.start - getMaximumContextLength(), - originalStart); + // The purpose of the code below is to keep the context small + // while doing incremental transliteration. 
When part of the left + // context (between contextStart and start) is no longer needed, + // we try to advance contextStart past that portion. We use the + // maximum context length to do so. + int32_t newCS = index.start; + int32_t n = getMaximumContextLength(); + while (newCS > originalStart && n-- > 0) { + --newCS; + newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1; + } + index.contextStart = uprv_max(newCS, originalStart); } /** diff --git a/icu4c/source/i18n/unifltlg.cpp b/icu4c/source/i18n/unifltlg.cpp index 3f8bf42efbc..bfd689e65c4 100644 --- a/icu4c/source/i18n/unifltlg.cpp +++ b/icu4c/source/i18n/unifltlg.cpp @@ -21,7 +21,7 @@ public: NullFilter(UBool r) { result = r; } NullFilter(const NullFilter& f) : UnicodeFilter(f) { result = f.result; } virtual ~NullFilter() {} - virtual UBool contains(UChar /*c*/) const { return result; } + virtual UBool contains(UChar32 /*c*/) const { return result; } virtual UnicodeFilter* clone() const { return new NullFilter(*this); } }; @@ -31,7 +31,7 @@ public: UnicodeNotFilter(UnicodeFilter* adopted); UnicodeNotFilter(const UnicodeNotFilter&); virtual ~UnicodeNotFilter(); - virtual UBool contains(UChar c) const; + virtual UBool contains(UChar32 c) const; virtual UnicodeFilter* clone() const; }; @@ -39,7 +39,7 @@ UnicodeNotFilter::UnicodeNotFilter(UnicodeFilter* adopted) : filt(adopted) {} UnicodeNotFilter::UnicodeNotFilter(const UnicodeNotFilter& f) : UnicodeFilter(f), filt(f.filt->clone()) {} UnicodeNotFilter::~UnicodeNotFilter() { delete filt; } -UBool UnicodeNotFilter::contains(UChar c) const { return !filt->contains(c); } +UBool UnicodeNotFilter::contains(UChar32 c) const { return !filt->contains(c); } UnicodeFilter* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); } /** @@ -61,7 +61,7 @@ public: UnicodeAndFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2); UnicodeAndFilter(const UnicodeAndFilter&); virtual ~UnicodeAndFilter(); - virtual UBool contains(UChar c) const; + virtual UBool 
contains(UChar32 c) const; virtual UnicodeFilter* clone() const; }; @@ -69,7 +69,7 @@ UnicodeAndFilter::UnicodeAndFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1 UnicodeAndFilter::UnicodeAndFilter(const UnicodeAndFilter& f) : UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {} UnicodeAndFilter::~UnicodeAndFilter() { delete filt1; delete filt2; } -UBool UnicodeAndFilter::contains(UChar c) const { return filt1->contains(c) && filt2->contains(c); } +UBool UnicodeAndFilter::contains(UChar32 c) const { return filt1->contains(c) && filt2->contains(c); } UnicodeFilter* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); } /** @@ -99,7 +99,7 @@ public: UnicodeOrFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2); UnicodeOrFilter(const UnicodeOrFilter&); virtual ~UnicodeOrFilter(); - virtual UBool contains(UChar c) const; + virtual UBool contains(UChar32 c) const; virtual UnicodeFilter* clone() const; }; @@ -107,7 +107,7 @@ UnicodeOrFilter::UnicodeOrFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f UnicodeOrFilter::UnicodeOrFilter(const UnicodeOrFilter& f) : UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {} UnicodeOrFilter::~UnicodeOrFilter() { delete filt1; delete filt2; } -UBool UnicodeOrFilter::contains(UChar c) const { return filt1->contains(c) || filt2->contains(c); } +UBool UnicodeOrFilter::contains(UChar32 c) const { return filt1->contains(c) || filt2->contains(c); } UnicodeFilter* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); } /** diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp index ce46ac24249..d8aaa9971ae 100644 --- a/icu4c/source/i18n/uniset.cpp +++ b/icu4c/source/i18n/uniset.cpp @@ -542,17 +542,6 @@ UBool UnicodeSet::contains(UChar32 c) const { return ((i & 1) != 0); // return true if odd } -/** - * Implement UnicodeFilter: - * Returns true if this set contains the specified char. - * - * @return true if this set contains the specified char. 
- * @draft - */ -UBool UnicodeSet::contains(UChar c) const { - return contains((UChar32) c); -} - /** * Returns true if this set contains any character whose low byte * is the given value. This is used by RuleBasedTransliterator for @@ -581,6 +570,24 @@ UBool UnicodeSet::containsIndexValue(uint8_t v) const { return FALSE; } +/** + * Implementation of UnicodeMatcher::matches(). + */ +UMatchDegree UnicodeSet::matches(const Replaceable& text, + int32_t& offset, + int32_t limit, + UBool incremental) const { + if (offset == limit) { + if (contains(TransliterationRule::ETHER)) { + return incremental ? U_PARTIAL_MATCH : U_MATCH; + } else { + return U_MISMATCH; + } + } else { + return UnicodeFilter::matches(text, offset, limit, incremental); + } +} + /** * Adds the specified range to this set if it is not already * present. If this set already contains the specified range, @@ -895,7 +902,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, UBool invert = FALSE; clear(); - int32_t lastChar = -1; // This is either a char (0..FFFF) or -1 + const UChar32 NONE = (UChar32) -1; + UChar32 lastChar = NONE; // This is either a char (0..10FFFF) or NONE UChar lastOp = 0; /* This loop iterates over the characters in the pattern. We start at @@ -916,8 +924,9 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, // mode 1: '[' seen; if next is '^' or ':' then special // mode 2: '[' '^'? 
seen; parse pattern and close with ']' // mode 3: '[:' seen; parse category and close with ':]' + // mode 4: Pattern closed cleanly int8_t mode = 0; - int32_t openPos = 0; // offset to opening '[' + int32_t colonPos = 0; // Expected pos of ':' in '[:' int32_t i = pos.getIndex(); int32_t limit = pattern.length(); UnicodeSet nestedAux; @@ -930,7 +939,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, const UnicodeString* varValueBuffer = NULL; int32_t ivarValueBuffer = 0; int32_t anchor = 0; - for (; ilength()) { - c = varValueBuffer->charAt(ivarValueBuffer++); + c = varValueBuffer->char32At(ivarValueBuffer); + ivarValueBuffer += UTF_CHAR_LENGTH(c); nestedSet = symbols->lookupSet(c); // may be NULL nestedPatDone = FALSE; } else { varValueBuffer = NULL; - c = pattern.charAt(i); + c = pattern.char32At(i); + i += UTF_CHAR_LENGTH(c); } } else { - c = pattern.charAt(i); + c = pattern.char32At(i); + i += UTF_CHAR_LENGTH(c); } // Ignore whitespace. This is not Unicode whitespace, but Java // whitespace, a subset of Unicode whitespace. - if (Unicode::isWhitespace(c)) { + if (u_isspace(c)) { continue; } @@ -971,7 +983,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, case 0: if (c == SET_OPEN) { mode = 1; // Next look for '^' or ':' - openPos = i; + colonPos = i; // Expect ':' at next offset continue; } else { // throw new IllegalArgumentException("Missing opening '['"); @@ -986,9 +998,10 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, newPat.append(c); continue; // Back to top to fetch next character case COLON: - if (i == openPos+1) { - // '[:' cannot have whitespace in it - --i; + // '[:' cannot have whitespace in it. 'i' has already + // been advanced. + if (i-1 == colonPos) { + --i; // Backup to the '[' c = SET_OPEN; mode = 3; // Fall through and parse category using the same @@ -1018,15 +1031,13 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, * interpret '\\uxxxx' Unicode escapes here (as literals). 
*/ if (c == BACKSLASH) { - ++i; // Advance past '\\' UChar32 escaped = pattern.unescapeAt(i); if (escaped == (UChar32) -1) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } isLiteral = TRUE; - --i; // Move i back to last parsed character - c = (UChar) escaped; + c = escaped; } /* Parse variable references. These are treated as literals. If a @@ -1036,7 +1047,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, * Set variables are only looked up if varCharToSet is not null. */ else if (symbols != NULL && !isLiteral && c == SymbolTable::SYMBOL_REF) { - pos.setIndex(++i); + pos.setIndex(i); UnicodeString name = symbols->parseReference(pattern, pos, limit); if (name.length() != 0) { varValueBuffer = symbols->lookup(name); @@ -1052,7 +1063,6 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, // Got a null; this means we have an isolated $. // Tentatively assume this is an anchor. anchor = 1; - --i; // Back up so loop increment works properly } continue; // Back to the top to get varValueBuffer[0] } @@ -1069,9 +1079,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, nestedPatStart = newPat.length(); // Handle "[:...:]", representing a character category - UChar d = charAfter(pattern, i); - if (d == COLON) { - i += 2; + if (i < pattern.length() && pattern.charAt(i) == COLON) { + ++i; int32_t j = pattern.indexOf(CATEGORY_CLOSE, i); if (j < 0) { // throw new IllegalArgumentException("Missing \":]\""); @@ -1086,7 +1095,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, if (U_FAILURE(status)) { return; } - i = j+1; // Make i point to ']' in ":]" + i = j+2; // Advance i past ":]" // Use a rebuilt pattern. If we are top level, // then there is already a SET_OPEN in newPat, and @@ -1105,11 +1114,13 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, // loop. This is one of 2 ways we leave this // loop if the pattern is well-formed. 
*this = *nestedSet; + mode = 4; break; } } else { // Recurse to get the pairs for this nested set. - pos.setIndex(i); + // Backup i to '['. + pos.setIndex(--i); switch (lastOp) { case HYPHEN: case INTERSECTION: @@ -1122,7 +1133,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, if (U_FAILURE(status)) { return; } - i = pos.getIndex() - 1; // - 1 to point at ']' + i = pos.getIndex(); } } } @@ -1136,7 +1147,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, * ']' have special meanings. */ if (nestedSet != NULL) { - if (lastChar >= 0) { + if (lastChar != NONE) { if (lastOp != 0) { // throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp); status = U_ILLEGAL_ARGUMENT_ERROR; @@ -1154,7 +1165,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, } else { _appendToPat(newPat, lastChar, FALSE); } - lastChar = -1; + lastChar = NONE; } switch (lastOp) { case HYPHEN: @@ -1193,9 +1204,11 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, newPat.append((UChar)SymbolTable::SYMBOL_REF); add(TransliterationRule::ETHER); } + mode = 4; break; } else if (lastOp == 0 && !isLiteral && (c == HYPHEN || c == INTERSECTION)) { - lastOp = c; + // assert(c <= 0xFFFF); + lastOp = (UChar) c; } else if (lastOp == HYPHEN) { if (lastChar >= c) { // Don't allow redundant (a-a) or empty (b-a) ranges; @@ -1210,14 +1223,14 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, newPat.append(HYPHEN); _appendToPat(newPat, c, FALSE); lastOp = 0; - lastChar = -1; + lastChar = NONE; } else if (lastOp != 0) { // We have & or & // throw new IllegalArgumentException("Unquoted " + lastOp); status = U_ILLEGAL_ARGUMENT_ERROR; return; } else { - if (lastChar >= 0) { + if (lastChar != NONE) { // We have add(lastChar, lastChar); _appendToPat(newPat, lastChar, FALSE); @@ -1226,7 +1239,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, } } - if (lastChar >= 0) { + if (lastChar != NONE) { add(lastChar, lastChar); 
_appendToPat(newPat, lastChar, FALSE); } @@ -1252,19 +1265,13 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern, complement(); } - /** - * i indexes the last character we parsed or is pattern.length(). In - * the latter case, we have run off the end without finding a closing - * ']'. Otherwise, we know i < pattern.length(), and we set the - * ParsePosition to the next character to be parsed. - */ - if (i == limit) { + if (mode != 4) { // throw new IllegalArgumentException("Missing ']'"); status = U_ILLEGAL_ARGUMENT_ERROR; return; } - pos.setIndex(i+1); + pos.setIndex(i); // Use the rebuilt pattern (newPat) only if necessary. Prefer the // generated pattern. @@ -1393,14 +1400,6 @@ const UnicodeSet& UnicodeSet::getCategorySet(int8_t cat) { // Implementation: Utility methods //---------------------------------------------------------------- -/** - * Returns the character after the given position, or '\uFFFE' if - * there is none. - */ -UChar UnicodeSet::charAfter(const UnicodeString& str, int32_t i) { - return ((++i) < str.length()) ? 
str.charAt(i) : (UChar)0xFFFE; -} - void UnicodeSet::ensureCapacity(int32_t newLen) { if (newLen <= capacity) return; capacity = newLen + GROW_EXTRA; diff --git a/icu4c/source/test/intltest/hajatrts.cpp b/icu4c/source/test/intltest/hajatrts.cpp index ef77fc104b0..9a3aa17b1f7 100644 --- a/icu4c/source/test/intltest/hajatrts.cpp +++ b/icu4c/source/test/intltest/hajatrts.cpp @@ -75,7 +75,7 @@ class TestHangulFilter : public UnicodeFilter { virtual UnicodeFilter* clone() const { return new TestHangulFilter(*this); } - virtual UBool contains(UChar c) const { + virtual UBool contains(UChar32 c) const { if(c == 0xae4c ) return FALSE; else diff --git a/icu4c/source/test/intltest/hxuntrts.cpp b/icu4c/source/test/intltest/hxuntrts.cpp index b9b39911619..400d44ed2f3 100644 --- a/icu4c/source/test/intltest/hxuntrts.cpp +++ b/icu4c/source/test/intltest/hxuntrts.cpp @@ -59,7 +59,7 @@ class TestHexFilter : public UnicodeFilter { virtual UnicodeFilter* clone() const { return new TestHexFilter(*this); } - virtual UBool contains(UChar c) const { + virtual UBool contains(UChar32 c) const { if(c == 0x0061 || c == 0x0063 ) return FALSE; else diff --git a/icu4c/source/test/intltest/intltest.cpp b/icu4c/source/test/intltest/intltest.cpp index add312860bd..bc83ab1351b 100644 --- a/icu4c/source/test/intltest/intltest.cpp +++ b/icu4c/source/test/intltest/intltest.cpp @@ -314,14 +314,20 @@ IntlTest::prettify(const UnicodeString &source, target.remove(); target += "\""; - for (i = 0; i < source.length(); i += 1) + for (i = 0; i < source.length(); ) { - UChar ch = source[i]; + UChar32 ch = source.char32At(i); + i += UTF_CHAR_LENGTH(ch); if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) { - target += "\\u"; - appendHex(ch, 4, target); + if (ch <= 0xFFFF) { + target += "\\u"; + appendHex(ch, 4, target); + } else { + target += "\\U"; + appendHex(ch, 8, target); + } } else { @@ -343,9 +349,10 @@ IntlTest::prettify(const UnicodeString &source, UBool parseBackslash) target.remove(); target += 
"\""; - for (i = 0; i < source.length(); i += 1) + for (i = 0; i < source.length();) { - UChar ch = source[i]; + UChar32 ch = source.char32At(i); + i += UTF_CHAR_LENGTH(ch); if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) { @@ -365,8 +372,13 @@ IntlTest::prettify(const UnicodeString &source, UBool parseBackslash) target.truncate(target.length() - 1); } } - target += "\\u"; - appendHex(ch, 4, target); + if (ch <= 0xFFFF) { + target += "\\u"; + appendHex(ch, 4, target); + } else { + target += "\\U"; + appendHex(ch, 8, target); + } } else { diff --git a/icu4c/source/test/intltest/jahatrts.cpp b/icu4c/source/test/intltest/jahatrts.cpp index 7dd37b10e87..e622d5b5006 100644 --- a/icu4c/source/test/intltest/jahatrts.cpp +++ b/icu4c/source/test/intltest/jahatrts.cpp @@ -73,7 +73,7 @@ class TestJamoFilter : public UnicodeFilter { virtual UnicodeFilter* clone() const { return new TestJamoFilter(*this); } - virtual UBool contains(UChar c) const { + virtual UBool contains(UChar32 c) const { if(c == 0x1101 ) return FALSE; else diff --git a/icu4c/source/test/intltest/transapi.cpp b/icu4c/source/test/intltest/transapi.cpp index c1727095049..c6fe5047c45 100644 --- a/icu4c/source/test/intltest/transapi.cpp +++ b/icu4c/source/test/intltest/transapi.cpp @@ -618,7 +618,7 @@ class TestFilter1 : public UnicodeFilter { virtual UnicodeFilter* clone() const { return new TestFilter1(*this); } - virtual UBool contains(UChar c) const { + virtual UBool contains(UChar32 c) const { if(c==0x63 || c==0x61 || c==0x43 || c==0x41) return FALSE; else @@ -629,7 +629,7 @@ class TestFilter2 : public UnicodeFilter { virtual UnicodeFilter* clone() const { return new TestFilter2(*this); } - virtual UBool contains(UChar c) const { + virtual UBool contains(UChar32 c) const { if(c==0x65 || c==0x6c) return FALSE; else @@ -640,7 +640,7 @@ class TestFilter3 : public UnicodeFilter { virtual UnicodeFilter* clone() const { return new TestFilter3(*this); } - virtual UBool contains(UChar c) const { + virtual 
UBool contains(UChar32 c) const { if(c==0x6f || c==0x77) return FALSE; else diff --git a/icu4c/source/test/intltest/transtst.cpp b/icu4c/source/test/intltest/transtst.cpp index 473e9d2400d..be5d4e8a53a 100644 --- a/icu4c/source/test/intltest/transtst.cpp +++ b/icu4c/source/test/intltest/transtst.cpp @@ -66,6 +66,8 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec, TESTCASE(30,TestCompoundFilter); TESTCASE(31,TestRemove); TESTCASE(32,TestToRules); + TESTCASE(33,TestContext); + TESTCASE(34,TestSupplemental); default: name = ""; break; } } @@ -152,7 +154,9 @@ void TransliteratorTest::TestSimpleRules(void) { */ expect(UnicodeString("ab>x|y;", "") + "yc>z", - "eabcd", "exzd"); /* Another set of rules: + "eabcd", "exzd"); + + /* Another set of rules: * 1. ab>x|yzacw * 2. za>q * 3. qc>r @@ -476,7 +480,7 @@ class TestFilter : public UnicodeFilter { virtual UnicodeFilter* clone() const { return new TestFilter(*this); } - virtual UBool contains(UChar c) const { + virtual UBool contains(UChar32 c) const { return c != (UChar)0x0063 /*c*/; } }; @@ -506,6 +510,12 @@ void TransliteratorTest::TestFiltering(void) { * Test anchors */ void TransliteratorTest::TestAnchors(void) { + expect(UnicodeString("^a > 0; a$ > 2 ; a > 1;", ""), + "aaa", + "012"); + expect(UnicodeString("$s=[z$]; $s{a>0; a}$s>2; a>1;", ""), + "aaa", + "012"); expect(UnicodeString("^ab > 01 ;" " ab > |8 ;" " b > k ;" @@ -1451,18 +1461,44 @@ void TransliteratorTest::TestToRules(void) { } } +void TransliteratorTest::TestContext() { + UTransPosition pos = {0, 2, 0, 1}; // cs cl s l + expect("de > x; {d}e > y;", + "de", + "ye", + &pos); + + expect("ab{c} > z;", + "xadabdabcy", + "xadabdabzy"); +} + +void TransliteratorTest::TestSupplemental() { + expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];" + "a > $a; $s > i;"), + CharsToUnicodeString("ab\\U0001030Fx"), + CharsToUnicodeString("\\U00010300bix")); + + expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];" + 
"$b=[A-Z\\U00010400-\\U0001044D];" + "($a)($b) > $2 $1;"), + CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"), + CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301")); +} + //====================================================================== // Support methods //====================================================================== void TransliteratorTest::expect(const UnicodeString& rules, const UnicodeString& source, - const UnicodeString& expectedResult) { + const UnicodeString& expectedResult, + UTransPosition *pos) { UErrorCode status = U_ZERO_ERROR; Transliterator *t = new RuleBasedTransliterator("", rules, status); if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); } else { - expect(*t, source, expectedResult); + expect(*t, source, expectedResult, pos); } delete t; } @@ -1477,34 +1513,49 @@ void TransliteratorTest::expect(const Transliterator& t, void TransliteratorTest::expect(const Transliterator& t, const UnicodeString& source, - const UnicodeString& expectedResult) { - UnicodeString result(source); - t.transliterate(result); - expectAux(t.getID() + ":String", source, result, expectedResult); + const UnicodeString& expectedResult, + UTransPosition *pos) { + if (pos == 0) { + UnicodeString result(source); + t.transliterate(result); + expectAux(t.getID() + ":String", source, result, expectedResult); + } + + UTransPosition index={0, 0, 0, 0}; + if (pos != 0) { + index = *pos; + } UnicodeString rsource(source); - t.transliterate(rsource); + if (pos == 0) { + t.transliterate(rsource); + } else { + // Do it all at once -- below we do it incrementally + t.finishTransliteration(rsource, *pos); + } expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult); // Test keyboard (incremental) transliteration -- this result // must be the same after we finalize (see below). 
- rsource.remove(); - UTransPosition index={0, 0, 0, 0}; UnicodeString log; - - for (int32_t i=0; i "); + rsource.remove(); + if (pos != 0) { + rsource = source; + formatInput(log, rsource, index); + log.append(" -> "); UErrorCode status = U_ZERO_ERROR; - t.transliterate(rsource, index, source.charAt(i), status); - // Append the string buffer with a vertical bar '|' where - // the committed index is. - UnicodeString left, right; - rsource.extractBetween(0, index.start, left); - rsource.extractBetween(index.start, rsource.length(), right); - log.append(left).append((UChar)PIPE).append(right); + t.transliterate(rsource, index, status); + formatInput(log, rsource, index); + } else { + for (int32_t i=0; i "); + UErrorCode status = U_ZERO_ERROR; + t.transliterate(rsource, index, source.charAt(i), status); + formatInput(log, rsource, index); + } } // As a final step in keyboard transliteration, we must call @@ -1518,6 +1569,41 @@ void TransliteratorTest::expect(const Transliterator& t, expectedResult); } +/** + * @param appendTo result is appended to this param. + * @param input the string being transliterated + * @param pos the index struct + */ +UnicodeString& TransliteratorTest::formatInput(UnicodeString &appendTo, + const UnicodeString& input, + const UTransPosition& pos) { + // Output a string of the form aaa{bbb|ccc|ddd}eee, where + // the {} indicate the context start and limit, and the || + // indicate the start and limit. + if (0 <= pos.contextStart && + pos.contextStart <= pos.start && + pos.start <= pos.limit && + pos.limit <= pos.contextLimit && + pos.contextLimit <= input.length()) { + + UnicodeString a, b, c, d, e; + input.extractBetween(0, pos.contextStart, a); + input.extractBetween(pos.contextStart, pos.start, b); + input.extractBetween(pos.start, pos.limit, c); + input.extractBetween(pos.limit, pos.contextLimit, d); + input.extractBetween(pos.contextLimit, input.length(), e); + appendTo.append(a).append((UChar)123/*{*/).append(b). 
+ append((UChar)PIPE).append(c).append((UChar)PIPE).append(d). + append((UChar)125/*}*/).append(e); + } else { + appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" + + pos.contextStart + ", s=" + pos.start + ", l=" + + pos.limit + ", cl=" + pos.contextLimit + "} on " + + input); + } + return appendTo; +} + void TransliteratorTest::expectAux(const UnicodeString& tag, const UnicodeString& source, const UnicodeString& result, diff --git a/icu4c/source/test/intltest/transtst.h b/icu4c/source/test/intltest/transtst.h index a57fba4c598..8f0572e72fd 100644 --- a/icu4c/source/test/intltest/transtst.h +++ b/icu4c/source/test/intltest/transtst.h @@ -11,6 +11,7 @@ #define TRANSTST_H #include "unicode/utypes.h" +#include "unicode/translit.h" #include "intltest.h" class Transliterator; @@ -167,13 +168,18 @@ class TransliteratorTest : public IntlTest { void TestToRules(void); + void TestContext(void); + + void TestSupplemental(void); + //====================================================================== // Support methods //====================================================================== protected: void expect(const UnicodeString& rules, const UnicodeString& source, - const UnicodeString& expectedResult); + const UnicodeString& expectedResult, + UTransPosition *pos=0); void expect(const Transliterator& t, const UnicodeString& source, @@ -182,7 +188,8 @@ class TransliteratorTest : public IntlTest { void expect(const Transliterator& t, const UnicodeString& source, - const UnicodeString& expectedResult); + const UnicodeString& expectedResult, + UTransPosition *pos=0); void expectAux(const UnicodeString& tag, const UnicodeString& source, @@ -192,6 +199,10 @@ class TransliteratorTest : public IntlTest { virtual void expectAux(const UnicodeString& tag, const UnicodeString& summary, UBool pass, const UnicodeString& expectedResult); + + static UnicodeString& formatInput(UnicodeString &appendTo, + const UnicodeString& input, + const UTransPosition& pos); }; #endif 
diff --git a/icu4c/source/test/intltest/ufltlgts.cpp b/icu4c/source/test/intltest/ufltlgts.cpp index 2da1634883b..94c1b834f34 100644 --- a/icu4c/source/test/intltest/ufltlgts.cpp +++ b/icu4c/source/test/intltest/ufltlgts.cpp @@ -39,7 +39,7 @@ class Filter1: public UnicodeFilter{ virtual UnicodeFilter* clone() const{ return new Filter1(*this); } - virtual UBool contains(UChar c) const { + virtual UBool contains(UChar32 c) const { if(c == 0x0061 || c == 0x0041 || c == 0x0063 || c == 0x0043) return FALSE; else @@ -50,7 +50,7 @@ class Filter2: public UnicodeFilter{ virtual UnicodeFilter* clone() const{ return new Filter2(*this); } - virtual UBool contains(UChar c) const { + virtual UBool contains(UChar32 c) const { if(c == 0x0079 || c == 0x0059 || c == 0x007a || c == 0x005a || c == 0x0061 || c == 0x0063) return FALSE; else diff --git a/icu4c/source/test/intltest/unhxtrts.cpp b/icu4c/source/test/intltest/unhxtrts.cpp index e81d94f23c3..539d3e0185b 100644 --- a/icu4c/source/test/intltest/unhxtrts.cpp +++ b/icu4c/source/test/intltest/unhxtrts.cpp @@ -71,7 +71,7 @@ class TestUniFilter : public UnicodeFilter { virtual UnicodeFilter* clone() const { return new TestUniFilter(*this); } - virtual UBool contains(UChar c) const { + virtual UBool contains(UChar32 c) const { if(c==0x0063 || c==0x0061 || c==0x0043 || c==0x0041) return FALSE; else