mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 14:31:31 +00:00
ICU-1052 redesign of engine to support supplemental characters
X-SVN-Rev: 5341
This commit is contained in:
parent
d7c3eebf46
commit
7edf9d3e80
23 changed files with 675 additions and 637 deletions
|
@ -71,7 +71,8 @@ udat.o umsg.o \
|
|||
unifltlg.o unirange.o uniset.o unitohex.o unum.o \
|
||||
dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o brkdict.o nultrans.o jamohang.o hangjamo.o \
|
||||
remtrans.o utrans.o \
|
||||
titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o
|
||||
titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o \
|
||||
unifilt.o
|
||||
|
||||
STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))
|
||||
|
||||
|
|
|
@ -318,6 +318,10 @@ SOURCE=.\uni2name.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unifilt.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unifltlg.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -1548,6 +1552,25 @@ InputPath=.\unicode\unifltlg.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\unimatch.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
||||
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unicode\unimatch.h
|
||||
|
||||
"..\..\include\unicode\unimatch.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy unicode\unimatch.h ..\..\include\unicode
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unirange.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
|
|
@ -89,18 +89,18 @@ RuleBasedTransliterator::clone(void) const {
|
|||
void
|
||||
RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
|
||||
UBool isIncremental) const {
|
||||
/* We keep start and limit fixed the entire time,
|
||||
* relative to the text -- limit may move numerically if text is
|
||||
* inserted or removed. The cursor moves from start to limit, with
|
||||
* replacements happening under it.
|
||||
/* We keep contextStart and contextLimit fixed the entire time,
|
||||
* relative to the text -- contextLimit may move numerically if
|
||||
* text is inserted or removed. The start offset moves toward
|
||||
* limit, with replacements happening under it.
|
||||
*
|
||||
* Example: rules 1. ab>x|y
|
||||
* 2. yc>z
|
||||
*
|
||||
* |eabcd start - no match, advance cursor
|
||||
* e|abcd match rule 1 - change text & adjust cursor
|
||||
* ex|ycd match rule 2 - change text & adjust cursor
|
||||
* exz|d no match, advance cursor
|
||||
* |eabcd begin - no match, advance start
|
||||
* e|abcd match rule 1 - change text & adjust start
|
||||
* ex|ycd match rule 2 - change text & adjust start
|
||||
* exz|d no match, advance start
|
||||
* exzd| done
|
||||
*/
|
||||
|
||||
|
@ -121,39 +121,14 @@ RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition&
|
|||
loopLimit <<= 4;
|
||||
}
|
||||
|
||||
UBool isPartial = FALSE;
|
||||
|
||||
while (index.start < index.limit && loopCount <= loopLimit) {
|
||||
TransliterationRule* r = isIncremental ?
|
||||
data->ruleSet.findIncrementalMatch(text, index, *data, isPartial) :
|
||||
data->ruleSet.findMatch(text, index, *data);
|
||||
|
||||
/* If we match a rule then apply it by replacing the key
|
||||
* with the rule output and repositioning the cursor
|
||||
* appropriately. If we get a partial match, then we
|
||||
* can't do anything without more text; return with the
|
||||
* cursor at the current position. If we get null, then
|
||||
* there is no match at this position, and we can advance
|
||||
* the cursor.
|
||||
*/
|
||||
if (r == 0) {
|
||||
if (isPartial) { // always FALSE unless isIncremental
|
||||
break;
|
||||
} else {
|
||||
++index.start;
|
||||
}
|
||||
} else {
|
||||
// Delegate replacement to TransliterationRule object
|
||||
int32_t lenDelta = r->replace(text, index.start, *data);
|
||||
index.limit += lenDelta;
|
||||
index.contextLimit += lenDelta;
|
||||
index.start += r->getCursorPos();
|
||||
++loopCount;
|
||||
}
|
||||
while (index.start < index.limit &&
|
||||
loopCount <= loopLimit &&
|
||||
data->ruleSet.transliterate(text, index, isIncremental)) {
|
||||
++loopCount;
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
|
||||
UBool escapeUnprintable) const {
|
||||
return data->ruleSet.toRules(rulesSource, *data, escapeUnprintable);
|
||||
return data->ruleSet.toRules(rulesSource, escapeUnprintable);
|
||||
}
|
||||
|
|
|
@ -64,13 +64,13 @@ TransliterationRuleData::~TransliterationRuleData() {
|
|||
}
|
||||
|
||||
const UnicodeSet*
|
||||
TransliterationRuleData::lookupSet(UChar standIn) const {
|
||||
TransliterationRuleData::lookupSet(UChar32 standIn) const {
|
||||
int32_t i = standIn - setVariablesBase;
|
||||
return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
|
||||
}
|
||||
|
||||
int32_t
|
||||
TransliterationRuleData::lookupSegmentReference(UChar c) const {
|
||||
TransliterationRuleData::lookupSegmentReference(UChar32 c) const {
|
||||
int32_t i = c - segmentBase;
|
||||
return (i >= 0 && i < 9) ? i : -1;
|
||||
}
|
||||
|
|
|
@ -90,14 +90,14 @@ public:
|
|||
|
||||
~TransliterationRuleData();
|
||||
|
||||
const UnicodeSet* lookupSet(UChar standIn) const;
|
||||
const UnicodeSet* lookupSet(UChar32 standIn) const;
|
||||
|
||||
/**
|
||||
* Return the zero-based index of the segment represented by the given
|
||||
* character, or -1 if none. Repeat: This is a zero-based return value,
|
||||
* 0..8, even though these are notated "$1".."$9".
|
||||
*/
|
||||
int32_t lookupSegmentReference(UChar c) const;
|
||||
int32_t lookupSegmentReference(UChar32 c) const;
|
||||
|
||||
/**
|
||||
* Return the character used to stand for the given segment reference.
|
||||
|
|
|
@ -75,7 +75,7 @@ public:
|
|||
|
||||
virtual const UnicodeString* lookup(const UnicodeString& s) const;
|
||||
|
||||
virtual const UnicodeSet* lookupSet(UChar ch) const;
|
||||
virtual const UnicodeSet* lookupSet(UChar32 ch) const;
|
||||
|
||||
virtual UnicodeString parseReference(const UnicodeString& text,
|
||||
ParsePosition& pos, int32_t limit) const;
|
||||
|
@ -95,7 +95,7 @@ const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
|
|||
/**
|
||||
* Implement SymbolTable API.
|
||||
*/
|
||||
const UnicodeSet* ParseData::lookupSet(UChar ch) const {
|
||||
const UnicodeSet* ParseData::lookupSet(UChar32 ch) const {
|
||||
// Note that we cannot use data.lookupSet() because the
|
||||
// set array has not been constructed yet.
|
||||
const UnicodeSet* set = NULL;
|
||||
|
@ -682,7 +682,7 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
|
|||
|
||||
// Index the rules
|
||||
if (U_SUCCESS(status)) {
|
||||
data->ruleSet.freeze(*data, status);
|
||||
data->ruleSet.freeze(status);
|
||||
if (idSplitPointResult < 0) {
|
||||
idSplitPointResult = idBlockResult.length();
|
||||
}
|
||||
|
@ -849,6 +849,7 @@ int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
|
|||
right->text, right->cursor, right->cursorOffset,
|
||||
left->createSegments(),
|
||||
left->anchorStart, left->anchorEnd,
|
||||
*data,
|
||||
status), status);
|
||||
|
||||
return pos;
|
||||
|
|
|
@ -52,7 +52,9 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
|||
int32_t cursorPosition, int32_t cursorOffset,
|
||||
int32_t* adoptedSegs,
|
||||
UBool anchorStart, UBool anchorEnd,
|
||||
UErrorCode& status) {
|
||||
const TransliterationRuleData& theData,
|
||||
UErrorCode& status) :
|
||||
data(theData) {
|
||||
init(input, anteContextPos, postContextPos,
|
||||
outputStr, cursorPosition, cursorOffset, adoptedSegs,
|
||||
anchorStart, anchorEnd, status);
|
||||
|
@ -79,7 +81,9 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
|||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& outputStr,
|
||||
int32_t cursorPosition,
|
||||
UErrorCode& status) {
|
||||
const TransliterationRuleData& theData,
|
||||
UErrorCode& status) :
|
||||
data(theData) {
|
||||
init(input, anteContextPos, postContextPos,
|
||||
outputStr, cursorPosition, 0, NULL, FALSE, FALSE, status);
|
||||
}
|
||||
|
@ -92,7 +96,9 @@ TransliterationRule::TransliterationRule(TransliterationRule& other) :
|
|||
output(other.output),
|
||||
anteContextLength(other.anteContextLength),
|
||||
keyLength(other.keyLength),
|
||||
cursorPos(other.cursorPos) {
|
||||
cursorPos(other.cursorPos),
|
||||
flags(other.flags),
|
||||
data(other.data) {
|
||||
|
||||
segments = 0;
|
||||
if (other.segments != 0) {
|
||||
|
@ -153,32 +159,27 @@ void TransliterationRule::init(const UnicodeString& input,
|
|||
// We don't validate the segments array. The caller must
|
||||
// guarantee that the segments are well-formed.
|
||||
this->segments = adoptedSegs;
|
||||
// Find the position of the first segment index that is after the
|
||||
// anteContext (in the key). Note that this may be a start or a
|
||||
// limit index.
|
||||
firstKeySeg = -1;
|
||||
if (segments != 0) {
|
||||
do {
|
||||
++firstKeySeg;
|
||||
} while (segments[firstKeySeg] >= 0 &&
|
||||
segments[firstKeySeg] < anteContextLength);
|
||||
if (segments[firstKeySeg] < 0) {
|
||||
firstKeySeg = -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Implement anchors by inserting an ETHER character on the
|
||||
// left or right. If on the left, then the indices must be
|
||||
// incremented. If on the right, no index change is
|
||||
// necessary.
|
||||
if (anchorStart || anchorEnd) {
|
||||
pattern.truncate(0);
|
||||
if (anchorStart) {
|
||||
pattern.append(ETHER);
|
||||
++anteContextLength;
|
||||
// Adjust segment offsets
|
||||
if (segments != 0) {
|
||||
int32_t *p = segments;
|
||||
// The end marker is a -1.
|
||||
while (*p != -1) {
|
||||
++(*p);
|
||||
++p;
|
||||
}
|
||||
}
|
||||
}
|
||||
pattern.append(input);
|
||||
if (anchorEnd) {
|
||||
pattern.append(ETHER);
|
||||
}
|
||||
} else {
|
||||
pattern = input;
|
||||
pattern = input;
|
||||
flags = 0;
|
||||
if (anchorStart) {
|
||||
flags |= ANCHOR_START;
|
||||
}
|
||||
if (anchorEnd) {
|
||||
flags |= ANCHOR_END;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -197,10 +198,14 @@ int32_t TransliterationRule::getCursorPos(void) const {
|
|||
/**
|
||||
* Return the preceding context length. This method is needed to
|
||||
* support the <code>Transliterator</code> method
|
||||
* <code>getMaximumContextLength()</code>.
|
||||
* <code>getMaximumContextLength()</code>. Internally, this is
|
||||
* implemented as the anteContextLength, optionally plus one if
|
||||
* there is a start anchor. The one character anchor gap is
|
||||
* needed to make repeated incremental transliteration with
|
||||
* anchors work.
|
||||
*/
|
||||
int32_t TransliterationRule::getAnteContextLength(void) const {
|
||||
return anteContextLength;
|
||||
int32_t TransliterationRule::getContextLength(void) const {
|
||||
return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -209,81 +214,16 @@ int32_t TransliterationRule::getAnteContextLength(void) const {
|
|||
* unless the first character of the key is a set. If it's a
|
||||
* set, or otherwise can match multiple keys, the index value is -1.
|
||||
*/
|
||||
int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) const {
|
||||
int16_t TransliterationRule::getIndexValue() const {
|
||||
if (anteContextLength == pattern.length()) {
|
||||
// A pattern with just ante context {such as foo)>bar} can
|
||||
// match any key.
|
||||
return -1;
|
||||
}
|
||||
UChar c = pattern.charAt(anteContextLength);
|
||||
UChar32 c = pattern.char32At(anteContextLength);
|
||||
return (int16_t)(data.lookupSet(c) == NULL ? (c & 0xFF) : -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Do a replacement of the input pattern with the output text in
|
||||
* the given string, at the given offset. This method assumes
|
||||
* that a match has already been found in the given text at the
|
||||
* given position.
|
||||
* @param text the text containing the substring to be replaced
|
||||
* @param offset the offset into the text at which the pattern
|
||||
* matches. This is the offset to the point after the ante
|
||||
* context, if any, and before the match string and any post
|
||||
* context.
|
||||
* @param data the RuleBasedTransliterator.Data object specifying
|
||||
* context for this transliterator.
|
||||
* @return the change in the length of the text
|
||||
*/
|
||||
int32_t TransliterationRule::replace(Replaceable& text, int32_t offset,
|
||||
const TransliterationRuleData& data) const {
|
||||
if (segments == NULL) {
|
||||
text.handleReplaceBetween(offset, offset + keyLength, output);
|
||||
return output.length() - keyLength;
|
||||
} else {
|
||||
/* When there are segments to be copied, use the Replaceable.copy()
|
||||
* API in order to retain out-of-band data. Copy everything to the
|
||||
* point after the key, then delete the key. That is, copy things
|
||||
* into offset + keyLength, then replace offset .. offset +
|
||||
* keyLength with the empty string.
|
||||
*
|
||||
* Minimize the number of calls to Replaceable.replace() and
|
||||
* Replaceable.copy().
|
||||
*/
|
||||
int32_t textStart = offset - anteContextLength;
|
||||
int32_t dest = offset + keyLength; // copy new text to here
|
||||
UnicodeString buf;
|
||||
for (int32_t i=0; i<output.length(); ++i) {
|
||||
UChar c = output.charAt(i);
|
||||
int32_t b = data.lookupSegmentReference(c);
|
||||
if (b < 0) {
|
||||
// Accumulate straight (non-segment) text.
|
||||
buf.append(c);
|
||||
} else {
|
||||
// Insert any accumulated straight text.
|
||||
if (buf.length() > 0) {
|
||||
text.handleReplaceBetween(dest, dest, buf);
|
||||
dest += buf.length();
|
||||
buf.remove();
|
||||
}
|
||||
// Copy segment with out-of-band data
|
||||
b *= 2;
|
||||
text.copy(textStart + segments[b],
|
||||
textStart + segments[b+1], dest);
|
||||
dest += segments[b+1] - segments[b];
|
||||
}
|
||||
|
||||
}
|
||||
// Insert any accumulated straight text.
|
||||
if (buf.length() > 0) {
|
||||
text.handleReplaceBetween(dest, dest, buf);
|
||||
dest += buf.length();
|
||||
}
|
||||
// Delete the key
|
||||
buf.remove();
|
||||
text.handleReplaceBetween(offset, offset + keyLength, buf);
|
||||
return dest - (offset + keyLength) - keyLength;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method. Returns true if this rule matches the given
|
||||
* index value. The index value is an 8-bit integer, 0..255,
|
||||
|
@ -294,14 +234,13 @@ int32_t TransliterationRule::replace(Replaceable& text, int32_t offset,
|
|||
* value. If the rule contains only ante context, as in foo)>bar,
|
||||
* then it will match any key.
|
||||
*/
|
||||
UBool TransliterationRule::matchesIndexValue(uint8_t v,
|
||||
const TransliterationRuleData& data) const {
|
||||
UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
|
||||
if (anteContextLength == pattern.length()) {
|
||||
// A pattern with just ante context {such as foo)>bar} can
|
||||
// match any key.
|
||||
return TRUE;
|
||||
}
|
||||
UChar c = pattern.charAt(anteContextLength);
|
||||
UChar32 c = pattern.char32At(anteContextLength);
|
||||
const UnicodeSet* set = data.lookupSet(c);
|
||||
return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
|
||||
}
|
||||
|
@ -328,6 +267,22 @@ UBool TransliterationRule::masks(const TransliterationRule& r2) const {
|
|||
* of) the corresponding characters of r2. The superset
|
||||
* operation should be performed to check for UnicodeSet
|
||||
* masking.
|
||||
*
|
||||
* Anchors: When two patterns differ only in their anchors, r1
|
||||
* masks r2 only if the patterns are otherwise equal and r2 has
|
||||
* all the anchors r1 has (optionally, plus some). Here Y
|
||||
* means the row masks the column, N means it doesn't.
|
||||
*
|
||||
* ab ^ab ab$ ^ab$
|
||||
* ab Y Y Y Y
|
||||
* ^ab N Y N Y
|
||||
* ab$ N N Y Y
|
||||
* ^ab$ N N N Y
|
||||
*
|
||||
* Post context: {a}b masks ab, but not vice versa, since {a}b
|
||||
* matches everything ab matches, and {a}b matches {|a|}b but ab
|
||||
* does not. Pre context is different (a{b} does not align with
|
||||
* ab).
|
||||
*/
|
||||
|
||||
/* LIMITATION of the current mask algorithm: Some rule
|
||||
|
@ -340,126 +295,242 @@ UBool TransliterationRule::masks(const TransliterationRule& r2) const {
|
|||
int32_t left2 = r2.anteContextLength;
|
||||
int32_t right = len - left;
|
||||
int32_t right2 = r2.pattern.length() - left2;
|
||||
return left <= left2 && right <= right2 &&
|
||||
|
||||
// TODO Clean this up -- some logic might be combinable with the
|
||||
// next statement.
|
||||
|
||||
// Test for anchor masking
|
||||
if (left == left2 && right == right2 &&
|
||||
keyLength <= r2.keyLength &&
|
||||
0 == r2.pattern.compare(0, len, pattern)) {
|
||||
// The following boolean logic implements the table above
|
||||
return (flags == r2.flags) ||
|
||||
(!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) ||
|
||||
((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));
|
||||
}
|
||||
|
||||
return left <= left2 &&
|
||||
(right < right2 ||
|
||||
(right == right2 && keyLength <= r2.keyLength)) &&
|
||||
0 == r2.pattern.compare(left2 - left, len, pattern);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* Attempt a match and replacement at the given position. Return
|
||||
* the degree of match between this rule and the given text. The
|
||||
* degree of match may be mismatch, a partial match, or a full
|
||||
* match. A mismatch means at least one character of the text
|
||||
* does not match the context or key. A partial match means some
|
||||
* context and key characters match, but the text is not long
|
||||
* enough to match all of them. A full match means all context
|
||||
* and key characters match.
|
||||
*
|
||||
* If a full match is obtained, perform a replacement, update pos,
|
||||
* and return U_MATCH. Otherwise both text and pos are unchanged.
|
||||
*
|
||||
* @param text the text
|
||||
* @param pos the position indices
|
||||
* @param incremental if TRUE, test for partial matches that may
|
||||
* be completed by additional text inserted at pos.limit.
|
||||
* @return one of <code>U_MISMATCH</code>,
|
||||
* <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
|
||||
* incremental is FALSE then U_PARTIAL_MATCH will not be returned.
|
||||
*/
|
||||
UBool TransliterationRule::matches(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const {
|
||||
// Match anteContext, key, and postContext
|
||||
int32_t cursor = pos.start - anteContextLength;
|
||||
// Quick length check; this is a performance win for long rules.
|
||||
// Widen by one (on both sides) to allow anchor matching.
|
||||
if (cursor < (pos.contextStart - 1) ||
|
||||
(cursor + pattern.length()) > (pos.contextLimit + 1)) {
|
||||
return FALSE;
|
||||
}
|
||||
for (int32_t i=0; i<pattern.length(); ++i, ++cursor) {
|
||||
if (!charMatches(pattern.charAt(i), text, cursor, pos,
|
||||
data)) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
||||
UTransPosition& pos,
|
||||
UBool incremental) const {
|
||||
// Matching and replacing are done in one method because the
|
||||
// replacement operation needs information obtained during the
|
||||
// match. Another way to do this is to have the match method
|
||||
// create a match result struct with relevant offsets, and to pass
|
||||
// this into the replace method.
|
||||
|
||||
/**
|
||||
* Return the degree of match between this rule and the given text. The
|
||||
* degree of match may be mismatch, a partial match, or a full match. A
|
||||
* mismatch means at least one character of the text does not match the
|
||||
* context or key. A partial match means some context and key characters
|
||||
* match, but the text is not long enough to match all of them. A full
|
||||
* match means all context and key characters match.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
|
||||
* <code>FULL_MATCH</code>.
|
||||
* @see #MISMATCH
|
||||
* @see #PARTIAL_MATCH
|
||||
* @see #FULL_MATCH
|
||||
*/
|
||||
int32_t TransliterationRule::getMatchDegree(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const {
|
||||
int len = getRegionMatchLength(text, pos, data);
|
||||
return len < anteContextLength ? MISMATCH :
|
||||
(len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH);
|
||||
}
|
||||
// ============================ MATCH ===========================
|
||||
|
||||
/**
|
||||
* Return the number of characters of the text that match this rule. If
|
||||
* there is a mismatch, return -1. If the text is not long enough to match
|
||||
* any characters, return 0.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @return -1 if there is a mismatch, 0 if the text is not long enough to
|
||||
* match any characters, otherwise the number of characters of text that
|
||||
* match this rule.
|
||||
*/
|
||||
int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const {
|
||||
int32_t cursor = pos.start - anteContextLength;
|
||||
// Quick length check; this is a performance win for long rules.
|
||||
// Widen by one to allow anchor matching.
|
||||
if (cursor < (pos.contextStart - 1)) {
|
||||
return -1;
|
||||
}
|
||||
// Record the positions of segments. We assume the following:
|
||||
// - The maximum number of segments is 9.
|
||||
// - The segment indices occur in ascending order. That is,
|
||||
// segment 1 start <= segment 1 limit <= segment 2 start...
|
||||
// - The segments have been validated such that there are no
|
||||
// references to nonexistent segments.
|
||||
// - The end of the segment array is marked by a start of -1.
|
||||
// Currently, the parser enforces all of these constraints.
|
||||
// In the future, the first two constraints may be lifted,
|
||||
// in which case this method will have to be modified.
|
||||
|
||||
int32_t segPos[18];
|
||||
int32_t iSeg = firstKeySeg - 1;
|
||||
int32_t nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
|
||||
|
||||
// ------------------------ Ante Context ------------------------
|
||||
|
||||
// A mismatch in the ante context, or with the start anchor,
|
||||
// is an outright U_MISMATCH regardless of whether we are
|
||||
// incremental or not.
|
||||
int32_t cursor = pos.start - 1;
|
||||
int32_t i;
|
||||
for (i=0; i<pattern.length() && cursor<pos.contextLimit; ++i, ++cursor) {
|
||||
if (!charMatches(pattern.charAt(i), text, cursor, pos,
|
||||
data)) {
|
||||
return -1;
|
||||
for (i=anteContextLength-1; i>=0; --i) {
|
||||
while (i == nextSegPos) {
|
||||
segPos[iSeg] = cursor;
|
||||
// Step back to the previous segment boundary, or -1 when none remain. This must be an assignment: with '==' nextSegPos never changes and the enclosing while loop cannot terminate.
nextSegPos = (--iSeg >= 0) ? segments[iSeg] : -1;
|
||||
}
|
||||
UChar keyChar = pattern.charAt(i);
|
||||
const UnicodeSet* set = data.lookupSet(keyChar);
|
||||
if (set == 0) {
|
||||
if (cursor >= pos.contextStart &&
|
||||
keyChar == text.charAt(cursor)) {
|
||||
--cursor;
|
||||
} else {
|
||||
return U_MISMATCH;
|
||||
}
|
||||
} else {
|
||||
// Subtract 1 from contextStart to make it a reverse limit
|
||||
if (set->matches(text, cursor, pos.contextStart-1, FALSE)
|
||||
!= U_MATCH) {
|
||||
return U_MISMATCH;
|
||||
}
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the given key matches the given text. This method
|
||||
* accounts for the fact that the key character may represent a character
|
||||
* set. Note that the key and text characters may not be interchanged
|
||||
* without altering the results.
|
||||
* @param keyChar a character in the match key
|
||||
* @param textChar a character in the text being transliterated
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
*/
|
||||
UBool TransliterationRule::charMatches(UChar keyChar, const Replaceable& text,
|
||||
int32_t index,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const {
|
||||
const UnicodeSet* set = 0;
|
||||
UChar textChar = (index >= pos.contextStart && index < pos.contextLimit)
|
||||
? text.charAt(index) : ETHER;
|
||||
return ((set = data.lookupSet(keyChar)) == 0) ?
|
||||
keyChar == textChar : set->contains(textChar);
|
||||
// ------------------------ Start Anchor ------------------------
|
||||
|
||||
if ((flags & ANCHOR_START) && cursor != (pos.contextStart-1)) {
|
||||
return U_MISMATCH;
|
||||
}
|
||||
|
||||
// -------------------- Key and Post Context --------------------
|
||||
|
||||
iSeg = firstKeySeg;
|
||||
nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
|
||||
|
||||
i = 0;
|
||||
cursor = pos.start;
|
||||
int32_t keyLimit = 0;
|
||||
while (i < (pattern.length() - anteContextLength)) {
|
||||
if (incremental && cursor == pos.contextLimit) {
|
||||
// We've reached the context limit without a mismatch and
|
||||
// without completing our match.
|
||||
return U_PARTIAL_MATCH;
|
||||
}
|
||||
if (cursor == pos.limit && i < keyLength) {
|
||||
// We're still in the pattern key but we're entering the
|
||||
// post context.
|
||||
return U_MISMATCH;
|
||||
}
|
||||
while (i == nextSegPos) {
|
||||
segPos[iSeg] = cursor;
|
||||
nextSegPos = segments[++iSeg];
|
||||
}
|
||||
if (i == keyLength) {
|
||||
keyLimit = cursor;
|
||||
}
|
||||
UChar keyChar = pattern.charAt(anteContextLength + i++);
|
||||
const UnicodeSet* set = data.lookupSet(keyChar);
|
||||
if (set == 0) {
|
||||
// Don't need the cursor < pos.contextLimit check if
|
||||
// incremental is TRUE (because it's done above); do need
|
||||
// it otherwise.
|
||||
if (cursor < pos.contextLimit &&
|
||||
keyChar == text.charAt(cursor)) {
|
||||
++cursor;
|
||||
} else {
|
||||
return U_MISMATCH;
|
||||
}
|
||||
} else {
|
||||
UMatchDegree m =
|
||||
set->matches(text, cursor, pos.contextLimit, incremental);
|
||||
if (m != U_MATCH) {
|
||||
return m;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (i == nextSegPos) {
|
||||
segPos[iSeg] = cursor;
|
||||
nextSegPos = segments[++iSeg];
|
||||
}
|
||||
if (i == keyLength) {
|
||||
keyLimit = cursor;
|
||||
}
|
||||
|
||||
// ------------------------- Stop Anchor ------------------------
|
||||
|
||||
if ((flags & ANCHOR_END) != 0) {
|
||||
if (cursor != pos.contextLimit) {
|
||||
return U_MISMATCH;
|
||||
}
|
||||
if (incremental) {
|
||||
return U_PARTIAL_MATCH;
|
||||
}
|
||||
}
|
||||
|
||||
// =========================== REPLACE ==========================
|
||||
|
||||
// We have a full match. The key is between pos.start and
|
||||
// keyLimit. Segment indices have been recorded in segPos[].
|
||||
// Perform a replacement.
|
||||
|
||||
int32_t lenDelta = 0;
|
||||
|
||||
if (segments == NULL) {
|
||||
text.handleReplaceBetween(pos.start, keyLimit, output);
|
||||
lenDelta = output.length() - (keyLimit - pos.start);
|
||||
pos.start += cursorPos;
|
||||
} else {
|
||||
/* When there are segments to be copied, use the Replaceable.copy()
|
||||
* API in order to retain out-of-band data. Copy everything to the
|
||||
* point after the key, then delete the key. That is, copy things
|
||||
* into offset + keyLength, then replace offset .. offset +
|
||||
* keyLength with the empty string.
|
||||
*
|
||||
* Minimize the number of calls to Replaceable.replace() and
|
||||
* Replaceable.copy().
|
||||
*/
|
||||
int32_t dest = keyLimit; // copy new text to here
|
||||
UnicodeString buf;
|
||||
for (i=0; i<output.length(); ) {
|
||||
if (i == cursorPos) {
|
||||
// Record the position of the cursor
|
||||
cursor = dest;
|
||||
}
|
||||
UChar32 c = output.char32At(i);
|
||||
int32_t b = data.lookupSegmentReference(c);
|
||||
if (b < 0) {
|
||||
// Accumulate straight (non-segment) text.
|
||||
buf.append(c);
|
||||
} else {
|
||||
// Insert any accumulated straight text.
|
||||
if (buf.length() > 0) {
|
||||
text.handleReplaceBetween(dest, dest, buf);
|
||||
dest += buf.length();
|
||||
buf.remove();
|
||||
}
|
||||
// Copy segment with out-of-band data
|
||||
b *= 2;
|
||||
text.copy(segPos[b], segPos[b+1], dest);
|
||||
dest += segPos[b+1] - segPos[b];
|
||||
}
|
||||
i += UTF_CHAR_LENGTH(c);
|
||||
}
|
||||
// Insert any accumulated straight text.
|
||||
if (buf.length() > 0) {
|
||||
text.handleReplaceBetween(dest, dest, buf);
|
||||
dest += buf.length();
|
||||
}
|
||||
if (i == cursorPos) {
|
||||
// Record the position of the cursor
|
||||
cursor = dest;
|
||||
}
|
||||
// Delete the key
|
||||
buf.remove();
|
||||
text.handleReplaceBetween(pos.start, keyLimit, buf);
|
||||
lenDelta = dest - keyLimit - (keyLimit - pos.start);
|
||||
pos.start = cursor - (keyLimit - pos.start);
|
||||
}
|
||||
|
||||
pos.limit += lenDelta;
|
||||
pos.contextLimit += lenDelta;
|
||||
|
||||
return U_MATCH;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -570,7 +641,6 @@ void TransliterationRule::_appendToRule(UnicodeString& rule,
|
|||
* given string.
|
||||
*/
|
||||
UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
||||
const TransliterationRuleData& data,
|
||||
UBool escapeUnprintable) const {
|
||||
int32_t i;
|
||||
|
||||
|
@ -674,3 +744,5 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
|||
|
||||
return rule;
|
||||
}
|
||||
|
||||
//eof
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/utrans.h"
|
||||
#include "unicode/unimatch.h"
|
||||
|
||||
class Replaceable;
|
||||
class TransliterationRuleData;
|
||||
|
@ -36,39 +37,6 @@ class TransliterationRule {
|
|||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Constants returned by <code>getMatchDegree()</code> indicating
|
||||
* the degree of match between the text and this rule.
|
||||
* @see #getMatchDegree
|
||||
*/
|
||||
enum {
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code>
|
||||
* indicating a mismatch between the text and this rule. One
|
||||
* or more characters of the context or key do not match the
|
||||
* text.
|
||||
*/
|
||||
MISMATCH,
|
||||
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code>
|
||||
* indicating a partial match between the text and this rule.
|
||||
* All characters of the text match the corresponding context
|
||||
* or key, but more characters are required for a complete
|
||||
* match. There are some key or context characters at the end
|
||||
* of the pattern that remain unmatched because the text isn't
|
||||
* long enough.
|
||||
*/
|
||||
PARTIAL_MATCH,
|
||||
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code>
|
||||
* indicating a complete match between the text and this rule.
|
||||
* The text matches all context and key characters.
|
||||
*/
|
||||
FULL_MATCH
|
||||
};
|
||||
|
||||
/**
|
||||
* The character at index i, where i < contextStart || i >= contextLimit,
|
||||
* is ETHER. This allows explicit matching by rules and UnicodeSets
|
||||
|
@ -109,6 +77,14 @@ private:
|
|||
*/
|
||||
int32_t* segments;
|
||||
|
||||
/**
|
||||
* A value we compute from segments. The first index into segments[]
|
||||
* that is >= anteContextLength. That is, the first one that is within
|
||||
* the forward scanned part of the pattern -- the key or the postContext.
|
||||
* If there are no segments, this has the value -1.
|
||||
*/
|
||||
int32_t firstKeySeg;
|
||||
|
||||
/**
|
||||
* The length of the string that must match before the key. If
|
||||
* zero, then there is no matching requirement before the key.
|
||||
|
@ -130,6 +106,25 @@ private:
|
|||
*/
|
||||
int32_t cursorPos;
|
||||
|
||||
/**
|
||||
* Miscellaneous attributes.
|
||||
*/
|
||||
int8_t flags;
|
||||
|
||||
/**
|
||||
* Flag attributes.
|
||||
*/
|
||||
enum {
|
||||
ANCHOR_START = 1,
|
||||
ANCHOR_END = 2,
|
||||
};
|
||||
|
||||
/**
|
||||
* A reference to the data for this rule. The data provides
|
||||
* lookup services for matchers and segments.
|
||||
*/
|
||||
const TransliterationRuleData& data;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
|
@ -169,6 +164,7 @@ public:
|
|||
int32_t cursorPosition, int32_t cursorOffset,
|
||||
int32_t* adoptedSegs,
|
||||
UBool anchorStart, UBool anchorEnd,
|
||||
const TransliterationRuleData& data,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
|
@ -192,6 +188,7 @@ public:
|
|||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& outputStr,
|
||||
int32_t cursorPosition,
|
||||
const TransliterationRuleData& data,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
|
@ -213,9 +210,13 @@ public:
|
|||
/**
|
||||
* Return the preceding context length. This method is needed to
|
||||
* support the <code>Transliterator</code> method
|
||||
* <code>getMaximumContextLength()</code>.
|
||||
* <code>getMaximumContextLength()</code>. Internally, this is
|
||||
* implemented as the anteContextLength, optionally plus one if
|
||||
* there is a start anchor. The one character anchor gap is
|
||||
* needed to make repeated incremental transliteration with
|
||||
* anchors work.
|
||||
*/
|
||||
virtual int32_t getAnteContextLength(void) const;
|
||||
virtual int32_t getContextLength(void) const;
|
||||
|
||||
/**
|
||||
* Internal method. Returns 8-bit index value for this rule.
|
||||
|
@ -223,24 +224,7 @@ public:
|
|||
* unless the first character of the key is a set. If it's a
|
||||
* set, or otherwise can match multiple keys, the index value is -1.
|
||||
*/
|
||||
int16_t getIndexValue(const TransliterationRuleData& data) const;
|
||||
|
||||
/**
|
||||
* Do a replacement of the input pattern with the output text in
|
||||
* the given string, at the given offset. This method assumes
|
||||
* that a match has already been found in the given text at the
|
||||
* given position.
|
||||
* @param text the text containing the substring to be replaced
|
||||
* @param offset the offset into the text at which the pattern
|
||||
* matches. This is the offset to the point after the ante
|
||||
* context, if any, and before the match string and any post
|
||||
* context.
|
||||
* @param data the RuleBasedTransliterator.Data object specifying
|
||||
* context for this transliterator.
|
||||
* @return the change in the length of the text
|
||||
*/
|
||||
int32_t replace(Replaceable& text, int32_t offset,
|
||||
const TransliterationRuleData& data) const;
|
||||
int16_t getIndexValue() const;
|
||||
|
||||
/**
|
||||
* Internal method. Returns true if this rule matches the given
|
||||
|
@ -252,8 +236,7 @@ public:
|
|||
* value. If the rule contains only ante context, as in foo)>bar,
|
||||
* then it will match any key.
|
||||
*/
|
||||
UBool matchesIndexValue(uint8_t v,
|
||||
const TransliterationRuleData& data) const;
|
||||
UBool matchesIndexValue(uint8_t v) const;
|
||||
|
||||
/**
|
||||
* Return true if this rule masks another rule. If r1 masks r2 then
|
||||
|
@ -264,88 +247,35 @@ public:
|
|||
virtual UBool masks(const TransliterationRule& r2) const;
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* Attempt a match and replacement at the given position. Return
|
||||
* the degree of match between this rule and the given text. The
|
||||
* degree of match may be mismatch, a partial match, or a full
|
||||
* match. A mismatch means at least one character of the text
|
||||
* does not match the context or key. A partial match means some
|
||||
* context and key characters match, but the text is not long
|
||||
* enough to match all of them. A full match means all context
|
||||
* and key characters match.
|
||||
*
|
||||
* If a full match is obtained, perform a replacement, update pos,
|
||||
* and return U_MATCH. Otherwise both text and pos are unchanged.
|
||||
*
|
||||
* @param text the text
|
||||
* @param pos the position indices
|
||||
* @param incremental if TRUE, test for partial matches that may
|
||||
* be completed by additional text inserted at pos.limit.
|
||||
* @return one of <code>U_MISMATCH</code>,
|
||||
* <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
|
||||
* incremental is FALSE then U_PARTIAL_MATCH will not be returned.
|
||||
*/
|
||||
virtual UBool matches(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const;
|
||||
|
||||
/**
|
||||
* Return the degree of match between this rule and the given text. The
|
||||
* degree of match may be mismatch, a partial match, or a full match. A
|
||||
* mismatch means at least one character of the text does not match the
|
||||
* context or key. A partial match means some context and key characters
|
||||
* match, but the text is not long enough to match all of them. A full
|
||||
* match means all context and key characters match.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
|
||||
* <code>FULL_MATCH</code>.
|
||||
* @see #MISMATCH
|
||||
* @see #PARTIAL_MATCH
|
||||
* @see #FULL_MATCH
|
||||
*/
|
||||
virtual int32_t getMatchDegree(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const;
|
||||
|
||||
/**
|
||||
* Return the number of characters of the text that match this rule. If
|
||||
* there is a mismatch, return -1. If the text is not long enough to match
|
||||
* any characters, return 0.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @return -1 if there is a mismatch, 0 if the text is not long enough to
|
||||
* match any characters, otherwise the number of characters of text that
|
||||
* match this rule.
|
||||
*/
|
||||
virtual int32_t getRegionMatchLength(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const;
|
||||
|
||||
/**
|
||||
* Return true if the given key matches the given text. This method
|
||||
* accounts for the fact that the key character may represent a character
|
||||
* set. Note that the key and text characters may not be interchanged
|
||||
* without altering the results.
|
||||
* @param keyChar a character in the match key
|
||||
* @param textChar a character in the text being transliterated
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
*/
|
||||
virtual UBool charMatches(UChar keyChar, const Replaceable& textChar,
|
||||
int32_t index,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const;
|
||||
UMatchDegree matchAndReplace(Replaceable& text,
|
||||
UTransPosition& pos,
|
||||
UBool incremental) const;
|
||||
|
||||
/**
|
||||
* Create a rule string that represents this rule object. Append
|
||||
* it to the given string.
|
||||
*/
|
||||
virtual UnicodeString& toRule(UnicodeString& pat,
|
||||
const TransliterationRuleData& data,
|
||||
UBool escapeUnprintable) const;
|
||||
private:
|
||||
|
||||
|
|
|
@ -74,7 +74,7 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
|
|||
ruleVector->addElement(adoptedRule);
|
||||
|
||||
int32_t len;
|
||||
if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) {
|
||||
if ((len = adoptedRule->getContextLength()) > maxContextLength) {
|
||||
maxContextLength = len;
|
||||
}
|
||||
|
||||
|
@ -92,8 +92,7 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
|
|||
* That is, <code>freeze()</code> may be called multiple times,
|
||||
* although for optimal performance it shouldn't be.
|
||||
*/
|
||||
void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
|
||||
UErrorCode& status) {
|
||||
void TransliterationRuleSet::freeze(UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
@ -124,7 +123,7 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
|
|||
int16_t* indexValue = new int16_t[n];
|
||||
for (j=0; j<n; ++j) {
|
||||
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
|
||||
indexValue[j] = r->getIndexValue(data);
|
||||
indexValue[j] = r->getIndexValue();
|
||||
}
|
||||
for (x=0; x<256; ++x) {
|
||||
index[x] = v.size();
|
||||
|
@ -139,7 +138,7 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
|
|||
// matchesIndexValue check. In practice this happens
|
||||
// rarely, so we seldom tread this code path.
|
||||
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
|
||||
if (r->matchesIndexValue((uint8_t)x, data)) {
|
||||
if (r->matchesIndexValue((uint8_t)x)) {
|
||||
v.addElement(r);
|
||||
}
|
||||
}
|
||||
|
@ -192,87 +191,40 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
|
|||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @return the matching rule, or null if none found.
|
||||
* Transliterate the given text with the given UTransPosition
|
||||
* indices. Return TRUE if the transliteration should continue
|
||||
* or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
|
||||
* Note that FALSE is only ever returned if incremental is TRUE.
|
||||
* @param text the text to be transliterated
|
||||
* @param pos the position indices, which will be updated
|
||||
* @param incremental if TRUE, assume new text may be inserted
|
||||
* at pos.limit, and return FALSE if there is a partial match.
|
||||
* @return TRUE unless a U_PARTIAL_MATCH has been obtained,
|
||||
* indicating that transliteration should stop until more text
|
||||
* arrives.
|
||||
*/
|
||||
TransliterationRule*
|
||||
TransliterationRuleSet::findMatch(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const {
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
int16_t x = (int16_t) (text.charAt(pos.start) & 0xFF);
|
||||
for (int32_t i=index[x]; i<index[x+1]; ++i) {
|
||||
if (rules[i]->matches(text, pos, data)) {
|
||||
return rules[i];
|
||||
UBool TransliterationRuleSet::transliterate(Replaceable& text,
|
||||
UTransPosition& pos,
|
||||
UBool incremental) {
|
||||
int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF);
|
||||
for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {
|
||||
UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);
|
||||
switch (m) {
|
||||
case U_MATCH:
|
||||
return TRUE;
|
||||
case U_PARTIAL_MATCH:
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* Unlike <code>findMatch()</code>, this method does an incremental match.
|
||||
* An incremental match requires that there be no partial matches that might
|
||||
* pre-empt the full match that is found. If there are partial matches,
|
||||
* then null is returned. A non-null result indicates that a full match has
|
||||
* been found, and that it cannot be pre-empted by a partial match
|
||||
* regardless of what additional text is added to the translation buffer.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param partial output parameter. <code>partial[0]</code> is set to
|
||||
* true if a partial match is returned.
|
||||
* @return the matching rule, or null if none found, or if the text buffer
|
||||
* does not have enough text yet to unambiguously match a rule.
|
||||
*/
|
||||
TransliterationRule*
|
||||
TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data,
|
||||
UBool& isPartial) const {
|
||||
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
isPartial = FALSE;
|
||||
int16_t x = (int16_t) (text.charAt(pos.start) & 0xFF);
|
||||
for (int32_t i=index[x]; i<index[x+1]; ++i) {
|
||||
int32_t match = rules[i]->getMatchDegree(text, pos, data);
|
||||
switch (match) {
|
||||
case TransliterationRule::FULL_MATCH:
|
||||
return rules[i];
|
||||
case TransliterationRule::PARTIAL_MATCH:
|
||||
isPartial = TRUE;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
// No match or partial match from any rule
|
||||
++pos.start;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create rule strings that represent this rule set.
|
||||
*/
|
||||
UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
|
||||
const TransliterationRuleData& data,
|
||||
UBool escapeUnprintable) const {
|
||||
int32_t i;
|
||||
int32_t count = index[256];
|
||||
|
@ -281,7 +233,7 @@ UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
|
|||
if (i != 0) {
|
||||
ruleSource.append((UChar) 0x000A /*\n*/);
|
||||
}
|
||||
rules[i]->toRule(ruleSource, data, escapeUnprintable);
|
||||
rules[i]->toRule(ruleSource, escapeUnprintable);
|
||||
}
|
||||
return ruleSource;
|
||||
}
|
||||
|
|
|
@ -18,15 +18,7 @@ class UnicodeFilter;
|
|||
class UnicodeString;
|
||||
|
||||
/**
|
||||
* A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
|
||||
* the transliteration in one direction from one set of characters or short
|
||||
* strings to another. A <code>RuleBasedTransliterator</code> consists of up to
|
||||
* two such sets, one for the forward direction, and one for the reverse.
|
||||
*
|
||||
* <p>A <code>TransliterationRuleSet</code> has one important operation, that of
|
||||
* finding a matching rule at a given point in the text. This is accomplished
|
||||
* by the <code>findMatch()</code> method.
|
||||
*
|
||||
* A set of rules for a <code>RuleBasedTransliterator</code>.
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class TransliterationRuleSet {
|
||||
|
@ -98,59 +90,24 @@ public:
|
|||
* That is, <code>freeze()</code> may be called multiple times,
|
||||
* although for optimal performance it shouldn't be.
|
||||
*/
|
||||
virtual void freeze(const TransliterationRuleData& data,
|
||||
UErrorCode& status);
|
||||
|
||||
virtual void freeze(UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
* Transliterate the given text with the given UTransPosition
|
||||
* indices. Return TRUE if the transliteration should continue
|
||||
* or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
|
||||
* Note that FALSE is only ever returned if isIncremental is TRUE.
|
||||
* @param text the text to be transliterated
|
||||
* @param index the position indices, which will be updated
|
||||
* @param isIncremental if TRUE, assume new text may be inserted
|
||||
* at index.limit, and return FALSE if there is a partial match.
|
||||
* @return TRUE unless a U_PARTIAL_MATCH has been obtained,
|
||||
* indicating that transliteration should stop until more text
|
||||
* arrives.
|
||||
*/
|
||||
virtual TransliterationRule* findMatch(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data) const;
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* Unlike <code>findMatch()</code>, this method does an incremental match.
|
||||
* An incremental match requires that there be no partial matches that might
|
||||
* pre-empt the full match that is found. If there are partial matches,
|
||||
* then null is returned. A non-null result indicates that a full match has
|
||||
* been found, and that it cannot be pre-empted by a partial match
|
||||
* regardless of what additional text is added to the translation buffer.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param partial output parameter. <code>partial[0]</code> is set to
|
||||
* true if a partial match is returned.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found, or if the text buffer
|
||||
* does not have enough text yet to unambiguously match a rule.
|
||||
*/
|
||||
virtual TransliterationRule* findIncrementalMatch(const Replaceable& text,
|
||||
const UTransPosition& pos,
|
||||
const TransliterationRuleData& data,
|
||||
UBool& isPartial) const;
|
||||
UBool transliterate(Replaceable& text,
|
||||
UTransPosition& index,
|
||||
UBool isIncremental);
|
||||
|
||||
/**
|
||||
* Create rule strings that represent this rule set.
|
||||
|
@ -158,7 +115,6 @@ public:
|
|||
* contents will be deleted.
|
||||
*/
|
||||
virtual UnicodeString& toRules(UnicodeString& result,
|
||||
const TransliterationRuleData& data,
|
||||
UBool escapeUnprintable) const;
|
||||
};
|
||||
#endif
|
||||
|
|
|
@ -44,7 +44,7 @@ public:
|
|||
* Look up the UnicodeSet associated with the given character, and
|
||||
* return it. Return <tt>null</tt> if not found.
|
||||
*/
|
||||
virtual const UnicodeSet* lookupSet(UChar ch) const = 0;
|
||||
virtual const UnicodeSet* lookupSet(UChar32 ch) const = 0;
|
||||
|
||||
/**
|
||||
* Parse a symbol reference name from the given string, starting
|
||||
|
|
|
@ -281,10 +281,20 @@ void Transliterator::transliterate(Replaceable& text,
|
|||
*/
|
||||
void Transliterator::transliterate(Replaceable& text,
|
||||
UTransPosition& index,
|
||||
UChar insertion,
|
||||
UChar32 insertion,
|
||||
UErrorCode& status) const {
|
||||
UnicodeString str(insertion);
|
||||
_transliterate(text, index, &str, status);
|
||||
if (UTF_IS_LEAD(insertion)) {
|
||||
// Oops, the caller passed us a single lead surrogate. In
|
||||
// general, we don't support this, but we'll do the caller a
|
||||
// favor in the special case of LEAD followed by TRAIL
|
||||
// insertion. Anything else won't work.
|
||||
text.handleReplaceBetween(index.limit, index.limit, str);
|
||||
++index.limit;
|
||||
++index.contextLimit;
|
||||
} else {
|
||||
_transliterate(text, index, &str, status);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -351,8 +361,18 @@ void Transliterator::_transliterate(Replaceable& text,
|
|||
|
||||
filteredTransliterate(text, index, TRUE);
|
||||
|
||||
index.contextStart = uprv_max(index.start - getMaximumContextLength(),
|
||||
originalStart);
|
||||
// The purpose of the code below is to keep the context small
|
||||
// while doing incremental transliteration. When part of the left
|
||||
// context (between contextStart and start) is no longer needed,
|
||||
// we try to advance contextStart past that portion. We use the
|
||||
// maximum context length to do so.
|
||||
int32_t newCS = index.start;
|
||||
int32_t n = getMaximumContextLength();
|
||||
while (newCS > originalStart && n-- > 0) {
|
||||
--newCS;
|
||||
newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
|
||||
}
|
||||
index.contextStart = uprv_max(newCS, originalStart);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -21,7 +21,7 @@ public:
|
|||
NullFilter(UBool r) { result = r; }
|
||||
NullFilter(const NullFilter& f) : UnicodeFilter(f) { result = f.result; }
|
||||
virtual ~NullFilter() {}
|
||||
virtual UBool contains(UChar /*c*/) const { return result; }
|
||||
virtual UBool contains(UChar32 /*c*/) const { return result; }
|
||||
virtual UnicodeFilter* clone() const { return new NullFilter(*this); }
|
||||
};
|
||||
|
||||
|
@ -31,7 +31,7 @@ public:
|
|||
UnicodeNotFilter(UnicodeFilter* adopted);
|
||||
UnicodeNotFilter(const UnicodeNotFilter&);
|
||||
virtual ~UnicodeNotFilter();
|
||||
virtual UBool contains(UChar c) const;
|
||||
virtual UBool contains(UChar32 c) const;
|
||||
virtual UnicodeFilter* clone() const;
|
||||
};
|
||||
|
||||
|
@ -39,7 +39,7 @@ UnicodeNotFilter::UnicodeNotFilter(UnicodeFilter* adopted) : filt(adopted) {}
|
|||
UnicodeNotFilter::UnicodeNotFilter(const UnicodeNotFilter& f)
|
||||
: UnicodeFilter(f), filt(f.filt->clone()) {}
|
||||
UnicodeNotFilter::~UnicodeNotFilter() { delete filt; }
|
||||
UBool UnicodeNotFilter::contains(UChar c) const { return !filt->contains(c); }
|
||||
UBool UnicodeNotFilter::contains(UChar32 c) const { return !filt->contains(c); }
|
||||
UnicodeFilter* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
|
||||
|
||||
/**
|
||||
|
@ -61,7 +61,7 @@ public:
|
|||
UnicodeAndFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
|
||||
UnicodeAndFilter(const UnicodeAndFilter&);
|
||||
virtual ~UnicodeAndFilter();
|
||||
virtual UBool contains(UChar c) const;
|
||||
virtual UBool contains(UChar32 c) const;
|
||||
virtual UnicodeFilter* clone() const;
|
||||
};
|
||||
|
||||
|
@ -69,7 +69,7 @@ UnicodeAndFilter::UnicodeAndFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1
|
|||
UnicodeAndFilter::UnicodeAndFilter(const UnicodeAndFilter& f)
|
||||
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
|
||||
UnicodeAndFilter::~UnicodeAndFilter() { delete filt1; delete filt2; }
|
||||
UBool UnicodeAndFilter::contains(UChar c) const { return filt1->contains(c) && filt2->contains(c); }
|
||||
UBool UnicodeAndFilter::contains(UChar32 c) const { return filt1->contains(c) && filt2->contains(c); }
|
||||
UnicodeFilter* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
|
||||
|
||||
/**
|
||||
|
@ -99,7 +99,7 @@ public:
|
|||
UnicodeOrFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
|
||||
UnicodeOrFilter(const UnicodeOrFilter&);
|
||||
virtual ~UnicodeOrFilter();
|
||||
virtual UBool contains(UChar c) const;
|
||||
virtual UBool contains(UChar32 c) const;
|
||||
virtual UnicodeFilter* clone() const;
|
||||
};
|
||||
|
||||
|
@ -107,7 +107,7 @@ UnicodeOrFilter::UnicodeOrFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f
|
|||
UnicodeOrFilter::UnicodeOrFilter(const UnicodeOrFilter& f)
|
||||
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
|
||||
UnicodeOrFilter::~UnicodeOrFilter() { delete filt1; delete filt2; }
|
||||
UBool UnicodeOrFilter::contains(UChar c) const { return filt1->contains(c) || filt2->contains(c); }
|
||||
UBool UnicodeOrFilter::contains(UChar32 c) const { return filt1->contains(c) || filt2->contains(c); }
|
||||
UnicodeFilter* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
|
||||
|
||||
/**
|
||||
|
|
|
@ -542,17 +542,6 @@ UBool UnicodeSet::contains(UChar32 c) const {
|
|||
return ((i & 1) != 0); // return true if odd
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement UnicodeFilter:
|
||||
* Returns <tt>true</tt> if this set contains the specified char.
|
||||
*
|
||||
* @return <tt>true</tt> if this set contains the specified char.
|
||||
* @draft
|
||||
*/
|
||||
UBool UnicodeSet::contains(UChar c) const {
|
||||
return contains((UChar32) c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns <tt>true</tt> if this set contains any character whose low byte
|
||||
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
|
||||
|
@ -581,6 +570,24 @@ UBool UnicodeSet::containsIndexValue(uint8_t v) const {
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of UnicodeMatcher::matches().
|
||||
*/
|
||||
UMatchDegree UnicodeSet::matches(const Replaceable& text,
|
||||
int32_t& offset,
|
||||
int32_t limit,
|
||||
UBool incremental) const {
|
||||
if (offset == limit) {
|
||||
if (contains(TransliterationRule::ETHER)) {
|
||||
return incremental ? U_PARTIAL_MATCH : U_MATCH;
|
||||
} else {
|
||||
return U_MISMATCH;
|
||||
}
|
||||
} else {
|
||||
return UnicodeFilter::matches(text, offset, limit, incremental);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the specified range to this set if it is not already
|
||||
* present. If this set already contains the specified range,
|
||||
|
@ -895,7 +902,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
UBool invert = FALSE;
|
||||
clear();
|
||||
|
||||
int32_t lastChar = -1; // This is either a char (0..FFFF) or -1
|
||||
const UChar32 NONE = (UChar32) -1;
|
||||
UChar32 lastChar = NONE; // This is either a char (0..10FFFF) or NONE
|
||||
UChar lastOp = 0;
|
||||
|
||||
/* This loop iterates over the characters in the pattern. We start at
|
||||
|
@ -916,8 +924,9 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
// mode 1: '[' seen; if next is '^' or ':' then special
|
||||
// mode 2: '[' '^'? seen; parse pattern and close with ']'
|
||||
// mode 3: '[:' seen; parse category and close with ':]'
|
||||
// mode 4: Pattern closed cleanly
|
||||
int8_t mode = 0;
|
||||
int32_t openPos = 0; // offset to opening '['
|
||||
int32_t colonPos = 0; // Expected pos of ':' in '[:'
|
||||
int32_t i = pos.getIndex();
|
||||
int32_t limit = pattern.length();
|
||||
UnicodeSet nestedAux;
|
||||
|
@ -930,7 +939,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
const UnicodeString* varValueBuffer = NULL;
|
||||
int32_t ivarValueBuffer = 0;
|
||||
int32_t anchor = 0;
|
||||
for (; i<limit; i+=((varValueBuffer==NULL)?1:0)) {
|
||||
UChar32 c;
|
||||
while (i<limit) {
|
||||
/* If the next element is a single character, c will be set to it,
|
||||
* and nestedSet will be null. In this case isLiteral indicates
|
||||
* whether the character should assume special meaning if it has
|
||||
|
@ -941,23 +951,25 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
*/
|
||||
nestedSet = NULL;
|
||||
UBool isLiteral = FALSE;
|
||||
UChar c;
|
||||
if (varValueBuffer != NULL) {
|
||||
if (ivarValueBuffer < varValueBuffer->length()) {
|
||||
c = varValueBuffer->charAt(ivarValueBuffer++);
|
||||
c = varValueBuffer->char32At(ivarValueBuffer);
|
||||
ivarValueBuffer += UTF_CHAR_LENGTH(c);
|
||||
nestedSet = symbols->lookupSet(c); // may be NULL
|
||||
nestedPatDone = FALSE;
|
||||
} else {
|
||||
varValueBuffer = NULL;
|
||||
c = pattern.charAt(i);
|
||||
c = pattern.char32At(i);
|
||||
i += UTF_CHAR_LENGTH(c);
|
||||
}
|
||||
} else {
|
||||
c = pattern.charAt(i);
|
||||
c = pattern.char32At(i);
|
||||
i += UTF_CHAR_LENGTH(c);
|
||||
}
|
||||
|
||||
// Ignore whitespace. This is not Unicode whitespace, but Java
|
||||
// whitespace, a subset of Unicode whitespace.
|
||||
if (Unicode::isWhitespace(c)) {
|
||||
if (u_isspace(c)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -971,7 +983,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
case 0:
|
||||
if (c == SET_OPEN) {
|
||||
mode = 1; // Next look for '^' or ':'
|
||||
openPos = i;
|
||||
colonPos = i; // Expect ':' at next offset
|
||||
continue;
|
||||
} else {
|
||||
// throw new IllegalArgumentException("Missing opening '['");
|
||||
|
@ -986,9 +998,10 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
newPat.append(c);
|
||||
continue; // Back to top to fetch next character
|
||||
case COLON:
|
||||
if (i == openPos+1) {
|
||||
// '[:' cannot have whitespace in it
|
||||
--i;
|
||||
// '[:' cannot have whitespace in it. 'i' has already
|
||||
// been advanced.
|
||||
if (i-1 == colonPos) {
|
||||
--i; // Backup to the '['
|
||||
c = SET_OPEN;
|
||||
mode = 3;
|
||||
// Fall through and parse category using the same
|
||||
|
@ -1018,15 +1031,13 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == BACKSLASH) {
|
||||
++i; // Advance past '\\'
|
||||
UChar32 escaped = pattern.unescapeAt(i);
|
||||
if (escaped == (UChar32) -1) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
isLiteral = TRUE;
|
||||
--i; // Move i back to last parsed character
|
||||
c = (UChar) escaped;
|
||||
c = escaped;
|
||||
}
|
||||
|
||||
/* Parse variable references. These are treated as literals. If a
|
||||
|
@ -1036,7 +1047,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
* Set variables are only looked up if varCharToSet is not null.
|
||||
*/
|
||||
else if (symbols != NULL && !isLiteral && c == SymbolTable::SYMBOL_REF) {
|
||||
pos.setIndex(++i);
|
||||
pos.setIndex(i);
|
||||
UnicodeString name = symbols->parseReference(pattern, pos, limit);
|
||||
if (name.length() != 0) {
|
||||
varValueBuffer = symbols->lookup(name);
|
||||
|
@ -1052,7 +1063,6 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
// Got a null; this means we have an isolated $.
|
||||
// Tentatively assume this is an anchor.
|
||||
anchor = 1;
|
||||
--i; // Back up so loop increment works properly
|
||||
}
|
||||
continue; // Back to the top to get varValueBuffer[0]
|
||||
}
|
||||
|
@ -1069,9 +1079,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
nestedPatStart = newPat.length();
|
||||
|
||||
// Handle "[:...:]", representing a character category
|
||||
UChar d = charAfter(pattern, i);
|
||||
if (d == COLON) {
|
||||
i += 2;
|
||||
if (i < pattern.length() && pattern.charAt(i) == COLON) {
|
||||
++i;
|
||||
int32_t j = pattern.indexOf(CATEGORY_CLOSE, i);
|
||||
if (j < 0) {
|
||||
// throw new IllegalArgumentException("Missing \":]\"");
|
||||
|
@ -1086,7 +1095,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
i = j+1; // Make i point to ']' in ":]"
|
||||
i = j+2; // Advance i past ":]"
|
||||
|
||||
// Use a rebuilt pattern. If we are top level,
|
||||
// then there is already a SET_OPEN in newPat, and
|
||||
|
@ -1105,11 +1114,13 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
// loop. This is one of 2 ways we leave this
|
||||
// loop if the pattern is well-formed.
|
||||
*this = *nestedSet;
|
||||
mode = 4;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
pos.setIndex(i);
|
||||
// Backup i to '['.
|
||||
pos.setIndex(--i);
|
||||
switch (lastOp) {
|
||||
case HYPHEN:
|
||||
case INTERSECTION:
|
||||
|
@ -1122,7 +1133,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
i = pos.getIndex() - 1; // - 1 to point at ']'
|
||||
i = pos.getIndex();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1136,7 +1147,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
* ']' have special meanings.
|
||||
*/
|
||||
if (nestedSet != NULL) {
|
||||
if (lastChar >= 0) {
|
||||
if (lastChar != NONE) {
|
||||
if (lastOp != 0) {
|
||||
// throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -1154,7 +1165,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
} else {
|
||||
_appendToPat(newPat, lastChar, FALSE);
|
||||
}
|
||||
lastChar = -1;
|
||||
lastChar = NONE;
|
||||
}
|
||||
switch (lastOp) {
|
||||
case HYPHEN:
|
||||
|
@ -1193,9 +1204,11 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
newPat.append((UChar)SymbolTable::SYMBOL_REF);
|
||||
add(TransliterationRule::ETHER);
|
||||
}
|
||||
mode = 4;
|
||||
break;
|
||||
} else if (lastOp == 0 && !isLiteral && (c == HYPHEN || c == INTERSECTION)) {
|
||||
lastOp = c;
|
||||
// assert(c <= 0xFFFF);
|
||||
lastOp = (UChar) c;
|
||||
} else if (lastOp == HYPHEN) {
|
||||
if (lastChar >= c) {
|
||||
// Don't allow redundant (a-a) or empty (b-a) ranges;
|
||||
|
@ -1210,14 +1223,14 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
newPat.append(HYPHEN);
|
||||
_appendToPat(newPat, c, FALSE);
|
||||
lastOp = 0;
|
||||
lastChar = -1;
|
||||
lastChar = NONE;
|
||||
} else if (lastOp != 0) {
|
||||
// We have <set>&<char> or <char>&<char>
|
||||
// throw new IllegalArgumentException("Unquoted " + lastOp);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
} else {
|
||||
if (lastChar >= 0) {
|
||||
if (lastChar != NONE) {
|
||||
// We have <char><char>
|
||||
add(lastChar, lastChar);
|
||||
_appendToPat(newPat, lastChar, FALSE);
|
||||
|
@ -1226,7 +1239,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
}
|
||||
}
|
||||
|
||||
if (lastChar >= 0) {
|
||||
if (lastChar != NONE) {
|
||||
add(lastChar, lastChar);
|
||||
_appendToPat(newPat, lastChar, FALSE);
|
||||
}
|
||||
|
@ -1252,19 +1265,13 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
|
|||
complement();
|
||||
}
|
||||
|
||||
/**
|
||||
* i indexes the last character we parsed or is pattern.length(). In
|
||||
* the latter case, we have run off the end without finding a closing
|
||||
* ']'. Otherwise, we know i < pattern.length(), and we set the
|
||||
* ParsePosition to the next character to be parsed.
|
||||
*/
|
||||
if (i == limit) {
|
||||
if (mode != 4) {
|
||||
// throw new IllegalArgumentException("Missing ']'");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
pos.setIndex(i+1);
|
||||
pos.setIndex(i);
|
||||
|
||||
// Use the rebuilt pattern (newPat) only if necessary. Prefer the
|
||||
// generated pattern.
|
||||
|
@ -1393,14 +1400,6 @@ const UnicodeSet& UnicodeSet::getCategorySet(int8_t cat) {
|
|||
// Implementation: Utility methods
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the character after the given position, or '\uFFFE' if
|
||||
* there is none.
|
||||
* @param str the string to read from
* @param i an offset into str; the code unit at offset i+1 is the one returned
* @return str.charAt(i+1) when i+1 < str.length(), otherwise (UChar)0xFFFE
*/
|
||||
UChar UnicodeSet::charAfter(const UnicodeString& str, int32_t i) {
|
||||
// i is received by value, so the pre-increment below only changes the
// local copy; the same incremented value is used for the bounds check
// and for the charAt() lookup.
return ((++i) < str.length()) ? str.charAt(i) : (UChar)0xFFFE;
|
||||
}
|
||||
|
||||
void UnicodeSet::ensureCapacity(int32_t newLen) {
|
||||
if (newLen <= capacity) return;
|
||||
capacity = newLen + GROW_EXTRA;
|
||||
|
|
|
@ -75,7 +75,7 @@ class TestHangulFilter : public UnicodeFilter {
|
|||
virtual UnicodeFilter* clone() const {
|
||||
return new TestHangulFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
if(c == 0xae4c )
|
||||
return FALSE;
|
||||
else
|
||||
|
|
|
@ -59,7 +59,7 @@ class TestHexFilter : public UnicodeFilter {
|
|||
virtual UnicodeFilter* clone() const {
|
||||
return new TestHexFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
if(c == 0x0061 || c == 0x0063 )
|
||||
return FALSE;
|
||||
else
|
||||
|
|
|
@ -314,14 +314,20 @@ IntlTest::prettify(const UnicodeString &source,
|
|||
target.remove();
|
||||
target += "\"";
|
||||
|
||||
for (i = 0; i < source.length(); i += 1)
|
||||
for (i = 0; i < source.length(); )
|
||||
{
|
||||
UChar ch = source[i];
|
||||
UChar32 ch = source.char32At(i);
|
||||
i += UTF_CHAR_LENGTH(ch);
|
||||
|
||||
if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E)
|
||||
{
|
||||
target += "\\u";
|
||||
appendHex(ch, 4, target);
|
||||
if (ch <= 0xFFFF) {
|
||||
target += "\\u";
|
||||
appendHex(ch, 4, target);
|
||||
} else {
|
||||
target += "\\U";
|
||||
appendHex(ch, 8, target);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -343,9 +349,10 @@ IntlTest::prettify(const UnicodeString &source, UBool parseBackslash)
|
|||
target.remove();
|
||||
target += "\"";
|
||||
|
||||
for (i = 0; i < source.length(); i += 1)
|
||||
for (i = 0; i < source.length();)
|
||||
{
|
||||
UChar ch = source[i];
|
||||
UChar32 ch = source.char32At(i);
|
||||
i += UTF_CHAR_LENGTH(ch);
|
||||
|
||||
if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E)
|
||||
{
|
||||
|
@ -365,8 +372,13 @@ IntlTest::prettify(const UnicodeString &source, UBool parseBackslash)
|
|||
target.truncate(target.length() - 1);
|
||||
}
|
||||
}
|
||||
target += "\\u";
|
||||
appendHex(ch, 4, target);
|
||||
if (ch <= 0xFFFF) {
|
||||
target += "\\u";
|
||||
appendHex(ch, 4, target);
|
||||
} else {
|
||||
target += "\\U";
|
||||
appendHex(ch, 8, target);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -73,7 +73,7 @@ class TestJamoFilter : public UnicodeFilter {
|
|||
virtual UnicodeFilter* clone() const {
|
||||
return new TestJamoFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
if(c == 0x1101 )
|
||||
return FALSE;
|
||||
else
|
||||
|
|
|
@ -618,7 +618,7 @@ class TestFilter1 : public UnicodeFilter {
|
|||
virtual UnicodeFilter* clone() const {
|
||||
return new TestFilter1(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
if(c==0x63 || c==0x61 || c==0x43 || c==0x41)
|
||||
return FALSE;
|
||||
else
|
||||
|
@ -629,7 +629,7 @@ class TestFilter2 : public UnicodeFilter {
|
|||
virtual UnicodeFilter* clone() const {
|
||||
return new TestFilter2(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
if(c==0x65 || c==0x6c)
|
||||
return FALSE;
|
||||
else
|
||||
|
@ -640,7 +640,7 @@ class TestFilter3 : public UnicodeFilter {
|
|||
virtual UnicodeFilter* clone() const {
|
||||
return new TestFilter3(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
if(c==0x6f || c==0x77)
|
||||
return FALSE;
|
||||
else
|
||||
|
|
|
@ -66,6 +66,8 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE(30,TestCompoundFilter);
|
||||
TESTCASE(31,TestRemove);
|
||||
TESTCASE(32,TestToRules);
|
||||
TESTCASE(33,TestContext);
|
||||
TESTCASE(34,TestSupplemental);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
@ -152,7 +154,9 @@ void TransliteratorTest::TestSimpleRules(void) {
|
|||
*/
|
||||
expect(UnicodeString("ab>x|y;", "") +
|
||||
"yc>z",
|
||||
"eabcd", "exzd"); /* Another set of rules:
|
||||
"eabcd", "exzd");
|
||||
|
||||
/* Another set of rules:
|
||||
* 1. ab>x|yzacw
|
||||
* 2. za>q
|
||||
* 3. qc>r
|
||||
|
@ -476,7 +480,7 @@ class TestFilter : public UnicodeFilter {
|
|||
virtual UnicodeFilter* clone() const {
|
||||
return new TestFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
return c != (UChar)0x0063 /*c*/;
|
||||
}
|
||||
};
|
||||
|
@ -506,6 +510,12 @@ void TransliteratorTest::TestFiltering(void) {
|
|||
* Test anchors
|
||||
*/
|
||||
void TransliteratorTest::TestAnchors(void) {
|
||||
expect(UnicodeString("^a > 0; a$ > 2 ; a > 1;", ""),
|
||||
"aaa",
|
||||
"012");
|
||||
expect(UnicodeString("$s=[z$]; $s{a>0; a}$s>2; a>1;", ""),
|
||||
"aaa",
|
||||
"012");
|
||||
expect(UnicodeString("^ab > 01 ;"
|
||||
" ab > |8 ;"
|
||||
" b > k ;"
|
||||
|
@ -1451,18 +1461,44 @@ void TransliteratorTest::TestToRules(void) {
|
|||
}
|
||||
}
|
||||
|
||||
// Exercises rules that use {...} context markers, once with an explicit
// UTransPosition and once over a whole string.
void TransliteratorTest::TestContext() {
|
||||
// Initializer order follows the inline abbreviations:
// contextStart=0, contextLimit=2, start=0, limit=1.
// NOTE(review): this assumes UTransPosition declares its fields in that
// order -- confirm against the struct definition.
UTransPosition pos = {0, 2, 0, 1}; // cs cl s l
|
||||
// With the position above, only the first character lies in
// [start, limit) while both characters lie in the context range; the
// expected output "ye" corresponds to the second rule, {d}e > y.
expect("de > x; {d}e > y;",
|
||||
"de",
|
||||
"ye",
|
||||
&pos);
|
||||
|
||||
// No position is passed here, so expect() uses its default (pos == 0)
// path. Only the "abc" occurrence is expected to change (c -> z).
expect("ab{c} > z;",
|
||||
"xadabdabcy",
|
||||
"xadabdabzy");
|
||||
}
|
||||
|
||||
// Exercises rules whose variables, set ranges, input and output contain
// code points above U+FFFF (written as \UXXXXXXXX escapes).
void TransliteratorTest::TestSupplemental() {
|
||||
// Case 1: $a is a single supplementary code point used as rule output,
// and $s is a range of supplementary code points used as rule input.
// Expected: 'a' -> U+10300, 'b' unchanged, U+1030F (inside $s) -> 'i',
// 'x' unchanged.
expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];"
|
||||
"a > $a; $s > i;"),
|
||||
CharsToUnicodeString("ab\\U0001030Fx"),
|
||||
CharsToUnicodeString("\\U00010300bix"));
|
||||
|
||||
// Case 2: both sets mix a BMP range with a supplementary range. The rule
// captures one member of $a followed by one member of $b and emits them
// in swapped order ($2 $1). The expected string shows each matching pair
// swapped, whether its members are BMP or supplementary characters.
expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];"
|
||||
"$b=[A-Z\\U00010400-\\U0001044D];"
|
||||
"($a)($b) > $2 $1;"),
|
||||
CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
|
||||
CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
void TransliteratorTest::expect(const UnicodeString& rules,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult) {
|
||||
const UnicodeString& expectedResult,
|
||||
UTransPosition *pos) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator *t = new RuleBasedTransliterator("<ID>", rules, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("FAIL: Transliterator constructor failed");
|
||||
} else {
|
||||
expect(*t, source, expectedResult);
|
||||
expect(*t, source, expectedResult, pos);
|
||||
}
|
||||
delete t;
|
||||
}
|
||||
|
@ -1477,34 +1513,49 @@ void TransliteratorTest::expect(const Transliterator& t,
|
|||
|
||||
void TransliteratorTest::expect(const Transliterator& t,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult) {
|
||||
UnicodeString result(source);
|
||||
t.transliterate(result);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
const UnicodeString& expectedResult,
|
||||
UTransPosition *pos) {
|
||||
if (pos == 0) {
|
||||
UnicodeString result(source);
|
||||
t.transliterate(result);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
}
|
||||
|
||||
UTransPosition index={0, 0, 0, 0};
|
||||
if (pos != 0) {
|
||||
index = *pos;
|
||||
}
|
||||
|
||||
UnicodeString rsource(source);
|
||||
t.transliterate(rsource);
|
||||
if (pos == 0) {
|
||||
t.transliterate(rsource);
|
||||
} else {
|
||||
// Do it all at once -- below we do it incrementally
|
||||
t.finishTransliteration(rsource, *pos);
|
||||
}
|
||||
expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
|
||||
|
||||
// Test keyboard (incremental) transliteration -- this result
|
||||
// must be the same after we finalize (see below).
|
||||
rsource.remove();
|
||||
UTransPosition index={0, 0, 0, 0};
|
||||
UnicodeString log;
|
||||
|
||||
for (int32_t i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
rsource.remove();
|
||||
if (pos != 0) {
|
||||
rsource = source;
|
||||
formatInput(log, rsource, index);
|
||||
log.append(" -> ");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
t.transliterate(rsource, index, source.charAt(i), status);
|
||||
// Append the string buffer with a vertical bar '|' where
|
||||
// the committed index is.
|
||||
UnicodeString left, right;
|
||||
rsource.extractBetween(0, index.start, left);
|
||||
rsource.extractBetween(index.start, rsource.length(), right);
|
||||
log.append(left).append((UChar)PIPE).append(right);
|
||||
t.transliterate(rsource, index, status);
|
||||
formatInput(log, rsource, index);
|
||||
} else {
|
||||
for (int32_t i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
t.transliterate(rsource, index, source.charAt(i), status);
|
||||
formatInput(log, rsource, index);
|
||||
}
|
||||
}
|
||||
|
||||
// As a final step in keyboard transliteration, we must call
|
||||
|
@ -1518,6 +1569,41 @@ void TransliteratorTest::expect(const Transliterator& t,
|
|||
expectedResult);
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a human-readable rendering of a string and its transliteration
* position to appendTo, marking the context and start/limit boundaries.
* If the offsets in pos are not ordered and within the bounds of input,
* a diagnostic listing the four offsets and the input is appended instead.
* @param appendTo result is appended to this param.
|
||||
* @param input the string being transliterated
|
||||
* @param pos the index struct
|
||||
* @return a reference to appendTo
*/
|
||||
UnicodeString& TransliteratorTest::formatInput(UnicodeString &appendTo,
|
||||
const UnicodeString& input,
|
||||
const UTransPosition& pos) {
|
||||
// Output a string of the form aaa{bbb|ccc|ddd}eee, where
|
||||
// the {} indicate the context start and limit, and the ||
|
||||
// indicate the start and limit.
|
||||
// Require 0 <= contextStart <= start <= limit <= contextLimit <= length
// so that every extractBetween() call below receives an ordered,
// in-bounds range.
if (0 <= pos.contextStart &&
|
||||
pos.contextStart <= pos.start &&
|
||||
pos.start <= pos.limit &&
|
||||
pos.limit <= pos.contextLimit &&
|
||||
pos.contextLimit <= input.length()) {
|
||||
|
||||
// a..e are the five consecutive slices of input delimited by the four
// offsets; concatenated in order they reproduce input.
UnicodeString a, b, c, d, e;
|
||||
input.extractBetween(0, pos.contextStart, a);
|
||||
input.extractBetween(pos.contextStart, pos.start, b);
|
||||
input.extractBetween(pos.start, pos.limit, c);
|
||||
input.extractBetween(pos.limit, pos.contextLimit, d);
|
||||
input.extractBetween(pos.contextLimit, input.length(), e);
|
||||
// 123 and 125 are the code points of '{' and '}' (see the inline notes).
appendTo.append(a).append((UChar)123/*{*/).append(b).
|
||||
append((UChar)PIPE).append(c).append((UChar)PIPE).append(d).
|
||||
append((UChar)125/*}*/).append(e);
|
||||
} else {
|
||||
// Offsets are inconsistent: report them rather than slicing the string.
appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
|
||||
pos.contextStart + ", s=" + pos.start + ", l=" +
|
||||
pos.limit + ", cl=" + pos.contextLimit + "} on " +
|
||||
input);
|
||||
}
|
||||
return appendTo;
|
||||
}
|
||||
|
||||
void TransliteratorTest::expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& result,
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#define TRANSTST_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/translit.h"
|
||||
#include "intltest.h"
|
||||
|
||||
class Transliterator;
|
||||
|
@ -167,13 +168,18 @@ class TransliteratorTest : public IntlTest {
|
|||
|
||||
void TestToRules(void);
|
||||
|
||||
void TestContext(void);
|
||||
|
||||
void TestSupplemental(void);
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
protected:
|
||||
void expect(const UnicodeString& rules,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult);
|
||||
const UnicodeString& expectedResult,
|
||||
UTransPosition *pos=0);
|
||||
|
||||
void expect(const Transliterator& t,
|
||||
const UnicodeString& source,
|
||||
|
@ -182,7 +188,8 @@ class TransliteratorTest : public IntlTest {
|
|||
|
||||
void expect(const Transliterator& t,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult);
|
||||
const UnicodeString& expectedResult,
|
||||
UTransPosition *pos=0);
|
||||
|
||||
void expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& source,
|
||||
|
@ -192,6 +199,10 @@ class TransliteratorTest : public IntlTest {
|
|||
virtual void expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& summary, UBool pass,
|
||||
const UnicodeString& expectedResult);
|
||||
|
||||
static UnicodeString& formatInput(UnicodeString &appendTo,
|
||||
const UnicodeString& input,
|
||||
const UTransPosition& pos);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -39,7 +39,7 @@ class Filter1: public UnicodeFilter{
|
|||
virtual UnicodeFilter* clone() const{
|
||||
return new Filter1(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
if(c == 0x0061 || c == 0x0041 || c == 0x0063 || c == 0x0043)
|
||||
return FALSE;
|
||||
else
|
||||
|
@ -50,7 +50,7 @@ class Filter2: public UnicodeFilter{
|
|||
virtual UnicodeFilter* clone() const{
|
||||
return new Filter2(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
if(c == 0x0079 || c == 0x0059 || c == 0x007a || c == 0x005a || c == 0x0061 || c == 0x0063)
|
||||
return FALSE;
|
||||
else
|
||||
|
|
|
@ -71,7 +71,7 @@ class TestUniFilter : public UnicodeFilter {
|
|||
virtual UnicodeFilter* clone() const {
|
||||
return new TestUniFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar c) const {
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
if(c==0x0063 || c==0x0061 || c==0x0043 || c==0x0041)
|
||||
return FALSE;
|
||||
else
|
||||
|
|
Loading…
Add table
Reference in a new issue