ICU-1052 redesign of engine to support supplemental characters

X-SVN-Rev: 5341
This commit is contained in:
Alan Liu 2001-07-25 19:11:02 +00:00
parent d7c3eebf46
commit 7edf9d3e80
23 changed files with 675 additions and 637 deletions

View file

@ -71,7 +71,8 @@ udat.o umsg.o \
unifltlg.o unirange.o uniset.o unitohex.o unum.o \
dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o brkdict.o nultrans.o jamohang.o hangjamo.o \
remtrans.o utrans.o \
titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o
titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o \
unifilt.o
STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))

View file

@ -318,6 +318,10 @@ SOURCE=.\uni2name.cpp
# End Source File
# Begin Source File
SOURCE=.\unifilt.cpp
# End Source File
# Begin Source File
SOURCE=.\unifltlg.cpp
# End Source File
# Begin Source File
@ -1548,6 +1552,25 @@ InputPath=.\unicode\unifltlg.h
# End Source File
# Begin Source File
SOURCE=.\unicode\unimatch.h
!IF "$(CFG)" == "i18n - Win32 Release"
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
# Begin Custom Build
InputPath=.\unicode\unimatch.h
"..\..\include\unicode\unimatch.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy unicode\unimatch.h ..\..\include\unicode
# End Custom Build
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\unirange.h
# End Source File
# Begin Source File

View file

@ -89,18 +89,18 @@ RuleBasedTransliterator::clone(void) const {
void
RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
UBool isIncremental) const {
/* We keep start and limit fixed the entire time,
* relative to the text -- limit may move numerically if text is
* inserted or removed. The cursor moves from start to limit, with
* replacements happening under it.
/* We keep contextStart and contextLimit fixed the entire time,
* relative to the text -- contextLimit may move numerically if
* text is inserted or removed. The start offset moves toward
* limit, with replacements happening under it.
*
* Example: rules 1. ab>x|y
* 2. yc>z
*
* |eabcd start - no match, advance cursor
* e|abcd match rule 1 - change text & adjust cursor
* ex|ycd match rule 2 - change text & adjust cursor
* exz|d no match, advance cursor
* |eabcd begin - no match, advance start
* e|abcd match rule 1 - change text & adjust start
* ex|ycd match rule 2 - change text & adjust start
* exz|d no match, advance start
* exzd| done
*/
@ -121,39 +121,14 @@ RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition&
loopLimit <<= 4;
}
UBool isPartial = FALSE;
while (index.start < index.limit && loopCount <= loopLimit) {
TransliterationRule* r = isIncremental ?
data->ruleSet.findIncrementalMatch(text, index, *data, isPartial) :
data->ruleSet.findMatch(text, index, *data);
/* If we match a rule then apply it by replacing the key
* with the rule output and repositioning the cursor
* appropriately. If we get a partial match, then we
* can't do anything without more text; return with the
* cursor at the current position. If we get null, then
* there is no match at this position, and we can advance
* the cursor.
*/
if (r == 0) {
if (isPartial) { // always FALSE unless isIncremental
break;
} else {
++index.start;
}
} else {
// Delegate replacement to TransliterationRule object
int32_t lenDelta = r->replace(text, index.start, *data);
index.limit += lenDelta;
index.contextLimit += lenDelta;
index.start += r->getCursorPos();
++loopCount;
}
while (index.start < index.limit &&
loopCount <= loopLimit &&
data->ruleSet.transliterate(text, index, isIncremental)) {
++loopCount;
}
}
UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
UBool escapeUnprintable) const {
return data->ruleSet.toRules(rulesSource, *data, escapeUnprintable);
return data->ruleSet.toRules(rulesSource, escapeUnprintable);
}

View file

@ -64,13 +64,13 @@ TransliterationRuleData::~TransliterationRuleData() {
}
const UnicodeSet*
TransliterationRuleData::lookupSet(UChar standIn) const {
TransliterationRuleData::lookupSet(UChar32 standIn) const {
int32_t i = standIn - setVariablesBase;
return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
}
int32_t
TransliterationRuleData::lookupSegmentReference(UChar c) const {
TransliterationRuleData::lookupSegmentReference(UChar32 c) const {
int32_t i = c - segmentBase;
return (i >= 0 && i < 9) ? i : -1;
}

View file

@ -90,14 +90,14 @@ public:
~TransliterationRuleData();
const UnicodeSet* lookupSet(UChar standIn) const;
const UnicodeSet* lookupSet(UChar32 standIn) const;
/**
* Return the zero-based index of the segment represented by the given
* character, or -1 if none. Repeat: This is a zero-based return value,
* 0..8, even though these are notated "$1".."$9".
*/
int32_t lookupSegmentReference(UChar c) const;
int32_t lookupSegmentReference(UChar32 c) const;
/**
* Return the character used to stand for the given segment reference.

View file

@ -75,7 +75,7 @@ public:
virtual const UnicodeString* lookup(const UnicodeString& s) const;
virtual const UnicodeSet* lookupSet(UChar ch) const;
virtual const UnicodeSet* lookupSet(UChar32 ch) const;
virtual UnicodeString parseReference(const UnicodeString& text,
ParsePosition& pos, int32_t limit) const;
@ -95,7 +95,7 @@ const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
/**
* Implement SymbolTable API.
*/
const UnicodeSet* ParseData::lookupSet(UChar ch) const {
const UnicodeSet* ParseData::lookupSet(UChar32 ch) const {
// Note that we cannot use data.lookupSet() because the
// set array has not been constructed yet.
const UnicodeSet* set = NULL;
@ -682,7 +682,7 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
// Index the rules
if (U_SUCCESS(status)) {
data->ruleSet.freeze(*data, status);
data->ruleSet.freeze(status);
if (idSplitPointResult < 0) {
idSplitPointResult = idBlockResult.length();
}
@ -849,6 +849,7 @@ int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
right->text, right->cursor, right->cursorOffset,
left->createSegments(),
left->anchorStart, left->anchorEnd,
*data,
status), status);
return pos;

View file

@ -52,7 +52,9 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
int32_t cursorPosition, int32_t cursorOffset,
int32_t* adoptedSegs,
UBool anchorStart, UBool anchorEnd,
UErrorCode& status) {
const TransliterationRuleData& theData,
UErrorCode& status) :
data(theData) {
init(input, anteContextPos, postContextPos,
outputStr, cursorPosition, cursorOffset, adoptedSegs,
anchorStart, anchorEnd, status);
@ -79,7 +81,9 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
int32_t anteContextPos, int32_t postContextPos,
const UnicodeString& outputStr,
int32_t cursorPosition,
UErrorCode& status) {
const TransliterationRuleData& theData,
UErrorCode& status) :
data(theData) {
init(input, anteContextPos, postContextPos,
outputStr, cursorPosition, 0, NULL, FALSE, FALSE, status);
}
@ -92,7 +96,9 @@ TransliterationRule::TransliterationRule(TransliterationRule& other) :
output(other.output),
anteContextLength(other.anteContextLength),
keyLength(other.keyLength),
cursorPos(other.cursorPos) {
cursorPos(other.cursorPos),
flags(other.flags),
data(other.data) {
segments = 0;
if (other.segments != 0) {
@ -153,32 +159,27 @@ void TransliterationRule::init(const UnicodeString& input,
// We don't validate the segments array. The caller must
// guarantee that the segments are well-formed.
this->segments = adoptedSegs;
// Find the position of the first segment index that is after the
// anteContext (in the key). Note that this may be a start or a
// limit index.
firstKeySeg = -1;
if (segments != 0) {
do {
++firstKeySeg;
} while (segments[firstKeySeg] >= 0 &&
segments[firstKeySeg] < anteContextLength);
if (segments[firstKeySeg] < 0) {
firstKeySeg = -1;
}
}
// Implement anchors by inserting an ETHER character on the
// left or right. If on the left, then the indices must be
// incremented. If on the right, no index change is
// necessary.
if (anchorStart || anchorEnd) {
pattern.truncate(0);
if (anchorStart) {
pattern.append(ETHER);
++anteContextLength;
// Adjust segment offsets
if (segments != 0) {
int32_t *p = segments;
// The end marker is a -1.
while (*p != -1) {
++(*p);
++p;
}
}
}
pattern.append(input);
if (anchorEnd) {
pattern.append(ETHER);
}
} else {
pattern = input;
pattern = input;
flags = 0;
if (anchorStart) {
flags |= ANCHOR_START;
}
if (anchorEnd) {
flags |= ANCHOR_END;
}
}
@ -197,10 +198,14 @@ int32_t TransliterationRule::getCursorPos(void) const {
/**
* Return the preceding context length. This method is needed to
* support the <code>Transliterator</code> method
* <code>getMaximumContextLength()</code>.
* <code>getMaximumContextLength()</code>. Internally, this is
* implemented as the anteContextLength, optionally plus one if
* there is a start anchor. The one character anchor gap is
* needed to make repeated incremental transliteration with
* anchors work.
*/
int32_t TransliterationRule::getAnteContextLength(void) const {
return anteContextLength;
int32_t TransliterationRule::getContextLength(void) const {
return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);
}
/**
@ -209,81 +214,16 @@ int32_t TransliterationRule::getAnteContextLength(void) const {
* unless the first character of the key is a set. If it's a
* set, or otherwise can match multiple keys, the index value is -1.
*/
int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) const {
int16_t TransliterationRule::getIndexValue() const {
if (anteContextLength == pattern.length()) {
// A pattern with just ante context {such as foo)>bar} can
// match any key.
return -1;
}
UChar c = pattern.charAt(anteContextLength);
UChar32 c = pattern.char32At(anteContextLength);
return (int16_t)(data.lookupSet(c) == NULL ? (c & 0xFF) : -1);
}
/**
* Do a replacement of the input pattern with the output text in
* the given string, at the given offset. This method assumes
* that a match has already been found in the given text at the
* given position.
* @param text the text containing the substring to be replaced
* @param offset the offset into the text at which the pattern
* matches. This is the offset to the point after the ante
* context, if any, and before the match string and any post
* context.
* @param data the RuleBasedTransliterator.Data object specifying
* context for this transliterator.
* @return the change in the length of the text
*/
int32_t TransliterationRule::replace(Replaceable& text, int32_t offset,
const TransliterationRuleData& data) const {
if (segments == NULL) {
text.handleReplaceBetween(offset, offset + keyLength, output);
return output.length() - keyLength;
} else {
/* When there are segments to be copied, use the Replaceable.copy()
* API in order to retain out-of-band data. Copy everything to the
* point after the key, then delete the key. That is, copy things
* into offset + keyLength, then replace offset .. offset +
* keyLength with the empty string.
*
* Minimize the number of calls to Replaceable.replace() and
* Replaceable.copy().
*/
int32_t textStart = offset - anteContextLength;
int32_t dest = offset + keyLength; // copy new text to here
UnicodeString buf;
for (int32_t i=0; i<output.length(); ++i) {
UChar c = output.charAt(i);
int32_t b = data.lookupSegmentReference(c);
if (b < 0) {
// Accumulate straight (non-segment) text.
buf.append(c);
} else {
// Insert any accumulated straight text.
if (buf.length() > 0) {
text.handleReplaceBetween(dest, dest, buf);
dest += buf.length();
buf.remove();
}
// Copy segment with out-of-band data
b *= 2;
text.copy(textStart + segments[b],
textStart + segments[b+1], dest);
dest += segments[b+1] - segments[b];
}
}
// Insert any accumulated straight text.
if (buf.length() > 0) {
text.handleReplaceBetween(dest, dest, buf);
dest += buf.length();
}
// Delete the key
buf.remove();
text.handleReplaceBetween(offset, offset + keyLength, buf);
return dest - (offset + keyLength) - keyLength;
}
}
/**
* Internal method. Returns true if this rule matches the given
* index value. The index value is an 8-bit integer, 0..255,
@ -294,14 +234,13 @@ int32_t TransliterationRule::replace(Replaceable& text, int32_t offset,
* value. If the rule contains only ante context, as in foo)>bar,
* then it will match any key.
*/
UBool TransliterationRule::matchesIndexValue(uint8_t v,
const TransliterationRuleData& data) const {
UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
if (anteContextLength == pattern.length()) {
// A pattern with just ante context {such as foo)>bar} can
// match any key.
return TRUE;
}
UChar c = pattern.charAt(anteContextLength);
UChar32 c = pattern.char32At(anteContextLength);
const UnicodeSet* set = data.lookupSet(c);
return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
}
@ -328,6 +267,22 @@ UBool TransliterationRule::masks(const TransliterationRule& r2) const {
* of) the corresponding characters of r2. The superset
* operation should be performed to check for UnicodeSet
* masking.
*
* Anchors: When two patterns are identical apart from their
* anchors, r1 masks r2 only if r2 has all the anchors r1
* has (optionally, plus some). Here Y means the row masks
* the column, N means it doesn't.
*
* ab ^ab ab$ ^ab$
* ab Y Y Y Y
* ^ab N Y N Y
* ab$ N N Y Y
* ^ab$ N N N Y
*
* Post context: {a}b masks ab, but not vice versa, since {a}b
* matches everything ab matches, and {a}b matches {|a|}b but ab
* does not. Pre context is different (a{b} does not align with
* ab).
*/
/* LIMITATION of the current mask algorithm: Some rule
@ -340,126 +295,242 @@ UBool TransliterationRule::masks(const TransliterationRule& r2) const {
int32_t left2 = r2.anteContextLength;
int32_t right = len - left;
int32_t right2 = r2.pattern.length() - left2;
return left <= left2 && right <= right2 &&
// TODO Clean this up -- some logic might be combinable with the
// next statement.
// Test for anchor masking
if (left == left2 && right == right2 &&
keyLength <= r2.keyLength &&
0 == r2.pattern.compare(0, len, pattern)) {
// The following boolean logic implements the table above
return (flags == r2.flags) ||
(!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) ||
((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));
}
return left <= left2 &&
(right < right2 ||
(right == right2 && keyLength <= r2.keyLength)) &&
0 == r2.pattern.compare(left2 - left, len, pattern);
}
/**
* Return true if this rule matches the given text.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* Attempt a match and replacement at the given position. Return
* the degree of match between this rule and the given text. The
* degree of match may be mismatch, a partial match, or a full
* match. A mismatch means at least one character of the text
* does not match the context or key. A partial match means some
* context and key characters match, but the text is not long
* enough to match all of them. A full match means all context
* and key characters match.
*
* If a full match is obtained, perform a replacement, update pos,
* and return U_MATCH. Otherwise both text and pos are unchanged.
*
* @param text the text
* @param pos the position indices
* @param incremental if TRUE, test for partial matches that may
* be completed by additional text inserted at pos.limit.
* @return one of <code>U_MISMATCH</code>,
* <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
* incremental is FALSE then U_PARTIAL_MATCH will not be returned.
*/
UBool TransliterationRule::matches(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data) const {
// Match anteContext, key, and postContext
int32_t cursor = pos.start - anteContextLength;
// Quick length check; this is a performance win for long rules.
// Widen by one (on both sides) to allow anchor matching.
if (cursor < (pos.contextStart - 1) ||
(cursor + pattern.length()) > (pos.contextLimit + 1)) {
return FALSE;
}
for (int32_t i=0; i<pattern.length(); ++i, ++cursor) {
if (!charMatches(pattern.charAt(i), text, cursor, pos,
data)) {
return FALSE;
}
}
return TRUE;
}
UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
UTransPosition& pos,
UBool incremental) const {
// Matching and replacing are done in one method because the
// replacement operation needs information obtained during the
// match. Another way to do this is to have the match method
// create a match result struct with relevant offsets, and to pass
// this into the replace method.
/**
* Return the degree of match between this rule and the given text. The
* degree of match may be mismatch, a partial match, or a full match. A
* mismatch means at least one character of the text does not match the
* context or key. A partial match means some context and key characters
* match, but the text is not long enough to match all of them. A full
* match means all context and key characters match.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
* <code>FULL_MATCH</code>.
* @see #MISMATCH
* @see #PARTIAL_MATCH
* @see #FULL_MATCH
*/
int32_t TransliterationRule::getMatchDegree(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data) const {
int len = getRegionMatchLength(text, pos, data);
return len < anteContextLength ? MISMATCH :
(len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH);
}
// ============================ MATCH ===========================
/**
* Return the number of characters of the text that match this rule. If
* there is a mismatch, return -1. If the text is not long enough to match
* any characters, return 0.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary of variables mapping <code>Character</code>
* to <code>UnicodeSet</code>
* @return -1 if there is a mismatch, 0 if the text is not long enough to
* match any characters, otherwise the number of characters of text that
* match this rule.
*/
int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data) const {
int32_t cursor = pos.start - anteContextLength;
// Quick length check; this is a performance win for long rules.
// Widen by one to allow anchor matching.
if (cursor < (pos.contextStart - 1)) {
return -1;
}
// Record the positions of segments. We assume the following:
// - The maximum number of segments is 9.
// - The segment indices occur in ascending order. That is,
// segment 1 start <= segment 1 limit <= segment 2 start...
// - The segments have been validated such that there are no
// references to nonexistent segments.
// - The end of the segment array is marked by a start of -1.
// Currently, the parser enforces all of these constraints.
// In the future, the first two constraints may be lifted,
// in which case this method will have to be modified.
int32_t segPos[18];
int32_t iSeg = firstKeySeg - 1;
int32_t nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
// ------------------------ Ante Context ------------------------
// A mismatch in the ante context, or with the start anchor,
// is an outright U_MISMATCH regardless of whether we are
// incremental or not.
int32_t cursor = pos.start - 1;
int32_t i;
for (i=0; i<pattern.length() && cursor<pos.contextLimit; ++i, ++cursor) {
if (!charMatches(pattern.charAt(i), text, cursor, pos,
data)) {
return -1;
// Match the ante context right to left. i indexes the pattern
// (from the last ante-context character down to 0) and cursor
// indexes the text.
for (i=anteContextLength-1; i>=0; --i) {
    // Record the text offset of every segment boundary located at
    // pattern index i, then step to the previous boundary. The
    // step must be an ASSIGNMENT to nextSegPos: a comparison (==)
    // leaves nextSegPos unchanged and this loop never terminates.
    while (i == nextSegPos) {
        segPos[iSeg] = cursor;
        nextSegPos = (--iSeg >= 0) ? segments[iSeg] : -1;
    }
    UChar keyChar = pattern.charAt(i);
    const UnicodeSet* set = data.lookupSet(keyChar);
    if (set == 0) {
        // keyChar is a literal: it must equal the text character
        // at cursor, and cursor must not have moved before
        // contextStart.
        if (cursor >= pos.contextStart &&
            keyChar == text.charAt(cursor)) {
            --cursor;
        } else {
            return U_MISMATCH;
        }
    } else {
        // keyChar stands in for a UnicodeSet.
        // Subtract 1 from contextStart to make it a reverse limit
        // NOTE(review): cursor is not decremented in this branch;
        // presumably matches() moves it on success -- confirm.
        if (set->matches(text, cursor, pos.contextStart-1, FALSE)
            != U_MATCH) {
            return U_MISMATCH;
        }
    }
}
return i;
}
/**
* Return true if the given key matches the given text. This method
* accounts for the fact that the key character may represent a character
* set. Note that the key and text characters may not be interchanged
* without altering the results.
* @param keyChar a character in the match key
* @param textChar a character in the text being transliterated
* @param data a dictionary of variables mapping <code>Character</code>
* to <code>UnicodeSet</code>
*/
UBool TransliterationRule::charMatches(UChar keyChar, const Replaceable& text,
int32_t index,
const UTransPosition& pos,
const TransliterationRuleData& data) const {
const UnicodeSet* set = 0;
UChar textChar = (index >= pos.contextStart && index < pos.contextLimit)
? text.charAt(index) : ETHER;
return ((set = data.lookupSet(keyChar)) == 0) ?
keyChar == textChar : set->contains(textChar);
// ------------------------ Start Anchor ------------------------
if ((flags & ANCHOR_START) && cursor != (pos.contextStart-1)) {
return U_MISMATCH;
}
// -------------------- Key and Post Context --------------------
iSeg = firstKeySeg;
nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
i = 0;
cursor = pos.start;
int32_t keyLimit = 0;
while (i < (pattern.length() - anteContextLength)) {
if (incremental && cursor == pos.contextLimit) {
// We've reached the context limit without a mismatch and
// without completing our match.
return U_PARTIAL_MATCH;
}
if (cursor == pos.limit && i < keyLength) {
// We're still in the pattern key but we're entering the
// post context.
return U_MISMATCH;
}
while (i == nextSegPos) {
segPos[iSeg] = cursor;
nextSegPos = segments[++iSeg];
}
if (i == keyLength) {
keyLimit = cursor;
}
UChar keyChar = pattern.charAt(anteContextLength + i++);
const UnicodeSet* set = data.lookupSet(keyChar);
if (set == 0) {
// Don't need the cursor < pos.contextLimit check if
// incremental is TRUE (because it's done above); do need
// it otherwise.
if (cursor < pos.contextLimit &&
keyChar == text.charAt(cursor)) {
++cursor;
} else {
return U_MISMATCH;
}
} else {
UMatchDegree m =
set->matches(text, cursor, pos.contextLimit, incremental);
if (m != U_MATCH) {
return m;
}
}
}
while (i == nextSegPos) {
segPos[iSeg] = cursor;
nextSegPos = segments[++iSeg];
}
if (i == keyLength) {
keyLimit = cursor;
}
// ------------------------- Stop Anchor ------------------------
if ((flags & ANCHOR_END) != 0) {
if (cursor != pos.contextLimit) {
return U_MISMATCH;
}
if (incremental) {
return U_PARTIAL_MATCH;
}
}
// =========================== REPLACE ==========================
// We have a full match. The key is between pos.start and
// keyLimit. Segment indices have been recorded in segPos[].
// Perform a replacement.
int32_t lenDelta = 0;
if (segments == NULL) {
text.handleReplaceBetween(pos.start, keyLimit, output);
lenDelta = output.length() - (keyLimit - pos.start);
pos.start += cursorPos;
} else {
/* When there are segments to be copied, use the Replaceable.copy()
* API in order to retain out-of-band data. Copy everything to the
* point after the key, then delete the key. That is, copy things
* into keyLimit, then replace pos.start .. keyLimit with the
* empty string.
*
* Minimize the number of calls to Replaceable.replace() and
* Replaceable.copy().
*/
int32_t dest = keyLimit; // copy new text to here
UnicodeString buf;
for (i=0; i<output.length(); ) {
if (i == cursorPos) {
// Record the position of the cursor
cursor = dest;
}
UChar32 c = output.char32At(i);
int32_t b = data.lookupSegmentReference(c);
if (b < 0) {
// Accumulate straight (non-segment) text.
buf.append(c);
} else {
// Insert any accumulated straight text.
if (buf.length() > 0) {
text.handleReplaceBetween(dest, dest, buf);
dest += buf.length();
buf.remove();
}
// Copy segment with out-of-band data
b *= 2;
text.copy(segPos[b], segPos[b+1], dest);
dest += segPos[b+1] - segPos[b];
}
i += UTF_CHAR_LENGTH(c);
}
// Insert any accumulated straight text.
if (buf.length() > 0) {
text.handleReplaceBetween(dest, dest, buf);
dest += buf.length();
}
if (i == cursorPos) {
// Record the position of the cursor
cursor = dest;
}
// Delete the key
buf.remove();
text.handleReplaceBetween(pos.start, keyLimit, buf);
lenDelta = dest - keyLimit - (keyLimit - pos.start);
pos.start = cursor - (keyLimit - pos.start);
}
pos.limit += lenDelta;
pos.contextLimit += lenDelta;
return U_MATCH;
}
/**
@ -570,7 +641,6 @@ void TransliterationRule::_appendToRule(UnicodeString& rule,
* given string.
*/
UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
const TransliterationRuleData& data,
UBool escapeUnprintable) const {
int32_t i;
@ -674,3 +744,5 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
return rule;
}
//eof

View file

@ -10,6 +10,7 @@
#include "unicode/unistr.h"
#include "unicode/utrans.h"
#include "unicode/unimatch.h"
class Replaceable;
class TransliterationRuleData;
@ -36,39 +37,6 @@ class TransliterationRule {
public:
/**
* Constants returned by <code>getMatchDegree()</code> indicating
* the degree of match between the text and this rule.
* @see #getMatchDegree
*/
enum {
/**
* Constant returned by <code>getMatchDegree()</code>
* indicating a mismatch between the text and this rule. One
* or more characters of the context or key do not match the
* text.
*/
MISMATCH,
/**
* Constant returned by <code>getMatchDegree()</code>
* indicating a partial match between the text and this rule.
* All characters of the text match the corresponding context
* or key, but more characters are required for a complete
* match. There are some key or context characters at the end
* of the pattern that remain unmatched because the text isn't
* long enough.
*/
PARTIAL_MATCH,
/**
* Constant returned by <code>getMatchDegree()</code>
* indicating a complete match between the text and this rule.
* The text matches all context and key characters.
*/
FULL_MATCH
};
/**
* The character at index i, where i < contextStart || i >= contextLimit,
* is ETHER. This allows explicit matching by rules and UnicodeSets
@ -109,6 +77,14 @@ private:
*/
int32_t* segments;
/**
* A value we compute from segments. The first index i into segments[]
* for which segments[i] >= anteContextLength. That is, the first one that
* is within the forward scanned part of the pattern -- the key or the
* postContext. If there are no segments, or if no segment offset lies at
* or beyond the ante context, this has the value -1.
*/
int32_t firstKeySeg;
/**
* The length of the string that must match before the key. If
* zero, then there is no matching requirement before the key.
@ -130,6 +106,25 @@ private:
*/
int32_t cursorPos;
/**
* Miscellaneous attributes.
*/
int8_t flags;
/**
 * Flag attributes. These are bit masks that are OR-ed into
 * <code>flags</code>: ANCHOR_START is set when the rule has a start
 * anchor and ANCHOR_END when it has an end anchor. The values are
 * distinct bits so that both may be set at once.
 *
 * Note: no comma after the last enumerator; a trailing comma is
 * rejected by C++98 compilers.
 */
enum {
    ANCHOR_START = 1,
    ANCHOR_END = 2
};
/**
* A reference to the data for this rule. The data provides
* lookup services for matchers and segments.
*/
const TransliterationRuleData& data;
public:
/**
@ -169,6 +164,7 @@ public:
int32_t cursorPosition, int32_t cursorOffset,
int32_t* adoptedSegs,
UBool anchorStart, UBool anchorEnd,
const TransliterationRuleData& data,
UErrorCode& status);
/**
@ -192,6 +188,7 @@ public:
int32_t anteContextPos, int32_t postContextPos,
const UnicodeString& outputStr,
int32_t cursorPosition,
const TransliterationRuleData& data,
UErrorCode& status);
/**
@ -213,9 +210,13 @@ public:
/**
* Return the preceding context length. This method is needed to
* support the <code>Transliterator</code> method
* <code>getMaximumContextLength()</code>.
* <code>getMaximumContextLength()</code>. Internally, this is
* implemented as the anteContextLength, optionally plus one if
* there is a start anchor. The one character anchor gap is
* needed to make repeated incremental transliteration with
* anchors work.
*/
virtual int32_t getAnteContextLength(void) const;
virtual int32_t getContextLength(void) const;
/**
* Internal method. Returns 8-bit index value for this rule.
@ -223,24 +224,7 @@ public:
* unless the first character of the key is a set. If it's a
* set, or otherwise can match multiple keys, the index value is -1.
*/
int16_t getIndexValue(const TransliterationRuleData& data) const;
/**
* Do a replacement of the input pattern with the output text in
* the given string, at the given offset. This method assumes
* that a match has already been found in the given text at the
* given position.
* @param text the text containing the substring to be replaced
* @param offset the offset into the text at which the pattern
* matches. This is the offset to the point after the ante
* context, if any, and before the match string and any post
* context.
* @param data the RuleBasedTransliterator.Data object specifying
* context for this transliterator.
* @return the change in the length of the text
*/
int32_t replace(Replaceable& text, int32_t offset,
const TransliterationRuleData& data) const;
int16_t getIndexValue() const;
/**
* Internal method. Returns true if this rule matches the given
@ -252,8 +236,7 @@ public:
* value. If the rule contains only ante context, as in foo)>bar,
* then it will match any key.
*/
UBool matchesIndexValue(uint8_t v,
const TransliterationRuleData& data) const;
UBool matchesIndexValue(uint8_t v) const;
/**
* Return true if this rule masks another rule. If r1 masks r2 then
@ -264,88 +247,35 @@ public:
virtual UBool masks(const TransliterationRule& r2) const;
/**
* Return true if this rule matches the given text.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* Attempt a match and replacement at the given position. Return
* the degree of match between this rule and the given text. The
* degree of match may be mismatch, a partial match, or a full
* match. A mismatch means at least one character of the text
* does not match the context or key. A partial match means some
* context and key characters match, but the text is not long
* enough to match all of them. A full match means all context
* and key characters match.
*
* If a full match is obtained, perform a replacement, update pos,
* and return U_MATCH. Otherwise both text and pos are unchanged.
*
* @param text the text
* @param pos the position indices
* @param incremental if TRUE, test for partial matches that may
* be completed by additional text inserted at pos.limit.
* @return one of <code>U_MISMATCH</code>,
* <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
* incremental is FALSE then U_PARTIAL_MATCH will not be returned.
*/
virtual UBool matches(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data) const;
/**
* Return the degree of match between this rule and the given text. The
* degree of match may be mismatch, a partial match, or a full match. A
* mismatch means at least one character of the text does not match the
* context or key. A partial match means some context and key characters
* match, but the text is not long enough to match all of them. A full
* match means all context and key characters match.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
* <code>FULL_MATCH</code>.
* @see #MISMATCH
* @see #PARTIAL_MATCH
* @see #FULL_MATCH
*/
virtual int32_t getMatchDegree(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data) const;
/**
* Return the number of characters of the text that match this rule. If
* there is a mismatch, return -1. If the text is not long enough to match
* any characters, return 0.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary of variables mapping <code>Character</code>
* to <code>UnicodeSet</code>
* @return -1 if there is a mismatch, 0 if the text is not long enough to
* match any characters, otherwise the number of characters of text that
* match this rule.
*/
virtual int32_t getRegionMatchLength(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data) const;
/**
* Return true if the given key matches the given text. This method
* accounts for the fact that the key character may represent a character
* set. Note that the key and text characters may not be interchanged
* without altering the results.
* @param keyChar a character in the match key
* @param textChar a character in the text being transliterated
* @param data a dictionary of variables mapping <code>Character</code>
* to <code>UnicodeSet</code>
*/
virtual UBool charMatches(UChar keyChar, const Replaceable& textChar,
int32_t index,
const UTransPosition& pos,
const TransliterationRuleData& data) const;
UMatchDegree matchAndReplace(Replaceable& text,
UTransPosition& pos,
UBool incremental) const;
/**
* Create a rule string that represents this rule object. Append
* it to the given string.
*/
virtual UnicodeString& toRule(UnicodeString& pat,
const TransliterationRuleData& data,
UBool escapeUnprintable) const;
private:

View file

@ -74,7 +74,7 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
ruleVector->addElement(adoptedRule);
int32_t len;
if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) {
if ((len = adoptedRule->getContextLength()) > maxContextLength) {
maxContextLength = len;
}
@ -92,8 +92,7 @@ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
* That is, <code>freeze()</code> may be called multiple times,
* although for optimal performance it shouldn't be.
*/
void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
UErrorCode& status) {
void TransliterationRuleSet::freeze(UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
@ -124,7 +123,7 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
int16_t* indexValue = new int16_t[n];
for (j=0; j<n; ++j) {
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
indexValue[j] = r->getIndexValue(data);
indexValue[j] = r->getIndexValue();
}
for (x=0; x<256; ++x) {
index[x] = v.size();
@ -139,7 +138,7 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
// matchesIndexValue check. In practice this happens
// rarely, so we seldom tread this code path.
TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
if (r->matchesIndexValue((uint8_t)x, data)) {
if (r->matchesIndexValue((uint8_t)x)) {
v.addElement(r);
}
}
@ -192,87 +191,40 @@ void TransliterationRuleSet::freeze(const TransliterationRuleData& data,
}
/**
* Attempt to find a matching rule at the specified point in the text.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary mapping variables to the sets they
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
* @return the matching rule, or null if none found.
* Transliterate the given text with the given UTransPosition
* indices. Return TRUE if the transliteration should continue
* or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
* Note that FALSE is only ever returned if incremental is TRUE.
* @param text the text to be transliterated
* @param pos the position indices, which will be updated
* @param incremental if TRUE, assume new text may be inserted
* at pos.limit, and return FALSE if there is a partial match.
* @return TRUE unless a U_PARTIAL_MATCH has been obtained,
* indicating that transliteration should stop until more text
* arrives.
*/
TransliterationRule*
TransliterationRuleSet::findMatch(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data) const {
/* We only need to check our indexed bin of the rule table,
* based on the low byte of the first key character.
*/
int16_t x = (int16_t) (text.charAt(pos.start) & 0xFF);
for (int32_t i=index[x]; i<index[x+1]; ++i) {
if (rules[i]->matches(text, pos, data)) {
return rules[i];
UBool TransliterationRuleSet::transliterate(Replaceable& text,
UTransPosition& pos,
UBool incremental) {
int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF);
for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {
UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);
switch (m) {
case U_MATCH:
return TRUE;
case U_PARTIAL_MATCH:
return FALSE;
}
}
return NULL;
}
/**
* Attempt to find a matching rule at the specified point in the text.
* Unlike <code>findMatch()</code>, this method does an incremental match.
* An incremental match requires that there be no partial matches that might
* pre-empt the full match that is found. If there are partial matches,
* then null is returned. A non-null result indicates that a full match has
* been found, and that it cannot be pre-empted by a partial match
* regardless of what additional text is added to the translation buffer.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary mapping variables to the sets they
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
* @param partial output parameter. <code>partial[0]</code> is set to
* true if a partial match is returned.
* @return the matching rule, or null if none found, or if the text buffer
* does not have enough text yet to unambiguously match a rule.
*/
TransliterationRule*
TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data,
UBool& isPartial) const {
/* We only need to check our indexed bin of the rule table,
* based on the low byte of the first key character.
*/
isPartial = FALSE;
int16_t x = (int16_t) (text.charAt(pos.start) & 0xFF);
for (int32_t i=index[x]; i<index[x+1]; ++i) {
int32_t match = rules[i]->getMatchDegree(text, pos, data);
switch (match) {
case TransliterationRule::FULL_MATCH:
return rules[i];
case TransliterationRule::PARTIAL_MATCH:
isPartial = TRUE;
return NULL;
}
}
return NULL;
// No match or partial match from any rule
++pos.start;
return TRUE;
}
/**
* Create rule strings that represent this rule set.
*/
UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
const TransliterationRuleData& data,
UBool escapeUnprintable) const {
int32_t i;
int32_t count = index[256];
@ -281,7 +233,7 @@ UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
if (i != 0) {
ruleSource.append((UChar) 0x000A /*\n*/);
}
rules[i]->toRule(ruleSource, data, escapeUnprintable);
rules[i]->toRule(ruleSource, escapeUnprintable);
}
return ruleSource;
}

View file

@ -18,15 +18,7 @@ class UnicodeFilter;
class UnicodeString;
/**
* A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
* the transliteration in one direction from one set of characters or short
* strings to another. A <code>RuleBasedTransliterator</code> consists of up to
* two such sets, one for the forward direction, and one for the reverse.
*
* <p>A <code>TransliterationRuleSet</code> has one important operation, that of
* finding a matching rule at a given point in the text. This is accomplished
* by the <code>findMatch()</code> method.
*
* A set of rules for a <code>RuleBasedTransliterator</code>.
* @author Alan Liu
*/
class TransliterationRuleSet {
@ -98,59 +90,24 @@ public:
* That is, <code>freeze()</code> may be called multiple times,
* although for optimal performance it shouldn't be.
*/
virtual void freeze(const TransliterationRuleData& data,
UErrorCode& status);
virtual void freeze(UErrorCode& status);
/**
* Attempt to find a matching rule at the specified point in the text.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary mapping variables to the sets they
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
* <tt>null</tt> then no filtering is applied.
* @return the matching rule, or null if none found.
* Transliterate the given text with the given UTransPosition
* indices. Return TRUE if the transliteration should continue
* or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
* Note that FALSE is only ever returned if isIncremental is TRUE.
* @param text the text to be transliterated
* @param index the position indices, which will be updated
* @param isIncremental if TRUE, assume new text may be inserted
* at index.limit, and return FALSE if there is a partial match.
* @return TRUE unless a U_PARTIAL_MATCH has been obtained,
* indicating that transliteration should stop until more text
* arrives.
*/
virtual TransliterationRule* findMatch(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data) const;
/**
* Attempt to find a matching rule at the specified point in the text.
* Unlike <code>findMatch()</code>, this method does an incremental match.
* An incremental match requires that there be no partial matches that might
* pre-empt the full match that is found. If there are partial matches,
* then null is returned. A non-null result indicates that a full match has
* been found, and that it cannot be pre-empted by a partial match
* regardless of what additional text is added to the translation buffer.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary mapping variables to the sets they
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
* @param partial output parameter. <code>partial[0]</code> is set to
* true if a partial match is returned.
* @param filter the filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
* @return the matching rule, or null if none found, or if the text buffer
* does not have enough text yet to unambiguously match a rule.
*/
virtual TransliterationRule* findIncrementalMatch(const Replaceable& text,
const UTransPosition& pos,
const TransliterationRuleData& data,
UBool& isPartial) const;
UBool transliterate(Replaceable& text,
UTransPosition& index,
UBool isIncremental);
/**
* Create rule strings that represent this rule set.
@ -158,7 +115,6 @@ public:
* contents will be deleted.
*/
virtual UnicodeString& toRules(UnicodeString& result,
const TransliterationRuleData& data,
UBool escapeUnprintable) const;
};
#endif

View file

@ -44,7 +44,7 @@ public:
* Lookup the UnicodeSet associated with the given character, and
* return it. Return <tt>null</tt> if not found.
*/
virtual const UnicodeSet* lookupSet(UChar ch) const = 0;
virtual const UnicodeSet* lookupSet(UChar32 ch) const = 0;
/**
* Parse a symbol reference name from the given string, starting

View file

@ -281,10 +281,20 @@ void Transliterator::transliterate(Replaceable& text,
*/
void Transliterator::transliterate(Replaceable& text,
UTransPosition& index,
UChar insertion,
UChar32 insertion,
UErrorCode& status) const {
UnicodeString str(insertion);
_transliterate(text, index, &str, status);
if (UTF_IS_LEAD(insertion)) {
// Oops, the caller passed us a single lead surrogate. In
// general, we don't support this, but we'll do the caller a
// favor in the special case of LEAD followed by TRAIL
// insertion. Anything else won't work.
text.handleReplaceBetween(index.limit, index.limit, str);
++index.limit;
++index.contextLimit;
} else {
_transliterate(text, index, &str, status);
}
}
/**
@ -351,8 +361,18 @@ void Transliterator::_transliterate(Replaceable& text,
filteredTransliterate(text, index, TRUE);
index.contextStart = uprv_max(index.start - getMaximumContextLength(),
originalStart);
// The purpose of the code below is to keep the context small
// while doing incremental transliteration. When part of the left
// context (between contextStart and start) is no longer needed,
// we try to advance contextStart past that portion. We use the
// maximum context length to do so.
int32_t newCS = index.start;
int32_t n = getMaximumContextLength();
while (newCS > originalStart && n-- > 0) {
--newCS;
newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
}
index.contextStart = uprv_max(newCS, originalStart);
}
/**

View file

@ -21,7 +21,7 @@ public:
NullFilter(UBool r) { result = r; }
NullFilter(const NullFilter& f) : UnicodeFilter(f) { result = f.result; }
virtual ~NullFilter() {}
virtual UBool contains(UChar /*c*/) const { return result; }
virtual UBool contains(UChar32 /*c*/) const { return result; }
virtual UnicodeFilter* clone() const { return new NullFilter(*this); }
};
@ -31,7 +31,7 @@ public:
UnicodeNotFilter(UnicodeFilter* adopted);
UnicodeNotFilter(const UnicodeNotFilter&);
virtual ~UnicodeNotFilter();
virtual UBool contains(UChar c) const;
virtual UBool contains(UChar32 c) const;
virtual UnicodeFilter* clone() const;
};
@ -39,7 +39,7 @@ UnicodeNotFilter::UnicodeNotFilter(UnicodeFilter* adopted) : filt(adopted) {}
UnicodeNotFilter::UnicodeNotFilter(const UnicodeNotFilter& f)
: UnicodeFilter(f), filt(f.filt->clone()) {}
UnicodeNotFilter::~UnicodeNotFilter() { delete filt; }
UBool UnicodeNotFilter::contains(UChar c) const { return !filt->contains(c); }
UBool UnicodeNotFilter::contains(UChar32 c) const { return !filt->contains(c); }
UnicodeFilter* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
/**
@ -61,7 +61,7 @@ public:
UnicodeAndFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
UnicodeAndFilter(const UnicodeAndFilter&);
virtual ~UnicodeAndFilter();
virtual UBool contains(UChar c) const;
virtual UBool contains(UChar32 c) const;
virtual UnicodeFilter* clone() const;
};
@ -69,7 +69,7 @@ UnicodeAndFilter::UnicodeAndFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1
UnicodeAndFilter::UnicodeAndFilter(const UnicodeAndFilter& f)
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
UnicodeAndFilter::~UnicodeAndFilter() { delete filt1; delete filt2; }
UBool UnicodeAndFilter::contains(UChar c) const { return filt1->contains(c) && filt2->contains(c); }
UBool UnicodeAndFilter::contains(UChar32 c) const { return filt1->contains(c) && filt2->contains(c); }
UnicodeFilter* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
/**
@ -99,7 +99,7 @@ public:
UnicodeOrFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
UnicodeOrFilter(const UnicodeOrFilter&);
virtual ~UnicodeOrFilter();
virtual UBool contains(UChar c) const;
virtual UBool contains(UChar32 c) const;
virtual UnicodeFilter* clone() const;
};
@ -107,7 +107,7 @@ UnicodeOrFilter::UnicodeOrFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f
UnicodeOrFilter::UnicodeOrFilter(const UnicodeOrFilter& f)
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
UnicodeOrFilter::~UnicodeOrFilter() { delete filt1; delete filt2; }
UBool UnicodeOrFilter::contains(UChar c) const { return filt1->contains(c) || filt2->contains(c); }
UBool UnicodeOrFilter::contains(UChar32 c) const { return filt1->contains(c) || filt2->contains(c); }
UnicodeFilter* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
/**

View file

@ -542,17 +542,6 @@ UBool UnicodeSet::contains(UChar32 c) const {
return ((i & 1) != 0); // return true if odd
}
/**
* Implement UnicodeFilter:
* Returns <tt>true</tt> if this set contains the specified char.
*
* @return <tt>true</tt> if this set contains the specified char.
* @draft
*/
UBool UnicodeSet::contains(UChar c) const {
return contains((UChar32) c);
}
/**
* Returns <tt>true</tt> if this set contains any character whose low byte
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
@ -581,6 +570,24 @@ UBool UnicodeSet::containsIndexValue(uint8_t v) const {
return FALSE;
}
/**
* Implementation of UnicodeMatcher::matches().
*
* Handles the case where there is no text left to examine
* (offset == limit) and defers every other case to
* UnicodeFilter::matches().
*
* At the end of the text the result depends only on whether this set
* contains TransliterationRule::ETHER (NOTE(review): this appears to be
* the pseudo-character the pattern parser adds for a '$' anchor --
* confirm against TransliterationRule):
*   - set contains ETHER, incremental is TRUE  -> U_PARTIAL_MATCH
*     (presumably because more text may still be appended, so the
*     end-of-text condition is not yet definite)
*   - set contains ETHER, incremental is FALSE -> U_MATCH
*   - set does not contain ETHER               -> U_MISMATCH
*
* @param text the text being matched
* @param offset the position in text to match at; not modified by the
* end-of-text branch, and passed by reference to
* UnicodeFilter::matches() otherwise
* @param limit the end of the matchable range
* @param incremental if TRUE, an end-of-text match is reported as
* U_PARTIAL_MATCH rather than U_MATCH
* @return one of U_MISMATCH, U_PARTIAL_MATCH, or U_MATCH
*/
UMatchDegree UnicodeSet::matches(const Replaceable& text,
int32_t& offset,
int32_t limit,
UBool incremental) const {
// No characters remain: only an end-of-text (ETHER) member can match.
if (offset == limit) {
if (contains(TransliterationRule::ETHER)) {
return incremental ? U_PARTIAL_MATCH : U_MATCH;
} else {
return U_MISMATCH;
}
} else {
// At least one character is available; use the inherited matcher.
return UnicodeFilter::matches(text, offset, limit, incremental);
}
}
/**
* Adds the specified range to this set if it is not already
* present. If this set already contains the specified range,
@ -895,7 +902,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
UBool invert = FALSE;
clear();
int32_t lastChar = -1; // This is either a char (0..FFFF) or -1
const UChar32 NONE = (UChar32) -1;
UChar32 lastChar = NONE; // This is either a char (0..10FFFF) or NONE
UChar lastOp = 0;
/* This loop iterates over the characters in the pattern. We start at
@ -916,8 +924,9 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
// mode 1: '[' seen; if next is '^' or ':' then special
// mode 2: '[' '^'? seen; parse pattern and close with ']'
// mode 3: '[:' seen; parse category and close with ':]'
// mode 4: Pattern closed cleanly
int8_t mode = 0;
int32_t openPos = 0; // offset to opening '['
int32_t colonPos = 0; // Expected pos of ':' in '[:'
int32_t i = pos.getIndex();
int32_t limit = pattern.length();
UnicodeSet nestedAux;
@ -930,7 +939,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
const UnicodeString* varValueBuffer = NULL;
int32_t ivarValueBuffer = 0;
int32_t anchor = 0;
for (; i<limit; i+=((varValueBuffer==NULL)?1:0)) {
UChar32 c;
while (i<limit) {
/* If the next element is a single character, c will be set to it,
* and nestedSet will be null. In this case isLiteral indicates
* whether the character should assume special meaning if it has
@ -941,23 +951,25 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
*/
nestedSet = NULL;
UBool isLiteral = FALSE;
UChar c;
if (varValueBuffer != NULL) {
if (ivarValueBuffer < varValueBuffer->length()) {
c = varValueBuffer->charAt(ivarValueBuffer++);
c = varValueBuffer->char32At(ivarValueBuffer);
ivarValueBuffer += UTF_CHAR_LENGTH(c);
nestedSet = symbols->lookupSet(c); // may be NULL
nestedPatDone = FALSE;
} else {
varValueBuffer = NULL;
c = pattern.charAt(i);
c = pattern.char32At(i);
i += UTF_CHAR_LENGTH(c);
}
} else {
c = pattern.charAt(i);
c = pattern.char32At(i);
i += UTF_CHAR_LENGTH(c);
}
// Ignore whitespace. This is not Unicode whitespace, but Java
// whitespace, a subset of Unicode whitespace.
if (Unicode::isWhitespace(c)) {
if (u_isspace(c)) {
continue;
}
@ -971,7 +983,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
case 0:
if (c == SET_OPEN) {
mode = 1; // Next look for '^' or ':'
openPos = i;
colonPos = i; // Expect ':' at next offset
continue;
} else {
// throw new IllegalArgumentException("Missing opening '['");
@ -986,9 +998,10 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
newPat.append(c);
continue; // Back to top to fetch next character
case COLON:
if (i == openPos+1) {
// '[:' cannot have whitespace in it
--i;
// '[:' cannot have whitespace in it. 'i' has already
// been advanced.
if (i-1 == colonPos) {
--i; // Backup to the '['
c = SET_OPEN;
mode = 3;
// Fall through and parse category using the same
@ -1018,15 +1031,13 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
* interpret '\\uxxxx' Unicode escapes here (as literals).
*/
if (c == BACKSLASH) {
++i; // Advance past '\\'
UChar32 escaped = pattern.unescapeAt(i);
if (escaped == (UChar32) -1) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
isLiteral = TRUE;
--i; // Move i back to last parsed character
c = (UChar) escaped;
c = escaped;
}
/* Parse variable references. These are treated as literals. If a
@ -1036,7 +1047,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
* Set variables are only looked up if varCharToSet is not null.
*/
else if (symbols != NULL && !isLiteral && c == SymbolTable::SYMBOL_REF) {
pos.setIndex(++i);
pos.setIndex(i);
UnicodeString name = symbols->parseReference(pattern, pos, limit);
if (name.length() != 0) {
varValueBuffer = symbols->lookup(name);
@ -1052,7 +1063,6 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
// Got a null; this means we have an isolated $.
// Tentatively assume this is an anchor.
anchor = 1;
--i; // Back up so loop increment works properly
}
continue; // Back to the top to get varValueBuffer[0]
}
@ -1069,9 +1079,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
nestedPatStart = newPat.length();
// Handle "[:...:]", representing a character category
UChar d = charAfter(pattern, i);
if (d == COLON) {
i += 2;
if (i < pattern.length() && pattern.charAt(i) == COLON) {
++i;
int32_t j = pattern.indexOf(CATEGORY_CLOSE, i);
if (j < 0) {
// throw new IllegalArgumentException("Missing \":]\"");
@ -1086,7 +1095,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
if (U_FAILURE(status)) {
return;
}
i = j+1; // Make i point to ']' in ":]"
i = j+2; // Advance i past ":]"
// Use a rebuilt pattern. If we are top level,
// then there is already a SET_OPEN in newPat, and
@ -1105,11 +1114,13 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
// loop. This is one of 2 ways we leave this
// loop if the pattern is well-formed.
*this = *nestedSet;
mode = 4;
break;
}
} else {
// Recurse to get the pairs for this nested set.
pos.setIndex(i);
// Backup i to '['.
pos.setIndex(--i);
switch (lastOp) {
case HYPHEN:
case INTERSECTION:
@ -1122,7 +1133,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
if (U_FAILURE(status)) {
return;
}
i = pos.getIndex() - 1; // - 1 to point at ']'
i = pos.getIndex();
}
}
}
@ -1136,7 +1147,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
* ']' have special meanings.
*/
if (nestedSet != NULL) {
if (lastChar >= 0) {
if (lastChar != NONE) {
if (lastOp != 0) {
// throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
status = U_ILLEGAL_ARGUMENT_ERROR;
@ -1154,7 +1165,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
} else {
_appendToPat(newPat, lastChar, FALSE);
}
lastChar = -1;
lastChar = NONE;
}
switch (lastOp) {
case HYPHEN:
@ -1193,9 +1204,11 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
newPat.append((UChar)SymbolTable::SYMBOL_REF);
add(TransliterationRule::ETHER);
}
mode = 4;
break;
} else if (lastOp == 0 && !isLiteral && (c == HYPHEN || c == INTERSECTION)) {
lastOp = c;
// assert(c <= 0xFFFF);
lastOp = (UChar) c;
} else if (lastOp == HYPHEN) {
if (lastChar >= c) {
// Don't allow redundant (a-a) or empty (b-a) ranges;
@ -1210,14 +1223,14 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
newPat.append(HYPHEN);
_appendToPat(newPat, c, FALSE);
lastOp = 0;
lastChar = -1;
lastChar = NONE;
} else if (lastOp != 0) {
// We have <set>&<char> or <char>&<char>
// throw new IllegalArgumentException("Unquoted " + lastOp);
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
} else {
if (lastChar >= 0) {
if (lastChar != NONE) {
// We have <char><char>
add(lastChar, lastChar);
_appendToPat(newPat, lastChar, FALSE);
@ -1226,7 +1239,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
}
}
if (lastChar >= 0) {
if (lastChar != NONE) {
add(lastChar, lastChar);
_appendToPat(newPat, lastChar, FALSE);
}
@ -1252,19 +1265,13 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
complement();
}
/**
* i indexes the last character we parsed or is pattern.length(). In
* the latter case, we have run off the end without finding a closing
* ']'. Otherwise, we know i < pattern.length(), and we set the
* ParsePosition to the next character to be parsed.
*/
if (i == limit) {
if (mode != 4) {
// throw new IllegalArgumentException("Missing ']'");
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
pos.setIndex(i+1);
pos.setIndex(i);
// Use the rebuilt pattern (newPat) only if necessary. Prefer the
// generated pattern.
@ -1393,14 +1400,6 @@ const UnicodeSet& UnicodeSet::getCategorySet(int8_t cat) {
// Implementation: Utility methods
//----------------------------------------------------------------
/**
* Returns the character after the given position, or '\uFFFE' if
* there is none.
*/
UChar UnicodeSet::charAfter(const UnicodeString& str, int32_t i) {
return ((++i) < str.length()) ? str.charAt(i) : (UChar)0xFFFE;
}
void UnicodeSet::ensureCapacity(int32_t newLen) {
if (newLen <= capacity) return;
capacity = newLen + GROW_EXTRA;

View file

@ -75,7 +75,7 @@ class TestHangulFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
return new TestHangulFilter(*this);
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
if(c == 0xae4c )
return FALSE;
else

View file

@ -59,7 +59,7 @@ class TestHexFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
return new TestHexFilter(*this);
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
if(c == 0x0061 || c == 0x0063 )
return FALSE;
else

View file

@ -314,14 +314,20 @@ IntlTest::prettify(const UnicodeString &source,
target.remove();
target += "\"";
for (i = 0; i < source.length(); i += 1)
for (i = 0; i < source.length(); )
{
UChar ch = source[i];
UChar32 ch = source.char32At(i);
i += UTF_CHAR_LENGTH(ch);
if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E)
{
target += "\\u";
appendHex(ch, 4, target);
if (ch <= 0xFFFF) {
target += "\\u";
appendHex(ch, 4, target);
} else {
target += "\\U";
appendHex(ch, 8, target);
}
}
else
{
@ -343,9 +349,10 @@ IntlTest::prettify(const UnicodeString &source, UBool parseBackslash)
target.remove();
target += "\"";
for (i = 0; i < source.length(); i += 1)
for (i = 0; i < source.length();)
{
UChar ch = source[i];
UChar32 ch = source.char32At(i);
i += UTF_CHAR_LENGTH(ch);
if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E)
{
@ -365,8 +372,13 @@ IntlTest::prettify(const UnicodeString &source, UBool parseBackslash)
target.truncate(target.length() - 1);
}
}
target += "\\u";
appendHex(ch, 4, target);
if (ch <= 0xFFFF) {
target += "\\u";
appendHex(ch, 4, target);
} else {
target += "\\U";
appendHex(ch, 8, target);
}
}
else
{

View file

@ -73,7 +73,7 @@ class TestJamoFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
return new TestJamoFilter(*this);
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
if(c == 0x1101 )
return FALSE;
else

View file

@ -618,7 +618,7 @@ class TestFilter1 : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
return new TestFilter1(*this);
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
if(c==0x63 || c==0x61 || c==0x43 || c==0x41)
return FALSE;
else
@ -629,7 +629,7 @@ class TestFilter2 : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
return new TestFilter2(*this);
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
if(c==0x65 || c==0x6c)
return FALSE;
else
@ -640,7 +640,7 @@ class TestFilter3 : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
return new TestFilter3(*this);
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
if(c==0x6f || c==0x77)
return FALSE;
else

View file

@ -66,6 +66,8 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE(30,TestCompoundFilter);
TESTCASE(31,TestRemove);
TESTCASE(32,TestToRules);
TESTCASE(33,TestContext);
TESTCASE(34,TestSupplemental);
default: name = ""; break;
}
}
@ -152,7 +154,9 @@ void TransliteratorTest::TestSimpleRules(void) {
*/
expect(UnicodeString("ab>x|y;", "") +
"yc>z",
"eabcd", "exzd"); /* Another set of rules:
"eabcd", "exzd");
/* Another set of rules:
* 1. ab>x|yzacw
* 2. za>q
* 3. qc>r
@ -476,7 +480,7 @@ class TestFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
return new TestFilter(*this);
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
return c != (UChar)0x0063 /*c*/;
}
};
@ -506,6 +510,12 @@ void TransliteratorTest::TestFiltering(void) {
* Test anchors
*/
void TransliteratorTest::TestAnchors(void) {
expect(UnicodeString("^a > 0; a$ > 2 ; a > 1;", ""),
"aaa",
"012");
expect(UnicodeString("$s=[z$]; $s{a>0; a}$s>2; a>1;", ""),
"aaa",
"012");
expect(UnicodeString("^ab > 01 ;"
" ab > |8 ;"
" b > k ;"
@ -1451,18 +1461,44 @@ void TransliteratorTest::TestToRules(void) {
}
}
/**
* Test rule matching against context.
*
* Case 1 passes an explicit UTransPosition to expect(), so the
* transliteration runs with caller-chosen indices instead of over the
* whole string. With rules "de > x; {d}e > y;" and input "de" the
* expected output is "ye", i.e. the second rule (key "d", following
* context "e") is the one that fires.
*
* Case 2 uses the rule "ab{c} > z;" (key "c" with preceding context
* "ab"): in "xadabdabcy" only the "c" that directly follows "ab" is
* expected to change, giving "xadabdabzy".
*/
void TransliteratorTest::TestContext() {
// Initializer order is presumably contextStart, contextLimit, start,
// limit (per the abbreviations below) -- TODO confirm against the
// UTransPosition declaration.
UTransPosition pos = {0, 2, 0, 1}; // cs cl s l
expect("de > x; {d}e > y;",
"de",
"ye",
&pos);
expect("ab{c} > z;",
"xadabdabcy",
"xadabdabzy");
}
/**
* Test rules whose variables, sets, input and output contain
* supplementary characters (code points above U+FFFF, written here
* with \U escapes).
*
* Case 1: "a" is rewritten to U+10300, and any character in the set
* [U+10300-U+10323] is rewritten to "i". The input "ab<U+1030F>x" is
* expected to become "<U+10300>bix".
*
* Case 2: the rule "($a)($b) > $2 $1;" swaps each adjacent pair made
* of a member of $a followed by a member of $b, where both sets mix
* BMP letters with supplementary ranges.
*/
void TransliteratorTest::TestSupplemental() {
expect(CharsToUnicodeString("$a=\\U00010300; $s=[\\U00010300-\\U00010323];"
"a > $a; $s > i;"),
CharsToUnicodeString("ab\\U0001030Fx"),
CharsToUnicodeString("\\U00010300bix"));
expect(CharsToUnicodeString("$a=[a-z\\U00010300-\\U00010323];"
"$b=[A-Z\\U00010400-\\U0001044D];"
"($a)($b) > $2 $1;"),
CharsToUnicodeString("aB\\U00010300\\U00010400c\\U00010401\\U00010301D"),
CharsToUnicodeString("Ba\\U00010400\\U00010300\\U00010401cD\\U00010301"));
}
//======================================================================
// Support methods
//======================================================================
/**
 * Build a RuleBasedTransliterator with the ID "<ID>" from the given
 * rules and check it against one source/expected pair.
 *
 * @param rules           rule text handed to the RuleBasedTransliterator
 *                        constructor
 * @param source          input text to transliterate
 * @param expectedResult  text the transliteration is expected to produce
 * @param pos             forwarded unchanged to the
 *                        expect(const Transliterator&, ...) overload
 */
void TransliteratorTest::expect(const UnicodeString& rules,
const UnicodeString& source,
const UnicodeString& expectedResult) {
const UnicodeString& expectedResult,
UTransPosition *pos) {
UErrorCode status = U_ZERO_ERROR;
Transliterator *t = new RuleBasedTransliterator("<ID>", rules, status);
// A failing status from the constructor is reported via errln and the
// comparison is not run.
if (U_FAILURE(status)) {
errln("FAIL: Transliterator constructor failed");
} else {
expect(*t, source, expectedResult);
expect(*t, source, expectedResult, pos);
}
// The transliterator is deleted on both the failure and success paths.
delete t;
}
@ -1477,34 +1513,49 @@ void TransliteratorTest::expect(const Transliterator& t,
void TransliteratorTest::expect(const Transliterator& t,
const UnicodeString& source,
const UnicodeString& expectedResult) {
UnicodeString result(source);
t.transliterate(result);
expectAux(t.getID() + ":String", source, result, expectedResult);
const UnicodeString& expectedResult,
UTransPosition *pos) {
if (pos == 0) {
UnicodeString result(source);
t.transliterate(result);
expectAux(t.getID() + ":String", source, result, expectedResult);
}
UTransPosition index={0, 0, 0, 0};
if (pos != 0) {
index = *pos;
}
UnicodeString rsource(source);
t.transliterate(rsource);
if (pos == 0) {
t.transliterate(rsource);
} else {
// Do it all at once -- below we do it incrementally
t.finishTransliteration(rsource, *pos);
}
expectAux(t.getID() + ":Replaceable", source, rsource, expectedResult);
// Test keyboard (incremental) transliteration -- this result
// must be the same after we finalize (see below).
rsource.remove();
UTransPosition index={0, 0, 0, 0};
UnicodeString log;
for (int32_t i=0; i<source.length(); ++i) {
if (i != 0) {
log.append(" + ");
}
log.append(source.charAt(i)).append(" -> ");
rsource.remove();
if (pos != 0) {
rsource = source;
formatInput(log, rsource, index);
log.append(" -> ");
UErrorCode status = U_ZERO_ERROR;
t.transliterate(rsource, index, source.charAt(i), status);
// Append the string buffer with a vertical bar '|' where
// the committed index is.
UnicodeString left, right;
rsource.extractBetween(0, index.start, left);
rsource.extractBetween(index.start, rsource.length(), right);
log.append(left).append((UChar)PIPE).append(right);
t.transliterate(rsource, index, status);
formatInput(log, rsource, index);
} else {
for (int32_t i=0; i<source.length(); ++i) {
if (i != 0) {
log.append(" + ");
}
log.append(source.charAt(i)).append(" -> ");
UErrorCode status = U_ZERO_ERROR;
t.transliterate(rsource, index, source.charAt(i), status);
formatInput(log, rsource, index);
}
}
// As a final step in keyboard transliteration, we must call
@ -1518,6 +1569,41 @@ void TransliteratorTest::expect(const Transliterator& t,
expectedResult);
}
/**
 * Append a printable rendering of a string together with its
 * transliteration position.
 *
 * @param appendTo result is appended to this param.
 * @param input the string being transliterated
 * @param pos the index struct
 * @return appendTo
 */
UnicodeString& TransliteratorTest::formatInput(UnicodeString &appendTo,
                                               const UnicodeString& input,
                                               const UTransPosition& pos) {
    // The offsets can only be rendered if they nest properly:
    // 0 <= contextStart <= start <= limit <= contextLimit <= length.
    const UBool wellFormed =
        0 <= pos.contextStart &&
        pos.contextStart <= pos.start &&
        pos.start <= pos.limit &&
        pos.limit <= pos.contextLimit &&
        pos.contextLimit <= input.length();

    if (!wellFormed) {
        // Report the raw offsets instead of attempting to slice the text.
        appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
                        pos.contextStart + ", s=" + pos.start + ", l=" +
                        pos.limit + ", cl=" + pos.contextLimit + "} on " +
                        input);
        return appendTo;
    }

    // Produce aaa{bbb|ccc|ddd}eee: the braces mark the context start and
    // limit, the vertical bars mark the start and limit.  All five pieces
    // are extracted before anything is appended.
    UnicodeString beforeContext, leadingContext, active, trailingContext, afterContext;
    input.extractBetween(0, pos.contextStart, beforeContext);
    input.extractBetween(pos.contextStart, pos.start, leadingContext);
    input.extractBetween(pos.start, pos.limit, active);
    input.extractBetween(pos.limit, pos.contextLimit, trailingContext);
    input.extractBetween(pos.contextLimit, input.length(), afterContext);

    appendTo.append(beforeContext);
    appendTo.append((UChar)123/*{*/);
    appendTo.append(leadingContext);
    appendTo.append((UChar)PIPE);
    appendTo.append(active);
    appendTo.append((UChar)PIPE);
    appendTo.append(trailingContext);
    appendTo.append((UChar)125/*}*/);
    appendTo.append(afterContext);
    return appendTo;
}
void TransliteratorTest::expectAux(const UnicodeString& tag,
const UnicodeString& source,
const UnicodeString& result,

View file

@ -11,6 +11,7 @@
#define TRANSTST_H
#include "unicode/utypes.h"
#include "unicode/translit.h"
#include "intltest.h"
class Transliterator;
@ -167,13 +168,18 @@ class TransliteratorTest : public IntlTest {
void TestToRules(void);
void TestContext(void);
void TestSupplemental(void);
//======================================================================
// Support methods
//======================================================================
protected:
void expect(const UnicodeString& rules,
const UnicodeString& source,
const UnicodeString& expectedResult);
const UnicodeString& expectedResult,
UTransPosition *pos=0);
void expect(const Transliterator& t,
const UnicodeString& source,
@ -182,7 +188,8 @@ class TransliteratorTest : public IntlTest {
void expect(const Transliterator& t,
const UnicodeString& source,
const UnicodeString& expectedResult);
const UnicodeString& expectedResult,
UTransPosition *pos=0);
void expectAux(const UnicodeString& tag,
const UnicodeString& source,
@ -192,6 +199,10 @@ class TransliteratorTest : public IntlTest {
virtual void expectAux(const UnicodeString& tag,
const UnicodeString& summary, UBool pass,
const UnicodeString& expectedResult);
static UnicodeString& formatInput(UnicodeString &appendTo,
const UnicodeString& input,
const UTransPosition& pos);
};
#endif

View file

@ -39,7 +39,7 @@ class Filter1: public UnicodeFilter{
virtual UnicodeFilter* clone() const{
    // Copy-construct a new Filter1 from this one and hand it back
    // through the base-class pointer type.
    UnicodeFilter* duplicate = new Filter1(*this);
    return duplicate;
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
if(c == 0x0061 || c == 0x0041 || c == 0x0063 || c == 0x0043)
return FALSE;
else
@ -50,7 +50,7 @@ class Filter2: public UnicodeFilter{
virtual UnicodeFilter* clone() const{
    // Copy-construct a new Filter2 from this one and hand it back
    // through the base-class pointer type.
    UnicodeFilter* duplicate = new Filter2(*this);
    return duplicate;
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
if(c == 0x0079 || c == 0x0059 || c == 0x007a || c == 0x005a || c == 0x0061 || c == 0x0063)
return FALSE;
else

View file

@ -71,7 +71,7 @@ class TestUniFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
    // Copy-construct a new TestUniFilter from this one and hand it back
    // through the base-class pointer type.
    UnicodeFilter* duplicate = new TestUniFilter(*this);
    return duplicate;
}
virtual UBool contains(UChar c) const {
virtual UBool contains(UChar32 c) const {
if(c==0x0063 || c==0x0061 || c==0x0043 || c==0x0041)
return FALSE;
else