ICU-5170 move RBBI from CharacterIterator to UText

X-SVN-Rev: 19579
This commit is contained in:
Andy Heninger 2006-04-22 05:29:27 +00:00
parent aca85b53cf
commit 9f85d5dd08
11 changed files with 588 additions and 595 deletions

View file

@ -71,22 +71,23 @@ UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
}
int32_t
UnhandledEngine::findBreaks( CharacterIterator *text,
UnhandledEngine::findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
int32_t breakType,
UStack &/*foundBreaks*/ ) const {
if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
UChar32 c = text->current32();
UChar32 c = utext_current32(text);
if (reverse) {
while(text->getIndex() > startPos && fHandled[breakType]->contains(c)) {
c = text->previous32();
while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
c = utext_previous32(text);
}
}
else {
while(text->getIndex() < endPos && fHandled[breakType]->contains(c)) {
c = text->next32();
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
}
}
}
@ -164,7 +165,6 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
dictnlength = 0;
status = U_BUFFER_OVERFLOW_ERROR;
}
if (U_SUCCESS(status) && dictfname) {
UChar* extStart=u_strchr(dictfname, 0x002e);
int len = 0;

View file

@ -10,10 +10,10 @@
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
U_NAMESPACE_BEGIN
class CharacterIterator;
class UnicodeSet;
class UStack;
@ -58,7 +58,7 @@ class LanguageBreakEngine : public UMemory {
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* @param text A UText representing the text. The
* iterator is left at the end of the run of characters which the engine
* is capable of handling.
* @param startPos The start of the run within the supplied text.
@ -69,7 +69,7 @@ class LanguageBreakEngine : public UMemory {
* @param foundBreaks An allocated C array of the breaks found, if any
* @return The number of breaks found.
*/
virtual int32_t findBreaks( CharacterIterator *text,
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
@ -183,7 +183,7 @@ class UnhandledEngine : public LanguageBreakEngine {
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* @param text A UText representing the text. The
* iterator is left at the end of the run of characters which the engine
* is capable of handling.
* @param startPos The start of the run within the supplied text.
@ -194,7 +194,7 @@ class UnhandledEngine : public LanguageBreakEngine {
* @param foundBreaks An allocated C array of the breaks found, if any
* @return The number of breaks found.
*/
virtual int32_t findBreaks( CharacterIterator *text,
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,

View file

@ -41,7 +41,7 @@ DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
}
int32_t
DictionaryBreakEngine::findBreaks( CharacterIterator *text,
DictionaryBreakEngine::findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
@ -50,30 +50,31 @@ DictionaryBreakEngine::findBreaks( CharacterIterator *text,
int32_t result = 0;
// Find the span of characters included in the set.
int32_t start = text->getIndex();
int32_t start = (int32_t)utext_getNativeIndex(text);
int32_t current;
int32_t rangeStart;
int32_t rangeEnd;
UChar32 c = text->current32();
UChar32 c = utext_current32(text);
if (reverse) {
UBool isDict = fSet.contains(c);
while((current = text->getIndex()) > startPos && isDict) {
c = text->previous32();
while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
c = utext_previous32(text);
isDict = fSet.contains(c);
}
rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
rangeEnd = start + 1;
}
else {
while((current = text->getIndex()) < endPos && fSet.contains(c)) {
c = text->next32();
while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
utext_next32(text); // TODO: recast loop for postincrement
c = utext_current32(text);
}
rangeStart = start;
rangeEnd = current;
}
if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
text->setIndex(current);
utext_setNativeIndex(text, current);
}
return result;
@ -116,14 +117,14 @@ class PossibleWord {
~PossibleWord();
// Fill the list of candidates if needed, select the longest, and return the number found
int candidates( CharacterIterator *text, const TrieWordDictionary *dict, int32_t rangeEnd );
int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );
// Select the currently marked candidate, point after it in the text, and invalidate self
int32_t acceptMarked( CharacterIterator *text );
int32_t acceptMarked( UText *text );
// Back up from the current candidate to the next shorter one; return TRUE if that exists
// and point the text after it
UBool backUp( CharacterIterator *text );
UBool backUp( UText *text );
// Return the longest prefix this candidate location shares with a dictionary word
int32_t longestPrefix();
@ -142,19 +143,19 @@ PossibleWord::~PossibleWord() {
}
inline int
PossibleWord::candidates( CharacterIterator *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
int32_t start = text->getIndex();
int32_t start = (int32_t)utext_getNativeIndex(text);
if (start != offset) {
offset = start;
prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
// Dictionary leaves text after longest prefix, not longest word. Back up.
if (count <= 0) {
text->setIndex(start);
utext_setNativeIndex(text, start);
}
}
if (count > 0) {
text->setIndex(start+lengths[count-1]);
utext_setNativeIndex(text, start+lengths[count-1]);
}
current = count-1;
mark = current;
@ -162,15 +163,15 @@ PossibleWord::candidates( CharacterIterator *text, const TrieWordDictionary *dic
}
inline int32_t
PossibleWord::acceptMarked( CharacterIterator *text ) {
text->setIndex(offset + lengths[mark]);
PossibleWord::acceptMarked( UText *text ) {
utext_setNativeIndex(text, offset + lengths[mark]);
return lengths[mark];
}
inline UBool
PossibleWord::backUp( CharacterIterator *text ) {
PossibleWord::backUp( UText *text ) {
if (current > 0) {
text->setIndex(offset + lengths[--current]);
utext_setNativeIndex(text, offset + lengths[--current]);
return TRUE;
}
return FALSE;
@ -231,7 +232,7 @@ ThaiBreakEngine::~ThaiBreakEngine() {
}
int32_t
ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const {
@ -246,9 +247,9 @@ ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
PossibleWord words[THAI_LOOKAHEAD];
UChar32 uc;
text->setIndex(rangeStart);
utext_setNativeIndex(text, rangeStart);
while (U_SUCCESS(status) && (current = text->getIndex()) < rangeEnd) {
while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
wordLength = 0;
// Look for candidate words at the current position
@ -263,7 +264,7 @@ ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
// If there was more than one, see which one can take us forward the most words
else if (candidates > 1) {
// If we're already at the end of the range, we're done
if (text->getIndex() >= rangeEnd) {
if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
goto foundBest;
}
do {
@ -276,7 +277,7 @@ ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
}
// If we're already at the end of the range, we're done
if (text->getIndex() >= rangeEnd) {
if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
goto foundBest;
}
@ -302,7 +303,7 @@ foundBest:
// just found (if there is one), but only if the preceding word does not exceed
// the threshold.
// The text iterator should now be positioned at the end of the word we found.
if (text->getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
// if it is a dictionary word, do nothing. If it isn't, then if there is
// no preceding word, or the non-word shares less than the minimum threshold
// of characters with a dictionary word, then scan to resynchronize
@ -312,10 +313,11 @@ foundBest:
// Look for a plausible word boundary
//TODO: This section will need a rework for UText.
int32_t remaining = rangeEnd - (current+wordLength);
UChar32 pc = text->current32();
UChar32 pc = utext_current32(text);
int32_t chars = 0;
while (TRUE) {
uc = text->next32();
utext_next32(text);
uc = utext_current32(text);
// TODO: Here we're counting on the fact that the SA languages are all
// in the BMP. This should get fixed with the UText rework.
chars += 1;
@ -329,7 +331,7 @@ foundBest:
// checking the dictionary. That is just a performance filter,
// but it's not clear it's faster than checking the trie.
int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
text->setIndex(current+wordLength+chars);
utext_setNativeIndex(text, current+wordLength+chars);
if (candidates > 0) {
break;
}
@ -347,49 +349,52 @@ foundBest:
}
else {
// Back up to where we were for next iteration
text->setIndex(current+wordLength);
utext_setNativeIndex(text, current+wordLength);
}
}
// Never stop before a combining mark.
int32_t currPos;
while ((currPos = text->getIndex()) < rangeEnd && fMarkSet.contains(text->current32())) {
wordLength += text->move32(1, CharacterIterator::kCurrent) - currPos;
while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
utext_next32(text);
wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
}
// Look ahead for possible suffixes if a dictionary word does not follow.
// We do this in code rather than using a rule so that the heuristic
// resynch continues to function. For example, one of the suffix characters
// could be a typo in the middle of a word.
if (text->getIndex() < rangeEnd && wordLength > 0) {
if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
&& fSuffixSet.contains(uc = text->current32())) {
&& fSuffixSet.contains(uc = utext_current32(text))) {
if (uc == THAI_PAIYANNOI) {
if (!fSuffixSet.contains(text->previous32())) {
if (!fSuffixSet.contains(utext_previous32(text))) {
// Skip over previous end and PAIYANNOI
text->move32(2, CharacterIterator::kCurrent);
utext_next32(text);
utext_next32(text);
wordLength += 1; // Add PAIYANNOI to word
uc = text->current32(); // Fetch next character
uc = utext_current32(text); // Fetch next character
}
else {
// Restore prior position
text->move32(1, CharacterIterator::kCurrent);
utext_next32(text);
}
}
if (uc == THAI_MAIYAMOK) {
if (text->previous32() != THAI_MAIYAMOK) {
if (utext_previous32(text) != THAI_MAIYAMOK) {
// Skip over previous end and MAIYAMOK
text->move32(2, CharacterIterator::kCurrent);
utext_next32(text);
utext_next32(text);
wordLength += 1; // Add MAIYAMOK to word
}
else {
// Restore prior position
text->move32(1, CharacterIterator::kCurrent);
utext_next32(text);
}
}
}
else {
text->setIndex(current+wordLength);
utext_setNativeIndex(text, current+wordLength);
}
}

View file

@ -10,11 +10,12 @@
#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "unicode/utext.h"
#include "brkeng.h"
U_NAMESPACE_BEGIN
class CharacterIterator;
class TrieWordDictionary;
/*******************************************************************
@ -78,7 +79,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* @param text A UText representing the text. The
* iterator is left at the end of the run of characters which the engine
* is capable of handling.
* @param startPos The start of the run within the supplied text.
@ -89,7 +90,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
* @param foundBreaks An allocated C array of the breaks found, if any
* @return The number of breaks found.
*/
virtual int32_t findBreaks( CharacterIterator *text,
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
@ -115,13 +116,13 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
/**
* <p>Divide up a range of known dictionary characters.</p>
*
* @param text A CharacterIterator representing the text
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks Output of C array of int32_t break positions, or 0
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( CharacterIterator *text,
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const = 0;
@ -172,13 +173,13 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
/**
* <p>Divide up a range of known dictionary characters.</p>
*
* @param text A CharacterIterator representing the text
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks Output of C array of int32_t break positions, or 0
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( CharacterIterator *text,
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const;

File diff suppressed because it is too large Load diff

View file

@ -87,27 +87,27 @@ MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status )
if (fTrie == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
fIter = new UCharCharacterIterator(0, 0);
if (fIter == NULL) {
fIter = utext_openUChars(NULL, NULL, 0, &status);
if (U_SUCCESS(status) && fIter == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
}
MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
fTrie = NULL;
fIter = new UCharCharacterIterator(NULL, 0);
if (fIter == NULL) {
fIter = utext_openUChars(NULL, NULL, 0, &status);
if (U_SUCCESS(status) && fIter == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
}
MutableTrieDictionary::~MutableTrieDictionary() {
delete fTrie;
delete fIter;
utext_close(fIter);
}
int32_t
MutableTrieDictionary::search( CharacterIterator *text,
MutableTrieDictionary::search( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
@ -121,7 +121,7 @@ MutableTrieDictionary::search( CharacterIterator *text,
pMatched = TRUE;
int i;
UChar uc = text->current();
UChar uc = utext_current32(text);
for (i = 0; i < maxLength && p != NULL; ++i) {
while (p != NULL) {
if (uc < p->ch) {
@ -147,7 +147,8 @@ MutableTrieDictionary::search( CharacterIterator *text,
}
up = p;
p = p->equal;
uc = text->next();
uc = utext_next32(text);
uc = utext_current32(text);
}
// Note that there is no way to reach here with up == 0 unless
@ -170,13 +171,14 @@ MutableTrieDictionary::addWord( const UChar *word,
TernaryNode *parent;
UBool pMatched;
int count;
fIter->setText(word, length);
fIter = utext_openUChars(fIter, word, length, &status);
int matched;
matched = search(fIter, length, NULL, count, 0, parent, pMatched);
while (matched++ < length) {
UChar uc = fIter->nextPostInc();
UChar uc = utext_next32(fIter); // TODO: supplementary support?
U_ASSERT(uc != U_SENTINEL);
TernaryNode *newNode = new TernaryNode(uc);
if (newNode == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
@ -211,7 +213,7 @@ MutableTrieDictionary::addWords( UEnumeration *words,
}
int32_t
MutableTrieDictionary::matches( CharacterIterator *text,
MutableTrieDictionary::matches( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
@ -413,8 +415,7 @@ CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
fData = NULL;
}
}
CompactTrieDictionary::CompactTrieDictionary(const void *data,
CompactTrieDictionary::CompactTrieDictionary( const void *data,
UErrorCode &status )
: fUData(NULL)
{
@ -460,7 +461,7 @@ getCompactNode(const CompactTrieHeader *header, uint16_t node) {
}
int32_t
CompactTrieDictionary::matches( CharacterIterator *text,
CompactTrieDictionary::matches( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
@ -469,7 +470,7 @@ CompactTrieDictionary::matches( CharacterIterator *text,
const CompactTrieNode *node = getCompactNode(fData, fData->root);
int mycount = 0;
UChar uc = text->current();
UChar uc = utext_current32(text);
int i = 0;
while (node != NULL) {
@ -498,7 +499,8 @@ CompactTrieDictionary::matches( CharacterIterator *text,
// We hit a non-equal character; return
goto exit;
}
uc = text->next();
utext_next32(text);
uc = utext_current32(text);
++i;
}
// To get here we must have come through the whole list successfully;
@ -518,7 +520,8 @@ CompactTrieDictionary::matches( CharacterIterator *text,
if (uc == hnode->entries[middle].ch) {
// We hit a match; get the next node and next character
node = getCompactNode(fData, hnode->entries[middle].equal);
uc = text->next();
utext_next32(text);
uc = utext_current32(text);
++i;
break;
}

View file

@ -10,6 +10,7 @@
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
struct UEnumeration;
struct UDataSwapper;
@ -45,8 +46,6 @@ triedict_swap(const UDataSwapper *ds,
U_NAMESPACE_BEGIN
class CharacterIterator;
class UCharCharacterIterator;
class StringEnumeration;
struct CompactTrieHeader;
@ -76,7 +75,7 @@ class U_COMMON_API TrieWordDictionary : public UMemory {
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* @param text A UText representing the text. The
* iterator is left after the longest prefix match in the dictionary.
* @param start The current position in text.
* @param maxLength The maximum number of code units to match.
@ -85,7 +84,7 @@ class U_COMMON_API TrieWordDictionary : public UMemory {
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( CharacterIterator *text,
virtual int32_t matches( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
@ -123,11 +122,11 @@ class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
TernaryNode *fTrie;
/**
* A UCharCharacterIterator for internal use
* A UText for internal use
* @internal
*/
UCharCharacterIterator *fIter;
UText *fIter;
friend class CompactTrieDictionary; // For fast conversion
@ -150,7 +149,7 @@ class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* @param text A UText representing the text. The
* iterator is left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
@ -158,7 +157,7 @@ class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( CharacterIterator *text,
virtual int32_t matches( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
@ -196,7 +195,7 @@ protected:
/**
* <p>Search the dictionary for matches.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* @param text A UText representing the text. The
* iterator is left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
@ -206,7 +205,7 @@ protected:
* @param pMatched The returned parent node matched the input
* @return The number of characters in text that were matched.
*/
virtual int32_t search( CharacterIterator *text,
virtual int32_t search( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
@ -232,21 +231,21 @@ private:
* to save space.</p>
*/
class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
private:
private:
/**
* The root node of the trie
*/
const CompactTrieHeader *fData;
const CompactTrieHeader *fData;
/**
* A UBool indicating whether or not we own the fData.
*/
UBool fOwnData;
UBool fOwnData;
UDataMemory *fUData;
public:
public:
/**
* <p>Construct a dictionary from a UDataMemory.</p>
*
@ -279,7 +278,7 @@ public:
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* @param text A UText representing the text. The
* iterator is left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
@ -287,7 +286,7 @@ public:
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( CharacterIterator *text,
virtual int32_t matches( UText *text,
int32_t rangeEnd,
int32_t *lengths,
int &count,

View file

@ -144,8 +144,6 @@ public:
/**
* Return a CharacterIterator over the text being analyzed.
* Changing the state of the returned iterator can have undefined consequences
* on the operation of the break iterator. If you need to change it, clone it first.
* @stable ICU 2.0
*/
virtual const CharacterIterator& getText(void) const = 0;
@ -193,6 +191,8 @@ public:
/**
* Change the text over which this operates. The text boundary is
* reset to the start.
* Note that setText(UText *) provides similar functionality to this function,
* and is more efficient.
* @param it The CharacterIterator used to change the text.
* @stable ICU 2.0
*/

View file

@ -63,10 +63,17 @@ class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {
protected:
/**
* The character iterator through which this BreakIterator accesses the text
* The UText through which this BreakIterator accesses the text
* @internal
*/
CharacterIterator* fText;
UText *fText;
/**
* A character iterator that refers to the same text as the UText, above.
* Lazily created when requested by a caller.
* Only included for compatibility with old API, which was based on CharacterIterators.
*/
CharacterIterator *fCharIter;
/**
* The rule data for this BreakIterator instance
@ -280,14 +287,27 @@ public:
//=======================================================================
/**
* Return a CharacterIterator over the text being analyzed. This version
* of this method returns the actual CharacterIterator we're using internally.
* Changing the state of this iterator can have undefined consequences. If
* you need to change it, clone it first.
* Return a CharacterIterator over the text being analyzed.
* The returned character iterator is owned by the break iterator, and must
* not be deleted by the caller. Repeated calls to this function may
* return the same CharacterIterator.
* <p/>
* The returned character iterator must not be used concurrently with
* the break iterator. If concurrent operation is needed, clone the
* returned character iterator first and operate on the clone.
* <p/>
* This function is not thread safe. Two threads must not make concurrent
* calls to BreakIterator::getText(). This is an exception to the general
* rules for thread safety in ICU, which are that const functions are
* thread safe.
* <p/>
* The function getUText() provides similar functionality, and is more efficient.
* TODO: deprecate this function?
*
* @return An iterator over the text being analyzed.
* @stable ICU 2.0
* @stable ICU 2.0
*/
virtual const CharacterIterator& getText(void) const;
virtual CharacterIterator& getText(void) const;
/**
@ -340,7 +360,6 @@ public:
/**
* Sets the current iteration position to the beginning of the text.
* (i.e., the CharacterIterator's starting offset).
* @return The offset of the beginning of the text.
* @stable ICU 2.0
*/
@ -348,7 +367,6 @@ public:
/**
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
* @return The text's past-the-end offset.
* @stable ICU 2.0
*/

View file

@ -260,8 +260,10 @@ void RBBIAPITest::TestGetSetAdoptText()
CharacterIterator* text3= new StringCharacterIterator(str2, 3, 10, 3); // "ond str"
wordIter1->setText(str1);
if(wordIter1->getText() != *text1)
errln((UnicodeString)"ERROR:1 error in setText or getText ");
CharacterIterator *tci = &wordIter1->getText();
UnicodeString tstr;
tci->getText(tstr);
TEST_ASSERT(tstr == str1);
if(wordIter1->current() != 0)
errln((UnicodeString)"ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");
@ -273,9 +275,14 @@ void RBBIAPITest::TestGetSetAdoptText()
charIter1->adoptText(text1Clone);
if( wordIter1->getText() == charIter1->getText() ||
wordIter1->getText() != *text2 || charIter1->getText() != *text1 )
errln((UnicodeString)"ERROR:2 error is getText or setText()");
TEST_ASSERT(wordIter1->getText() != charIter1->getText());
tci = &wordIter1->getText();
tci->getText(tstr);
TEST_ASSERT(tstr == str2);
tci = &charIter1->getText();
tci->getText(tstr);
TEST_ASSERT(tstr == str1);
RuleBasedBreakIterator* rb=(RuleBasedBreakIterator*)wordIter1->clone();
rb->adoptText(text1);
@ -286,13 +293,17 @@ void RBBIAPITest::TestGetSetAdoptText()
errln((UnicodeString)"ERROR:2 error in adoptText ");
// Adopt where iterator range is less than the entire original source string.
// (Now that the break engine works with UText internally,
// CharacterIterators starting at positions other than zero are not supported.)
rb->adoptText(text3);
if(rb->preceding(2) != 3) {
errln((UnicodeString)"ERROR:3 error in adoptText ");
}
if(rb->following(11) != BreakIterator::DONE) {
errln((UnicodeString)"ERROR:4 error in adoptText ");
}
TEST_ASSERT(rb->preceding(2) == 0);
TEST_ASSERT(rb->following(11) == BreakIterator::DONE);
//if(rb->preceding(2) != 3) {
// errln((UnicodeString)"ERROR:3 error in adoptText ");
//}
//if(rb->following(11) != BreakIterator::DONE) {
// errln((UnicodeString)"ERROR:4 error in adoptText ");
//}
// UText API
//
@ -344,7 +355,8 @@ void RBBIAPITest::TestGetSetAdoptText()
TEST_ASSERT(pos==UBRK_DONE);
status = U_ZERO_ERROR;
UText *gut2 = utext_openUnicodeString(NULL,NULL,&status);
UnicodeString sEmpty;
UText *gut2 = utext_openUnicodeString(NULL, &sEmpty, &status);
wordIter1->getUText(gut2, status);
TEST_ASSERT_SUCCESS(status);
utext_close(gut2);

View file

@ -412,64 +412,7 @@ void RBBITest::TestMixedThaiLineBreak()
// @suwit - end of changes
// Arabic numerals should always be separated from surrounding Thai text
/*
ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
thaiLineSelection->addElement("39");
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
// words in non-Thai scripts should always be separated from surrounding Thai text
ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status);
thaiLineSelection->addElement("Java");
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status);
// Thai numerals should always be separated from the text surrounding them
ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
// Thai text should interact correctly with punctuation and symbols
ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status);
// ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status);
// ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status);
ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status);
// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
*/
/* remove the old data sample.
// The Unicode Linebreak TR says do not break before or after quotes.
// So this test is changed to not break around the quote.
// TODO: should Thai break around the quotes, like the original behavior here?
// ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status);
// ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
"\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "$200", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status);
ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status);
*/
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
if (U_FAILURE(status))
{
@ -788,14 +731,18 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if(exec) TestJapaneseLineBreak(); break;
case 2: name = "TestStatusReturn";
if(exec) TestStatusReturn(); break;
case 3: name = "TestLineBreakData";
if(exec) TestLineBreakData(); break;
case 4: name = "TestEmptyString";
if(exec) TestEmptyString(); break;
case 5: name = "TestGetAvailableLocales";
if(exec) TestGetAvailableLocales(); break;
case 6: name = "TestGetDisplayName";
if(exec) TestGetDisplayName(); break;
case 7: name = "TestEndBehaviour";
if(exec) TestEndBehaviour(); break;
case 8: name = "TestMixedThaiLineBreak";
@ -1176,15 +1123,19 @@ void RBBITest::TestBug4153072() {
UnicodeString str("...Hello, World!...");
int32_t begin = 3;
int32_t end = str.length() - 3;
UBool dummy;
UBool onBoundary;
StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
iter->adoptText(textIterator);
int index;
// Note: with the switch to UText, there is no way to restrict the
// iteration range to begin at an index other than zero.
// String character iterators created with a non-zero bound are
// treated by RBBI as being empty.
for (index = -1; index < begin + 1; ++index) {
dummy = iter->isBoundary(index);
if (index < begin && dummy == TRUE) {
errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +
onBoundary = iter->isBoundary(index);
if (index == 0? !onBoundary : onBoundary) {
errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
" and begin index = " + begin);
}
}
@ -1323,11 +1274,12 @@ void RBBITest::executeTest(TestParams *t) {
if (expectedTagVal == -1) {
expectedTagVal = 0;
}
int32_t line = t->srcLine->elementAti(bp);
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
" Actual, Expected status = %4d, %4d",
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
}
}
@ -1375,6 +1327,7 @@ void RBBITest::executeTest(TestParams *t) {
if (expectedTagVal == -1) {
expectedTagVal = 0;
}
int line = t->srcLine->elementAti(bp);
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"