ICU-5170 move RBBI from CharacterIterator to UText

X-SVN-Rev: 19579
2025-04-10 07:39:16 +00:00 · 2006-04-22 05:29:27 +00:00 · 2006-04-22 05:29:27 +00:00 · 9f85d5dd08
commit 9f85d5dd08
parent aca85b53cf
11 changed files with 588 additions and 595 deletions
--- a/icu4c/source/common/brkeng.cpp
+++ b/icu4c/source/common/brkeng.cpp
@ -71,22 +71,23 @@ UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
 }

 int32_t
-UnhandledEngine::findBreaks( CharacterIterator *text,
+UnhandledEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &/*foundBreaks*/ ) const {
    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
-        UChar32 c = text->current32();
+        UChar32 c = utext_current32(text); 
        if (reverse) {
-            while(text->getIndex() > startPos && fHandled[breakType]->contains(c)) {
-                c = text->previous32();
+            while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
+                c = utext_previous32(text);
            }
        }
        else {
-            while(text->getIndex() < endPos && fHandled[breakType]->contains(c)) {
-                c = text->next32();
+            while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
+                utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
+                c = utext_current32(text);
            }
        }
    }
@ -164,7 +165,6 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
            dictnlength = 0;
            status = U_BUFFER_OVERFLOW_ERROR;
        }
-
        if (U_SUCCESS(status) && dictfname) {
            UChar* extStart=u_strchr(dictfname, 0x002e);
            int len = 0;
--- a/icu4c/source/common/brkeng.h
+++ b/icu4c/source/common/brkeng.h
@ -10,10 +10,10 @@

 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
+#include "unicode/utext.h"

 U_NAMESPACE_BEGIN

-class CharacterIterator;
 class UnicodeSet;
 class UStack;

@ -58,7 +58,7 @@ class LanguageBreakEngine : public UMemory {
 /**
  * <p>Find any breaks within a run in the supplied text.</p>
  *
-  * @param text A CharacterIterator representing the text (TODO: UText). The
+  * @param text A UText representing the text. The
  * iterator is left at the end of the run of characters which the engine
  * is capable of handling.
  * @param startPos The start of the run within the supplied text.
@ -69,7 +69,7 @@ class LanguageBreakEngine : public UMemory {
  * @param foundBreaks An allocated C array of the breaks found, if any
  * @return The number of breaks found.
  */
-  virtual int32_t findBreaks( CharacterIterator *text,
+  virtual int32_t findBreaks( UText *text,
                              int32_t startPos,
                              int32_t endPos,
                              UBool reverse,
@ -183,7 +183,7 @@ class UnhandledEngine : public LanguageBreakEngine {
 /**
  * <p>Find any breaks within a run in the supplied text.</p>
  *
-  * @param text A CharacterIterator representing the text (TODO: UText). The
+  * @param text A UText representing the text. The
  * iterator is left at the end of the run of characters which the engine
  * is capable of handling.
  * @param startPos The start of the run within the supplied text.
@ -194,7 +194,7 @@ class UnhandledEngine : public LanguageBreakEngine {
  * @param foundBreaks An allocated C array of the breaks found, if any
  * @return The number of breaks found.
  */
-  virtual int32_t findBreaks( CharacterIterator *text,
+  virtual int32_t findBreaks( UText *text,
                              int32_t startPos,
                              int32_t endPos,
                              UBool reverse,
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@ -41,7 +41,7 @@ DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
 }

 int32_t
-DictionaryBreakEngine::findBreaks( CharacterIterator *text,
+DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
@ -50,30 +50,31 @@ DictionaryBreakEngine::findBreaks( CharacterIterator *text,
    int32_t result = 0;

    // Find the span of characters included in the set.
-    int32_t start = text->getIndex();
+    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
-    UChar32 c = text->current32();
+    UChar32 c = utext_current32(text);
    if (reverse) {
        UBool   isDict = fSet.contains(c);
-        while((current = text->getIndex()) > startPos && isDict) {
-            c = text->previous32();
+        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
+            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
-        while((current = text->getIndex()) < endPos && fSet.contains(c)) {
-            c = text->next32();
+        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
+            utext_next32(text);         // TODO:  recast loop for postincrement
+            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
-        text->setIndex(current);
+        utext_setNativeIndex(text, current);
    }
    
    return result;
@ -116,14 +117,14 @@ class PossibleWord {
  ~PossibleWord();
  
  // Fill the list of candidates if needed, select the longest, and return the number found
-  int       candidates( CharacterIterator *text, const TrieWordDictionary *dict, int32_t rangeEnd );
+  int       candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );
  
  // Select the currently marked candidate, point after it in the text, and invalidate self
-  int32_t   acceptMarked( CharacterIterator *text );
+  int32_t   acceptMarked( UText *text );
  
  // Back up from the current candidate to the next shorter one; return TRUE if that exists
  // and point the text after it
-  UBool     backUp( CharacterIterator *text );
+  UBool     backUp( UText *text );
  
  // Return the longest prefix this candidate location shares with a dictionary word
  int32_t   longestPrefix();
@ -142,19 +143,19 @@ PossibleWord::~PossibleWord() {
 }

 inline int
-PossibleWord::candidates( CharacterIterator *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
+PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
-    int32_t start = text->getIndex();
+    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        offset = start;
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
-            text->setIndex(start);
+            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
-        text->setIndex(start+lengths[count-1]);
+        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
@ -162,15 +163,15 @@ PossibleWord::candidates( CharacterIterator *text, const TrieWordDictionary *dic
 }

 inline int32_t
-PossibleWord::acceptMarked( CharacterIterator *text ) {
-    text->setIndex(offset + lengths[mark]);
+PossibleWord::acceptMarked( UText *text ) {
+    utext_setNativeIndex(text, offset + lengths[mark]);
    return lengths[mark];
 }

 inline UBool
-PossibleWord::backUp( CharacterIterator *text ) {
+PossibleWord::backUp( UText *text ) {
    if (current > 0) {
-        text->setIndex(offset + lengths[--current]);
+        utext_setNativeIndex(text, offset + lengths[--current]);
        return TRUE;
    }
    return FALSE;
@ -231,7 +232,7 @@ ThaiBreakEngine::~ThaiBreakEngine() {
 }

 int32_t
-ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
+ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
@ -246,9 +247,9 @@ ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;
    
-    text->setIndex(rangeStart);
+    utext_setNativeIndex(text, rangeStart);
    
-    while (U_SUCCESS(status) && (current = text->getIndex()) < rangeEnd) {
+    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
@ -263,7 +264,7 @@ ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
-            if (text->getIndex() >= rangeEnd) {
+            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
@ -276,7 +277,7 @@ ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
                    }
                    
                    // If we're already at the end of the range, we're done
-                    if (text->getIndex() >= rangeEnd) {
+                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }
                    
@ -302,7 +303,7 @@ foundBest:
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
-        if (text->getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
+        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
@ -312,10 +313,11 @@ foundBest:
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
-                UChar32 pc = text->current32();
+                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                while (TRUE) {
-                    uc = text->next32();
+                    utext_next32(text);
+                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
@ -329,7 +331,7 @@ foundBest:
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
-                        text->setIndex(current+wordLength+chars);
+                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
@ -347,49 +349,52 @@ foundBest:
            }
            else {
                // Back up to where we were for next iteration
-                text->setIndex(current+wordLength);
+                utext_setNativeIndex(text, current+wordLength);
            }
        }
        
        // Never stop before a combining mark.
        int32_t currPos;
-        while ((currPos = text->getIndex()) < rangeEnd && fMarkSet.contains(text->current32())) {
-            wordLength += text->move32(1, CharacterIterator::kCurrent) - currPos;
+        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
+            utext_next32(text);
+            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }
        
        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
-        if (text->getIndex() < rangeEnd && wordLength > 0) {
+        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
-                && fSuffixSet.contains(uc = text->current32())) {
+                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
-                    if (!fSuffixSet.contains(text->previous32())) {
+                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
-                        text->move32(2, CharacterIterator::kCurrent);
+                        utext_next32(text);
+                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
-                        uc = text->current32();     // Fetch next character
+                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
-                        text->move32(1, CharacterIterator::kCurrent);
+                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
-                    if (text->previous32() != THAI_MAIYAMOK) {
+                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
-                        text->move32(2, CharacterIterator::kCurrent);
+                        utext_next32(text);
+                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
-                        text->move32(1, CharacterIterator::kCurrent);
+                        utext_next32(text);
                    }
                }
            }
            else {
-                text->setIndex(current+wordLength);
+                utext_setNativeIndex(text, current+wordLength);
            }
        }
        
--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@ -10,11 +10,12 @@

 #include "unicode/utypes.h"
 #include "unicode/uniset.h"
+#include "unicode/utext.h"
+
 #include "brkeng.h"

 U_NAMESPACE_BEGIN

-class CharacterIterator;
 class TrieWordDictionary;

 /*******************************************************************
@ -78,7 +79,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
 /**
  * <p>Find any breaks within a run in the supplied text.</p>
  *
-  * @param text A CharacterIterator representing the text (TODO: UText). The
+  * @param text A UText representing the text. The
  * iterator is left at the end of the run of characters which the engine
  * is capable of handling.
  * @param startPos The start of the run within the supplied text.
@ -89,7 +90,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
  * @param foundBreaks An allocated C array of the breaks found, if any
  * @return The number of breaks found.
  */
-  virtual int32_t findBreaks( CharacterIterator *text,
+  virtual int32_t findBreaks( UText *text,
                              int32_t startPos,
                              int32_t endPos,
                              UBool reverse,
@ -115,13 +116,13 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
 /**
  * <p>Divide up a range of known dictionary characters.</p>
  *
-  * @param text A CharacterIterator representing the text
+  * @param text A UText representing the text
  * @param rangeStart The start of the range of dictionary characters
  * @param rangeEnd The end of the range of dictionary characters
  * @param foundBreaks Output of C array of int32_t break positions, or 0
  * @return The number of breaks found
  */
-  virtual int32_t divideUpDictionaryRange( CharacterIterator *text,
+  virtual int32_t divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const = 0;
@ -172,13 +173,13 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
 /**
  * <p>Divide up a range of known dictionary characters.</p>
  *
-  * @param text A CharacterIterator representing the text
+  * @param text A UText representing the text
  * @param rangeStart The start of the range of dictionary characters
  * @param rangeEnd The end of the range of dictionary characters
  * @param foundBreaks Output of C array of int32_t break positions, or 0
  * @return The number of breaks found
  */
-  virtual int32_t divideUpDictionaryRange( CharacterIterator *text,
+  virtual int32_t divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const;
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
--- a/icu4c/source/common/triedict.cpp
+++ b/icu4c/source/common/triedict.cpp
@ -87,27 +87,27 @@ MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status )
    if (fTrie == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
-    fIter = new UCharCharacterIterator(0, 0);
-    if (fIter == NULL) {
+    fIter = utext_openUChars(NULL, NULL, 0, &status);
+    if (U_SUCCESS(status) && fIter == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
 }

 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
    fTrie = NULL;
-    fIter = new UCharCharacterIterator(NULL, 0);
-    if (fIter == NULL) {
+    fIter = utext_openUChars(NULL, NULL, 0, &status);
+    if (U_SUCCESS(status) && fIter == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
 }

 MutableTrieDictionary::~MutableTrieDictionary() {
    delete fTrie;
-    delete fIter;
+    utext_close(fIter);
 }

 int32_t
-MutableTrieDictionary::search( CharacterIterator *text,
+MutableTrieDictionary::search( UText *text,
                                   int32_t maxLength,
                                   int32_t *lengths,
                                   int &count,
@ -121,7 +121,7 @@ MutableTrieDictionary::search( CharacterIterator *text,
    pMatched = TRUE;
    int i;

-    UChar uc = text->current();
+    UChar uc = utext_current32(text);
    for (i = 0; i < maxLength && p != NULL; ++i) {
        while (p != NULL) {
            if (uc < p->ch) {
@ -147,7 +147,8 @@ MutableTrieDictionary::search( CharacterIterator *text,
        }
        up = p;
        p = p->equal;
-        uc = text->next();
+        uc = utext_next32(text);
+        uc = utext_current32(text);
    }
    
    // Note that there is no way to reach here with up == 0 unless
@ -170,13 +171,14 @@ MutableTrieDictionary::addWord( const UChar *word,
    TernaryNode *parent;
    UBool pMatched;
    int count;
-    fIter->setText(word, length);
+    fIter = utext_openUChars(fIter, word, length, &status);
    
    int matched;
    matched = search(fIter, length, NULL, count, 0, parent, pMatched);
    
    while (matched++ < length) {
-        UChar uc = fIter->nextPostInc();
+        UChar uc = utext_next32(fIter);  // TODO:  supplemetary support?
+        U_ASSERT(uc != U_SENTINEL);
        TernaryNode *newNode = new TernaryNode(uc);
        if (newNode == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
@ -211,7 +213,7 @@ MutableTrieDictionary::addWords( UEnumeration *words,
 }

 int32_t
-MutableTrieDictionary::matches( CharacterIterator *text,
+MutableTrieDictionary::matches( UText *text,
                                int32_t maxLength,
                                int32_t *lengths,
                                int &count,
@ -413,8 +415,7 @@ CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
        fData = NULL;
    }
 }
-
-CompactTrieDictionary::CompactTrieDictionary(const void *data,
+CompactTrieDictionary::CompactTrieDictionary( const void *data,
                                                UErrorCode &status )
 : fUData(NULL)
 {
@ -460,7 +461,7 @@ getCompactNode(const CompactTrieHeader *header, uint16_t node) {
 }

 int32_t
-CompactTrieDictionary::matches( CharacterIterator *text,
+CompactTrieDictionary::matches( UText *text,
                                int32_t maxLength,
                                int32_t *lengths,
                                int &count,
@ -469,7 +470,7 @@ CompactTrieDictionary::matches( CharacterIterator *text,
    const CompactTrieNode *node = getCompactNode(fData, fData->root);
    int mycount = 0;

-    UChar uc = text->current();
+    UChar uc = utext_current32(text);
    int i = 0;

    while (node != NULL) {
@ -498,7 +499,8 @@ CompactTrieDictionary::matches( CharacterIterator *text,
                    // We hit a non-equal character; return
                    goto exit;
                }
-                uc = text->next();
+                utext_next32(text);
+                uc = utext_current32(text);
                ++i;
            }
            // To get here we must have come through the whole list successfully;
@ -518,7 +520,8 @@ CompactTrieDictionary::matches( CharacterIterator *text,
                if (uc == hnode->entries[middle].ch) {
                    // We hit a match; get the next node and next character
                    node = getCompactNode(fData, hnode->entries[middle].equal);
-                    uc = text->next();
+                    utext_next32(text);
+                    uc = utext_current32(text);
                    ++i;
                    break;
                }
--- a/icu4c/source/common/triedict.h
+++ b/icu4c/source/common/triedict.h
@ -10,6 +10,7 @@

 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
+#include "unicode/utext.h"

 struct UEnumeration;
 struct UDataSwapper;
@ -45,8 +46,6 @@ triedict_swap(const UDataSwapper *ds,

 U_NAMESPACE_BEGIN

-class CharacterIterator;
-class UCharCharacterIterator;
 class StringEnumeration;
 struct CompactTrieHeader;

@ -76,7 +75,7 @@ class U_COMMON_API TrieWordDictionary : public UMemory {
 /**
  * <p>Find dictionary words that match the text.</p>
  *
-  * @param text A CharacterIterator representing the text (TODO: UText). The
+  * @param text A UText representing the text. The
  * iterator is left after the longest prefix match in the dictionary.
  * @param start The current position in text.
  * @param maxLength The maximum number of code units to match.
@ -85,7 +84,7 @@ class U_COMMON_API TrieWordDictionary : public UMemory {
  * @param limit The size of the lengths array; this limits the number of words output.
  * @return The number of characters in text that were matched.
  */
-  virtual int32_t matches( CharacterIterator *text,
+  virtual int32_t matches( UText *text,
                              int32_t maxLength,
                              int32_t *lengths,
                              int &count,
@ -123,11 +122,11 @@ class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
  TernaryNode               *fTrie;

    /**
-     * A UCharCharacterIterator for internal use
+     * A UText for internal use
     * @internal
     */

-  UCharCharacterIterator    *fIter;
+  UText    *fIter;

  friend class CompactTrieDictionary;   // For fast conversion

@ -150,7 +149,7 @@ class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
 /**
  * <p>Find dictionary words that match the text.</p>
  *
-  * @param text A CharacterIterator representing the text (TODO: UText). The
+  * @param text A UText representing the text. The
  * iterator is left after the longest prefix match in the dictionary.
  * @param maxLength The maximum number of code units to match.
  * @param lengths An array that is filled with the lengths of words that matched.
@ -158,7 +157,7 @@ class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
  * @param limit The size of the lengths array; this limits the number of words output.
  * @return The number of characters in text that were matched.
  */
-  virtual int32_t matches( CharacterIterator *text,
+  virtual int32_t matches( UText *text,
                              int32_t maxLength,
                              int32_t *lengths,
                              int &count,
@ -196,7 +195,7 @@ protected:
 /**
  * <p>Search the dictionary for matches.</p>
  *
-  * @param text A CharacterIterator representing the text (TODO: UText). The
+  * @param text A UText representing the text. The
  * iterator is left after the longest prefix match in the dictionary.
  * @param maxLength The maximum number of code units to match.
  * @param lengths An array that is filled with the lengths of words that matched.
@ -206,7 +205,7 @@ protected:
  * @param pMatched The returned parent node matched the input
  * @return The number of characters in text that were matched.
  */
-  virtual int32_t search( CharacterIterator *text,
+  virtual int32_t search( UText *text,
                              int32_t maxLength,
                              int32_t *lengths,
                              int &count,
@ -232,21 +231,21 @@ private:
 * to save space.</p>
 */
 class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
-private:
+ private:
    /**
     * The root node of the trie
     */
-    const CompactTrieHeader   *fData;
+
+  const CompactTrieHeader   *fData;

    /**
     * A UBool indicating whether or not we own the fData.
     */
-    UBool                     fOwnData;

+  UBool                     fOwnData;

    UDataMemory              *fUData;
-public:
-
+ public:
  /**
   * <p>Construct a dictionary from a UDataMemory.</p>
   *
@ -279,7 +278,7 @@ public:
 /**
  * <p>Find dictionary words that match the text.</p>
  *
-  * @param text A CharacterIterator representing the text (TODO: UText). The
+  * @param text A UText representing the text. The
  * iterator is left after the longest prefix match in the dictionary.
  * @param maxLength The maximum number of code units to match.
  * @param lengths An array that is filled with the lengths of words that matched.
@ -287,7 +286,7 @@ public:
  * @param limit The size of the lengths array; this limits the number of words output.
  * @return The number of characters in text that were matched.
  */
-  virtual int32_t matches( CharacterIterator *text,
+  virtual int32_t matches( UText *text,
                              int32_t rangeEnd,
                              int32_t *lengths,
                              int &count,
--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@ -144,8 +144,6 @@ public:

    /**
     * Return a CharacterIterator over the text being analyzed.
-     * Changing the state of the returned iterator can have undefined consequences
-     * on the operation of the break iterator.  If you need to change it, clone it first.
     * @stable ICU 2.0
     */
    virtual const CharacterIterator& getText(void) const = 0;
@ -193,6 +191,8 @@ public:
    /**
     * Change the text over which this operates. The text boundary is
     * reset to the start.
+     * Note that setText(UText *) provides similar functionality to this function,
+     * and is more efficient.
     * @param it The CharacterIterator used to change the text.
     * @stable ICU 2.0
     */
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@ -63,10 +63,17 @@ class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {

 protected:
    /**
-     * The character iterator through which this BreakIterator accesses the text
+     * The UText through which this BreakIterator accesses the text
     * @internal
     */
-    CharacterIterator*  fText;
+    UText  *fText;
+
+    /**
+     *   A character iterator that refers to the same text as the UText, above.
+     *   Lazily created when requested by a caller.
+     *   Only included for compatibility with old API, which was based on CharacterIterators.
+     */
+    CharacterIterator  *fCharIter;

    /**
     * The rule data for this BreakIterator instance
@ -280,14 +287,27 @@ public:
    //=======================================================================

    /**
-     * Return a CharacterIterator over the text being analyzed.  This version
-     * of this method returns the actual CharacterIterator we're using internally.
-     * Changing the state of this iterator can have undefined consequences.  If
-     * you need to change it, clone it first.
+     * Return a CharacterIterator over the text being analyzed.
+     * The returned character iterator is owned by the break iterator, and must
+     * not be deleted by the caller.  Repeated calls to this function may
+     * return the same CharacterIterator.
+     * <p/>
+     * The returned character iterator must not be used concurrently with
+     * the break iterator.  If concurrent operation is needed, clone the
+     * returned character iterator first and operate on the clone.
+     * <p/>
+     * This function is not thread safe.  Two threads must not make concurrent
+     * calls to BreakIterator::getText(). This is an exception to the general
+     * rules for thread safety in ICU, which are that const functions are
+     * thread safe.
+     * <p/>
+     * The function getUText() provides similar functionality, and is more efficient.
+     * TODO:  deprecate this function?
+     *
     * @return An iterator over the text being analyzed.
-     *  @stable ICU 2.0
+     * @stable ICU 2.0
     */
-    virtual const CharacterIterator& getText(void) const;
+    virtual  CharacterIterator& getText(void) const;


    /**
@ -340,7 +360,6 @@ public:

    /**
     * Sets the current iteration position to the beginning of the text.
-     * (i.e., the CharacterIterator's starting offset).
     * @return The offset of the beginning of the text.
     *  @stable ICU 2.0
     */
@ -348,7 +367,6 @@ public:

    /**
     * Sets the current iteration position to the end of the text.
-     * (i.e., the CharacterIterator's ending offset).
     * @return The text's past-the-end offset.
     *  @stable ICU 2.0
     */
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -260,8 +260,10 @@ void RBBIAPITest::TestGetSetAdoptText()
    CharacterIterator* text3= new StringCharacterIterator(str2, 3, 10, 3); //  "ond str"
    
    wordIter1->setText(str1);
-    if(wordIter1->getText() != *text1)
-       errln((UnicodeString)"ERROR:1 error in setText or getText ");
+    CharacterIterator *tci = &wordIter1->getText();
+    UnicodeString      tstr;
+    tci->getText(tstr);
+    TEST_ASSERT(tstr == str1);
    if(wordIter1->current() != 0)
        errln((UnicodeString)"ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");

@ -273,9 +275,14 @@ void RBBIAPITest::TestGetSetAdoptText()


    charIter1->adoptText(text1Clone);
-    if( wordIter1->getText() == charIter1->getText() || 
-        wordIter1->getText() != *text2 ||  charIter1->getText() != *text1 )
-        errln((UnicodeString)"ERROR:2 error is getText or setText()");
+    TEST_ASSERT(wordIter1->getText() != charIter1->getText());
+    tci = &wordIter1->getText();
+    tci->getText(tstr);
+    TEST_ASSERT(tstr == str2);
+    tci = &charIter1->getText();
+    tci->getText(tstr);
+    TEST_ASSERT(tstr == str1);
+

    RuleBasedBreakIterator* rb=(RuleBasedBreakIterator*)wordIter1->clone();
    rb->adoptText(text1);
@ -286,13 +293,17 @@ void RBBIAPITest::TestGetSetAdoptText()
        errln((UnicodeString)"ERROR:2 error in adoptText ");

    // Adopt where iterator range is less than the entire orignal source string.
+    //   (Now that the break engine works with UText internally,
+    //    CharacterIterators starting at positions other than zero are not supported.)
    rb->adoptText(text3);
-    if(rb->preceding(2) != 3) {
-        errln((UnicodeString)"ERROR:3 error in adoptText ");
-    }
-    if(rb->following(11) != BreakIterator::DONE) {
-        errln((UnicodeString)"ERROR:4 error in adoptText ");
-    }
+    TEST_ASSERT(rb->preceding(2) == 0);
+    TEST_ASSERT(rb->following(11) == BreakIterator::DONE);
+    //if(rb->preceding(2) != 3) {
+    //    errln((UnicodeString)"ERROR:3 error in adoptText ");
+    //}
+    //if(rb->following(11) != BreakIterator::DONE) {
+    //    errln((UnicodeString)"ERROR:4 error in adoptText ");
+    //}

    // UText API
    //
@ -344,7 +355,8 @@ void RBBIAPITest::TestGetSetAdoptText()
    TEST_ASSERT(pos==UBRK_DONE);

    status = U_ZERO_ERROR;
-    UText *gut2 = utext_openUnicodeString(NULL,NULL,&status);
+    UnicodeString sEmpty;
+    UText *gut2 = utext_openUnicodeString(NULL, &sEmpty, &status);
    wordIter1->getUText(gut2, status);
    TEST_ASSERT_SUCCESS(status);
    utext_close(gut2);
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -412,64 +412,7 @@ void RBBITest::TestMixedThaiLineBreak()

    // @suwit - end of changes

-    // Arabic numerals should always be separated from surrounding Thai text
-/*
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
-        thaiLineSelection->addElement("39");
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);

-        // words in non-Thai scripts should always be separated from surrounding Thai text
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status);
-        thaiLineSelection->addElement("Java");
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status);
-
-        // Thai numerals should always be separated from the text surrounding them
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status);
-
-        // Thai text should interact correctly with punctuation and symbols
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status);
-//        ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status);
-//        ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status);
-ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status);
-// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status);
-        ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
-*/
-
-    /*  remove the old data sample.
-    // The Unicode Linebreak TR says do not break before or after quotes.
-    //    So this test is changed ot not break around the quote.
-    //    TODO:  should Thai break around the around the quotes, like the original behavior here?
-//    ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status);
-//    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
-      ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
-                                                         "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
-
-    ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
-
-    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status);
-    ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status);
-    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status);
-    ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status);
-    ADD_DATACHUNK(thaiLineSelection, "$200", 0, status);
-    ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status);
-    ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status);
-    ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status);
-*/
    RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
    if (U_FAILURE(status))
    {
@ -788,14 +731,18 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
            if(exec) TestJapaneseLineBreak();                 break;
        case 2: name = "TestStatusReturn";
            if(exec) TestStatusReturn();                       break;
+
        case 3: name = "TestLineBreakData";
            if(exec) TestLineBreakData();                      break;
        case 4: name = "TestEmptyString";
            if(exec) TestEmptyString();                        break;
+
        case 5: name = "TestGetAvailableLocales";
            if(exec) TestGetAvailableLocales();                break;
+
        case 6: name = "TestGetDisplayName";
            if(exec) TestGetDisplayName();                     break;
+
        case 7: name = "TestEndBehaviour";
            if(exec) TestEndBehaviour();                       break;
        case 8: name = "TestMixedThaiLineBreak";
@ -1176,15 +1123,19 @@ void RBBITest::TestBug4153072() {
    UnicodeString str("...Hello, World!...");
    int32_t begin = 3;
    int32_t end = str.length() - 3;
-    UBool dummy;
+    UBool onBoundary;

    StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
    iter->adoptText(textIterator);
    int index;
+    // Note: with the switch to UText, there is no way to restrict the
+    //       iteration range to begin at an index other than zero.
+    //       StringCharacterIterators created with a non-zero begin index
+    //       are treated by RBBI as being empty.
    for (index = -1; index < begin + 1; ++index) {
-        dummy = iter->isBoundary(index);
-        if (index < begin && dummy == TRUE) {
-            errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +
+        onBoundary = iter->isBoundary(index);
+        if (index == 0?  !onBoundary : onBoundary) {
+            errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
                            " and begin index = " + begin);
        }
    }
@ -1323,11 +1274,12 @@ void RBBITest::executeTest(TestParams *t) {
            if (expectedTagVal == -1) {
                expectedTagVal = 0;
            }
+            int32_t line = t->srcLine->elementAti(bp);
            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
            if (rs != expectedTagVal) {
                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
                      "          Actual, Expected status = %4d, %4d",
-                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
+                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
            }
        }

@ -1375,6 +1327,7 @@ void RBBITest::executeTest(TestParams *t) {
            if (expectedTagVal == -1) {
                expectedTagVal = 0;
            }
+            int line = t->srcLine->elementAti(bp);
            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
            if (rs != expectedTagVal) {
                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"