ICU-8550 Dictionary Break Iterator, fixes to work with UTF-8 text.

X-SVN-Rev: 35724
2025-04-08 06:53:45 +00:00 · 2014-05-17 00:44:39 +00:00 · 2014-05-17 00:44:39 +00:00 · f71b9053d2
commit f71b9053d2
parent 68c893b2f1
8 changed files with 726 additions and 487 deletions
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@ -1,6 +1,6 @@
 /**
 *******************************************************************************
- * Copyright (C) 2006,2012-2013, International Business Machines Corporation   *
+ * Copyright (C) 2006-2014, International Business Machines Corporation   *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */
@ -17,6 +17,7 @@
 U_NAMESPACE_BEGIN

 class DictionaryMatcher;
+class Normalizer2;

 /*******************************************************************
 * DictionaryBreakEngine
@ -326,7 +327,8 @@ class CjkBreakEngine : public DictionaryBreakEngine {
  UnicodeSet                fKatakanaWordSet;
  UnicodeSet                fHiraganaWordSet;

-  DictionaryMatcher  *fDictionary;
+  DictionaryMatcher        *fDictionary;
+  const Normalizer2        *nfkcNorm2;

 public:

--- a/icu4c/source/common/dictionarydata.cpp
+++ b/icu4c/source/common/dictionarydata.cpp
@ -40,22 +40,31 @@ int32_t UCharsDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_UCHARS;
 }

-int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
+int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
+                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
+                            int32_t *prefix) const {
+
    UCharsTrie uct(characters);
-    UChar32 c = utext_next32(text);
-    if (c < 0) {
-        return 0;
-    }
-    UStringTrieResult result = uct.first(c);
-    int32_t numChars = 1;
-    count = 0;
-    for (;;) {
+    int32_t startingTextIndex = utext_getNativeIndex(text);
+    int32_t wordCount = 0;
+    int32_t codePointsMatched = 0;
+
+    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
+        UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
+        int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
+        codePointsMatched += 1;
        if (USTRINGTRIE_HAS_VALUE(result)) {
-            if (count < limit) {
+            if (wordCount < limit) {
                if (values != NULL) {
-                    values[count] = uct.getValue();
+                    values[wordCount] = uct.getValue();
                }
-                lengths[count++] = numChars;
+                if (lengths != NULL) {
+                    lengths[wordCount] = lengthMatched;
+                }
+                if (cpLengths != NULL) {
+                    cpLengths[wordCount] = codePointsMatched;
+                }
+                ++wordCount;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
@ -64,20 +73,15 @@ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }
-
-        // TODO: why do we have a text limit if the UText knows its length?
-        if (numChars >= maxLength) {
+        if (lengthMatched >= maxLength) {
            break;
        }
-
-        c = utext_next32(text);
-        if (c < 0) {
-            break;
-        }
-        ++numChars;
-        result = uct.next(c);
    }
-    return numChars;
+
+    if (prefix != NULL) {
+        *prefix = codePointsMatched;
+    }
+    return wordCount;
 }

 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
@ -104,22 +108,30 @@ int32_t BytesDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_BYTES;
 }

-int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
+int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
+                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
+                            int32_t *prefix) const {
    BytesTrie bt(characters);
-    UChar32 c = utext_next32(text);
-    if (c < 0) {
-        return 0;
-    }
-    UStringTrieResult result = bt.first(transform(c));
-    int32_t numChars = 1;
-    count = 0;
-    for (;;) {
+    int32_t startingTextIndex = utext_getNativeIndex(text);
+    int32_t wordCount = 0;
+    int32_t codePointsMatched = 0;
+
+    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
+        UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
+        int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
+        codePointsMatched += 1;
        if (USTRINGTRIE_HAS_VALUE(result)) {
-            if (count < limit) {
+            if (wordCount < limit) {
                if (values != NULL) {
-                    values[count] = bt.getValue();
+                    values[wordCount] = bt.getValue();
                }
-                lengths[count++] = numChars;
+                if (lengths != NULL) {
+                    lengths[wordCount] = lengthMatched;
+                }
+                if (cpLengths != NULL) {
+                    cpLengths[wordCount] = codePointsMatched;
+                }
+                ++wordCount;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
@ -128,20 +140,15 @@ int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }
-
-        // TODO: why do we have a text limit if the UText knows its length?
-        if (numChars >= maxLength) {
+        if (lengthMatched >= maxLength) {
            break;
        }
-
-        c = utext_next32(text);
-        if (c < 0) {
-            break;
-        }
-        ++numChars;
-        result = bt.next(transform(c));
    }
-    return numChars;
+
+    if (prefix != NULL) {
+        *prefix = codePointsMatched;
+    }
+    return wordCount;
 }


--- a/icu4c/source/common/dictionarydata.h
+++ b/icu4c/source/common/dictionarydata.h
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
-* Copyright (C) 2013, International Business Machines
+* Copyright (C) 2014, International Business Machines
 * Corporation and others.  All Rights Reserved.
 *******************************************************************************
 * dictionarydata.h
@ -66,10 +66,32 @@ public:
 */
 class U_COMMON_API DictionaryMatcher : public UMemory {
 public:
+    DictionaryMatcher() {};
    virtual ~DictionaryMatcher();
    // this should emulate CompactTrieDictionary::matches()
-    virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count,
-                            int32_t limit, int32_t *values = NULL) const = 0;
+    /*  @param text      The text in which to look for matching words. Matching begins
+     *                   at the current position of the UText.
+     *  @param maxLength The max length of match to consider. Units are the native indexing
+     *                   units of the UText.
+     *  @param limit     Capacity of output arrays, which is also the maximum number of
+     *                   matching words to be found.
+     *  @param lengths   output array, filled with the lengths of the matches, in order,
+     *                   from shortest to longest. Lengths are in native indexing units
+     *                   of the UText. May be NULL.
+     *  @param cpLengths output array, filled with the lengths of the matches, in order,
+     *                   from shortest to longest. Lengths are the number of Unicode code points.
+     *                   May be NULL.
+     *  @param values    Output array, filled with the values associated with the words found.
+     *                   May be NULL.
+     *  @param prefix    Output parameter, the code point length of the prefix match, even if that
+     *                   prefix didn't lead to a complete word. Will always be >= the cpLength
+     *                   of the longest complete word matched. May be NULL.
+     *  @return          Number of matching words found.
+     */
+    virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
+                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
+                            int32_t *prefix) const = 0;
+
    /** @return DictionaryData::TRIE_TYPE_XYZ */
    virtual int32_t getType() const = 0;
 };
@ -81,8 +103,9 @@ public:
    // The UDataMemory * will be closed on this object's destruction.
    UCharsDictionaryMatcher(const UChar *c, UDataMemory *f) : characters(c), file(f) { }
    virtual ~UCharsDictionaryMatcher();
-    virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count,
-                            int32_t limit, int32_t *values = NULL) const;
+    virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
+                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
+                            int32_t *prefix) const;
    virtual int32_t getType() const;
 private:
    const UChar *characters;
@ -98,8 +121,9 @@ public:
    BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)
            : characters(c), transformConstant(t), file(f) { }
    virtual ~BytesDictionaryMatcher();
-    virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count,
-                            int32_t limit, int32_t *values = NULL) const;
+    virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
+                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
+                            int32_t *prefix) const;
    virtual int32_t getType() const;
 private:
    UChar32 transform(UChar32 c) const;
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -701,6 +701,22 @@ int32_t RuleBasedBreakIterator::previous(void) {
 * @return The position of the first break after the current position.
 */
 int32_t RuleBasedBreakIterator::following(int32_t offset) {
+    // if the offset passed in is already past the end of the text,
+    // just return DONE; if it's before the beginning, return the
+    // text's starting offset
+    if (fText == NULL || offset >= utext_nativeLength(fText)) {
+        last();
+        return next();
+    }
+    else if (offset < 0) {
+        return first();
+    }
+
+    // Move requested offset to a code point start. It might be on a trail surrogate,
+    // or on a trail byte if the input is UTF-8.
+    utext_setNativeIndex(fText, offset);
+    offset = utext_getNativeIndex(fText);
+
    // if we have cached break positions and offset is in the range
    // covered by them, use them
    // TODO: could use binary search
@ -722,20 +738,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
        }
    }

-    // if the offset passed in is already past the end of the text,
-    // just return DONE; if it's before the beginning, return the
-    // text's starting offset
-    fLastRuleStatusIndex  = 0;
-    fLastStatusIndexValid = TRUE;
-    if (fText == NULL || offset >= utext_nativeLength(fText)) {
-        last();
-        return next();
-    }
-    else if (offset < 0) {
-        return first();
-    }
-
-    // otherwise, set our internal iteration position (temporarily)
+    // Set our internal iteration position (temporarily)
    // to the position passed in.  If this is the _beginning_ position,
    // then we can just use next() to get our return value

@ -747,6 +750,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
        // move forward one codepoint to prepare for moving back to a
        // safe point.
        // this handles offset being between a supplementary character
+        // TODO: is this still needed, with move to code point boundary handled above?
        (void)UTEXT_NEXT32(fText);
        // handlePrevious will move most of the time to < 1 boundary away
        handlePrevious(fData->fSafeRevTable);
@ -809,6 +813,21 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
 * @return The position of the last boundary before the starting position.
 */
 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
+    // If the offset passed in is past the end of the text (or there is no text),
+    // return last(); if it's before the beginning, return first(),
+    // the text's starting offset.
+    if (fText == NULL || offset > utext_nativeLength(fText)) {
+        return last();
+    }
+    else if (offset < 0) {
+        return first();
+    }
+
+    // Move requested offset to a code point start. It might be on a trail surrogate,
+    // or on a trail byte if the input is UTF-8.
+    utext_setNativeIndex(fText, offset);
+    offset = utext_getNativeIndex(fText);
+
    // if we have cached break positions and offset is in the range
    // covered by them, use them
    if (fCachedBreakPositions != NULL) {
@ -834,17 +853,6 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
        }
    }

-    // if the offset passed in is already past the end of the text,
-    // just return DONE; if it's before the beginning, return the
-    // text's starting offset
-    if (fText == NULL || offset > utext_nativeLength(fText)) {
-        // return BreakIterator::DONE;
-        return last();
-    }
-    else if (offset < 0) {
-        return first();
-    }
-
    // if we start by updating the current iteration position to the
    // position specified by the caller, we can just use previous()
    // to carry out this operation
@ -1578,30 +1586,6 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
        return (reverse ? startPos : endPos);
    }
    
-    // Bug 5532.  The dictionary code will crash if the input text is UTF-8
-    //      because native indexes are different from UTF-16 indexes.
-    //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
-    //      It wont give the right breaks, but it's better than a crash.
-    //
-    //      Check the type of the UText by checking its pFuncs field, which
-    //      is UText's function dispatch table.  It will be the same for all
-    //      UTF-8 UTexts and different for any other UText type.
-    //
-    //      We have no other type of UText available with non-UTF-16 native indexing.
-    //      This whole check will go away once the dictionary code is fixed.
-    static const void *utext_utf8Funcs;
-    if (utext_utf8Funcs == NULL) {
-        // Cache the UTF-8 UText function pointer value.
-        UErrorCode status = U_ZERO_ERROR;
-        UText tempUText = UTEXT_INITIALIZER; 
-        utext_openUTF8(&tempUText, NULL, 0, &status);
-        utext_utf8Funcs = tempUText.pFuncs;
-        utext_close(&tempUText);
-    }
-    if (fText->pFuncs == utext_utf8Funcs) {
-        return (reverse ? startPos : endPos);
-    }
-
    // Starting from the starting point, scan towards the proposed result,
    // looking for the first dictionary character (which may be the one
    // we're on, if we're starting in the middle of a range).
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT:
- * Copyright (c) 1999-2013, International Business Machines Corporation and
+ * Copyright (c) 1999-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /************************************************************************
@ -31,9 +31,9 @@
 #include "intltest.h"
 #include "rbbitst.h"
 #include <string.h>
+#include "charstr.h"
 #include "uvector.h"
 #include "uvectr32.h"
-#include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "unicode/numfmt.h"
@ -354,27 +354,19 @@ void RBBITest::TestStatusReturn() {
 }


-static void printStringBreaks(UnicodeString ustr, int expected[],
-                              int expectedcount)
-{
+static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
    UErrorCode status = U_ZERO_ERROR;
    char name[100];
    printf("code    alpha extend alphanum type word sent line name\n");
-    int j;
-    for (j = 0; j < ustr.length(); j ++) {
-        if (expectedcount > 0) {
-            int k;
-            for (k = 0; k < expectedcount; k ++) {
-                if (j == expected[k]) {
-                    printf("------------------------------------------------ %d\n",
-                           j);
-                }
-            }
-        }
-        UChar32 c = ustr.char32At(j);
-        if (c > 0xffff) {
-            j ++;
+    int nextExpectedIndex = 0;
+    utext_setNativeIndex(tstr, 0);
+    for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
+        if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
+            printf("------------------------------------------------ %d\n", j);
+            ++nextExpectedIndex;
        }
+
+        UChar32 c = utext_next32(tstr);
        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
        printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
                           u_isUAlphabetic(c),
@ -400,6 +392,19 @@ static void printStringBreaks(UnicodeString ustr, int expected[],
 }


+static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
+   UErrorCode status = U_ZERO_ERROR;
+   UText *tstr = NULL;
+   tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
+   if (U_FAILURE(status)) {
+       printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
+       return;
+    }
+   printStringBreaks(tstr, expected, expectedCount);
+   utext_close(tstr);
+}
+
+
 void RBBITest::TestBug3818() {
    UErrorCode  status = U_ZERO_ERROR;

@ -830,23 +835,173 @@ void RBBITest::TestBug5775() {
 //------------------------------------------------------------------------------

 struct TestParams {
-    BreakIterator   *bi;
-    UnicodeString    dataToBreak;
-    UVector32       *expectedBreaks;
-    UVector32       *srcLine;
+    BreakIterator   *bi;                   // Break iterator is set while parsing test source.
+                                           //   Changed out whenever test data changes break type.
+
+    UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
+    UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
+    UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
    UVector32       *srcCol;
+
+    UText           *textToBreak;          // UText, could be UTF8 or UTF16.
+    UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
+    CharString       utf8String;           // UTF-8 form of text to break.
+
+    TestParams(UErrorCode &status) : dataToBreak() {
+        bi               = NULL;
+        expectedBreaks   = new UVector32(status);
+        srcLine          = new UVector32(status);
+        srcCol           = new UVector32(status);
+        textToBreak      = NULL;
+        textMap          = new UVector32(status);
+    }
+
+    ~TestParams() {
+        delete bi;
+        delete expectedBreaks;
+        delete srcLine;
+        delete srcCol;
+        utext_close(textToBreak);
+        delete textMap;
+    }
+    
+    int32_t getSrcLine(int32_t bp);
+    int32_t getExpectedBreak(int32_t bp);
+    int32_t getSrcCol(int32_t bp);
+
+    void setUTF16(UErrorCode &status);
+    void setUTF8(UErrorCode &status);
 };

-void RBBITest::executeTest(TestParams *t) {
+// Append a UnicodeString to a CharString with UTF-8 encoding.
+// Substitute any invalid chars.
+//   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
+static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    int32_t utf8Length;
+    u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
+                       src.getBuffer(), src.length(),   // UTF-16 data
+                       0xfffd, NULL,                    // Substitution char, number of subs.
+                       &status);
+    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
+        return;
+    }
+    status = U_ZERO_ERROR;
+    int32_t capacity;
+    char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
+    u_strToUTF8WithSub(buffer, utf8Length, NULL,
+                       src.getBuffer(), src.length(),
+                       0xfffd, NULL, &status);
+    dest.append(buffer, utf8Length, status);
+}
+  
+
+void TestParams::setUTF16(UErrorCode &status) {
+    textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
+    textMap->removeAllElements();
+    for (int32_t i=0; i<dataToBreak.length(); i++) {
+        if (i == dataToBreak.getChar32Start(i)) {
+            textMap->addElement(i, status);
+        } else {
+            textMap->addElement(-1, status);
+        }
+    }
+    textMap->addElement(dataToBreak.length(), status);
+    U_ASSERT(dataToBreak.length() + 1 == textMap->size());
+}
+
+
+void TestParams::setUTF8(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    utf8String.clear();
+    CharStringAppend(utf8String, dataToBreak, status);
+    textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    textMap->removeAllElements();
+    int32_t utf16Index = 0;
+    for (;;) {
+        textMap->addElement(utf16Index, status);
+        UChar32 c32 = utext_current32(textToBreak);
+        if (c32 < 0) {
+            break;
+        }
+        utf16Index += U16_LENGTH(c32);
+        utext_next32(textToBreak);
+        while (textMap->size() < utext_getNativeIndex(textToBreak)) {
+            textMap->addElement(-1, status);
+        }
+    }
+    U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
+}
+
+
+int32_t TestParams::getSrcLine(int bp) {
+    if (bp >= textMap->size()) {
+        bp = textMap->size() - 1;
+    }
+    int32_t i = 0;
+    for(; bp >= 0 ; --bp) {
+        // Move to a character boundary if we are not on one already.
+        i = textMap->elementAti(bp);
+        if (i >= 0) {
+            break;
+        }
+    }
+    return srcLine->elementAti(i);
+}
+
+
+int32_t TestParams::getExpectedBreak(int bp) {
+    if (bp >= textMap->size()) {
+        return 0;
+    }
+    int32_t i = textMap->elementAti(bp);
+    int32_t retVal = 0;
+    if (i >= 0) {
+        retVal = expectedBreaks->elementAti(i);
+    }
+    return retVal;
+}
+
+
+int32_t TestParams::getSrcCol(int bp) {
+    if (bp >= textMap->size()) {
+        bp = textMap->size() - 1;
+    }
+    int32_t i = 0;
+    for(; bp >= 0; --bp) {
+        // Move bp to a character boundary if we are not on one already.
+        i = textMap->elementAti(bp);
+        if (i >= 0) {
+            break;
+        }
+    }
+    return srcCol->elementAti(i);
+}
+
+
+void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
    int32_t    bp;
    int32_t    prevBP;
    int32_t    i;

+    TEST_ASSERT_SUCCESS(status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
    if (t->bi == NULL) {
        return;
    }

-    t->bi->setText(t->dataToBreak);
+    t->bi->setText(t->textToBreak, status);
    //
    //  Run the iterator forward
    //
@ -855,93 +1010,92 @@ void RBBITest::executeTest(TestParams *t) {
        if (prevBP ==  bp) {
            // Fail for lack of forward progress.
            errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
-                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
+                bp, t->getSrcLine(bp), t->getSrcCol(bp));
            break;
        }

-        // Check that there were we didn't miss an expected break between the last one
+        // Check that we didn't miss an expected break between the last one
        //  and this one.
        for (i=prevBP+1; i<bp; i++) {
-            if (t->expectedBreaks->elementAti(i) != 0) {
+            if (t->getExpectedBreak(i) != 0) {
                int expected[] = {0, i};
                printStringBreaks(t->dataToBreak, expected, 2);
                errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
-                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
+                      i, t->getSrcLine(i), t->getSrcCol(i));
            }
        }

        // Check that the break we did find was expected
-        if (t->expectedBreaks->elementAti(bp) == 0) {
+        if (t->getExpectedBreak(bp) == 0) {
            int expected[] = {0, bp};
-            printStringBreaks(t->dataToBreak, expected, 2);
+            printStringBreaks(t->textToBreak, expected, 2);
            errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
-                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
+                bp, t->getSrcLine(bp), t->getSrcCol(bp));
        } else {
            // The break was expected.
            //   Check that the {nnn} tag value is correct.
-            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
+            int32_t expectedTagVal = t->getExpectedBreak(bp);
            if (expectedTagVal == -1) {
                expectedTagVal = 0;
            }
-            int32_t line = t->srcLine->elementAti(bp);
+            int32_t line = t->getSrcLine(bp);
            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
            if (rs != expectedTagVal) {
                errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
                      "          Actual, Expected status = %4d, %4d",
-                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
+                    bp, line, t->getSrcCol(bp), rs, expectedTagVal);
            }
        }

-
        prevBP = bp;
    }

    // Verify that there were no missed expected breaks after the last one found
-    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
-        if (t->expectedBreaks->elementAti(i) != 0) {
+    for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
+        if (t->getExpectedBreak(i) != 0) {
            errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
-                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
+                      i, t->getSrcLine(i), t->getSrcCol(i));
        }
    }

    //
    //  Run the iterator backwards, verify that the same breaks are found.
    //
-    prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
+    prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
        if (prevBP ==  bp) {
            // Fail for lack of progress.
            errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
-                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
+                bp, t->getSrcLine(bp), t->getSrcCol(bp));
            break;
        }

-        // Check that there were we didn't miss an expected break between the last one
+        // Check that we didn't miss an expected break between the last one
        //  and this one.  (UVector returns zeros for index out of bounds.)
        for (i=prevBP-1; i>bp; i--) {
-            if (t->expectedBreaks->elementAti(i) != 0) {
-                errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
-                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
+            if (t->getExpectedBreak(i) != 0) {
+                errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
+                      i, t->getSrcLine(i), t->getSrcCol(i));
            }
        }

        // Check that the break we did find was expected
-        if (t->expectedBreaks->elementAti(bp) == 0) {
+        if (t->getExpectedBreak(bp) == 0) {
            errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
-                   bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
+                   bp, t->getSrcLine(bp), t->getSrcCol(bp));
        } else {
            // The break was expected.
            //   Check that the {nnn} tag value is correct.
-            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
+            int32_t expectedTagVal = t->getExpectedBreak(bp);
            if (expectedTagVal == -1) {
                expectedTagVal = 0;
            }
-            int line = t->srcLine->elementAti(bp);
-            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
+            int line = t->getSrcLine(bp);
+            int32_t rs = t->bi->getRuleStatus();
            if (rs != expectedTagVal) {
                errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
                      "          Actual, Expected status = %4d, %4d",
-                    bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
+                    bp, line, t->getSrcCol(bp), rs, expectedTagVal);
            }
        }

@ -950,30 +1104,30 @@ void RBBITest::executeTest(TestParams *t) {

    // Verify that there were no missed breaks prior to the last one found
    for (i=prevBP-1; i>=0; i--) {
-        if (t->expectedBreaks->elementAti(i) != 0) {
+        if (t->getExpectedBreak(i) != 0) {
            errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
-                      i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
+                      i, t->getSrcLine(i), t->getSrcCol(i));
        }
    }

    // Check isBoundary()
-    for (i=0; i<t->expectedBreaks->size(); i++) {
-        UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
+    for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
+        UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
        UBool boundaryFound    = t->bi->isBoundary(i);
        if (boundaryExpected != boundaryFound) {
            errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
                  "        Expected, Actual= %s, %s",
-                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
+                  i, t->getSrcLine(i), t->getSrcCol(i),
                  boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
        }
    }

    // Check following()
-    for (i=0; i<t->expectedBreaks->size(); i++) {
+    for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
        int32_t actualBreak = t->bi->following(i);
        int32_t expectedBreak = BreakIterator::DONE;
-        for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
-            if (t->expectedBreaks->elementAti(j) != 0) {
+        for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
+            if (t->getExpectedBreak(j) != 0) {
                expectedBreak = j;
                break;
            }
@ -981,17 +1135,24 @@ void RBBITest::executeTest(TestParams *t) {
        if (expectedBreak != actualBreak) {
            errln("following(%d) incorrect. File line,col= %4d,%4d\n"
                  "        Expected, Actual= %d, %d",
-                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
+                  i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
        }
    }

    // Check preceding()
-    for (i=t->expectedBreaks->size(); i>=0; i--) {
+    for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
        int32_t actualBreak = t->bi->preceding(i);
        int32_t expectedBreak = BreakIterator::DONE;

-        for (int32_t j=i-1; j >= 0; j--) {
-            if (t->expectedBreaks->elementAti(j) != 0) {
+        // For UTF-8 and UTF-16 supplementary characters, all code units of a character
+        // are equivalent: preceding(trailing code unit) returns the index of some preceding
+        // code point, not the lead unit of the current one, even though that index is smaller.
+        // Therefore, start scanning the expected break data not at i-1, but at
+        // (native index of the start of the code point containing i) - 1.
+        utext_setNativeIndex(t->textToBreak, i);
+        int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
+        for (; j >= 0; j--) {
+            if (t->getExpectedBreak(j) != 0) {
                expectedBreak = j;
                break;
            }
@ -999,7 +1160,7 @@ void RBBITest::executeTest(TestParams *t) {
        if (expectedBreak != actualBreak) {
            errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
                  "        Expected, Actual= %d, %d",
-                  i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
+                  i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
        }
    }
 }
@ -1011,11 +1172,7 @@ void RBBITest::TestExtended() {
    Locale          locale("");

    UnicodeString       rules;
-    TestParams          tp;
-    tp.bi             = NULL;
-    tp.expectedBreaks = new UVector32(status);
-    tp.srcLine        = new UVector32(status);
-    tp.srcCol         = new UVector32(status);
+    TestParams          tp(status);

    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
    if (U_FAILURE(status)) {
@ -1190,7 +1347,16 @@ void RBBITest::TestExtended() {
                charIdx += 6;

                // RUN THE TEST!
-                executeTest(&tp);
+                status = U_ZERO_ERROR;
+                tp.setUTF16(status);
+                executeTest(&tp, status);
+                TEST_ASSERT_SUCCESS(status);
+
+                // Run again, this time with UTF-8 text wrapped in a UText.
+                status = U_ZERO_ERROR;
+                tp.setUTF8(status);
+                TEST_ASSERT_SUCCESS(status);
+                executeTest(&tp, status);
                break;
            }

@ -1356,10 +1522,6 @@ void RBBITest::TestExtended() {
    }

 end_test:
-    delete tp.bi;
-    delete tp.expectedBreaks;
-    delete tp.srcLine;
-    delete tp.srcCol;
    delete [] testFile;
 #endif
 }
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 1999-2013, International Business Machines
+ * Copyright (c) 1999-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 *************************************************************************
 *   Date        Name        Description
@ -57,7 +57,7 @@ public:

    void TestExtended();
    UChar *ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status);
-    void executeTest(TestParams *);
+    void executeTest(TestParams *, UErrorCode &status);

    void TestWordBreaks();
    void TestWordBoundary();
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@ -33,11 +33,10 @@


 #   Temp debugging tests 
-<word>
-<data>•Isn't<200></data>
-<char>
-<data>•\U00010020•\U00010000\N{COMBINING MACRON}•</data>
+<sent>
+<data>•\u00c0.•</data>

+#<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029•</data>
 ########################################################################################
 #
 #