ICU-2292 added safe forward and backwards rules

X-SVN-Rev: 13648
2025-04-08 06:53:45 +00:00 · 2003-11-09 06:52:44 +00:00 · 2003-11-09 06:52:44 +00:00 · 41ac2f557b
commit 41ac2f557b
parent d0370e2786
7 changed files with 301 additions and 169 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -405,8 +405,8 @@ int32_t RuleBasedBreakIterator::previous(void) {
        return BreakIterator::DONE;
    }

-    if (fData->fSafeRevTable != NULL) {
-        return handleNewPrevious();
+    if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
+        return handlePrevious(fData->fReverseTable);
    }

    // old rule syntax
@ -486,27 +486,56 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
    if (fData->fSafeRevTable != NULL) {
        // new rule syntax
        /// todo synwee 
-        /// fText->setIndex(offset);
-        fText->setIndex(fText->startIndex());
-
-        result = fText->startIndex();
-    }
-    else {
-        // otherwise, we have to sync up first.  Use handlePrevious() to back
-        // us up to a known break position before the specified position (if
-        // we can determine that the specified position is a break position,
-        // we don't back up at all).  This may or may not be the last break
-        // position at or before our starting position.  Advance forward
-        // from here until we've passed the starting position.  The position
-        // we stop on will be the first break position after the specified one.
-        // old rule syntax
-
        fText->setIndex(offset);
-        if (offset == fText->startIndex()) {
-            return handleNext();
+        // move forward one codepoint to prepare for moving back to a
+        // safe point.
+        // this handles offset falling between the two code units of a supplementary character
+        fText->next32();
+        // handlePrevious will move most of the time to < 1 boundary away
+        handlePrevious(fData->fSafeRevTable);
+        int32_t result = next();
+        while (result <= offset) {
+            result = next();
        }
-        result = previous();
+        return result;
    }
+    if (fData->fSafeFwdTable != NULL) {
+        // backup plan if the safe reverse table is not available
+        fText->setIndex(offset);
+        fText->previous32();
+        // handle next will give result >= offset
+        handleNext(fData->fSafeFwdTable);
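+        // previous() will usually land 0 or 1 boundary away from offset,
+        // but not always, so we have to keep stepping backwards until we
+        // reach a boundary <= offset; the one seen just before it is the answer.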
+        int32_t oldresult = previous();
+        while (oldresult > offset) {
+            int32_t result = previous();
+            if (result <= offset) {
+                return oldresult;
+            }
+            oldresult = result;
+        }
+        int32_t result = next();
+        if (result <= offset) {
+            return next();
+        }
+        return result;
+    }
+    // otherwise, we have to sync up first.  Use handlePrevious() to back
+    // us up to a known break position before the specified position (if
+    // we can determine that the specified position is a break position,
+    // we don't back up at all).  This may or may not be the last break
+    // position at or before our starting position.  Advance forward
+    // from here until we've passed the starting position.  The position
+    // we stop on will be the first break position after the specified one.
+    // old rule syntax
+
+    fText->setIndex(offset);
+    if (offset == fText->startIndex()) {
+        return handleNext();
+    }
+    result = previous();

    while (result != BreakIterator::DONE && result <= offset) {
        result = next();
@ -537,15 +566,43 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    // position specified by the caller, we can just use previous()
    // to carry out this operation

-    if (fData->fSafeRevTable != NULL) {
+    if (fData->fSafeFwdTable != NULL) {
        /// todo synwee
        // new rule syntax
-        int32_t result = fText->endIndex();
-        fText->setIndex(result);
-        while (result != BreakIterator::DONE && result >= offset) {
+        fText->setIndex(offset);
+        // move backwards one codepoint to prepare for moving forwards to a
+        // safe point.
+        // this handles offset falling between the two code units of a supplementary character
+        fText->previous32();
+        handleNext(fData->fSafeFwdTable);
+        int32_t result = previous();
+        while (result >= offset) {
            result = previous();
        }
+        return result;
+    }
+    if (fData->fSafeRevTable != NULL) {
+        // backup plan if forward safe table is not available
+        fText->setIndex(offset);
+        fText->next32();
+        // handle previous will give result <= offset
+        handlePrevious(fData->fSafeRevTable);

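+        // next() will usually land 0 or 1 boundary away from offset,
+        // but not always, so we have to keep stepping forwards until we
+        // reach a boundary >= offset; the one seen just before it is the answer.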
+        int32_t oldresult = next();
+        while (oldresult < offset) {
+            int32_t result = next();
+            if (result >= offset) {
+                return oldresult;
+            }
+            oldresult = result;
+        }
+        int32_t result = previous();
+        if (result >= offset) {
+            return previous();
+        }
        return result;
    }

@ -568,6 +625,11 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
        return TRUE;
    }

+    if (offset == fText->endIndex()) {
+        last();       // For side effects on current position, tag values.
+        return TRUE;
+    }
+
    // out-of-range indexes are never boundary positions
    if (offset < fText->startIndex()) {
        first();       // For side effects on current position, tag values.
@ -608,7 +670,11 @@ int32_t RuleBasedBreakIterator::current(void) const {
 //     value every time the state machine passes through an accepting state.
 //
 //-----------------------------------------------------------------------------------
-int32_t RuleBasedBreakIterator::handleNext(void) {
+int32_t RuleBasedBreakIterator::handleNext() {   // no-argument form: forwards to the table-driven overload
+    return handleNext(fData->fForwardTable);     // always runs the forward state table
+}
+
+int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
    if (fTrace) {
        RBBIDebugPrintf("Handle Next   pos   char  state category  \n");
    }
@ -637,7 +703,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
    fLastBreakTag = 0;

    row = (RBBIStateTableRow *)    // Point to starting row of state table.
-        (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
+        (statetable->fTableData + (statetable->fRowLen * state));

    // Character Category fetch for starting character.
    //    See comments on character category code within loop, below.
@ -700,7 +766,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
        // look up a state transition in the state table
        state = row->fNextState[category];
        row = (RBBIStateTableRow *)
-            (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
+            (statetable->fTableData + (statetable->fRowLen * state));

        // Get the next character.  Doing it here positions the iterator
        //    to the correct position for recording matches in the code that
@ -913,14 +979,14 @@ continueOn:
 //      The logic of this function is very similar to handleNext(), above.
 //
 //-----------------------------------------------------------------------------------
-int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
-    if (fText == NULL || fData == NULL) {
+int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
+    if (fText == NULL || statetable == NULL) {
        return 0;
    }
    // break tag is no longer valid after icu switched to exact backwards
    // positioning.
    fLastBreakTagValid = FALSE;
-    if (fData->fReverseTable == NULL) {
+    if (statetable == NULL) {
        return fText->setToStart();
    }

@ -938,7 +1004,7 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
    RBBIStateTableRow *row;

    row = (RBBIStateTableRow *)
-        (this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
+        (statetable->fTableData + (state * statetable->fRowLen));
    UTRIE_GET16(&fData->fTrie, c, category);
    if ((category & 0x4000) != 0)  {
        fDictionaryCharCount++;
@ -954,8 +1020,7 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
        // if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
        if (hasPassedStartText) { 
            // if we have already considered the start of the text
-            if (fData->fLookAheadHardBreak == TRUE 
-                && row->fLookAhead != 0) {
+            if (row->fLookAhead != 0 && lookaheadResult == 0) {
                result = 0;
            }
            break;
@ -987,7 +1052,7 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
        // look up a state transition in the backwards state table
        state = row->fNextState[category];
        row = (RBBIStateTableRow *)
-            (this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
+            (statetable->fTableData + (state * statetable->fRowLen));
    
        if (row->fAccepting == -1) {
            // Match found, common case, could have lookahead so we move on to check it
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@ -30,6 +30,7 @@ struct RBBIDataHeader;
 class  RuleBasedBreakIteratorTables;
 class  BreakIterator;
 class  RBBIDataWrapper;
+struct RBBIStateTable;



@ -480,9 +481,21 @@ private:
     * The various calling methods then iterate forward from this safe position to
     * the appropriate position to return.  (For more information, see the description
     * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
+     * @param statetable state table used for moving backwards
     * @internal
     */
-    int32_t handleNewPrevious(void);
+    int32_t handlePrevious(const RBBIStateTable *statetable);
+
+    /**
+     * This method is the actual implementation of the next() method.  All iteration
+     * vectors through here.  This method initializes the state machine to state 1
+     * and advances through the text character by character until we reach the end
+     * of the text or the state machine transitions to state 0.  We update our return
+     * value every time the state machine passes through a possible end state.
+     * @param statetable state table used for moving forwards
+     * @internal
+     */
+    int32_t handleNext(const RBBIStateTable *statetable);
 };

 //------------------------------------------------------------------------------
--- a/icu4c/source/data/brkitr/char.txt
+++ b/icu4c/source/data/brkitr/char.txt
@ -50,4 +50,12 @@ $BackOneCluster;

 !!safe_reverse;

-$BackOneCluster; 
+# rule 6, 7, 8
+$V+ $L;
+ 
+## -------------------------------------------------
+
+!!safe_forward;
+
+# rule 6, 7, 8
+$V+ $T;
--- a/icu4c/source/data/brkitr/line.txt
+++ b/icu4c/source/data/brkitr/line.txt
@ -341,7 +341,7 @@ $CM* $ALPlus $CM+ / $LB5Breaks;
 !!safe_reverse;

 # LB 7
-$CM* [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
 $CM+ $SP / .;

 # LB 9
@ -362,17 +362,17 @@ $CL $CM* ($NU | $IS);
 !!safe_forward;

 # LB 7
-[^$BK $CR $LF $NL $ZW $SP] $CM*;
-$SP $CM+ / .;
+[^$BK $CR $LF $NL $ZW $SP] $CM+;
+$SP $CM+ / [^$CM];

 # LB 9
-$OP $CM* $SP*;
+$OP $CM* $SP+;

 # LB 10
-$QU $CM* $SP*;
+$QU $CM* $SP+;

 # LB 11
-$CL $CM* $SP*;
+$CL $CM* $SP+;

 # LB 18
 $HY $CM* $NU; 
--- a/icu4c/source/data/brkitr/sent.txt
+++ b/icu4c/source/data/brkitr/sent.txt
@ -9,7 +9,6 @@
 #      These rules are based on TR 29 version 4.0.0
 #
    
-!!chain;

 #
 # Character categories as defined in TR 29
@ -31,85 +30,79 @@ $Term  = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
 $Close   = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
           [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
           
-$Extend  = [[:Grapheme_Extend = TRUE:]]; 
+           

-$ATermEx = $ATerm $Extend*;
-$NumericEx = $Numeric $Extend*;
-$UpperEx = $Upper $Extend*;
-$CloseEx = $Close $Extend*;
-$SpEx = $Sp $Extend*;
-$LowerEx = $Lower $Extend*;
-$TermEx = $Term $Extend*;
+# Define extended forms of the character classes,
+#   incorporate grapheme cluster + format chars.

-# rule 6
+$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+$ATermEx    = $ATerm   $Extend* $Format*;
+$NumericEx  = $Numeric $Extend* $Format*;
+$UpperEx    = $Upper   $Extend* $Format*;
+$TermEx     = $Term    $Extend* $Format*;

-$ATermEx $Format* $NumericEx;         
+#
+#  $SepSeq keeps together CRLF as a separator.  (CRLF is a grapheme cluster)
+#
+$SepSeq  = $Sep | \u000d\u000a;

-# rule 7
+# $InteriorChars are those that never trigger a following break.
+$InteriorChars = [^$Term $ATerm $Sep];   #Note:  includes Extend and Format chars

-$UpperEx $ATermEx $Format* $UpperEx;
+## -------------------------------------------------

-# rule 8
+!!forward;

-$ATermEx $Format* $CloseEx* $Format* $SpEx $Format* 
-    [^$OLetter $Upper $Lower $Sep]* $Extend* $Format* $LowerEx;
-
-# rule 9 forced to exit by / [^$Close $Sp]
-
-($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* $Sep;
-($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($CloseEx | $SpEx) / [^$Close $Sp];
-
-# rule 10 forced to exit by / [^$Sp];
+# Rule 6.  Match an ATerm (.) that does not cause a break because a number immediately follows it.
+$NumberFollows = $InteriorChars* $ATermEx $NumericEx;


-($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $Sep;
-($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $SpEx / [^$Sp];
+# Rule 7.  $UppersSurround   Match a no-break sentence fragment containing a . surrounded by Uppers
+$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;

+# Rule 8   Matches a sentence fragment containing "." that should not cause a sentence break,
+#          because a lower case word follows the period.
+$LowerWordFollows  = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;

-# rule 11 partly included in rule 9 and 10
-$TermEx;
-$ATermEx;
+# Rules 3, 9, 10, 11
+#                       Matches a simple sentence, or the trailing part of a complex sentence,
+#                       where a simple sentence contains no interior "."s.
+$TermEndSequence   = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
+$EndSequence       = $InteriorChars* $SepSeq?;

-# rule 12
-
-([^$Term $ATerm $Sep] $Extend*)+;
-([^$Term $ATerm $Sep] $Extend* $Format*)+ ($Term | $ATerm | $Sep);
+# Put them all together.  
+($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $TermEndSequence{0};   # status = UBRK_SENTENCE_TERM
+($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $EndSequence{100};     # status = UBRK_SENTENCE_SEP
+    
+## -------------------------------------------------

+!!reverse;
+ 
 #
 #  Reverse Rules
 #
+$EndGorp                  = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
+$RevEndSequence           = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
+$ReverseLowerWordFollows  = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
+$ReverseUpperSurround     = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
+$ReverseNumberFollows     = $Numeric $Format* $Extend* $ATerm $InteriorChars*;

-$BackATermEx = $Extend* $ATerm;
-$BackNumericEx = $Extend* $Numeric;
-$BackUpperEx = $Extend* $Upper;
-$BackCloseEx = $Extend* $Close;
-$BackSpEx = $Extend* $Sp;
-$BackLowerEx = $Extend* $Lower;
-$BackTermEx = $Extend* $Term;
+$RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;

-# rule 3 
-
-! $Sep .;
-
-# rule 6
-
-! $BackNumericEx $Format* $BackATermEx;         
+## -------------------------------------------------

+## !!safe_reverse;
+ 
 # rule 7
+## $Extend* $ATerm $Format* $Extend* $Upper;

-! $BackUpperEx $Format* $BackATermEx $BackUpperEx;
+# rule 11
+## ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm);
+
+## -------------------------------------------------
+
+!!safe_forward;

 # rule 8

-! $BackLowerEx $Format* $Extend* [^$OLetter $Upper $Lower $Sep]* $Format* 
-    $BackSpEx $Format* $BackCloseEx* $Format* $BackATermEx;
-
-# rules 9, 10, 11, 12
-
-$Any = [^$Term $ATerm $Sep];
-$Safe = [^$Term $ATerm $Sep $Sp $Close];
-$BackEnd = ($BackSpEx $Format*)* ($BackCloseEx $Format*)* ($BackTermEx | $BackATermEx);
-! $BackEnd;
-! $BackEnd? $Any* $Safe;
-! $BackEnd? $Any* $Close / ($BackSpEx $Format*)+ ($BackTermEx | $BackATermEx);
-! $BackEnd? $Any* $Sp / $Sep; 
+## $Lower .;
--- a/icu4c/source/data/brkitr/word.txt
+++ b/icu4c/source/data/brkitr/word.txt
@ -173,5 +173,42 @@ $BackKatakanaEx $Format* $BackKatakanaEx;

 !!safe_reverse;

-$Extend* [^$Extend];
-$BackACMLetterEx / $Format;
+# rule 3
+$Extend+ [^$Extend];
+
+# rule 4
+$Format+ $BackABaseLetterEx;
+$Format+ $BackACMLetterEx / $Format;
+$Format+ $BackNumericEx;
+$Format+ $BackMidLetterEx;
+$Format+ $BackMidNumLetEx;
+$Format+ $BackMidNumEx;
+$Format+ $BackKatakanaEx;
+
+# rule 6
+($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
+($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Format;
+
+# rule 11
+($MidNum | $MidNumLet) $Format* $BackNumericEx;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# rule 3
+$Extend+;
+
+# rule 4
+$Format+ $ALetterEx;
+$Format+ $NumericEx;
+$Format+ $MidLetterEx;
+$Format+ $MidNumLetEx;
+$Format+ $MidNumEx;
+$Format+ $KatakanaEx;
+
+# rule 6
+($MidLetter | $MidNumLet) $Format* $ALetterEx;
+
+# rule 11
+($MidNum | $MidNumLet) $Format* $NumericEx;
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -589,7 +589,6 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
             if(exec) TestWordBoundary();                 break;
        case 14: name = "TestLineBreaks";
             if(exec) TestLineBreaks();                   break;
-        /***
        case 15: name = "TestSentBreaks";
             if(exec) TestSentBreaks();                 break;
        case 16: name = "TestExtended";
@ -603,7 +602,6 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
 #endif
             }
             break;
-        ***/
        default: name = ""; break; //needed to end loop
    }
 }
@ -3005,6 +3003,74 @@ static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t d
 }
 #endif

+/**
+ * Runs a break iterator over a string and cross-checks the different ways of
+ * reaching its boundaries against a list of expected positions:
+ *   1. first()/next() must produce exactly the expected positions;
+ *   2. isBoundary() must be true at each expected position (the final one is
+ *      not probed) and false at every offset strictly between two of them;
+ *   3. last()/previous() must replay the forward positions in reverse;
+ *   4. preceding(j) must return expected[i] for expected[i] < j <= expected[i+1].
+ * Failures are reported through test->errln().
+ *
+ * @param test           test object that receives the error reports
+ * @param ustr           text to iterate over; installed on bi with setText()
+ * @param bi             break iterator under test
+ * @param expected       expected boundary positions (the loops below assume
+ *                       they are in ascending order)
+ * @param expectedcount  number of entries in expected
+ */
+static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
+                                    BreakIterator *bi,
+                                    int expected[],
+                                    int expectedcount)
+{
+    int count = 0;
+    int i = 0;
+    int forward[20];
+    // number of slots in forward[]; every write below is checked against it
+    const int forwardcapacity = (int)(sizeof(forward) / sizeof(forward[0]));
+    bi->setText(ustr);
+    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
+        if (count >= forwardcapacity) {
+            // the iterator produced more boundaries than forward[] can hold;
+            // report it instead of writing past the end of the array
+            printStringBreaks(ustr, expected, expectedcount);
+            test->errln("break forward test failed: more than %d boundaries found",
+                        forwardcapacity);
+            return;
+        }
+        forward[count] = i;
+        if (count < expectedcount && expected[count] != i) {
+            test->errln("break forward test failed: expected %d but got %d",
+                        expected[count], i);
+            break;
+        }
+        count ++;
+    }
+    if (count != expectedcount) {
+        printStringBreaks(ustr, expected, expectedcount);
+        test->errln("break test failed: missed %d match",
+                    expectedcount - count);
+        return;
+    }
+    // testing boundaries
+    for (i = 1; i < expectedcount; i ++) {
+        int j = expected[i - 1];
+        if (!bi->isBoundary(j)) {
+            printStringBreaks(ustr, expected, expectedcount);
+            test->errln("Expected boundary at position %d", j);
+            return;
+        }
+        // no offset strictly inside the segment may be reported as a boundary
+        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
+            if (bi->isBoundary(j)) {
+                printStringBreaks(ustr, expected, expectedcount);
+                test->errln("Not expecting boundary at position %d", j);
+                return;
+            }
+        }
+    }
+
+    // walk backwards; count starts at the number of forward boundaries and
+    // must reach exactly zero when previous() runs out
+    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
+        if (count <= 0) {
+            // previous() produced more boundaries than the forward pass did;
+            // indexing forward[] any further would read before its start
+            printStringBreaks(ustr, expected, expectedcount);
+            test->errln("happy break test reverse failed: unexpected extra boundary %d",
+                        i);
+            return;
+        }
+        count --;
+        if (forward[count] != i) {
+            test->errln("happy break test reverse failed: expected %d but got %d",
+                        forward[count], i);
+            break;
+        }
+    }
+    if (count != 0) {
+        printStringBreaks(ustr, expected, expectedcount);
+        test->errln("happy break test failed: missed a match");
+        return;
+    }
+
+    // testing preceding
+    for (i = 0; i < expectedcount - 1; i ++) {
+        int j = expected[i] + 1;
+        for (; j <= expected[i + 1]; j ++) {
+            if (bi->preceding(j) != expected[i]) {
+                printStringBreaks(ustr, expected, expectedcount);
+                test->errln("Not expecting backwards boundary at position %d", j);
+                return;
+            }
+        }
+    }
+}
+
 void RBBITest::TestWordBreaks(void)
 {
    // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
@ -3015,6 +3081,7 @@ void RBBITest::TestWordBreaks(void)
    UChar         str[25]; 
    char          *strlist[] = 
    {
+    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
    "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
    "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
@ -3051,13 +3118,13 @@ void RBBITest::TestWordBreaks(void)
    };
    int loop;
    for (loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
+        // printf("looping %d\n", loop);
        u_unescape(strlist[loop], str, 25);
        UnicodeString ustr(str);
        // RBBICharMonkey monkey;
        RBBIWordMonkey monkey;

        int expected[20];
-        int forward[20];
        int expectedcount = 0;

        monkey.setText(ustr);
@ -3066,33 +3133,7 @@ void RBBITest::TestWordBreaks(void)
            expected[expectedcount ++] = i;
        }

-        int count = 0;
-        bi->setText(ustr);
-        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
-            forward[count] = i;
-            if (count > 20 || expected[count] != i) {
-                 errln("happy break forward test failed: expected %d but got %d", 
-                       expected[count], i);
-            }
-            count ++;
-        }
-        if (count != expectedcount) {
-            printStringBreaks(ustr, expected, expectedcount);
-            errln("happy break test failed: missed a match");
-            break;
-        }
-        for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
-            count --;
-            if (forward[count] != i) {
-                printStringBreaks(ustr, expected, expectedcount);
-                errln("happy break test reverse failed: expected %d but got %d", 
-                      forward[count], i);
-                break;
-            }
-        }
-        if (count != 0) {
-            errln("happy break test failed: missed a match");
-        }
+        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
    }
 }

@ -3105,7 +3146,9 @@ void RBBITest::TestWordBoundary(void)
    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
    UChar         str[20]; 
    char          *strlist[] = 
-    {"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
+    {
+    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
+    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
    "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
    "\\u2027\\U000e0067\\u0a47\\u00b7",
    "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
@ -3136,6 +3179,7 @@ void RBBITest::TestWordBoundary(void)
    };
    int loop;
    for (loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
+        // printf("looping %d\n", loop);
        u_unescape(strlist[loop], str, 20);
        UnicodeString ustr(str);
        int forward[20];
@ -3153,7 +3197,7 @@ void RBBITest::TestWordBoundary(void)
                        printStringBreaks(ustr, forward, count);
                        errln("happy boundary test failed: expected %d not a boundary", 
                               j);
-                        break;
+                        return;
                    }
                }
            }
@ -3161,7 +3205,7 @@ void RBBITest::TestWordBoundary(void)
                printStringBreaks(ustr, forward, count);
                errln("happy boundary test failed: expected %d a boundary", 
                       i);
-                break;
+                return;
            }
            prev = i;
        }
@ -3176,6 +3220,9 @@ void RBBITest::TestLineBreaks(void)
    UChar         str[20]; 
    char          *strlist[] = 
    {
+     "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
+     "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
+     "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
@ -3207,7 +3254,6 @@ void RBBITest::TestLineBreaks(void)
        RBBILineMonkey monkey;

        int expected[20];
-        int forward[20];
        int expectedcount = 0;

        monkey.setText(ustr);
@ -3216,35 +3262,7 @@ void RBBITest::TestLineBreaks(void)
            expected[expectedcount ++] = i;
        }

-        int count = 0;
-        bi->setText(ustr);
-        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
-            forward[count] = i;
-            if (count < expectedcount && expected[count] != i) {
-                 errln("happy break forward test failed: expected %d but got %d", 
-                       expected[count], i);
-            }
-            count ++;
-        }
-        if (count != expectedcount) {
-            printStringBreaks(ustr, expected, expectedcount);
-            errln("happy break test failed: missed %d match", 
-                  expectedcount - count);
-            break;
-        }
-         for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
-            count --;
-            if (forward[count] != i) {
-                printStringBreaks(ustr, expected, expectedcount);
-                errln("happy break test reverse failed: expected %d but got %d", 
-                      forward[count], i);
-                break;
-            }
-        }
-        if (count != 0) {
-            errln("happy break test failed: missed a match");
-            break;
-        }
+        testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
    }
 }

@ -3266,12 +3284,10 @@ void RBBITest::TestSentBreaks(void)
     "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
    };
    int loop;
+    int forward[100];
    for (loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
-        printf("looping %d\n", loop);
        u_unescape(strlist[loop], str, 100);
        UnicodeString ustr(str);
-        
-        int forward[20];

        int count = 0;
        bi->setText(ustr);