ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
2025-04-10 07:39:16 +00:00 · 2005-03-23 02:13:53 +00:00 · 2005-03-23 02:13:53 +00:00 · cd85b65d35
commit cd85b65d35
parent 475c03442f
9 changed files with 608 additions and 463 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -982,12 +982,8 @@ continueOn:
 //
 //  handlePrevious()
 //
-//      This method backs the iterator back up to a "safe position" in the text.
-//      This is a position that we know, without any context, may be any position
-//      not more than 2 breaks away. Occasionally, the position may be less than
-//      one break away.
-//      The various calling methods then iterate forward from this safe position to
-//      the appropriate position to return.
+//      Iterate backwards, according to the logic of the reverse rules.
+//      This version handles the exact style backwards rules.
 //
 //      The logic of this function is very similar to handleNext(), above.
 //
@ -1005,14 +1001,12 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)

    int32_t            state              = START_STATE;
    int32_t            category;
-    int32_t            lastCategory       = 0;
    UBool              hasPassedStartText = !fText->hasPrevious();
    UChar32            c                  = fText->previous32();
    // previous character
    int32_t            result             = fText->getIndex();
    int32_t            lookaheadStatus    = 0;
    int32_t            lookaheadResult    = 0;
-    int32_t            lookaheadTagIdx    = 0;
    UBool              lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;

    RBBIStateTableRow *row;
@ -1031,20 +1025,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)

    // loop until we reach the beginning of the text or transition to state 0
    for (;;) {
-        // if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
        if (hasPassedStartText) {
-            // if we have already considered the start of the text
-            if (row->fLookAhead != 0 && lookaheadResult == 0) {
-                result = 0;
-            }
-            break;
+            // end of input is hardwired by rule builder as category #1.
+            category = 1;
+        } else {
+            //  look up the current character's category
+            UTRIE_GET16(&fData->fTrie, c, category);
        }

-        // save the last character's category and look up the current
-        // character's category
-        lastCategory = category;
-        UTRIE_GET16(&fData->fTrie, c, category);
-
        // Check the dictionary bit in the character's category.
        //    Counter is only used by dictionary based iterators.
        //
@ -1073,8 +1061,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
        if (row->fAccepting == -1) {
            // Match found, common case, could have lookahead so we move on to check it
            result = fText->getIndex();
-            /// added
-            fLastRuleStatusIndex   = row->fTagIdx;   // Remember the break status (tag) value.
        }

        if (row->fLookAhead != 0) {
@ -1083,7 +1069,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
                // Lookahead match is completed.  Set the result accordingly, but only
                // if no other rule has matched further in the mean time.
                result               = lookaheadResult;
-                fLastRuleStatusIndex = lookaheadTagIdx;
                lookaheadStatus      = 0;
                /// i think we have to back up to read the lookahead character again
                /// fText->setIndex(lookaheadResult);
@ -1100,7 +1085,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
                    fText->setIndex(result);
                    return result;
                }
-                category = lastCategory;
                fText->setIndex(result);

                goto continueOn;
@ -1109,7 +1093,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
            int32_t    r         = fText->getIndex();
            lookaheadResult      = r;
            lookaheadStatus      = row->fLookAhead;
-            fLastRuleStatusIndex = row->fTagIdx;
            goto continueOn;
        }

@ -1119,21 +1102,33 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
            goto continueOn;
        }

-        lookaheadStatus = 0;     // clear out any pending look-ahead matches.
+
+        // This is a plain (non-look-ahead) accepting state
+        if (!lookAheadHardBreak) {
+            lookaheadStatus = 0;     // clear out any pending look-ahead matches.
+                                     //  But only if not doing the lookAheadHardBreak option,
+                                     //  which needs to force a break no matter what is going
+                                     //  on with the rest of the match, i.e. we can't abandon
+                                     //  a partially completed look-ahead match because some
+                                     //  other rule matched further than the '/' position
+                                     //  in the look-ahead match.
+        }

 continueOn:
        if (state == STOP_STATE) {
            break;
        }

-        // then advance one character backwards
+        if (hasPassedStartText) {
+            break;
+        }
+
+        // Advance one character backwards
        hasPassedStartText = !fText->hasPrevious();
        c = fText->previous32();
    }

-    // Note:  the result postion isn't what is returned to the user by previous(),
-    //        but where the implementation of previous() turns around and
-    //        starts iterating forward again.
+
    fText->setIndex(result);

    return result;
--- a/icu4c/source/common/rbbisetb.cpp
+++ b/icu4c/source/common/rbbisetb.cpp
@ -3,7 +3,7 @@
 //
 /*
 ***************************************************************************
-*   Copyright (C) 2002-2004 International Business Machines Corporation   *
+*   Copyright (C) 2002-2005 International Business Machines Corporation   *
 *   and others. All rights reserved.                                      *
 ***************************************************************************
 */
@ -147,7 +147,7 @@ void RBBISetBuilder::build() {
    //  Find the set of non-overlapping ranges of characters
    //
    int  ni;
-    for (ni=0; ; ni++) {
+    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
        if (usetNode==NULL) {
            break;
@ -222,6 +222,10 @@ void RBBISetBuilder::build() {
    //    The groups are numbered, and these group numbers are the set of
    //    input symbols recognized by the run-time state machine.
    //
+    //    Numbering: # 0  (state table column 0) is unused.
+    //               # 1  is reserved - table column 1 is for end-of-input
+    //               # 2  is the first range list.
+    //
    RangeDescriptor *rlSearchRange;
    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
@ -232,12 +236,32 @@ void RBBISetBuilder::build() {
        }
        if (rlRange->fNum == 0) {
            fGroupCount ++;
-            rlRange->fNum = fGroupCount;
+            rlRange->fNum = fGroupCount+1; 
            rlRange->setDictionaryFlag();
-            addValToSets(rlRange->fIncludesSets, fGroupCount);
+            addValToSets(rlRange->fIncludesSets, fGroupCount+1);
        }
    }

+    // Handle input sets that contain the special string {eof}.
+    //   Column 1 of the state table is reserved for EOF on input.
+    //   Add this column value (1) to the equivalent expression
+    //     subtree for each UnicodeSet that contains the string {eof}
+    //   Because EOF is not a character in the normal sense, it doesn't
+    //   affect the computation of ranges or TRIE.
+    static UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
+    UnicodeString eofString(eofUString);
+    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
+        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
+        if (usetNode==NULL) {
+            break;
+        }
+        UnicodeSet      *inputSet = usetNode->fInputSet;
+        if (inputSet->contains(eofString)) {
+            addValToSet(usetNode, 1);
+        }
+    }
+
+
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}

@ -296,7 +320,7 @@ void RBBISetBuilder::serializeTrie(uint8_t *where) {
 //------------------------------------------------------------------------
 //
 //  addValToSets     Add a runtime-mapped input value to each uset from a
-//                   list of uset nodes.
+//                   list of uset nodes. (val corresponds to a state table column.)
 //                   For each of the original Unicode sets - which correspond
 //                   directly to uset nodes - a logically equivalent expression
 //                   is constructed in terms of the remapped runtime input
@ -312,35 +336,38 @@ void  RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {

    for (ix=0; ix<sets->size(); ix++) {
        RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
-        RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
-        leafNode->fVal = (unsigned short)val;
-        if (usetNode->fLeftChild == NULL) {
-            usetNode->fLeftChild = leafNode;
-            leafNode->fParent    = usetNode;
-        } else {
-            // There are already input symbols present for this set.
-            // Set up an OR node, with the previous stuff as the left child
-            //   and the new value as the right child.
-            RBBINode *orNode = new RBBINode(RBBINode::opOr);
-            orNode->fLeftChild  = usetNode->fLeftChild;
-            orNode->fRightChild = leafNode;
-            orNode->fLeftChild->fParent  = orNode;
-            orNode->fRightChild->fParent = orNode;
-            usetNode->fLeftChild = orNode;
-            orNode->fParent = usetNode;
-        }
+        addValToSet(usetNode, val);
+    }
+}
+
+void  RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
+    RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
+    leafNode->fVal = (unsigned short)val;
+    if (usetNode->fLeftChild == NULL) {
+        usetNode->fLeftChild = leafNode;
+        leafNode->fParent    = usetNode;
+    } else {
+        // There are already input symbols present for this set.
+        // Set up an OR node, with the previous stuff as the left child
+        //   and the new value as the right child.
+        RBBINode *orNode = new RBBINode(RBBINode::opOr);
+        orNode->fLeftChild  = usetNode->fLeftChild;
+        orNode->fRightChild = leafNode;
+        orNode->fLeftChild->fParent  = orNode;
+        orNode->fRightChild->fParent = orNode;
+        usetNode->fLeftChild = orNode;
+        orNode->fParent = usetNode;
    }
 }


-
 //------------------------------------------------------------------------
 //
-//   getNumOutputSets
+//   getNumCharCategories
 //
 //------------------------------------------------------------------------
 int32_t  RBBISetBuilder::getNumCharCategories() const {
-    return fGroupCount + 1;
+    return fGroupCount + 2;
 }


--- a/icu4c/source/common/rbbisetb.h
+++ b/icu4c/source/common/rbbisetb.h
@ -2,7 +2,7 @@
 //  rbbisetb.h
 /*
 **********************************************************************
-*   Copyright (c) 2001-2004, International Business Machines
+*   Copyright (c) 2001-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@ -79,7 +79,8 @@ public:
    ~RBBISetBuilder();

    void     build();
-    void     addValToSets(UVector *sets, uint32_t val);
+    void     addValToSets(UVector *sets,      uint32_t val);
+    void     addValToSet (RBBINode *usetNode, uint32_t val);
    int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
                                   //    runtime state machine, which are the same as
                                   //    columns in the DFA state table
@ -110,8 +111,9 @@ private:
    // Groups correspond to character categories -
    //       groups of ranges that are in the same original UnicodeSets.
    //       fGroupCount is the index of the last used group.
-    //       The value is also the number of columns in the RBBI state table being compiled.
-    //       Index 0 is not used.  Funny counting.
+    //       fGroupCount+2 is also the number of columns in the RBBI state table being compiled.
+    //       State table column 0 is not used.  Column 1 is for end-of-input.
+    //       Column 2 is for the first group.  Funny counting.
    int32_t               fGroupCount;

    RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@ -4,7 +4,7 @@

 /*
 **********************************************************************
-*   Copyright (c) 2002-2004, International Business Machines
+*   Copyright (c) 2002-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@ -343,7 +343,7 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
    // get a list of all endmarker nodes.
    tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);

-    // get a list all leaf nodes 
+    // get a list of all leaf nodes
    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
@ -383,10 +383,12 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
        //        into the rule file.
        if (fRB->fLBCMNoChain) {
            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
-            U_ASSERT(c != -1);
-            ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
-            if (cLBProp == U_LB_COMBINING_MARK) {
-                continue;
+            if (c != -1) {
+                // c == -1 occurs with sets containing only the {eof} marker string.
+                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
+                if (cLBProp == U_LB_COMBINING_MARK) {
+                    continue;
+                }
            }
        }

@ -572,14 +574,29 @@ void     RBBITableBuilder::flagAcceptingStates() {
                // Any non-zero value for fAccepting means this is an accepting node.
                // The value is what will be returned to the user as the break status.
                // If no other value was specified, force it to -1.
-                sd->fAccepting = endMarker->fVal;
-                if (sd->fAccepting == 0) {
-                    sd->fAccepting = -1;
+
+                if (sd->fAccepting==0) {
+					// State hasn't been marked as accepting yet.  Do it now.
+                    sd->fAccepting = endMarker->fVal;
+                    if (sd->fAccepting == 0) {
+                        sd->fAccepting = -1;
+					}
                }
+                if (sd->fAccepting==-1 && endMarker->fVal != 0) {
+					// Both lookahead and non-lookahead accepting for this state.
+					// Favor the look-ahead.  Expedient for line break.
+					// TODO:  need a more elegant resolution for conflicting rules.
+					sd->fAccepting = endMarker->fVal;
+				}
+				    // implicit else:
+				    // if sd->fAccepting already had a value other than 0 or -1, leave it be.

                // If the end marker node is from a look-ahead rule, set
                //   the fLookAhead field or this state also.
                if (endMarker->fLookAheadEnd) {
+					// TODO:  don't change value if already set?
+					// TODO:  allow for more than one active look-ahead rule in engine.
+					//        Make value here an index to a side array in engine?
                    sd->fLookAhead = sd->fAccepting;
                }
            }
@ -644,7 +661,7 @@ void     RBBITableBuilder::flagTaggedStates() {
    }
    for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
        tagNode = (RBBINode *)tagNodes.elementAt(i);
-        
+
        for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
            if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
@ -686,9 +703,9 @@ void  RBBITableBuilder::mergeRuleStatusVals() {
        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
        fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
    }
-        
-    //    For each state 
-    for (n=0; n<fDStates->size(); n++) {     
+
+    //    For each state
+    for (n=0; n<fDStates->size(); n++) {
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
        UVector *thisStatesTagValues = sd->fTagVals;
        if (thisStatesTagValues == NULL) {
@ -704,7 +721,7 @@ void  RBBITableBuilder::mergeRuleStatusVals() {
        sd->fTagsIdx = -1;
        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
        int32_t  nextTagGroupStart = 0;
-        
+
        // Loop runs once per group of tags in the global list
        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
            thisTagGroupStart = nextTagGroupStart;
@ -718,21 +735,21 @@ void  RBBITableBuilder::mergeRuleStatusVals() {
            // The lengths match, go ahead and compare the actual tag values
            //    between this state and the group from the global list.
            for (i=0; i<thisStatesTagValues->size(); i++) {
-                if (thisStatesTagValues->elementAti(i) != 
+                if (thisStatesTagValues->elementAti(i) !=
                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
-                    // Mismatch.  
+                    // Mismatch.
                    break;
                }
            }
-            
+
            if (i == thisStatesTagValues->size()) {
                // We found a set of tag values in the global list that match
                //   those for this state.  Use them.
                sd->fTagsIdx = thisTagGroupStart;
-                break;   
+                break;
            }
        }
-        
+
        if (sd->fTagsIdx == -1) {
            // No suitable entry in the global tag list already.  Add one
            sd->fTagsIdx = fRB->fRuleStatusVals->size();
@ -1027,7 +1044,7 @@ void RBBITableBuilder::printRuleStatusTable() {

    RBBIDebugPrintf("index |  tags \n");
    RBBIDebugPrintf("-------------------\n");
-    
+
    while (nextRecord < tbl->size()) {
        thisRecord = nextRecord;
        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
@ -1057,7 +1074,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
    fTagVals   = NULL;
    fPositions = NULL;
    fDtran     = NULL;
-    
+
    fDtran     = new UVector(lastInputSymbol+1, *fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
--- a/icu4c/source/data/brkitr/line.txt
+++ b/icu4c/source/data/brkitr/line.txt
@ -14,7 +14,43 @@

 !!chain;
 !!LBCMNoChain;
+
+
 !!lookAheadHardBreak;
+#
+#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
+#                          and only used for the line break rules.
+#
+#           It is used in the implementation of the incredibly annoying rule LB 7c
+#           which says to treat any combining mark that is not attached to a base
+#           character as if it were of class AL  (alphabetic).
+#
+#           The problem occurs in the reverse rules.
+#
+#           Consider a sequence like the following, with correct breaks as shown
+#               LF  ID  CM  AL  AL
+#                  ^       ^       ^
+#           Then consider the sequence without the initial ID (ideographic)
+#                 LF  CM  AL  AL
+#                    ^           ^
+#           Our CM, which in the first example was attached to the ideograph,
+#           is now unattached, becomes an alpha, and joins in with the other
+#           alphas.
+#
+#           When iterating forwards, these sequences do not present any problems.
+#           When iterating backwards, we need to look ahead when encountering
+#           a CM to see whether it attaches to something further on or not.
+#           (Look-ahead in a reverse rule is looking towards the start)
+#
+#           If the CM is unattached, we need to force a break.
+#
+#           !!lookAheadHardBreak forces the run time state machine to
+#           stop immediately when a look ahead rule ( '/' operator) matches,
+#           and set the match position to that of the look-ahead operator,
+#           no matter what other rules may be in play at the time.
+#
+#           See rule LB 19 for an example.
+#

 $AI = [:LineBreak =  Ambiguous:];
 $AL = [:LineBreak =  Alphabetic:];
@ -60,7 +96,7 @@ $ZW = [:LineBreak =  ZWSpace:];
 #                               XX  (Unknown, unassigned)
 #                         as $AL  (Alphabetic)
 #
-$ALPlus = $AL | $AI | $SA | $XX;
+$ALPlus = [$AL $AI $SA $XX];

 #
 #  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
@ -87,7 +123,6 @@ $OPcm = $OP $CM*;
 $POcm = $PO $CM*;
 $PRcm = $PR $CM*;
 $QUcm = $QU $CM*;
-$SPcm = $SP $CM*;
 $SYcm = $SY $CM*;
 $WJcm = $WJ $CM*;

@ -120,45 +155,87 @@ $OP $CM+;
 $PO $CM+;
 $PR $CM+;
 $QU $CM+;
-$SP $CM+;
 $SY $CM+;
 $WJ $CM+;

+#
+# CAN_CM  is the set of characters that may combine with CM combining chars.
+#         Note that Linebreak UAX 14's concept of a combining char and the rules
+#         for what they can combine with are _very_ different from the rest of Unicode.
+#
+#         Note that $CM itself is left out of this set.  If CM is needed as a base
+#         it must be listed separately in the rule.
+#
+$CAN_CM  = [^$BK $CR $LF $NL $ZW $SP $CM];       # Bases that can   take CMs
+$CANT_CM = [ $BK $CR $LF $NL $ZW $SP $CM];       # Bases that can't take CMs

 #
-#  Rule LB 3
-$LB3Breaks = [$BK $CR $LF $NL];
-$LB3NonBreaks = [^$BK $CR $LF $NL];
-$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
+# AL_FOLLOW  set of chars that can unconditionally follow an AL
+#            Needed in rules where stand-alone $CM s are treated as AL.
+#            Chaining is disabled with CM because it causes other failures,
+#            so for this one case we need to manually list out longer sequences.
+#
+$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
+$AL_FOLLOW_CM   = [$CL $EX $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus];
+$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];

-$LB3NonBreaks?     $LB3Breaks {100};
-$LB5NonBreaks $CM* $LB3Breaks {100};
+
+#
+#  Rule LB 3    Mandatory (Hard) breaks.
+#
+$LB3Breaks    = [$BK $CR $LF $NL];
+$LB3NonBreaks = [^$BK $CR $LF $NL];
+
+$LB3NonBreaks?  $LB3Breaks {100};    # LB 3c  do not break before hard breaks.
+$CAN_CM $CM*    $LB3Breaks {100};
+$CM+            $LB3Breaks {100};
 $CR $LF {100};

 # LB 4         x SP
 #              x ZW
-$ZW [$SP $ZW];
-$LB5NonBreaks $CM* [$SP $ZW];
+$LB3NonBreaks [$SP $ZW];
+$CAN_CM $CM*  [$SP $ZW];
+$CM+          [$SP $ZW];

 # LB 5         Break after zero width space
-$LB5Breaks = [$LB3Breaks $ZW];
+$LB5Breaks    = [$LB3Breaks $ZW];
+$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];


-# LB 7     Combining marks.      $SP $CM needs to behave like $ID.
-#                                X   $CM needs to behave like X, where X is not $SP.   
+# LB 7     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL 
 #                                $CM not covered by the above needs to behave like $AL   
-#                                
-$LB5NonBreaks $CM+;    #  Stick together any combining sequences that don't match other rules.
+#                                See definition of $CAN_CM.

+$CAN_CM $CM+;              #  Stick together any combining sequences that don't match other rules.
+$CM+;
+
+#
 # LB 8
-$LB5NonBreaks $CM* $CL;
-$LB5NonBreaks $CM* $EX;
-$LB5NonBreaks $CM* $IS;
-$LB5NonBreaks $CM* $SY;
+#
+$LB5NonBreaks $CL;
+$CAN_CM $CM*  $CL;
+$CM+          $CL;              # by rule 7c, stand-alone CM behaves as AL

+$LB5NonBreaks $EX;
+$CAN_CM $CM*  $EX;
+$CM+          $EX;              # by rule 7c, stand-alone CM behaves as AL
+
+$LB5NonBreaks $IS;
+$CAN_CM $CM*  $IS;
+$CM+          $IS;              # by rule 7c, stand-alone CM behaves as AL
+
+$LB5NonBreaks $SY;
+$CAN_CM $CM*  $SY;
+$CM+          $SY;              # by rule 7c, stand-alone CM behaves as AL
+
+
+#
 # LB 9
-$OPcm $SP* .?;
-$OPcm $SP* $LB5NonBreaks $CM*;
+#
+$OPcm $SP* $CAN_CM $CM*;
+$OPcm $SP* $CANT_CM;
+
+$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 7c, stand-alone CM behaves as AL

 # LB 10
 $QUcm $SP* $OPcm;
@ -167,58 +244,71 @@ $QUcm $SP* $OPcm;
 $CLcm $SP* $NScm;

 # LB 11a
-($B2cm)+;
+$B2cm $SP* $B2cm;

 # LB 11b  Word Joiner
-$LB5NonBreaks $CM* $WJcm;
-$WJcm .?;
+#
+$CAN_CM $CM*  $WJcm;
+$LB5NonBreaks $WJcm;
+$CM+          $WJcm;
+
+$WJcm [^$CAN_CM];
+$WJcm $CAN_CM $CM*;

 # LB 12
-$LB12NonBreaks = [$LB5NonBreaks - $SP];
+$LB12NonBreaks = [$LB5NonBreaks - [$SP]];
+$LB12Breaks    = [$LB5Breaks $SP];

 # LB 13
 #         x GL
 $LB12NonBreaks $CM* $GLcm;
-$SP $CM+            $GLcm;                      # LB7a  SP CM+ behaves as ID
+$CM+                $GLcm;
+

 #         GL  x
-$GLcm .?;
+#
+$GLcm $LB12Breaks;
 $GLcm $LB12NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
-$GLcm $SP $CM+;               # SP CM+ behaves as ID
+                              #  TODO:  I don't think we need this rule.
+                              #         All but $CM will chain off of preceding rule.
+                              #         $GLcm will pick up the CM case by itself.

 # LB 14
 #         x QU
 $LB12NonBreaks $CM* $QUcm;
-$SP $CM+            $QUcm;                      # LB7a  SP CM+ behaves as ID
+$CM+                $QUcm;

 #         QU  x
 $QUcm .?;
 $QUcm $LB12NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
-$QUcm $SP $CM+;               # SP CM+ behaves as ID
+                              #  TODO:  I don't think this rule is needed.
+

 # LB 14a
+#        <break>  $CB
+#        $CB   <break>
+
 $LB14NonBreaks = [$LB12NonBreaks - $CB];
-$LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;

 # LB 15
-$LB14CanBreakAfter ($BAcm | $HYcm | $NScm);
-$BBcm [^$CB];
-$BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*;
+$LB14NonBreaks $CM* ($BAcm | $HYcm | $NScm); 
+
+$BBcm [^$CB];                                  #  $BB  x
+$BBcm $LB14NonBreaks $CM*;

 # LB 16
 $ALcm    $INcm;
 $CM+     $INcm;     #  by rule 7c, any otherwise unattached CM behaves as AL
 $IDcm    $INcm;
-$SP $CM+ $INcm;     # by rule 7a, $SP $CM behaves like ID
 $INcm    $INcm;
 $NUcm    $INcm;


 # $LB 17
-($IDcm | $SP $CM+) $POcm;
-$ALcm+ $NUcm;       # includes $LB19
-$CM+   $NUcm;       # Rule 7c
-$NUcm $ALcm+;
+$IDcm  $POcm;
+$ALcm  $NUcm;       # includes $LB19
+$CM+   $NUcm;       # Rule 7c, any otherwise unattached CM behaves as AL
+$NUcm  $ALcm;

 # LB 18
 $PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? $POcm?;
@ -237,7 +327,10 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);


 # LB 19
-$CM* $ALcm+;    # The $CM* is from rule 7C, and unattached CM is treated as AL
+$ALcm $ALcm;
+$CM+ $ALcm;      # The $CM+ is from rule 7C, and unattached CM is treated as AL
+
+# LB 19b
 $IScm $ALcm;

 #
@ -269,39 +362,86 @@ $CM+ $OP;
 $CM+ $PO;
 $CM+ $PR;
 $CM+ $QU;
-$CM+ $SP;
 $CM+ $SY;
 $CM+ $WJ;
+$CM+;
+ 
+
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break>  [CM]  [whatever]
+#  The CM needs to behave as an AL
+#
+$AL_FOLLOW $CM+ / (
+          [$BK $CR $LF $NL $ZW {eof}] |
+          $SP+ $CM+ $SP |
+          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB9 will match, need to suppress this break.
+                                               #  LB9 says     OP SP* x .        
+                                               #    becomes    OP SP* x AL
+                                               #    becomes    OP SP* x CM+ AL_FOLLOW
+                                               #
+                                               # Further note:  the $AL in [$AL {eof}] is only to work around
+                                               #                a rule compiler bug which complains about
+                                               #                empty sets otherwise.
+          
+#
+#  Sequences of the form  (shown forwards)
+#      [CANT_CM]  <break> [CM]  <break>  [PR]
+#  The CM needs to behave as an AL
+#  This rule is concerned about getting the second of the two <breaks> in place.
+#
+[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
+
+

 # LB 3

-$LB3Breaks $LB3NonBreaks;
-$LB3Breaks $CM* $LB5NonBreaks;
+$LB3Breaks [$LB3NonBreaks-$CM];
+$LB3Breaks $CM+ $CAN_CM;
 $LF $CR;

+
 # LB 4         x SP
 #              x ZW
-[$SP $ZW] $LB3NonBreaks;
-[$SP $ZW] $CM* $LB5NonBreaks;
+[$SP $ZW] [$LB3NonBreaks-$CM];
+[$SP $ZW] $CM+ $CAN_CM;

 # LB 5 Break after zero width space


 # LB 7 Combining marks.
-#    $SP $CM needs to behave like $ID.
-#    X   $CM needs to behave like X, where X is not $SP.
+#    X   $CM needs to behave like X, where X is not $SP or controls.
 #    $CM not covered by the above needs to behave like $AL
 # Stick together any combining sequences that don't match other rules.
-$CM+ $LB5NonBreaks;
+$CM+ $CAN_CM;

 # LB 8
-$CL $CM* $LB5NonBreaks;
-$EX $CM* $LB5NonBreaks;
-$IS $CM* $LB5NonBreaks;
-$SY $CM* $LB5NonBreaks;
+$CL $CM+ $CAN_CM;
+$EX $CM+ $CAN_CM;
+$IS $CM+ $CAN_CM;
+$SY $CM+ $CAN_CM;
+
+$CL [$LB5NonBreaks-$CM];
+$EX [$LB5NonBreaks-$CM];
+$IS [$LB5NonBreaks-$CM];
+$SY [$LB5NonBreaks-$CM];
+
+# Rule 9 & 8 together.
+#   This really wants to chain at the $CM+ (which is acting as an $AL)
+#   except for $CM chaining being disabled.
+[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP;  
+
+# LB 9    OP SP* x
+#
+$CM* $CAN_CM    $SP* $CM* $OP;
+     $CANT_CM   $SP* $CM* $OP;
+$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 7, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
+     
+     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
+$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
+$SY $CM $SP+ $OP;   # Experiment.  Remove.
+

-# LB 9
-$LB5NonBreaks $SP* $CM* $OP;

 # LB 10
 $CM* $OP $SP* $CM* $QU;
@ -310,48 +450,57 @@ $CM* $OP $SP* $CM* $QU;
 $CM* $NS $SP* $CM* $CL;

 # LB 11a
-($CM* $B2)+;
+$CM* $B2 $SP* $CM* $B2;

 # LB 11b
-$CM* $WJ $CM* $LB5NonBreaks;
-$CM* $LB5NonBreaks $CM* $WJ;
-. $CM* $WJ;
+$CM* $WJ $CM* $CAN_CM;
+$CM* $WJ      [$LB5NonBreaks-$CM];
+
+     $CANT_CM $CM* $WJ;
+$CM* $CAN_CM  $CM* $WJ;

 # LB 12

-# LB 14
-$CM* $GL $CM* $LB12NonBreaks;
-$CM* $GL $CM+ $SP;
-$CM* $LB5NonBreaks $CM* $GL;
+# LB 13
+#         x GL
+#
+$CM* $GL $CM* [$LB12NonBreaks-$CM];

-# LB 14
-$CM* $QU $CM* $LB12NonBreaks;
-$CM* $QU $CM+ $SP;    # CM+ SP behaves as ID
-$CM* $LB5NonBreaks $CM* $QU;
+#
+#     GL  x
+#
+$CANT_CM $CM* $GL;
+$CM* $CAN_CM $CM* $GL;

-# LB 14a
-$BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
+
+
+#
+# LB 14
+#
+$CM* $QU $CM* $CAN_CM;                                #   . x QU
+$CM* $QU      $LB12NonBreaks;
+
+
+$CM* $CAN_CM  $CM* $QU;                               #   QU x .
+     $CANT_CM $CM* $QU;

 # LB 15
-$CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter;
-($CM* ($BA | $HY | $NS))+ $CM+ / $LB5Breaks;
-[$CR $LF $BK $NL $ZW] $CM* $BB;
-$CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
+$CM* ($BA | $HY | $NS) $CM* [$LB14NonBreaks-$CM];     #  . x (BA | HY | NS)
+
+$CM* [$LB14NonBreaks-$CM] $CM* $BB;                   #  BB x .
+[^$CB] $CM* $BB;                                      # 
+
+

 # LB 16
 $CM* $IN $CM* $ALPlus;
-# by rule 7c, any otherwise unattached CM behaves as AL
-$CM* $IN $CM+ / $LB5Breaks;
-
-$CM* $IN $CM* ($ID | $CM $SP);
+$CM* $IN $CM* $ID;
 $CM* $IN $CM* $IN;
 $CM* $IN $CM* $NU;

 # $LB 17
-$CM* $PO $CM* ($ID | $CM $SP);
-$CM* $NU ($CM* $ALPlus)+; # includes $LB19
-$CM* $NU $CM+ / $LB5Breaks;        # Rule 7c
-
+$CM* $PO $CM* $ID;
+$CM* $NU $CM* $ALPlus;
 $CM* $ALPlus $CM* $NU;

 # LB 18
@ -371,13 +520,13 @@ $CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;

 # LB 19
 $CM* $ALPlus $CM* $ALPlus;
-# The $CM* is from rule 7C, and unattached CM is treated as AL
-$CM* $ALPlus $CM* $IS;
-$CM* $ALPlus $CM+ / $LB5Breaks;

-## problem state table can't handle lookahead when it is at the
-## start of the string, currently handled in the rbbi code
-## todo fix this
+
+# LB 19b
+$CM* $ALPlus $CM* $IS;
+
+
+

 ## -------------------------------------------------

@ -395,6 +544,7 @@ $SP+ $CM* $QU;

 # LB 11
 $SP+ $CM* $CL;
+$SP+ $CM* $B2;

 # LB 18
 ($CM* ($IS | $SY))+ $CM* $NU;
@ -404,18 +554,14 @@ $CL $CM* ($NU | $IS | $SY);

 !!safe_forward;

-# LB 7
-[^$BK $CR $LF $NL $ZW $SP] $CM+;
-$SP $CM+ / [^$CM];
+# Skip forward over all character classes that are involved in
+#   rules containing patterns with possibly more than one char
+#   of context.
+#
+#  It might be slightly more efficient to have specific rules
+#  instead of one generic one, but only if we could
+#  turn off rule chaining.  We don't want to move more
+#  than necessary.
+#
+[$CM $OP $QU $CL $B2 $PR $HY $SP]+ [^$CM $OP $QU $CL $B2 $PR $HY];

-# LB 9
-$OP $CM* $SP+;
-
-# LB 10
-$QU $CM* $SP+;
-
-# LB 11
-$CL $CM* $SP+;
-
-# LB 18
-$CM* $PRcm? ($OPcm | $HYcm)? $NU;
--- a/icu4c/source/data/brkitr/sent.txt
+++ b/icu4c/source/data/brkitr/sent.txt
@ -1,5 +1,5 @@
 #
-#   Copyright (C) 2002-2004, International Business Machines Corporation and others.
+#   Copyright (C) 2002-2005, International Business Machines Corporation and others.
 #       All Rights Reserved.
 #
 #   file:  sent.txt
@ -111,10 +111,12 @@ $End? $Join [$RULE12 - $Sp - $Close];

 # forces a break at the beginning of text "$Sp blah blah blah"
 # remember the break iterators takes the longest match
-$End? $Join $Sp / [^$Term $ATerm $Sp $Close];
+$NOT_T_A_S_C = [^$Term $ATerm $Sp $Close];
+$End? $Join $Sp / [$NOT_T_A_S_C {eof}];

 # forces a break at the beginning of text "$Close blah blah blah"
-$End? $Join $Close / [^$Term $ATerm $Close];
+$NOT_T_A_C = [^$Term $ATerm $Close];
+$End? $Join $Close / [$NOT_T_A_C {eof}];

 ## -------------------------------------------------

--- a/icu4c/source/data/brkitr/word.txt
+++ b/icu4c/source/data/brkitr/word.txt
@ -17,62 +17,30 @@

 !!chain;

-$Katakana  = [[:Script = KATAKANA:]
-			  [:name = VERTICAL KANA REPEAT MARK:]
-			  [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:]
-			  [:name = VERTICAL KANA REPEAT MARK UPPER HALF:]
-			  [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:]
-			  [:name = VERTICAL KANA REPEAT MARK LOWER HALF:]
-			  [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:]
-			  [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:]
-			  [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:]
-			  [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
-			  [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
-			  [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
-			  [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
-
-
-$ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
-						   - [:Ideographic:]
-						   - $Katakana
-						   - [:Script = Hiragana:]
-						   - [:Script = Thai:]
-						   - [:Script = Lao:]
-						   - [:Grapheme_Extend = TRUE:]];
-						   
-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
-			  [:name = HEBREW PUNCTUATION GERSHAYIM:]
-			  [:name = RIGHT SINGLE QUOTATION MARK:]
-			  [:name = HYPHENATION POINT:]
-			  [:name = COLON:]];
-
-			
-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = COLON:]];
-$Numeric   = [:LineBreak = Numeric:];
-$ExtendNumLet = [[:Connector_Punctuation:] 
-			- [:name = KATAKANA MIDDLE DOT:]
-			- [:name = HALFWIDTH KATAKANA MIDDLE DOT:]];
-			
-			

 #
 #  Character Class Definitions.
-#    The names are those from TR29.
 #

+$Format       = [\p{Word_Break = Format}];
+$Katakana     = [\p{Word_Break = Katakana}];
+$ALetter      = [\p{Word_Break = ALetter}];
+$MidLetter    = [\p{Word_Break = MidLetter}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{General_Category=Connector_Punctuation}];
+
+
 $CR             = \u000d;
 $LF             = \u000a;
-$Extend         = [[:Grapheme_Extend = TRUE:]];
-$Control        = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
-$Format         = [[:Cf:] - $Extend];
-$Hiragana       = [:Hiragana:];
-$Ideographic    = [:IDEOGRAPHIC:];
+$Extend         = [\p{Grapheme_Cluster_Break = Extend}];
+$Control        = [\p{Grapheme_Cluster_Break = Control}];

-$ALetterEx      = $ALetter     $Extend*;
-$NumericEx      = $Numeric     $Extend*;
-$MidNumEx       = $MidNum      $Extend*;
-$MidLetterEx    = $MidLetter   $Extend*;
-$KatakanaEx     = $Katakana    $Extend*;
+$KatakanaEx     = $Katakana     $Extend*;
+$ALetterEx      = $ALetter      $Extend*;
+$MidLetterEx    = $MidLetter    $Extend*;
+$MidNumEx       = $MidNum       $Extend*;
+$NumericEx      = $Numeric      $Extend*;
 $ExtendNumLetEx = $ExtendNumLet $Extend*;

 ## -------------------------------------------------
@ -81,7 +49,7 @@ $ExtendNumLetEx = $ExtendNumLet $Extend*;


 # Rule 3 - don't break grapheme clusters.
-# see character breaks
+#          see character breaks.

 $CR $LF;
 [^$Control] $Extend+;
@ -114,8 +82,9 @@ $NumericEx $Format* $MidNumEx $Format* $NumericEx {100};
 # rule 13

 $KatakanaEx  $Format* $KatakanaEx {300};
-$Hiragana    $Extend* {300};
-$Ideographic $Extend* {400};
+
+[\p{Hiragana}]    $Extend* {300};     # These two rules supply the {300} / {400} status tag values.
+[\p{Ideographic}] $Extend* {400};

 # rule 13a/b

--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -320,16 +320,16 @@ static void printStringBreaks(UnicodeString ustr, int expected[],
            j ++;
        }
        u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
-        printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c, 
-                           u_isUAlphabetic(c), 
+        printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c,
+                           u_isUAlphabetic(c),
                           u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
-                           u_isalnum(c), 
-                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 
-                                                  u_charType(c), 
-                                                  U_SHORT_PROPERTY_NAME), 
-                           u_getPropertyValueName(UCHAR_LINE_BREAK, 
-                                                  u_getIntPropertyValue(c, 
-                                                             UCHAR_LINE_BREAK), 
+                           u_isalnum(c),
+                           u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
+                                                  u_charType(c),
+                                                  U_SHORT_PROPERTY_NAME),
+                           u_getPropertyValueName(UCHAR_LINE_BREAK,
+                                                  u_getIntPropertyValue(c,
+                                                             UCHAR_LINE_BREAK),
                                                  U_SHORT_PROPERTY_NAME),
                           name);
    }
@ -390,9 +390,9 @@ void RBBITest::TestMixedThaiLineBreak()


    // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
-	// start
+    // start

-	ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
@ -406,9 +406,9 @@ void RBBITest::TestMixedThaiLineBreak()
    ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
-	ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
+    ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
    ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
-     
+
    // @suwit - end of changes

    // Arabic numerals should always be separated from surrounding Thai text
@ -449,7 +449,7 @@ ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u
        ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
 */

-    /*  remove the old data sample. 
+    /*  remove the old data sample.
    // The Unicode Linebreak TR says do not break before or after quotes.
    //    So this test is changed ot not break around the quote.
    //    TODO:  should Thai break around the around the quotes, like the original behavior here?
@ -517,21 +517,21 @@ void RBBITest::TestThaiWordBreak() {
    ADD_DATACHUNK(thaiWordSelection, NULL, 0, status);           // Break at start of data


-	// @suwit -- Thai sample data from GVT Guideline
-	// start
-	ADD_DATACHUNK(thaiWordSelection, "\\u0E2B\\u0E19\\u0E36\\u0E48\\u0E07", 0, status); //5
-	ADD_DATACHUNK(thaiWordSelection, "\\u0E04\\u0E33", 0, status); //7
-	ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E17\\u0E22", 0, status); //10
-	ADD_DATACHUNK(thaiWordSelection, "\\u0E2A\\u0E32\\u0E21\\u0E32\\u0E23\\u0E16", 0, status); //16
-	ADD_DATACHUNK(thaiWordSelection, "\\u0E1B\\u0E23\\u0E30\\u0E01\\u0E2D\\u0E1A", 0, status); //22
-	ADD_DATACHUNK(thaiWordSelection, "\\u0E14\\u0E49\\u0E27\\u0E22", 0, status);  //26
-	ADD_DATACHUNK(thaiWordSelection, "\\u0e2b\\u0e25\\u0e32\\u0e22", 0, status);  //30
-	ADD_DATACHUNK(thaiWordSelection, "\\u0e1e\\u0e22\\u0e32\\u0e07\\u0e04\\u0e4c", 0, status);  //36
+    // @suwit -- Thai sample data from GVT Guideline
+    // start
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E2B\\u0E19\\u0E36\\u0E48\\u0E07", 0, status); //5
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E04\\u0E33", 0, status); //7
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E17\\u0E22", 0, status); //10
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E2A\\u0E32\\u0E21\\u0E32\\u0E23\\u0E16", 0, status); //16
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E1B\\u0E23\\u0E30\\u0E01\\u0E2D\\u0E1A", 0, status); //22
+    ADD_DATACHUNK(thaiWordSelection, "\\u0E14\\u0E49\\u0E27\\u0E22", 0, status);  //26
+    ADD_DATACHUNK(thaiWordSelection, "\\u0e2b\\u0e25\\u0e32\\u0e22", 0, status);  //30
+    ADD_DATACHUNK(thaiWordSelection, "\\u0e1e\\u0e22\\u0e32\\u0e07\\u0e04\\u0e4c", 0, status);  //36

    // @suwit - end of changes

    /*  remove the old data sample because Thai translation of the Wizard of Oz is not good testcase for wordbreak API.
-			
+
    ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2
    ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5
    ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6
@ -598,11 +598,11 @@ void RBBITest::TestBug3818() {
    UErrorCode  status = U_ZERO_ERROR;

    // Four Thai words...
-    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 
-                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 
+    static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
+                                           0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
    UnicodeString  thaiStr(thaiWordData);

-    RuleBasedBreakIterator* bi = 
+    RuleBasedBreakIterator* bi =
        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
    if (U_FAILURE(status) || bi == NULL) {
        errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
@ -655,7 +655,7 @@ void RBBITest::TestJapaneseWordBreak() {
 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
 {
    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
-    
+
    switch (index) {
        case 0: name = "TestBug4153072";
            if(exec) TestBug4153072();                         break;
@ -708,6 +708,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
            if(exec) TestBug3818();                            break;
        case 19: name = "TestJapaneseWordBreak";
            if(exec) TestJapaneseWordBreak();                  break;
+        case 20: name = "TestDebug";
+            if(exec) TestDebug();                              break;

        default: name = ""; break; //needed to end loop
    }
@ -1890,9 +1892,8 @@ void RBBITest::TestLineBreakData() {
            int32_t expectedPos;         // Expected break position (index into test string)

            bi->setText(testString);
-            pos = bi->first();       // TODO:  break iterators always return a match at pos 0.
-            pos = bi->next();        //        Line Break TR says no match at position 0.
-                                     //        Resolve.
+            pos = bi->first();
+            pos = bi->next();

            for (; pos != BreakIterator::DONE; ) {
                expectedPos = expectedBreaks.elementAti(expectedI);
@ -2117,27 +2118,14 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),

    fSets          = new UVector(status);

-    fKatakanaSet   = new UnicodeSet("[\\p{script=KATAKANA}"
-        "\\u3031-\\u3035\\u309b\\u309c\\u30a0"
-        "\\u30fc\\uff70\\uff9e\\uff9f]", status);
-
-    const UnicodeString ALetterStr( "[[\\p{Alphabetic}"
-                                        "\\u00a0"         // NBSP
-                                        "\\u05f3]"        // Hebrew punct Geresh
-                                        "-[\\p{Ideographic}]"
-                                        "-[\\p{Script=Thai}]"
-                                        "-[\\p{Script=Lao}]"
-                                        "-[\\p{Script=Hiragana}]"
-                                        "-[\\p{Grapheme_Extend}]]");
-    fALetterSet    = new UnicodeSet(ALetterStr, status);
-    fALetterSet->removeAll(*fKatakanaSet);
-
-    fMidLetterSet  = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027\\u003a]", status);
-    fMidNumSet     = new UnicodeSet("[[\\p{Line_Break=Infix_Numeric}]-[\\u003a]]", status);
-    fNumericSet    = new UnicodeSet("[\\p{Line_Break=Numeric}]", status);
-    fFormatSet     = new UnicodeSet("[\\p{Format}-[\\u200c\\u200d]]", status);
-    fExtendSet     = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
-    fExtendNumLetSet = new UnicodeSet("[\\p{Pc}-[\\u30fb\\uff65]]", status);
+    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]",       status);
+    fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]",     status);
+    fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]",  status);
+    fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]",       status);
+    fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]",      status);
+    fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]",       status);
+    fExtendSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]", status);
+    fExtendNumLetSet = new UnicodeSet("[\\p{General_Category = Connector_Punctuation}]", status);
    fOtherSet      = new UnicodeSet();
    if(U_FAILURE(status)) {
      deferredStatus = status;
@ -2180,7 +2168,7 @@ void RBBIWordMonkey::setText(const UnicodeString &s) {
 int32_t RBBIWordMonkey::next(int32_t prevPos) {
    UErrorCode status = U_ZERO_ERROR;

-    int    p0, p1, p2, p3;    // Indices of the significant code points around the 
+    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

@ -2221,7 +2209,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
            U_ASSERT(U_SUCCESS(status));
            c3 = fText->char32At(p3);
        }
-        
+
        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
@ -2275,7 +2263,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {

        // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
        if ( fNumericSet->contains(c0) &&
-             fMidNumSet->contains(c1)  && 
+             fMidNumSet->contains(c1)  &&
            fNumericSet->contains(c2)) {
            continue;
        }
@ -2286,7 +2274,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
            fNumericSet->contains(c3)) {
            continue;
        }
-        
+
        // Rule (13)  Katakana x Katakana
        if (fKatakanaSet->contains(c1) &&
            fKatakanaSet->contains(c2))  {
@ -2301,7 +2289,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
             }

        // Rule 13b
-        if (fExtendNumLetSet->contains(c1) && 
+        if (fExtendNumLetSet->contains(c1) &&
                (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
                fKatakanaSet->contains(c2)))  {
                continue;
@ -2362,7 +2350,7 @@ public:
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
-    virtual  void     rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
+    virtual  void     rule7Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
 private:
    UVector      *fSets;

@ -2381,14 +2369,14 @@ private:
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHY;
-	UnicodeSet  *fH2;
-	UnicodeSet  *fH3;
+    UnicodeSet  *fH2;
+    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
-	UnicodeSet  *fJL;
-	UnicodeSet  *fJV;
-	UnicodeSet  *fJT;
+    UnicodeSet  *fJL;
+    UnicodeSet  *fJV;
+    UnicodeSet  *fJT;
    UnicodeSet  *fNS;
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
@ -2409,12 +2397,11 @@ private:
    int32_t              *fOrigPositions;

    RegexMatcher         *fNumberMatcher;
-    RegexMatcher         *fLB10Matcher;
    RegexMatcher         *fLB11Matcher;
 };


-RBBILineMonkey::RBBILineMonkey() 
+RBBILineMonkey::RBBILineMonkey()
 {
    UErrorCode  status = U_ZERO_ERROR;

@ -2439,7 +2426,7 @@ RBBILineMonkey::RBBILineMonkey()
    fCL    = new UnicodeSet("[\\p{Line_break=CL}]", status);
    fEX    = new UnicodeSet("[\\p{Line_break=EX}]", status);
    fIN    = new UnicodeSet("[\\p{Line_break=IN}]", status);
-	fJL    = new UnicodeSet("[\\p{Line_break=JL}]", status);
+    fJL    = new UnicodeSet("[\\p{Line_break=JL}]", status);
    fJV    = new UnicodeSet("[\\p{Line_break=JV}]", status);
    fJT    = new UnicodeSet("[\\p{Line_break=JT}]", status);
    fNS    = new UnicodeSet("[\\p{Line_break=NS}]", status);
@ -2460,8 +2447,6 @@ RBBILineMonkey::RBBILineMonkey()
    fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
    fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL

-
-
    fSets->addElement(fBK, status);
    fSets->addElement(fCR, status);
    fSets->addElement(fLF, status);
@ -2497,9 +2482,6 @@ RBBILineMonkey::RBBILineMonkey()
    fSets->addElement(fID, status);
    fSets->addElement(fWJ, status);
    fSets->addElement(fSA, status);
-    // fSets->addElement(fXX, status);
-
-

    fNumberMatcher = new RegexMatcher(
        "(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"
@ -2507,19 +2489,7 @@ RBBILineMonkey::RBBILineMonkey()
        "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
        "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
        "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
-        "(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?", 
-        0, status);
-
-    fLB10Matcher = new RegexMatcher(
-        "\\p{Line_Break=QU}\\p{Line_Break=CM}*"
-        "\\p{Line_Break=SP}*"
-        "(\\p{Line_Break=OP})\\p{Line_Break=CM}*", 
-        0, status);
-
-    fLB11Matcher = new RegexMatcher(
-        "\\p{Line_Break=CL}\\p{Line_Break=CM}*"
-        "\\p{Line_Break=SP}*"
-        "(\\p{Line_Break=NS})\\p{Line_Break=CM}*", 
+        "(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
        0, status);

    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
@ -2537,7 +2507,7 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
 }

 //
-//  rule67Adjust
+//  rule7Adjust
 //     Line Break TR rules 6 and 7 implementation.
 //     This deals with combining marks and other sequences that
 //     that must be treated as if they were something other than what they actually are.
@ -2546,21 +2516,20 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
 //     each potential break, once to the chars before the position being checked, then
 //     again to the text following the possible break.
 //
-void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
+void RBBILineMonkey::rule7Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
    if (pos == -1) {
-        // Invalid initial position.  Happens during the warmup iteration of the 
+        // Invalid initial position.  Happens during the warmup iteration of the
        //   main loop in next().
        return;
    }

    int32_t  nPos = *nextPos;
-    
-    
+
    // LB 7b  Keep combining sequences together.
    //  advance over any CM class chars.  Note that Line Break CM is different
-	//  from normal Mc general category.
-    if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a 
-        || *posChar==0x0d || *posChar==0x85)) {
+    //  from normal Mc general category.
+    if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
+          *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
        for (;;) {
            *nextChar = fText->char32At(nPos);
            if (!fCM->contains(*nextChar)) {
@ -2569,16 +2538,11 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
            nPos = fText->moveIndex32(nPos, 1);
        }
    }
-    
-    
-    // LB 7a In a SP CM* sequence, treat the SP as an ID
-    if (nPos != *nextPos && fSP->contains(*posChar)) {
-        *posChar = 0x4e00;   // 0x4e00 is a CJK Ideograph, linebreak type is ID.
-    }
-    
+
+
    // LB 7b Treat X CM* as if it were x.
-    //       No explicit action required.  
-    
+    //       No explicit action required.
+
    // LB 7c  Treat any remaining combining mark as AL
    if (fCM->contains(*posChar)) {
        *posChar = 0x41;   // thisChar = 'A';
@ -2635,11 +2599,27 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

-        // Break at end of text.
+        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

+        // Rule LB 7 - adjust for combining sequences.
+        //             We do this one out-of-order because the adjustment does not change anything
+        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
+        //             be applied.
+        rule7Adjust(prevPos, &prevChar, &pos,     &thisChar);
+        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
+        c = fText->char32At(nextPos);
+        rule7Adjust(pos,     &thisChar, &nextPos, &c);
+
+        // If the loop is still warming up - if we haven't shifted the initial
+        //   -1 positions out of prevPos yet - loop back to advance the
+        //    position in the input without any further looking for breaks.
+        if (prevPos == -1) {
+            continue;
+        }
+
        // LB 3a  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
@ -2661,33 +2641,6 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
                continue;
        }

-        // LB 10    QU SP* x OP
-        if (prevPos >= 0) {
-            UnicodeString  subStr10(*fText, prevPos);
-            fLB10Matcher->reset(subStr10);
-            status = U_ZERO_ERROR;
-            if (fLB10Matcher->lookingAt(status)) {  //   /QU CM* SP* (OP) CM*/;
-                // TODO:  Check status codes
-                pos      = prevPos + fLB10Matcher->start(1, status);
-                nextPos  = prevPos + fLB10Matcher->end(0, status);
-                thisChar = fText->char32At(pos);
-                continue;
-            }
-        }
-
-        // LB 11   CL SP* x NS
-        if (prevPos >= 0) {
-            UnicodeString  subStr11(*fText, prevPos);
-            fLB11Matcher->reset(subStr11);
-            status = U_ZERO_ERROR;
-            if (fLB11Matcher->lookingAt(status)) {  //   /QU CM* SP* (OP) CM*/;
-                // TODO:  Check status codes
-                pos      = prevPos + fLB11Matcher->start(1, status);
-                nextPos  = prevPos + fLB11Matcher->end(0, status);
-                thisChar = fText->char32At(pos);
-                continue;
-            }
-        }

        // LB 4  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
@ -2703,37 +2656,8 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
            break;
        }

-        // LB LB 7
-        rule67Adjust(prevPos, &prevChar, &pos,     &thisChar);
-        
-        nextCPPos = fText->moveIndex32(pos, 1);
-        nextPos   = nextCPPos;
-        c = fText->char32At(nextPos);
-        // another percularity of LB 4 - Dont break before space
-        if (fSP->contains(thisChar)) {
-            continue;
-        }
-        rule67Adjust(pos,     &thisChar, &nextPos, &c);
-
-        // If the loop is still warming up - if we haven't shifted the initial
-        //   -1 positions out of prevPos yet - loop back to advance the
-        //    position in the input without any further looking for breaks.
-        if (prevPos == -1) {
-            continue;
-        }
-
-        // Re-apply rules 3c, 4 because these could be affected by having
-        //                      a new thisChar from doing rule 6 or 7.
-        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||   // 3c
-            fBK->contains(thisChar)) {
-                continue;
-        }
-        if (fSP->contains(thisChar)) {    // LB 4
-            continue;
-        }
-        if (fZW->contains(thisChar)) {    // LB 4
-            continue;
-        }
+        // LB 7  Already done, at top of loop.
+        //


        // LB 8  Don't break before closings.
@ -2751,7 +2675,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we acually check for
        //           OP CM* SP*
-        //       Another Twist: The Rule 67 fixes may have changed a CP CM
+        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
@ -2769,12 +2693,58 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        }


-        // LB 11a        B2 x B2
-        if (fB2->contains(thisChar) && fB2->contains(prevChar)) {
-            continue;
+        // LB 10    QU SP* x OP
+        if (fSP->contains(prevChar) && fOP->contains(thisChar)) {
+            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
+            int tPos = prevPos;
+            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
+                tPos = fText->moveIndex32(tPos, -1);
+            }
+            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
+                tPos = fText->moveIndex32(tPos, -1);
+            }
+            if (fQU->contains(fText->char32At(tPos))) {
+                continue;
+            }
        }

-        // LB 11b   
+
+
+        // LB 11   CL SP* x NS
+        //    Scan backwards for SP* CM* CL
+        if (fNS->contains(thisChar)) {
+            int tPos = prevPos;
+            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
+                tPos = fText->moveIndex32(tPos, -1);
+            }
+            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
+                tPos = fText->moveIndex32(tPos, -1);
+            }
+            if (fCL->contains(fText->char32At(tPos))) {
+                continue;
+            }
+        }
+
+
+        // LB 11a        B2 SP* x B2
+        if (fB2->contains(thisChar)) {
+            //  Scan backwards, checking for the B2 CM* SP* sequence.
+            tPos = prevPos;
+            if (fSP->contains(prevChar)) {
+                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
+                    tPos=fText->moveIndex32(tPos, -1);
+                }
+            }
+            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
+                tPos=fText->moveIndex32(tPos, -1);
+            }
+            if (fB2->contains(fText->char32At(tPos))) {
+                continue;
+            }
+        }
+
+
+        // LB 11b
        //    x  WJ
        //    WJ  x
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
@ -2786,7 +2756,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
            break;
        }

-        // LB 13   
+        // LB 13
        //    x  GL
        //    GL  x
        if (fGL->contains(thisChar) || fGL->contains(prevChar)) {
@ -2805,7 +2775,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
            break;
        }

-        // LB 15 
+        // LB 15
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
@ -2818,28 +2788,27 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
            fID->contains(prevChar) && fIN->contains(thisChar) ||
            fIN->contains(prevChar) && fIN->contains(thisChar) ||
            fNU->contains(prevChar) && fIN->contains(thisChar) )   {
-            continue; 
+            continue;
        }


-        // LB 17    ID x PO    (Note:  Leading CM behaves like ID)
+        // LB 17    ID x PO
        //          AL x NU
        //          NU x AL
        if (fID->contains(prevChar) && fPO->contains(thisChar) ||
-            fCM->contains(prevChar) && fPO->contains(thisChar) || 
            fAL->contains(prevChar) && fNU->contains(thisChar) ||
            fNU->contains(prevChar) && fAL->contains(thisChar) )   {
-            continue; 
+            continue;
        }

        // LB 18    Numbers
-        UnicodeString  subStr18(*fText, prevPos);
-        fNumberMatcher->reset(subStr18);
-        if (fNumberMatcher->lookingAt(status)) {
-            // TODO:  Check status codes
+        if (fNumberMatcher->lookingAt(prevPos, status)) {
+            if (U_FAILURE(status)) {
+                break;
+            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
-            int32_t numEndIdx = prevPos + fNumberMatcher->end(status);  // idx of first char following num
+            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
@ -2847,7 +2816,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    nextPos = numEndIdx;
-                    pos = fCharBI->preceding(numEndIdx); 
+                    pos = fCharBI->preceding(numEndIdx);
                    thisChar = fText->char32At(pos);
                    while (fCM->contains(thisChar)) {
                        pos = fCharBI->preceding(pos);
@ -2861,29 +2830,28 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        if (fPR->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
        }
-
        if (fPR->contains(prevChar) && fID->contains(thisChar)) {
            continue;
        }

        // LB 18b
-		if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
-			                            fJV->contains(thisChar) ||
-										fH2->contains(thisChar) ||
-										fH3->contains(thisChar))) {
-											continue;
-										}
+        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
+                                        fJV->contains(thisChar) ||
+                                        fH2->contains(thisChar) ||
+                                        fH3->contains(thisChar))) {
+                                            continue;
+                                        }

        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
-			(fJV->contains(thisChar) || fJT->contains(thisChar))) {
-				continue;
+            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
+                continue;
        }

        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }
-			
+
        // LB 18c  more Korean
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
@ -2902,9 +2870,6 @@ int32_t RBBILineMonkey::next(int32_t startPos) {



-
-
- 
        // LB 19
        if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
            continue;
@ -2917,9 +2882,9 @@ int32_t RBBILineMonkey::next(int32_t startPos) {

        // LB 20    Break everywhere else
        break;
-            
+
    }
-    
+
    return pos;
 }

@ -2970,8 +2935,6 @@ RBBILineMonkey::~RBBILineMonkey() {

    delete fCharBI;
    delete fNumberMatcher;
-    delete fLB10Matcher;
-    delete fLB11Matcher;
 }


@ -3014,9 +2977,9 @@ static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t d
 }
 #endif

-static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 
+static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
                                    BreakIterator *bi,
-                                    int expected[], 
+                                    int expected[],
                                    int expectedcount)
 {
    int count = 0;
@ -3026,7 +2989,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
        forward[count] = i;
        if (count < expectedcount && expected[count] != i) {
-            test->errln("break forward test failed: expected %d but got %d", 
+            test->errln("break forward test failed: expected %d but got %d",
                        expected[count], i);
            break;
        }
@ -3034,7 +2997,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
    }
    if (count != expectedcount) {
        printStringBreaks(ustr, expected, expectedcount);
-        test->errln("break test failed: missed %d match", 
+        test->errln("break test failed: missed %d match",
                    expectedcount - count);
        return;
    }
@ -3058,7 +3021,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
        count --;
        if (forward[count] != i) {
-            test->errln("happy break test reverse failed: expected %d but got %d", 
+            test->errln("happy break test reverse failed: expected %d but got %d",
                        forward[count], i);
            break;
        }
@ -3079,7 +3042,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
                return;
            }
        }
-    }    
+    }
 }

 void RBBITest::TestWordBreaks(void)
@ -3091,8 +3054,8 @@ void RBBITest::TestWordBreaks(void)
    UErrorCode    status = U_ZERO_ERROR;
    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
-    UChar         str[300]; 
-    static const char *strlist[] = 
+    UChar         str[300];
+    static const char *strlist[] =
    {
    "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
    "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
@ -3168,8 +3131,8 @@ void RBBITest::TestWordBoundary(void)
    UErrorCode    status = U_ZERO_ERROR;
    // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
-    UChar         str[50]; 
-    static const char *strlist[] = 
+    UChar         str[50];
+    static const char *strlist[] =
    {
    "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
    "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
@ -3212,7 +3175,7 @@ void RBBITest::TestWordBoundary(void)
        UnicodeString ustr(str);
        int forward[50];
        int count = 0;
-        
+
        bi->setText(ustr);
        int prev = 0;
        int i;
@ -3223,7 +3186,7 @@ void RBBITest::TestWordBoundary(void)
                for (j = prev + 1; j < i; j ++) {
                    if (bi->isBoundary(j)) {
                        printStringBreaks(ustr, forward, count);
-                        errln("happy boundary test failed: expected %d not a boundary", 
+                        errln("happy boundary test failed: expected %d not a boundary",
                               j);
                        return;
                    }
@ -3231,7 +3194,7 @@ void RBBITest::TestWordBoundary(void)
            }
            if (!bi->isBoundary(i)) {
                printStringBreaks(ustr, forward, count);
-                errln("happy boundary test failed: expected %d a boundary", 
+                errln("happy boundary test failed: expected %d a boundary",
                       i);
                return;
            }
@ -3247,8 +3210,8 @@ void RBBITest::TestLineBreaks(void)
    Locale        locale("en");
    UErrorCode    status = U_ZERO_ERROR;
    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
-    UChar         str[50]; 
-    static const char *strlist[] = 
+    UChar         str[50];
+    static const char *strlist[] =
    {
     "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
     "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
@ -3324,14 +3287,14 @@ void RBBITest::TestSentBreaks(void)
    Locale        locale("en");
    UErrorCode    status = U_ZERO_ERROR;
    BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
-    UChar         str[100]; 
-    static const char *strlist[] = 
+    UChar         str[100];
+    static const char *strlist[] =
    {
     "Now\ris\nthe\r\ntime\n\rfor\r\r",
     "This\n",
     "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
     "\"Sentence ending with a quote.\" Bye.",
-     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"", 
+     "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
     "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
     "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
     "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
@ -3427,7 +3390,7 @@ void RBBITest::TestMonkey(char *params) {
        RBBILineMonkey  m;
        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
        if (params == NULL) {
-            loopCount = 50;
+            loopCount = loopCount / 5;   // Line break runs slower than the others.
        }
        if (U_SUCCESS(status)) {
            RunMonkey(bi, m, "line", seed, loopCount);
@ -3642,7 +3605,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
                    for (;;) {
                        if (endContext >= testText.length()) {break;}
-                        if (expectedBreaks[endContext-1] != 0) { 
+                        if (expectedBreaks[endContext-1] != 0) {
                            if (count == 0) break;
                            count --;
                        }
@ -3655,7 +3618,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
                /***if (strcmp(errorType, "next()") == 0) {
                    startContext = 0;
                    endContext = testText.length();
-                   
+
                    printStringBreaks(testText, expected, expectedCount);
                }***/

@ -3704,5 +3667,28 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
 #endif
 }

+//
+//  TestDebug    -  A place-holder test for debugging purposes.
+//                  For putting in fragments of other tests that can be invoked
+//                  for tracing without a lot of unwanted extra stuff happening.
+//                  The body is compiled out (#if 0) unless enabled by hand.
+void RBBITest::TestDebug(void) {
+#if 0
+    UErrorCode   status = U_ZERO_ERROR;
+    int pos;
+    RuleBasedBreakIterator* bi =
+       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
+       (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
+    if (U_FAILURE(status)) { delete bi; errln("TestDebug: could not create break iterator."); return; }
+    UnicodeString s = UnicodeString("\\u0E2B\\u0E19\\u0E36\\u0E48\\u0E07\\u0E04\\u0E33").unescape();
+    bi->setText(s);
+    // bi->last();
+    do {
+        pos = bi->next();
+        printf("%d\n", pos);
+    } while (pos != BreakIterator::DONE);
+    delete bi;   // caller owns the iterator returned by the factory
+#endif
+}

 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -1,6 +1,6 @@
 /********************************************************************
- * COPYRIGHT: 
- * Copyright (c) 1999-2004, International Business Machines Corporation and
+ * COPYRIGHT:
+ * Copyright (c) 1999-2005, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /************************************************************************
@ -31,20 +31,20 @@ class  RBBIMonkeyKind;
 */
 class RBBITest: public IntlTest {
 public:
-  
+
    RBBITest();
    virtual ~RBBITest();

    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
- 
+
    /**
     * Tests rule status return values
-     **/  
+     **/
    void TestStatusReturn();

    /**
     * Run the Unicode Line Break test data.
-     **/  
+     **/
    void TestLineBreakData();

    /**
@ -58,8 +58,8 @@ public:
    void TestBug4153072();
    void TestJapaneseLineBreak();
    void TestThaiLineBreak();
-    void TestMixedThaiLineBreak(); 
-    void TestMaiyamok(); 
+    void TestMixedThaiLineBreak();
+    void TestMaiyamok();
    void TestThaiWordBreak();
    void TestMonkey(char *params);

@ -73,16 +73,17 @@ public:
    void TestSentBreaks();
    void TestBug3818();
    void TestJapaneseWordBreak();
-    
-    
+    void TestDebug();
+
+
 /***********************/
 private:
    /**
     * internal methods to prepare test data
     **/
-   
+
    /**
-     * Perform tests of BreakIterator forward and backward functionality 
+     * Perform tests of BreakIterator forward and backward functionality
     * on different kinds of iterators (word, sentence, line and character).
     * It tests the methods first(), next(), current(), preceding(), following()
     * previous() and isBoundary().
@ -110,7 +111,7 @@ private:
     **/
    void testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td);
    /**
-     * Internal method to perform tests of BreakIterator multiple selection functionality 
+     * Internal method to perform tests of BreakIterator multiple selection functionality
     * on different kinds of iterators (word, sentence, line and character)
     **/
    void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td);