ICU-2924 RBBI, new style rule format, new line break rules. (14 known test failures, will fix real soon.)

X-SVN-Rev: 13364
2025-04-06 22:15:31 +00:00 · 2003-10-09 01:13:08 +00:00 · 2003-10-09 01:13:08 +00:00 · d4524826ed
commit d4524826ed
parent 95996b6773
13 changed files with 311 additions and 169 deletions
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@ -2650,7 +2650,8 @@ _uBrkErrorName[U_BRK_ERROR_LIMIT - U_BRK_ERROR_START] = {
    "U_BRK_NEW_LINE_IN_QUOTED_STRING",
    "U_BRK_UNDEFINED_VARIABLE",
    "U_BRK_INIT_ERROR",
-    "U_BRK_RULE_EMPTY_SET"
+    "U_BRK_RULE_EMPTY_SET",
+    "U_BRK_UNRECOGNIZED_OPTION"
 };

 static const char * const
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@ -59,6 +59,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
    fReverseTree        = NULL;
    fForwardTables      = NULL;
    fReverseTables      = NULL;
+    fChainRules         = FALSE;

    UErrorCode oldstatus = status;   

--- a/icu4c/source/common/rbbirb.h
+++ b/icu4c/source/common/rbbirb.h
@ -1,10 +1,11 @@
 //
 //  rbbirb.h
 //
-//  Copyright (C) 2002, International Business Machines Corporation and others.
+//  Copyright (C) 2002-2003, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
-//  This file contains declarations for several from the Rule Based Break Iterator rule builder.
+//  This file contains declarations for several classes from the
+//    Rule Based Break Iterator rule builder.
 //


@ -122,6 +123,9 @@ public:
    RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
    RBBINode                      *fReverseTree;     //   then manipulated by subsequent steps.

+    UBool                         fChainRules;       // True for chained Unicode TR style rules.
+                                                     // False for traditional regexp rules.
+
    RBBISetBuilder                *fSetBuilder;      // Set and Character Category builder.
    UVector                       *fUSetNodes;       // Vector of all uset nodes.

--- a/icu4c/source/common/rbbirpt.h
+++ b/icu4c/source/common/rbbirpt.h
@ -18,14 +18,15 @@ U_NAMESPACE_BEGIN
 // Character classes for RBBI rule scanning.
 //
    static const uint8_t kRuleSet_digit_char = 128;
-    static const uint8_t kRuleSet_rule_char = 129;
-    static const uint8_t kRuleSet_white_space = 130;
-    static const uint8_t kRuleSet_name_char = 131;
-    static const uint8_t kRuleSet_name_start_char = 132;
+    static const uint8_t kRuleSet_white_space = 129;
+    static const uint8_t kRuleSet_rule_char = 130;
+    static const uint8_t kRuleSet_name_start_char = 131;
+    static const uint8_t kRuleSet_name_char = 132;


 enum RBBI_RuleParseAction {
    doExprOrOperator,
+    doOptionEnd,
    doRuleErrorAssignExpr,
    doTagValue,
    doEndAssign,
@ -51,6 +52,7 @@ enum RBBI_RuleParseAction {
    doEndOfRule,
    doUnaryOpPlus,
    doExprStart,
+    doOptionStart,
    doExprCatOperator,
    doReverseDir,
    doCheckVarDef,
@ -73,92 +75,100 @@ struct RBBIRuleTableEl {

 static const struct RBBIRuleTableEl gRuleParseStateTable[] = {
    {doNOP, 0, 0, 0, TRUE}
-    , {doExprStart, 254, 12, 8, FALSE}     //  1      start
-    , {doNOP, 130, 1,0,  TRUE}     //  2 
-    , {doExprStart, 36 /* $ */, 71, 81, FALSE}     //  3 
-    , {doReverseDir, 33 /* ! */, 11,0,  TRUE}     //  4 
+    , {doExprStart, 254, 20, 8, FALSE}     //  1      start
+    , {doNOP, 129, 1,0,  TRUE}     //  2 
+    , {doExprStart, 36 /* $ */, 79, 89, FALSE}     //  3 
+    , {doNOP, 33 /* ! */, 11,0,  TRUE}     //  4 
    , {doNOP, 59 /* ; */, 1,0,  TRUE}     //  5 
    , {doNOP, 252, 0,0,  FALSE}     //  6 
-    , {doExprStart, 255, 12, 8, FALSE}     //  7 
+    , {doExprStart, 255, 20, 8, FALSE}     //  7 
    , {doEndOfRule, 59 /* ; */, 1,0,  TRUE}     //  8      break-rule-end
-    , {doNOP, 130, 8,0,  TRUE}     //  9 
-    , {doRuleError, 255, 86,0,  FALSE}     //  10 
-    , {doExprStart, 255, 12, 8, FALSE}     //  11      reverse-rule
-    , {doRuleChar, 254, 21,0,  TRUE}     //  12      term
-    , {doNOP, 130, 12,0,  TRUE}     //  13 
-    , {doRuleChar, 129, 21,0,  TRUE}     //  14 
-    , {doNOP, 91 /* [ */, 77, 21, FALSE}     //  15 
-    , {doLParen, 40 /* ( */, 12, 21, TRUE}     //  16 
-    , {doNOP, 36 /* $ */, 71, 20, FALSE}     //  17 
-    , {doDotAny, 46 /* . */, 21,0,  TRUE}     //  18 
-    , {doRuleError, 255, 86,0,  FALSE}     //  19 
-    , {doCheckVarDef, 255, 21,0,  FALSE}     //  20      term-var-ref
-    , {doNOP, 130, 21,0,  TRUE}     //  21      expr-mod
-    , {doUnaryOpStar, 42 /* * */, 26,0,  TRUE}     //  22 
-    , {doUnaryOpPlus, 43 /* + */, 26,0,  TRUE}     //  23 
-    , {doUnaryOpQuestion, 63 /* ? */, 26,0,  TRUE}     //  24 
-    , {doNOP, 255, 26,0,  FALSE}     //  25 
-    , {doExprCatOperator, 254, 12,0,  FALSE}     //  26      expr-cont
-    , {doNOP, 130, 26,0,  TRUE}     //  27 
-    , {doExprCatOperator, 129, 12,0,  FALSE}     //  28 
-    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  29 
-    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  30 
-    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  31 
-    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  32 
-    , {doExprCatOperator, 47 /* / */, 38,0,  FALSE}     //  33 
-    , {doExprCatOperator, 123 /* { */, 50,0,  TRUE}     //  34 
-    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  35 
-    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  36 
-    , {doExprFinished, 255, 255,0,  FALSE}     //  37 
-    , {doSlash, 47 /* / */, 40,0,  TRUE}     //  38      look-ahead
-    , {doNOP, 255, 86,0,  FALSE}     //  39 
-    , {doExprCatOperator, 254, 12,0,  FALSE}     //  40      expr-cont-no-slash
-    , {doNOP, 130, 26,0,  TRUE}     //  41 
-    , {doExprCatOperator, 129, 12,0,  FALSE}     //  42 
-    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  43 
-    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  44 
-    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  45 
-    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  46 
-    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  47 
-    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  48 
-    , {doExprFinished, 255, 255,0,  FALSE}     //  49 
-    , {doNOP, 130, 50,0,  TRUE}     //  50      tag-open
-    , {doStartTagValue, 128, 53,0,  FALSE}     //  51 
-    , {doTagExpectedError, 255, 86,0,  FALSE}     //  52 
-    , {doNOP, 130, 57,0,  TRUE}     //  53      tag-value
-    , {doNOP, 125 /* } */, 57,0,  FALSE}     //  54 
-    , {doTagDigit, 128, 53,0,  TRUE}     //  55 
-    , {doTagExpectedError, 255, 86,0,  FALSE}     //  56 
-    , {doNOP, 130, 57,0,  TRUE}     //  57      tag-close
-    , {doTagValue, 125 /* } */, 60,0,  TRUE}     //  58 
-    , {doTagExpectedError, 255, 86,0,  FALSE}     //  59 
-    , {doExprCatOperator, 254, 12,0,  FALSE}     //  60      expr-cont-no-tag
-    , {doNOP, 130, 60,0,  TRUE}     //  61 
-    , {doExprCatOperator, 129, 12,0,  FALSE}     //  62 
-    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  63 
-    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  64 
-    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  65 
-    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  66 
-    , {doExprCatOperator, 47 /* / */, 38,0,  FALSE}     //  67 
-    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  68 
-    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  69 
-    , {doExprFinished, 255, 255,0,  FALSE}     //  70 
-    , {doStartVariableName, 36 /* $ */, 73,0,  TRUE}     //  71      scan-var-name
-    , {doNOP, 255, 86,0,  FALSE}     //  72 
-    , {doNOP, 132, 75,0,  TRUE}     //  73      scan-var-start
-    , {doVariableNameExpectedErr, 255, 86,0,  FALSE}     //  74 
-    , {doNOP, 131, 75,0,  TRUE}     //  75      scan-var-body
-    , {doEndVariableName, 255, 255,0,  FALSE}     //  76 
-    , {doScanUnicodeSet, 91 /* [ */, 255,0,  TRUE}     //  77      scan-unicode-set
-    , {doScanUnicodeSet, 112 /* p */, 255,0,  TRUE}     //  78 
-    , {doScanUnicodeSet, 80 /* P */, 255,0,  TRUE}     //  79 
-    , {doNOP, 255, 86,0,  FALSE}     //  80 
-    , {doNOP, 130, 81,0,  TRUE}     //  81      assign-or-rule
-    , {doStartAssign, 61 /* = */, 12, 84, TRUE}     //  82 
-    , {doNOP, 255, 20, 8, FALSE}     //  83 
-    , {doEndAssign, 59 /* ; */, 1,0,  TRUE}     //  84      assign-end
-    , {doRuleErrorAssignExpr, 255, 86,0,  FALSE}     //  85 
-    , {doExit, 255, 86,0,  TRUE}     //  86      errorDeath
+    , {doNOP, 129, 8,0,  TRUE}     //  9 
+    , {doRuleError, 255, 94,0,  FALSE}     //  10 
+    , {doNOP, 33 /* ! */, 13,0,  TRUE}     //  11      rev-option
+    , {doReverseDir, 255, 19, 8, FALSE}     //  12 
+    , {doOptionStart, 131, 15,0,  TRUE}     //  13      option-scan1
+    , {doRuleError, 255, 94,0,  FALSE}     //  14 
+    , {doNOP, 132, 15,0,  TRUE}     //  15      option-scan2
+    , {doOptionEnd, 129, 1,0,  FALSE}     //  16 
+    , {doOptionEnd, 59 /* ; */, 1,0,  FALSE}     //  17 
+    , {doRuleError, 255, 94,0,  FALSE}     //  18 
+    , {doExprStart, 255, 20, 8, FALSE}     //  19      reverse-rule
+    , {doRuleChar, 254, 29,0,  TRUE}     //  20      term
+    , {doNOP, 129, 20,0,  TRUE}     //  21 
+    , {doRuleChar, 130, 29,0,  TRUE}     //  22 
+    , {doNOP, 91 /* [ */, 85, 29, FALSE}     //  23 
+    , {doLParen, 40 /* ( */, 20, 29, TRUE}     //  24 
+    , {doNOP, 36 /* $ */, 79, 28, FALSE}     //  25 
+    , {doDotAny, 46 /* . */, 29,0,  TRUE}     //  26 
+    , {doRuleError, 255, 94,0,  FALSE}     //  27 
+    , {doCheckVarDef, 255, 29,0,  FALSE}     //  28      term-var-ref
+    , {doNOP, 129, 29,0,  TRUE}     //  29      expr-mod
+    , {doUnaryOpStar, 42 /* * */, 34,0,  TRUE}     //  30 
+    , {doUnaryOpPlus, 43 /* + */, 34,0,  TRUE}     //  31 
+    , {doUnaryOpQuestion, 63 /* ? */, 34,0,  TRUE}     //  32 
+    , {doNOP, 255, 34,0,  FALSE}     //  33 
+    , {doExprCatOperator, 254, 20,0,  FALSE}     //  34      expr-cont
+    , {doNOP, 129, 34,0,  TRUE}     //  35 
+    , {doExprCatOperator, 130, 20,0,  FALSE}     //  36 
+    , {doExprCatOperator, 91 /* [ */, 20,0,  FALSE}     //  37 
+    , {doExprCatOperator, 40 /* ( */, 20,0,  FALSE}     //  38 
+    , {doExprCatOperator, 36 /* $ */, 20,0,  FALSE}     //  39 
+    , {doExprCatOperator, 46 /* . */, 20,0,  FALSE}     //  40 
+    , {doExprCatOperator, 47 /* / */, 46,0,  FALSE}     //  41 
+    , {doExprCatOperator, 123 /* { */, 58,0,  TRUE}     //  42 
+    , {doExprOrOperator, 124 /* | */, 20,0,  TRUE}     //  43 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  44 
+    , {doExprFinished, 255, 255,0,  FALSE}     //  45 
+    , {doSlash, 47 /* / */, 48,0,  TRUE}     //  46      look-ahead
+    , {doNOP, 255, 94,0,  FALSE}     //  47 
+    , {doExprCatOperator, 254, 20,0,  FALSE}     //  48      expr-cont-no-slash
+    , {doNOP, 129, 34,0,  TRUE}     //  49 
+    , {doExprCatOperator, 130, 20,0,  FALSE}     //  50 
+    , {doExprCatOperator, 91 /* [ */, 20,0,  FALSE}     //  51 
+    , {doExprCatOperator, 40 /* ( */, 20,0,  FALSE}     //  52 
+    , {doExprCatOperator, 36 /* $ */, 20,0,  FALSE}     //  53 
+    , {doExprCatOperator, 46 /* . */, 20,0,  FALSE}     //  54 
+    , {doExprOrOperator, 124 /* | */, 20,0,  TRUE}     //  55 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  56 
+    , {doExprFinished, 255, 255,0,  FALSE}     //  57 
+    , {doNOP, 129, 58,0,  TRUE}     //  58      tag-open
+    , {doStartTagValue, 128, 61,0,  FALSE}     //  59 
+    , {doTagExpectedError, 255, 94,0,  FALSE}     //  60 
+    , {doNOP, 129, 65,0,  TRUE}     //  61      tag-value
+    , {doNOP, 125 /* } */, 65,0,  FALSE}     //  62 
+    , {doTagDigit, 128, 61,0,  TRUE}     //  63 
+    , {doTagExpectedError, 255, 94,0,  FALSE}     //  64 
+    , {doNOP, 129, 65,0,  TRUE}     //  65      tag-close
+    , {doTagValue, 125 /* } */, 68,0,  TRUE}     //  66 
+    , {doTagExpectedError, 255, 94,0,  FALSE}     //  67 
+    , {doExprCatOperator, 254, 20,0,  FALSE}     //  68      expr-cont-no-tag
+    , {doNOP, 129, 68,0,  TRUE}     //  69 
+    , {doExprCatOperator, 130, 20,0,  FALSE}     //  70 
+    , {doExprCatOperator, 91 /* [ */, 20,0,  FALSE}     //  71 
+    , {doExprCatOperator, 40 /* ( */, 20,0,  FALSE}     //  72 
+    , {doExprCatOperator, 36 /* $ */, 20,0,  FALSE}     //  73 
+    , {doExprCatOperator, 46 /* . */, 20,0,  FALSE}     //  74 
+    , {doExprCatOperator, 47 /* / */, 46,0,  FALSE}     //  75 
+    , {doExprOrOperator, 124 /* | */, 20,0,  TRUE}     //  76 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  77 
+    , {doExprFinished, 255, 255,0,  FALSE}     //  78 
+    , {doStartVariableName, 36 /* $ */, 81,0,  TRUE}     //  79      scan-var-name
+    , {doNOP, 255, 94,0,  FALSE}     //  80 
+    , {doNOP, 131, 83,0,  TRUE}     //  81      scan-var-start
+    , {doVariableNameExpectedErr, 255, 94,0,  FALSE}     //  82 
+    , {doNOP, 132, 83,0,  TRUE}     //  83      scan-var-body
+    , {doEndVariableName, 255, 255,0,  FALSE}     //  84 
+    , {doScanUnicodeSet, 91 /* [ */, 255,0,  TRUE}     //  85      scan-unicode-set
+    , {doScanUnicodeSet, 112 /* p */, 255,0,  TRUE}     //  86 
+    , {doScanUnicodeSet, 80 /* P */, 255,0,  TRUE}     //  87 
+    , {doNOP, 255, 94,0,  FALSE}     //  88 
+    , {doNOP, 129, 89,0,  TRUE}     //  89      assign-or-rule
+    , {doStartAssign, 61 /* = */, 20, 92, TRUE}     //  90 
+    , {doNOP, 255, 28, 8, FALSE}     //  91 
+    , {doEndAssign, 59 /* ; */, 1,0,  TRUE}     //  92      assign-end
+    , {doRuleErrorAssignExpr, 255, 94,0,  FALSE}     //  93 
+    , {doExit, 255, 94,0,  TRUE}     //  94      errorDeath
 };
 static const char * const RBBIRuleStateNames[] = {    0,
     "start",
@ -170,6 +180,14 @@ static const char * const RBBIRuleStateNames[] = {    0,
    0,
     "break-rule-end",
    0,
+    0,
+     "rev-option",
+    0,
+     "option-scan1",
+    0,
+     "option-scan2",
+    0,
+    0,
    0,
     "reverse-rule",
     "term",
--- a/icu4c/source/common/rbbirpt.txt
+++ b/icu4c/source/common/rbbirpt.txt
@ -58,7 +58,7 @@ start:
    escaped                term                  ^break-rule-end    doExprStart                       
    white_space          n start                     
    '$'                    scan-var-name         ^assign-or-rule    doExprStart
-    '!'                  n reverse-rule                             doReverseDir
+    '!'                  n rev-option                             
    ';'                  n start                                                  # ignore empty rules.
    eof                    exit              
    default                term                  ^break-rule-end    doExprStart
@ -73,9 +73,25 @@ break-rule-end:
     

 #
-#   Reverse Rule    We've just scanned a '!', indicating a reverse direction rule.
-#                   A rule expression must follow.
+#   !               We've just scanned a '!', indicating either a !!keyword option or a
+#                   !Reverse rule.
 #
+rev-option:
+    '!'                  n option-scan1   
+    default                reverse-rule           ^break-rule-end   doReverseDir
+    
+option-scan1:
+    name_start_char      n option-scan2                             doOptionStart
+    default                errorDeath                               doRuleError
+    
+option-scan2:
+    name_char            n option-scan2
+    white_space            start                                    doOptionEnd
+    ';'                    start                                    doOptionEnd
+    default                errorDeath                               doRuleError
+    
+    
+
 reverse-rule:
    default                term                   ^break-rule-end   doExprStart
    
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@ -459,6 +459,21 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action)
        break;


+    case doOptionStart:
+        // Scanning a !!option.   Remember where the option keyword starts.
+        fOptionStart = fScanIndex;
+        break;
+
+    case doOptionEnd:
+        {
+            UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
+            if (opt == "chain") {
+                fRB->fChainRules = TRUE;
+            } else {
+                error(U_BRK_UNRECOGNIZED_OPTION);
+            }
+        }
+        break;

    case doReverseDir:
        fReverseRule = TRUE;
--- a/icu4c/source/common/rbbiscan.h
+++ b/icu4c/source/common/rbbiscan.h
@ -145,6 +145,9 @@ private:

    int32_t                        fRuleNum;         // Counts each rule as it is scanned.

+    int32_t                        fOptionStart;     // Input index of start of a !!option
+                                                     //   keyword, while being scanned.
+
    UnicodeSet *gRuleSet_rule_char;
    UnicodeSet *gRuleSet_white_space;
    UnicodeSet *gRuleSet_name_char;
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@ -119,6 +119,13 @@ void  RBBITableBuilder::build() {
        printPosSets(fTree);
    }

+    //
+    //  For "chained" rules, modify the followPos sets
+    //
+    if (fRB->fChainRules) {
+        calcChainedFollowPos(fTree);
+    }
+
    //
    // Build the DFA state transition tables.
    //
@ -310,6 +317,82 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) {
 }


+
+//-----------------------------------------------------------------------------
+//
+//   calcChainedFollowPos.    Modify the previously calculated followPos sets
+//                            to implement rule chaining.  NOT described by Aho.
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
+    // NOTE(review): parameter is named fTree here but n in the rbbitblb.h declaration.
+    UVector         endMarkerNodes(*fStatus);
+    UVector         leafNodes(*fStatus);
+    int32_t         i;
+
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // Get a list of all endmarker nodes.
+    fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
+
+    // Get a list of all leaf nodes.
+    fTree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // Get all nodes that can be the start of a match, which is FirstPosition(root).
+    UVector *matchStartNodes = fTree->fFirstPosSet;
+
+
+    // Iterate over all leaf nodes, picking out the ones that can end a match.
+    //   Each of those gets the followPos of every match-start leaf with the same fVal.
+    int32_t  endNodeIx;
+    int32_t  startNodeIx;
+    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
+        RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
+        RBBINode *endNode = NULL;
+
+        // Identify leaf nodes that correspond to overall rule match positions.
+        //   These include an endMarkerNode in their followPos sets.
+        for (i=0; i<endMarkerNodes.size(); i++) {
+            if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
+                endNode = tNode;
+                break;
+            }
+        }
+        if (endNode == NULL) {
+            // This leaf wasn't an end node.  Move on to the next leaf.
+            continue;
+        }
+
+        // We've got a node that can end a match.
+        // Now iterate over the nodes that can start a match, looking for ones
+        //   with the same char class as our ending node.
+        RBBINode *startNode;
+        for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
+            startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
+            if (startNode->fType != RBBINode::leafChar) {
+                continue;
+            }
+
+            if (endNode->fVal == startNode->fVal) {
+                // The end val (character class) of one possible match is the
+                //   same as the start of another.
+
+                // Add all nodes from the followPos of the start node to the
+                //  followPos set of the end node, which will have the effect of
+                //  letting matches transition from a match state at endNode
+                //  to the second char of a match starting with startNode.
+                setAdd(endNode->fFollowPos, startNode->fFollowPos);
+            }
+        }
+    }
+}
+
+
 //-----------------------------------------------------------------------------
 //
 //   buildStateTable()    Determine the set of runtime DFA states and the
--- a/icu4c/source/common/rbbitblb.h
+++ b/icu4c/source/common/rbbitblb.h
@ -49,6 +49,7 @@ private:
    void     calcFirstPos(RBBINode *n);
    void     calcLastPos(RBBINode  *n);
    void     calcFollowPos(RBBINode *n);
+    void     calcChainedFollowPos(RBBINode *n);
    void     buildStateTable();
    void     flagAcceptingStates();
    void     flagLookAheadStates();
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -625,6 +625,7 @@ typedef enum UErrorCode {
    U_BRK_UNDEFINED_VARIABLE,              /**< Use of an undefined $Variable in an RBBI rule.    */
    U_BRK_INIT_ERROR,                      /**< Initialization failure.  Probable missing ICU Data. */
    U_BRK_RULE_EMPTY_SET,                  /**< Rule contains an empty Unicode Set.               */
+    U_BRK_UNRECOGNIZED_OPTION,             /**< !!option in RBBI rules not recognized.            */
    U_BRK_ERROR_LIMIT,                     /**< This must always be the last value to indicate the limit for Break Iterator failures */

    /*
--- a/icu4c/source/data/brkitr/line.txt
+++ b/icu4c/source/data/brkitr/line.txt
@ -17,6 +17,8 @@
 #  Character Classes defined by TR 14.
 #

+!!chain  
+
 $AI = [:LineBreak =  Ambiguous:];
 $AL = [:LineBreak =  Alphabetic:];
 $BA = [:LineBreak =  Break_After:];
@ -82,75 +84,78 @@ $SPcm = $SP $CM*;
 $SYcm = $SY $CM*;


-#  New Lines.  Always break after, never break before.
-#              Rule LB 3
-#
-#  Endings.    NewLine or Zero Width Space, or both.  Rules 4, 5
-#              Because we never break before these things, $Endings
-#              appears at the end of line break rule.
-#
-$NLF = $BK | $CR | $LF | $NL | $CR $LF;
-$EndingsSoft = ($ZW* $SP)* $ZW*;
-$EndingsHard = ($ZW* $SP)* $ZW* $NLF;
+#  
+#  Rule LB 3
+$LB3NonBreaks = [^$BK $CR $LF $NL];               
+$LB3NonBreaks ($BK | $CR | $LF | $NL){100};   
+$CR $LF {100};
+
+# LB 4         x SP
+#              x ZW
+$LB3NonBreaks [$SP $ZW];
+
+# LB 5         Break after zero width space
+$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
+
+# LB 7     Combining marks.  TODO:  get it right!
+#                                   $SP $CM needs to behave like $ID.
+#                                   X   $CM needs to behave like X, where X is not $SP.
+#                                   $CM not covered by the above needs to behave like $AL
+[$LB5NonBreaks] $CM*;    #  Stick together any combining sequences that don't match other rules.
+
+# LB 8     
+$LB5NonBreaks [$CL  $EX $IS $SY];
+
+# LB 9
+$OPcm $SP* $LB3NonBreaks?;           #  Need to force trailing $BKs to rule 3, to get status right.
+$OPcm $SP* [$LB5NonBreaks] $CM*;
+
+# LB 10
+$QUcm $SP* $OPcm;
+
+# LB 11
+$CLcm $SP* $NScm;
+
+# LB 11a
+($B2cm)+;
+
+# LB 11b
+$LB5NonBreaks $GLcm $LB3NonBreaks?;
+$LB5NonBreaks $GLcm [$LB5NonBreaks] $CM*;
+$GLcm $LB3NonBreaks?;
+$GLcm [$LB5NonBreaks] $CM*;
+
+# LB 12
+$LB12NonBreaks = [[$LB5NonBreaks] - [$SP]];
+
+# LB 14
+$LB12NonBreaks $QUcm+ $LB3NonBreaks?;
+$LB12NonBreaks $QUcm+ [$LB5NonBreaks] $CM*;
+$QUcm $LB3NonBreaks?;
+$QUcm [$LB5NonBreaks] $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+
+# LB 14a
+$LB14NonBreaks = [[$LB12NonBreaks] - [$CB]];


-#
-#  Openings  Sequences that can precede Words, and that should not be separated from them.
-#            Rules LB 9, 10
-#
-$Openings = ((($QUcm ($ZW* $SP)*)? $OPcm ($ZW* $SP)*) | $GLcm)+;
+# LB 15
+$LB14NonBreaks ($BAcm | $HYcm | $NScm);   
+$BBcm [^$CB];

-#
-#  Closings  Seqences that follow words, and that should not be separated from them,
-#            Rule LB 8, 11, 15
-$Closings =  (($ZW* $SP)*( ($CLcm (($ZW* $SP)* $NScm)?  |  $EX  | $IS  | $SY) $CM*) | $BAcm | $HYcm  | $NScm)*;
+# LB 16
+($ALcm | $IDcm | $INcm | $NUcm) $INcm*;

-$WordClosings = ($SP* $CLcm | $SP* $EXcm | $SP* $IScm | $SP* $SYcm | $BAcm | $HYcm | $NScm)*;
-
-#
-#  Words.  Includes mixed Alpha-numerics.
-#          Rules 11a, 16, 17, 19, more or less.
-#
-$Number         =  $PRcm? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbers 
-                                                                       # Regex form, rather than rule 18
-                                                                       
-# Alpha-numeric.   16, 17 
-$Word   =  ($ALcm | $NUcm)+  $INcm*  |
-           $IDcm ($POcm? | $INcm*)   |
-           $CM+  ($POcm? | $INcm*)   |                      # CM with no base is like ID  (LB 7a)
-           $INcm+                    |
-           $CB;                                             # Deviation from Unicode spec for $CB
-                                                            #   We treat as a single char word
-                                                            
-$Dashes = (($B2cm ($ZW* $SP)*)*);                                             # Dashes           11a   
-        
-        
-
-$HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?;       # For Rle LB15, Don't break before  Hypen-minus,
-                                                            #  we also need to match a whole number, if that
-                                                            #  is what follows the '-'
- 
- 
-        
-$Word15 = $Openings? (
-             ($BBcm* $Openings? ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) |   # Rule 15. Stuff sticks around words.
-             $BBcm* [^[:Cc:] $BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] $CM*  |                 # Allow characters that don't meet the
-             $BBcm* [^$BK $CR $LF $NL $ZW ($ZW* $SP) $GL ]                                 #  more elaborate definitions for WORD
-             )  $WordClosings?;                                                          #  to be glued.
-       
-$GluedWord  = $Openings? $Word15 ((($ZW* $SP)* $GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
- #$GluedWord  = $Openings? $Word15 ((($ZW* $SP)* $Openings) $Word15)*;  # "Glue" will stick anything below it together.
-                                                                    # Rules 13, 14
-
-#
-#  The actual rule, a combination of everything defined above.
-#
-$Openings? $GluedWord  $Closings $EndingsSoft{0};
-$Openings? $GluedWord  $Closings $EndingsHard{100};
-# $GluedWord;
+# LB 17
+$IDcm $POcm;
+$ALcm+ $NUcm;       # includes $LB19
+$NUcm $ALcm+;


+# LB 18
+$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;

+# LB 19
+$ALcm+;


 #
@ -161,8 +166,4 @@ $Openings? $GluedWord  $Closings $EndingsHard{100};
 #     containing a space that may inhibit a break from occuring.
 #

-$SpaceGlue  = ([$ZW $CL $IS $NS $OP]  ($CM* $SP)) | (($CM* $SP)+ $OP);
-$ClumpingChars = [^$SP $BK $CR $LF];
-
-#!. . $ClumpingChars*  ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
-!.*;
+!.*;
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -2526,13 +2526,10 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
    //   Depends on the previous char, and whether it eats following CombiningMarks
    //   or not.
    UChar32   c = fText->char32At(prevPos);
-    if (c == 0x0d || c == 0x0a || c == 0x85 || fBK->contains(c) || fSP->contains(c)) {
-        // char doesn't automatically combine with CM.
-        nextPos = fText->moveIndex32(prevPos, 1);
-    } else {
-        nextPos = fCharBI->following(prevPos);
+    nextPos = fText->moveIndex32(prevPos, 1);
+    if (!(c == 0x0d || c == 0x0a || c == 0x85 || c == 0x200b /* ZW */ || fBK->contains(c))) {
        for (;;) {
-            UChar32 c = fText->char32At(nextPos);
+            c = fText->char32At(nextPos);
            if (!fCM->contains(c)) {
                break;
            }
@ -2714,12 +2711,9 @@ fall_through_11:
        }

        // LB 14a  Break around a CB
-        //   NOTE:  DISABLE FOR ICU, FOR NOW.  Too hard to implement in Rules.
-        #if 0
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }
-        #endif

        // LB 15 
        if (fBA->contains(thisChar) ||
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@ -22,6 +22,10 @@
 #


+#   Temp debugging tests
+<line>
+<data>•\U00011efa\u275d\u0085•\u0c56•</data>
+<data>•a\u275d\u0085•\u0c56•</data>

 ########################################################################################
 #