From d4524826edd8a680fe5329e7d72a03f55d333f9d Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 9 Oct 2003 01:13:08 +0000 Subject: [PATCH] ICU-2924 RBBI, new style rule format, new line break rules. (14 known test failures, will fix real soon.) X-SVN-Rev: 13364 --- icu4c/source/common/putil.c | 3 +- icu4c/source/common/rbbirb.cpp | 1 + icu4c/source/common/rbbirb.h | 8 +- icu4c/source/common/rbbirpt.h | 192 ++++++++++++++----------- icu4c/source/common/rbbirpt.txt | 22 ++- icu4c/source/common/rbbiscan.cpp | 15 ++ icu4c/source/common/rbbiscan.h | 3 + icu4c/source/common/rbbitblb.cpp | 83 +++++++++++ icu4c/source/common/rbbitblb.h | 1 + icu4c/source/common/unicode/utypes.h | 1 + icu4c/source/data/brkitr/line.txt | 135 ++++++++--------- icu4c/source/test/intltest/rbbitst.cpp | 12 +- icu4c/source/test/testdata/rbbitst.txt | 4 + 13 files changed, 311 insertions(+), 169 deletions(-) diff --git a/icu4c/source/common/putil.c b/icu4c/source/common/putil.c index d6e3488a2be..6e18aa17a23 100644 --- a/icu4c/source/common/putil.c +++ b/icu4c/source/common/putil.c @@ -2650,7 +2650,8 @@ _uBrkErrorName[U_BRK_ERROR_LIMIT - U_BRK_ERROR_START] = { "U_BRK_NEW_LINE_IN_QUOTED_STRING", "U_BRK_UNDEFINED_VARIABLE", "U_BRK_INIT_ERROR", - "U_BRK_RULE_EMPTY_SET" + "U_BRK_RULE_EMPTY_SET", + "U_BRK_UNRECOGNIZED_OPTION" }; static const char * const diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index 01d175f2a17..aea4abfaa85 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -59,6 +59,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, fReverseTree = NULL; fForwardTables = NULL; fReverseTables = NULL; + fChainRules = FALSE; UErrorCode oldstatus = status; diff --git a/icu4c/source/common/rbbirb.h b/icu4c/source/common/rbbirb.h index 7b38187fd53..dd6854764ae 100644 --- a/icu4c/source/common/rbbirb.h +++ b/icu4c/source/common/rbbirb.h @@ -1,10 +1,11 @@ // // rbbirb.h // -// Copyright (C) 2002, International Business Machines Corporation and others. +// Copyright (C) 2002-2003, International Business Machines Corporation and others. // All Rights Reserved. // -// This file contains declarations for several from the Rule Based Break Iterator rule builder. +// This file contains declarations for several classes from the +// Rule Based Break Iterator rule builder. // @@ -122,6 +123,9 @@ public: RBBINode *fForwardTree; // The parse trees, generated by the scanner, RBBINode *fReverseTree; // then manipulated by subsequent steps. + UBool fChainRules; // True for chained Unicode TR style rules. + // False for traditional regexp rules. + RBBISetBuilder *fSetBuilder; // Set and Character Category builder. UVector *fUSetNodes; // Vector of all uset nodes. diff --git a/icu4c/source/common/rbbirpt.h b/icu4c/source/common/rbbirpt.h index 174ed3ef7c2..54d53f941d2 100644 --- a/icu4c/source/common/rbbirpt.h +++ b/icu4c/source/common/rbbirpt.h @@ -18,14 +18,15 @@ U_NAMESPACE_BEGIN // Character classes for RBBI rule scanning. // static const uint8_t kRuleSet_digit_char = 128; - static const uint8_t kRuleSet_rule_char = 129; - static const uint8_t kRuleSet_white_space = 130; - static const uint8_t kRuleSet_name_char = 131; - static const uint8_t kRuleSet_name_start_char = 132; + static const uint8_t kRuleSet_white_space = 129; + static const uint8_t kRuleSet_rule_char = 130; + static const uint8_t kRuleSet_name_start_char = 131; + static const uint8_t kRuleSet_name_char = 132; enum RBBI_RuleParseAction { doExprOrOperator, + doOptionEnd, doRuleErrorAssignExpr, doTagValue, doEndAssign, @@ -51,6 +52,7 @@ enum RBBI_RuleParseAction { doEndOfRule, doUnaryOpPlus, doExprStart, + doOptionStart, doExprCatOperator, doReverseDir, doCheckVarDef, @@ -73,92 +75,100 @@ struct RBBIRuleTableEl { static const struct RBBIRuleTableEl gRuleParseStateTable[] = { {doNOP, 0, 0, 0, TRUE} - , {doExprStart, 254, 12, 8, FALSE} // 1 start - , {doNOP, 130, 1,0, TRUE} // 2 - , {doExprStart, 36 /* $ */, 71, 81, FALSE} // 3 - , {doReverseDir, 33 /* ! */, 11,0, TRUE} // 4 + , {doExprStart, 254, 20, 8, FALSE} // 1 start + , {doNOP, 129, 1,0, TRUE} // 2 + , {doExprStart, 36 /* $ */, 79, 89, FALSE} // 3 + , {doNOP, 33 /* ! */, 11,0, TRUE} // 4 , {doNOP, 59 /* ; */, 1,0, TRUE} // 5 , {doNOP, 252, 0,0, FALSE} // 6 - , {doExprStart, 255, 12, 8, FALSE} // 7 + , {doExprStart, 255, 20, 8, FALSE} // 7 , {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end - , {doNOP, 130, 8,0, TRUE} // 9 - , {doRuleError, 255, 86,0, FALSE} // 10 - , {doExprStart, 255, 12, 8, FALSE} // 11 reverse-rule - , {doRuleChar, 254, 21,0, TRUE} // 12 term - , {doNOP, 130, 12,0, TRUE} // 13 - , {doRuleChar, 129, 21,0, TRUE} // 14 - , {doNOP, 91 /* [ */, 77, 21, FALSE} // 15 - , {doLParen, 40 /* ( */, 12, 21, TRUE} // 16 - , {doNOP, 36 /* $ */, 71, 20, FALSE} // 17 - , {doDotAny, 46 /* . */, 21,0, TRUE} // 18 - , {doRuleError, 255, 86,0, FALSE} // 19 - , {doCheckVarDef, 255, 21,0, FALSE} // 20 term-var-ref - , {doNOP, 130, 21,0, TRUE} // 21 expr-mod - , {doUnaryOpStar, 42 /* * */, 26,0, TRUE} // 22 - , {doUnaryOpPlus, 43 /* + */, 26,0, TRUE} // 23 - , {doUnaryOpQuestion, 63 /* ? */, 26,0, TRUE} // 24 - , {doNOP, 255, 26,0, FALSE} // 25 - , {doExprCatOperator, 254, 12,0, FALSE} // 26 expr-cont - , {doNOP, 130, 26,0, TRUE} // 27 - , {doExprCatOperator, 129, 12,0, FALSE} // 28 - , {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 29 - , {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 30 - , {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 31 - , {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 32 - , {doExprCatOperator, 47 /* / */, 38,0, FALSE} // 33 - , {doExprCatOperator, 123 /* { */, 50,0, TRUE} // 34 - , {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 35 - , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 36 - , {doExprFinished, 255, 255,0, FALSE} // 37 - , {doSlash, 47 /* / */, 40,0, TRUE} // 38 look-ahead - , {doNOP, 255, 86,0, FALSE} // 39 - , {doExprCatOperator, 254, 12,0, FALSE} // 40 expr-cont-no-slash - , {doNOP, 130, 26,0, TRUE} // 41 - , {doExprCatOperator, 129, 12,0, FALSE} // 42 - , {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 43 - , {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 44 - , {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 45 - , {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 46 - , {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 47 - , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 48 - , {doExprFinished, 255, 255,0, FALSE} // 49 - , {doNOP, 130, 50,0, TRUE} // 50 tag-open - , {doStartTagValue, 128, 53,0, FALSE} // 51 - , {doTagExpectedError, 255, 86,0, FALSE} // 52 - , {doNOP, 130, 57,0, TRUE} // 53 tag-value - , {doNOP, 125 /* } */, 57,0, FALSE} // 54 - , {doTagDigit, 128, 53,0, TRUE} // 55 - , {doTagExpectedError, 255, 86,0, FALSE} // 56 - , {doNOP, 130, 57,0, TRUE} // 57 tag-close - , {doTagValue, 125 /* } */, 60,0, TRUE} // 58 - , {doTagExpectedError, 255, 86,0, FALSE} // 59 - , {doExprCatOperator, 254, 12,0, FALSE} // 60 expr-cont-no-tag - , {doNOP, 130, 60,0, TRUE} // 61 - , {doExprCatOperator, 129, 12,0, FALSE} // 62 - , {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 63 - , {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 64 - , {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 65 - , {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 66 - , {doExprCatOperator, 47 /* / */, 38,0, FALSE} // 67 - , {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 68 - , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 69 - , {doExprFinished, 255, 255,0, FALSE} // 70 - , {doStartVariableName, 36 /* $ */, 73,0, TRUE} // 71 scan-var-name - , {doNOP, 255, 86,0, FALSE} // 72 - , {doNOP, 132, 75,0, TRUE} // 73 scan-var-start - , {doVariableNameExpectedErr, 255, 86,0, FALSE} // 74 - , {doNOP, 131, 75,0, TRUE} // 75 scan-var-body - , {doEndVariableName, 255, 255,0, FALSE} // 76 - , {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 77 scan-unicode-set - , {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 78 - , {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 79 - , {doNOP, 255, 86,0, FALSE} // 80 - , {doNOP, 130, 81,0, TRUE} // 81 assign-or-rule - , {doStartAssign, 61 /* = */, 12, 84, TRUE} // 82 - , {doNOP, 255, 20, 8, FALSE} // 83 - , {doEndAssign, 59 /* ; */, 1,0, TRUE} // 84 assign-end - , {doRuleErrorAssignExpr, 255, 86,0, FALSE} // 85 - , {doExit, 255, 86,0, TRUE} // 86 errorDeath + , {doNOP, 129, 8,0, TRUE} // 9 + , {doRuleError, 255, 94,0, FALSE} // 10 + , {doNOP, 33 /* ! */, 13,0, TRUE} // 11 rev-option + , {doReverseDir, 255, 19, 8, FALSE} // 12 + , {doOptionStart, 131, 15,0, TRUE} // 13 option-scan1 + , {doRuleError, 255, 94,0, FALSE} // 14 + , {doNOP, 132, 15,0, TRUE} // 15 option-scan2 + , {doOptionEnd, 129, 1,0, FALSE} // 16 + , {doOptionEnd, 59 /* ; */, 1,0, FALSE} // 17 + , {doRuleError, 255, 94,0, FALSE} // 18 + , {doExprStart, 255, 20, 8, FALSE} // 19 reverse-rule + , {doRuleChar, 254, 29,0, TRUE} // 20 term + , {doNOP, 129, 20,0, TRUE} // 21 + , {doRuleChar, 130, 29,0, TRUE} // 22 + , {doNOP, 91 /* [ */, 85, 29, FALSE} // 23 + , {doLParen, 40 /* ( */, 20, 29, TRUE} // 24 + , {doNOP, 36 /* $ */, 79, 28, FALSE} // 25 + , {doDotAny, 46 /* . */, 29,0, TRUE} // 26 + , {doRuleError, 255, 94,0, FALSE} // 27 + , {doCheckVarDef, 255, 29,0, FALSE} // 28 term-var-ref + , {doNOP, 129, 29,0, TRUE} // 29 expr-mod + , {doUnaryOpStar, 42 /* * */, 34,0, TRUE} // 30 + , {doUnaryOpPlus, 43 /* + */, 34,0, TRUE} // 31 + , {doUnaryOpQuestion, 63 /* ? */, 34,0, TRUE} // 32 + , {doNOP, 255, 34,0, FALSE} // 33 + , {doExprCatOperator, 254, 20,0, FALSE} // 34 expr-cont + , {doNOP, 129, 34,0, TRUE} // 35 + , {doExprCatOperator, 130, 20,0, FALSE} // 36 + , {doExprCatOperator, 91 /* [ */, 20,0, FALSE} // 37 + , {doExprCatOperator, 40 /* ( */, 20,0, FALSE} // 38 + , {doExprCatOperator, 36 /* $ */, 20,0, FALSE} // 39 + , {doExprCatOperator, 46 /* . */, 20,0, FALSE} // 40 + , {doExprCatOperator, 47 /* / */, 46,0, FALSE} // 41 + , {doExprCatOperator, 123 /* { */, 58,0, TRUE} // 42 + , {doExprOrOperator, 124 /* | */, 20,0, TRUE} // 43 + , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 44 + , {doExprFinished, 255, 255,0, FALSE} // 45 + , {doSlash, 47 /* / */, 48,0, TRUE} // 46 look-ahead + , {doNOP, 255, 94,0, FALSE} // 47 + , {doExprCatOperator, 254, 20,0, FALSE} // 48 expr-cont-no-slash + , {doNOP, 129, 34,0, TRUE} // 49 + , {doExprCatOperator, 130, 20,0, FALSE} // 50 + , {doExprCatOperator, 91 /* [ */, 20,0, FALSE} // 51 + , {doExprCatOperator, 40 /* ( */, 20,0, FALSE} // 52 + , {doExprCatOperator, 36 /* $ */, 20,0, FALSE} // 53 + , {doExprCatOperator, 46 /* . */, 20,0, FALSE} // 54 + , {doExprOrOperator, 124 /* | */, 20,0, TRUE} // 55 + , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 56 + , {doExprFinished, 255, 255,0, FALSE} // 57 + , {doNOP, 129, 58,0, TRUE} // 58 tag-open + , {doStartTagValue, 128, 61,0, FALSE} // 59 + , {doTagExpectedError, 255, 94,0, FALSE} // 60 + , {doNOP, 129, 65,0, TRUE} // 61 tag-value + , {doNOP, 125 /* } */, 65,0, FALSE} // 62 + , {doTagDigit, 128, 61,0, TRUE} // 63 + , {doTagExpectedError, 255, 94,0, FALSE} // 64 + , {doNOP, 129, 65,0, TRUE} // 65 tag-close + , {doTagValue, 125 /* } */, 68,0, TRUE} // 66 + , {doTagExpectedError, 255, 94,0, FALSE} // 67 + , {doExprCatOperator, 254, 20,0, FALSE} // 68 expr-cont-no-tag + , {doNOP, 129, 68,0, TRUE} // 69 + , {doExprCatOperator, 130, 20,0, FALSE} // 70 + , {doExprCatOperator, 91 /* [ */, 20,0, FALSE} // 71 + , {doExprCatOperator, 40 /* ( */, 20,0, FALSE} // 72 + , {doExprCatOperator, 36 /* $ */, 20,0, FALSE} // 73 + , {doExprCatOperator, 46 /* . */, 20,0, FALSE} // 74 + , {doExprCatOperator, 47 /* / */, 46,0, FALSE} // 75 + , {doExprOrOperator, 124 /* | */, 20,0, TRUE} // 76 + , {doExprRParen, 41 /* ) */, 255,0, TRUE} // 77 + , {doExprFinished, 255, 255,0, FALSE} // 78 + , {doStartVariableName, 36 /* $ */, 81,0, TRUE} // 79 scan-var-name + , {doNOP, 255, 94,0, FALSE} // 80 + , {doNOP, 131, 83,0, TRUE} // 81 scan-var-start + , {doVariableNameExpectedErr, 255, 94,0, FALSE} // 82 + , {doNOP, 132, 83,0, TRUE} // 83 scan-var-body + , {doEndVariableName, 255, 255,0, FALSE} // 84 + , {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 85 scan-unicode-set + , {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 86 + , {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 87 + , {doNOP, 255, 94,0, FALSE} // 88 + , {doNOP, 129, 89,0, TRUE} // 89 assign-or-rule + , {doStartAssign, 61 /* = */, 20, 92, TRUE} // 90 + , {doNOP, 255, 28, 8, FALSE} // 91 + , {doEndAssign, 59 /* ; */, 1,0, TRUE} // 92 assign-end + , {doRuleErrorAssignExpr, 255, 94,0, FALSE} // 93 + , {doExit, 255, 94,0, TRUE} // 94 errorDeath }; static const char * const RBBIRuleStateNames[] = { 0, "start", @@ -170,6 +180,14 @@ static const char * const RBBIRuleStateNames[] = { 0, 0, "break-rule-end", 0, + 0, + "rev-option", + 0, + "option-scan1", + 0, + "option-scan2", + 0, + 0, 0, "reverse-rule", "term", diff --git a/icu4c/source/common/rbbirpt.txt b/icu4c/source/common/rbbirpt.txt index fa0dd35928d..9c6a45c1e56 100644 --- a/icu4c/source/common/rbbirpt.txt +++ b/icu4c/source/common/rbbirpt.txt @@ -58,7 +58,7 @@ start: escaped term ^break-rule-end doExprStart white_space n start '$' scan-var-name ^assign-or-rule doExprStart - '!' n reverse-rule doReverseDir + '!' n rev-option ';' n start # ignore empty rules. eof exit default term ^break-rule-end doExprStart @@ -73,9 +73,25 @@ break-rule-end: # -# Reverse Rule We've just scanned a '!', indicating a reverse direction rule. -# A rule expression must follow. +# ! We've just scanned a '!', indicating either a !!key word flag or a +# !Reverse rule. # +rev-option: + '!' n option-scan1 + default reverse-rule ^break-rule-end doReverseDir + +option-scan1: + name_start_char n option-scan2 doOptionStart + default errorDeath doRuleError + +option-scan2: + name_char n option-scan2 + white_space start doOptionEnd + ';' start doOptionEnd + default errorDeath doRuleError + + + reverse-rule: default term ^break-rule-end doExprStart diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp index ccc3455cb40..f9d3d0e1e5f 100644 --- a/icu4c/source/common/rbbiscan.cpp +++ b/icu4c/source/common/rbbiscan.cpp @@ -459,6 +459,21 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action) break; + case doOptionStart: + // Scanning a !!option. At the start of string. + fOptionStart = fScanIndex; + break; + + case doOptionEnd: + { + UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); + if (opt == "chain") { + fRB->fChainRules = TRUE; + } else { + error(U_BRK_UNRECOGNIZED_OPTION); + } + } + break; case doReverseDir: fReverseRule = TRUE; diff --git a/icu4c/source/common/rbbiscan.h b/icu4c/source/common/rbbiscan.h index 3c83578805f..6ca3f1f0d89 100644 --- a/icu4c/source/common/rbbiscan.h +++ b/icu4c/source/common/rbbiscan.h @@ -145,6 +145,9 @@ private: int32_t fRuleNum; // Counts each rule as it is scanned. + int32_t fOptionStart; // Input index of start of a !!option + // keyword, while being scanned. + UnicodeSet *gRuleSet_rule_char; UnicodeSet *gRuleSet_white_space; UnicodeSet *gRuleSet_name_char; diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index f1add556665..26b3c751b54 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -119,6 +119,13 @@ void RBBITableBuilder::build() { printPosSets(fTree); } + // + // For "chained" rules, modify the followPos sets + // + if (fRB->fChainRules) { + calcChainedFollowPos(fTree); + } + // // Build the DFA state transition tables. // @@ -310,6 +317,82 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) { } + +//----------------------------------------------------------------------------- +// +// calcChainedFollowPos. Modify the previously calculated followPos sets +// to implement rule chaining. NOT described by Aho +// +//----------------------------------------------------------------------------- +void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) { + + UVector endMarkerNodes(*fStatus); + UVector leafNodes(*fStatus); + int32_t i; + + if (U_FAILURE(*fStatus)) { + return; + } + + // get a list of all endmarker nodes. + fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); + + // get a list all leaf nodes + fTree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); + if (U_FAILURE(*fStatus)) { + return; + } + + // Get all nodes that can be the start a match, which is FirstPosition(root) + UVector *matchStartNodes = fTree->fFirstPosSet; + + + // Iteratate over all leaf nodes, + // + int32_t endNodeIx; + int32_t startNodeIx; + for (endNodeIx=0; endNodeIxfFollowPos->contains(endMarkerNodes.elementAt(i))) { + endNode = tNode; + break; + } + } + if (endNode == NULL) { + // node wasn't an end node. Try again with the next. + continue; + } + + // We've got a node that can end a match. + // Now iterate over the nodes that can start a match, looking for ones + // with the same char class as our ending node. + RBBINode *startNode; + for (startNodeIx = 0; startNodeIxsize(); startNodeIx++) { + startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); + if (startNode->fType != RBBINode::leafChar) { + continue; + } + + if (endNode->fVal == startNode->fVal) { + // The end val (character class) of one possible match is the + // same as the start of another. + + // Add all nodes from the followPos of the start node to the + // followPos set of the end node, which will have the effect of + // letting matches transition from a match state at endNode + // to the second char of a match starting with startNode. + setAdd(endNode->fFollowPos, startNode->fFollowPos); + } + } + } +} + + //----------------------------------------------------------------------------- // // buildStateTable() Determine the set of runtime DFA states and the diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index 47f7de27a2f..4f72c93e7de 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -49,6 +49,7 @@ private: void calcFirstPos(RBBINode *n); void calcLastPos(RBBINode *n); void calcFollowPos(RBBINode *n); + void calcChainedFollowPos(RBBINode *n); void buildStateTable(); void flagAcceptingStates(); void flagLookAheadStates(); diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index d2c42e9c084..39be8f94b37 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -625,6 +625,7 @@ typedef enum UErrorCode { U_BRK_UNDEFINED_VARIABLE, /**< Use of an undefined $Variable in an RBBI rule. */ U_BRK_INIT_ERROR, /**< Initialization failure. Probable missing ICU Data. */ U_BRK_RULE_EMPTY_SET, /**< Rule contains an empty Unicode Set. */ + U_BRK_UNRECOGNIZED_OPTION, /**< !!option in RBBI rules not recognized. */ U_BRK_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for Break Iterator failures */ /* diff --git a/icu4c/source/data/brkitr/line.txt b/icu4c/source/data/brkitr/line.txt index 2be2cd3ce43..ed4764dbf80 100644 --- a/icu4c/source/data/brkitr/line.txt +++ b/icu4c/source/data/brkitr/line.txt @@ -17,6 +17,8 @@ # Character Classes defined by TR 14. # +!!chain + $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; $BA = [:LineBreak = Break_After:]; @@ -82,75 +84,78 @@ $SPcm = $SP $CM*; $SYcm = $SY $CM*; -# New Lines. Always break after, never break before. -# Rule LB 3 -# -# Endings. NewLine or Zero Width Space, or both. Rules 4, 5 -# Because we never break before these things, $Endings -# appears at the end of line break rule. -# -$NLF = $BK | $CR | $LF | $NL | $CR $LF; -$EndingsSoft = ($ZW* $SP)* $ZW*; -$EndingsHard = ($ZW* $SP)* $ZW* $NLF; +# +# Rule LB 3 +$LB3NonBreaks = [^$BK $CR $LF $NL]; +$LB3NonBreaks ($BK | $CR | $LF | $NL){100}; +$CR $LF {100}; + +# LB 4 x SP +# x ZW +$LB3NonBreaks [$SP $ZW]; + +# LB 5 Break after zero width space +$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]]; + +# LB 7 Combining marks. TODO: get it right! +# $SP $CM needs to behave like $ID. +# X $CM needs to behave like X, where X is not $SP. +# $CM not covered by the above needs to behave like $AL +[$LB5NonBreaks] $CM*; # Stick together any combining sequences that don't match other rules. + +# LB 8 +$LB5NonBreaks [$CL $EX $IS $SY]; + +# LB 9 +$OPcm $SP* $LB3NonBreaks?; # Need to force trailing $BKs to rule 3, to get status right. +$OPcm $SP* [$LB5NonBreaks] $CM*; + +# LB 10 +$QUcm $SP* $OPcm; + +# LB 11 +$CLcm $SP* $NScm; + +# LB 11a +($B2cm)+; + +# LB 11b +$LB5NonBreaks $GLcm $LB3NonBreaks?; +$LB5NonBreaks $GLcm [$LB5NonBreaks] $CM*; +$GLcm $LB3NonBreaks?; +$GLcm [$LB5NonBreaks] $CM*; + +# LB 12 +$LB12NonBreaks = [[$LB5NonBreaks] - [$SP]]; + +# LB 14 +$LB12NonBreaks $QUcm+ $LB3NonBreaks?; +$LB12NonBreaks $QUcm+ [$LB5NonBreaks] $CM*; +$QUcm $LB3NonBreaks?; +$QUcm [$LB5NonBreaks] $CM*; # Don't let a combining mark go onto $CR, $BK, etc. + +# LB 14a +$LB14NonBreaks = [[$LB12NonBreaks] - [$CB]]; -# -# Openings Sequences that can precede Words, and that should not be separated from them. -# Rules LB 9, 10 -# -$Openings = ((($QUcm ($ZW* $SP)*)? $OPcm ($ZW* $SP)*) | $GLcm)+; +# LB 15 +$LB14NonBreaks ($BAcm | $HYcm | $NScm); +$BBcm [^$CB]; -# -# Closings Seqences that follow words, and that should not be separated from them, -# Rule LB 8, 11, 15 -$Closings = (($ZW* $SP)*( ($CLcm (($ZW* $SP)* $NScm)? | $EX | $IS | $SY) $CM*) | $BAcm | $HYcm | $NScm)*; +# LB 16 +($ALcm | $IDcm | $INcm | $NUcm) $INcm*; -$WordClosings = ($SP* $CLcm | $SP* $EXcm | $SP* $IScm | $SP* $SYcm | $BAcm | $HYcm | $NScm)*; - -# -# Words. Includes mixed Alpha-numerics. -# Rules 11a, 16, 17, 19, more or less. -# -$Number = $PRcm? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbers - # Regex form, rather than rule 18 - -# Alpha-numeric. 16, 17 -$Word = ($ALcm | $NUcm)+ $INcm* | - $IDcm ($POcm? | $INcm*) | - $CM+ ($POcm? | $INcm*) | # CM with no base is like ID (LB 7a) - $INcm+ | - $CB; # Deviation from Unicode spec for $CB - # We treat as a single char word - -$Dashes = (($B2cm ($ZW* $SP)*)*); # Dashes 11a - - - -$HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?; # For Rle LB15, Don't break before Hypen-minus, - # we also need to match a whole number, if that - # is what follows the '-' - - - -$Word15 = $Openings? ( - ($BBcm* $Openings? ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) | # Rule 15. Stuff sticks around words. - $BBcm* [^[:Cc:] $BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] $CM* | # Allow characters that don't meet the - $BBcm* [^$BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] # more elaborate definitions for WORD - ) $WordClosings?; # to be glued. - -$GluedWord = $Openings? $Word15 ((($ZW* $SP)* $GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together. - #$GluedWord = $Openings? $Word15 ((($ZW* $SP)* $Openings) $Word15)*; # "Glue" will stick anything below it together. - # Rules 13, 14 - -# -# The actual rule, a combination of everything defined above. -# -$Openings? $GluedWord $Closings $EndingsSoft{0}; -$Openings? $GluedWord $Closings $EndingsHard{100}; -# $GluedWord; +# $LB 17 +$IDcm $POcm; +$ALcm+ $NUcm; # includes $LB19 +$NUcm $ALcm+; +# LB 18 +$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?; +# LB 19 +$ALcm+; # @@ -161,8 +166,4 @@ $Openings? $GluedWord $Closings $EndingsHard{100}; # containing a space that may inhibit a break from occuring. # -$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($CM* $SP)) | (($CM* $SP)+ $OP); -$ClumpingChars = [^$SP $BK $CR $LF]; - -#!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR); -!.*; \ No newline at end of file +!.*; diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index be974deba11..c9b6242f093 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -2526,13 +2526,10 @@ int32_t RBBILineMonkey::next(int32_t prevPos) { // Depends on the previous char, and whether it eats following CombiningMarks // or not. UChar32 c = fText->char32At(prevPos); - if (c == 0x0d || c == 0x0a || c == 0x85 || fBK->contains(c) || fSP->contains(c)) { - // char doesn't automatically combine with CM. - nextPos = fText->moveIndex32(prevPos, 1); - } else { - nextPos = fCharBI->following(prevPos); + nextPos = fText->moveIndex32(prevPos, 1); + if (!(c == 0x0d || c == 0x0a || c == 0x85 || c == 0x200b /* ZW */ || fBK->contains(c))) { for (;;) { - UChar32 c = fText->char32At(nextPos); + c = fText->char32At(nextPos); if (!fCM->contains(c)) { break; } @@ -2714,12 +2711,9 @@ fall_through_11: } // LB 14a Break around a CB - // NOTE: DISABLE FOR ICU, FOR NOW. Too hard to implement in Rules. - #if 0 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { break; } - #endif // LB 15 if (fBA->contains(thisChar) || diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 3dbc16fad10..7d1641e6737 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -22,6 +22,10 @@ # +# Temp debugging tests + +•\U00011efa\u275d\u0085•\u0c56• +•a\u275d\u0085•\u0c56• ######################################################################################## #