mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
ICU-2924 RBBI, new style rule format, new line break rules. (14 known test failures, will fix real soon.)
X-SVN-Rev: 13364
This commit is contained in:
parent
95996b6773
commit
d4524826ed
13 changed files with 311 additions and 169 deletions
|
@ -2650,7 +2650,8 @@ _uBrkErrorName[U_BRK_ERROR_LIMIT - U_BRK_ERROR_START] = {
|
|||
"U_BRK_NEW_LINE_IN_QUOTED_STRING",
|
||||
"U_BRK_UNDEFINED_VARIABLE",
|
||||
"U_BRK_INIT_ERROR",
|
||||
"U_BRK_RULE_EMPTY_SET"
|
||||
"U_BRK_RULE_EMPTY_SET",
|
||||
"U_BRK_UNRECOGNIZED_OPTION"
|
||||
};
|
||||
|
||||
static const char * const
|
||||
|
|
|
@ -59,6 +59,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
|
|||
fReverseTree = NULL;
|
||||
fForwardTables = NULL;
|
||||
fReverseTables = NULL;
|
||||
fChainRules = FALSE;
|
||||
|
||||
UErrorCode oldstatus = status;
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
//
|
||||
// rbbirb.h
|
||||
//
|
||||
// Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains declarations for several from the Rule Based Break Iterator rule builder.
|
||||
// This file contains declarations for several classes from the
|
||||
// Rule Based Break Iterator rule builder.
|
||||
//
|
||||
|
||||
|
||||
|
@ -122,6 +123,9 @@ public:
|
|||
RBBINode *fForwardTree; // The parse trees, generated by the scanner,
|
||||
RBBINode *fReverseTree; // then manipulated by subsequent steps.
|
||||
|
||||
UBool fChainRules; // True for chained Unicode TR style rules.
|
||||
// False for traditional regexp rules.
|
||||
|
||||
RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
|
||||
UVector *fUSetNodes; // Vector of all uset nodes.
|
||||
|
||||
|
|
|
@ -18,14 +18,15 @@ U_NAMESPACE_BEGIN
|
|||
// Character classes for RBBI rule scanning.
|
||||
//
|
||||
static const uint8_t kRuleSet_digit_char = 128;
|
||||
static const uint8_t kRuleSet_rule_char = 129;
|
||||
static const uint8_t kRuleSet_white_space = 130;
|
||||
static const uint8_t kRuleSet_name_char = 131;
|
||||
static const uint8_t kRuleSet_name_start_char = 132;
|
||||
static const uint8_t kRuleSet_white_space = 129;
|
||||
static const uint8_t kRuleSet_rule_char = 130;
|
||||
static const uint8_t kRuleSet_name_start_char = 131;
|
||||
static const uint8_t kRuleSet_name_char = 132;
|
||||
|
||||
|
||||
enum RBBI_RuleParseAction {
|
||||
doExprOrOperator,
|
||||
doOptionEnd,
|
||||
doRuleErrorAssignExpr,
|
||||
doTagValue,
|
||||
doEndAssign,
|
||||
|
@ -51,6 +52,7 @@ enum RBBI_RuleParseAction {
|
|||
doEndOfRule,
|
||||
doUnaryOpPlus,
|
||||
doExprStart,
|
||||
doOptionStart,
|
||||
doExprCatOperator,
|
||||
doReverseDir,
|
||||
doCheckVarDef,
|
||||
|
@ -73,92 +75,100 @@ struct RBBIRuleTableEl {
|
|||
|
||||
static const struct RBBIRuleTableEl gRuleParseStateTable[] = {
|
||||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doExprStart, 254, 12, 8, FALSE} // 1 start
|
||||
, {doNOP, 130, 1,0, TRUE} // 2
|
||||
, {doExprStart, 36 /* $ */, 71, 81, FALSE} // 3
|
||||
, {doReverseDir, 33 /* ! */, 11,0, TRUE} // 4
|
||||
, {doExprStart, 254, 20, 8, FALSE} // 1 start
|
||||
, {doNOP, 129, 1,0, TRUE} // 2
|
||||
, {doExprStart, 36 /* $ */, 79, 89, FALSE} // 3
|
||||
, {doNOP, 33 /* ! */, 11,0, TRUE} // 4
|
||||
, {doNOP, 59 /* ; */, 1,0, TRUE} // 5
|
||||
, {doNOP, 252, 0,0, FALSE} // 6
|
||||
, {doExprStart, 255, 12, 8, FALSE} // 7
|
||||
, {doExprStart, 255, 20, 8, FALSE} // 7
|
||||
, {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end
|
||||
, {doNOP, 130, 8,0, TRUE} // 9
|
||||
, {doRuleError, 255, 86,0, FALSE} // 10
|
||||
, {doExprStart, 255, 12, 8, FALSE} // 11 reverse-rule
|
||||
, {doRuleChar, 254, 21,0, TRUE} // 12 term
|
||||
, {doNOP, 130, 12,0, TRUE} // 13
|
||||
, {doRuleChar, 129, 21,0, TRUE} // 14
|
||||
, {doNOP, 91 /* [ */, 77, 21, FALSE} // 15
|
||||
, {doLParen, 40 /* ( */, 12, 21, TRUE} // 16
|
||||
, {doNOP, 36 /* $ */, 71, 20, FALSE} // 17
|
||||
, {doDotAny, 46 /* . */, 21,0, TRUE} // 18
|
||||
, {doRuleError, 255, 86,0, FALSE} // 19
|
||||
, {doCheckVarDef, 255, 21,0, FALSE} // 20 term-var-ref
|
||||
, {doNOP, 130, 21,0, TRUE} // 21 expr-mod
|
||||
, {doUnaryOpStar, 42 /* * */, 26,0, TRUE} // 22
|
||||
, {doUnaryOpPlus, 43 /* + */, 26,0, TRUE} // 23
|
||||
, {doUnaryOpQuestion, 63 /* ? */, 26,0, TRUE} // 24
|
||||
, {doNOP, 255, 26,0, FALSE} // 25
|
||||
, {doExprCatOperator, 254, 12,0, FALSE} // 26 expr-cont
|
||||
, {doNOP, 130, 26,0, TRUE} // 27
|
||||
, {doExprCatOperator, 129, 12,0, FALSE} // 28
|
||||
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 29
|
||||
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 30
|
||||
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 31
|
||||
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 32
|
||||
, {doExprCatOperator, 47 /* / */, 38,0, FALSE} // 33
|
||||
, {doExprCatOperator, 123 /* { */, 50,0, TRUE} // 34
|
||||
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 35
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 36
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 37
|
||||
, {doSlash, 47 /* / */, 40,0, TRUE} // 38 look-ahead
|
||||
, {doNOP, 255, 86,0, FALSE} // 39
|
||||
, {doExprCatOperator, 254, 12,0, FALSE} // 40 expr-cont-no-slash
|
||||
, {doNOP, 130, 26,0, TRUE} // 41
|
||||
, {doExprCatOperator, 129, 12,0, FALSE} // 42
|
||||
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 43
|
||||
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 44
|
||||
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 45
|
||||
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 46
|
||||
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 47
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 48
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 49
|
||||
, {doNOP, 130, 50,0, TRUE} // 50 tag-open
|
||||
, {doStartTagValue, 128, 53,0, FALSE} // 51
|
||||
, {doTagExpectedError, 255, 86,0, FALSE} // 52
|
||||
, {doNOP, 130, 57,0, TRUE} // 53 tag-value
|
||||
, {doNOP, 125 /* } */, 57,0, FALSE} // 54
|
||||
, {doTagDigit, 128, 53,0, TRUE} // 55
|
||||
, {doTagExpectedError, 255, 86,0, FALSE} // 56
|
||||
, {doNOP, 130, 57,0, TRUE} // 57 tag-close
|
||||
, {doTagValue, 125 /* } */, 60,0, TRUE} // 58
|
||||
, {doTagExpectedError, 255, 86,0, FALSE} // 59
|
||||
, {doExprCatOperator, 254, 12,0, FALSE} // 60 expr-cont-no-tag
|
||||
, {doNOP, 130, 60,0, TRUE} // 61
|
||||
, {doExprCatOperator, 129, 12,0, FALSE} // 62
|
||||
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 63
|
||||
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 64
|
||||
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 65
|
||||
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 66
|
||||
, {doExprCatOperator, 47 /* / */, 38,0, FALSE} // 67
|
||||
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 68
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 69
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 70
|
||||
, {doStartVariableName, 36 /* $ */, 73,0, TRUE} // 71 scan-var-name
|
||||
, {doNOP, 255, 86,0, FALSE} // 72
|
||||
, {doNOP, 132, 75,0, TRUE} // 73 scan-var-start
|
||||
, {doVariableNameExpectedErr, 255, 86,0, FALSE} // 74
|
||||
, {doNOP, 131, 75,0, TRUE} // 75 scan-var-body
|
||||
, {doEndVariableName, 255, 255,0, FALSE} // 76
|
||||
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 77 scan-unicode-set
|
||||
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 78
|
||||
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 79
|
||||
, {doNOP, 255, 86,0, FALSE} // 80
|
||||
, {doNOP, 130, 81,0, TRUE} // 81 assign-or-rule
|
||||
, {doStartAssign, 61 /* = */, 12, 84, TRUE} // 82
|
||||
, {doNOP, 255, 20, 8, FALSE} // 83
|
||||
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 84 assign-end
|
||||
, {doRuleErrorAssignExpr, 255, 86,0, FALSE} // 85
|
||||
, {doExit, 255, 86,0, TRUE} // 86 errorDeath
|
||||
, {doNOP, 129, 8,0, TRUE} // 9
|
||||
, {doRuleError, 255, 94,0, FALSE} // 10
|
||||
, {doNOP, 33 /* ! */, 13,0, TRUE} // 11 rev-option
|
||||
, {doReverseDir, 255, 19, 8, FALSE} // 12
|
||||
, {doOptionStart, 131, 15,0, TRUE} // 13 option-scan1
|
||||
, {doRuleError, 255, 94,0, FALSE} // 14
|
||||
, {doNOP, 132, 15,0, TRUE} // 15 option-scan2
|
||||
, {doOptionEnd, 129, 1,0, FALSE} // 16
|
||||
, {doOptionEnd, 59 /* ; */, 1,0, FALSE} // 17
|
||||
, {doRuleError, 255, 94,0, FALSE} // 18
|
||||
, {doExprStart, 255, 20, 8, FALSE} // 19 reverse-rule
|
||||
, {doRuleChar, 254, 29,0, TRUE} // 20 term
|
||||
, {doNOP, 129, 20,0, TRUE} // 21
|
||||
, {doRuleChar, 130, 29,0, TRUE} // 22
|
||||
, {doNOP, 91 /* [ */, 85, 29, FALSE} // 23
|
||||
, {doLParen, 40 /* ( */, 20, 29, TRUE} // 24
|
||||
, {doNOP, 36 /* $ */, 79, 28, FALSE} // 25
|
||||
, {doDotAny, 46 /* . */, 29,0, TRUE} // 26
|
||||
, {doRuleError, 255, 94,0, FALSE} // 27
|
||||
, {doCheckVarDef, 255, 29,0, FALSE} // 28 term-var-ref
|
||||
, {doNOP, 129, 29,0, TRUE} // 29 expr-mod
|
||||
, {doUnaryOpStar, 42 /* * */, 34,0, TRUE} // 30
|
||||
, {doUnaryOpPlus, 43 /* + */, 34,0, TRUE} // 31
|
||||
, {doUnaryOpQuestion, 63 /* ? */, 34,0, TRUE} // 32
|
||||
, {doNOP, 255, 34,0, FALSE} // 33
|
||||
, {doExprCatOperator, 254, 20,0, FALSE} // 34 expr-cont
|
||||
, {doNOP, 129, 34,0, TRUE} // 35
|
||||
, {doExprCatOperator, 130, 20,0, FALSE} // 36
|
||||
, {doExprCatOperator, 91 /* [ */, 20,0, FALSE} // 37
|
||||
, {doExprCatOperator, 40 /* ( */, 20,0, FALSE} // 38
|
||||
, {doExprCatOperator, 36 /* $ */, 20,0, FALSE} // 39
|
||||
, {doExprCatOperator, 46 /* . */, 20,0, FALSE} // 40
|
||||
, {doExprCatOperator, 47 /* / */, 46,0, FALSE} // 41
|
||||
, {doExprCatOperator, 123 /* { */, 58,0, TRUE} // 42
|
||||
, {doExprOrOperator, 124 /* | */, 20,0, TRUE} // 43
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 44
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 45
|
||||
, {doSlash, 47 /* / */, 48,0, TRUE} // 46 look-ahead
|
||||
, {doNOP, 255, 94,0, FALSE} // 47
|
||||
, {doExprCatOperator, 254, 20,0, FALSE} // 48 expr-cont-no-slash
|
||||
, {doNOP, 129, 34,0, TRUE} // 49
|
||||
, {doExprCatOperator, 130, 20,0, FALSE} // 50
|
||||
, {doExprCatOperator, 91 /* [ */, 20,0, FALSE} // 51
|
||||
, {doExprCatOperator, 40 /* ( */, 20,0, FALSE} // 52
|
||||
, {doExprCatOperator, 36 /* $ */, 20,0, FALSE} // 53
|
||||
, {doExprCatOperator, 46 /* . */, 20,0, FALSE} // 54
|
||||
, {doExprOrOperator, 124 /* | */, 20,0, TRUE} // 55
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 56
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 57
|
||||
, {doNOP, 129, 58,0, TRUE} // 58 tag-open
|
||||
, {doStartTagValue, 128, 61,0, FALSE} // 59
|
||||
, {doTagExpectedError, 255, 94,0, FALSE} // 60
|
||||
, {doNOP, 129, 65,0, TRUE} // 61 tag-value
|
||||
, {doNOP, 125 /* } */, 65,0, FALSE} // 62
|
||||
, {doTagDigit, 128, 61,0, TRUE} // 63
|
||||
, {doTagExpectedError, 255, 94,0, FALSE} // 64
|
||||
, {doNOP, 129, 65,0, TRUE} // 65 tag-close
|
||||
, {doTagValue, 125 /* } */, 68,0, TRUE} // 66
|
||||
, {doTagExpectedError, 255, 94,0, FALSE} // 67
|
||||
, {doExprCatOperator, 254, 20,0, FALSE} // 68 expr-cont-no-tag
|
||||
, {doNOP, 129, 68,0, TRUE} // 69
|
||||
, {doExprCatOperator, 130, 20,0, FALSE} // 70
|
||||
, {doExprCatOperator, 91 /* [ */, 20,0, FALSE} // 71
|
||||
, {doExprCatOperator, 40 /* ( */, 20,0, FALSE} // 72
|
||||
, {doExprCatOperator, 36 /* $ */, 20,0, FALSE} // 73
|
||||
, {doExprCatOperator, 46 /* . */, 20,0, FALSE} // 74
|
||||
, {doExprCatOperator, 47 /* / */, 46,0, FALSE} // 75
|
||||
, {doExprOrOperator, 124 /* | */, 20,0, TRUE} // 76
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 77
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 78
|
||||
, {doStartVariableName, 36 /* $ */, 81,0, TRUE} // 79 scan-var-name
|
||||
, {doNOP, 255, 94,0, FALSE} // 80
|
||||
, {doNOP, 131, 83,0, TRUE} // 81 scan-var-start
|
||||
, {doVariableNameExpectedErr, 255, 94,0, FALSE} // 82
|
||||
, {doNOP, 132, 83,0, TRUE} // 83 scan-var-body
|
||||
, {doEndVariableName, 255, 255,0, FALSE} // 84
|
||||
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 85 scan-unicode-set
|
||||
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 86
|
||||
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 87
|
||||
, {doNOP, 255, 94,0, FALSE} // 88
|
||||
, {doNOP, 129, 89,0, TRUE} // 89 assign-or-rule
|
||||
, {doStartAssign, 61 /* = */, 20, 92, TRUE} // 90
|
||||
, {doNOP, 255, 28, 8, FALSE} // 91
|
||||
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 92 assign-end
|
||||
, {doRuleErrorAssignExpr, 255, 94,0, FALSE} // 93
|
||||
, {doExit, 255, 94,0, TRUE} // 94 errorDeath
|
||||
};
|
||||
static const char * const RBBIRuleStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -170,6 +180,14 @@ static const char * const RBBIRuleStateNames[] = { 0,
|
|||
0,
|
||||
"break-rule-end",
|
||||
0,
|
||||
0,
|
||||
"rev-option",
|
||||
0,
|
||||
"option-scan1",
|
||||
0,
|
||||
"option-scan2",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"reverse-rule",
|
||||
"term",
|
||||
|
|
|
@ -58,7 +58,7 @@ start:
|
|||
escaped term ^break-rule-end doExprStart
|
||||
white_space n start
|
||||
'$' scan-var-name ^assign-or-rule doExprStart
|
||||
'!' n reverse-rule doReverseDir
|
||||
'!' n rev-option
|
||||
';' n start # ignore empty rules.
|
||||
eof exit
|
||||
default term ^break-rule-end doExprStart
|
||||
|
@ -73,9 +73,25 @@ break-rule-end:
|
|||
|
||||
|
||||
#
|
||||
# Reverse Rule We've just scanned a '!', indicating a reverse direction rule.
|
||||
# A rule expression must follow.
|
||||
# ! We've just scanned a '!', indicating either a !!key word flag or a
|
||||
# !Reverse rule.
|
||||
#
|
||||
rev-option:
|
||||
'!' n option-scan1
|
||||
default reverse-rule ^break-rule-end doReverseDir
|
||||
|
||||
option-scan1:
|
||||
name_start_char n option-scan2 doOptionStart
|
||||
default errorDeath doRuleError
|
||||
|
||||
option-scan2:
|
||||
name_char n option-scan2
|
||||
white_space start doOptionEnd
|
||||
';' start doOptionEnd
|
||||
default errorDeath doRuleError
|
||||
|
||||
|
||||
|
||||
reverse-rule:
|
||||
default term ^break-rule-end doExprStart
|
||||
|
||||
|
|
|
@ -459,6 +459,21 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action)
|
|||
break;
|
||||
|
||||
|
||||
case doOptionStart:
|
||||
// Scanning a !!option. At the start of string.
|
||||
fOptionStart = fScanIndex;
|
||||
break;
|
||||
|
||||
case doOptionEnd:
|
||||
{
|
||||
UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
|
||||
if (opt == "chain") {
|
||||
fRB->fChainRules = TRUE;
|
||||
} else {
|
||||
error(U_BRK_UNRECOGNIZED_OPTION);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case doReverseDir:
|
||||
fReverseRule = TRUE;
|
||||
|
|
|
@ -145,6 +145,9 @@ private:
|
|||
|
||||
int32_t fRuleNum; // Counts each rule as it is scanned.
|
||||
|
||||
int32_t fOptionStart; // Input index of start of a !!option
|
||||
// keyword, while being scanned.
|
||||
|
||||
UnicodeSet *gRuleSet_rule_char;
|
||||
UnicodeSet *gRuleSet_white_space;
|
||||
UnicodeSet *gRuleSet_name_char;
|
||||
|
|
|
@ -119,6 +119,13 @@ void RBBITableBuilder::build() {
|
|||
printPosSets(fTree);
|
||||
}
|
||||
|
||||
//
|
||||
// For "chained" rules, modify the followPos sets
|
||||
//
|
||||
if (fRB->fChainRules) {
|
||||
calcChainedFollowPos(fTree);
|
||||
}
|
||||
|
||||
//
|
||||
// Build the DFA state transition tables.
|
||||
//
|
||||
|
@ -310,6 +317,82 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// calcChainedFollowPos. Modify the previously calculated followPos sets
|
||||
// to implement rule chaining. NOT described by Aho
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
|
||||
|
||||
UVector endMarkerNodes(*fStatus);
|
||||
UVector leafNodes(*fStatus);
|
||||
int32_t i;
|
||||
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// get a list of all endmarker nodes.
|
||||
fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
|
||||
|
||||
// get a list of all leaf nodes
|
||||
fTree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Get all nodes that can be the start of a match, which is FirstPosition(root)
|
||||
UVector *matchStartNodes = fTree->fFirstPosSet;
|
||||
|
||||
|
||||
// Iterate over all leaf nodes,
|
||||
//
|
||||
int32_t endNodeIx;
|
||||
int32_t startNodeIx;
|
||||
for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
|
||||
RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
|
||||
RBBINode *endNode = NULL;
|
||||
|
||||
// Identify leaf nodes that correspond to overall rule match positions.
|
||||
// These include an endMarkerNode in their followPos sets.
|
||||
for (i=0; i<endMarkerNodes.size(); i++) {
|
||||
if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
|
||||
endNode = tNode;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (endNode == NULL) {
|
||||
// node wasn't an end node. Try again with the next.
|
||||
continue;
|
||||
}
|
||||
|
||||
// We've got a node that can end a match.
|
||||
// Now iterate over the nodes that can start a match, looking for ones
|
||||
// with the same char class as our ending node.
|
||||
RBBINode *startNode;
|
||||
for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
|
||||
startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
|
||||
if (startNode->fType != RBBINode::leafChar) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (endNode->fVal == startNode->fVal) {
|
||||
// The end val (character class) of one possible match is the
|
||||
// same as the start of another.
|
||||
|
||||
// Add all nodes from the followPos of the start node to the
|
||||
// followPos set of the end node, which will have the effect of
|
||||
// letting matches transition from a match state at endNode
|
||||
// to the second char of a match starting with startNode.
|
||||
setAdd(endNode->fFollowPos, startNode->fFollowPos);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// buildStateTable() Determine the set of runtime DFA states and the
|
||||
|
|
|
@ -49,6 +49,7 @@ private:
|
|||
void calcFirstPos(RBBINode *n);
|
||||
void calcLastPos(RBBINode *n);
|
||||
void calcFollowPos(RBBINode *n);
|
||||
void calcChainedFollowPos(RBBINode *n);
|
||||
void buildStateTable();
|
||||
void flagAcceptingStates();
|
||||
void flagLookAheadStates();
|
||||
|
|
|
@ -625,6 +625,7 @@ typedef enum UErrorCode {
|
|||
U_BRK_UNDEFINED_VARIABLE, /**< Use of an undefined $Variable in an RBBI rule. */
|
||||
U_BRK_INIT_ERROR, /**< Initialization failure. Probable missing ICU Data. */
|
||||
U_BRK_RULE_EMPTY_SET, /**< Rule contains an empty Unicode Set. */
|
||||
U_BRK_UNRECOGNIZED_OPTION, /**< !!option in RBBI rules not recognized. */
|
||||
U_BRK_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for Break Iterator failures */
|
||||
|
||||
/*
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
|
@ -82,75 +84,78 @@ $SPcm = $SP $CM*;
|
|||
$SYcm = $SY $CM*;
|
||||
|
||||
|
||||
# New Lines. Always break after, never break before.
|
||||
# Rule LB 3
|
||||
#
|
||||
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
|
||||
# Because we never break before these things, $Endings
|
||||
# appears at the end of line break rule.
|
||||
#
|
||||
$NLF = $BK | $CR | $LF | $NL | $CR $LF;
|
||||
$EndingsSoft = ($ZW* $SP)* $ZW*;
|
||||
$EndingsHard = ($ZW* $SP)* $ZW* $NLF;
|
||||
#
|
||||
# Rule LB 3
|
||||
$LB3NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB3NonBreaks ($BK | $CR | $LF | $NL){100};
|
||||
$CR $LF {100};
|
||||
|
||||
# LB 4 x SP
|
||||
# x ZW
|
||||
$LB3NonBreaks [$SP $ZW];
|
||||
|
||||
# LB 5 Break after zero width space
|
||||
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 7 Combining marks. TODO: get it right!
|
||||
# $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
[$LB5NonBreaks] $CM*; # Stick together any combining sequences that don't match other rules.
|
||||
|
||||
# LB 8
|
||||
$LB5NonBreaks [$CL $EX $IS $SY];
|
||||
|
||||
# LB 9
|
||||
$OPcm $SP* $LB3NonBreaks?; # Need to force trailing $BKs to rule 3, to get status right.
|
||||
$OPcm $SP* [$LB5NonBreaks] $CM*;
|
||||
|
||||
# LB 10
|
||||
$QUcm $SP* $OPcm;
|
||||
|
||||
# LB 11
|
||||
$CLcm $SP* $NScm;
|
||||
|
||||
# LB 11a
|
||||
($B2cm)+;
|
||||
|
||||
# LB 11b
|
||||
$LB5NonBreaks $GLcm $LB3NonBreaks?;
|
||||
$LB5NonBreaks $GLcm [$LB5NonBreaks] $CM*;
|
||||
$GLcm $LB3NonBreaks?;
|
||||
$GLcm [$LB5NonBreaks] $CM*;
|
||||
|
||||
# LB 12
|
||||
$LB12NonBreaks = [[$LB5NonBreaks] - [$SP]];
|
||||
|
||||
# LB 14
|
||||
$LB12NonBreaks $QUcm+ $LB3NonBreaks?;
|
||||
$LB12NonBreaks $QUcm+ [$LB5NonBreaks] $CM*;
|
||||
$QUcm $LB3NonBreaks?;
|
||||
$QUcm [$LB5NonBreaks] $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
|
||||
# LB 14a
|
||||
$LB14NonBreaks = [[$LB12NonBreaks] - [$CB]];
|
||||
|
||||
|
||||
#
|
||||
# Openings Sequences that can precede Words, and that should not be separated from them.
|
||||
# Rules LB 9, 10
|
||||
#
|
||||
$Openings = ((($QUcm ($ZW* $SP)*)? $OPcm ($ZW* $SP)*) | $GLcm)+;
|
||||
# LB 15
|
||||
$LB14NonBreaks ($BAcm | $HYcm | $NScm);
|
||||
$BBcm [^$CB];
|
||||
|
||||
#
|
||||
# Closings Seqences that follow words, and that should not be separated from them,
|
||||
# Rule LB 8, 11, 15
|
||||
$Closings = (($ZW* $SP)*( ($CLcm (($ZW* $SP)* $NScm)? | $EX | $IS | $SY) $CM*) | $BAcm | $HYcm | $NScm)*;
|
||||
# LB 16
|
||||
($ALcm | $IDcm | $INcm | $NUcm) $INcm*;
|
||||
|
||||
$WordClosings = ($SP* $CLcm | $SP* $EXcm | $SP* $IScm | $SP* $SYcm | $BAcm | $HYcm | $NScm)*;
|
||||
|
||||
#
|
||||
# Words. Includes mixed Alpha-numerics.
|
||||
# Rules 11a, 16, 17, 19, more or less.
|
||||
#
|
||||
$Number = $PRcm? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbers
|
||||
# Regex form, rather than rule 18
|
||||
|
||||
# Alpha-numeric. 16, 17
|
||||
$Word = ($ALcm | $NUcm)+ $INcm* |
|
||||
$IDcm ($POcm? | $INcm*) |
|
||||
$CM+ ($POcm? | $INcm*) | # CM with no base is like ID (LB 7a)
|
||||
$INcm+ |
|
||||
$CB; # Deviation from Unicode spec for $CB
|
||||
# We treat as a single char word
|
||||
|
||||
$Dashes = (($B2cm ($ZW* $SP)*)*); # Dashes 11a
|
||||
|
||||
|
||||
|
||||
$HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?; # For Rle LB15, Don't break before Hypen-minus,
|
||||
# we also need to match a whole number, if that
|
||||
# is what follows the '-'
|
||||
|
||||
|
||||
|
||||
$Word15 = $Openings? (
|
||||
($BBcm* $Openings? ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) | # Rule 15. Stuff sticks around words.
|
||||
$BBcm* [^[:Cc:] $BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] $CM* | # Allow characters that don't meet the
|
||||
$BBcm* [^$BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] # more elaborate definitions for WORD
|
||||
) $WordClosings?; # to be glued.
|
||||
|
||||
$GluedWord = $Openings? $Word15 ((($ZW* $SP)* $GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
|
||||
#$GluedWord = $Openings? $Word15 ((($ZW* $SP)* $Openings) $Word15)*; # "Glue" will stick anything below it together.
|
||||
# Rules 13, 14
|
||||
|
||||
#
|
||||
# The actual rule, a combination of everything defined above.
|
||||
#
|
||||
$Openings? $GluedWord $Closings $EndingsSoft{0};
|
||||
$Openings? $GluedWord $Closings $EndingsHard{100};
|
||||
# $GluedWord;
|
||||
# $LB 17
|
||||
$IDcm $POcm;
|
||||
$ALcm+ $NUcm; # includes $LB19
|
||||
$NUcm $ALcm+;
|
||||
|
||||
|
||||
# LB 18
|
||||
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
|
||||
|
||||
# LB 19
|
||||
$ALcm+;
|
||||
|
||||
|
||||
#
|
||||
|
@ -161,8 +166,4 @@ $Openings? $GluedWord $Closings $EndingsHard{100};
|
|||
# containing a space that may inhibit a break from occurring.
|
||||
#
|
||||
|
||||
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($CM* $SP)) | (($CM* $SP)+ $OP);
|
||||
$ClumpingChars = [^$SP $BK $CR $LF];
|
||||
|
||||
#!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
|
||||
!.*;
|
||||
!.*;
|
||||
|
|
|
@ -2526,13 +2526,10 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
|
|||
// Depends on the previous char, and whether it eats following CombiningMarks
|
||||
// or not.
|
||||
UChar32 c = fText->char32At(prevPos);
|
||||
if (c == 0x0d || c == 0x0a || c == 0x85 || fBK->contains(c) || fSP->contains(c)) {
|
||||
// char doesn't automatically combine with CM.
|
||||
nextPos = fText->moveIndex32(prevPos, 1);
|
||||
} else {
|
||||
nextPos = fCharBI->following(prevPos);
|
||||
nextPos = fText->moveIndex32(prevPos, 1);
|
||||
if (!(c == 0x0d || c == 0x0a || c == 0x85 || c == 0x200b /* ZW */ || fBK->contains(c))) {
|
||||
for (;;) {
|
||||
UChar32 c = fText->char32At(nextPos);
|
||||
c = fText->char32At(nextPos);
|
||||
if (!fCM->contains(c)) {
|
||||
break;
|
||||
}
|
||||
|
@ -2714,12 +2711,9 @@ fall_through_11:
|
|||
}
|
||||
|
||||
// LB 14a Break around a CB
|
||||
// NOTE: DISABLE FOR ICU, FOR NOW. Too hard to implement in Rules.
|
||||
#if 0
|
||||
if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
// LB 15
|
||||
if (fBA->contains(thisChar) ||
|
||||
|
|
4
icu4c/source/test/testdata/rbbitst.txt
vendored
4
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -22,6 +22,10 @@
|
|||
#
|
||||
|
||||
|
||||
# Temp debugging tests
|
||||
<line>
|
||||
<data>•\U00011efa\u275d\u0085•\u0c56•</data>
|
||||
<data>•a\u275d\u0085•\u0c56•</data>
|
||||
|
||||
########################################################################################
|
||||
#
|
||||
|
|
Loading…
Add table
Reference in a new issue