ICU-2924 RBBI, new style rule format, new line break rules. (14 known test failures, will fix real soon.)

X-SVN-Rev: 13364
This commit is contained in:
Andy Heninger 2003-10-09 01:13:08 +00:00
parent 95996b6773
commit d4524826ed
13 changed files with 311 additions and 169 deletions

View file

@ -2650,7 +2650,8 @@ _uBrkErrorName[U_BRK_ERROR_LIMIT - U_BRK_ERROR_START] = {
"U_BRK_NEW_LINE_IN_QUOTED_STRING",
"U_BRK_UNDEFINED_VARIABLE",
"U_BRK_INIT_ERROR",
"U_BRK_RULE_EMPTY_SET"
"U_BRK_RULE_EMPTY_SET",
"U_BRK_UNRECOGNIZED_OPTION"
};
static const char * const

View file

@ -59,6 +59,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
fReverseTree = NULL;
fForwardTables = NULL;
fReverseTables = NULL;
fChainRules = FALSE;
UErrorCode oldstatus = status;

View file

@ -1,10 +1,11 @@
//
// rbbirb.h
//
// Copyright (C) 2002, International Business Machines Corporation and others.
// Copyright (C) 2002-2003, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for several from the Rule Based Break Iterator rule builder.
// This file contains declarations for several classes from the
// Rule Based Break Iterator rule builder.
//
@ -122,6 +123,9 @@ public:
RBBINode *fForwardTree; // The parse trees, generated by the scanner,
RBBINode *fReverseTree; // then manipulated by subsequent steps.
UBool fChainRules; // True for chained Unicode TR style rules.
// False for traditional regexp rules.
RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
UVector *fUSetNodes; // Vector of all uset nodes.

View file

@ -18,14 +18,15 @@ U_NAMESPACE_BEGIN
// Character classes for RBBI rule scanning.
//
static const uint8_t kRuleSet_digit_char = 128;
static const uint8_t kRuleSet_rule_char = 129;
static const uint8_t kRuleSet_white_space = 130;
static const uint8_t kRuleSet_name_char = 131;
static const uint8_t kRuleSet_name_start_char = 132;
static const uint8_t kRuleSet_white_space = 129;
static const uint8_t kRuleSet_rule_char = 130;
static const uint8_t kRuleSet_name_start_char = 131;
static const uint8_t kRuleSet_name_char = 132;
enum RBBI_RuleParseAction {
doExprOrOperator,
doOptionEnd,
doRuleErrorAssignExpr,
doTagValue,
doEndAssign,
@ -51,6 +52,7 @@ enum RBBI_RuleParseAction {
doEndOfRule,
doUnaryOpPlus,
doExprStart,
doOptionStart,
doExprCatOperator,
doReverseDir,
doCheckVarDef,
@ -73,92 +75,100 @@ struct RBBIRuleTableEl {
static const struct RBBIRuleTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doExprStart, 254, 12, 8, FALSE} // 1 start
, {doNOP, 130, 1,0, TRUE} // 2
, {doExprStart, 36 /* $ */, 71, 81, FALSE} // 3
, {doReverseDir, 33 /* ! */, 11,0, TRUE} // 4
, {doExprStart, 254, 20, 8, FALSE} // 1 start
, {doNOP, 129, 1,0, TRUE} // 2
, {doExprStart, 36 /* $ */, 79, 89, FALSE} // 3
, {doNOP, 33 /* ! */, 11,0, TRUE} // 4
, {doNOP, 59 /* ; */, 1,0, TRUE} // 5
, {doNOP, 252, 0,0, FALSE} // 6
, {doExprStart, 255, 12, 8, FALSE} // 7
, {doExprStart, 255, 20, 8, FALSE} // 7
, {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end
, {doNOP, 130, 8,0, TRUE} // 9
, {doRuleError, 255, 86,0, FALSE} // 10
, {doExprStart, 255, 12, 8, FALSE} // 11 reverse-rule
, {doRuleChar, 254, 21,0, TRUE} // 12 term
, {doNOP, 130, 12,0, TRUE} // 13
, {doRuleChar, 129, 21,0, TRUE} // 14
, {doNOP, 91 /* [ */, 77, 21, FALSE} // 15
, {doLParen, 40 /* ( */, 12, 21, TRUE} // 16
, {doNOP, 36 /* $ */, 71, 20, FALSE} // 17
, {doDotAny, 46 /* . */, 21,0, TRUE} // 18
, {doRuleError, 255, 86,0, FALSE} // 19
, {doCheckVarDef, 255, 21,0, FALSE} // 20 term-var-ref
, {doNOP, 130, 21,0, TRUE} // 21 expr-mod
, {doUnaryOpStar, 42 /* * */, 26,0, TRUE} // 22
, {doUnaryOpPlus, 43 /* + */, 26,0, TRUE} // 23
, {doUnaryOpQuestion, 63 /* ? */, 26,0, TRUE} // 24
, {doNOP, 255, 26,0, FALSE} // 25
, {doExprCatOperator, 254, 12,0, FALSE} // 26 expr-cont
, {doNOP, 130, 26,0, TRUE} // 27
, {doExprCatOperator, 129, 12,0, FALSE} // 28
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 29
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 30
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 31
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 32
, {doExprCatOperator, 47 /* / */, 38,0, FALSE} // 33
, {doExprCatOperator, 123 /* { */, 50,0, TRUE} // 34
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 35
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 36
, {doExprFinished, 255, 255,0, FALSE} // 37
, {doSlash, 47 /* / */, 40,0, TRUE} // 38 look-ahead
, {doNOP, 255, 86,0, FALSE} // 39
, {doExprCatOperator, 254, 12,0, FALSE} // 40 expr-cont-no-slash
, {doNOP, 130, 26,0, TRUE} // 41
, {doExprCatOperator, 129, 12,0, FALSE} // 42
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 43
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 44
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 45
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 46
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 47
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 48
, {doExprFinished, 255, 255,0, FALSE} // 49
, {doNOP, 130, 50,0, TRUE} // 50 tag-open
, {doStartTagValue, 128, 53,0, FALSE} // 51
, {doTagExpectedError, 255, 86,0, FALSE} // 52
, {doNOP, 130, 57,0, TRUE} // 53 tag-value
, {doNOP, 125 /* } */, 57,0, FALSE} // 54
, {doTagDigit, 128, 53,0, TRUE} // 55
, {doTagExpectedError, 255, 86,0, FALSE} // 56
, {doNOP, 130, 57,0, TRUE} // 57 tag-close
, {doTagValue, 125 /* } */, 60,0, TRUE} // 58
, {doTagExpectedError, 255, 86,0, FALSE} // 59
, {doExprCatOperator, 254, 12,0, FALSE} // 60 expr-cont-no-tag
, {doNOP, 130, 60,0, TRUE} // 61
, {doExprCatOperator, 129, 12,0, FALSE} // 62
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 63
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 64
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 65
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 66
, {doExprCatOperator, 47 /* / */, 38,0, FALSE} // 67
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 68
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 69
, {doExprFinished, 255, 255,0, FALSE} // 70
, {doStartVariableName, 36 /* $ */, 73,0, TRUE} // 71 scan-var-name
, {doNOP, 255, 86,0, FALSE} // 72
, {doNOP, 132, 75,0, TRUE} // 73 scan-var-start
, {doVariableNameExpectedErr, 255, 86,0, FALSE} // 74
, {doNOP, 131, 75,0, TRUE} // 75 scan-var-body
, {doEndVariableName, 255, 255,0, FALSE} // 76
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 77 scan-unicode-set
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 78
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 79
, {doNOP, 255, 86,0, FALSE} // 80
, {doNOP, 130, 81,0, TRUE} // 81 assign-or-rule
, {doStartAssign, 61 /* = */, 12, 84, TRUE} // 82
, {doNOP, 255, 20, 8, FALSE} // 83
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 84 assign-end
, {doRuleErrorAssignExpr, 255, 86,0, FALSE} // 85
, {doExit, 255, 86,0, TRUE} // 86 errorDeath
, {doNOP, 129, 8,0, TRUE} // 9
, {doRuleError, 255, 94,0, FALSE} // 10
, {doNOP, 33 /* ! */, 13,0, TRUE} // 11 rev-option
, {doReverseDir, 255, 19, 8, FALSE} // 12
, {doOptionStart, 131, 15,0, TRUE} // 13 option-scan1
, {doRuleError, 255, 94,0, FALSE} // 14
, {doNOP, 132, 15,0, TRUE} // 15 option-scan2
, {doOptionEnd, 129, 1,0, FALSE} // 16
, {doOptionEnd, 59 /* ; */, 1,0, FALSE} // 17
, {doRuleError, 255, 94,0, FALSE} // 18
, {doExprStart, 255, 20, 8, FALSE} // 19 reverse-rule
, {doRuleChar, 254, 29,0, TRUE} // 20 term
, {doNOP, 129, 20,0, TRUE} // 21
, {doRuleChar, 130, 29,0, TRUE} // 22
, {doNOP, 91 /* [ */, 85, 29, FALSE} // 23
, {doLParen, 40 /* ( */, 20, 29, TRUE} // 24
, {doNOP, 36 /* $ */, 79, 28, FALSE} // 25
, {doDotAny, 46 /* . */, 29,0, TRUE} // 26
, {doRuleError, 255, 94,0, FALSE} // 27
, {doCheckVarDef, 255, 29,0, FALSE} // 28 term-var-ref
, {doNOP, 129, 29,0, TRUE} // 29 expr-mod
, {doUnaryOpStar, 42 /* * */, 34,0, TRUE} // 30
, {doUnaryOpPlus, 43 /* + */, 34,0, TRUE} // 31
, {doUnaryOpQuestion, 63 /* ? */, 34,0, TRUE} // 32
, {doNOP, 255, 34,0, FALSE} // 33
, {doExprCatOperator, 254, 20,0, FALSE} // 34 expr-cont
, {doNOP, 129, 34,0, TRUE} // 35
, {doExprCatOperator, 130, 20,0, FALSE} // 36
, {doExprCatOperator, 91 /* [ */, 20,0, FALSE} // 37
, {doExprCatOperator, 40 /* ( */, 20,0, FALSE} // 38
, {doExprCatOperator, 36 /* $ */, 20,0, FALSE} // 39
, {doExprCatOperator, 46 /* . */, 20,0, FALSE} // 40
, {doExprCatOperator, 47 /* / */, 46,0, FALSE} // 41
, {doExprCatOperator, 123 /* { */, 58,0, TRUE} // 42
, {doExprOrOperator, 124 /* | */, 20,0, TRUE} // 43
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 44
, {doExprFinished, 255, 255,0, FALSE} // 45
, {doSlash, 47 /* / */, 48,0, TRUE} // 46 look-ahead
, {doNOP, 255, 94,0, FALSE} // 47
, {doExprCatOperator, 254, 20,0, FALSE} // 48 expr-cont-no-slash
, {doNOP, 129, 34,0, TRUE} // 49
, {doExprCatOperator, 130, 20,0, FALSE} // 50
, {doExprCatOperator, 91 /* [ */, 20,0, FALSE} // 51
, {doExprCatOperator, 40 /* ( */, 20,0, FALSE} // 52
, {doExprCatOperator, 36 /* $ */, 20,0, FALSE} // 53
, {doExprCatOperator, 46 /* . */, 20,0, FALSE} // 54
, {doExprOrOperator, 124 /* | */, 20,0, TRUE} // 55
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 56
, {doExprFinished, 255, 255,0, FALSE} // 57
, {doNOP, 129, 58,0, TRUE} // 58 tag-open
, {doStartTagValue, 128, 61,0, FALSE} // 59
, {doTagExpectedError, 255, 94,0, FALSE} // 60
, {doNOP, 129, 65,0, TRUE} // 61 tag-value
, {doNOP, 125 /* } */, 65,0, FALSE} // 62
, {doTagDigit, 128, 61,0, TRUE} // 63
, {doTagExpectedError, 255, 94,0, FALSE} // 64
, {doNOP, 129, 65,0, TRUE} // 65 tag-close
, {doTagValue, 125 /* } */, 68,0, TRUE} // 66
, {doTagExpectedError, 255, 94,0, FALSE} // 67
, {doExprCatOperator, 254, 20,0, FALSE} // 68 expr-cont-no-tag
, {doNOP, 129, 68,0, TRUE} // 69
, {doExprCatOperator, 130, 20,0, FALSE} // 70
, {doExprCatOperator, 91 /* [ */, 20,0, FALSE} // 71
, {doExprCatOperator, 40 /* ( */, 20,0, FALSE} // 72
, {doExprCatOperator, 36 /* $ */, 20,0, FALSE} // 73
, {doExprCatOperator, 46 /* . */, 20,0, FALSE} // 74
, {doExprCatOperator, 47 /* / */, 46,0, FALSE} // 75
, {doExprOrOperator, 124 /* | */, 20,0, TRUE} // 76
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 77
, {doExprFinished, 255, 255,0, FALSE} // 78
, {doStartVariableName, 36 /* $ */, 81,0, TRUE} // 79 scan-var-name
, {doNOP, 255, 94,0, FALSE} // 80
, {doNOP, 131, 83,0, TRUE} // 81 scan-var-start
, {doVariableNameExpectedErr, 255, 94,0, FALSE} // 82
, {doNOP, 132, 83,0, TRUE} // 83 scan-var-body
, {doEndVariableName, 255, 255,0, FALSE} // 84
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 85 scan-unicode-set
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 86
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 87
, {doNOP, 255, 94,0, FALSE} // 88
, {doNOP, 129, 89,0, TRUE} // 89 assign-or-rule
, {doStartAssign, 61 /* = */, 20, 92, TRUE} // 90
, {doNOP, 255, 28, 8, FALSE} // 91
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 92 assign-end
, {doRuleErrorAssignExpr, 255, 94,0, FALSE} // 93
, {doExit, 255, 94,0, TRUE} // 94 errorDeath
};
static const char * const RBBIRuleStateNames[] = { 0,
"start",
@ -170,6 +180,14 @@ static const char * const RBBIRuleStateNames[] = { 0,
0,
"break-rule-end",
0,
0,
"rev-option",
0,
"option-scan1",
0,
"option-scan2",
0,
0,
0,
"reverse-rule",
"term",

View file

@ -58,7 +58,7 @@ start:
escaped term ^break-rule-end doExprStart
white_space n start
'$' scan-var-name ^assign-or-rule doExprStart
'!' n reverse-rule doReverseDir
'!' n rev-option
';' n start # ignore empty rules.
eof exit
default term ^break-rule-end doExprStart
@ -73,9 +73,25 @@ break-rule-end:
#
# Reverse Rule We've just scanned a '!', indicating a reverse direction rule.
# A rule expression must follow.
# ! We've just scanned a '!', indicating either a !!key word flag or a
# !Reverse rule.
#
rev-option:
'!' n option-scan1
default reverse-rule ^break-rule-end doReverseDir
option-scan1:
name_start_char n option-scan2 doOptionStart
default errorDeath doRuleError
option-scan2:
name_char n option-scan2
white_space start doOptionEnd
';' start doOptionEnd
default errorDeath doRuleError
reverse-rule:
default term ^break-rule-end doExprStart

View file

@ -459,6 +459,21 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action)
break;
case doOptionStart:
// Scanning a !!option. At the start of string.
fOptionStart = fScanIndex;
break;
case doOptionEnd:
{
UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
if (opt == "chain") {
fRB->fChainRules = TRUE;
} else {
error(U_BRK_UNRECOGNIZED_OPTION);
}
}
break;
case doReverseDir:
fReverseRule = TRUE;

View file

@ -145,6 +145,9 @@ private:
int32_t fRuleNum; // Counts each rule as it is scanned.
int32_t fOptionStart; // Input index of start of a !!option
// keyword, while being scanned.
UnicodeSet *gRuleSet_rule_char;
UnicodeSet *gRuleSet_white_space;
UnicodeSet *gRuleSet_name_char;

View file

@ -119,6 +119,13 @@ void RBBITableBuilder::build() {
printPosSets(fTree);
}
//
// For "chained" rules, modify the followPos sets
//
if (fRB->fChainRules) {
calcChainedFollowPos(fTree);
}
//
// Build the DFA state transition tables.
//
@ -310,6 +317,82 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) {
}
//-----------------------------------------------------------------------------
//
//   calcChainedFollowPos.   Adjust the already-computed followPos sets so
//                           that rules chain together:  a position that can
//                           end a match picks up the followPos of every
//                           match-start position that has the same value
//                           (fVal, the character class).
//                           This step is NOT part of the algorithm
//                           described by Aho.
//
//                           Note: the parameter has the same name as the
//                           fTree value that build() passes in.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {

    UVector   endMarks(*fStatus);
    UVector   leaves(*fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // Gather every end-marker node, then every leaf node, of the tree.
    fTree->findNodes(&endMarks, RBBINode::endMark,  *fStatus);
    fTree->findNodes(&leaves,   RBBINode::leafChar, *fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // The nodes that can start a match are exactly FirstPosition(root).
    UVector *firstPositions = fTree->fFirstPosSet;

    // Walk over all of the leaf nodes.
    int32_t leafIx;
    for (leafIx = 0; leafIx < leaves.size(); leafIx++) {
        RBBINode *leaf = (RBBINode *)leaves.elementAt(leafIx);

        // A leaf is an overall rule-match position when its followPos set
        // holds at least one end-marker node.  Stop at the first hit.
        RBBINode *endNode = NULL;
        int32_t   markIx;
        for (markIx = 0; endNode == NULL && markIx < endMarks.size(); markIx++) {
            if (leaf->fFollowPos->contains(endMarks.elementAt(markIx))) {
                endNode = leaf;
            }
        }

        if (endNode != NULL) {
            // This leaf can end a match.  Look through the nodes that can
            // start a match for leaf-char nodes with the same value.
            int32_t firstIx;
            for (firstIx = 0; firstIx < firstPositions->size(); firstIx++) {
                RBBINode *startNode = (RBBINode *)firstPositions->elementAt(firstIx);
                if (startNode->fType == RBBINode::leafChar &&
                    endNode->fVal == startNode->fVal) {
                    // The value that ends one possible match also begins
                    // another.  Merging the start node's followPos into the
                    // end node's followPos lets a match move from the match
                    // state at endNode on to the second char of a match
                    // that begins with startNode.
                    setAdd(endNode->fFollowPos, startNode->fFollowPos);
                }
            }
        }
    }
}
//-----------------------------------------------------------------------------
//
// buildStateTable() Determine the set of runtime DFA states and the

View file

@ -49,6 +49,7 @@ private:
void calcFirstPos(RBBINode *n);
void calcLastPos(RBBINode *n);
void calcFollowPos(RBBINode *n);
void calcChainedFollowPos(RBBINode *n);
void buildStateTable();
void flagAcceptingStates();
void flagLookAheadStates();

View file

@ -625,6 +625,7 @@ typedef enum UErrorCode {
U_BRK_UNDEFINED_VARIABLE, /**< Use of an undefined $Variable in an RBBI rule. */
U_BRK_INIT_ERROR, /**< Initialization failure. Probable missing ICU Data. */
U_BRK_RULE_EMPTY_SET, /**< Rule contains an empty Unicode Set. */
U_BRK_UNRECOGNIZED_OPTION, /**< !!option in RBBI rules not recognized. */
U_BRK_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for Break Iterator failures */
/*

View file

@ -17,6 +17,8 @@
# Character Classes defined by TR 14.
#
!!chain
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BA = [:LineBreak = Break_After:];
@ -82,75 +84,78 @@ $SPcm = $SP $CM*;
$SYcm = $SY $CM*;
# New Lines. Always break after, never break before.
# Rule LB 3
#
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
# Because we never break before these things, $Endings
# appears at the end of line break rule.
#
$NLF = $BK | $CR | $LF | $NL | $CR $LF;
$EndingsSoft = ($ZW* $SP)* $ZW*;
$EndingsHard = ($ZW* $SP)* $ZW* $NLF;
#
# Rule LB 3
$LB3NonBreaks = [^$BK $CR $LF $NL];
$LB3NonBreaks ($BK | $CR | $LF | $NL){100};
$CR $LF {100};
# LB 4 x SP
# x ZW
$LB3NonBreaks [$SP $ZW];
# LB 5 Break after zero width space
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
# LB 7 Combining marks. TODO: get it right!
# $SP $CM needs to behave like $ID.
# X $CM needs to behave like X, where X is not $SP.
# $CM not covered by the above needs to behave like $AL
[$LB5NonBreaks] $CM*; # Stick together any combining sequences that don't match other rules.
# LB 8
$LB5NonBreaks [$CL $EX $IS $SY];
# LB 9
$OPcm $SP* $LB3NonBreaks?; # Need to force trailing $BKs to rule 3, to get status right.
$OPcm $SP* [$LB5NonBreaks] $CM*;
# LB 10
$QUcm $SP* $OPcm;
# LB 11
$CLcm $SP* $NScm;
# LB 11a
($B2cm)+;
# LB 11b
$LB5NonBreaks $GLcm $LB3NonBreaks?;
$LB5NonBreaks $GLcm [$LB5NonBreaks] $CM*;
$GLcm $LB3NonBreaks?;
$GLcm [$LB5NonBreaks] $CM*;
# LB 12
$LB12NonBreaks = [[$LB5NonBreaks] - [$SP]];
# LB 14
$LB12NonBreaks $QUcm+ $LB3NonBreaks?;
$LB12NonBreaks $QUcm+ [$LB5NonBreaks] $CM*;
$QUcm $LB3NonBreaks?;
$QUcm [$LB5NonBreaks] $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# LB 14a
$LB14NonBreaks = [[$LB12NonBreaks] - [$CB]];
#
# Openings Sequences that can precede Words, and that should not be separated from them.
# Rules LB 9, 10
#
$Openings = ((($QUcm ($ZW* $SP)*)? $OPcm ($ZW* $SP)*) | $GLcm)+;
# LB 15
$LB14NonBreaks ($BAcm | $HYcm | $NScm);
$BBcm [^$CB];
#
# Closings Sequences that follow words, and that should not be separated from them,
# Rule LB 8, 11, 15
$Closings = (($ZW* $SP)*( ($CLcm (($ZW* $SP)* $NScm)? | $EX | $IS | $SY) $CM*) | $BAcm | $HYcm | $NScm)*;
# LB 16
($ALcm | $IDcm | $INcm | $NUcm) $INcm*;
$WordClosings = ($SP* $CLcm | $SP* $EXcm | $SP* $IScm | $SP* $SYcm | $BAcm | $HYcm | $NScm)*;
#
# Words. Includes mixed Alpha-numerics.
# Rules 11a, 16, 17, 19, more or less.
#
$Number = $PRcm? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbers
# Regex form, rather than rule 18
# Alpha-numeric. 16, 17
$Word = ($ALcm | $NUcm)+ $INcm* |
$IDcm ($POcm? | $INcm*) |
$CM+ ($POcm? | $INcm*) | # CM with no base is like ID (LB 7a)
$INcm+ |
$CB; # Deviation from Unicode spec for $CB
# We treat as a single char word
$Dashes = (($B2cm ($ZW* $SP)*)*); # Dashes 11a
$HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?; # For Rle LB15, Don't break before Hypen-minus,
# we also need to match a whole number, if that
# is what follows the '-'
$Word15 = $Openings? (
($BBcm* $Openings? ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) | # Rule 15. Stuff sticks around words.
$BBcm* [^[:Cc:] $BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] $CM* | # Allow characters that don't meet the
$BBcm* [^$BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] # more elaborate definitions for WORD
) $WordClosings?; # to be glued.
$GluedWord = $Openings? $Word15 ((($ZW* $SP)* $GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
#$GluedWord = $Openings? $Word15 ((($ZW* $SP)* $Openings) $Word15)*; # "Glue" will stick anything below it together.
# Rules 13, 14
#
# The actual rule, a combination of everything defined above.
#
$Openings? $GluedWord $Closings $EndingsSoft{0};
$Openings? $GluedWord $Closings $EndingsHard{100};
# $GluedWord;
# $LB 17
$IDcm $POcm;
$ALcm+ $NUcm; # includes $LB19
$NUcm $ALcm+;
# LB 18
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
# LB 19
$ALcm+;
#
@ -161,8 +166,4 @@ $Openings? $GluedWord $Closings $EndingsHard{100};
# containing a space that may inhibit a break from occurring.
#
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($CM* $SP)) | (($CM* $SP)+ $OP);
$ClumpingChars = [^$SP $BK $CR $LF];
#!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
!.*;
!.*;

View file

@ -2526,13 +2526,10 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
// Depends on the previous char, and whether it eats following CombiningMarks
// or not.
UChar32 c = fText->char32At(prevPos);
if (c == 0x0d || c == 0x0a || c == 0x85 || fBK->contains(c) || fSP->contains(c)) {
// char doesn't automatically combine with CM.
nextPos = fText->moveIndex32(prevPos, 1);
} else {
nextPos = fCharBI->following(prevPos);
nextPos = fText->moveIndex32(prevPos, 1);
if (!(c == 0x0d || c == 0x0a || c == 0x85 || c == 0x200b /* ZW */ || fBK->contains(c))) {
for (;;) {
UChar32 c = fText->char32At(nextPos);
c = fText->char32At(nextPos);
if (!fCM->contains(c)) {
break;
}
@ -2714,12 +2711,9 @@ fall_through_11:
}
// LB 14a Break around a CB
// NOTE: DISABLE FOR ICU, FOR NOW. Too hard to implement in Rules.
#if 0
if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
break;
}
#endif
// LB 15
if (fBA->contains(thisChar) ||

View file

@ -22,6 +22,10 @@
#
# Temp debugging tests
<line>
<data>•\U00011efa\u275d\u0085•\u0c56•</data>
<data>•a\u275d\u0085•\u0c56•</data>
########################################################################################
#