ICU-12081 Initial implementation Emoji break rules and a new RBBI monkey test.

X-SVN-Rev: 38387
This commit is contained in:
Andy Heninger 2016-02-26 21:58:26 +00:00
parent 2cf8965496
commit 9d9256f3b7
47 changed files with 4675 additions and 1291 deletions

View file

@ -983,6 +983,54 @@ enum RBBIRunMode {
};
// Map from look-ahead break states (corresponds to rules) to boundary positions.
// Allows multiple lookahead break rules to be in flight at the same time.
//
// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
// in the state table be sequential, then we can just index an array. And the
// table could also tell us in advance how big that array needs to be.
//
// Before ICU 57 there was just a single simple variable for a look-ahead match that
// was in progress. Two rules at once did not work.
static const int32_t kMaxLookaheads = 8;
struct LookAheadResults {
int32_t fUsedSlotLimit;
int32_t fPositions[8];
int16_t fKeys[8];
LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};
int32_t getPosition(int16_t key) {
for (int32_t i=0; i<fUsedSlotLimit; ++i) {
if (fKeys[i] == key) {
return fPositions[i];
}
}
U_ASSERT(FALSE);
return -1;
}
void setPosition(int16_t key, int32_t position) {
int32_t i;
for (i=0; i<fUsedSlotLimit; ++i) {
if (fKeys[i] == key) {
fPositions[i] = position;
return;
}
}
if (i >= kMaxLookaheads) {
U_ASSERT(FALSE);
i = kMaxLookaheads - 1;
}
fKeys[i] = key;
fPositions[i] = position;
U_ASSERT(fUsedSlotLimit == i);
fUsedSlotLimit = i + 1;
}
};
//-----------------------------------------------------------------------------------
//
// handleNext(stateTable)
@ -1000,14 +1048,11 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
RBBIStateTableRow *row;
UChar32 c;
int32_t lookaheadStatus = 0;
int32_t lookaheadTagIdx = 0;
int32_t result = 0;
int32_t initialPosition = 0;
int32_t lookaheadResult = 0;
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
const char *tableData = statetable->fTableData;
uint32_t tableRowLen = statetable->fRowLen;
LookAheadResults lookAheadMatches;
int32_t result = 0;
int32_t initialPosition = 0;
const char *tableData = statetable->fTableData;
uint32_t tableRowLen = statetable->fRowLen;
#ifdef RBBI_DEBUG
if (fTrace) {
@ -1050,14 +1095,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
// We have already run the loop one last time with the
// character set to the pseudo {eof} value. Now it is time
// to unconditionally bail out.
if (lookaheadResult > result) {
// We ran off the end of the string with a pending look-ahead match.
// Treat this as if the look-ahead condition had been met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
}
break;
}
// Run the loop one last time with the fake end-of-input character category.
@ -1123,38 +1160,23 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
}
if (row->fLookAhead != 0) {
if (lookaheadStatus != 0
&& row->fAccepting == lookaheadStatus) {
// Lookahead match is completed.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
// TODO: make a standalone hard break in a rule work.
if (lookAheadHardBreak) {
UTEXT_SETNATIVEINDEX(fText, result);
return result;
}
// Look-ahead completed, but other rules may match further. Continue on
// TODO: junk this feature? I don't think it's used anywhwere.
goto continueOn;
int16_t completedRule = row->fAccepting;
if (completedRule > 0) {
// Lookahead match is completed.
int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
if (lookaheadResult >= 0) {
fLastRuleStatusIndex = row->fTagIdx;
UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
return lookaheadResult;
}
int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTagIdx = row->fTagIdx;
goto continueOn;
}
int16_t rule = row->fLookAhead;
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
lookAheadMatches.setPosition(rule, pos);
}
if (row->fAccepting != 0) {
// Because this is an accepting state, any in-progress look-ahead match
// is no longer relavant. Clear out the pending lookahead status.
lookaheadStatus = 0; // clear out any pending look-ahead match.
}
continueOn:
if (state == STOP_STATE) {
// This is the normal exit from the lookup state machine.
// We have advanced through the string until it is certain that no
@ -1216,11 +1238,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
RBBIRunMode mode;
RBBIStateTableRow *row;
UChar32 c;
int32_t lookaheadStatus = 0;
LookAheadResults lookAheadMatches;
int32_t result = 0;
int32_t initialPosition = 0;
int32_t lookaheadResult = 0;
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
#ifdef RBBI_DEBUG
if (fTrace) {
@ -1266,13 +1286,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
// We have already run the loop one last time with the
// character set to the pseudo {eof} value. Now it is time
// to unconditionally bail out.
if (lookaheadResult < result) {
// We ran off the end of the string with a pending look-ahead match.
// Treat this as if the look-ahead condition had been met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
lookaheadStatus = 0;
} else if (result == initialPosition) {
if (result == initialPosition) {
// Ran off start, no match found.
// move one index one (towards the start, since we are doing a previous())
UTEXT_SETNATIVEINDEX(fText, initialPosition);
@ -1338,36 +1352,22 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
}
if (row->fLookAhead != 0) {
if (lookaheadStatus != 0
&& row->fAccepting == lookaheadStatus) {
// Lookahead match is completed.
result = lookaheadResult;
lookaheadStatus = 0;
// TODO: make a standalone hard break in a rule work.
if (lookAheadHardBreak) {
UTEXT_SETNATIVEINDEX(fText, result);
return result;
}
// Look-ahead completed, but other rules may match further. Continue on
// TODO: junk this feature? I don't think it's used anywhwere.
goto continueOn;
int16_t completedRule = row->fAccepting;
if (completedRule > 0) {
// Lookahead match is completed.
int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
if (lookaheadResult >= 0) {
UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
return lookaheadResult;
}
int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
goto continueOn;
}
int16_t rule = row->fLookAhead;
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
lookAheadMatches.setPosition(rule, pos);
}
if (row->fAccepting != 0) {
// Because this is an accepting state, any in-progress look-ahead match
// is no longer relavant. Clear out the pending lookahead status.
lookaheadStatus = 0;
}
continueOn:
if (state == STOP_STATE) {
// This is the normal exit from the lookup state machine.
// We have advanced through the string until it is certain that no

View file

@ -1,6 +1,6 @@
/*
***************************************************************************
* Copyright (C) 2002-2008 International Business Machines Corporation *
* Copyright (C) 2002-2016 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
@ -56,6 +56,8 @@ RBBINode::RBBINode(NodeType t) : UMemory() {
fLastPos = 0;
fNullable = FALSE;
fLookAheadEnd = FALSE;
fRuleRoot = FALSE;
fChainIn = FALSE;
fVal = 0;
fPrecedence = precZero;
@ -86,6 +88,8 @@ RBBINode::RBBINode(const RBBINode &other) : UMemory(other) {
fLastPos = other.fLastPos;
fNullable = other.fNullable;
fVal = other.fVal;
fRuleRoot = FALSE;
fChainIn = other.fChainIn;
UErrorCode status = U_ZERO_ERROR;
fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
fLastPosSet = new UVector(status);
@ -161,6 +165,8 @@ RBBINode *RBBINode::cloneTree() {
}
}
}
n->fRuleRoot = this->fRuleRoot;
n->fChainIn = this->fChainIn;
return n;
}
@ -272,6 +278,12 @@ void RBBINode::findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &s
//
//-------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Debug-print helper: the serial number of a node, or -1 when there is no node.
static int32_t serial(const RBBINode *node) {
    if (node == NULL) {
        return -1;
    }
    return node->fSerialNum;
}
void RBBINode::printNode() {
static const char * const nodeTypeNames[] = {
"setRef",
@ -295,9 +307,10 @@ void RBBINode::printNode() {
if (this==NULL) {
RBBIDebugPrintf("%10p", (void *)this);
} else {
RBBIDebugPrintf("%10p %12s %10p %10p %10p %4d %6d %d ",
(void *)this, nodeTypeNames[fType], (void *)fParent, (void *)fLeftChild, (void *)fRightChild,
fSerialNum, fFirstPos, fVal);
RBBIDebugPrintf("%10p %5d %12s %c%c %5d %5d %5d %6d %d ",
(void *)this, fSerialNum, nodeTypeNames[fType], fRuleRoot?'R':' ', fChainIn?'C':' ',
serial(fLeftChild), serial(fRightChild), serial(fParent),
fFirstPos, fVal);
if (fType == varRef) {
RBBI_DEBUG_printUnicodeString(fText);
}
@ -328,11 +341,13 @@ U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth)
//
//-------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Print the column-heading line for node dumps. The column names follow the
// order in which printNode() emits its fields.
void RBBINode::printNodeHeader() {
RBBIDebugPrintf(" Address serial type LeftChild RightChild Parent position value\n");
}
void RBBINode::printTree(UBool printHeading) {
if (printHeading) {
RBBIDebugPrintf( "-------------------------------------------------------------------\n"
" Address type Parent LeftChild RightChild serial position value\n"
);
printNodeHeader();
}
this->printNode();
if (this != NULL) {

View file

@ -80,6 +80,10 @@ class RBBINode : public UMemory {
UBool fLookAheadEnd; // For endMark nodes, set TRUE if
// marking the end of a look-ahead rule.
UBool fRuleRoot; // True if this node is the root of a rule.
UBool fChainIn; // True if chaining into this rule is allowed
// (no '^' present).
UVector *fFirstPosSet;
UVector *fLastPosSet; // TODO: rename fFirstPos & fLastPos to avoid confusion.
UVector *fFollowPos;
@ -95,6 +99,7 @@ class RBBINode : public UMemory {
void findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status);
#ifdef RBBI_DEBUG
static void printNodeHeader();
void printNode();
void printTree(UBool withHeading);
#endif
@ -104,6 +109,7 @@ class RBBINode : public UMemory {
UBool operator == (const RBBINode &other); // Private, so these functions won't accidently be used.
#ifdef RBBI_DEBUG
public:
int fSerialNum; // Debugging aids.
#endif
};

View file

@ -40,6 +40,7 @@ enum RBBI_RuleParseAction {
doExprStart,
doLParen,
doNOP,
doNoChain,
doOptionEnd,
doOptionStart,
doReverseDir,
@ -77,101 +78,109 @@ struct RBBIRuleTableEl {
static const struct RBBIRuleTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doExprStart, 254, 21, 8, FALSE} // 1 start
, {doExprStart, 254, 29, 9, FALSE} // 1 start
, {doNOP, 132, 1,0, TRUE} // 2
, {doExprStart, 36 /* $ */, 80, 90, FALSE} // 3
, {doNOP, 33 /* ! */, 11,0, TRUE} // 4
, {doNOP, 59 /* ; */, 1,0, TRUE} // 5
, {doNOP, 252, 0,0, FALSE} // 6
, {doExprStart, 255, 21, 8, FALSE} // 7
, {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end
, {doNOP, 132, 8,0, TRUE} // 9
, {doRuleError, 255, 95,0, FALSE} // 10
, {doNOP, 33 /* ! */, 13,0, TRUE} // 11 rev-option
, {doReverseDir, 255, 20, 8, FALSE} // 12
, {doOptionStart, 130, 15,0, TRUE} // 13 option-scan1
, {doRuleError, 255, 95,0, FALSE} // 14
, {doNOP, 129, 15,0, TRUE} // 15 option-scan2
, {doOptionEnd, 255, 17,0, FALSE} // 16
, {doNOP, 59 /* ; */, 1,0, TRUE} // 17 option-scan3
, {doNOP, 132, 17,0, TRUE} // 18
, {doRuleError, 255, 95,0, FALSE} // 19
, {doExprStart, 255, 21, 8, FALSE} // 20 reverse-rule
, {doRuleChar, 254, 30,0, TRUE} // 21 term
, {doNOP, 132, 21,0, TRUE} // 22
, {doRuleChar, 131, 30,0, TRUE} // 23
, {doNOP, 91 /* [ */, 86, 30, FALSE} // 24
, {doLParen, 40 /* ( */, 21, 30, TRUE} // 25
, {doNOP, 36 /* $ */, 80, 29, FALSE} // 26
, {doDotAny, 46 /* . */, 30,0, TRUE} // 27
, {doRuleError, 255, 95,0, FALSE} // 28
, {doCheckVarDef, 255, 30,0, FALSE} // 29 term-var-ref
, {doNOP, 132, 30,0, TRUE} // 30 expr-mod
, {doUnaryOpStar, 42 /* * */, 35,0, TRUE} // 31
, {doUnaryOpPlus, 43 /* + */, 35,0, TRUE} // 32
, {doUnaryOpQuestion, 63 /* ? */, 35,0, TRUE} // 33
, {doNOP, 255, 35,0, FALSE} // 34
, {doExprCatOperator, 254, 21,0, FALSE} // 35 expr-cont
, {doNOP, 132, 35,0, TRUE} // 36
, {doExprCatOperator, 131, 21,0, FALSE} // 37
, {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 38
, {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 39
, {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 40
, {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 41
, {doExprCatOperator, 47 /* / */, 47,0, FALSE} // 42
, {doExprCatOperator, 123 /* { */, 59,0, TRUE} // 43
, {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 44
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 45
, {doExprFinished, 255, 255,0, FALSE} // 46
, {doSlash, 47 /* / */, 49,0, TRUE} // 47 look-ahead
, {doNOP, 255, 95,0, FALSE} // 48
, {doExprCatOperator, 254, 21,0, FALSE} // 49 expr-cont-no-slash
, {doNOP, 132, 35,0, TRUE} // 50
, {doExprCatOperator, 131, 21,0, FALSE} // 51
, {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 52
, {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 53
, {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 54
, {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 55
, {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 56
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
, {doExprFinished, 255, 255,0, FALSE} // 58
, {doNOP, 132, 59,0, TRUE} // 59 tag-open
, {doStartTagValue, 128, 62,0, FALSE} // 60
, {doTagExpectedError, 255, 95,0, FALSE} // 61
, {doNOP, 132, 66,0, TRUE} // 62 tag-value
, {doNOP, 125 /* } */, 66,0, FALSE} // 63
, {doTagDigit, 128, 62,0, TRUE} // 64
, {doTagExpectedError, 255, 95,0, FALSE} // 65
, {doNOP, 132, 66,0, TRUE} // 66 tag-close
, {doTagValue, 125 /* } */, 69,0, TRUE} // 67
, {doTagExpectedError, 255, 95,0, FALSE} // 68
, {doExprCatOperator, 254, 21,0, FALSE} // 69 expr-cont-no-tag
, {doNOP, 132, 69,0, TRUE} // 70
, {doExprCatOperator, 131, 21,0, FALSE} // 71
, {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 72
, {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 73
, {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 74
, {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 75
, {doExprCatOperator, 47 /* / */, 47,0, FALSE} // 76
, {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 77
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 78
, {doExprFinished, 255, 255,0, FALSE} // 79
, {doStartVariableName, 36 /* $ */, 82,0, TRUE} // 80 scan-var-name
, {doNOP, 255, 95,0, FALSE} // 81
, {doNOP, 130, 84,0, TRUE} // 82 scan-var-start
, {doVariableNameExpectedErr, 255, 95,0, FALSE} // 83
, {doNOP, 129, 84,0, TRUE} // 84 scan-var-body
, {doEndVariableName, 255, 255,0, FALSE} // 85
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 86 scan-unicode-set
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 87
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 88
, {doNOP, 255, 95,0, FALSE} // 89
, {doNOP, 132, 90,0, TRUE} // 90 assign-or-rule
, {doStartAssign, 61 /* = */, 21, 93, TRUE} // 91
, {doNOP, 255, 29, 8, FALSE} // 92
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 93 assign-end
, {doRuleErrorAssignExpr, 255, 95,0, FALSE} // 94
, {doExit, 255, 95,0, TRUE} // 95 errorDeath
, {doNoChain, 94 /* ^ */, 12, 9, TRUE} // 3
, {doExprStart, 36 /* $ */, 88, 98, FALSE} // 4
, {doNOP, 33 /* ! */, 19,0, TRUE} // 5
, {doNOP, 59 /* ; */, 1,0, TRUE} // 6
, {doNOP, 252, 0,0, FALSE} // 7
, {doExprStart, 255, 29, 9, FALSE} // 8
, {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 9 break-rule-end
, {doNOP, 132, 9,0, TRUE} // 10
, {doRuleError, 255, 103,0, FALSE} // 11
, {doExprStart, 254, 29,0, FALSE} // 12 start-after-caret
, {doNOP, 132, 12,0, TRUE} // 13
, {doRuleError, 94 /* ^ */, 103,0, FALSE} // 14
, {doExprStart, 36 /* $ */, 88, 37, FALSE} // 15
, {doRuleError, 59 /* ; */, 103,0, FALSE} // 16
, {doRuleError, 252, 103,0, FALSE} // 17
, {doExprStart, 255, 29,0, FALSE} // 18
, {doNOP, 33 /* ! */, 21,0, TRUE} // 19 rev-option
, {doReverseDir, 255, 28, 9, FALSE} // 20
, {doOptionStart, 130, 23,0, TRUE} // 21 option-scan1
, {doRuleError, 255, 103,0, FALSE} // 22
, {doNOP, 129, 23,0, TRUE} // 23 option-scan2
, {doOptionEnd, 255, 25,0, FALSE} // 24
, {doNOP, 59 /* ; */, 1,0, TRUE} // 25 option-scan3
, {doNOP, 132, 25,0, TRUE} // 26
, {doRuleError, 255, 103,0, FALSE} // 27
, {doExprStart, 255, 29, 9, FALSE} // 28 reverse-rule
, {doRuleChar, 254, 38,0, TRUE} // 29 term
, {doNOP, 132, 29,0, TRUE} // 30
, {doRuleChar, 131, 38,0, TRUE} // 31
, {doNOP, 91 /* [ */, 94, 38, FALSE} // 32
, {doLParen, 40 /* ( */, 29, 38, TRUE} // 33
, {doNOP, 36 /* $ */, 88, 37, FALSE} // 34
, {doDotAny, 46 /* . */, 38,0, TRUE} // 35
, {doRuleError, 255, 103,0, FALSE} // 36
, {doCheckVarDef, 255, 38,0, FALSE} // 37 term-var-ref
, {doNOP, 132, 38,0, TRUE} // 38 expr-mod
, {doUnaryOpStar, 42 /* * */, 43,0, TRUE} // 39
, {doUnaryOpPlus, 43 /* + */, 43,0, TRUE} // 40
, {doUnaryOpQuestion, 63 /* ? */, 43,0, TRUE} // 41
, {doNOP, 255, 43,0, FALSE} // 42
, {doExprCatOperator, 254, 29,0, FALSE} // 43 expr-cont
, {doNOP, 132, 43,0, TRUE} // 44
, {doExprCatOperator, 131, 29,0, FALSE} // 45
, {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 46
, {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 47
, {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 48
, {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 49
, {doExprCatOperator, 47 /* / */, 55,0, FALSE} // 50
, {doExprCatOperator, 123 /* { */, 67,0, TRUE} // 51
, {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 52
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 53
, {doExprFinished, 255, 255,0, FALSE} // 54
, {doSlash, 47 /* / */, 57,0, TRUE} // 55 look-ahead
, {doNOP, 255, 103,0, FALSE} // 56
, {doExprCatOperator, 254, 29,0, FALSE} // 57 expr-cont-no-slash
, {doNOP, 132, 43,0, TRUE} // 58
, {doExprCatOperator, 131, 29,0, FALSE} // 59
, {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 60
, {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 61
, {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 62
, {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 63
, {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 64
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 65
, {doExprFinished, 255, 255,0, FALSE} // 66
, {doNOP, 132, 67,0, TRUE} // 67 tag-open
, {doStartTagValue, 128, 70,0, FALSE} // 68
, {doTagExpectedError, 255, 103,0, FALSE} // 69
, {doNOP, 132, 74,0, TRUE} // 70 tag-value
, {doNOP, 125 /* } */, 74,0, FALSE} // 71
, {doTagDigit, 128, 70,0, TRUE} // 72
, {doTagExpectedError, 255, 103,0, FALSE} // 73
, {doNOP, 132, 74,0, TRUE} // 74 tag-close
, {doTagValue, 125 /* } */, 77,0, TRUE} // 75
, {doTagExpectedError, 255, 103,0, FALSE} // 76
, {doExprCatOperator, 254, 29,0, FALSE} // 77 expr-cont-no-tag
, {doNOP, 132, 77,0, TRUE} // 78
, {doExprCatOperator, 131, 29,0, FALSE} // 79
, {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 80
, {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 81
, {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 82
, {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 83
, {doExprCatOperator, 47 /* / */, 55,0, FALSE} // 84
, {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 85
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 86
, {doExprFinished, 255, 255,0, FALSE} // 87
, {doStartVariableName, 36 /* $ */, 90,0, TRUE} // 88 scan-var-name
, {doNOP, 255, 103,0, FALSE} // 89
, {doNOP, 130, 92,0, TRUE} // 90 scan-var-start
, {doVariableNameExpectedErr, 255, 103,0, FALSE} // 91
, {doNOP, 129, 92,0, TRUE} // 92 scan-var-body
, {doEndVariableName, 255, 255,0, FALSE} // 93
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 94 scan-unicode-set
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 95
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 96
, {doNOP, 255, 103,0, FALSE} // 97
, {doNOP, 132, 98,0, TRUE} // 98 assign-or-rule
, {doStartAssign, 61 /* = */, 29, 101, TRUE} // 99
, {doNOP, 255, 37, 9, FALSE} // 100
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 101 assign-end
, {doRuleErrorAssignExpr, 255, 103,0, FALSE} // 102
, {doExit, 255, 103,0, TRUE} // 103 errorDeath
};
#ifdef RBBI_DEBUG
static const char * const RBBIRuleStateNames[] = { 0,
@ -181,9 +190,17 @@ static const char * const RBBIRuleStateNames[] = { 0,
0,
0,
0,
0,
0,
"break-rule-end",
0,
0,
"start-after-caret",
0,
0,
0,
0,
0,
0,
"rev-option",
0,

View file

@ -1,7 +1,7 @@
#*****************************************************************************
#
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
# All Rights Reserved.
#
#*****************************************************************************
@ -19,6 +19,7 @@
# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
# that are then built with the rule parser.
#
# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
#
# Here is the syntax of the state definitions in this file:
@ -57,6 +58,7 @@
start:
escaped term ^break-rule-end doExprStart
white_space n start
'^' n start-after-caret ^break-rule-end doNoChain
'$' scan-var-name ^assign-or-rule doExprStart
'!' n rev-option
';' n start # ignore empty rules.
@ -71,7 +73,21 @@ break-rule-end:
white_space n break-rule-end
default errorDeath doRuleError
#
# start of a rule, after having seen a '^' (inhibits rule chain in).
# Similar to the main 'start' state in most respects, except
# - empty rule is an error.
# - A second '^' is an error.
#
start-after-caret:
escaped term doExprStart
white_space n start-after-caret
'^' errorDeath doRuleError # two '^'s
'$' scan-var-name ^term-var-ref doExprStart
';' errorDeath doRuleError # ^ ;
eof errorDeath doRuleError
default term doExprStart
#
# ! We've just scanned a '!', indicating either a !!key word flag or a
# !Reverse rule.

View file

@ -1,7 +1,7 @@
//
// file: rbbiscan.cpp
//
// Copyright (C) 2002-2015, International Business Machines Corporation and others.
// Copyright (C) 2002-2016, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the Rule Based Break Iterator Rule Builder functions for
@ -87,24 +87,27 @@ U_NAMESPACE_BEGIN
RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
{
fRB = rb;
fScanIndex = 0;
fNextIndex = 0;
fQuoteMode = FALSE;
fLineNum = 1;
fCharNum = 0;
fLastChar = 0;
fStateTable = NULL;
fStack[0] = 0;
fStackPtr = 0;
fStack[fStackPtr] = 0;
fNodeStackPtr = 0;
fRuleNum = 0;
fNodeStack[0] = NULL;
fSymbolTable = NULL;
fSetTable = NULL;
fScanIndex = 0;
fNextIndex = 0;
fNodeStackPtr = 0;
fReverseRule = FALSE;
fLookAheadRule = FALSE;
fNoChainInRule = FALSE;
fLineNum = 1;
fCharNum = 0;
fQuoteMode = FALSE;
fSymbolTable = NULL;
fSetTable = NULL;
fRuleNum = 0;
fOptionStart = 0;
// Do not check status until after all critical fields are sufficiently initialized
// that the destructor can run cleanly.
@ -205,6 +208,12 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
break;
case doNoChain:
// Scanned a '^' while on the rule start state.
fNoChainInRule = TRUE;
break;
case doExprOrOperator:
{
fixOpStack(RBBINode::precOpCat);
@ -318,11 +327,11 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
#endif
U_ASSERT(fNodeStackPtr == 1);
RBBINode *thisRule = fNodeStack[fNodeStackPtr];
// If this rule includes a look-ahead '/', add a endMark node to the
// expression tree.
if (fLookAheadRule) {
RBBINode *thisRule = fNodeStack[fNodeStackPtr];
RBBINode *endNode = pushNewNode(RBBINode::endMark);
RBBINode *catNode = pushNewNode(RBBINode::opCat);
if (U_FAILURE(*fRB->fStatus)) {
@ -334,8 +343,24 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
fNodeStack[fNodeStackPtr] = catNode;
endNode->fVal = fRuleNum;
endNode->fLookAheadEnd = TRUE;
thisRule = catNode;
// TODO: Disable chaining out of look-ahead (hard break) rules.
// The break on rule match is forced, so there is no point in building up
// the state table to chain into another rule for a longer match.
}
// Mark this node as being the root of a rule.
thisRule->fRuleRoot = TRUE;
// Flag if chaining into this rule is wanted.
//
if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain
!fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
thisRule->fChainIn = TRUE;
}
// All rule expressions are ORed together.
// The ';' that terminates an expression really just functions as a '|' with
// a low operator precedence.
@ -372,6 +397,7 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
}
fReverseRule = FALSE; // in preparation for the next rule.
fLookAheadRule = FALSE;
fNoChainInRule = FALSE;
fNodeStackPtr = 0;
}
break;
@ -994,7 +1020,7 @@ void RBBIRuleScanner::parse() {
for (;;) {
#ifdef RBBI_DEBUG
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
#endif
if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {
// Table row specified an individual character, not a set, and

View file

@ -52,6 +52,7 @@ public:
struct RBBIRuleChar {
UChar32 fChar;
UBool fEscaped;
RBBIRuleChar() : fChar(0), fEscaped(FALSE) {};
};
RBBIRuleScanner(RBBIRuleBuilder *rb);
@ -127,6 +128,8 @@ private:
UBool fLookAheadRule; // True if the rule includes a '/'
// somewhere within it.
UBool fNoChainInRule; // True if the current rule starts with a '^'.
RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
// $variable symbols.

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (c) 2002-2009, International Business Machines
* Copyright (c) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -78,7 +78,7 @@ void RBBITableBuilder::build() {
fTree = fTree->flattenVariables();
#ifdef RBBI_DEBUG
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
RBBIDebugPuts("Parse tree after flattening variable references.");
RBBIDebugPuts("\nParse tree after flattening variable references.");
fTree->printTree(TRUE);
}
#endif
@ -136,7 +136,7 @@ void RBBITableBuilder::build() {
fTree->flattenSets();
#ifdef RBBI_DEBUG
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
fTree->printTree(TRUE);
}
#endif
@ -375,6 +375,25 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) {
}
//-----------------------------------------------------------------------------
//
// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
// as roots of a rule to a destination vector.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
    // Nothing to collect from an empty subtree, or once an error is pending.
    if (U_FAILURE(*fStatus) || node == NULL) {
        return;
    }

    if (!node->fRuleRoot) {
        // Not the root of a rule; keep looking, left subtree first.
        addRuleRootNodes(dest, node->fLeftChild);
        addRuleRootNodes(dest, node->fRightChild);
        return;
    }

    // Found the root of a rule. Rules cannot nest, so none of this node's
    // children can be a rule root; record it without descending further.
    dest->addElement(node, *fStatus);
}
//-----------------------------------------------------------------------------
//
@ -401,19 +420,24 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
return;
}
// Get all nodes that can be the start a match, which is FirstPosition()
// of the portion of the tree corresponding to user-written rules.
// See the tree description in bofFixup().
RBBINode *userRuleRoot = tree;
if (fRB->fSetBuilder->sawBOF()) {
userRuleRoot = tree->fLeftChild->fRightChild;
// Collect all leaf nodes that can start matches for rules
// with inbound chaining enabled, which is the union of the
// firstPosition sets from each of the rule root nodes.
UVector ruleRootNodes(*fStatus);
addRuleRootNodes(&ruleRootNodes, tree);
UVector matchStartNodes(*fStatus);
for (int i=0; i<ruleRootNodes.size(); ++i) {
RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
if (node->fChainIn) {
setAdd(&matchStartNodes, node->fFirstPosSet);
}
}
if (U_FAILURE(*fStatus)) {
return;
}
U_ASSERT(userRuleRoot != NULL);
UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
// Iterate over all leaf nodes,
//
int32_t endNodeIx;
int32_t startNodeIx;
@ -455,8 +479,8 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
// Now iterate over the nodes that can start a match, looking for ones
// with the same char class as our ending node.
RBBINode *startNode;
for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
if (startNode->fType != RBBINode::leafChar) {
continue;
}
@ -1032,6 +1056,8 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
if (n==NULL) {
return;
}
printf("\n");
RBBINode::printNodeHeader();
n->printNode();
RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
@ -1141,8 +1167,8 @@ void RBBITableBuilder::exportTable(void *where) {
void RBBITableBuilder::printSet(UVector *s) {
int32_t i;
for (i=0; i<s->size(); i++) {
void *v = s->elementAt(i);
RBBIDebugPrintf("%10p", v);
const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
}
RBBIDebugPrintf("\n");
}

View file

@ -4,7 +4,7 @@
/*
**********************************************************************
* Copyright (c) 2002-2005, International Business Machines
* Copyright (c) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -58,6 +58,8 @@ private:
void flagTaggedStates();
void mergeRuleStatusVals();
void addRuleRootNodes(UVector *dest, RBBINode *node);
// Set functions for UVector.
// TODO: make a USet subclass of UVector

View file

@ -1,12 +1,12 @@
#
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: char.txt
#
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
# These rules are based on UAX #29 Revision 28 (Draft 3) for Unicode Version 9.0
#
#
@ -14,9 +14,9 @@
#
$CR = [\p{Grapheme_Cluster_Break = CR}];
$LF = [\p{Grapheme_Cluster_Break = LF}];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]];
# TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
$Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
@ -30,10 +30,18 @@ $T = [\p{Grapheme_Cluster_Break = T}];
$LV = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];
# Emoji definitions scraped from http://www.unicode.org/Public/emoji/2.0//emoji-data.txt
$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$E_Modifier = [\U0001F3FB-\U0001F3FF];
$ZWJ = [\u200D];
$GAZ = [\U0001F455-\U0001F469\U0001F48B\U0001F5E8\u2764];
## -------------------------------------------------
!!chain;
!!lookAheadHardBreak;
!!forward;
$CR $LF;
@ -42,13 +50,24 @@ $L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;
$Regional_Indicator $Regional_Indicator;
# GB 8. Keep pairs of regional indicators together
# Note that the hard break '/' rule triggers only if there are three or more initial RIs.
[^$Control $CR $LF] $Extend;
^$Regional_Indicator $Regional_Indicator / $Regional_Indicator;
^$Regional_Indicator $Regional_Indicator;
# GB 9
[^$Control $CR $LF] ($Extend | $ZWJ);
# GB 9a (only for extended grapheme clusters)
[^$Control $CR $LF] $SpacingMark;
# TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
# GB 9b Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
# GB9c Emoji proposal
($E_Base | $GAZ) $E_Modifier;
# GB 9d Don't break between ZWJ and Glue_After_Zwj
$ZWJ $GAZ;
## -------------------------------------------------
@ -58,23 +77,29 @@ $LF $CR;
($V | $T) ($LV | $V);
$T ($LVT | $T);
$Regional_Indicator $Regional_Indicator;
# GB 8. Going backwards, we must scan through any number of regional indicators as pairs.
#
$Regional_Indicator $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)* [{eof}[^$Regional_Indicator]];
$Extend [^$Control $CR $LF];
# GB 9
($Extend | $ZWJ) [^$Control $CR $LF]; #note that this will chain into Regional_Indicator when needed.
# GB 9a
$SpacingMark [^$Control $CR $LF];
# TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
# GB 9b Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
# GB 9c
$E_Modifier ($E_Base | $GAZ);
# GB 9d Don't break between ZWJ and Glue_After_Zwj
$GAZ $ZWJ;
## -------------------------------------------------
# We don't logically need safe char break rules, but if we don't provide any at all
# the engine for preceding() and following() will fall back to the
# old style inefficient algorithm.
!!safe_reverse;
$LF $CR;
$Regional_Indicator $Regional_Indicator;
## -------------------------------------------------
!!safe_forward;
$CR $LF;
$Regional_Indicator $Regional_Indicator;

View file

@ -1,13 +1,16 @@
# Copyright (c) 2002-2015 International Business Machines Corporation and
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line.txt
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
#
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
@ -20,8 +23,6 @@
#
!!chain;
!!LBCMNoChain;
!!lookAheadHardBreak;
#
@ -59,8 +60,13 @@
# See rule LB 19 for an example.
#
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$EM = [\U0001F3FB-\U0001F3FF];
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BA = [:LineBreak = Break_After:];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
@ -68,7 +74,7 @@ $B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
$CM = [:LineBreak = Combining_Mark:];
$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
@ -77,7 +83,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
$ID = [:LineBreak = Ideographic:];
$ID = [[:LineBreak = Ideographic:][\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
@ -99,6 +105,7 @@ $SY = [:LineBreak = Break_Symbols:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
@ -131,7 +138,6 @@ $HLcm = $HL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
@ -160,6 +166,8 @@ $BB $CM+;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
$EB $CM+;
$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
@ -208,7 +216,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
@ -216,13 +224,13 @@ $CR $LF {100};
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
$CM+ $LB4Breaks {100};
^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
$CM+ [$SP $ZW];
^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
@ -233,20 +241,23 @@ $CM+ [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x ID Emoji proposal.
#
$ZWJ ($ID | $EB | $EM);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
$CM+;
^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
$CM+ $WJcm;
^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
@ -257,13 +268,13 @@ $WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
$CM+ GLcm;
^$CM+ $GLcm;
@ -272,23 +283,23 @@ $CM+ GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
@ -318,12 +329,10 @@ $LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
$CM+ $QUcm;
^$CM+ $QUcm;
# QU x
$QUcm .?;
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# TODO: I don't think this rule is needed.
# LB 20
@ -335,14 +344,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
#
$HLcm ($HYcm | $BAcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
@ -351,25 +361,25 @@ $SYcm $HLcm;
# LB 22
($ALcm | $HLcm) $INcm;
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
$IDcm $INcm;
($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
$IDcm $POcm;
($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
$PRcm $IDcm;
$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
@ -393,18 +403,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
@ -413,34 +432,36 @@ $RIcm $RIcm;
!!reverse;
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $CP;
$CM+ $EX;
$CM+ $GL;
$CM+ $HL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $RI;
$CM+ $SY;
$CM+ $WJ;
$CM+;
^$CM+ $ALPlus;
^$CM+ $BA;
^$CM+ $BB;
^$CM+ $B2;
^$CM+ $CL;
^$CM+ $CP;
^$CM+ $EB;
^$CM+ $EM;
^$CM+ $EX;
^$CM+ $GL;
^$CM+ $HL;
^$CM+ $HY;
^$CM+ $H2;
^$CM+ $H3;
^$CM+ $ID;
^$CM+ $IN;
^$CM+ $IS;
^$CM+ $JL;
^$CM+ $JV;
^$CM+ $JT;
^$CM+ $NS;
^$CM+ $NU;
^$CM+ $OP;
^$CM+ $PO;
^$CM+ $PR;
^$CM+ $QU;
^$CM+ $RI;
^$CM+ $SY;
^$CM+ $WJ;
^$CM+;
#
@ -452,14 +473,14 @@ $AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
@ -471,7 +492,7 @@ $AL_FOLLOW $CM+ / (
# LB 4, 5, 5
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
@ -488,30 +509,37 @@ $LF $CR;
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
# The ZWJ will look like a CM to whatever precedes it.
#
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;
^$CM+ $CAN_CM;
# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ [$LB8NonBreaks-$CM];
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CM* $CAN_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
@ -532,28 +560,26 @@ $SY [$LB8NonBreaks-$CM];
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
$CM* $CAN_CM $SP* $CM* $OP;
$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
$CM* $OP $SP* $CM* $QU;
$OP $SP* $CM* $QU;
# LB 16
$CM* $NS $SP* $CM* ($CL | $CP);
$NS $SP* $CM* ($CL | $CP);
# LB 17
$CM* $B2 $SP* $CM* $B2;
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
@ -562,82 +588,100 @@ $CM* $B2 $SP* $CM* $B2;
#
# LB 19
#
$CM* $QU $CM* $CAN_CM; # . x QU
$CM* $QU $LB18NonBreaks;
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CM* $CAN_CM $CM* $QU; # QU x .
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 21
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB21b (reverse)
$CM* $HL $CM* $SY;
$HL $CM* $SY;
# LB 22
$CM* $IN $CM* ($ALPlus | $HL);
$CM* $IN $CM* $EX;
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$CM* $PO $CM* $ID;
$CM* $NU $CM* ($ALPlus | $HL);
$CM* ($ALPlus | $HL) $CM* $NU;
$PO $CM* ($ID | $EB | $EM);
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB 24
$CM* $ID $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PO;
($ID | $EB | $EM) $CM* $PR;
($ALPlus | $HL) $CM* $PR;
($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
$CM* ($ALPlus | $HL) $CM* $IS;
($ALPlus | $HL) $CM* $IS;
# LB 30
$CM* $OP $CM* ($ALPlus | $HL | $NU);
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
$CM* $RI $CM* $RI;
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -658,6 +702,9 @@ $CM* ($HY | $BA) $CM* $HL;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
# LB 30
($CM* $RI)+;
# For dictionary-based break
$dictionary $dictionary;
@ -674,6 +721,6 @@ $dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,14 +1,17 @@
# Copyright (c) 2002-2015 International Business Machines Corporation and
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_fi.txt
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
@ -22,8 +25,6 @@
#
!!chain;
!!LBCMNoChain;
!!lookAheadHardBreak;
#
@ -61,9 +62,14 @@
# See rule LB 19 for an example.
#
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$EM = [\U0001F3FB-\U0001F3FF];
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BA = [[:LineBreak = Break_After:] - [\u2010]];
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
@ -71,7 +77,7 @@ $B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
$CM = [:LineBreak = Combining_Mark:];
$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
@ -80,7 +86,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
$ID = [:LineBreak = Ideographic:];
$ID = [[:LineBreak = Ideographic:][\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
@ -102,6 +108,7 @@ $SY = [:LineBreak = Break_Symbols:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
@ -135,7 +142,6 @@ $HLcm = $HL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
@ -165,6 +171,8 @@ $BB $CM+;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
$EB $CM+;
$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
@ -213,7 +221,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
@ -221,13 +229,13 @@ $CR $LF {100};
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
$CM+ $LB4Breaks {100};
^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
$CM+ [$SP $ZW];
^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
@ -238,20 +246,23 @@ $CM+ [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x ID Emoji proposal.
#
$ZWJ ($ID | $EB | $EM);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
$CM+;
^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
$CM+ $WJcm;
^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
@ -262,13 +273,13 @@ $WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
$CM+ GLcm;
^$CM+ $GLcm;
@ -277,23 +288,23 @@ $CM+ GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
@ -323,12 +334,10 @@ $LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
$CM+ $QUcm;
^$CM+ $QUcm;
# QU x
$QUcm .?;
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# TODO: I don't think this rule is needed.
# LB 20
@ -344,6 +353,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
($HY | $HH) $AL;
^$CM+ ($BAcm | $HYcm | $HHcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
@ -359,25 +369,25 @@ $SYcm $HLcm;
# LB 22
($ALcm | $HLcm) $INcm;
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
$IDcm $INcm;
($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
$IDcm $POcm;
($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
$PRcm $IDcm;
$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
@ -401,18 +411,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
@ -421,35 +440,37 @@ $RIcm $RIcm;
!!reverse;
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $HH;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $CP;
$CM+ $EX;
$CM+ $GL;
$CM+ $HL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $RI;
$CM+ $SY;
$CM+ $WJ;
$CM+;
^$CM+ $ALPlus;
^$CM+ $BA;
^$CM+ $HH;
^$CM+ $BB;
^$CM+ $B2;
^$CM+ $CL;
^$CM+ $CP;
^$CM+ $EB;
^$CM+ $EM;
^$CM+ $EX;
^$CM+ $GL;
^$CM+ $HL;
^$CM+ $HY;
^$CM+ $H2;
^$CM+ $H3;
^$CM+ $ID;
^$CM+ $IN;
^$CM+ $IS;
^$CM+ $JL;
^$CM+ $JV;
^$CM+ $JT;
^$CM+ $NS;
^$CM+ $NU;
^$CM+ $OP;
^$CM+ $PO;
^$CM+ $PR;
^$CM+ $QU;
^$CM+ $RI;
^$CM+ $SY;
^$CM+ $WJ;
^$CM+;
#
@ -461,14 +482,14 @@ $AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
@ -480,7 +501,7 @@ $AL_FOLLOW $CM+ / (
# LB 4, 5, 5
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
@ -497,30 +518,37 @@ $LF $CR;
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
# The ZWJ will look like a CM to whatever precedes it.
#
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;
^$CM+ $CAN_CM;
# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ [$LB8NonBreaks-$CM];
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CM* $CAN_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
@ -541,28 +569,26 @@ $SY [$LB8NonBreaks-$CM];
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
$CM* $CAN_CM $SP* $CM* $OP;
$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
$CM* $OP $SP* $CM* $QU;
$OP $SP* $CM* $QU;
# LB 16
$CM* $NS $SP* $CM* ($CL | $CP);
$NS $SP* $CM* ($CL | $CP);
# LB 17
$CM* $B2 $SP* $CM* $B2;
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
@ -571,13 +597,13 @@ $CM* $B2 $SP* $CM* $B2;
#
# LB 19
#
$CM* $QU $CM* $CAN_CM; # . x QU
$CM* $QU $LB18NonBreaks;
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CM* $CAN_CM $CM* $QU; # QU x .
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
@ -587,69 +613,87 @@ $CM* $CAN_CM $CM* $QU; # QU x .
$AL ($HY | $HH) / $SP;
# LB 21
$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
# LB21b (reverse)
$CM* $HL $CM* $SY;
$HL $CM* $SY;
# LB 22
$CM* $IN $CM* ($ALPlus | $HL);
$CM* $IN $CM* $EX;
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$CM* $PO $CM* $ID;
$CM* $NU $CM* ($ALPlus | $HL);
$CM* ($ALPlus | $HL) $CM* $NU;
$PO $CM* ($ID | $EB | $EM);
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB 24
$CM* $ID $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PO;
($ID | $EB | $EM) $CM* $PR;
($ALPlus | $HL) $CM* $PR;
($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
$CM* ($ALPlus | $HL) $CM* $IS;
($ALPlus | $HL) $CM* $IS;
# LB 30
$CM* $OP $CM* ($ALPlus | $HL | $NU);
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
$CM* $RI $CM* $RI;
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -670,6 +714,9 @@ $CM* ($HY | $BA | $HH) $CM* $HL;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
# LB 30
($CM* $RI)+;
# For dictionary-based break
$dictionary $dictionary;
@ -686,6 +733,6 @@ $dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,13 +1,17 @@
# Copyright (c) 2002-2015 International Business Machines Corporation and
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_loose.txt
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below..
#
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
@ -26,8 +30,6 @@
#
!!chain;
!!LBCMNoChain;
!!lookAheadHardBreak;
#
@ -65,8 +67,13 @@
# See rule LB 19 for an example.
#
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$EM = [\U0001F3FB-\U0001F3FF];
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BA = [:LineBreak = Break_After:];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
@ -74,7 +81,7 @@ $B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
$CM = [:LineBreak = Combining_Mark:];
$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
@ -83,7 +90,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
$ID = [[:LineBreak = Ideographic:] $CJ];
$ID = [[:LineBreak = Ideographic:]$CJ[\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
@ -106,6 +113,7 @@ $SY = [:LineBreak = Break_Symbols:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
@ -138,7 +146,6 @@ $HLcm = $HL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
@ -168,6 +175,8 @@ $BB $CM+;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
$EB $CM+;
$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
@ -217,7 +226,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
@ -225,13 +234,13 @@ $CR $LF {100};
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
$CM+ $LB4Breaks {100};
^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
$CM+ [$SP $ZW];
^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
@ -242,20 +251,23 @@ $CM+ [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x ID Emoji proposal.
#
$ZWJ ($ID | $EB | $EM);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
$CM+;
^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
$CM+ $WJcm;
^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
@ -266,13 +278,13 @@ $WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
$CM+ GLcm;
^$CM+ $GLcm;
@ -281,23 +293,23 @@ $CM+ GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
@ -329,12 +341,10 @@ $LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
$CM+ $QUcm;
^$CM+ $QUcm;
# QU x
$QUcm .?;
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# TODO: I don't think this rule is needed.
# LB 20
@ -347,14 +357,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# BB x
#
# DO allow breaks here before NSXcm, so don't include it
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
#
$HLcm ($HYcm | $BAcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
@ -363,25 +374,25 @@ $SYcm $HLcm;
# LB 22
($ALcm | $HLcm) $INcm;
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
$IDcm $INcm;
($ID | $EB | $EM) $CM* $INcm;
# $INcm $INcm; # delete this rule for CSS loose
$NUcm $INcm;
# $LB 23
$IDcm $POcm;
($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
$PRcm $IDcm;
$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
@ -405,18 +416,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
@ -425,35 +445,37 @@ $RIcm $RIcm;
!!reverse;
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $CP;
$CM+ $EX;
$CM+ $GL;
$CM+ $HL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NSX;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $RI;
$CM+ $SY;
$CM+ $WJ;
$CM+;
^$CM+ $ALPlus;
^$CM+ $BA;
^$CM+ $BB;
^$CM+ $B2;
^$CM+ $CL;
^$CM+ $CP;
^$CM+ $EB;
^$CM+ $EM;
^$CM+ $EX;
^$CM+ $GL;
^$CM+ $HL;
^$CM+ $HY;
^$CM+ $H2;
^$CM+ $H3;
^$CM+ $ID;
^$CM+ $IN;
^$CM+ $IS;
^$CM+ $JL;
^$CM+ $JV;
^$CM+ $JT;
^$CM+ $NS;
^$CM+ $NSX;
^$CM+ $NU;
^$CM+ $OP;
^$CM+ $PO;
^$CM+ $PR;
^$CM+ $QU;
^$CM+ $RI;
^$CM+ $SY;
^$CM+ $WJ;
^$CM+;
#
@ -465,14 +487,14 @@ $AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break.
# LB14 says OP SP* x .
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
@ -484,7 +506,7 @@ $AL_FOLLOW $CM+ / (
# LB 4, 5, 5
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
@ -501,30 +523,37 @@ $LF $CR;
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
# The ZWJ will look like a CM to whatever precedes it.
#
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;
^$CM+ $CAN_CM;
# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ [$LB8NonBreaks-$CM];
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CM* $CAN_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
@ -545,29 +574,27 @@ $SY [$LB8NonBreaks-$CM];
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
$CM* $CAN_CM $SP* $CM* $OP;
$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
$CM* $OP $SP* $CM* $QU;
$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
$CM* $NS $SP* $CM* ($CL | $CP);
$NS $SP* $CM* ($CL | $CP);
# LB 17
$CM* $B2 $SP* $CM* $B2;
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
@ -576,13 +603,13 @@ $CM* $B2 $SP* $CM* $B2;
#
# LB 19
#
$CM* $QU $CM* $CAN_CM; # . x QU
$CM* $QU $LB18NonBreaks;
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CM* $CAN_CM $CM* $QU; # QU x .
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
@ -590,69 +617,88 @@ $CM* $CAN_CM $CM* $QU; # QU x .
# LB 21
# Don't include $NSX here
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB21b (reverse)
$CM* $HL $CM* $SY;
$HL $CM* $SY;
# LB 22
$CM* $IN $CM* ($ALPlus | $HL);
$CM* $IN $CM* $EX;
$CM* $IN $CM* $ID;
# $CM* $IN $CM* $IN; # delete this rule for CSS loose
$CM* $IN $CM* $NU;
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
# $IN $CM* $IN; # delete this rule for CSS loose
$IN $CM* $NU;
# LB 23
$CM* $PO $CM* $ID;
$CM* $NU $CM* ($ALPlus | $HL);
$CM* ($ALPlus | $HL) $CM* $NU;
$PO $CM* ($ID | $EB | $EM);
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB 24
$CM* $ID $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PO;
($ID | $EB | $EM) $CM* $PR;
($ALPlus | $HL) $CM* $PR;
($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
$CM* ($ALPlus | $HL) $CM* $IS;
($ALPlus | $HL) $CM* $IS;
# LB 30
$CM* $OP $CM* ($ALPlus | $HL | $NU);
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
$CM* $RI $CM* $RI;
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
# Line Loose tailoring: Don't include NSX here.
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -673,6 +719,9 @@ $CM* ($HY | $BA) $CM* $HL;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
# LB 30
($CM* $RI)+;
# For dictionary-based break
$dictionary $dictionary;
@ -689,6 +738,6 @@ $dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,12 +1,16 @@
# Copyright (c) 2002-2015 International Business Machines Corporation and
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_loose_cj.txt
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
#
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
@ -33,8 +37,6 @@
#
!!chain;
!!LBCMNoChain;
!!lookAheadHardBreak;
#
@ -72,8 +74,13 @@
# See rule LB 19 for an example.
#
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$EM = [\U0001F3FB-\U0001F3FF];
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BB = [:LineBreak = Break_Before:];
@ -82,7 +89,7 @@ $B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
$CM = [:LineBreak = Combining_Mark:];
$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EXX = [\uFF01 \uFF1F];
@ -92,7 +99,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
$ID = [[:LineBreak = Ideographic:] $CJ];
$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
@ -117,6 +124,7 @@ $SY = [:LineBreak = Break_Symbols:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
@ -151,7 +159,6 @@ $HLcm = $HL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
@ -184,6 +191,8 @@ $BB $CM+;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
$EB $CM+;
$EM $CM+;
$EX $CM+;
$EXX $CM+;
$GL $CM+;
@ -236,7 +245,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
@ -244,13 +253,13 @@ $CR $LF {100};
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
$CM+ $LB4Breaks {100};
^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
$CM+ [$SP $ZW];
^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
@ -261,20 +270,23 @@ $CM+ [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x ID Emoji proposal.
#
$ZWJ ($ID | $EB | $EM);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
$CM+;
^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
$CM+ $WJcm;
^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
@ -285,14 +297,13 @@ $WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
$CM+ GLcm;
^$CM+ $GLcm;
#
@ -301,23 +312,23 @@ $CM+ GLcm;
# Do not include $EXX here
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
@ -349,12 +360,10 @@ $LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
$CM+ $QUcm;
^$CM+ $QUcm;
# QU x
$QUcm .?;
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# TODO: I don't think this rule is needed.
# LB 20
@ -368,13 +377,14 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
#
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
#
$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
@ -383,19 +393,19 @@ $SYcm $HLcm;
# LB 22
($ALcm | $HLcm) $INcm;
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
$IDcm $INcm;
($ID | $EB | $EM) $CM* $INcm;
# $INcm $INcm; # delete this rule for CSS loose
$NUcm $INcm;
# LB 23
# $LB 23
# Do not include $POX here
$IDcm $POcm;
($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
@ -403,7 +413,7 @@ $NUcm $HLcm;
# LB 24
#
# Do not include $PRX here
$PRcm $IDcm;
$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
($POcm | $POXcm) ($ALcm | $HLcm);
@ -429,18 +439,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
@ -449,39 +468,41 @@ $RIcm $RIcm;
!!reverse;
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $BAX;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $CP;
$CM+ $EX;
$CM+ $EXX;
$CM+ $GL;
$CM+ $HL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NSX;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $POX;
$CM+ $PR;
$CM+ $PRX;
$CM+ $QU;
$CM+ $RI;
$CM+ $SY;
$CM+ $WJ;
$CM+;
^$CM+ $ALPlus;
^$CM+ $BA;
^$CM+ $BAX;
^$CM+ $BB;
^$CM+ $B2;
^$CM+ $CL;
^$CM+ $CP;
^$CM+ $EB;
^$CM+ $EM;
^$CM+ $EX;
^$CM+ $EXX;
^$CM+ $GL;
^$CM+ $HL;
^$CM+ $HY;
^$CM+ $H2;
^$CM+ $H3;
^$CM+ $ID;
^$CM+ $IN;
^$CM+ $IS;
^$CM+ $JL;
^$CM+ $JV;
^$CM+ $JT;
^$CM+ $NS;
^$CM+ $NSX;
^$CM+ $NU;
^$CM+ $OP;
^$CM+ $PO;
^$CM+ $POX;
^$CM+ $PR;
^$CM+ $PRX;
^$CM+ $QU;
^$CM+ $RI;
^$CM+ $SY;
^$CM+ $WJ;
^$CM+;
#
@ -493,14 +514,14 @@ $AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break.
# LB14 says OP SP* x .
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
@ -512,7 +533,7 @@ $AL_FOLLOW $CM+ / (
# LB 4, 5, 5
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
@ -529,30 +550,37 @@ $LF $CR;
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
# The ZWJ will look like a CM to whatever precedes it.
#
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;
^$CM+ $CAN_CM;
# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ [$LB8NonBreaks-$CM];
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CM* $CAN_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
@ -574,29 +602,27 @@ $SY [$LB8NonBreaks-$CM];
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
$CM* $CAN_CM $SP* $CM* $OP;
$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
$CM* $OP $SP* $CM* $QU;
$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
$CM* $NS $SP* $CM* ($CL | $CP);
$NS $SP* $CM* ($CL | $CP);
# LB 17
$CM* $B2 $SP* $CM* $B2;
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
@ -605,13 +631,13 @@ $CM* $B2 $SP* $CM* $B2;
#
# LB 19
#
$CM* $QU $CM* $CAN_CM; # . x QU
$CM* $QU $LB18NonBreaks;
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CM* $CAN_CM $CM* $QU; # QU x .
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
@ -619,73 +645,90 @@ $CM* $CAN_CM $CM* $QU; # QU x .
# LB 21
# Don't include $BAX or $NSX here
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
# LB21b (reverse)
$CM* $HL $CM* $SY;
$HL $CM* $SY;
# LB 22
$CM* $IN $CM* ($ALPlus | $HL);
$CM* $IN $CM* $EX;
$CM* $IN $CM* $ID;
# $CM* $IN $CM* $IN; # delete this rule for CSS loose
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
# $IN $CM* $IN; # delete this rule for CSS loose
$CM* $IN $CM* $NU;
# LB 23
# Do not include $POX here
$CM* $PO $CM* $ID;
$CM* $NU $CM* ($ALPlus | $HL);
$CM* ($ALPlus | $HL) $CM* $NU;
$PO $CM* ($ID | $EB | $EM);
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB 24
# Do not include $PRX here
$CM* $ID $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PR;
$CM* ($ALPlus | $HL) $CM* ($PO | $POX);
($ID | $EB | $EM) $CM* $PR;
($ALPlus | $HL) $CM* $PR;
($ALPlus | $HL) $CM* ($PO | $POX);
# LB 25
# Here do not include $POX at the beginning or $PRX at the end
($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?;
# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
# Do not include $POX or $PRX here
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
$CM* ($ALPlus | $HL) $CM* $IS;
($ALPlus | $HL) $CM* $IS;
# LB 30
$CM* $OP $CM* ($ALPlus | $HL | $NU);
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
$CM* $RI $CM* $RI;
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -706,6 +749,9 @@ $CM* ($HY | $BA | $BAX) $CM* $HL;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
# LB 30
($CM* $RI)+;
# For dictionary-based break
$dictionary $dictionary;
@ -722,6 +768,6 @@ $dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $dictionary];
^[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,13 +1,17 @@
# Copyright (c) 2002-2015 International Business Machines Corporation and
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_loose_fi.txt
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below..
#
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
@ -24,8 +28,6 @@
#
!!chain;
!!LBCMNoChain;
!!lookAheadHardBreak;
#
@ -63,8 +65,13 @@
# See rule LB 19 for an example.
#
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$EM = [\U0001F3FB-\U0001F3FF];
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BA = [[:LineBreak = Break_After:] - [\u2010]];
$HH = [\u2010];
$BB = [:LineBreak = Break_Before:];
@ -73,7 +80,7 @@ $B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
$CM = [:LineBreak = Combining_Mark:];
$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
@ -82,7 +89,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
$ID = [[:LineBreak = Ideographic:] $CJ];
$ID = [[:LineBreak = Ideographic:]$CJ[\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
@ -105,6 +112,7 @@ $SY = [:LineBreak = Break_Symbols:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
@ -138,7 +146,6 @@ $HLcm = $HL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
@ -169,6 +176,8 @@ $BB $CM+;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
$EB $CM+;
$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
@ -218,7 +227,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
@ -226,13 +235,13 @@ $CR $LF {100};
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
$CM+ $LB4Breaks {100};
^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
$CM+ [$SP $ZW];
^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
@ -243,20 +252,23 @@ $CM+ [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x ID Emoji proposal.
#
$ZWJ ($ID | $EB | $EM);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
$CM+;
^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
$CM+ $WJcm;
^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
@ -267,13 +279,13 @@ $WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
$CM+ GLcm;
^$CM+ $GLcm;
@ -282,23 +294,23 @@ $CM+ GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
@ -330,13 +342,10 @@ $LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
$CM+ $QUcm;
^$CM+ $QUcm;
# QU x
$QUcm .?;
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# TODO: I don't think this rule is needed.
# LB 20
# <break> $CB
@ -352,13 +361,14 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
($HY | $HH) $AL;
^$CM+ ($BAcm | $HHcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
#
$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
@ -367,25 +377,25 @@ $SYcm $HLcm;
# LB 22
($ALcm | $HLcm) $INcm;
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
$IDcm $INcm;
($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
$IDcm $POcm;
($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
$PRcm $IDcm;
$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
@ -409,18 +419,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}];
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM $ID $EB $EM] {eof}];
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}];
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX {eof}];
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
@ -429,36 +448,38 @@ $RIcm $RIcm;
!!reverse;
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $HH;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $CP;
$CM+ $EX;
$CM+ $GL;
$CM+ $HL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NSX;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $RI;
$CM+ $SY;
$CM+ $WJ;
$CM+;
^$CM+ $ALPlus;
^$CM+ $BA;
^$CM+ $BB;
^$CM+ $B2;
^$CM+ $CL;
^$CM+ $CP;
^$CM+ $EB;
^$CM+ $EM;
^$CM+ $EX;
^$CM+ $GL;
^$CM+ $HH;
^$CM+ $HL;
^$CM+ $HY;
^$CM+ $H2;
^$CM+ $H3;
^$CM+ $ID;
^$CM+ $IN;
^$CM+ $IS;
^$CM+ $JL;
^$CM+ $JV;
^$CM+ $JT;
^$CM+ $NS;
^$CM+ $NSX;
^$CM+ $NU;
^$CM+ $OP;
^$CM+ $PO;
^$CM+ $PR;
^$CM+ $QU;
^$CM+ $RI;
^$CM+ $SY;
^$CM+ $WJ;
^$CM+;
#
@ -470,14 +491,14 @@ $AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
@ -489,7 +510,7 @@ $AL_FOLLOW $CM+ / (
# LB 4, 5, 5
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
@ -506,30 +527,37 @@ $LF $CR;
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
# The ZWJ will look like a CM to whatever precedes it.
#
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;
^$CM+ $CAN_CM;
# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ [$LB8NonBreaks-$CM];
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CM* $CAN_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
@ -550,29 +578,27 @@ $SY [$LB8NonBreaks-$CM];
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
$CM* $CAN_CM $SP* $CM* $OP;
$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
$CM* $OP $SP* $CM* $QU;
$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
$CM* $NS $SP* $CM* ($CL | $CP);
$NS $SP* $CM* ($CL | $CP);
# LB 17
$CM* $B2 $SP* $CM* $B2;
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
@ -581,13 +607,13 @@ $CM* $B2 $SP* $CM* $B2;
#
# LB 19
#
$CM* $QU $CM* $CAN_CM; # . x QU
$CM* $QU $LB18NonBreaks;
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CM* $CAN_CM $CM* $QU; # QU x .
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
@ -598,69 +624,87 @@ $AL ($HY | $HH) / $SP;
# LB 21
# Don't include $NSX here
$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
# LB21b (reverse)
$CM* $HL $CM* $SY;
$HL $CM* $SY;
# LB 22
$CM* $IN $CM* ($ALPlus | $HL);
$CM* $IN $CM* $EX;
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$CM* $PO $CM* $ID;
$CM* $NU $CM* ($ALPlus | $HL);
$CM* ($ALPlus | $HL) $CM* $NU;
$PO $CM* ($ID | $EB | $EM);
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB 24
$CM* $ID $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PO;
($ID | $EB | $EM) $CM* $PR;
($ALPlus | $HL) $CM* $PR;
($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
$CM* ($ALPlus | $HL) $CM* $IS;
($ALPlus | $HL) $CM* $IS;
# LB 30
$CM* $OP $CM* ($ALPlus | $HL | $NU);
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
$CM* $RI $CM* $RI;
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -675,12 +719,15 @@ $SP+ $CM* ($CL | $CP);
$SP+ $CM* $B2;
# LB 21
$CM* ($HY | $BA | $HH) $CM* $HL;
($HY | $BA | $HH) $CM* $HL;
# LB 25
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
# LB 30
($CM* $RI)+;
# For dictionary-based break
$dictionary $dictionary;
@ -697,6 +744,6 @@ $dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,13 +1,17 @@
# Copyright (c) 2002-2015 International Business Machines Corporation and
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_normal.txt
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below..
#
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
@ -23,8 +27,6 @@
#
!!chain;
!!LBCMNoChain;
!!lookAheadHardBreak;
#
@ -62,8 +64,13 @@
# See rule LB 19 for an example.
#
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$EM = [\U0001F3FB-\U0001F3FF];
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BA = [:LineBreak = Break_After:];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
@ -71,7 +78,7 @@ $B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
$CM = [:LineBreak = Combining_Mark:];
$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
@ -80,7 +87,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
$ID = [[:LineBreak = Ideographic:] $CJ];
$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
@ -102,6 +109,7 @@ $SY = [:LineBreak = Break_Symbols:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
@ -134,7 +142,6 @@ $HLcm = $HL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
@ -163,6 +170,8 @@ $BB $CM+;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
$EB $CM+;
$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
@ -211,7 +220,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
@ -219,13 +228,13 @@ $CR $LF {100};
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
$CM+ $LB4Breaks {100};
^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
$CM+ [$SP $ZW];
^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
@ -236,20 +245,23 @@ $CM+ [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x ID Emoji proposal.
#
$ZWJ ($ID | $EB | $EM);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
$CM+;
^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
$CM+ $WJcm;
^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
@ -260,13 +272,13 @@ $WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
$CM+ GLcm;
^$CM+ $GLcm;
@ -275,23 +287,23 @@ $CM+ GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
@ -321,12 +333,10 @@ $LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
$CM+ $QUcm;
^$CM+ $QUcm;
# QU x
$QUcm .?;
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# TODO: I don't think this rule is needed.
# LB 20
@ -338,14 +348,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
#
$HLcm ($HYcm | $BAcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
@ -354,25 +365,25 @@ $SYcm $HLcm;
# LB 22
($ALcm | $HLcm) $INcm;
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
$IDcm $INcm;
($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
$IDcm $POcm;
($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
$PRcm $IDcm;
$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
@ -396,18 +407,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
@ -416,34 +436,36 @@ $RIcm $RIcm;
!!reverse;
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $CP;
$CM+ $EX;
$CM+ $GL;
$CM+ $HL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $RI;
$CM+ $SY;
$CM+ $WJ;
$CM+;
^$CM+ $ALPlus;
^$CM+ $BA;
^$CM+ $BB;
^$CM+ $B2;
^$CM+ $CL;
^$CM+ $CP;
^$CM+ $EB;
^$CM+ $EM;
^$CM+ $EX;
^$CM+ $GL;
^$CM+ $HL;
^$CM+ $HY;
^$CM+ $H2;
^$CM+ $H3;
^$CM+ $ID;
^$CM+ $IN;
^$CM+ $IS;
^$CM+ $JL;
^$CM+ $JV;
^$CM+ $JT;
^$CM+ $NS;
^$CM+ $NU;
^$CM+ $OP;
^$CM+ $PO;
^$CM+ $PR;
^$CM+ $QU;
^$CM+ $RI;
^$CM+ $SY;
^$CM+ $WJ;
^$CM+;
#
@ -455,14 +477,14 @@ $AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
@ -474,7 +496,7 @@ $AL_FOLLOW $CM+ / (
# LB 4, 5, 5
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
@ -491,30 +513,37 @@ $LF $CR;
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
# The ZWJ will look like a CM to whatever precedes it.
#
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;
^$CM+ $CAN_CM;
# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ [$LB8NonBreaks-$CM];
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CM* $CAN_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
@ -535,28 +564,26 @@ $SY [$LB8NonBreaks-$CM];
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
$CM* $CAN_CM $SP* $CM* $OP;
$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
$CM* $OP $SP* $CM* $QU;
$OP $SP* $CM* $QU;
# LB 16
$CM* $NS $SP* $CM* ($CL | $CP);
$NS $SP* $CM* ($CL | $CP);
# LB 17
$CM* $B2 $SP* $CM* $B2;
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
@ -565,82 +592,100 @@ $CM* $B2 $SP* $CM* $B2;
#
# LB 19
#
$CM* $QU $CM* $CAN_CM; # . x QU
$CM* $QU $LB18NonBreaks;
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CM* $CAN_CM $CM* $QU; # QU x .
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 21
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB21b (reverse)
$CM* $HL $CM* $SY;
$HL $CM* $SY;
# LB 22
$CM* $IN $CM* ($ALPlus | $HL);
$CM* $IN $CM* $EX;
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$CM* $PO $CM* $ID;
$CM* $NU $CM* ($ALPlus | $HL);
$CM* ($ALPlus | $HL) $CM* $NU;
$PO $CM* ($ID | $EB | $EM);
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB 24
$CM* $ID $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PO;
($ID | $EB | $EM) $CM* $PR;
($ALPlus | $HL) $CM* $PR;
($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
$CM* ($ALPlus | $HL) $CM* $IS;
($ALPlus | $HL) $CM* $IS;
# LB 30
$CM* $OP $CM* ($ALPlus | $HL | $NU);
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
$CM* $RI $CM* $RI;
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -661,6 +706,9 @@ $CM* ($HY | $BA) $CM* $HL;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
# LB 30
($CM* $RI)+;
# For dictionary-based break
$dictionary $dictionary;
@ -677,6 +725,6 @@ $dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,13 +1,17 @@
# Copyright (c) 2002-2015 International Business Machines Corporation and
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_normal_cj.txt
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below..
#
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
@ -24,8 +28,6 @@
#
!!chain;
!!LBCMNoChain;
!!lookAheadHardBreak;
#
@ -63,8 +65,13 @@
# See rule LB 19 for an example.
#
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$EM = [\U0001F3FB-\U0001F3FF];
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BB = [:LineBreak = Break_Before:];
@ -73,7 +80,7 @@ $B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
$CL = [:LineBreak = Close_Punctuation:];
$CM = [:LineBreak = Combining_Mark:];
$CM = [[:LineBreak = Combining_Mark:] \u200d];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
@ -82,7 +89,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
$ID = [[:LineBreak = Ideographic:] $CJ];
$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB];
$IN = [:LineBreak = Inseperable:];
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
@ -105,6 +112,7 @@ $SY = [:LineBreak = Break_Symbols:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [\u200d];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
@ -138,7 +146,6 @@ $HLcm = $HL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
@ -169,6 +176,8 @@ $BB $CM+;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
$EB $CM+;
$EM $CM+;
$EX $CM+;
$GL $CM+;
$HL $CM+;
@ -218,7 +227,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
# Rule LB 4, 5 Mandatory (Hard) breaks.
#
$LB4Breaks = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
$CR $LF {100};
#
@ -226,13 +235,13 @@ $CR $LF {100};
#
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
$CAN_CM $CM* $LB4Breaks {100};
$CM+ $LB4Breaks {100};
^$CM+ $LB4Breaks {100};
# LB 7 x SP
# x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
$CM+ [$SP $ZW];
^$CM+ [$SP $ZW];
#
# LB 8 Break after zero width space
@ -243,20 +252,23 @@ $CM+ [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x ID Emoji proposal.
#
$ZWJ ($ID | $EB | $EM);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
$CM+;
^$CM+;
#
# LB 11 Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM* $WJcm;
$LB8NonBreaks $WJcm;
$CM+ $WJcm;
^$CM+ $WJcm;
$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;
@ -267,13 +279,13 @@ $WJcm $CAN_CM $CM*;
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;
#
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
$CM+ GLcm;
^$CM+ $GLcm;
@ -282,23 +294,23 @@ $CM+ GLcm;
#
$LB8NonBreaks $CL;
$CAN_CM $CM* $CL;
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $CP;
$CAN_CM $CM* $CP;
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $EX;
$CAN_CM $CM* $EX;
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $IS;
$CAN_CM $CM* $IS;
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
$LB8NonBreaks $SY;
$CAN_CM $CM* $SY;
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
#
@ -330,12 +342,10 @@ $LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
$LB18NonBreaks $CM* $QUcm;
$CM+ $QUcm;
^$CM+ $QUcm;
# QU x
$QUcm .?;
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# TODO: I don't think this rule is needed.
# LB 20
@ -348,14 +358,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# BB x
#
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
^$CM+ ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB20NonBreaks $CM*;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
#
$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
@ -364,25 +375,25 @@ $SYcm $HLcm;
# LB 22
($ALcm | $HLcm) $INcm;
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
$EXcm $INcm;
$IDcm $INcm;
($ID | $EB | $EM) $CM* $INcm;
$INcm $INcm;
$NUcm $INcm;
# $LB 23
$IDcm $POcm;
($ID | $EB | $EM) $CM* $POcm;
$ALcm $NUcm; # includes $LB19
$HLcm $NUcm;
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
$NUcm $HLcm;
#
# LB 24
#
$PRcm $IDcm;
$PRcm ($ID | $EB | $EM);
$PRcm ($ALcm | $HLcm);
$POcm ($ALcm | $HLcm);
@ -406,18 +417,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
# LB 28 Do not break between alphabetics
#
($ALcm | $HLcm) ($ALcm | $HLcm);
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
# LB 29
$IScm ($ALcm | $HLcm);
# LB 30
($ALcm | $HLcm | $NUcm) $OPcm;
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $HLcm | $NUcm);
# LB 30a Do not break between regional indicators.
$RIcm $RIcm;
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
@ -426,36 +446,38 @@ $RIcm $RIcm;
!!reverse;
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $BAX;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $CP;
$CM+ $EX;
$CM+ $GL;
$CM+ $HL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NSX;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $RI;
$CM+ $SY;
$CM+ $WJ;
$CM+;
^$CM+ $ALPlus;
^$CM+ $BA;
^$CM+ $BAX;
^$CM+ $BB;
^$CM+ $B2;
^$CM+ $CL;
^$CM+ $CP;
^$CM+ $EB;
^$CM+ $EM;
^$CM+ $EX;
^$CM+ $GL;
^$CM+ $HL;
^$CM+ $HY;
^$CM+ $H2;
^$CM+ $H3;
^$CM+ $ID;
^$CM+ $IN;
^$CM+ $IS;
^$CM+ $JL;
^$CM+ $JV;
^$CM+ $JT;
^$CM+ $NS;
^$CM+ $NSX;
^$CM+ $NU;
^$CM+ $OP;
^$CM+ $PO;
^$CM+ $PR;
^$CM+ $QU;
^$CM+ $RI;
^$CM+ $SY;
^$CM+ $WJ;
^$CM+;
#
@ -467,14 +489,14 @@ $AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break.
# LB14 says OP SP* x .
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
@ -486,7 +508,7 @@ $AL_FOLLOW $CM+ / (
# LB 4, 5, 5
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
@ -503,30 +525,37 @@ $LF $CR;
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
# The ZWJ will look like a CM to whatever precedes it.
#
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;
^$CM+ $CAN_CM;
# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ [$LB8NonBreaks-$CM];
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CM* $CAN_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
@ -547,29 +576,27 @@ $SY [$LB8NonBreaks-$CM];
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 14 OP SP* x
#
$CM* $CAN_CM $SP* $CM* $OP;
$CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
# LB 15
$CM* $OP $SP* $CM* $QU;
$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
$CM* $NS $SP* $CM* ($CL | $CP);
$NS $SP* $CM* ($CL | $CP);
# LB 17
$CM* $B2 $SP* $CM* $B2;
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
@ -578,13 +605,13 @@ $CM* $B2 $SP* $CM* $B2;
#
# LB 19
#
$CM* $QU $CM* $CAN_CM; # . x QU
$CM* $QU $LB18NonBreaks;
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CM* $CAN_CM $CM* $QU; # QU x .
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
@ -592,69 +619,87 @@ $CM* $CAN_CM $CM* $QU; # QU x .
# LB 21
# Don't include $BAX or $NSX here
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
# LB21a Don't break after Hebrew + Hyphen.
([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL;
# LB21b (reverse)
$CM* $HL $CM* $SY;
$HL $CM* $SY;
# LB 22
$CM* $IN $CM* ($ALPlus | $HL);
$CM* $IN $CM* $EX;
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$CM* $PO $CM* $ID;
$CM* $NU $CM* ($ALPlus | $HL);
$CM* ($ALPlus | $HL) $CM* $NU;
$PO $CM* ($ID | $EB | $EM);
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB 24
$CM* $ID $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PR;
$CM* ($ALPlus | $HL) $CM* $PO;
($ID | $EB | $EM) $CM* $PR;
($ALPlus | $HL) $CM* $PR;
($ALPlus | $HL) $CM* $PO;
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
$CM* ($ALPlus | $HL) $CM* $IS;
($ALPlus | $HL) $CM* $IS;
# LB 30
$CM* $OP $CM* ($ALPlus | $HL | $NU);
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
$CM* $RI $CM* $RI;
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -675,6 +720,9 @@ $CM* ($HY | $BA | $BAX) $CM* $HL;
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);
# LB 30
($CM* $RI)+;
# For dictionary-based break
$dictionary $dictionary;
@ -691,6 +739,6 @@ $dictionary $dictionary;
# turn off rule chaining. We don't want to move more
# than necessary.
#
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $dictionary];
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,4 +1,4 @@
# Copyright (c) 2002-2015 International Business Machines Corporation and
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line_normal_fi.txt
@ -269,7 +269,7 @@ $GLcm $CANT_CM;
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
$CM+ GLcm;
$CM+ $GLcm;

View file

@ -1,12 +1,13 @@
#
# Copyright (C) 2002-2015, International Business Machines Corporation
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0
# with additions from L2/16-011R3 for Emoji sequences.
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
@ -24,12 +25,17 @@
# Character Class Definitions.
#
$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$E_Modifier = [\U0001F3FB-\U0001F3FF];
$ZWJ = [\u200D];
$GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Newline = [\p{Word_Break = Newline} ];
$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
@ -66,21 +72,21 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
## -------------------------------------------------
@ -91,12 +97,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format)*;
#
$CR $LF;
# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
#
$ZWJ $GAZ;
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;
[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
$NumericEx {100};
$ALetterEx {200};
@ -106,6 +117,10 @@ $KatakanaEx {400}; # note: these status values override those from rule 5
$HiraganaEx {400}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
$E_Base ($Extend | $Format | $ZWJ)*;
$E_Modifier ($Extend | $Format | $ZWJ)*;
$GAZ ($Extend | $Format | $ZWJ)*;
#
# rule 5
# Do not break between most letters.
@ -157,36 +172,48 @@ $ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {400}; # (13b)
# rule 13c
$Regional_IndicatorEx $Regional_IndicatorEx;
# Pairs of Regional Indicators stay together.
# With rule chaining disabled by ^, this rule will match exactly two of them.
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
#
^$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
# rule 13d
# E_Base x E_Modifier
#
($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier;
## -------------------------------------------------
!!reverse;
$BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter;
$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
$BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote;
$BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote;
$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
$BackNumericEx = ($Format | $Extend)* $Numeric;
$BackMidNumEx = ($Format | $Extend)* $MidNum;
$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx = ($Format | $Extend)* $Katakana;
$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
# rule 3
$LF $CR;
# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
#
$GAZ $ZWJ;
# rule 4
($Format | $Extend)* [^$CR $LF $Newline]?;
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
# rule 5
@ -229,18 +256,32 @@ $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $B
# rule 13c
$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
# rule 13d
$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ);
## -------------------------------------------------
!!safe_reverse;
# rule 3
($Extend | $Format)+ .?;
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
@ -252,6 +293,9 @@ $Double_Quote $BackHebrew_LetterEx;
# rule 11
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
# rule 13c
$BackRegional_IndicatorEx*;
# For dictionary-based break
$dictionary $dictionary;
@ -260,7 +304,7 @@ $dictionary $dictionary;
!!safe_forward;
# rule 4
($Extend | $Format)+ .?;
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
@ -271,5 +315,8 @@ $Double_QuoteEx $Hebrew_LetterEx;
# rule 11
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
# rule 13c
$Regional_IndicatorEx*;
# For dictionary-based break
$dictionary $dictionary;

View file

@ -1,12 +1,13 @@
#
# Copyright (C) 2002-2015, International Business Machines Corporation
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word_POSIX.txt
#
# ICU Word Break Rules, POSIX locale.
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0
# with additions from L2/16-011R3 for Emoji sequences.
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
@ -24,12 +25,17 @@
# Character Class Definitions.
#
$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$E_Modifier = [\U0001F3FB-\U0001F3FF];
$ZWJ = [\u200D];
$GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Newline = [\p{Word_Break = Newline} ];
$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
@ -50,7 +56,7 @@ $Hiragana = [:Hiragana:];
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$Control = [\p{Grapheme_Cluster_Break = Control}];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
@ -62,25 +68,25 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
#
# Rules 4 Ignore Format and Extend characters,
# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
## -------------------------------------------------
@ -91,12 +97,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format)*;
#
$CR $LF;
# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
#
$ZWJ $GAZ;
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;
[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
$NumericEx {100};
$ALetterEx {200};
@ -106,6 +117,10 @@ $KatakanaEx {400}; # note: these status values override those from rule 5
$HiraganaEx {400}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
$E_Base ($Extend | $Format | $ZWJ)*;
$E_Modifier ($Extend | $Format | $ZWJ)*;
$GAZ ($Extend | $Format | $ZWJ)*;
#
# rule 5
# Do not break between most letters.
@ -133,7 +148,7 @@ $NumericEx $NumericEx {100};
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 11 and 12
# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
@ -157,36 +172,48 @@ $ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {400}; # (13b)
# rule 13c
$Regional_IndicatorEx $Regional_IndicatorEx;
# Pairs of Regional Indicators stay together.
# With rule chaining disabled by ^, this rule will match exactly two of them.
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
#
^$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
# rule 13d
# E_Base x E_Modifier
#
($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier;
## -------------------------------------------------
!!reverse;
$BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter;
$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
$BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote;
$BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote;
$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
$BackNumericEx = ($Format | $Extend)* $Numeric;
$BackMidNumEx = ($Format | $Extend)* $MidNum;
$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx = ($Format | $Extend)* $Katakana;
$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
# rule 3
$LF $CR;
# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
#
$GAZ $ZWJ;
# rule 4
($Format | $Extend)* [^$CR $LF $Newline]?;
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
# rule 5
@ -225,22 +252,36 @@ $BackKatakanaEx $BackKatakanaEx;
# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
# rule 13c
$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
# rule 13d
$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ);
## -------------------------------------------------
!!safe_reverse;
# rule 3
($Extend | $Format)+ .?;
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
@ -252,6 +293,9 @@ $Double_Quote $BackHebrew_LetterEx;
# rule 11
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
# rule 13c
$BackRegional_IndicatorEx*;
# For dictionary-based break
$dictionary $dictionary;
@ -260,7 +304,7 @@ $dictionary $dictionary;
!!safe_forward;
# rule 4
($Extend | $Format)+ .?;
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
@ -271,5 +315,8 @@ $Double_QuoteEx $Hebrew_LetterEx;
# rule 11
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
# rule 13c
$Regional_IndicatorEx*;
# For dictionary-based break
$dictionary $dictionary;

View file

@ -543,7 +543,7 @@ static void TestBreakIteratorRules() {
* keep together 'abc', but only when followed by 'def', OTHERWISE
* just return one char at a time.
*/
char rules[] = "abc{666}/def;\n [\\p{L} - [a]]* {2}; . {1};";
char rules[] = "abc/def{666};\n [\\p{L} - [a]]* {2}; . {1};";
/* 0123456789012345678 */
char data[] = "abcdex abcdefgh-def"; /* the test data string */
char breaks[] = "** ** * ** *"; /* * the expected break positions */

View file

@ -49,7 +49,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
bytestrietest.o ucharstrietest.o \
itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
itrbbi.o rbbiapts.o rbbitst.o rbbimonkeytest.o ittrans.o transapi.o cpdtrtst.o \
testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
jamotest.o srchtest.o reptest.o regextst.o \
itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2015, International Business Machines Corporation and
* Copyright (c) 1997-2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -1790,6 +1790,39 @@ float IntlTest::random() {
return random(&RAND_SEED);
}
/*
* Integer random number class implementation.
* Similar to C++ std::minstd_rand, with the same algorithm & constants.
*/
/**
 * Create a generator starting from the given seed.
 * The seed is reduced modulo 2^31 - 1; a reduced value of zero is
 * replaced by 1 so the stored state is never zero.
 */
IntlTest::icu_rand::icu_rand(uint32_t seed) {
    const uint32_t reduced = seed % 2147483647UL;
    fLast = (reduced != 0) ? reduced : 1;
}
/** Nothing to release; the generator owns only its integer state. */
IntlTest::icu_rand::~icu_rand() {
}
/**
 * Re-seed the generator.
 *
 * The seed is normalized exactly as in the constructor: it is reduced
 * modulo 2^31 - 1 and a result of zero is replaced by 1. Without the
 * reduction, a seed that is a multiple of 2^31 - 1 would put the
 * multiplicative generator into a state from which every subsequent
 * value is zero.
 *
 * @param seed the new starting value.
 */
void IntlTest::icu_rand::seed(uint32_t seed) {
    seed = seed % 2147483647UL;
    if (seed == 0) {
        seed = 1;
    }
    fLast = seed;
}
/**
 * Advance the generator by one step and return the new state.
 * The multiplication is carried out in 64 bits so the product cannot
 * overflow before the modulus is taken.
 */
uint32_t IntlTest::icu_rand::operator() () {
    const uint64_t product = static_cast<uint64_t>(fLast) * 48271UL;
    fLast = static_cast<uint32_t>(product % 2147483647UL);
    return fLast;
}
/** Return the current generator state, usable as a seed to resume the sequence. */
uint32_t IntlTest::icu_rand::getSeed() {
    return static_cast<uint32_t>(fLast);
}
/**
 * Convert a value to its upper-case hexadecimal digit.
 * Values below 10 map onto '0'..'9' (0x30 + i); other values are offset
 * so that 10 maps onto 'A' (0x41).
 */
static inline UChar toHex(int32_t i) {
    const int32_t offset = (i < 10) ? 0x30 : (0x41 - 10);
    return (UChar)(i + offset);
}

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2015, International Business Machines Corporation and
* COPYRIGHT:
* Copyright (c) 1997-2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -18,7 +18,7 @@
#if U_NO_DEFAULT_INCLUDE_UTF_HEADERS
/* deprecated - make tests pass with U_NO_DEFAULT_INCLUDE_UTF_HEADERS */
#include "unicode/utf_old.h"
#include "unicode/utf_old.h"
#endif
U_NAMESPACE_USE
@ -166,7 +166,7 @@ public:
/**
* Replaces isICUVersionAtLeast and isICUVersionBefore
* log that an issue is known.
* Usually used this way:
* Usually used this way:
* <code>if( ... && logKnownIssue("12345", "some bug")) continue; </code>
* @param ticket ticket string, "12345" or "cldrbug:1234"
* @param message optional message string
@ -230,11 +230,11 @@ public:
void errcheckln(UErrorCode status, const char *fmt, ...);
// Print ALL named errors encountered so far
void printErrors();
void printErrors();
// print known issues. return TRUE if there were any.
UBool printKnownIssues();
virtual void usage( void ) ;
/**
@ -253,6 +253,30 @@ public:
*/
static float random();
/**
* Integer random numbers, similar to C++ std::minstd_rand, with the same algorithm
* and constants. Allow additional access to internal state, for use by monkey tests,
* which need to recreate previous random sequences beginning near a failure point.
*/
class icu_rand {
public:
/** Construct a generator; the starting seed defaults to 1. */
icu_rand(uint32_t seed = 1);
~icu_rand();
/** Replace the generator state with a new seed value. */
void seed(uint32_t seed);
/** Advance the generator and return the next value in the sequence. */
uint32_t operator()();
/**
* Get a seed corresponding to the current state of the generator.
* Seeding any generator with this value will cause it to produce the
* same sequence as this one will from this point forward.
*/
uint32_t getSeed();
private:
// Current state of the generator; also the most recently returned value.
uint32_t fLast;
};
enum { kMaxProps = 16 };
virtual void setProperty(const char* propline);
@ -320,7 +344,7 @@ private:
int32_t dataErrorCount;
IntlTest* caller;
char* testPath; // specifies subtests
char basePath[1024];
char currName[1024]; // current test name

View file

@ -238,6 +238,7 @@
<DisableLanguageExtensions>false</DisableLanguageExtensions>
</ClCompile>
<ClCompile Include="rbbitst.cpp" />
<ClCompile Include="rbbimonkeytest.cpp" />
<ClCompile Include="itspoof.cpp" />
<ClCompile Include="allcoll.cpp" />
<ClCompile Include="alphaindextst.cpp" />
@ -434,6 +435,7 @@
<ClInclude Include="itrbbi.h" />
<ClInclude Include="rbbiapts.h" />
<ClInclude Include="rbbitst.h" />
<ClInclude Include="rbbimonkeytest.h" />
<ClInclude Include="itspoof.h" />
<ClInclude Include="allcoll.h" />
<ClInclude Include="alphaindextst.h" />

View file

@ -70,6 +70,9 @@
<ClCompile Include="rbbitst.cpp">
<Filter>break iteration</Filter>
</ClCompile>
<ClCompile Include="rbbimonkeytest.cpp">
<Filter>break iteration</Filter>
</ClCompile>
<ClCompile Include="itspoof.cpp">
<Filter>spoof detection</Filter>
</ClCompile>
@ -504,6 +507,9 @@
<ClInclude Include="rbbitst.h">
<Filter>break iteration</Filter>
</ClInclude>
<ClInclude Include="rbbimonkeytest.h">
<Filter>break iteration</Filter>
</ClInclude>
<ClInclude Include="itspoof.h">
<Filter>spoof detection</Filter>
</ClInclude>

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1998-2012, International Business Machines Corporation
* Copyright (C) 1998-2016, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*/
@ -16,30 +16,23 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include "intltest.h"
#include "itrbbi.h"
#include "rbbiapts.h"
#include "rbbitst.h"
#define TESTCLASS(n,classname) \
case n: \
name = #classname; \
if (exec) { \
logln(#classname "---"); \
logln(""); \
classname t; \
callTest(t, par); \
} \
break
#include "rbbimonkeytest.h"
void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
switch (index) {
TESTCLASS(0, RBBIAPITest);
TESTCLASS(1, RBBITest);
default: name=""; break;
if (exec) {
logln("TestSuite RuleBasedBreakIterator: ");
}
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO_CLASS(RBBIAPITest);
TESTCASE_AUTO_CLASS(RBBITest);
TESTCASE_AUTO_CLASS(RBBIMonkeyTest);
TESTCASE_AUTO_END;
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -0,0 +1,976 @@
/********************************************************************
* Copyright (c) 2016, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
#include "unicode/utypes.h"
#include "rbbimonkeytest.h"
#include "unicode/utypes.h"
#include "unicode/brkiter.h"
#include "unicode/utf16.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstr.h"
#include "uelement.h"
#include "uhash.h"
#include "iostream"
#include "string"
using namespace icu;
// Test dispatch for RBBIMonkeyTest. The single test, testMonkey, is registered
// through the TESTCASE_AUTO macros. The raw parameter string is stashed in
// fParams first so that the test function can read it.
void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function.
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(testMonkey);
TESTCASE_AUTO_END;
}
//---------------------------------------------------------------------------------------
//
// class BreakRule implementation.
//
//---------------------------------------------------------------------------------------
// Construct an empty rule. There is nothing to do here:
// all fields are default initialized.
BreakRule::BreakRule() {
}

// Nothing to release explicitly.
BreakRule::~BreakRule() {
}
//---------------------------------------------------------------------------------------
//
// class BreakRules implementation.
//
//---------------------------------------------------------------------------------------
// Construct an empty set of reference break rules belonging to monkeyImpl.
//
// Creates the name -> CharClass hash table (the table deletes both its keys and
// its values), the vector of break rules (which deletes its elements), the list
// of character classes, and the regular expressions used to parse a rule file.
// The break type starts out as UBRK_COUNT and is expected to be replaced when a
// "type = ..." line is seen (see setKeywordParameter).
//
// If the hash table cannot be opened, the constructor returns early with a
// failing status and the remaining members are left unset.
BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
uhash_compareUnicodeString,
NULL, // value comparator.
&status));
if (U_FAILURE(status)) {
return;
}
uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
fBreakRules.setDeleter(uprv_deleteUObject);
fCharClassList.adoptInstead(new UVector(status));
// Match references to character classes (identifiers) within set and rule definitions.
// NOTE(review): the comment on the first pattern fragment describes a negative look-behind,
// but "(?!" is the syntax for a negative look-ahead ("(?<!" would be look-behind).
// Confirm which one is intended before relying on property names being skipped.
fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
"(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Described as: negative look-behind for '{' or '=' or '[:'
// (the identifier is a unicode property name or value)
"(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
0, status));
// Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
"(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
"[ \\t]*+" // Match white space.
"(#.*)?+" // Optional # plus whatever follows
"\\R$" // new-line at end of line.
), 0, status));
// Match (initial parse) of a character class definition line.
fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
"[ \\t]*" // leading white space
"(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
"[ \\t]*=[ \\t]*" // =
"(?<ClassDef>.*?)" // The char class UnicodeSet expression
"[ \\t]*;$"), // ; <end of line>
0, status));
// Match (initial parse) of a break rule line.
fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
"[ \\t]*" // leading white space
"(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
"[ \\t]*:[ \\t]*" // :
"(?<RuleDef>.*?)" // The rule definition
"[ \\t]*;$"), // ; <end of line>
0, status));
}
// Nothing to release explicitly here.
BreakRules::~BreakRules() {
}
// Define (or redefine) a named character class.
//
// References to previously defined classes inside `definition` are replaced by
// those classes' expanded definitions; identifiers that do not name a known
// class are copied through unchanged. The expanded text is then parsed as a
// UnicodeSet pattern and the resulting CharClass is stored in fCharClasses
// under `name`.
//
// @param name        the class name being defined.
// @param definition  the UnicodeSet expression as written in the rule file.
// @param status      ICU error code, set if the expanded pattern is not a valid set.
// @return the new CharClass (owned by the hash table), or NULL if the set
//         could not be created.
CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
    // Create the expanded definition for this char class,
    // replacing any set references with the corresponding definition.
    UnicodeString expandedDef;
    UnicodeString emptyString;
    fSetRefsMatcher->reset(definition);
    while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
        // Named refName rather than name, so the function parameter is not shadowed.
        const UnicodeString refName =
            fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
        CharClass *refClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &refName));
        const UnicodeString &expansionForName = refClass ? refClass->fExpandedDef : refName;

        // Copy the text preceding the match, then the expansion in place of the identifier.
        fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
        expandedDef.append(expansionForName);
    }
    fSetRefsMatcher->appendTail(expandedDef);

    // Verify that the expanded set definition is valid.
    if (fMonkeyImpl->fDumpExpansions) {
        printf("expandedDef: %s\n", CStr(expandedDef)());
    }

    UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
    if (U_FAILURE(status)) {
        IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
                               u_errorName(status), CStr(name)());
        // The set has not been handed to a CharClass yet, so it must be freed here.
        delete s;
        return NULL;
    }
    CharClass *cclass = new CharClass(name, definition, expandedDef, s);
    CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
                                                        new UnicodeString(name),   // Key, owned by hash table.
                                                        cclass,                    // Value, owned by hash table.
                                                        &status));

    if (previousClass != NULL) {
        // Duplicate class def.
        // These are legitimate, they are adjustments of an existing class.
        // TODO: will need to keep the old around when we handle tailorings.
        IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
        delete previousClass;
    }
    return cclass;
}
// Add one break rule to the rule set.
//
// The rule text is converted into a regular expression:
//   1. every reference to a known character class is replaced by that class's
//      expanded set definition (an unknown name is reported as an error and
//      left in place),
//   2. the first divide sign (\u00f7) is replaced by an empty named capture
//      group, (?<BreakPosition>); a second divide sign is an error,
//   3. empty sets "[]" are replaced by an equivalent set expression that the
//      regular expression compiler accepts.
// The compiled rule is appended to fBreakRules, which takes ownership of it.
//
// @param name        the rule name from the rule file.
// @param definition  the rule body from the rule file.
// @param status      ICU error code.
void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
LocalPointer<BreakRule> thisRule(new BreakRule);
thisRule->fName = name;
thisRule->fRule = definition;
// NOTE(review): the original comment here described padding the first numeric field of
// the rule name with leading zeroes, to give a sort order matching UAX rule numbering.
// No code in this function does that; the name is stored as given.
UnicodeString emptyString;
// Expand the char class definitions within the rule.
fSetRefsMatcher->reset(definition);
while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
// Note: this local deliberately named the same as the parameter in the original code;
// within the loop, "name" is the referenced class name, not the rule name.
const UnicodeString name =
fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
if (!nameClass) {
IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
__FILE__, __LINE__, CStr(name)(), CStr(definition)());
}
// Fall back to the bare name when the class is unknown, so that expansion can continue.
const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
thisRule->fExpandedRule.append(expansionForName);
}
fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
// Replace the divide sign (\u00f7) with a regular expression named capture.
// When running the rules, a match that includes this group means we found a break position.
int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
if (dividePos >= 0) {
thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
}
// Only one break position per rule is supported; any remaining divide sign is an error.
if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message.
}
// UAX break rule set definitions can be empty, just [].
// Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
// also matches nothing.
static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
int32_t where = 0;
// The replacement text contains no "[]", so each pass removes one occurrence and the loop terminates.
while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
}
if (fMonkeyImpl->fDumpExpansions) {
printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
}
// Compile a regular expression for this rule.
thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
if (U_FAILURE(status)) {
IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
__FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
return;
}
// Put this new rule into the vector of all Rules.
fBreakRules.addElement(thisRule.orphan(), status);
}
bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
if (keyword == UnicodeString("locale")) {
CharString localeName;
localeName.append(CStr(value)(), -1, status);
fLocale = Locale::createFromName(localeName.data());
return true;
}
if (keyword == UnicodeString("type")) {
if (value == UnicodeString("grapheme")) {
fType = UBRK_CHARACTER;
} else if (value == UnicodeString("word")) {
fType = UBRK_WORD;
} else if (value == UnicodeString("line")) {
fType = UBRK_LINE;
} else if (value == UnicodeString("sentence")) {
fType = UBRK_SENTENCE;
} else {
IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)());
}
return true;
}
// TODO: add tailoring base setting here.
return false;
}
// Create the ICU break iterator to be tested, of the type and for the locale
// that were specified in the reference rule file.
//
// The factory result is kept as a plain BreakIterator until it is known to be a
// RuleBasedBreakIterator. If it is not, the iterator is deleted rather than
// leaked, and the failure is reported through `status` instead of silently
// returning NULL with a successful status.
//
// @param status  ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR if no valid
//                break type was configured, or to U_INVALID_STATE_ERROR if the
//                created iterator is not rule based.
// @return a new iterator owned by the caller, or NULL on failure.
RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    BreakIterator *bi = NULL;
    switch(fType) {
        case UBRK_CHARACTER:
            bi = BreakIterator::createCharacterInstance(fLocale, status);
            break;
        case UBRK_WORD:
            bi = BreakIterator::createWordInstance(fLocale, status);
            break;
        case UBRK_LINE:
            bi = BreakIterator::createLineInstance(fLocale, status);
            break;
        case UBRK_SENTENCE:
            bi = BreakIterator::createSentenceInstance(fLocale, status);
            break;
        default:
            IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return NULL;
    }
    if (U_FAILURE(status)) {
        delete bi;
        return NULL;
    }
    RuleBasedBreakIterator *rbbi = dynamic_cast<RuleBasedBreakIterator *>(bi);
    if (rbbi == NULL) {
        IntlTest::gTest->errln("%s:%d Break iterator of type %d is not a RuleBasedBreakIterator",
                               __FILE__, __LINE__, fType);
        delete bi;
        status = U_INVALID_STATE_ERROR;
    }
    return rbbi;
}
// Read and compile a complete reference rule file.
//
// Each input line, after comments are stripped, is expected to be one of:
//   - a keyword setting        ("type = ...;" or "locale = ...;"),
//   - a character class        ("Name = [set expression];"),
//   - a break rule             ("RuleName : rule body;").
// Anything else is reported as an error. After all lines are processed, the list
// of character classes used for generating random test data is built, leaving out
// the class named "dictionary", and a class "__Others" is added for any code
// points not covered by the classes from the file.
//
// @param rules   the opened rule file.
// @param status  ICU error code; processing stops at the first failure.
void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
UnicodeString emptyString;
for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line.
if (U_FAILURE(status)) {
return;
}
int32_t lineLength = 0;
const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
// A NULL line ends the loop.
if (lineBuf == NULL) {
break;
}
UnicodeString line(lineBuf, lineLength);
// Strip comment lines.
fCommentsMatcher->reset(line);
line = fCommentsMatcher->replaceFirst(emptyString, status);
if (line.isEmpty()) {
continue;
}
// Recognize character class definition and keyword lines
fClassDefMatcher->reset(line);
if (fClassDefMatcher->matches(status)) {
UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
if (fMonkeyImpl->fDumpExpansions) {
printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
}
if (setKeywordParameter(className, classDef, status)) {
// The scanned item was "type = ..." or "locale = ...", etc.
// which are not actual character classes.
continue;
}
addCharClass(className, classDef, status);
continue;
}
// Recognize rule lines.
fRuleDefMatcher->reset(line);
if (fRuleDefMatcher->matches(status)) {
UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
if (fMonkeyImpl->fDumpExpansions) {
printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
}
addRule(ruleName, ruleDef, status);
continue;
}
// The line is neither a class definition, a keyword setting, nor a rule.
IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
__FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
}
// Build the vector of char classes, omitting the dictionary class if there is one.
// This will be used when constructing the random text to be tested.
// Also compute the "other" set, consisting of any characters not included in
// one or more of the user defined sets.
UnicodeSet otherSet((UChar32)0, 0x10ffff);
int32_t pos = UHASH_FIRST;
const UHashElement *el = NULL;
while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
// printf(" Adding %s\n", CStr(*ccName)());
// Sanity check: the hash key must agree with the name recorded in the class itself.
if (*ccName != cclass->fName) {
IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
__FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
}
const UnicodeSet *set = cclass->fSet.getAlias();
otherSet.removeAll(*set);
if (*ccName == UnicodeString("dictionary")) {
fDictionarySet = *set;
} else {
fCharClassList->addElement(cclass, status);
}
}
if (!otherSet.isEmpty()) {
// fprintf(stderr, "have an other set.\n");
UnicodeString pattern;
// NOTE(review): addCharClass returns NULL on failure; the result is added to the list unchecked.
CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
fCharClassList->addElement(cclass, status);
}
}
// Find a character class containing the code point c.
//
// The search walks fCharClassList starting at the position held in *iter
// (or at the start of the list when iter is NULL). On return *iter has been
// advanced past the class that was returned, so repeated calls with the same
// iter enumerate every class containing c.
//
// @return the next matching class, or NULL when the list is exhausted.
const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
    int32_t scratchPos = 0;
    int32_t *pos = (iter != NULL) ? iter : &scratchPos;
    for (;;) {
        if (*pos >= fCharClassList->size()) {
            return NULL;
        }
        const CharClass *candidate = static_cast<const CharClass *>(fCharClassList->elementAt(*pos));
        *pos += 1;
        if (candidate->fSet->contains(c)) {
            return candidate;
        }
    }
}
//---------------------------------------------------------------------------------------
//
// class MonkeyTestData implementation.
//
//---------------------------------------------------------------------------------------
// Generate a fresh random test string and compute its expected break positions.
//
// The string is built from 1000 characters, each chosen by picking a random
// character class and then a random character from that class. The reference
// rules (regular expressions) are then applied repeatedly along the string to
// fill in fExpectedBreaks, and to record in fRuleForPosition / f2ndRuleForPos
// which rule(s) covered each position, for use in diagnostic dumps.
//
// @param rules   the compiled reference rules; also remembered in fBkRules.
// @param rand    random number source; its state on entry is saved in fRandomSeed.
// @param status  ICU error code; set to U_INVALID_FORMAT_ERROR if the reference
//                rules fail to produce a usable match at some position.
void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
const int32_t dataLength = 1000;
// Fill the test string with random characters.
// First randomly pick a char class, then randomly pick a character from that class.
// Exclude any characters from the dictionary set.
// std::cout << "Populating Test Data" << std::endl;
fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages,
// allowing recreation of failing data.
fBkRules = rules;
fString.remove();
// n counts characters actually appended; rejected picks do not advance it.
for (int32_t n=0; n<dataLength;) {
// NOTE(review): assumes fCharClassList is non-empty (modulo by its size) and that at
// least one class has a usable character; otherwise this loop cannot make progress.
int charClassIndex = rand() % rules->fCharClassList->size();
const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
if (cclass->fSet->size() == 0) {
// Some rules or tailorings do end up with empty char classes.
continue;
}
int32_t charIndex = rand() % cclass->fSet->size();
UChar32 c = cclass->fSet->charAt(charIndex);
if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
// Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
// Don't let random unpaired surrogates combine in the test data because they might
// produce an unwanted dictionary character.
continue;
}
if (!rules->fDictionarySet.contains(c)) {
fString.append(c);
++n;
}
}
// Reset each rule matcher regex with this new string.
// (Although we are always using the same string object, ICU regular expressions
// don't like the underlying string data changing without doing a reset).
for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
rule->fRuleMatcher->reset(fString);
}
// Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
// Expected and Actual breaks are one longer than the input string; a non-zero value
// will indicate a boundary preceding that position.
clearActualBreaks();
fExpectedBreaks = fActualBreaks;
fRuleForPosition = fActualBreaks;
f2ndRuleForPos = fActualBreaks;
// Apply reference rules to find the expected breaks.
fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text.
// ICU always reports a break there.
// The reference rules do not have a means to do so.
int32_t strIdx = 0;
while (strIdx < fString.length()) {
BreakRule *matchingRule = NULL;
UBool hasBreak = FALSE;
int32_t ruleNum = 0;
int32_t matchStart = 0;
int32_t matchEnd = 0;
int32_t breakGroup = 0;
// Rules are tried in order; the first acceptable match at strIdx wins.
for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
rule->fRuleMatcher->reset();
if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
// A candidate rule match, check further to see if we take it or continue to check other rules.
// Matches of zero or one codepoint count only if they also specify a break.
matchStart = rule->fRuleMatcher->start(status);
matchEnd = rule->fRuleMatcher->end(status);
// The rule specifies a break only if its pattern contains the BreakPosition group;
// a failed lookup of the group name is how "no break" is detected.
breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
hasBreak = U_SUCCESS(status);
if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
status = U_ZERO_ERROR;
}
if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
matchingRule = rule;
break;
}
}
}
if (matchingRule == NULL) {
// No reference rule matched. This is an error in the rules that should never happen.
IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
__FILE__, __LINE__, strIdx);
dump(strIdx);
status = U_INVALID_FORMAT_ERROR;
return;
}
if (matchingRule->fRuleMatcher->group(status).length() == 0) {
// Zero length rule match. This is also an error in the rule expressions.
IntlTest::gTest->errln("%s:%d Zero length rule match.",
__FILE__, __LINE__);
status = U_INVALID_FORMAT_ERROR;
return;
}
// Record which rule matched over the length of the match.
// A zero entry is treated as "no rule recorded yet"; a later rule covering the
// same position goes into the second-rule array instead.
for (int i = matchStart; i < matchEnd; i++) {
if (fRuleForPosition.charAt(i) == 0) {
fRuleForPosition.setCharAt(i, (UChar)ruleNum);
} else {
f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
}
}
// Break positions appear in rules as a matching named capture of zero length at the break position,
// the adjusted pattern contains (?<BreakPosition>)
if (hasBreak) {
int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
if (U_FAILURE(status) || breakPos < 0) {
// Rule specified a break, but that break wasn't part of the match, even
// though the rule as a whole matched.
// Can't happen with regular expressions derived from (equivalent to) ICU break rules.
// Shouldn't get here.
IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
status = U_INVALID_FORMAT_ERROR;
break;
}
fExpectedBreaks.setCharAt(breakPos, (UChar)1);
// printf("recording break at %d\n", breakPos);
// For the next iteration, pick up applying rules immediately after the break,
// which may differ from end of the match. The matching rule may have included
// context following the boundary that needs to be looked at again.
strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
} else {
// Original rule didn't specify a break.
// Continue applying rules starting on the last code point of this match.
strIdx = fString.moveIndex32(matchEnd, -1);
if (strIdx == matchStart) {
// Match was only one code point, no progress if we continue.
// Shouldn't get here, case is filtered out at top of loop.
CharString ruleName;
ruleName.appendInvariantChars(matchingRule->fName, status);
IntlTest::gTest->errln("%s:%d Rule %s internal error",
__FILE__, __LINE__, ruleName.data());
status = U_INVALID_FORMAT_ERROR;
break;
}
}
if (U_FAILURE(status)) {
IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
__FILE__, __LINE__, u_errorName(status));
break;
}
}
}
// Reset fActualBreaks to all zeroes (no breaks recorded).
// The array has one slot more than the data string has code units, so that
// breaks before the first and after the last character can both be stored.
void MonkeyTestData::clearActualBreaks() {
    fActualBreaks.remove();
    int32_t slotsRemaining = fString.length() + 1;
    while (slotsRemaining > 0) {
        fActualBreaks.append((UChar)0);
        --slotsRemaining;
    }
}
// Print a table describing the test data, one line per code point: position,
// code point, character class, expected and actual break markers, the name(s)
// of the reference rule(s) recorded for the position, and the character name.
//
// @param around  index in the test string to center the dump on; 30 code points
//                either side are shown. -1 dumps the entire string.
void MonkeyTestData::dump(int32_t around) const {
printf("\n"
" char break Rule Character\n"
" pos code class R I name name\n"
"---------------------------------------------------------------------------------------------\n");
int32_t start;
int32_t end;
if (around == -1) {
start = 0;
end = fString.length();
} else {
// Display context around a failure.
start = fString.moveIndex32(around, -30);
end = fString.moveIndex32(around, +30);
}
// Step by code point, not by code unit.
for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
UErrorCode status = U_ZERO_ERROR;
UChar32 c = fString.char32At(charIdx);
// NOTE(review): getClassForChar can return NULL; the result is dereferenced unchecked,
// so this assumes every character in the test string belongs to some listed class.
const CharClass *cc = fBkRules->getClassForChar(c);
CharString ccName;
ccName.appendInvariantChars(cc->fName, status);
CharString ruleName, secondRuleName;
// A position with no recorded rule holds 0, which displays the name of rule 0.
const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
ruleName.appendInvariantChars(rule->fName, status);
if (f2ndRuleForPos.charAt(charIdx) > 0) {
const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
secondRuleName.appendInvariantChars(secondRule->fName, status);
}
char cName[200];
u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
// '*' marks a break, '.' marks no break: expected first, then actual.
printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
charIdx, c, ccName.data(),
fExpectedBreaks.charAt(charIdx) ? '*' : '.',
fActualBreaks.charAt(charIdx) ? '*' : '.',
ruleName.data(), secondRuleName.data(), cName
);
}
}
//---------------------------------------------------------------------------------------
//
// class RBBIMonkeyImpl
//
//---------------------------------------------------------------------------------------
// Construct a monkey test implementation. Expansion dumping is off by default and
// the worker thread object is bound to this instance; nothing is run yet.
// NOTE(review): other settings read later (loop count, verbose flag, rule file name)
// are not initialized here - presumably the caller sets them before starting; confirm.
RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) {
(void)status; // suppress unused parameter compiler warning.
}
// RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
// reference rules and creating the icu breakiterator to test,
// with its type and locale coming from the reference rules.
void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
fRuleFileName = ruleFile;
openBreakRules(ruleFile, status);
if (U_FAILURE(status)) {
IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
return;
}
fRuleSet.adoptInstead(new BreakRules(this, status));
fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
if (U_FAILURE(status)) {
IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
return;
}
fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
fTestData.adoptInstead(new MonkeyTestData());
}
// Nothing is released explicitly here.
RBBIMonkeyImpl::~RBBIMonkeyImpl() {
}
void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
CharString path;
path.append(IntlTest::getSourceTestData(status), status);
path.append("break_rules" U_FILE_SEP_STRING, status);
path.appendPathPart(fileName, status);
const char *codePage = "UTF-8";
fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
}
// Begin running the monkey test for this rule set on its own thread.
void RBBIMonkeyImpl::startTest() {
fThread.start(); // invokes runTest() in a separate thread.
}
// Wait on this rule set's test thread by joining it.
void RBBIMonkeyImpl::join() {
fThread.join();
}
// Report a monkey test failure from within an RBBIMonkeyImpl member function.
// Logs msg and index together with the parameters (rule file and random seed)
// needed to reproduce the failure, dumps the test data around index when
// verbose output is enabled, and sets the local variable `status` to
// U_INVALID_STATE_ERROR. Requires `status`, fRuleFileName, fTestData and
// fVerbose to be in scope at the point of use.
#define MONKEY_ERROR(msg, index) { \
IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
__FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
if (fVerbose) { fTestData->dump(index); } \
status = U_INVALID_STATE_ERROR; \
}
// Main loop of the monkey test for one rule set.
//
// Each pass generates new random test data and checks the break iterator
// against it with every access pattern (next, previous, following, preceding,
// isBoundary). A negative fLoopCount means run forever, printing a progress
// dot every 100 passes. The run is abandoned once more than 10 passes have
// failed.
void RBBIMonkeyImpl::runTest() {
    UErrorCode status = U_ZERO_ERROR;
    int32_t errorCount = 0;
    int64_t loopCount = 0;
    while (fLoopCount < 0 || loopCount < fLoopCount) {
        // Each pass starts clean so that one failure does not suppress later passes.
        status = U_ZERO_ERROR;
        fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
        // fTestData->dump();

        testForwards(status);
        testPrevious(status);
        testFollowing(status);
        testPreceding(status);
        testIsBoundary(status);

        const bool runningForever = (fLoopCount < 0);
        if (runningForever && loopCount % 100 == 0) {
            fprintf(stderr, ".");
        }

        if (U_FAILURE(status) && ++errorCount > 10) {
            return;
        }
        ++loopCount;
    }
}
// Iterate forwards over the test data with first()/next(), recording every
// boundary returned, then compare against the expected breaks.
//
// Each boundary must be strictly greater than the one before it and must lie
// within [0, length]; otherwise the iterator is reported as stalled or out of
// bounds and the test ends. previousBreak is updated on every pass so that the
// stall check really does compare against the preceding boundary; without
// that, an iterator that stopped advancing would loop here forever.
void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);
    int32_t previousBreak = -2;   // Less than any value first() can return.
    for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
        if (bk <= previousBreak) {
            MONKEY_ERROR("Break Iterator Stall", bk);
            return;
        }
        if (bk < 0 || bk > fTestData->fString.length()) {
            MONKEY_ERROR("Boundary out of bounds", bk);
            return;
        }
        fTestData->fActualBreaks.setCharAt(bk, 1);
        previousBreak = bk;
    }
    checkResults("testForwards", FORWARD, status);
}
// Check BreakIterator::following() for every index from -1 up to, but not
// including, the length of the test string. The boundaries it reports are
// recorded and then compared against the expected breaks.
//
// nextBreak tracks the most recently discovered boundary. For each index i,
// following(i) must either repeat that boundary (i lies before it) or, when i
// has reached it, yield a new boundary further on. Anything else is an error.
void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
fTestData->clearActualBreaks();
fBI->setText(fTestData->fString);
int32_t nextBreak = -1;
for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
int32_t bk = fBI->following(i);
// NOTE(review): the loop condition is i < length, so i == length never holds
// here and this branch cannot be taken as written.
if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
continue;
}
if (bk == nextBreak && bk > i) {
// i is in the gap between two breaks.
continue;
}
if (i == nextBreak && bk > nextBreak) {
// i sits on the previous boundary; bk is the next boundary after it.
fTestData->fActualBreaks.setCharAt(bk, 1);
nextBreak = bk;
continue;
}
MONKEY_ERROR("following(i)", i);
return;
}
checkResults("testFollowing", FORWARD, status);
}
// Iterate backwards over the test data with last()/previous(), recording every
// boundary returned, then compare against the expected breaks.
//
// Each boundary must be strictly less than the one before it and must lie
// within [0, length]; otherwise the iterator is reported as stalled or out of
// bounds and the test ends. previousBreak is updated on every pass so that the
// stall check really does compare against the preceding boundary; without
// that, an iterator that stopped moving would loop here forever.
void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);
    int32_t previousBreak = INT32_MAX;   // Greater than any value last() can return.
    for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
        if (bk >= previousBreak) {
            MONKEY_ERROR("Break Iterator Stall", bk);
            return;
        }
        if (bk < 0 || bk > fTestData->fString.length()) {
            MONKEY_ERROR("Boundary out of bounds", bk);
            return;
        }
        fTestData->fActualBreaks.setCharAt(bk, 1);
        previousBreak = bk;
    }
    // Label matches the function name so that failure reports identify this test.
    checkResults("testPrevious", REVERSE, status);
}
// Check BreakIterator::preceding() for every index from length+1 down to 0.
// The boundaries it reports are recorded and then compared against the
// expected breaks, scanning in reverse.
//
// nextBreak tracks the most recently discovered boundary (moving towards the
// start). For each index i, preceding(i) must either repeat that boundary
// (i lies after it) or, when i has reached it, yield a new boundary earlier in
// the text. Indexes that land on the trailing half of a surrogate pair are
// checked separately.
void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
fTestData->clearActualBreaks();
fBI->setText(fTestData->fString);
int32_t nextBreak = fTestData->fString.length()+1;
for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
int32_t bk = fBI->preceding(i);
// printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
if (bk == BreakIterator::DONE && i == 0) {
// Nothing precedes the start of the text.
continue;
}
if (bk == nextBreak && bk < i) {
// i is in the gap between two breaks.
continue;
}
if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
// i indexes to a trailing surrogate.
// Break Iterators treat an index to either half as referring to the supplemental code point,
// with preceding going to some preceding code point.
if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
// Note: the error is recorded in status but the loop keeps going;
// checkResults() below does nothing once status is a failure.
MONKEY_ERROR("preceding of trailing surrogate error", i);
}
continue;
}
if (i == nextBreak && bk < nextBreak) {
// i sits on the previous boundary; bk is the next boundary before it.
fTestData->fActualBreaks.setCharAt(bk, 1);
nextBreak = bk;
continue;
}
MONKEY_ERROR("preceding(i)", i);
return;
}
checkResults("testPreceding", REVERSE, status);
}
// Check BreakIterator::isBoundary() at every index of the test string, from
// the end down to 0, recording each index reported as a boundary, then compare
// against the expected breaks.
void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);
    for (int i=fTestData->fString.length(); i>=0; --i) {
        if (fBI->isBoundary(i)) {
            fTestData->fActualBreaks.setCharAt(i, 1);
        }
    }
    // Report failures under this test's own name, not as "testForwards".
    checkResults("testIsBoundary", FORWARD, status);
}
void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
if (direction == FORWARD) {
for (int i=0; i<=fTestData->fString.length(); ++i) {
if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
__FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
if (fVerbose) {
fTestData->dump(i);
}
status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely
break; // produce many redundant errors.
}
}
} else {
for (int i=fTestData->fString.length(); i>=0; i--) {
if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
__FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
if (fVerbose) {
fTestData->dump(i);
}
status = U_INVALID_STATE_ERROR;
break;
}
}
}
}
//---------------------------------------------------------------------------------------
//
// class RBBIMonkeyTest implementation.
//
//---------------------------------------------------------------------------------------
// Constructor. Does nothing; in particular fParams is not initialized here.
// NOTE(review): presumably fParams is set by runIndexedTest() before testMonkey() runs - confirm.
RBBIMonkeyTest::RBBIMonkeyTest() {
}
// Destructor. Nothing to release.
RBBIMonkeyTest::~RBBIMonkeyTest() {
}
// params, taken from this->fParams.
// rules=file_name Name of file containing the reference rules.
// seed=nnnnn Random number starting seed.
// Setting the seed allows errors to be reproduced.
// loop=nnn Looping count. Controls running time.
// -1: run forever.
// 0 or greater: run length.
// expansions debug option, show expansions of rules and sets.
// verbose Display details of the failure.
//
// Parameters on the intltest command line follow the test name, and are preceded by '@'.
// For example,
// intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
//
void RBBIMonkeyTest::testMonkey() {
// printf("Test parameters: %s\n", fParams);
UnicodeString params(fParams);
UErrorCode status = U_ZERO_ERROR;
const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
"line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
NULL };
CharString testNameFromParams;
if (getStringParam("rules", params, testNameFromParams, status)) {
tests[0] = testNameFromParams.data();
tests[1] = NULL;
}
int64_t loopCount = quick? 100 : 5000;
getIntParam("loop", params, loopCount, status);
UBool dumpExpansions = FALSE;
getBoolParam("expansions", params, dumpExpansions, status);
UBool verbose = FALSE;
getBoolParam("verbose", params, verbose, status);
int64_t seed = 0;
getIntParam("seed", params, seed, status);
if (params.length() != 0) {
// Options processing did not consume all of the parameters. Something unrecognized was present.
CharString unrecognizedParameters;
unrecognizedParameters.append(CStr(params)(), -1, status);
errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
return;
}
UVector startedTests(status);
if (U_FAILURE(status)) {
errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
return;
}
// Monkey testing is multi-threaded.
// Each set of break rules to be tested is run in a separate thread.
// Each thread/set of rules gets a separate RBBIMonkeyImpl object.
int32_t i;
for (i=0; tests[i] != NULL; ++i) {
logln("beginning testing of %s", tests[i]);
RBBIMonkeyImpl *test = new RBBIMonkeyImpl(status);
test->fDumpExpansions = dumpExpansions;
test->fVerbose = verbose;
test->fRandomGenerator.seed((uint32_t)seed);
test->fLoopCount = loopCount;
test->setup(tests[i], status);
test->startTest();
startedTests.addElement(test, status);
if (U_FAILURE(status)) {
break;
}
}
if (U_FAILURE(status)) {
errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
}
for (i=0; i<startedTests.size(); ++i) {
RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
test->join();
delete test;
}
}
// getIntParam: look for an option of the form  name=<integer>  in the parameter string.
//
//   name:    the option name. Used as the leading part of a regular expression.
//   params:  the parameter string. If the option is found, its text, including an
//            optional following comma and surrounding spaces, is removed.
//   val:     receives the value of the option. Untouched if the option is absent.
//   status:  in/out ICU error code, passed to the regular expression functions.
//
//   returns: TRUE if the option was present, FALSE otherwise.
UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
    name.append(" *= *(-?\\d+) *,? *");
    RegexMatcher matcher(name, params, 0, status);
    if (!matcher.find()) {
        return FALSE;
    }

    // The param exists.  Convert the string to an int.
    CharString digits;
    digits.append(CStr(matcher.group(1, status))(), -1, status);
    val = strtol(digits.data(), NULL, 10);

    // Delete this parameter from the params string.
    matcher.reset();
    params = matcher.replaceFirst(UnicodeString(), status);
    return TRUE;
}
// getStringParam: look for an option of the form  name=<value>  in the parameter string,
//                 where the value extends up to the next space or comma.
//
//   name:    the option name. Used as the leading part of a regular expression.
//   params:  the parameter string. If the option is found, its text, including an
//            optional following comma and surrounding spaces, is removed.
//   dest:    the value of the option is appended to this string.
//            Untouched if the option is absent.
//   status:  in/out ICU error code, passed to the regular expression and string functions.
//
//   returns: TRUE if the option was present, FALSE otherwise.
UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
    name.append(" *= *([^ ,]*) *,? *");
    RegexMatcher matcher(name, params, 0, status);
    if (!matcher.find()) {
        return FALSE;
    }

    // The param exists.
    dest.append(CStr(matcher.group(1, status))(), -1, status);

    // Delete this parameter from the params string.
    matcher.reset();
    params = matcher.replaceFirst(UnicodeString(), status);
    return TRUE;
}
// getBoolParam: look for a boolean option in the parameter string, written either as
//               the bare name or as  name=true  /  name=false  (matched without regard to case).
//
//   name:    the option name. Used as the leading part of a regular expression.
//   params:  the parameter string. If the option is found, its text, including an
//            optional following comma and surrounding spaces, is removed.
//   dest:    receives the value of the option; a bare name means TRUE.
//            Untouched if the option is absent.
//   status:  in/out ICU error code, passed to the regular expression functions.
//
//   returns: TRUE if the option was present, FALSE otherwise.
UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
    name.append("(?: *= *(true|false))? *,? *");
    RegexMatcher matcher(name, params, UREGEX_CASE_INSENSITIVE, status);
    if (!matcher.find()) {
        return FALSE;
    }

    if (matcher.start(1, status) <= 0) {
        // No explicit user value, implies true.
        dest = TRUE;
    } else {
        // user option included a value.
        dest = matcher.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
    }

    // Delete this parameter from the params string.
    matcher.reset();
    params = matcher.replaceFirst(UnicodeString(), status);
    return TRUE;
}

View file

@ -0,0 +1,208 @@
/*************************************************************************
* Copyright (c) 2016, International Business Machines
* Corporation and others. All Rights Reserved.
*************************************************************************
*/
#ifndef RBBIMONKEYTEST_H
#define RBBIMONKEYTEST_H
#include "unicode/utypes.h"
#include "intltest.h"
#include "unicode/rbbi.h"
#include "unicode/regex.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/uobject.h"
#include "simplethread.h"
#include "ucbuf.h"
#include "uhash.h"
#include "uvector.h"
//
// TODO:
// Develop a tailoring format.
// Hook to old tests that use monkey impl to get expected data.
// Remove old tests.
class BreakRules; // Forward declaration
class RBBIMonkeyImpl;
/**
* Test the RuleBasedBreakIterator class giving different rules.
*
* This is the IntlTest front end of the monkey test. testMonkey() parses the user
* supplied parameters and runs one RBBIMonkeyImpl for each set of reference rules.
*/
class RBBIMonkeyTest: public IntlTest {
public:
RBBIMonkeyTest();
virtual ~RBBIMonkeyTest();
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
void testMonkey();
private:
const char *fParams; // Copy of user parameters passed in from IntlTest.
void testRules(const char *ruleFile);
// Option parsing helpers.
// Each one searches params for the option called name. If it is found, its value is
// delivered through the third argument, the text of the option is removed from params,
// and TRUE is returned. If it is not found, FALSE is returned.
static UBool getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status);
static UBool getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status);
static UBool getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status);
};
// The following classes are internal to the RBBI Monkey Test implementation.
// class CharClass Represents a single character class from the source break rules.
// Inherits from UObject because instances are adopted by UHashtable, which ultimately
// deletes them using hash's object deleter function.
class CharClass: public UObject {
public:
UnicodeString fName;
UnicodeString fOriginalDef; // set definition as it appeared in user supplied rules.
UnicodeString fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively.
LocalPointer<const UnicodeSet> fSet;
// Constructor. The strings are copied. The set pointer is stored in the LocalPointer fSet,
// so the new CharClass takes over ownership of it.
CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
};
// class BreakRule represents a single rule from a set of break rules.
// Each rule has the set definitions expanded, and
// is compiled to a regular expression.
// All members are public; the class is a plain record.
class BreakRule: public UObject {
public:
BreakRule();
~BreakRule();
UnicodeString fName; // Name of the rule.
UnicodeString fRule; // Rule expression, excluding the name, as written in user source.
UnicodeString fExpandedRule; // Rule expression after expanding the set definitions.
LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule.
};
// class BreakRules represents a complete set of break rules, possibly tailored,
// compiled from testdata break rules.
class BreakRules: public UObject {
public:
BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
~BreakRules();
// NOTE(review): presumably reads the rule source from the UCHARBUF and populates the
// members below; the definition is not in this header - confirm.
void compileRules(UCHARBUF *rules, UErrorCode &status);
// NOTE(review): by its name, returns a character class containing c; the meaning of
// the optional iter argument is not visible from the declaration - confirm.
const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance.
icu::UVector fBreakRules; // Contents are of type (BreakRule *).
LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString).
// Value is (CharClass *)
LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values,
// but in a vector so they can be accessed by index.
UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined.
Locale fLocale;
UBreakIteratorType fType;
// Helpers for building up the rule set: one named character class, one named rule,
// or one keyword=value setting at a time.
CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
// NOTE(review): presumably creates the ICU break iterator selected by fLocale and fType,
// with the caller taking ownership of the result - confirm against the definition.
RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
// Regular expression matchers.
// NOTE(review): judging by the names, these are used when parsing the rule source - confirm.
LocalPointer<RegexMatcher> fSetRefsMatcher;
LocalPointer<RegexMatcher> fCommentsMatcher;
LocalPointer<RegexMatcher> fClassDefMatcher;
LocalPointer<RegexMatcher> fRuleDefMatcher;
};
// class MonkeyTestData represents a randomly synthesized test data string together
// with the expected break positions obtained by applying
// the test break rules.
class MonkeyTestData: public UObject {
public:
MonkeyTestData() {};
~MonkeyTestData() {};
void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
void clearActualBreaks();
void dump(int32_t around = -1) const;
uint32_t fRandomSeed; // The initial seed value from the random number genererator.
const BreakRules *fBkRules; // The break rules used to generate this data.
UnicodeString fString; // The text.
UnicodeString fExpectedBreaks; // Breaks as found by the reference rules.
// Parallel to fString. Non-zero if break preceding.
UnicodeString fActualBreaks; // Breaks as found by ICU break iterator.
UnicodeString fRuleForPosition; // Index into BreakRules.fBreakRules of rule that applied at each position.
// Also parallel to fString.
UnicodeString f2ndRuleForPos; // As above. A 2nd rule applies when the preceding rule
// didn't cause a break, and a subsequent rule match starts
// on the last code point of the preceding match.
};
// class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey
// test for one set of break rules.
//
// When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
// between instances of RBBIMonkeyImpl and threads.
//
class RBBIMonkeyImpl: public UObject {
public:
RBBIMonkeyImpl(UErrorCode &status);
~RBBIMonkeyImpl();
// Typical sequence, as used by RBBIMonkeyTest::testMonkey():
// set the public option fields below, setup(), startTest(), and finally join().
void setup(const char *ruleFileName, UErrorCode &status);
void startTest();
// The body of the test. Invoked from RBBIMonkeyThread::run().
void runTest();
void join();
LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules.
LocalPointer<BreakRules> fRuleSet;
LocalPointer<RuleBasedBreakIterator> fBI;
LocalPointer<MonkeyTestData> fTestData;
IntlTest::icu_rand fRandomGenerator;
const char *fRuleFileName;
UBool fVerbose; // True to do long dump of failing data.
int32_t fLoopCount;
UBool fDumpExpansions; // Debug flag to output expanded form of rules and sets.
// Order in which checkResults() scans the text when comparing actual with expected breaks.
enum CheckDirection {
FORWARD = 1,
REVERSE = 2
};
void clearActualBreaks();
// Each of the following exercises one way of obtaining boundaries from the break
// iterator fBI over the text in fTestData, and checks the outcome with checkResults().
void testForwards(UErrorCode &status);
void testPrevious(UErrorCode &status);
void testFollowing(UErrorCode &status);
void testPreceding(UErrorCode &status);
void testIsBoundary(UErrorCode &status);
// Compare fTestData's actual and expected breaks; msg names the calling test in any
// failure message.
void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
// Thread whose run() function simply runs the test of the RBBIMonkeyImpl it was created with.
class RBBIMonkeyThread: public SimpleThread {
private:
RBBIMonkeyImpl *fMonkeyImpl;
public:
RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {};
void run() U_OVERRIDE { fMonkeyImpl->runTest(); };
};
private:
void openBreakRules(const char *fileName, UErrorCode &status);
RBBIMonkeyThread fThread;
};
#endif // RBBIMONKEYTEST_H

View file

@ -9,36 +9,36 @@
* 01/12/2000 Madhu Updated for changed API and added new tests
************************************************************************/
#include "utypeinfo.h" // for 'typeid' to work
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/utypes.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "unicode/brkiter.h"
#include "unicode/localpointer.h"
#include "unicode/numfmt.h"
#include "unicode/rbbi.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "unicode/ucnv.h"
#include "unicode/schriter.h"
#include "unicode/uniset.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/regex.h"
#endif
#include "unicode/schriter.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/uscript.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "charstr.h"
#include "cmemory.h"
#include "intltest.h"
#include "rbbitst.h"
#include <string.h>
#include "charstr.h"
#include "utypeinfo.h" // for 'typeid' to work
#include "uvector.h"
#include "uvectr32.h"
#include <stdio.h>
#include <stdlib.h>
#include "unicode/numfmt.h"
#include "unicode/uscript.h"
#include "cmemory.h"
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
#include "unicode/filteredbrk.h"
@ -56,7 +56,7 @@
//---------------------------------------------
// Note: Before adding new tests to this file, check whether the desired test data can
// Note: Before adding new tests to this file, check whether the desired test data can
// simply be added to the file testdata/rbbitest.txt. In most cases it can,
// it's much less work than writing a new test, diagnostic output in the event of failures
// is good, and the test data file is shared with ICU4J, so eventually the test
@ -79,7 +79,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
break;
case 2: name = "TestStatusReturn";
if(exec) TestStatusReturn(); break;
#if !UCONFIG_NO_FILE_IO
case 3: name = "TestUnicodeFiles";
if(exec) TestUnicodeFiles(); break;
@ -117,7 +117,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
#endif
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
case 16:
case 16:
name = "TestMonkey"; if(exec) TestMonkey(params); break;
#else
case 16:
@ -323,7 +323,7 @@ void RBBITest::TestStatusReturn() {
"$Numbers = [:N:];\n"
"$Letters+{1};\n"
"$Numbers+{2};\n"
"Help\\ {4}/me\\!;\n"
"Help\\ /me\\!{4};\n"
"[^$Letters $Numbers];\n"
"!.*;\n", -1, US_INV);
UnicodeString testString1 = "abc123..abc Help me Help me!";
@ -334,28 +334,27 @@ void RBBITest::TestStatusReturn() {
UErrorCode status=U_ZERO_ERROR;
UParseError parseError;
BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
if(U_FAILURE(status)) {
dataerrln("FAIL : in construction - %s", u_errorName(status));
} else {
int32_t pos;
int32_t i = 0;
bi->setText(testString1);
for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
if (pos != bounds1[i]) {
errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
break;
}
int tag = bi->getRuleStatus();
if (tag != brkStatus[i]) {
errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
break;
}
i++;
}
dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
return;
}
int32_t pos;
int32_t i = 0;
bi->setText(testString1);
for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
if (pos != bounds1[i]) {
errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
break;
}
int tag = bi->getRuleStatus();
if (tag != brkStatus[i]) {
errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
break;
}
i++;
}
delete bi;
}
@ -817,7 +816,7 @@ void RBBITest::TestBug5775() {
if (bi == NULL) {
return;
}
UnicodeString s("One.\\u00ad Two.", -1, US_INV);
// 01234 56789
s = s.unescape();
@ -869,7 +868,7 @@ struct TestParams {
utext_close(textToBreak);
delete textMap;
}
int32_t getSrcLine(int32_t bp);
int32_t getExpectedBreak(int32_t bp);
int32_t getSrcCol(int32_t bp);
@ -901,7 +900,7 @@ static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorC
0xfffd, NULL, &status);
dest.append(buffer, utf8Length, status);
}
void TestParams::setUTF16(UErrorCode &status) {
textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
@ -1578,7 +1577,7 @@ void RBBITest::TestDictRules() {
//-------------------------------------------------------------------------------
//
// ReadAndConvertFile Read a text data file, convert it to UChars, and
// return the datain one big UChar * buffer, which the caller must delete.
// return the data in one big UChar * buffer, which the caller must delete.
//
// parameters:
// fileName: the name of the file, with no directory part. The test data directory
@ -1780,7 +1779,7 @@ void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *
}
strcpy(testFileName, testDataDirectory);
strcat(testFileName, fileName);
logln("Opening data file %s\n", fileName);
int len;
@ -1858,7 +1857,7 @@ void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *
else if (tokenMatcher.start(4, status) >= 0) {
// Scanned to end of a line, possibly skipping over a comment in the process.
// If the line from the file contained test data, run the test now.
if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
}
@ -2030,6 +2029,10 @@ private:
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
UnicodeSet *fAnySet;
UnicodeSet *fEmojiModifierSet;
UnicodeSet *fEmojiBaseSet;
UnicodeSet *fZWJSet;
UnicodeSet *fGAZSet;
const UnicodeString *fText;
};
@ -2041,8 +2044,8 @@ RBBICharMonkey::RBBICharMonkey() {
fText = NULL;
fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
@ -2059,6 +2062,18 @@ RBBICharMonkey::RBBICharMonkey() {
fHangulSet->addAll(*fLVTSet);
fAnySet = new UnicodeSet(0, 0x10ffff);
fEmojiBaseSet = new UnicodeSet(UnicodeString(
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
"\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
"\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
"\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
fZWJSet = new UnicodeSet(0x200D, 0x200D);
fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
fSets = new UVector(status);
fSets->addElement(fCRLFSet, status);
fSets->addElement(fControlSet, status);
@ -2070,6 +2085,10 @@ RBBICharMonkey::RBBICharMonkey() {
fSets->addElement(fSpacingSet, status);
fSets->addElement(fHangulSet, status);
fSets->addElement(fAnySet, status);
fSets->addElement(fEmojiBaseSet, status);
fSets->addElement(fEmojiModifierSet, status);
fSets->addElement(fZWJSet, status);
fSets->addElement(fGAZSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
}
@ -2090,7 +2109,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
int breakPos = -1;
UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
if (U_FAILURE(deferredStatus)) {
return -1;
}
@ -2171,12 +2190,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
}
// Rule (GB8a) Regional_Indicator x Regional_Indicator
// Note: The first if condition is a little tricky. We only need to force
// a break if there are three or more contiguous RIs. If there are
// only two, a break following will occur via other rules, and will include
// any trailing extend characters, which is needed behavior.
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
&& fRegionalIndicatorSet->contains(c2)) {
break;
}
if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
continue;
}
// Rule (GB9) Numeric x ALetter
if (fExtendSet->contains(c2)) {
// Rule (GB9) x Extend
if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
continue;
}
@ -2190,6 +2217,16 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
continue;
}
// Rule (GB9c) Emoji_Base x Emoji_Modifier
if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
continue;
}
// Rule (GB9d) ZWJ x Glue_After_Zwj
if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
continue;
}
// Rule (GB10) Any <break> Any
break;
}
@ -2220,6 +2257,10 @@ RBBICharMonkey::~RBBICharMonkey() {
delete fLVTSet;
delete fHangulSet;
delete fAnySet;
delete fEmojiBaseSet;
delete fEmojiModifierSet;
delete fZWJSet;
delete fGAZSet;
}
//------------------------------------------------------------------------------------------
@ -2245,7 +2286,7 @@ private:
UnicodeSet *fKatakanaSet;
UnicodeSet *fHebrew_LetterSet;
UnicodeSet *fALetterSet;
// TODO(jungshik): Do we still need this change?
// TODO(jungshik): Do we still need this change?
// UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
UnicodeSet *fSingle_QuoteSet;
UnicodeSet *fDouble_QuoteSet;
@ -2258,6 +2299,10 @@ private:
UnicodeSet *fExtendSet;
UnicodeSet *fExtendNumLetSet;
UnicodeSet *fDictionaryCjkSet;
UnicodeSet *fEBaseSet;
UnicodeSet *fEModifierSet;
UnicodeSet *fZWSSet;
UnicodeSet *fGAZSet;
const UnicodeString *fText;
};
@ -2275,7 +2320,7 @@ RBBIWordMonkey::RBBIWordMonkey()
fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
// Exclude Hangul syllables from ALetterSet during testing.
// Leave CJK dictionary characters out from the monkey tests!
#if 0
#if 0
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
"[\\p{Line_Break = Complex_Context}"
"-\\p{Grapheme_Cluster_Break = Extend}"
@ -2300,6 +2345,18 @@ RBBIWordMonkey::RBBIWordMonkey()
fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
fEBaseSet = new UnicodeSet(UnicodeString(
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
"\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
"\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
"\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
fZWSSet = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
fExtendSet->removeAll(*fZWSSet);
fOtherSet = new UnicodeSet();
if(U_FAILURE(status)) {
deferredStatus = status;
@ -2322,6 +2379,11 @@ RBBIWordMonkey::RBBIWordMonkey()
fOtherSet->removeAll(*fFormatSet);
fOtherSet->removeAll(*fExtendSet);
fOtherSet->removeAll(*fRegionalIndicatorSet);
fOtherSet->removeAll(*fEBaseSet);
fOtherSet->removeAll(*fEModifierSet);
fOtherSet->removeAll(*fZWSSet);
fOtherSet->removeAll(*fGAZSet);
// Inhibit dictionary characters from being tested at all.
fOtherSet->removeAll(*fDictionaryCjkSet);
fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
@ -2344,6 +2406,11 @@ RBBIWordMonkey::RBBIWordMonkey()
fSets->addElement(fOtherSet, status);
fSets->addElement(fExtendNumLetSet, status);
fSets->addElement(fEBaseSet, status);
fSets->addElement(fEModifierSet, status);
fSets->addElement(fZWSSet, status);
fSets->addElement(fGAZSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
}
@ -2362,7 +2429,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
int breakPos = -1;
UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
if (U_FAILURE(deferredStatus)) {
return -1;
}
@ -2392,7 +2459,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
break;
};
}
while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
if (p1 == p2) {
@ -2411,7 +2478,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
if (c1==0x0D && c2==0x0A) {
continue;
}
// Rule (3a) Break before and after newlines (including CR and LF)
//
if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
@ -2421,6 +2488,15 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
break;
};
// Rule (3c) ZWJ x GAZ (Glue after ZWJ).
// Not ignoring extend chars, so peek into input text to
// get the potential ZWJ, the character immediately preceding c2.
// Sloppy UChar32 indexing: p2-1 may reference trail half
// but char32At will get the full code point.
if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
continue;
}
// Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
@ -2510,10 +2586,18 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
}
// Rule 13c
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
break;
}
if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
continue;
}
// Rule 13d
if ((fEBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
continue;
}
// Rule 14. Break found here.
break;
}
@ -2548,6 +2632,10 @@ RBBIWordMonkey::~RBBIWordMonkey() {
delete fRegionalIndicatorSet;
delete fDictionaryCjkSet;
delete fOtherSet;
delete fEBaseSet;
delete fEModifierSet;
delete fZWSSet;
delete fGAZSet;
}
@ -2933,17 +3021,29 @@ private:
UnicodeSet *fHL;
UnicodeSet *fID;
UnicodeSet *fRI;
UnicodeSet *fSA;
UnicodeSet *fXX;
UnicodeSet *fEB;
UnicodeSet *fEM;
UnicodeSet *fZJ;
BreakIterator *fCharBI;
const UnicodeString *fText;
RegexMatcher *fNumberMatcher;
};
RBBILineMonkey::RBBILineMonkey() :
RBBIMonkeyKind(),
fSets(NULL),
fCharBI(NULL),
fText(NULL),
fNumberMatcher(NULL)
RBBILineMonkey::RBBILineMonkey()
{
if (U_FAILURE(deferredStatus)) {
return;
}
UErrorCode status = U_ZERO_ERROR;
fSets = new UVector(status);
@ -2985,24 +3085,35 @@ RBBILineMonkey::RBBILineMonkey()
fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
fEB = new UnicodeSet(UnicodeString(
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
"\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
"\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
"\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
fZJ = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
if (U_FAILURE(status)) {
deferredStatus = status;
fCharBI = NULL;
fNumberMatcher = NULL;
return;
}
fAL->addAll(*fXX); // Default behavior for XX is identical to AL
fAL->addAll(*fAI); // Default behavior for AI is identical to AL
fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID.
fID->addAll(*fEM);
fAL->removeAll(*fEM);
fAL->remove((UChar32)0x2764); // Emoji Proposal: move u2764 from Al to Id
fID->add((UChar32)0x2764);
fSets->addElement(fBK, status);
fSets->addElement(fCR, status);
fSets->addElement(fLF, status);
@ -3040,10 +3151,12 @@ RBBILineMonkey::RBBILineMonkey()
fSets->addElement(fID, status);
fSets->addElement(fWJ, status);
fSets->addElement(fRI, status);
fSets->addElement(fSA, status);
fSets->addElement(fSG, status);
fSets->addElement(fEB, status);
fSets->addElement(fEM, status);
fSets->addElement(fZJ, status);
const char *rules =
const char *rules =
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
"((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
"\\p{Line_Break=NU}\\p{Line_Break=CM}*"
@ -3228,6 +3341,18 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
break;
}
// LB 8a ZJ x ID
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
{
int32_t prevIdx = fText->moveIndex32(pos, -1);
UChar32 prevC = fText->char32At(prevIdx);
if (fZJ->contains(prevC) && fID->contains(thisChar)) {
continue;
}
}
// LB 9, 10 Already done, at top of loop.
//
@ -3245,7 +3370,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
if (fGL->contains(prevChar)) {
continue;
}
// LB 12a
// [^SP BA HY] x GL
if (!(fSP->contains(prevChar) ||
@ -3368,7 +3493,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
// LB 21a
// HL (HY | BA) x
if (fHL->contains(prevCharX2) &&
if (fHL->contains(prevCharX2) &&
(fHY->contains(prevChar) || fBA->contains(prevChar))) {
continue;
}
@ -3495,12 +3620,20 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
continue;
}
// LB30a Do not break between regional indicators.
// RI x RI
// LB30a RI RI <break> RI
// RI x RI
if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
break;
}
if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
continue;
}
// LB30b Emoji Base x Emoji Modifier
if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
continue;
}
// LB 31 Break everywhere else
break;
@ -3555,9 +3688,10 @@ RBBILineMonkey::~RBBILineMonkey() {
delete fHL;
delete fID;
delete fRI;
delete fSA;
delete fSG;
delete fXX;
delete fEB;
delete fEM;
delete fZJ;
delete fCharBI;
delete fNumberMatcher;
@ -3577,6 +3711,9 @@ RBBILineMonkey::~RBBILineMonkey() {
//
// type = char | word | line | sent | title
//
// Example:
// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
//
//-------------------------------------------------------------------------------------------
static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
@ -3853,7 +3990,6 @@ void RBBITest::TestLineBreaks(void)
"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
"\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
"\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
"\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
"\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
"\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
"\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
@ -3869,25 +4005,19 @@ void RBBITest::TestLineBreaks(void)
"\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
"\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
"\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
"\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
"\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
"\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
"\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
"\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
"\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
"\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
"\\u2014\\u0020\\u000a\\u17c5\\u24fc",
"\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
"\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
"\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
"\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
"\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
"\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
"\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
"\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
"\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
"\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
};
@ -4175,9 +4305,15 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
int32_t charIdx = m_rand() % classSet->size();
UChar32 c = classSet->charAt(charIdx);
if (c < 0) { // TODO: deal with sets containing strings.
errln("c < 0");
errln("%s:%d c < 0", __FILE__, __LINE__);
break;
}
// Do not assemble a supplementary character from randomly generated separate surrogates.
// (It could be a dictionary character)
if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
continue;
}
testText.append(c);
}
@ -4284,7 +4420,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
} else {
if (breakPos >= 0) {
precedingBreaks[breakPos] = 1;
}
}
lastBreakPos = breakPos;
}
}
@ -4379,7 +4515,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
charErrorTxt[sizeof(charErrorTxt)-1] = 0;
const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
errorType, seed, i, charErrorTxt);
@ -4402,15 +4538,15 @@ void RBBITest::TestBug5532(void) {
// Text includes a mixture of Thai and Latin.
const unsigned char utf8Data[] = {
0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
UErrorCode status = U_ZERO_ERROR;
@ -4467,7 +4603,7 @@ void RBBITest::TestBug9983(void) {
rstatus = brkiter->getRuleStatus();
(void)rstatus; // Suppress set but not used warning.
if (iterationCount >= 10) {
break;
break;
}
}
TEST_ASSERT(iterationCount == 6);
@ -4480,7 +4616,7 @@ void RBBITest::TestBug9983(void) {
rstatus = brkiterPOSIX->getRuleStatus();
(void)rstatus; // Suppress set but not used warning.
if (iterationCount >= 10) {
break;
break;
}
}
TEST_ASSERT(iterationCount == 6);

View file

@ -1,5 +1,6 @@
# GraphemeBreakTest-8.0.0.txt
# Date: 2015-02-13, 13:47:15 GMT [MD]
# Hand patched for Emoji breaking proposal L2/16-011R3.
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
@ -9,9 +10,9 @@
# Default Grapheme Break Test
#
# Format:
# <string> (# <comment>)?
# <string> contains hex Unicode code points, with
# ÷ wherever there is a break opportunity, and
# <string> (# <comment>)?
# <string> contains hex Unicode code points, with
# ÷ wherever there is a break opportunity, and
# × wherever there is not.
# <comment> the format can change, but currently it shows:
# - the sample character name
@ -414,10 +415,10 @@
÷ D800 ÷ 0308 ÷ D800 ÷ # ÷ [0.2] <surrogate-D800> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <surrogate-D800> (Control) ÷ [0.3]
÷ 0061 ÷ 1F1E6 ÷ 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) ÷ [999.0] LATIN SMALL LETTER B (Other) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA × 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ 200B ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [5.0] ZERO WIDTH SPACE (Control) ÷ [4.0] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 200D ÷ 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 0020 × 200D ÷ 0646 ÷ # ÷ [0.2] SPACE (Other) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] ARABIC LETTER NOON (Other) ÷ [0.3]

View file

@ -1,5 +1,6 @@
# LineBreakTest-8.0.0.txt
# Date: 2015-04-30, 09:40:15 GMT [MD]
# Hand patched for Emoji break proposal L2/16-011R3
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
@ -6324,13 +6325,14 @@
× 3057 × 3001 ÷ 0061 × 0062 ÷ 3068 ÷ # × [0.3] HIRAGANA LETTER SI (ID) × [13.02] IDEOGRAPHIC COMMA (CL) ÷ [999.0] LATIN SMALL LETTER A (AL) × [28.0] LATIN SMALL LETTER B (AL) ÷ [999.0] HIRAGANA LETTER TO (ID) ÷ [0.3]
× 0061 ÷ 1F1E6 ÷ 0062 ÷ # × [0.3] LATIN SMALL LETTER A (AL) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [999.0] LATIN SMALL LETTER B (AL) ÷ [0.3]
× 1F1F7 × 1F1FA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [0.3]
× 1F1F7 × 1F1FA × 1F1F8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) ÷ [0.3]
× 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
× 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) ÷ [0.3]
× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
× 1F1F7 × 1F1FA × 200B ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [8.0] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
× 05D0 × 002D × 05D0 ÷ # × [0.3] HEBREW LETTER ALEF (HL) × [21.02] HYPHEN-MINUS (HY) × [21.1] HEBREW LETTER ALEF (HL) ÷ [0.3]
× 1F1E6 × 1F1E7 × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
× 1F1E6 × 200D × 1F1E7 × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
× 1F1E6 × 1F1E7 × 200D × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
× 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
# Patched the following two lines for RI pairing. Note ZWJ behaves as CM and logically disappears.
× 1F1E6 × 200D × 1F1E7 ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
× 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
× 0020 ÷ 200D × 0646 ÷ # × [0.3] SPACE (SP) ÷ [18.0] ZERO WIDTH JOINER (CM) × [28.0] ARABIC LETTER NOON (AL) ÷ [0.3]
× 0646 × 200D × 0020 ÷ # × [0.3] ARABIC LETTER NOON (AL) × [9.0] ZERO WIDTH JOINER (CM) × [7.01] SPACE (SP) ÷ [0.3]
#

View file

@ -1,5 +1,7 @@
# WordBreakTest-8.0.0.txt
# Date: 2015-05-02, 14:48:55 GMT [MD]
# Hand Patched for Emoji breaking proposal L2/16-011R3
#
# Unicode Character Database
# Copyright (c) 1991-2015 Unicode, Inc.
@ -1392,13 +1394,13 @@
÷ 2060 ÷ 0043 × 2060 × 002E × 2060 × 0044 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) ÷ [999.0] LATIN CAPITAL LETTER C (ALetter) × [4.0] WORD JOINER (Format_FE) × [6.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [7.0] LATIN CAPITAL LETTER D (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 0061 ÷ 1F1E6 ÷ 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) ÷ [999.0] LATIN SMALL LETTER B (ALetter) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA × 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
÷ 1F1F7 × 1F1FA ÷ 200B ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [999.0] ZERO WIDTH SPACE (Other) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
÷ 05D0 × 0022 × 05D0 ÷ # ÷ [0.2] HEBREW LETTER ALEF (Hebrew_Letter) × [7.2] QUOTATION MARK (Double_Quote) × [7.3] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3]
÷ 1F1E6 × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 200D × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 1F1E7 × 200D × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 200D × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
÷ 0020 × 200D ÷ 0646 ÷ # ÷ [0.2] SPACE (Other) × [4.0] ZERO WIDTH JOINER (Extend_FE) ÷ [999.0] ARABIC LETTER NOON (ALetter) ÷ [0.3]
÷ 0646 × 200D ÷ 0020 ÷ # ÷ [0.2] ARABIC LETTER NOON (ALetter) × [4.0] ZERO WIDTH JOINER (Extend_FE) ÷ [999.0] SPACE (Other) ÷ [0.3]
÷ 0031 ÷ 003A ÷ 003A ÷ 0031 ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [999.0] COLON (MidLetter) ÷ [999.0] DIGIT ONE (Numeric) ÷ [0.3]

View file

@ -0,0 +1,60 @@
#
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: grapheme.txt
#
# Reference Grapheme Break rules for intltest rbbi/RBBIMonkeyTest
#
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
type = grapheme; # one of grapheme | word | line | sentence
locale = en;
CR = [\u000d];
LF = [\u000a];
Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]];
Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend = [];
SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
E_Modifier = [\U0001F3FB-\U0001F3FF];
GAZ = [\U0001F455-\U0001F469\U0001F48B\U0001F5E8\u2764];
ZWJ = [\u200D];
#
# Korean Syllable Definitions
#
L = [\p{Grapheme_Cluster_Break = L}];
V = [\p{Grapheme_Cluster_Break = V}];
T = [\p{Grapheme_Cluster_Break = T}];
LV = [\p{Grapheme_Cluster_Break = LV}];
LVT = [\p{Grapheme_Cluster_Break = LVT}];
GB3: CR LF;
GB4: (Control | CR | LF) ÷;
GB5: . ÷ (Control | CR | LF);
GB6: L (L | V | LV | LVT);
GB7: (LV | V) (V | T);
GB8: (LVT | T) T;
# Regional Indicators, split into pairs.
# Note that a pair of RIs that is not followed by a third RI will fall into
# the normal rules for Extend, etc.
#
GB8a.1: Regional_Indicator Regional_Indicator ÷ Regional_Indicator;
GB8a.2: Regional_Indicator Regional_Indicator;
GB9: . Extend;
GB9a: . SpacingMark;
GB9b: Prepend .;
GB9c: (E_Base | GAZ) E_Modifier;
GB9d: ZWJ GAZ;
GB10: . ÷;

View file

@ -0,0 +1,196 @@
#
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
type = line;
locale = en;
AI = [:LineBreak = Ambiguous:];
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
BA = [:LineBreak = Break_After:];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
EM = [\U0001F3FB-\U0001F3FF];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [[:LineBreak = Ideographic:][\u2764]];
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NS = [[:LineBreak = Nonstarter:] CJ];
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
PO = [:LineBreak = Postfix_Numeric:];
PR = [:LineBreak = Prefix_Numeric:];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZJ = [\u200D];
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
ID = [ID - EB];
AL = [AL - EM];
dictionary = [:LineBreak = Complex_Context:];
# Redefine AL. LB1. TODO: refine according to latest UAX.
AL = [ AL AI SA SG XX ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a, from Emoji proposal L2/16-011R3
# ZWJ x ID
LB8a: ZJ (ID | EB | EM);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
# ZJ acts like a CM to the left, combining with CB.
# ZJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZJ (ID | EB | EM);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB23.1: (ID | EB | EM) CM* PO;
LB23.2: (AL | HL | CM) CM* NU;
LB23.3: NU CM* (AL | HL);
LB24.1: PR CM* (ID | EB | EM);
LB24.2: PR CM* (AL | HL);
LB24.3: PO CM* (AL | HL);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZJ (ID | EB | EM);
LB31.2: . CM* ÷;

View file

@ -0,0 +1,204 @@
#
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_loose.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# This tailors the line break behavior to correspond to CSS
# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
# Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN
type = line;
locale = en@lb=loose;
AI = [:LineBreak = Ambiguous:];
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
BA = [:LineBreak = Break_After:];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
EM = [\U0001F3FB-\U0001F3FF];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [[:LineBreak = Ideographic:] CJ [\u2764]];
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
NS = [[:LineBreak = Nonstarter:] - NSX];
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
PO = [:LineBreak = Postfix_Numeric:];
PR = [:LineBreak = Prefix_Numeric:];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZJ = [\u200D];
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
ID = [ID - EB];
AL = [AL - EM];
dictionary = [:LineBreak = Complex_Context:];
# Redefine AL. LB1. TODO: refine according to latest UAX.
AL = [ AL AI SA SG XX ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a, from Emoji proposal L2/16-011R3
# ZWJ x ID
LB8a: ZJ (ID | EB | EM);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
# ZJ acts like a CM to the left, combining with CB.
# ZJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZJ (ID | EB | EM);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
LB22.5: NU CM* IN;
LB23.1: (ID | EB | EM) CM* PO;
LB23.2: (AL | HL | CM) CM* NU;
LB23.3: NU CM* (AL | HL);
LB24.1: PR CM* (ID | EB | EM);
LB24.2: PR CM* (AL | HL);
LB24.3: PO CM* (AL | HL);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZJ (ID | EB | EM);
LB31.2: . CM* ÷;

View file

@ -0,0 +1,225 @@
#
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_loose_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# This tailors the line break behavior to correspond to CSS
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
# FF65 (all NS) and FF01, FF1F (both EX).
# * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
type = line;
locale = ja@lb=loose;
AI = [:LineBreak = Ambiguous:];
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
EM = [\U0001F3FB-\U0001F3FF];
EXX = [\uFF01 \uFF1F];
EX = [[:LineBreak = Exclamation:] - EXX];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [[:LineBreak = Ideographic:][\u2764]CJ];
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
NS = [[:LineBreak = Nonstarter:] - NSX];
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
PO = [[:LineBreak = Postfix_Numeric:] - POX];
PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
PR = [[:LineBreak = Prefix_Numeric:] - PRX];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZJ = [\u200D];
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
ID = [ID - EB];
AL = [AL - EM];
dictionary = [:LineBreak = Complex_Context:];
# Redefine AL. LB1. TODO: refine according to latest UAX.
AL = [ AL AI SA SG XX ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1.
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a, from Emoji proposal L2/16-011R3
# ZWJ x ID
LB8a: ZJ (ID | EB | EM);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA BAX HY] CM* GL;
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
# ZJ acts like a CM to the left, combining with CB.
# ZJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZJ (ID | EB | EM);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
LB22.5: NU CM* IN;
LB23.1: (ID | EB | EM) CM* PO;
LB23.2: (AL | HL | CM) CM* NU;
LB23.3: NU CM* (AL | HL);
LB24.1: PR CM* (ID | EB | EM);
LB24.2: PR CM* (AL | HL);
LB24.3: (PO | POX) CM* (AL | HL);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
# Loose_cj tailoring: do not include $PRX at the beginning or $POX at the end.
LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PRX | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZJ (ID | EB | EM);
LB31.2: . CM* ÷;

View file

@ -0,0 +1,210 @@
#
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_normal.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
#
# This tailors the line break behavior to correspond to CSS
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
# Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
type = line;
locale = en@lb=normal;
AI = [:LineBreak = Ambiguous:];
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
BA = [:LineBreak = Break_After:];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
EM = [\U0001F3FB-\U0001F3FF];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [[:LineBreak = Ideographic:] CJ [\u2764]];
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NS = [:LineBreak = Nonstarter:];
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
PO = [:LineBreak = Postfix_Numeric:];
PR = [:LineBreak = Prefix_Numeric:];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZJ = [\u200D];
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
ID = [ID - EB];
AL = [AL - EM];
dictionary = [:LineBreak = Complex_Context:];
# Redefine AL. LB1. TODO: refine according to latest UAX.
AL = [ AL AI SA SG XX ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1.
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a, from Emoji proposal L2/16-011R3
# ZWJ x ID
LB8a: ZJ (ID | EB | EM);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
# ZJ acts like a CM to the left, combining with CB.
# ZJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZJ (ID | EB | EM);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB23.1: (ID | EB | EM) CM* PO;
LB23.2: (AL | HL | CM) CM* NU;
LB23.3: NU CM* (AL | HL);
LB24.1: PR CM* (ID | EB | EM);
LB24.2: PR CM* (AL | HL);
LB24.3: PO CM* (AL | HL);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZJ (ID | EB | EM);
LB31.2: . CM* ÷;

View file

@ -0,0 +1,218 @@
# Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_normal_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
#
# This tailors the line break behavior to correspond to CSS
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
type = line;
locale = ja@lb=normal;
AI = [:LineBreak = Ambiguous:];
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
EM = [\U0001F3FB-\U0001F3FF];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [[:LineBreak = Ideographic:] CJ [\u2764]];
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NSX = [\u301C \u30A0];
NS = [[:LineBreak = Nonstarter:] - NSX];
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
PO = [:LineBreak = Postfix_Numeric:];
PR = [:LineBreak = Prefix_Numeric:];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZJ = [\u200D];
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
ID = [ID - EB];
AL = [AL - EM];
dictionary = [:LineBreak = Complex_Context:];
# Redefine AL. LB1. TODO: refine according to latest UAX.
AL = [ AL AI SA SG XX ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1.
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
# Do not break between closing punctuation and $NS, even with intervening spaces
# But DO allow a break between closing punctuation and $NSX, don't include it here
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a, from Emoji proposal L2/16-011R3
# ZWJ x ID
LB8a: ZJ (ID | EB | EM);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA BAX HY] CM* GL;
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
# ZJ acts like a CM to the left, combining with CB.
# ZJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZJ (ID | EB | EM);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
# should "HL BAX" not break when followed by a CB? That's what the current
# rules do, which is why "[^CM CB]?" includes the ?.
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB23.1: (ID | EB | EM) CM* PO;
LB23.2: (AL | HL | CM) CM* NU;
LB23.3: NU CM* (AL | HL);
LB24.1: PR CM* (ID | EB | EM);
LB24.2: PR CM* (AL | HL);
LB24.3: PO CM* (AL | HL);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZJ (ID | EB | EM);
LB31.2: . CM* ÷;

View file

@ -0,0 +1,69 @@
file: testdata/break_rules/readme.txt
Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved.
This directory contains the break iterator reference rule files used by intltest rbbi/RBBIMonkeyTest/testMonkey.
The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpreted
to provide an expected set of boundary positions to compare with the results from ICU break iteration.
Each set of reference break rules lives in a separate file.
The list of rule files to run by default is hardcoded into the test code, in rbbimonkeytest.cpp.
Each test file includes
- The type of ICU break iterator to create (word, line, sentence, etc.)
- The locale to use
- Character Class definitions
- Rule definitions
To Do
- Syntax for tailoring.
Character Class Definition:
name = set_regular_expression;
Rule Definition:
rule_regular_expression;
name:
[A-Za-z_][A-Za-z0-9_]*
set_regular_expression:
The intersection of an ICU regular expression [set] expression and a UnicodeSet pattern.
(They are mostly the same)
May include previously defined set names, which are logically expanded in-place.
rule_regular_expression:
An ICU Regular Expression.
May include set names, which are logically expanded in-place.
May include a '÷', which defines a boundary position.
Application of the rules:
Matching begins at the start of text, or after a previously identified boundary.
The pseudo-code below finds the next boundary.
while position < end of text
for each rule
if the text at position matches this rule
if the rule has a '÷'
Boundary is found.
return the position of the '÷' within the match.
else
position = last character of the rule match.
break from the rule loop, continue the outer loop.
This differs from the Unicode UAX algorithm in that each position in the text is
not tested separately. Instead, when a rule match is found, rule application restarts with the last
character of the preceding rule match. ICU's break rules also operate this way.
Expressing rules this way simplifies UAX rules that have leading or trailing context; it
is no longer necessary to write expressions that match the context starting from
any position within it.
This rule form differs from ICU rules in that the rules are applied sequentially, as they
are with the Unicode UAX rules. With the main ICU break rules, all are applied in parallel.
Word Dictionaries
The monkey test does not test dictionary based breaking. The set named 'dictionary' is special,
as it is in the main ICU rules. For the monkey test, no characters from the dictionary set are
included in the randomly-generated test data.

View file

@ -0,0 +1,43 @@
type = sentence; # one of grapheme | word | line | sentence
locale = en;
CR = [\p{Sentence_Break = CR}];
LF = [\p{Sentence_Break = LF}];
Extend = [\p{Sentence_Break = Extend}];
Sep = [\p{Sentence_Break = Sep}];
Format = [\p{Sentence_Break = Format}];
Sp = [\p{Sentence_Break = Sp}];
Lower = [\p{Sentence_Break = Lower}];
Upper = [\p{Sentence_Break = Upper}];
OLetter = [\p{Sentence_Break = OLetter}];
Numeric = [\p{Sentence_Break = Numeric}];
ATerm = [\p{Sentence_Break = ATerm}];
SContinue = [\p{Sentence_Break = SContinue}];
STerm = [\p{Sentence_Break = STerm}];
Close = [\p{Sentence_Break = Close}];
ParaSep = [Sep CR LF];
SATerm = [STerm ATerm];
ExtFmt = [Extend Format];
# SB2: ÷ eot
# Conventional regular expression matching for '$' as end-of-text also matches
# at a line separator just preceding the physical end of text.
# Instead, use a look-ahead assertion that there is no following character.
SB2: . ÷ (?!.);
SB3: CR LF;
SB4: ParaSep ÷;
# SB5: ignore Format and Extend characters.
SB6: ATerm ExtFmt* Numeric;
SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);
SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
# Also covers SB10, SB11.
SB12: . ExtFmt* [^ExtFmt]?;

View file

@ -0,0 +1,97 @@
#
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: word.txt
#
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
type = word; # one of grapheme | word | line | sentence
locale = en;
E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
E_Modifier = [\U0001F3FB-\U0001F3FF];
ZWJ = [\u200D];
GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
CR = [\p{Word_Break = CR}];
LF = [\p{Word_Break = LF}];
Newline = [\p{Word_Break = Newline}];
Extend = [[[\p{Word_Break = Extend}][:Block=Tags:]]-ZWJ];
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
Format = [[\p{Word_Break = Format}]-[:Block=Tags:]];
Katakana = [\p{Word_Break = Katakana}];
Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
ALetter = [\p{Word_Break = ALetter}];
Single_Quote = [\p{Word_Break = Single_Quote}];
Double_Quote = [\p{Word_Break = Double_Quote}];
MidNumLet = [\p{Word_Break = MidNumLet}];
MidLetter = [\p{Word_Break = MidLetter}];
MidNum = [\p{Word_Break = MidNum}];
Numeric = [\p{Word_Break = Numeric}];
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
# Define dictionary, with the effect being that those characters don't appear in test data.
Han = [:Han:];
Hiragana = [:Hiragana:];
Control = [\p{Grapheme_Cluster_Break = Control}];
HangulSyllable = [\uac00-\ud7a3];
ComplexContext = [:LineBreak = Complex_Context:];
KanaKanji = [Han Hiragana Katakana];
dictionaryCJK = [KanaKanji HangulSyllable];
dictionary = [ComplexContext dictionaryCJK];
# leave CJK scripts out of ALetterPlus
# Tricky. Redefine a set.
# For tailorings, if it modifies itself, do at end of sets ????
# Tweak redefine to mean replace existing definition at its original location.
# Insert defs without redefine just after last pre-existing def of that name.
# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
ALetter = [ALetter - dictionary];
AHLetter = [ALetter Hebrew_Letter];
MidNumLetQ = [MidNumLet Single_Quote];
ExtFmt = [Extend Format ZWJ];
WB3: CR LF;
WB3a: (Newline | CR | LF) ÷;
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
# (but needed with UAX treat-as scheme.)
WB3c: ZWJ GAZ;
WB5: AHLetter ExtFmt* AHLetter;
# includes both WB6 and WB7
WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
WB7a: Hebrew_Letter ExtFmt* Single_Quote;
WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
WB8: Numeric ExtFmt* Numeric;
WB9: AHLetter ExtFmt* Numeric;
WB10: Numeric ExtFmt* AHLetter;
WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
WB13: Katakana ExtFmt* Katakana;
WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
# WB rule 13c, pairs of Regional Indicators stay unbroken.
# Interacts with WB3c.
WB13c.1: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ GAZ;
WB13c.2: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
WB13d: (E_Base | GAZ) ExtFmt* E_Modifier;
# Rule WB 14 Any ÷ Any
# Interacts with WB3c, do not break between ZWJ and GAZ.
WB14.1: . ExtFmt* ZWJ GAZ;
WB14.2: . ExtFmt* ÷;

View file

@ -0,0 +1,96 @@
#
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: word_POSIX.txt
#
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
type = word; # one of grapheme | word | line | sentence
locale = en_US_POSIX;
E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
E_Modifier = [\U0001F3FB-\U0001F3FF];
ZWJ = [\u200D];
GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
CR = [\p{Word_Break = CR}];
LF = [\p{Word_Break = LF}];
Newline = [\p{Word_Break = Newline}];
Extend = [[[\p{Word_Break = Extend}][:Block=Tags:]]-ZWJ];
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
Format = [[\p{Word_Break = Format}]-[:Block=Tags:]];
Katakana = [\p{Word_Break = Katakana}];
Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
ALetter = [\p{Word_Break = ALetter}];
Single_Quote = [\p{Word_Break = Single_Quote}];
Double_Quote = [\p{Word_Break = Double_Quote}];
MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
MidLetter = [\p{Word_Break = MidLetter} - [\:]];
MidNum = [\p{Word_Break = MidNum} [.]];
Numeric = [\p{Word_Break = Numeric}];
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
# Define dictionary, with the effect being that those characters don't appear in test data.
Han = [:Han:];
Hiragana = [:Hiragana:];
Control = [\p{Grapheme_Cluster_Break = Control}];
HangulSyllable = [\uac00-\ud7a3];
ComplexContext = [:LineBreak = Complex_Context:];
KanaKanji = [Han Hiragana Katakana];
dictionaryCJK = [KanaKanji HangulSyllable];
dictionary = [ComplexContext dictionaryCJK];
# leave CJK scripts out of ALetterPlus
# Tricky. Redefine a set.
# For tailorings, if it modifies itself, do at end of sets ????
# Tweak redefine to mean replace existing definition at its original location.
# Insert defs without redefine just after last pre-existing def of that name.
# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
ALetter = [ALetter - dictionary];
AHLetter = [ALetter Hebrew_Letter];
MidNumLetQ = [MidNumLet Single_Quote];
ExtFmt = [Extend Format ZWJ];
WB3: CR LF;
WB3a: (Newline | CR | LF) ÷;
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
# (but needed with UAX treat-as scheme.)
WB3c: ZWJ GAZ;
WB5: AHLetter ExtFmt* AHLetter;
# includes both WB6 and WB7
WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
WB7a: Hebrew_Letter ExtFmt* Single_Quote;
WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
WB8: Numeric ExtFmt* Numeric;
WB9: AHLetter ExtFmt* Numeric;
WB10: Numeric ExtFmt* AHLetter;
WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
WB13: Katakana ExtFmt* Katakana;
WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
# WB rule 13c, pairs of Regional Indicators stay unbroken.
# Interacts with WB3c.
WB13c.1: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ GAZ;
WB13c.2: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
WB13d: (E_Base | GAZ) ExtFmt* E_Modifier;
# Rule WB 14 Any ÷ Any
# Interacts with WB3c, do not break between ZWJ and GAZ.
WB14.1: . ExtFmt* ZWJ GAZ;
WB14.2: . ExtFmt* ÷;

View file

@ -1,4 +1,4 @@
# Copyright (c) 2001-2015 International Business Machines
# Copyright (c) 2001-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
@ -513,6 +513,18 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
<data>• •\uF8FF\u2028<100>\uF8FF•</data>
<data>• \u200B\u2028<100>\u200B•</data>
# Regional Indicator sequences. They group in pairs. The reverse rules are tricky.
# Sequences are long enough that the non-exhaustive monkey test won't reliably pick up problems.
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
# User Guide example
<data>•Parlez-•vous •français ?•</data>

View file

@ -271,7 +271,7 @@
<ClCompile Include="toolutil.cpp">
<DisableLanguageExtensions>false</DisableLanguageExtensions>
</ClCompile>
<ClCompile Include="ucbuf.c" />
<ClCompile Include="ucbuf.cpp" />
<ClCompile Include="ucm.c" />
<ClCompile Include="ucmstate.c" />
<ClCompile Include="unewdata.c" />

View file

@ -1,12 +1,12 @@
/*
*******************************************************************************
*
* Copyright (C) 1998-2014, International Business Machines
* Copyright (C) 1998-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File ucbuf.c
* File ucbuf.cpp
*
* Modification History:
*
@ -415,7 +415,7 @@ ucbuf_getcx32(UCHARBUF* buf,UErrorCode* error) {
/* check if u_unescapeAt unescaped and converted
* to c32 or not
*/
if(c32==0xFFFFFFFF){
if(c32==(UChar32)0xFFFFFFFF){
if(buf->showWarning) {
char context[CONTEXT_LEN+1];
int32_t len = CONTEXT_LEN;

View file

@ -1,12 +1,12 @@
/*
*******************************************************************************
*
* Copyright (C) 1998-2015, International Business Machines
* Copyright (C) 1998-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File ucbuf.c
* File ucbuf.h
*
* Modification History:
*
@ -17,6 +17,7 @@
*******************************************************************************
*/
#include "unicode/localpointer.h"
#include "unicode/ucnv.h"
#include "filestrm.h"
@ -45,11 +46,11 @@ struct ULine {
/**
* Opens the UCHARBUF with the given file stream and code page for conversion
* @param fileName Name of the file to open.
* @param codepage The encoding of the file stream to convert to Unicode.
* @param codepage The encoding of the file stream to convert to Unicode.
* If *codepage is NULL on input the API will try to autodetect
* popular Unicode encodings
* @param showWarning Flag to print out warnings to STDOUT
* @param buffered If TRUE performs a buffered read of the input file. If FALSE reads
* @param buffered If TRUE performs a buffered read of the input file. If FALSE reads
* the whole file into memory and converts it.
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
* indicates a failure on entry, the function will immediately return.
@ -82,7 +83,7 @@ U_CAPI int32_t U_EXPORT2
ucbuf_getc32(UCHARBUF* buf,UErrorCode* err);
/**
* Gets a UTF-16 code unit at the current position from the converted buffer after
* Gets a UTF-16 code unit at the current position from the converted buffer after
* unescaping and increments the current position. If the escape sequence is for UTF-32
* code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned
* @param buf Pointer to UCHARBUF structure
@ -95,7 +96,7 @@ ucbuf_getcx32(UCHARBUF* buf,UErrorCode* err);
/**
* Gets a pointer to the current position in the internal buffer and length of the line.
* It imperative to make a copy of the returned buffere before performing operations on it.
* It is imperative to make a copy of the returned buffer before performing operations on it.
* @param buf Pointer to UCHARBUF structure
* @param len Output param to receive the len of the buffer returned till end of the line
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
@ -141,6 +142,14 @@ ucbuf_close(UCHARBUF* buf);
U_NAMESPACE_BEGIN
/**
* \class LocalUCHARBUFPointer
* "Smart pointer" class, closes a UCHARBUF via ucbuf_close().
* For most methods see the LocalPointerBase base class.
*
* The class is generated by the U_DEFINE_LOCAL_OPEN_POINTER macro; it owns
* the wrapped UCHARBUF, so callers holding one must not call ucbuf_close()
* on the same buffer themselves.
*
* @see LocalPointerBase
* @see LocalPointer
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
U_NAMESPACE_END
@ -155,7 +164,7 @@ ucbuf_ungetc(int32_t ungetChar,UCHARBUF* buf);
/**
* Autodetects the encoding of the file stream. Only Unicode charsets are autodectected.
* Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
* Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
* the converter to correct state for converting the rest of the stream. So the UConverter parameter
* is necessary.
@ -175,7 +184,7 @@ ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv,
int32_t* signatureLength, UErrorCode* status);
/**
* Autodetects the encoding of the file stream. Only Unicode charsets are autodectected.
* Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
* Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
* the converter to correct state for converting the rest of the stream. So the UConverter parameter
* is necessary.