mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
ICU-12081 Initial implementation Emoji break rules and a new RBBI monkey test.
X-SVN-Rev: 38387
This commit is contained in:
parent
2cf8965496
commit
9d9256f3b7
47 changed files with 4675 additions and 1291 deletions
|
@ -983,6 +983,54 @@ enum RBBIRunMode {
|
|||
};
|
||||
|
||||
|
||||
// Map from look-ahead break states (corresponds to rules) to boundary positions.
|
||||
// Allows multiple lookahead break rules to be in flight at the same time.
|
||||
//
|
||||
// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
|
||||
// in the state table be sequential, then we can just index an array. And the
|
||||
// table could also tell us in advance how big that array needs to be.
|
||||
//
|
||||
// Before ICU 57 there was just a single simple variable for a look-ahead match that
|
||||
// was in progress. Two rules at once did not work.
|
||||
|
||||
static const int32_t kMaxLookaheads = 8;
|
||||
struct LookAheadResults {
|
||||
int32_t fUsedSlotLimit;
|
||||
int32_t fPositions[8];
|
||||
int16_t fKeys[8];
|
||||
|
||||
LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};
|
||||
|
||||
int32_t getPosition(int16_t key) {
|
||||
for (int32_t i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
return fPositions[i];
|
||||
}
|
||||
}
|
||||
U_ASSERT(FALSE);
|
||||
return -1;
|
||||
}
|
||||
|
||||
void setPosition(int16_t key, int32_t position) {
|
||||
int32_t i;
|
||||
for (i=0; i<fUsedSlotLimit; ++i) {
|
||||
if (fKeys[i] == key) {
|
||||
fPositions[i] = position;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (i >= kMaxLookaheads) {
|
||||
U_ASSERT(FALSE);
|
||||
i = kMaxLookaheads - 1;
|
||||
}
|
||||
fKeys[i] = key;
|
||||
fPositions[i] = position;
|
||||
U_ASSERT(fUsedSlotLimit == i);
|
||||
fUsedSlotLimit = i + 1;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// handleNext(stateTable)
|
||||
|
@ -1000,14 +1048,11 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
|
|||
|
||||
RBBIStateTableRow *row;
|
||||
UChar32 c;
|
||||
int32_t lookaheadStatus = 0;
|
||||
int32_t lookaheadTagIdx = 0;
|
||||
int32_t result = 0;
|
||||
int32_t initialPosition = 0;
|
||||
int32_t lookaheadResult = 0;
|
||||
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
|
||||
const char *tableData = statetable->fTableData;
|
||||
uint32_t tableRowLen = statetable->fRowLen;
|
||||
LookAheadResults lookAheadMatches;
|
||||
int32_t result = 0;
|
||||
int32_t initialPosition = 0;
|
||||
const char *tableData = statetable->fTableData;
|
||||
uint32_t tableRowLen = statetable->fRowLen;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fTrace) {
|
||||
|
@ -1050,14 +1095,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
|
|||
// We have already run the loop one last time with the
|
||||
// character set to the pseudo {eof} value. Now it is time
|
||||
// to unconditionally bail out.
|
||||
if (lookaheadResult > result) {
|
||||
// We ran off the end of the string with a pending look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
lookaheadStatus = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
// Run the loop one last time with the fake end-of-input character category.
|
||||
|
@ -1123,38 +1160,23 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
|
|||
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
|
||||
}
|
||||
|
||||
if (row->fLookAhead != 0) {
|
||||
if (lookaheadStatus != 0
|
||||
&& row->fAccepting == lookaheadStatus) {
|
||||
// Lookahead match is completed.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
lookaheadStatus = 0;
|
||||
// TODO: make a standalone hard break in a rule work.
|
||||
if (lookAheadHardBreak) {
|
||||
UTEXT_SETNATIVEINDEX(fText, result);
|
||||
return result;
|
||||
}
|
||||
// Look-ahead completed, but other rules may match further. Continue on
|
||||
// TODO: junk this feature? I don't think it's used anywhwere.
|
||||
goto continueOn;
|
||||
int16_t completedRule = row->fAccepting;
|
||||
if (completedRule > 0) {
|
||||
// Lookahead match is completed.
|
||||
int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
|
||||
if (lookaheadResult >= 0) {
|
||||
fLastRuleStatusIndex = row->fTagIdx;
|
||||
UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
|
||||
return lookaheadResult;
|
||||
}
|
||||
|
||||
int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
lookaheadTagIdx = row->fTagIdx;
|
||||
goto continueOn;
|
||||
}
|
||||
int16_t rule = row->fLookAhead;
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
|
||||
lookAheadMatches.setPosition(rule, pos);
|
||||
}
|
||||
|
||||
|
||||
if (row->fAccepting != 0) {
|
||||
// Because this is an accepting state, any in-progress look-ahead match
|
||||
// is no longer relavant. Clear out the pending lookahead status.
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead match.
|
||||
}
|
||||
|
||||
continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
// This is the normal exit from the lookup state machine.
|
||||
// We have advanced through the string until it is certain that no
|
||||
|
@ -1216,11 +1238,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
RBBIRunMode mode;
|
||||
RBBIStateTableRow *row;
|
||||
UChar32 c;
|
||||
int32_t lookaheadStatus = 0;
|
||||
LookAheadResults lookAheadMatches;
|
||||
int32_t result = 0;
|
||||
int32_t initialPosition = 0;
|
||||
int32_t lookaheadResult = 0;
|
||||
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fTrace) {
|
||||
|
@ -1266,13 +1286,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
// We have already run the loop one last time with the
|
||||
// character set to the pseudo {eof} value. Now it is time
|
||||
// to unconditionally bail out.
|
||||
if (lookaheadResult < result) {
|
||||
// We ran off the end of the string with a pending look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
lookaheadStatus = 0;
|
||||
} else if (result == initialPosition) {
|
||||
if (result == initialPosition) {
|
||||
// Ran off start, no match found.
|
||||
// move one index one (towards the start, since we are doing a previous())
|
||||
UTEXT_SETNATIVEINDEX(fText, initialPosition);
|
||||
|
@ -1338,36 +1352,22 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
|
||||
}
|
||||
|
||||
if (row->fLookAhead != 0) {
|
||||
if (lookaheadStatus != 0
|
||||
&& row->fAccepting == lookaheadStatus) {
|
||||
// Lookahead match is completed.
|
||||
result = lookaheadResult;
|
||||
lookaheadStatus = 0;
|
||||
// TODO: make a standalone hard break in a rule work.
|
||||
if (lookAheadHardBreak) {
|
||||
UTEXT_SETNATIVEINDEX(fText, result);
|
||||
return result;
|
||||
}
|
||||
// Look-ahead completed, but other rules may match further. Continue on
|
||||
// TODO: junk this feature? I don't think it's used anywhwere.
|
||||
goto continueOn;
|
||||
int16_t completedRule = row->fAccepting;
|
||||
if (completedRule > 0) {
|
||||
// Lookahead match is completed.
|
||||
int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
|
||||
if (lookaheadResult >= 0) {
|
||||
UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
|
||||
return lookaheadResult;
|
||||
}
|
||||
|
||||
int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
goto continueOn;
|
||||
}
|
||||
int16_t rule = row->fLookAhead;
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
|
||||
lookAheadMatches.setPosition(rule, pos);
|
||||
}
|
||||
|
||||
|
||||
if (row->fAccepting != 0) {
|
||||
// Because this is an accepting state, any in-progress look-ahead match
|
||||
// is no longer relavant. Clear out the pending lookahead status.
|
||||
lookaheadStatus = 0;
|
||||
}
|
||||
|
||||
continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
// This is the normal exit from the lookup state machine.
|
||||
// We have advanced through the string until it is certain that no
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2002-2008 International Business Machines Corporation *
|
||||
* Copyright (C) 2002-2016 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
***************************************************************************
|
||||
*/
|
||||
|
@ -56,6 +56,8 @@ RBBINode::RBBINode(NodeType t) : UMemory() {
|
|||
fLastPos = 0;
|
||||
fNullable = FALSE;
|
||||
fLookAheadEnd = FALSE;
|
||||
fRuleRoot = FALSE;
|
||||
fChainIn = FALSE;
|
||||
fVal = 0;
|
||||
fPrecedence = precZero;
|
||||
|
||||
|
@ -86,6 +88,8 @@ RBBINode::RBBINode(const RBBINode &other) : UMemory(other) {
|
|||
fLastPos = other.fLastPos;
|
||||
fNullable = other.fNullable;
|
||||
fVal = other.fVal;
|
||||
fRuleRoot = FALSE;
|
||||
fChainIn = other.fChainIn;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
|
||||
fLastPosSet = new UVector(status);
|
||||
|
@ -161,6 +165,8 @@ RBBINode *RBBINode::cloneTree() {
|
|||
}
|
||||
}
|
||||
}
|
||||
n->fRuleRoot = this->fRuleRoot;
|
||||
n->fChainIn = this->fChainIn;
|
||||
return n;
|
||||
}
|
||||
|
||||
|
@ -272,6 +278,12 @@ void RBBINode::findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &s
|
|||
//
|
||||
//-------------------------------------------------------------------------
|
||||
#ifdef RBBI_DEBUG
|
||||
|
||||
static int32_t serial(const RBBINode *node) {
|
||||
return (node == NULL? -1 : node->fSerialNum);
|
||||
}
|
||||
|
||||
|
||||
void RBBINode::printNode() {
|
||||
static const char * const nodeTypeNames[] = {
|
||||
"setRef",
|
||||
|
@ -295,9 +307,10 @@ void RBBINode::printNode() {
|
|||
if (this==NULL) {
|
||||
RBBIDebugPrintf("%10p", (void *)this);
|
||||
} else {
|
||||
RBBIDebugPrintf("%10p %12s %10p %10p %10p %4d %6d %d ",
|
||||
(void *)this, nodeTypeNames[fType], (void *)fParent, (void *)fLeftChild, (void *)fRightChild,
|
||||
fSerialNum, fFirstPos, fVal);
|
||||
RBBIDebugPrintf("%10p %5d %12s %c%c %5d %5d %5d %6d %d ",
|
||||
(void *)this, fSerialNum, nodeTypeNames[fType], fRuleRoot?'R':' ', fChainIn?'C':' ',
|
||||
serial(fLeftChild), serial(fRightChild), serial(fParent),
|
||||
fFirstPos, fVal);
|
||||
if (fType == varRef) {
|
||||
RBBI_DEBUG_printUnicodeString(fText);
|
||||
}
|
||||
|
@ -328,11 +341,13 @@ U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth)
|
|||
//
|
||||
//-------------------------------------------------------------------------
|
||||
#ifdef RBBI_DEBUG
|
||||
// Debug only: print the one-line column heading (Address, serial, type,
// LeftChild, RightChild, Parent, position, value) used above node dumps.
void RBBINode::printNodeHeader() {
|
||||
RBBIDebugPrintf(" Address serial type LeftChild RightChild Parent position value\n");
|
||||
}
|
||||
|
||||
void RBBINode::printTree(UBool printHeading) {
|
||||
if (printHeading) {
|
||||
RBBIDebugPrintf( "-------------------------------------------------------------------\n"
|
||||
" Address type Parent LeftChild RightChild serial position value\n"
|
||||
);
|
||||
printNodeHeader();
|
||||
}
|
||||
this->printNode();
|
||||
if (this != NULL) {
|
||||
|
|
|
@ -80,6 +80,10 @@ class RBBINode : public UMemory {
|
|||
UBool fLookAheadEnd; // For endMark nodes, set TRUE if
|
||||
// marking the end of a look-ahead rule.
|
||||
|
||||
UBool fRuleRoot; // True if this node is the root of a rule.
|
||||
UBool fChainIn; // True if chaining into this rule is allowed
|
||||
// (no '^' present).
|
||||
|
||||
UVector *fFirstPosSet;
|
||||
UVector *fLastPosSet; // TODO: rename fFirstPos & fLastPos to avoid confusion.
|
||||
UVector *fFollowPos;
|
||||
|
@ -95,6 +99,7 @@ class RBBINode : public UMemory {
|
|||
void findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status);
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
static void printNodeHeader();
|
||||
void printNode();
|
||||
void printTree(UBool withHeading);
|
||||
#endif
|
||||
|
@ -104,6 +109,7 @@ class RBBINode : public UMemory {
|
|||
UBool operator == (const RBBINode &other); // Private, so these functions won't accidently be used.
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
public:
|
||||
int fSerialNum; // Debugging aids.
|
||||
#endif
|
||||
};
|
||||
|
|
|
@ -40,6 +40,7 @@ enum RBBI_RuleParseAction {
|
|||
doExprStart,
|
||||
doLParen,
|
||||
doNOP,
|
||||
doNoChain,
|
||||
doOptionEnd,
|
||||
doOptionStart,
|
||||
doReverseDir,
|
||||
|
@ -77,101 +78,109 @@ struct RBBIRuleTableEl {
|
|||
|
||||
static const struct RBBIRuleTableEl gRuleParseStateTable[] = {
|
||||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doExprStart, 254, 21, 8, FALSE} // 1 start
|
||||
, {doExprStart, 254, 29, 9, FALSE} // 1 start
|
||||
, {doNOP, 132, 1,0, TRUE} // 2
|
||||
, {doExprStart, 36 /* $ */, 80, 90, FALSE} // 3
|
||||
, {doNOP, 33 /* ! */, 11,0, TRUE} // 4
|
||||
, {doNOP, 59 /* ; */, 1,0, TRUE} // 5
|
||||
, {doNOP, 252, 0,0, FALSE} // 6
|
||||
, {doExprStart, 255, 21, 8, FALSE} // 7
|
||||
, {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end
|
||||
, {doNOP, 132, 8,0, TRUE} // 9
|
||||
, {doRuleError, 255, 95,0, FALSE} // 10
|
||||
, {doNOP, 33 /* ! */, 13,0, TRUE} // 11 rev-option
|
||||
, {doReverseDir, 255, 20, 8, FALSE} // 12
|
||||
, {doOptionStart, 130, 15,0, TRUE} // 13 option-scan1
|
||||
, {doRuleError, 255, 95,0, FALSE} // 14
|
||||
, {doNOP, 129, 15,0, TRUE} // 15 option-scan2
|
||||
, {doOptionEnd, 255, 17,0, FALSE} // 16
|
||||
, {doNOP, 59 /* ; */, 1,0, TRUE} // 17 option-scan3
|
||||
, {doNOP, 132, 17,0, TRUE} // 18
|
||||
, {doRuleError, 255, 95,0, FALSE} // 19
|
||||
, {doExprStart, 255, 21, 8, FALSE} // 20 reverse-rule
|
||||
, {doRuleChar, 254, 30,0, TRUE} // 21 term
|
||||
, {doNOP, 132, 21,0, TRUE} // 22
|
||||
, {doRuleChar, 131, 30,0, TRUE} // 23
|
||||
, {doNOP, 91 /* [ */, 86, 30, FALSE} // 24
|
||||
, {doLParen, 40 /* ( */, 21, 30, TRUE} // 25
|
||||
, {doNOP, 36 /* $ */, 80, 29, FALSE} // 26
|
||||
, {doDotAny, 46 /* . */, 30,0, TRUE} // 27
|
||||
, {doRuleError, 255, 95,0, FALSE} // 28
|
||||
, {doCheckVarDef, 255, 30,0, FALSE} // 29 term-var-ref
|
||||
, {doNOP, 132, 30,0, TRUE} // 30 expr-mod
|
||||
, {doUnaryOpStar, 42 /* * */, 35,0, TRUE} // 31
|
||||
, {doUnaryOpPlus, 43 /* + */, 35,0, TRUE} // 32
|
||||
, {doUnaryOpQuestion, 63 /* ? */, 35,0, TRUE} // 33
|
||||
, {doNOP, 255, 35,0, FALSE} // 34
|
||||
, {doExprCatOperator, 254, 21,0, FALSE} // 35 expr-cont
|
||||
, {doNOP, 132, 35,0, TRUE} // 36
|
||||
, {doExprCatOperator, 131, 21,0, FALSE} // 37
|
||||
, {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 38
|
||||
, {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 39
|
||||
, {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 40
|
||||
, {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 41
|
||||
, {doExprCatOperator, 47 /* / */, 47,0, FALSE} // 42
|
||||
, {doExprCatOperator, 123 /* { */, 59,0, TRUE} // 43
|
||||
, {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 44
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 45
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 46
|
||||
, {doSlash, 47 /* / */, 49,0, TRUE} // 47 look-ahead
|
||||
, {doNOP, 255, 95,0, FALSE} // 48
|
||||
, {doExprCatOperator, 254, 21,0, FALSE} // 49 expr-cont-no-slash
|
||||
, {doNOP, 132, 35,0, TRUE} // 50
|
||||
, {doExprCatOperator, 131, 21,0, FALSE} // 51
|
||||
, {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 52
|
||||
, {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 53
|
||||
, {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 54
|
||||
, {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 55
|
||||
, {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 56
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 58
|
||||
, {doNOP, 132, 59,0, TRUE} // 59 tag-open
|
||||
, {doStartTagValue, 128, 62,0, FALSE} // 60
|
||||
, {doTagExpectedError, 255, 95,0, FALSE} // 61
|
||||
, {doNOP, 132, 66,0, TRUE} // 62 tag-value
|
||||
, {doNOP, 125 /* } */, 66,0, FALSE} // 63
|
||||
, {doTagDigit, 128, 62,0, TRUE} // 64
|
||||
, {doTagExpectedError, 255, 95,0, FALSE} // 65
|
||||
, {doNOP, 132, 66,0, TRUE} // 66 tag-close
|
||||
, {doTagValue, 125 /* } */, 69,0, TRUE} // 67
|
||||
, {doTagExpectedError, 255, 95,0, FALSE} // 68
|
||||
, {doExprCatOperator, 254, 21,0, FALSE} // 69 expr-cont-no-tag
|
||||
, {doNOP, 132, 69,0, TRUE} // 70
|
||||
, {doExprCatOperator, 131, 21,0, FALSE} // 71
|
||||
, {doExprCatOperator, 91 /* [ */, 21,0, FALSE} // 72
|
||||
, {doExprCatOperator, 40 /* ( */, 21,0, FALSE} // 73
|
||||
, {doExprCatOperator, 36 /* $ */, 21,0, FALSE} // 74
|
||||
, {doExprCatOperator, 46 /* . */, 21,0, FALSE} // 75
|
||||
, {doExprCatOperator, 47 /* / */, 47,0, FALSE} // 76
|
||||
, {doExprOrOperator, 124 /* | */, 21,0, TRUE} // 77
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 78
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 79
|
||||
, {doStartVariableName, 36 /* $ */, 82,0, TRUE} // 80 scan-var-name
|
||||
, {doNOP, 255, 95,0, FALSE} // 81
|
||||
, {doNOP, 130, 84,0, TRUE} // 82 scan-var-start
|
||||
, {doVariableNameExpectedErr, 255, 95,0, FALSE} // 83
|
||||
, {doNOP, 129, 84,0, TRUE} // 84 scan-var-body
|
||||
, {doEndVariableName, 255, 255,0, FALSE} // 85
|
||||
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 86 scan-unicode-set
|
||||
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 87
|
||||
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 88
|
||||
, {doNOP, 255, 95,0, FALSE} // 89
|
||||
, {doNOP, 132, 90,0, TRUE} // 90 assign-or-rule
|
||||
, {doStartAssign, 61 /* = */, 21, 93, TRUE} // 91
|
||||
, {doNOP, 255, 29, 8, FALSE} // 92
|
||||
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 93 assign-end
|
||||
, {doRuleErrorAssignExpr, 255, 95,0, FALSE} // 94
|
||||
, {doExit, 255, 95,0, TRUE} // 95 errorDeath
|
||||
, {doNoChain, 94 /* ^ */, 12, 9, TRUE} // 3
|
||||
, {doExprStart, 36 /* $ */, 88, 98, FALSE} // 4
|
||||
, {doNOP, 33 /* ! */, 19,0, TRUE} // 5
|
||||
, {doNOP, 59 /* ; */, 1,0, TRUE} // 6
|
||||
, {doNOP, 252, 0,0, FALSE} // 7
|
||||
, {doExprStart, 255, 29, 9, FALSE} // 8
|
||||
, {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 9 break-rule-end
|
||||
, {doNOP, 132, 9,0, TRUE} // 10
|
||||
, {doRuleError, 255, 103,0, FALSE} // 11
|
||||
, {doExprStart, 254, 29,0, FALSE} // 12 start-after-caret
|
||||
, {doNOP, 132, 12,0, TRUE} // 13
|
||||
, {doRuleError, 94 /* ^ */, 103,0, FALSE} // 14
|
||||
, {doExprStart, 36 /* $ */, 88, 37, FALSE} // 15
|
||||
, {doRuleError, 59 /* ; */, 103,0, FALSE} // 16
|
||||
, {doRuleError, 252, 103,0, FALSE} // 17
|
||||
, {doExprStart, 255, 29,0, FALSE} // 18
|
||||
, {doNOP, 33 /* ! */, 21,0, TRUE} // 19 rev-option
|
||||
, {doReverseDir, 255, 28, 9, FALSE} // 20
|
||||
, {doOptionStart, 130, 23,0, TRUE} // 21 option-scan1
|
||||
, {doRuleError, 255, 103,0, FALSE} // 22
|
||||
, {doNOP, 129, 23,0, TRUE} // 23 option-scan2
|
||||
, {doOptionEnd, 255, 25,0, FALSE} // 24
|
||||
, {doNOP, 59 /* ; */, 1,0, TRUE} // 25 option-scan3
|
||||
, {doNOP, 132, 25,0, TRUE} // 26
|
||||
, {doRuleError, 255, 103,0, FALSE} // 27
|
||||
, {doExprStart, 255, 29, 9, FALSE} // 28 reverse-rule
|
||||
, {doRuleChar, 254, 38,0, TRUE} // 29 term
|
||||
, {doNOP, 132, 29,0, TRUE} // 30
|
||||
, {doRuleChar, 131, 38,0, TRUE} // 31
|
||||
, {doNOP, 91 /* [ */, 94, 38, FALSE} // 32
|
||||
, {doLParen, 40 /* ( */, 29, 38, TRUE} // 33
|
||||
, {doNOP, 36 /* $ */, 88, 37, FALSE} // 34
|
||||
, {doDotAny, 46 /* . */, 38,0, TRUE} // 35
|
||||
, {doRuleError, 255, 103,0, FALSE} // 36
|
||||
, {doCheckVarDef, 255, 38,0, FALSE} // 37 term-var-ref
|
||||
, {doNOP, 132, 38,0, TRUE} // 38 expr-mod
|
||||
, {doUnaryOpStar, 42 /* * */, 43,0, TRUE} // 39
|
||||
, {doUnaryOpPlus, 43 /* + */, 43,0, TRUE} // 40
|
||||
, {doUnaryOpQuestion, 63 /* ? */, 43,0, TRUE} // 41
|
||||
, {doNOP, 255, 43,0, FALSE} // 42
|
||||
, {doExprCatOperator, 254, 29,0, FALSE} // 43 expr-cont
|
||||
, {doNOP, 132, 43,0, TRUE} // 44
|
||||
, {doExprCatOperator, 131, 29,0, FALSE} // 45
|
||||
, {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 46
|
||||
, {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 47
|
||||
, {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 48
|
||||
, {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 49
|
||||
, {doExprCatOperator, 47 /* / */, 55,0, FALSE} // 50
|
||||
, {doExprCatOperator, 123 /* { */, 67,0, TRUE} // 51
|
||||
, {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 52
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 53
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 54
|
||||
, {doSlash, 47 /* / */, 57,0, TRUE} // 55 look-ahead
|
||||
, {doNOP, 255, 103,0, FALSE} // 56
|
||||
, {doExprCatOperator, 254, 29,0, FALSE} // 57 expr-cont-no-slash
|
||||
, {doNOP, 132, 43,0, TRUE} // 58
|
||||
, {doExprCatOperator, 131, 29,0, FALSE} // 59
|
||||
, {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 60
|
||||
, {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 61
|
||||
, {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 62
|
||||
, {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 63
|
||||
, {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 64
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 65
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 66
|
||||
, {doNOP, 132, 67,0, TRUE} // 67 tag-open
|
||||
, {doStartTagValue, 128, 70,0, FALSE} // 68
|
||||
, {doTagExpectedError, 255, 103,0, FALSE} // 69
|
||||
, {doNOP, 132, 74,0, TRUE} // 70 tag-value
|
||||
, {doNOP, 125 /* } */, 74,0, FALSE} // 71
|
||||
, {doTagDigit, 128, 70,0, TRUE} // 72
|
||||
, {doTagExpectedError, 255, 103,0, FALSE} // 73
|
||||
, {doNOP, 132, 74,0, TRUE} // 74 tag-close
|
||||
, {doTagValue, 125 /* } */, 77,0, TRUE} // 75
|
||||
, {doTagExpectedError, 255, 103,0, FALSE} // 76
|
||||
, {doExprCatOperator, 254, 29,0, FALSE} // 77 expr-cont-no-tag
|
||||
, {doNOP, 132, 77,0, TRUE} // 78
|
||||
, {doExprCatOperator, 131, 29,0, FALSE} // 79
|
||||
, {doExprCatOperator, 91 /* [ */, 29,0, FALSE} // 80
|
||||
, {doExprCatOperator, 40 /* ( */, 29,0, FALSE} // 81
|
||||
, {doExprCatOperator, 36 /* $ */, 29,0, FALSE} // 82
|
||||
, {doExprCatOperator, 46 /* . */, 29,0, FALSE} // 83
|
||||
, {doExprCatOperator, 47 /* / */, 55,0, FALSE} // 84
|
||||
, {doExprOrOperator, 124 /* | */, 29,0, TRUE} // 85
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 86
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 87
|
||||
, {doStartVariableName, 36 /* $ */, 90,0, TRUE} // 88 scan-var-name
|
||||
, {doNOP, 255, 103,0, FALSE} // 89
|
||||
, {doNOP, 130, 92,0, TRUE} // 90 scan-var-start
|
||||
, {doVariableNameExpectedErr, 255, 103,0, FALSE} // 91
|
||||
, {doNOP, 129, 92,0, TRUE} // 92 scan-var-body
|
||||
, {doEndVariableName, 255, 255,0, FALSE} // 93
|
||||
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 94 scan-unicode-set
|
||||
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 95
|
||||
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 96
|
||||
, {doNOP, 255, 103,0, FALSE} // 97
|
||||
, {doNOP, 132, 98,0, TRUE} // 98 assign-or-rule
|
||||
, {doStartAssign, 61 /* = */, 29, 101, TRUE} // 99
|
||||
, {doNOP, 255, 37, 9, FALSE} // 100
|
||||
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 101 assign-end
|
||||
, {doRuleErrorAssignExpr, 255, 103,0, FALSE} // 102
|
||||
, {doExit, 255, 103,0, TRUE} // 103 errorDeath
|
||||
};
|
||||
#ifdef RBBI_DEBUG
|
||||
static const char * const RBBIRuleStateNames[] = { 0,
|
||||
|
@ -181,9 +190,17 @@ static const char * const RBBIRuleStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"break-rule-end",
|
||||
0,
|
||||
0,
|
||||
"start-after-caret",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"rev-option",
|
||||
0,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
|
||||
#*****************************************************************************
|
||||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
#*****************************************************************************
|
||||
|
@ -19,6 +19,7 @@
|
|||
# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
|
||||
# that are then built with the rule parser.
|
||||
#
|
||||
# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
|
||||
|
||||
#
|
||||
# Here is the syntax of the state definitions in this file:
|
||||
|
@ -57,6 +58,7 @@
|
|||
start:
|
||||
escaped term ^break-rule-end doExprStart
|
||||
white_space n start
|
||||
'^' n start-after-caret ^break-rule-end doNoChain
|
||||
'$' scan-var-name ^assign-or-rule doExprStart
|
||||
'!' n rev-option
|
||||
';' n start # ignore empty rules.
|
||||
|
@ -71,7 +73,21 @@ break-rule-end:
|
|||
white_space n break-rule-end
|
||||
default errorDeath doRuleError
|
||||
|
||||
|
||||
#
|
||||
# start of a rule, after having seen a '^' (inhibits rule chain in).
|
||||
# Similar to the main 'start' state in most respects, except
|
||||
# - empty rule is an error.
|
||||
# - A second '^' is an error.
|
||||
#
|
||||
start-after-caret:
|
||||
escaped term doExprStart
|
||||
white_space n start-after-caret
|
||||
'^' errorDeath doRuleError # two '^'s
|
||||
'$' scan-var-name ^term-var-ref doExprStart
|
||||
';' errorDeath doRuleError # ^ ;
|
||||
eof errorDeath doRuleError
|
||||
default term doExprStart
|
||||
|
||||
#
|
||||
# ! We've just scanned a '!', indicating either a !!key word flag or a
|
||||
# !Reverse rule.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
//
|
||||
// file: rbbiscan.cpp
|
||||
//
|
||||
// Copyright (C) 2002-2015, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2002-2016, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains the Rule Based Break Iterator Rule Builder functions for
|
||||
|
@ -87,24 +87,27 @@ U_NAMESPACE_BEGIN
|
|||
RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
|
||||
{
|
||||
fRB = rb;
|
||||
fScanIndex = 0;
|
||||
fNextIndex = 0;
|
||||
fQuoteMode = FALSE;
|
||||
fLineNum = 1;
|
||||
fCharNum = 0;
|
||||
fLastChar = 0;
|
||||
|
||||
fStateTable = NULL;
|
||||
fStack[0] = 0;
|
||||
fStackPtr = 0;
|
||||
fStack[fStackPtr] = 0;
|
||||
fNodeStackPtr = 0;
|
||||
fRuleNum = 0;
|
||||
fNodeStack[0] = NULL;
|
||||
|
||||
fSymbolTable = NULL;
|
||||
fSetTable = NULL;
|
||||
|
||||
fScanIndex = 0;
|
||||
fNextIndex = 0;
|
||||
fNodeStackPtr = 0;
|
||||
|
||||
fReverseRule = FALSE;
|
||||
fLookAheadRule = FALSE;
|
||||
fNoChainInRule = FALSE;
|
||||
|
||||
fLineNum = 1;
|
||||
fCharNum = 0;
|
||||
fQuoteMode = FALSE;
|
||||
fSymbolTable = NULL;
|
||||
fSetTable = NULL;
|
||||
fRuleNum = 0;
|
||||
fOptionStart = 0;
|
||||
|
||||
// Do not check status until after all critical fields are sufficiently initialized
|
||||
// that the destructor can run cleanly.
|
||||
|
@ -205,6 +208,12 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
|
|||
break;
|
||||
|
||||
|
||||
case doNoChain:
|
||||
// Scanned a '^' while on the rule start state.
|
||||
fNoChainInRule = TRUE;
|
||||
break;
|
||||
|
||||
|
||||
case doExprOrOperator:
|
||||
{
|
||||
fixOpStack(RBBINode::precOpCat);
|
||||
|
@ -318,11 +327,11 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
|
|||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
|
||||
#endif
|
||||
U_ASSERT(fNodeStackPtr == 1);
|
||||
RBBINode *thisRule = fNodeStack[fNodeStackPtr];
|
||||
|
||||
// If this rule includes a look-ahead '/', add a endMark node to the
|
||||
// expression tree.
|
||||
if (fLookAheadRule) {
|
||||
RBBINode *thisRule = fNodeStack[fNodeStackPtr];
|
||||
RBBINode *endNode = pushNewNode(RBBINode::endMark);
|
||||
RBBINode *catNode = pushNewNode(RBBINode::opCat);
|
||||
if (U_FAILURE(*fRB->fStatus)) {
|
||||
|
@ -334,8 +343,24 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
|
|||
fNodeStack[fNodeStackPtr] = catNode;
|
||||
endNode->fVal = fRuleNum;
|
||||
endNode->fLookAheadEnd = TRUE;
|
||||
thisRule = catNode;
|
||||
|
||||
// TODO: Disable chaining out of look-ahead (hard break) rules.
|
||||
// The break on rule match is forced, so there is no point in building up
|
||||
// the state table to chain into another rule for a longer match.
|
||||
}
|
||||
|
||||
// Mark this node as being the root of a rule.
|
||||
thisRule->fRuleRoot = TRUE;
|
||||
|
||||
// Flag if chaining into this rule is wanted.
|
||||
//
|
||||
if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain
|
||||
!fNoChainInRule) { // and no '^' chain-in inhibit was on this rule
|
||||
thisRule->fChainIn = TRUE;
|
||||
}
|
||||
|
||||
|
||||
// All rule expressions are ORed together.
|
||||
// The ';' that terminates an expression really just functions as a '|' with
|
||||
// a low operator precedence.
|
||||
|
@ -372,6 +397,7 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
|
|||
}
|
||||
fReverseRule = FALSE; // in preparation for the next rule.
|
||||
fLookAheadRule = FALSE;
|
||||
fNoChainInRule = FALSE;
|
||||
fNodeStackPtr = 0;
|
||||
}
|
||||
break;
|
||||
|
@ -994,7 +1020,7 @@ void RBBIRuleScanner::parse() {
|
|||
|
||||
for (;;) {
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
|
||||
#endif
|
||||
if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {
|
||||
// Table row specified an individual character, not a set, and
|
||||
|
|
|
@ -52,6 +52,7 @@ public:
|
|||
struct RBBIRuleChar {
|
||||
UChar32 fChar;
|
||||
UBool fEscaped;
|
||||
RBBIRuleChar() : fChar(0), fEscaped(FALSE) {};
|
||||
};
|
||||
|
||||
RBBIRuleScanner(RBBIRuleBuilder *rb);
|
||||
|
@ -127,6 +128,8 @@ private:
|
|||
UBool fLookAheadRule; // True if the rule includes a '/'
|
||||
// somewhere within it.
|
||||
|
||||
UBool fNoChainInRule; // True if the current rule starts with a '^'.
|
||||
|
||||
RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
|
||||
// $variable symbols.
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2009, International Business Machines
|
||||
* Copyright (c) 2002-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -78,7 +78,7 @@ void RBBITableBuilder::build() {
|
|||
fTree = fTree->flattenVariables();
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
|
||||
RBBIDebugPuts("Parse tree after flattening variable references.");
|
||||
RBBIDebugPuts("\nParse tree after flattening variable references.");
|
||||
fTree->printTree(TRUE);
|
||||
}
|
||||
#endif
|
||||
|
@ -136,7 +136,7 @@ void RBBITableBuilder::build() {
|
|||
fTree->flattenSets();
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
|
||||
RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
|
||||
RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
|
||||
fTree->printTree(TRUE);
|
||||
}
|
||||
#endif
|
||||
|
@ -375,6 +375,25 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) {
|
|||
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
|
||||
// as roots of a rule to a destination vector.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {   // Appends every fRuleRoot-flagged node under 'node' to 'dest'.
|
||||
if (node == NULL || U_FAILURE(*fStatus)) {   // Stop at an absent child, or once *fStatus already holds a failure.
|
||||
return;
|
||||
}
|
||||
if (node->fRuleRoot) {
|
||||
dest->addElement(node, *fStatus);   // Record this rule root in the destination vector.
|
||||
// Note: rules cannot nest. If we found a rule start node,
|
||||
// no child node can also be a start node.
|
||||
return;
|
||||
}
|
||||
addRuleRootNodes(dest, node->fLeftChild);   // Not a rule root: search the left subtree first,
|
||||
addRuleRootNodes(dest, node->fRightChild);   // then the right subtree.
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -401,19 +420,24 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
|
|||
return;
|
||||
}
|
||||
|
||||
// Get all nodes that can be the start of a match, which is FirstPosition()
|
||||
// of the portion of the tree corresponding to user-written rules.
|
||||
// See the tree description in bofFixup().
|
||||
RBBINode *userRuleRoot = tree;
|
||||
if (fRB->fSetBuilder->sawBOF()) {
|
||||
userRuleRoot = tree->fLeftChild->fRightChild;
|
||||
// Collect all leaf nodes that can start matches for rules
|
||||
// with inbound chaining enabled, which is the union of the
|
||||
// firstPosition sets from each of the rule root nodes.
|
||||
|
||||
UVector ruleRootNodes(*fStatus);
|
||||
addRuleRootNodes(&ruleRootNodes, tree);
|
||||
|
||||
UVector matchStartNodes(*fStatus);
|
||||
for (int i=0; i<ruleRootNodes.size(); ++i) {
|
||||
RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
|
||||
if (node->fChainIn) {
|
||||
setAdd(&matchStartNodes, node->fFirstPosSet);
|
||||
}
|
||||
}
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
U_ASSERT(userRuleRoot != NULL);
|
||||
UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
|
||||
|
||||
|
||||
// Iterate over all leaf nodes,
|
||||
//
|
||||
int32_t endNodeIx;
|
||||
int32_t startNodeIx;
|
||||
|
||||
|
@ -455,8 +479,8 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
|
|||
// Now iterate over the nodes that can start a match, looking for ones
|
||||
// with the same char class as our ending node.
|
||||
RBBINode *startNode;
|
||||
for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
|
||||
startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
|
||||
for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
|
||||
startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
|
||||
if (startNode->fType != RBBINode::leafChar) {
|
||||
continue;
|
||||
}
|
||||
|
@ -1032,6 +1056,8 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
|
|||
if (n==NULL) {
|
||||
return;
|
||||
}
|
||||
printf("\n");
|
||||
RBBINode::printNodeHeader();
|
||||
n->printNode();
|
||||
RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
|
||||
|
||||
|
@ -1141,8 +1167,8 @@ void RBBITableBuilder::exportTable(void *where) {
|
|||
void RBBITableBuilder::printSet(UVector *s) {
|
||||
int32_t i;
|
||||
for (i=0; i<s->size(); i++) {
|
||||
void *v = s->elementAt(i);
|
||||
RBBIDebugPrintf("%10p", v);
|
||||
const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
|
||||
RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2005, International Business Machines
|
||||
* Copyright (c) 2002-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -58,6 +58,8 @@ private:
|
|||
void flagTaggedStates();
|
||||
void mergeRuleStatusVals();
|
||||
|
||||
void addRuleRootNodes(UVector *dest, RBBINode *node);
|
||||
|
||||
// Set functions for UVector.
|
||||
// TODO: make a USet subclass of UVector
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
#
|
||||
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: char.txt
|
||||
#
|
||||
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
|
||||
# These rules are based on UAX #29 Revision 28 (Draft 3) for Unicode Version 9.0
|
||||
#
|
||||
|
||||
#
|
||||
|
@ -14,9 +14,9 @@
|
|||
#
|
||||
$CR = [\p{Grapheme_Cluster_Break = CR}];
|
||||
$LF = [\p{Grapheme_Cluster_Break = LF}];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]];
|
||||
# TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
|
||||
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
|
||||
$Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]];
|
||||
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
|
||||
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
|
||||
|
||||
|
@ -30,10 +30,18 @@ $T = [\p{Grapheme_Cluster_Break = T}];
|
|||
$LV = [\p{Grapheme_Cluster_Break = LV}];
|
||||
$LVT = [\p{Grapheme_Cluster_Break = LVT}];
|
||||
|
||||
# Emoji definitions scraped from http://www.unicode.org/Public/emoji/2.0//emoji-data.txt
|
||||
|
||||
$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
|
||||
$E_Modifier = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$ZWJ = [\u200D];
|
||||
$GAZ = [\U0001F455-\U0001F469\U0001F48B\U0001F5E8\u2764];
|
||||
|
||||
## -------------------------------------------------
|
||||
!!chain;
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
!!forward;
|
||||
|
||||
$CR $LF;
|
||||
|
@ -42,13 +50,24 @@ $L ($L | $V | $LV | $LVT);
|
|||
($LV | $V) ($V | $T);
|
||||
($LVT | $T) $T;
|
||||
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
# GB 8. Keep pairs of regional indicators together
|
||||
# Note that hard break '/' rule triggers only if there are three or more initial RIs,
|
||||
|
||||
[^$Control $CR $LF] $Extend;
|
||||
^$Regional_Indicator $Regional_Indicator / $Regional_Indicator;
|
||||
^$Regional_Indicator $Regional_Indicator;
|
||||
|
||||
# GB 9
|
||||
[^$Control $CR $LF] ($Extend | $ZWJ);
|
||||
|
||||
# GB 9a (only for extended grapheme clusters)
|
||||
[^$Control $CR $LF] $SpacingMark;
|
||||
# TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
|
||||
# GB 9b Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
|
||||
|
||||
# GB9c Emoji proposal
|
||||
($E_Base | $GAZ) $E_Modifier;
|
||||
|
||||
# GB 9d Don't break between ZWJ and Glue_After_Zwj
|
||||
$ZWJ $GAZ;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
|
@ -58,23 +77,29 @@ $LF $CR;
|
|||
($V | $T) ($LV | $V);
|
||||
$T ($LVT | $T);
|
||||
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
# GB 8. Going backwards, we must scan through any number of regional indicators as pairs.
|
||||
#
|
||||
$Regional_Indicator $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)* [{eof}[^$Regional_Indicator]];
|
||||
|
||||
$Extend [^$Control $CR $LF];
|
||||
# GB 9
|
||||
($Extend | $ZWJ) [^$Control $CR $LF]; #note that this will chain into Regional_Indicator when needed.
|
||||
|
||||
# GB 9a
|
||||
$SpacingMark [^$Control $CR $LF];
|
||||
# TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
|
||||
# GB 9b Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
|
||||
|
||||
# GB 9c
|
||||
$E_Modifier ($E_Base | $GAZ);
|
||||
|
||||
# GB 9d Don't break between ZWJ and Glue_After_Zwj
|
||||
$GAZ $ZWJ;
|
||||
|
||||
## -------------------------------------------------
|
||||
# We don't logically need safe char break rules, but if we don't provide any at all
|
||||
# the engine for preceding() and following() will fall back to the
|
||||
# old style inefficient algorithm.
|
||||
|
||||
!!safe_reverse;
|
||||
$LF $CR;
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
$CR $LF;
|
||||
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
# Copyright (c) 2002-2015 International Business Machines Corporation and
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
|
@ -20,8 +23,6 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
|
@ -59,8 +60,13 @@
|
|||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
$EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
|
@ -68,7 +74,7 @@ $B2 = [:LineBreak = Break_Both:];
|
|||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
$CM = [:LineBreak = Combining_Mark:];
|
||||
$CM = [[:LineBreak = Combining_Mark:] \u200d];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
|
@ -77,7 +83,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
|
|||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [:LineBreak = Ideographic:];
|
||||
$ID = [[:LineBreak = Ideographic:][\u2764] - $EB];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
|
@ -99,6 +105,7 @@ $SY = [:LineBreak = Break_Symbols:];
|
|||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [\u200d];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
|
@ -131,7 +138,6 @@ $HLcm = $HL $CM*;
|
|||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
|
@ -160,6 +166,8 @@ $BB $CM+;
|
|||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
|
@ -208,7 +216,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
|||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
|
@ -216,13 +224,13 @@ $CR $LF {100};
|
|||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
$CM+ $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
$CM+ [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
|
@ -233,20 +241,23 @@ $CM+ [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x ID Emoji proposal.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$CM+;
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
$CM+ $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
|
@ -257,13 +268,13 @@ $WJcm $CAN_CM $CM*;
|
|||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
|
||||
$CM+ GLcm;
|
||||
^$CM+ $GLcm;
|
||||
|
||||
|
||||
|
||||
|
@ -272,23 +283,23 @@ $CM+ GLcm;
|
|||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
|
@ -318,12 +329,10 @@ $LB18Breaks = [$LB8Breaks $SP];
|
|||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
$CM+ $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
# TODO: I don't think this rule is needed.
|
||||
|
||||
|
||||
# LB 20
|
||||
|
@ -335,14 +344,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm) [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
|
@ -351,25 +361,25 @@ $SYcm $HLcm;
|
|||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
$IDcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
|
||||
|
||||
# $LB 23
|
||||
$IDcm $POcm;
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm $IDcm;
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
|
||||
|
@ -393,18 +403,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
|||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
|
||||
# LB 30a Do not break between regional indicators.
|
||||
$RIcm $RIcm;
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
|
@ -413,34 +432,36 @@ $RIcm $RIcm;
|
|||
|
||||
!!reverse;
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $BB;
|
||||
$CM+ $B2;
|
||||
$CM+ $CL;
|
||||
$CM+ $CP;
|
||||
$CM+ $EX;
|
||||
$CM+ $GL;
|
||||
$CM+ $HL;
|
||||
$CM+ $HY;
|
||||
$CM+ $H2;
|
||||
$CM+ $H3;
|
||||
$CM+ $ID;
|
||||
$CM+ $IN;
|
||||
$CM+ $IS;
|
||||
$CM+ $JL;
|
||||
$CM+ $JV;
|
||||
$CM+ $JT;
|
||||
$CM+ $NS;
|
||||
$CM+ $NU;
|
||||
$CM+ $OP;
|
||||
$CM+ $PO;
|
||||
$CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $RI;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
$CM+;
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
|
||||
|
||||
#
|
||||
|
@ -452,14 +473,14 @@ $AL_FOLLOW $CM+ / (
|
|||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
|
@ -471,7 +492,7 @@ $AL_FOLLOW $CM+ / (
|
|||
|
||||
|
||||
|
||||
# LB 4, 5, 5
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
|
@ -488,30 +509,37 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $CAN_CM;
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
$CM* $WJ $CM* $CAN_CM;
|
||||
$CM* $WJ [$LB8NonBreaks-$CM];
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CM* $CAN_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CM* $CAN_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
|
@ -532,28 +560,26 @@ $SY [$LB8NonBreaks-$CM];
|
|||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | IY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CM* $CAN_CM $SP* $CM* $OP;
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
|
||||
|
||||
# LB 15
|
||||
$CM* $OP $SP* $CM* $QU;
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$CM* $NS $SP* $CM* ($CL | $CP);
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$CM* $B2 $SP* $CM* $B2;
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
@ -562,82 +588,100 @@ $CM* $B2 $SP* $CM* $B2;
|
|||
#
|
||||
# LB 19
|
||||
#
|
||||
$CM* $QU $CM* $CAN_CM; # . x QU
|
||||
$CM* $QU $LB18NonBreaks;
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CM* $CAN_CM $CM* $QU; # QU x .
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 21
|
||||
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$CM* $HL $CM* $SY;
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$CM* $IN $CM* ($ALPlus | $HL);
|
||||
$CM* $IN $CM* $EX;
|
||||
$CM* $IN $CM* $ID;
|
||||
$CM* $IN $CM* $IN;
|
||||
$CM* $IN $CM* $NU;
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$CM* $PO $CM* $ID;
|
||||
$CM* $NU $CM* ($ALPlus | $HL);
|
||||
$CM* ($ALPlus | $HL) $CM* $NU;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
$CM* $ID $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PO;
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
$CM* ($JT | $JV) $CM* ($H2 | $JV);
|
||||
$CM* $JT $CM* ($H3 | $JT);
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
$CM* ($ALPlus | $HL) $CM* $IS;
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$CM* $OP $CM* ($ALPlus | $HL | $NU);
|
||||
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
$CM* $RI $CM* $RI;
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -658,6 +702,9 @@ $CM* ($HY | $BA) $CM* $HL;
|
|||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -674,6 +721,6 @@ $dictionary $dictionary;
|
|||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
# Copyright (c) 2002-2015 International Business Machines Corporation and
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_fi.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below..
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
|
@ -22,8 +25,6 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
|
@ -61,9 +62,14 @@
|
|||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
$EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [[:LineBreak = Break_After:] - [\u2010]];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010];
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
|
@ -71,7 +77,7 @@ $B2 = [:LineBreak = Break_Both:];
|
|||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
$CM = [:LineBreak = Combining_Mark:];
|
||||
$CM = [[:LineBreak = Combining_Mark:] \u200d];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
|
@ -80,7 +86,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
|
|||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [:LineBreak = Ideographic:];
|
||||
$ID = [[:LineBreak = Ideographic:][\u2764] - $EB];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
|
@ -102,6 +108,7 @@ $SY = [:LineBreak = Break_Symbols:];
|
|||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [\u200d];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
|
@ -135,7 +142,6 @@ $HLcm = $HL $CM*;
|
|||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
|
@ -165,6 +171,8 @@ $BB $CM+;
|
|||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
|
@ -213,7 +221,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
|||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
|
@ -221,13 +229,13 @@ $CR $LF {100};
|
|||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
$CM+ $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
$CM+ [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
|
@ -238,20 +246,23 @@ $CM+ [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x ID Emoji proposal.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$CM+;
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
$CM+ $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
|
@ -262,13 +273,13 @@ $WJcm $CAN_CM $CM*;
|
|||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
|
||||
$CM+ GLcm;
|
||||
^$CM+ $GLcm;
|
||||
|
||||
|
||||
|
||||
|
@ -277,23 +288,23 @@ $CM+ GLcm;
|
|||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
|
@ -323,12 +334,10 @@ $LB18Breaks = [$LB8Breaks $SP];
|
|||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
$CM+ $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
# TODO: I don't think this rule is needed.
|
||||
|
||||
|
||||
# LB 20
|
||||
|
@ -344,6 +353,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
|
||||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
|
||||
($HY | $HH) $AL;
|
||||
^$CM+ ($BAcm | $HYcm | $HHcm | $NScm);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
|
@ -359,25 +369,25 @@ $SYcm $HLcm;
|
|||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
$IDcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
|
||||
|
||||
# LB 23
|
||||
$IDcm $POcm;
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm $IDcm;
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
|
||||
|
@ -401,18 +411,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
|||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
|
||||
# LB 30a Do not break between regional indicators.
|
||||
$RIcm $RIcm;
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
|
@ -421,35 +440,37 @@ $RIcm $RIcm;
|
|||
|
||||
!!reverse;
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $HH;
|
||||
$CM+ $BB;
|
||||
$CM+ $B2;
|
||||
$CM+ $CL;
|
||||
$CM+ $CP;
|
||||
$CM+ $EX;
|
||||
$CM+ $GL;
|
||||
$CM+ $HL;
|
||||
$CM+ $HY;
|
||||
$CM+ $H2;
|
||||
$CM+ $H3;
|
||||
$CM+ $ID;
|
||||
$CM+ $IN;
|
||||
$CM+ $IS;
|
||||
$CM+ $JL;
|
||||
$CM+ $JV;
|
||||
$CM+ $JT;
|
||||
$CM+ $NS;
|
||||
$CM+ $NU;
|
||||
$CM+ $OP;
|
||||
$CM+ $PO;
|
||||
$CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $RI;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
$CM+;
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $HH;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
|
||||
|
||||
#
|
||||
|
@ -461,14 +482,14 @@ $AL_FOLLOW $CM+ / (
|
|||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
|
@ -480,7 +501,7 @@ $AL_FOLLOW $CM+ / (
|
|||
|
||||
|
||||
|
||||
# LB 4, 5, 5
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
|
@ -497,30 +518,37 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $CAN_CM;
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
$CM* $WJ $CM* $CAN_CM;
|
||||
$CM* $WJ [$LB8NonBreaks-$CM];
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CM* $CAN_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CM* $CAN_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
|
@ -541,28 +569,26 @@ $SY [$LB8NonBreaks-$CM];
|
|||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CM* $CAN_CM $SP* $CM* $OP;
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
|
||||
|
||||
# LB 15
|
||||
$CM* $OP $SP* $CM* $QU;
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$CM* $NS $SP* $CM* ($CL | $CP);
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$CM* $B2 $SP* $CM* $B2;
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
@ -571,13 +597,13 @@ $CM* $B2 $SP* $CM* $B2;
|
|||
#
|
||||
# LB 19
|
||||
#
|
||||
$CM* $QU $CM* $CAN_CM; # . x QU
|
||||
$CM* $QU $LB18NonBreaks;
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CM* $CAN_CM $CM* $QU; # QU x .
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
|
@ -587,69 +613,87 @@ $CM* $CAN_CM $CM* $QU; # QU x .
|
|||
$AL ($HY | $HH) / $SP;
|
||||
|
||||
# LB 21
|
||||
$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$CM* $HL $CM* $SY;
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$CM* $IN $CM* ($ALPlus | $HL);
|
||||
$CM* $IN $CM* $EX;
|
||||
$CM* $IN $CM* $ID;
|
||||
$CM* $IN $CM* $IN;
|
||||
$CM* $IN $CM* $NU;
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$CM* $PO $CM* $ID;
|
||||
$CM* $NU $CM* ($ALPlus | $HL);
|
||||
$CM* ($ALPlus | $HL) $CM* $NU;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
$CM* $ID $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PO;
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
$CM* ($JT | $JV) $CM* ($H2 | $JV);
|
||||
$CM* $JT $CM* ($H3 | $JT);
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
$CM* ($ALPlus | $HL) $CM* $IS;
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$CM* $OP $CM* ($ALPlus | $HL | $NU);
|
||||
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
$CM* $RI $CM* $RI;
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -670,6 +714,9 @@ $CM* ($HY | $BA | $HH) $CM* $HL;
|
|||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -686,6 +733,6 @@ $dictionary $dictionary;
|
|||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
# Copyright (c) 2002-2015 International Business Machines Corporation and
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below..
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
|
@ -26,8 +30,6 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
|
@ -65,8 +67,13 @@
|
|||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
$EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
|
@ -74,7 +81,7 @@ $B2 = [:LineBreak = Break_Both:];
|
|||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
$CM = [:LineBreak = Combining_Mark:];
|
||||
$CM = [[:LineBreak = Combining_Mark:] \u200d];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
|
@ -83,7 +90,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
|
|||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ];
|
||||
$ID = [[:LineBreak = Ideographic:]$CJ[\u2764] - $EB];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
|
@ -106,6 +113,7 @@ $SY = [:LineBreak = Break_Symbols:];
|
|||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [\u200d];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
|
@ -138,7 +146,6 @@ $HLcm = $HL $CM*;
|
|||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
|
@ -168,6 +175,8 @@ $BB $CM+;
|
|||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
|
@ -217,7 +226,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
|||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
|
@ -225,13 +234,13 @@ $CR $LF {100};
|
|||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
$CM+ $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
$CM+ [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
|
@ -242,20 +251,23 @@ $CM+ [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x ID Emoji proposal.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$CM+;
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
$CM+ $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
|
@ -266,13 +278,13 @@ $WJcm $CAN_CM $CM*;
|
|||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
|
||||
$CM+ GLcm;
|
||||
^$CM+ $GLcm;
|
||||
|
||||
|
||||
|
||||
|
@ -281,23 +293,23 @@ $CM+ GLcm;
|
|||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
|
@ -329,12 +341,10 @@ $LB18Breaks = [$LB8Breaks $SP];
|
|||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
$CM+ $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
# TODO: I don't think this rule is needed.
|
||||
|
||||
|
||||
# LB 20
|
||||
|
@ -347,14 +357,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# BB x
|
||||
#
|
||||
# DO allow breaks here before NSXcm, so don't include it
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm) [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
|
@ -363,25 +374,25 @@ $SYcm $HLcm;
|
|||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
$IDcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
# $INcm $INcm; # delete this rule for CSS loose
|
||||
$NUcm $INcm;
|
||||
|
||||
|
||||
# LB 23
|
||||
$IDcm $POcm;
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm $IDcm;
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
|
||||
|
@ -405,18 +416,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
|||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
|
||||
# LB 30a Do not break between regional indicators.
|
||||
$RIcm $RIcm;
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
|
@ -425,35 +445,37 @@ $RIcm $RIcm;
|
|||
|
||||
!!reverse;
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $BB;
|
||||
$CM+ $B2;
|
||||
$CM+ $CL;
|
||||
$CM+ $CP;
|
||||
$CM+ $EX;
|
||||
$CM+ $GL;
|
||||
$CM+ $HL;
|
||||
$CM+ $HY;
|
||||
$CM+ $H2;
|
||||
$CM+ $H3;
|
||||
$CM+ $ID;
|
||||
$CM+ $IN;
|
||||
$CM+ $IS;
|
||||
$CM+ $JL;
|
||||
$CM+ $JV;
|
||||
$CM+ $JT;
|
||||
$CM+ $NS;
|
||||
$CM+ $NSX;
|
||||
$CM+ $NU;
|
||||
$CM+ $OP;
|
||||
$CM+ $PO;
|
||||
$CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $RI;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
$CM+;
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NSX;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
|
||||
|
||||
#
|
||||
|
@ -465,14 +487,14 @@ $AL_FOLLOW $CM+ / (
|
|||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to surpress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
|
@ -484,7 +506,7 @@ $AL_FOLLOW $CM+ / (
|
|||
|
||||
|
||||
|
||||
# LB 4, 5, 5
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
|
@ -501,30 +523,37 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $CAN_CM;
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
$CM* $WJ $CM* $CAN_CM;
|
||||
$CM* $WJ [$LB8NonBreaks-$CM];
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CM* $CAN_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CM* $CAN_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
|
@ -545,29 +574,27 @@ $SY [$LB8NonBreaks-$CM];
|
|||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CM* $CAN_CM $SP* $CM* $OP;
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
|
||||
|
||||
# LB 15
|
||||
$CM* $OP $SP* $CM* $QU;
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
# Don't include $NSX here
|
||||
$CM* $NS $SP* $CM* ($CL | $CP);
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$CM* $B2 $SP* $CM* $B2;
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
@ -576,13 +603,13 @@ $CM* $B2 $SP* $CM* $B2;
|
|||
#
|
||||
# LB 19
|
||||
#
|
||||
$CM* $QU $CM* $CAN_CM; # . x QU
|
||||
$CM* $QU $LB18NonBreaks;
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CM* $CAN_CM $CM* $QU; # QU x .
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
|
@ -590,69 +617,88 @@ $CM* $CAN_CM $CM* $QU; # QU x .
|
|||
|
||||
# LB 21
|
||||
# Don't include $NSX here
|
||||
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$CM* $HL $CM* $SY;
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$CM* $IN $CM* ($ALPlus | $HL);
|
||||
$CM* $IN $CM* $EX;
|
||||
$CM* $IN $CM* $ID;
|
||||
# $CM* $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$CM* $IN $CM* $NU;
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$CM* $PO $CM* $ID;
|
||||
$CM* $NU $CM* ($ALPlus | $HL);
|
||||
$CM* ($ALPlus | $HL) $CM* $NU;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
$CM* $ID $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PO;
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
$CM* ($JT | $JV) $CM* ($H2 | $JV);
|
||||
$CM* $JT $CM* ($H3 | $JT);
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
$CM* ($ALPlus | $HL) $CM* $IS;
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$CM* $OP $CM* ($ALPlus | $HL | $NU);
|
||||
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
$CM* $RI $CM* $RI;
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
# Line Loose tailoring: Don't include NSX here.
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -673,6 +719,9 @@ $CM* ($HY | $BA) $CM* $HL;
|
|||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -689,6 +738,6 @@ $dictionary $dictionary;
|
|||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
# Copyright (c) 2002-2015 International Business Machines Corporation and
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose_cj.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
|
@ -33,8 +37,6 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
|
@ -72,8 +74,13 @@
|
|||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
$EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BAX = [\u2010 \u2013];
|
||||
$BA = [[:LineBreak = Break_After:] - $BAX];
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
|
@ -82,7 +89,7 @@ $B2 = [:LineBreak = Break_Both:];
|
|||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
$CM = [:LineBreak = Combining_Mark:];
|
||||
$CM = [[:LineBreak = Combining_Mark:] \u200d];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EXX = [\uFF01 \uFF1F];
|
||||
|
@ -92,7 +99,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
|
|||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ];
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
|
@ -117,6 +124,7 @@ $SY = [:LineBreak = Break_Symbols:];
|
|||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [\u200d];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
|
@ -151,7 +159,6 @@ $HLcm = $HL $CM*;
|
|||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
|
@ -184,6 +191,8 @@ $BB $CM+;
|
|||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$EXX $CM+;
|
||||
$GL $CM+;
|
||||
|
@ -236,7 +245,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
|||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
|
@ -244,13 +253,13 @@ $CR $LF {100};
|
|||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
$CM+ $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
$CM+ [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
|
@ -261,20 +270,23 @@ $CM+ [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x ID Emoji proposal.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$CM+;
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
$CM+ $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
|
@ -285,14 +297,13 @@ $WJcm $CAN_CM $CM*;
|
|||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
|
||||
$CM+ GLcm;
|
||||
|
||||
^$CM+ $GLcm;
|
||||
|
||||
|
||||
#
|
||||
|
@ -301,23 +312,23 @@ $CM+ GLcm;
|
|||
# Do not include $EXX here
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
|
@ -349,12 +360,10 @@ $LB18Breaks = [$LB8Breaks $SP];
|
|||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
$CM+ $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
# TODO: I don't think this rule is needed.
|
||||
|
||||
|
||||
# LB 20
|
||||
|
@ -368,13 +377,14 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
#
|
||||
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
|
@ -383,19 +393,19 @@ $SYcm $HLcm;
|
|||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
$IDcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
# $INcm $INcm; # delete this rule for CSS loose
|
||||
$NUcm $INcm;
|
||||
|
||||
|
||||
# LB 23
|
||||
# $LB 23
|
||||
# Do not include $POX here
|
||||
$IDcm $POcm;
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
|
||||
|
@ -403,7 +413,7 @@ $NUcm $HLcm;
|
|||
# LB 24
|
||||
#
|
||||
# Do not include $PRX here
|
||||
$PRcm $IDcm;
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
($POcm | $POXcm) ($ALcm | $HLcm);
|
||||
|
||||
|
@ -429,18 +439,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
|||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
|
||||
# LB 30a Do not break between regional indicators.
|
||||
$RIcm $RIcm;
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
|
@ -449,39 +468,41 @@ $RIcm $RIcm;
|
|||
|
||||
!!reverse;
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $BAX;
|
||||
$CM+ $BB;
|
||||
$CM+ $B2;
|
||||
$CM+ $CL;
|
||||
$CM+ $CP;
|
||||
$CM+ $EX;
|
||||
$CM+ $EXX;
|
||||
$CM+ $GL;
|
||||
$CM+ $HL;
|
||||
$CM+ $HY;
|
||||
$CM+ $H2;
|
||||
$CM+ $H3;
|
||||
$CM+ $ID;
|
||||
$CM+ $IN;
|
||||
$CM+ $IS;
|
||||
$CM+ $JL;
|
||||
$CM+ $JV;
|
||||
$CM+ $JT;
|
||||
$CM+ $NS;
|
||||
$CM+ $NSX;
|
||||
$CM+ $NU;
|
||||
$CM+ $OP;
|
||||
$CM+ $PO;
|
||||
$CM+ $POX;
|
||||
$CM+ $PR;
|
||||
$CM+ $PRX;
|
||||
$CM+ $QU;
|
||||
$CM+ $RI;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
$CM+;
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BAX;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $EXX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NSX;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $POX;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $PRX;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
|
||||
|
||||
#
|
||||
|
@ -493,14 +514,14 @@ $AL_FOLLOW $CM+ / (
|
|||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
|
@ -512,7 +533,7 @@ $AL_FOLLOW $CM+ / (
|
|||
|
||||
|
||||
|
||||
# LB 4, 5, 5
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
|
@ -529,30 +550,37 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $CAN_CM;
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
$CM* $WJ $CM* $CAN_CM;
|
||||
$CM* $WJ [$LB8NonBreaks-$CM];
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CM* $CAN_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CM* $CAN_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
|
@ -574,29 +602,27 @@ $SY [$LB8NonBreaks-$CM];
|
|||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CM* $CAN_CM $SP* $CM* $OP;
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
|
||||
|
||||
# LB 15
|
||||
$CM* $OP $SP* $CM* $QU;
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
# Don't include $NSX here
|
||||
$CM* $NS $SP* $CM* ($CL | $CP);
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$CM* $B2 $SP* $CM* $B2;
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
@ -605,13 +631,13 @@ $CM* $B2 $SP* $CM* $B2;
|
|||
#
|
||||
# LB 19
|
||||
#
|
||||
$CM* $QU $CM* $CAN_CM; # . x QU
|
||||
$CM* $QU $LB18NonBreaks;
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CM* $CAN_CM $CM* $QU; # QU x .
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
|
@ -619,73 +645,90 @@ $CM* $CAN_CM $CM* $QU; # QU x .
|
|||
|
||||
# LB 21
|
||||
# Don't include $BAX or $NSX here
|
||||
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$CM* $HL $CM* $SY;
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$CM* $IN $CM* ($ALPlus | $HL);
|
||||
$CM* $IN $CM* $EX;
|
||||
$CM* $IN $CM* $ID;
|
||||
# $CM* $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$CM* $IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
# Do not include $POX here
|
||||
$CM* $PO $CM* $ID;
|
||||
$CM* $NU $CM* ($ALPlus | $HL);
|
||||
$CM* ($ALPlus | $HL) $CM* $NU;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
# Do not include $PRX here
|
||||
$CM* $ID $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* ($PO | $POX);
|
||||
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* ($PO | $POX);
|
||||
|
||||
# LB 25
|
||||
# Here do not include $POX at the beginning or $PRX at the end
|
||||
($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?;
|
||||
|
||||
# LB 26
|
||||
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
$CM* ($JT | $JV) $CM* ($H2 | $JV);
|
||||
$CM* $JT $CM* ($H3 | $JT);
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
# Do not include $POX or $PRX here
|
||||
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
$CM* ($ALPlus | $HL) $CM* $IS;
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$CM* $OP $CM* ($ALPlus | $HL | $NU);
|
||||
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
$CM* $RI $CM* $RI;
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -706,6 +749,9 @@ $CM* ($HY | $BA | $BAX) $CM* $HL;
|
|||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -722,6 +768,6 @@ $dictionary $dictionary;
|
|||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $dictionary];
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
# Copyright (c) 2002-2015 International Business Machines Corporation and
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose_fi.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below..
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
|
@ -24,8 +28,6 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
|
@ -63,8 +65,13 @@
|
|||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
$EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BA = [[:LineBreak = Break_After:] - [\u2010]];
|
||||
$HH = [\u2010];
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
|
@ -73,7 +80,7 @@ $B2 = [:LineBreak = Break_Both:];
|
|||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
$CM = [:LineBreak = Combining_Mark:];
|
||||
$CM = [[:LineBreak = Combining_Mark:] \u200d];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
|
@ -82,7 +89,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
|
|||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ];
|
||||
$ID = [[:LineBreak = Ideographic:]$CJ[\u2764] - $EB];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
|
@ -105,6 +112,7 @@ $SY = [:LineBreak = Break_Symbols:];
|
|||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [\u200d];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
|
@ -138,7 +146,6 @@ $HLcm = $HL $CM*;
|
|||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
|
@ -169,6 +176,8 @@ $BB $CM+;
|
|||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
|
@ -218,7 +227,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
|||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
|
@ -226,13 +235,13 @@ $CR $LF {100};
|
|||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
$CM+ $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
$CM+ [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
|
@ -243,20 +252,23 @@ $CM+ [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x ID Emoji proposal.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$CM+;
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
$CM+ $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
|
@ -267,13 +279,13 @@ $WJcm $CAN_CM $CM*;
|
|||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
|
||||
$CM+ GLcm;
|
||||
^$CM+ $GLcm;
|
||||
|
||||
|
||||
|
||||
|
@ -282,23 +294,23 @@ $CM+ GLcm;
|
|||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
|
@ -330,13 +342,10 @@ $LB18Breaks = [$LB8Breaks $SP];
|
|||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
$CM+ $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
# TODO: I don't think this rule is needed.
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -352,13 +361,14 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL;
|
||||
$LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm);
|
||||
($HY | $HH) $AL;
|
||||
^$CM+ ($BAcm | $HHcm | $HYcm | $NScm);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
|
@ -367,25 +377,25 @@ $SYcm $HLcm;
|
|||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
$IDcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
|
||||
|
||||
# $LB 23
|
||||
$IDcm $POcm;
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm $IDcm;
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
|
||||
|
@ -409,18 +419,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
|||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
|
||||
# LB 30a Do not break between regional indicators.
|
||||
$RIcm $RIcm;
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HH $HY $NS $NSX {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
|
@ -429,36 +448,38 @@ $RIcm $RIcm;
|
|||
|
||||
!!reverse;
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $HH;
|
||||
$CM+ $BB;
|
||||
$CM+ $B2;
|
||||
$CM+ $CL;
|
||||
$CM+ $CP;
|
||||
$CM+ $EX;
|
||||
$CM+ $GL;
|
||||
$CM+ $HL;
|
||||
$CM+ $HY;
|
||||
$CM+ $H2;
|
||||
$CM+ $H3;
|
||||
$CM+ $ID;
|
||||
$CM+ $IN;
|
||||
$CM+ $IS;
|
||||
$CM+ $JL;
|
||||
$CM+ $JV;
|
||||
$CM+ $JT;
|
||||
$CM+ $NS;
|
||||
$CM+ $NSX;
|
||||
$CM+ $NU;
|
||||
$CM+ $OP;
|
||||
$CM+ $PO;
|
||||
$CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $RI;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
$CM+;
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HH;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NSX;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
|
||||
|
||||
#
|
||||
|
@ -470,14 +491,14 @@ $AL_FOLLOW $CM+ / (
|
|||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
|
@ -489,7 +510,7 @@ $AL_FOLLOW $CM+ / (
|
|||
|
||||
|
||||
|
||||
# LB 4, 5, 5
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
|
@ -506,30 +527,37 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $CAN_CM;
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
$CM* $WJ $CM* $CAN_CM;
|
||||
$CM* $WJ [$LB8NonBreaks-$CM];
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CM* $CAN_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CM* $CAN_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
|
@ -550,29 +578,27 @@ $SY [$LB8NonBreaks-$CM];
|
|||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CM* $CAN_CM $SP* $CM* $OP;
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
|
||||
|
||||
# LB 15
|
||||
$CM* $OP $SP* $CM* $QU;
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
# Don't include $NSX here
|
||||
$CM* $NS $SP* $CM* ($CL | $CP);
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$CM* $B2 $SP* $CM* $B2;
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
@ -581,13 +607,13 @@ $CM* $B2 $SP* $CM* $B2;
|
|||
#
|
||||
# LB 19
|
||||
#
|
||||
$CM* $QU $CM* $CAN_CM; # . x QU
|
||||
$CM* $QU $LB18NonBreaks;
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CM* $CAN_CM $CM* $QU; # QU x .
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
|
@ -598,69 +624,87 @@ $AL ($HY | $HH) / $SP;
|
|||
|
||||
# LB 21
|
||||
# Don't include $NSX here
|
||||
$CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$CM* $HL $CM* $SY;
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$CM* $IN $CM* ($ALPlus | $HL);
|
||||
$CM* $IN $CM* $EX;
|
||||
$CM* $IN $CM* $ID;
|
||||
$CM* $IN $CM* $IN;
|
||||
$CM* $IN $CM* $NU;
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$CM* $PO $CM* $ID;
|
||||
$CM* $NU $CM* ($ALPlus | $HL);
|
||||
$CM* ($ALPlus | $HL) $CM* $NU;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
$CM* $ID $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PO;
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
$CM* ($JT | $JV) $CM* ($H2 | $JV);
|
||||
$CM* $JT $CM* ($H3 | $JT);
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
$CM* ($ALPlus | $HL) $CM* $IS;
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$CM* $OP $CM* ($ALPlus | $HL | $NU);
|
||||
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
$CM* $RI $CM* $RI;
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $NSX $HY $BA $HH $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -675,12 +719,15 @@ $SP+ $CM* ($CL | $CP);
|
|||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -697,6 +744,6 @@ $dictionary $dictionary;
|
|||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $HH $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
# Copyright (c) 2002-2015 International Business Machines Corporation and
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below..
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
|
@ -23,8 +27,6 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
|
@ -62,8 +64,13 @@
|
|||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
$EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
|
@ -71,7 +78,7 @@ $B2 = [:LineBreak = Break_Both:];
|
|||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
$CM = [:LineBreak = Combining_Mark:];
|
||||
$CM = [[:LineBreak = Combining_Mark:] \u200d];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
|
@ -80,7 +87,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
|
|||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ];
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
|
@ -102,6 +109,7 @@ $SY = [:LineBreak = Break_Symbols:];
|
|||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [\u200d];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
|
@ -134,7 +142,6 @@ $HLcm = $HL $CM*;
|
|||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
|
@ -163,6 +170,8 @@ $BB $CM+;
|
|||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
|
@ -211,7 +220,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
|||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
|
@ -219,13 +228,13 @@ $CR $LF {100};
|
|||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
$CM+ $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
$CM+ [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
|
@ -236,20 +245,23 @@ $CM+ [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x ID Emoji proposal.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$CM+;
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
$CM+ $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
|
@ -260,13 +272,13 @@ $WJcm $CAN_CM $CM*;
|
|||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
|
||||
$CM+ GLcm;
|
||||
^$CM+ $GLcm;
|
||||
|
||||
|
||||
|
||||
|
@ -275,23 +287,23 @@ $CM+ GLcm;
|
|||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
|
@ -321,12 +333,10 @@ $LB18Breaks = [$LB8Breaks $SP];
|
|||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
$CM+ $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
# TODO: I don't think this rule is needed.
|
||||
|
||||
|
||||
# LB 20
|
||||
|
@ -338,14 +348,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm) [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
|
@ -354,25 +365,25 @@ $SYcm $HLcm;
|
|||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
$IDcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
|
||||
|
||||
# LB 23
|
||||
$IDcm $POcm;
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm $IDcm;
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
|
||||
|
@ -396,18 +407,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
|||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
|
||||
# LB 30a Do not break between regional indicators.
|
||||
$RIcm $RIcm;
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
|
@ -416,34 +436,36 @@ $RIcm $RIcm;
|
|||
|
||||
!!reverse;
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $BB;
|
||||
$CM+ $B2;
|
||||
$CM+ $CL;
|
||||
$CM+ $CP;
|
||||
$CM+ $EX;
|
||||
$CM+ $GL;
|
||||
$CM+ $HL;
|
||||
$CM+ $HY;
|
||||
$CM+ $H2;
|
||||
$CM+ $H3;
|
||||
$CM+ $ID;
|
||||
$CM+ $IN;
|
||||
$CM+ $IS;
|
||||
$CM+ $JL;
|
||||
$CM+ $JV;
|
||||
$CM+ $JT;
|
||||
$CM+ $NS;
|
||||
$CM+ $NU;
|
||||
$CM+ $OP;
|
||||
$CM+ $PO;
|
||||
$CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $RI;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
$CM+;
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
|
||||
|
||||
#
|
||||
|
@ -455,14 +477,14 @@ $AL_FOLLOW $CM+ / (
|
|||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
|
@ -474,7 +496,7 @@ $AL_FOLLOW $CM+ / (
|
|||
|
||||
|
||||
|
||||
# LB 4, 5, 5
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
|
@ -491,30 +513,37 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $CAN_CM;
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
$CM* $WJ $CM* $CAN_CM;
|
||||
$CM* $WJ [$LB8NonBreaks-$CM];
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CM* $CAN_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CM* $CAN_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
|
@ -535,28 +564,26 @@ $SY [$LB8NonBreaks-$CM];
|
|||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CM* $CAN_CM $SP* $CM* $OP;
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
|
||||
|
||||
# LB 15
|
||||
$CM* $OP $SP* $CM* $QU;
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$CM* $NS $SP* $CM* ($CL | $CP);
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$CM* $B2 $SP* $CM* $B2;
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
@ -565,82 +592,100 @@ $CM* $B2 $SP* $CM* $B2;
|
|||
#
|
||||
# LB 19
|
||||
#
|
||||
$CM* $QU $CM* $CAN_CM; # . x QU
|
||||
$CM* $QU $LB18NonBreaks;
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CM* $CAN_CM $CM* $QU; # QU x .
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 21
|
||||
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$CM* $HL $CM* $SY;
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$CM* $IN $CM* ($ALPlus | $HL);
|
||||
$CM* $IN $CM* $EX;
|
||||
$CM* $IN $CM* $ID;
|
||||
$CM* $IN $CM* $IN;
|
||||
$CM* $IN $CM* $NU;
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$CM* $PO $CM* $ID;
|
||||
$CM* $NU $CM* ($ALPlus | $HL);
|
||||
$CM* ($ALPlus | $HL) $CM* $NU;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
$CM* $ID $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PO;
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
$CM* ($JT | $JV) $CM* ($H2 | $JV);
|
||||
$CM* $JT $CM* ($H3 | $JT);
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
$CM* ($ALPlus | $HL) $CM* $IS;
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$CM* $OP $CM* ($ALPlus | $HL | $NU);
|
||||
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
$CM* $RI $CM* $RI;
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -661,6 +706,9 @@ $CM* ($HY | $BA) $CM* $HL;
|
|||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -677,6 +725,6 @@ $dictionary $dictionary;
|
|||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
# Copyright (c) 2002-2015 International Business Machines Corporation and
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal_cj.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below..
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
|
@ -24,8 +28,6 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
|
@ -63,8 +65,13 @@
|
|||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
# Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
|
||||
|
||||
$EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
$EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$AL = [[:LineBreak = Alphabetic:] - [$EM\u2764]];
|
||||
$BAX = [\u2010 \u2013];
|
||||
$BA = [[:LineBreak = Break_After:] - $BAX];
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
|
@ -73,7 +80,7 @@ $B2 = [:LineBreak = Break_Both:];
|
|||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
$CM = [:LineBreak = Combining_Mark:];
|
||||
$CM = [[:LineBreak = Combining_Mark:] \u200d];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
|
@ -82,7 +89,7 @@ $HL = [:LineBreak = Hebrew_Letter:];
|
|||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ];
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ [\u2764] - $EB];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
|
@ -105,6 +112,7 @@ $SY = [:LineBreak = Break_Symbols:];
|
|||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [\u200d];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
|
@ -138,7 +146,6 @@ $HLcm = $HL $CM*;
|
|||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
|
@ -169,6 +176,8 @@ $BB $CM+;
|
|||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$CP $CM+;
|
||||
$EB $CM+;
|
||||
$EM $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
|
@ -218,7 +227,7 @@ $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
|||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
|
@ -226,13 +235,13 @@ $CR $LF {100};
|
|||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
$CM+ $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
$CM+ [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
|
@ -243,20 +252,23 @@ $CM+ [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x ID Emoji proposal.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$CM+;
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
$CM+ $WJcm;
|
||||
^$CM+ $WJcm;
|
||||
|
||||
$WJcm $CANT_CM;
|
||||
$WJcm $CAN_CM $CM*;
|
||||
|
@ -267,13 +279,13 @@ $WJcm $CAN_CM $CM*;
|
|||
#
|
||||
$GLcm $CAN_CM $CM*;
|
||||
$GLcm $CANT_CM;
|
||||
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
|
||||
$CM+ GLcm;
|
||||
^$CM+ $GLcm;
|
||||
|
||||
|
||||
|
||||
|
@ -282,23 +294,23 @@ $CM+ GLcm;
|
|||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
|
@ -330,12 +342,10 @@ $LB18Breaks = [$LB8Breaks $SP];
|
|||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
$CM+ $QUcm;
|
||||
^$CM+ $QUcm;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
# TODO: I don't think this rule is needed.
|
||||
|
||||
|
||||
# LB 20
|
||||
|
@ -348,14 +358,15 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
^$CM+ ($BAcm | $HYcm | $NScm);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
|
@ -364,25 +375,25 @@ $SYcm $HLcm;
|
|||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EXcm $INcm;
|
||||
$IDcm $INcm;
|
||||
($ID | $EB | $EM) $CM* $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
|
||||
|
||||
# $LB 23
|
||||
$IDcm $POcm;
|
||||
($ID | $EB | $EM) $CM* $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
^$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm $IDcm;
|
||||
$PRcm ($ID | $EB | $EM);
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
|
||||
|
@ -406,18 +417,27 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
|||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
^$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $HLcm);
|
||||
|
||||
# LB 30
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
^$CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CPcm ($ALcm | $HLcm | $NUcm);
|
||||
|
||||
# LB 30a Do not break between regional indicators.
|
||||
$RIcm $RIcm;
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RIcm $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RIcm $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RIcm $RIcm $ZWJ ($ID | $EB | $EM);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
|
@ -426,36 +446,38 @@ $RIcm $RIcm;
|
|||
|
||||
!!reverse;
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $BAX;
|
||||
$CM+ $BB;
|
||||
$CM+ $B2;
|
||||
$CM+ $CL;
|
||||
$CM+ $CP;
|
||||
$CM+ $EX;
|
||||
$CM+ $GL;
|
||||
$CM+ $HL;
|
||||
$CM+ $HY;
|
||||
$CM+ $H2;
|
||||
$CM+ $H3;
|
||||
$CM+ $ID;
|
||||
$CM+ $IN;
|
||||
$CM+ $IS;
|
||||
$CM+ $JL;
|
||||
$CM+ $JV;
|
||||
$CM+ $JT;
|
||||
$CM+ $NS;
|
||||
$CM+ $NSX;
|
||||
$CM+ $NU;
|
||||
$CM+ $OP;
|
||||
$CM+ $PO;
|
||||
$CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $RI;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
$CM+;
|
||||
^$CM+ $ALPlus;
|
||||
^$CM+ $BA;
|
||||
^$CM+ $BAX;
|
||||
^$CM+ $BB;
|
||||
^$CM+ $B2;
|
||||
^$CM+ $CL;
|
||||
^$CM+ $CP;
|
||||
^$CM+ $EB;
|
||||
^$CM+ $EM;
|
||||
^$CM+ $EX;
|
||||
^$CM+ $GL;
|
||||
^$CM+ $HL;
|
||||
^$CM+ $HY;
|
||||
^$CM+ $H2;
|
||||
^$CM+ $H3;
|
||||
^$CM+ $ID;
|
||||
^$CM+ $IN;
|
||||
^$CM+ $IS;
|
||||
^$CM+ $JL;
|
||||
^$CM+ $JV;
|
||||
^$CM+ $JT;
|
||||
^$CM+ $NS;
|
||||
^$CM+ $NSX;
|
||||
^$CM+ $NU;
|
||||
^$CM+ $OP;
|
||||
^$CM+ $PO;
|
||||
^$CM+ $PR;
|
||||
^$CM+ $QU;
|
||||
^$CM+ $RI;
|
||||
^$CM+ $SY;
|
||||
^$CM+ $WJ;
|
||||
^$CM+;
|
||||
|
||||
|
||||
#
|
||||
|
@ -467,14 +489,14 @@ $AL_FOLLOW $CM+ / (
|
|||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
|
@ -486,7 +508,7 @@ $AL_FOLLOW $CM+ / (
|
|||
|
||||
|
||||
|
||||
# LB 4, 5, 5
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
|
@ -503,30 +525,37 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $CAN_CM;
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
$CM* $WJ $CM* $CAN_CM;
|
||||
$CM* $WJ [$LB8NonBreaks-$CM];
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CM* $CAN_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CM* $CAN_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
|
@ -547,29 +576,27 @@ $SY [$LB8NonBreaks-$CM];
|
|||
# OP SP+ ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CM* $CAN_CM $SP* $CM* $OP;
|
||||
$CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
|
||||
|
||||
# LB 15
|
||||
$CM* $OP $SP* $CM* $QU;
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
# Don't include $NSX here
|
||||
$CM* $NS $SP* $CM* ($CL | $CP);
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$CM* $B2 $SP* $CM* $B2;
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
@ -578,13 +605,13 @@ $CM* $B2 $SP* $CM* $B2;
|
|||
#
|
||||
# LB 19
|
||||
#
|
||||
$CM* $QU $CM* $CAN_CM; # . x QU
|
||||
$CM* $QU $LB18NonBreaks;
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CM* $CAN_CM $CM* $QU; # QU x .
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
|
@ -592,69 +619,87 @@ $CM* $CAN_CM $CM* $QU; # QU x .
|
|||
|
||||
# LB 21
|
||||
# Don't include $BAX or $NSX here
|
||||
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
|
||||
# LB21a Don't break after Hebrew + Hyphen.
|
||||
([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$CM* $HL $CM* $SY;
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$CM* $IN $CM* ($ALPlus | $HL);
|
||||
$CM* $IN $CM* $EX;
|
||||
$CM* $IN $CM* $ID;
|
||||
$CM* $IN $CM* $IN;
|
||||
$CM* $IN $CM* $NU;
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$CM* $PO $CM* $ID;
|
||||
$CM* $NU $CM* ($ALPlus | $HL);
|
||||
$CM* ($ALPlus | $HL) $CM* $NU;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
$CM* $ID $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PO;
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PR;
|
||||
($ALPlus | $HL) $CM* $PO;
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
$CM* ($JT | $JV) $CM* ($H2 | $JV);
|
||||
$CM* $JT $CM* ($H3 | $JT);
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
$CM* ($ALPlus | $HL) $CM* $IS;
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$CM* $OP $CM* ($ALPlus | $HL | $NU);
|
||||
$CM* ($ALPlus | $HL | $NU) $CM* $CP;
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
$CM* $RI $CM* $RI;
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -675,6 +720,9 @@ $CM* ($HY | $BA | $BAX) $CM* $HL;
|
|||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -691,6 +739,6 @@ $dictionary $dictionary;
|
|||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $dictionary];
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2002-2015 International Business Machines Corporation and
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal_fi.txt
|
||||
|
@ -269,7 +269,7 @@ $GLcm $CANT_CM;
|
|||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm;
|
||||
$CM+ GLcm;
|
||||
$CM+ $GLcm;
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
#
|
||||
# Copyright (C) 2002-2015, International Business Machines Corporation
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
|
||||
# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0
|
||||
# with additions from L2/16-011R3 for Emoji sequences.
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
@ -24,12 +25,17 @@
|
|||
# Character Class Definitions.
|
||||
#
|
||||
|
||||
$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
$E_Modifier = [\U0001F3FB-\U0001F3FF];
|
||||
$ZWJ = [\u200D];
|
||||
$GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
|
||||
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Newline = [\p{Word_Break = Newline} ];
|
||||
$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
|
@ -66,21 +72,21 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
|||
# except when they appear at the beginning of a region of text.
|
||||
#
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
$KatakanaEx = $Katakana ($Extend | $Format)*;
|
||||
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
|
||||
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
|
||||
$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
|
||||
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
|
||||
$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
|
||||
$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
|
||||
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format)*;
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
|
@ -91,12 +97,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format)*;
|
|||
#
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
$ZWJ $GAZ;
|
||||
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text. The rule here comes into play when the start of text
|
||||
# begins with a group of Format chars, or with a "word" consisting of a single
|
||||
# char that is not in any of the listed word break categories followed by
|
||||
# format char(s), or is not a CJK dictionary character.
|
||||
[^$CR $LF $Newline]? ($Extend | $Format)+;
|
||||
[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
|
||||
|
||||
$NumericEx {100};
|
||||
$ALetterEx {200};
|
||||
|
@ -106,6 +117,10 @@ $KatakanaEx {400}; # note: these status values override those from rule 5
|
|||
$HiraganaEx {400}; # by virtue of being numerically larger.
|
||||
$IdeographicEx {400}; #
|
||||
|
||||
$E_Base ($Extend | $Format | $ZWJ)*;
|
||||
$E_Modifier ($Extend | $Format | $ZWJ)*;
|
||||
$GAZ ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
#
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
|
@ -157,36 +172,48 @@ $ExtendNumLetEx $NumericEx {100}; # (13b)
|
|||
$ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
||||
|
||||
# rule 13c
|
||||
|
||||
$Regional_IndicatorEx $Regional_IndicatorEx;
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
|
||||
#
|
||||
^$Regional_IndicatorEx $Regional_IndicatorEx;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
# rule 13d
|
||||
# E_Base x E_Modifier
|
||||
#
|
||||
($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
$BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter;
|
||||
$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
|
||||
$BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote;
|
||||
$BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote;
|
||||
$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
|
||||
$BackNumericEx = ($Format | $Extend)* $Numeric;
|
||||
$BackMidNumEx = ($Format | $Extend)* $MidNum;
|
||||
$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
|
||||
$BackKatakanaEx = ($Format | $Extend)* $Katakana;
|
||||
$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
|
||||
$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
|
||||
$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
|
||||
$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
|
||||
$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
|
||||
$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
|
||||
$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
|
||||
$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
|
||||
$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
|
||||
$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
|
||||
$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
|
||||
$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
|
||||
$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
||||
# rule 3
|
||||
$LF $CR;
|
||||
|
||||
# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
$GAZ $ZWJ;
|
||||
|
||||
# rule 4
|
||||
($Format | $Extend)* [^$CR $LF $Newline]?;
|
||||
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
|
||||
|
||||
# rule 5
|
||||
|
||||
|
@ -229,18 +256,32 @@ $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $B
|
|||
|
||||
# rule 13c
|
||||
|
||||
$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
|
||||
^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable;
|
||||
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
|
||||
|
||||
# rule 13d
|
||||
|
||||
$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ);
|
||||
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# rule 3
|
||||
($Extend | $Format)+ .?;
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
|
@ -252,6 +293,9 @@ $Double_Quote $BackHebrew_LetterEx;
|
|||
# rule 11
|
||||
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
|
||||
|
||||
# rule 13c
|
||||
$BackRegional_IndicatorEx*;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -260,7 +304,7 @@ $dictionary $dictionary;
|
|||
!!safe_forward;
|
||||
|
||||
# rule 4
|
||||
($Extend | $Format)+ .?;
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
|
||||
|
@ -271,5 +315,8 @@ $Double_QuoteEx $Hebrew_LetterEx;
|
|||
# rule 11
|
||||
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
|
||||
|
||||
# rule 13c
|
||||
$Regional_IndicatorEx*;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
#
|
||||
# Copyright (C) 2002-2015, International Business Machines Corporation
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word_POSIX.txt
|
||||
#
|
||||
# ICU Word Break Rules, POSIX locale.
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
|
||||
# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0
|
||||
# with additions from L2/16-011R3 for Emoji sequences.
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
@ -24,12 +25,17 @@
|
|||
# Character Class Definitions.
|
||||
#
|
||||
|
||||
$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
$E_Modifier = [\U0001F3FB-\U0001F3FF];
|
||||
$ZWJ = [\u200D];
|
||||
$GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
|
||||
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Newline = [\p{Word_Break = Newline} ];
|
||||
$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
|
@ -50,7 +56,7 @@ $Hiragana = [:Hiragana:];
|
|||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
|
@ -62,25 +68,25 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
|||
|
||||
|
||||
#
|
||||
# Rules 4 Ignore Format and Extend characters,
|
||||
# Rules 4 Ignore Format and Extend characters,
|
||||
# except when they appear at the beginning of a region of text.
|
||||
#
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
$KatakanaEx = $Katakana ($Extend | $Format)*;
|
||||
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
|
||||
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
|
||||
$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
|
||||
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
|
||||
$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
|
||||
$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
|
||||
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format)*;
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
|
@ -91,12 +97,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format)*;
|
|||
#
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
$ZWJ $GAZ;
|
||||
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text. The rule here comes into play when the start of text
|
||||
# begins with a group of Format chars, or with a "word" consisting of a single
|
||||
# char that is not in any of the listed word break categories followed by
|
||||
# format char(s), or is not a CJK dictionary character.
|
||||
[^$CR $LF $Newline]? ($Extend | $Format)+;
|
||||
[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
|
||||
|
||||
$NumericEx {100};
|
||||
$ALetterEx {200};
|
||||
|
@ -106,6 +117,10 @@ $KatakanaEx {400}; # note: these status values override those from rule 5
|
|||
$HiraganaEx {400}; # by virtue of being numerically larger.
|
||||
$IdeographicEx {400}; #
|
||||
|
||||
$E_Base ($Extend | $Format | $ZWJ)*;
|
||||
$E_Modifier ($Extend | $Format | $ZWJ)*;
|
||||
$GAZ ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
#
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
|
@ -133,7 +148,7 @@ $NumericEx $NumericEx {100};
|
|||
|
||||
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
|
||||
|
||||
# rule 11 and 12
|
||||
# rule 11 and 12
|
||||
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
|
||||
|
||||
|
@ -157,36 +172,48 @@ $ExtendNumLetEx $NumericEx {100}; # (13b)
|
|||
$ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
||||
|
||||
# rule 13c
|
||||
|
||||
$Regional_IndicatorEx $Regional_IndicatorEx;
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
|
||||
#
|
||||
^$Regional_IndicatorEx $Regional_IndicatorEx;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
# rule 13d
|
||||
# E_Base x E_Modifier
|
||||
#
|
||||
($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
$BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter;
|
||||
$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
|
||||
$BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote;
|
||||
$BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote;
|
||||
$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
|
||||
$BackNumericEx = ($Format | $Extend)* $Numeric;
|
||||
$BackMidNumEx = ($Format | $Extend)* $MidNum;
|
||||
$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
|
||||
$BackKatakanaEx = ($Format | $Extend)* $Katakana;
|
||||
$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
|
||||
$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
|
||||
$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
|
||||
$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
|
||||
$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
|
||||
$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
|
||||
$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
|
||||
$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
|
||||
$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
|
||||
$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
|
||||
$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
|
||||
$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
|
||||
$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
||||
# rule 3
|
||||
$LF $CR;
|
||||
|
||||
# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
$GAZ $ZWJ;
|
||||
|
||||
# rule 4
|
||||
($Format | $Extend)* [^$CR $LF $Newline]?;
|
||||
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
|
||||
|
||||
# rule 5
|
||||
|
||||
|
@ -225,22 +252,36 @@ $BackKatakanaEx $BackKatakanaEx;
|
|||
# rules 13 a/b
|
||||
#
|
||||
$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
|
||||
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
|
||||
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
|
||||
|
||||
# rule 13c
|
||||
|
||||
$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
|
||||
^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable;
|
||||
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
|
||||
|
||||
# rule 13d
|
||||
|
||||
$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ);
|
||||
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# rule 3
|
||||
($Extend | $Format)+ .?;
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
|
@ -252,6 +293,9 @@ $Double_Quote $BackHebrew_LetterEx;
|
|||
# rule 11
|
||||
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
|
||||
|
||||
# rule 13c
|
||||
$BackRegional_IndicatorEx*;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
@ -260,7 +304,7 @@ $dictionary $dictionary;
|
|||
!!safe_forward;
|
||||
|
||||
# rule 4
|
||||
($Extend | $Format)+ .?;
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
|
||||
|
@ -271,5 +315,8 @@ $Double_QuoteEx $Hebrew_LetterEx;
|
|||
# rule 11
|
||||
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
|
||||
|
||||
# rule 13c
|
||||
$Regional_IndicatorEx*;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -543,7 +543,7 @@ static void TestBreakIteratorRules() {
|
|||
* keep together 'abc', but only when followed by 'def', OTHERWISE
|
||||
* just return one char at a time.
|
||||
*/
|
||||
char rules[] = "abc{666}/def;\n [\\p{L} - [a]]* {2}; . {1};";
|
||||
char rules[] = "abc/def{666};\n [\\p{L} - [a]]* {2}; . {1};";
|
||||
/* 0123456789012345678 */
|
||||
char data[] = "abcdex abcdefgh-def"; /* the test data string */
|
||||
char breaks[] = "** ** * ** *"; /* * the expected break positions */
|
||||
|
|
|
@ -49,7 +49,7 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
|
|||
tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \
|
||||
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
|
||||
bytestrietest.o ucharstrietest.o \
|
||||
itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
|
||||
itrbbi.o rbbiapts.o rbbitst.o rbbimonkeytest.o ittrans.o transapi.o cpdtrtst.o \
|
||||
testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
|
||||
jamotest.o srchtest.o reptest.o regextst.o \
|
||||
itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2015, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2016, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -1790,6 +1790,39 @@ float IntlTest::random() {
|
|||
return random(&RAND_SEED);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Integer random number class implementation.
|
||||
* Similar to C++ std::minstd_rand, with the same algorithm & constants.
|
||||
*/
|
||||
// Construct a generator from a seed.
// The seed is reduced modulo 2147483647, the modulus of the generator;
// a reduced value of zero is replaced by 1.
IntlTest::icu_rand::icu_rand(uint32_t seed) {
    const uint32_t reduced = seed % 2147483647UL;
    fLast = (reduced == 0) ? 1 : reduced;
}
|
||||
|
||||
// Nothing to release; the generator holds only its integer state.
IntlTest::icu_rand::~icu_rand() {
}
|
||||
|
||||
// Re-seed the generator.
// The seed is normalized exactly as the constructor does: reduced modulo
// 2147483647 (the generator modulus), with a result of zero replaced by 1.
// Without the reduction, a seed that is a multiple of the modulus would put
// the generator into the all-zero state, from which operator() can only
// ever return 0.
void IntlTest::icu_rand::seed(uint32_t seed) {
    seed = seed % 2147483647UL;
    if (seed == 0) {
        seed = 1;
    }
    fLast = seed;
}
|
||||
|
||||
// Advance the generator one step and return the new state.
// The multiplication is done in 64 bits so the product cannot overflow
// before the modulo is taken.
uint32_t IntlTest::icu_rand::operator() () {
    const uint64_t product = static_cast<uint64_t>(fLast) * 48271UL;
    fLast = static_cast<uint32_t>(product % 2147483647UL);
    return fLast;
}
|
||||
|
||||
uint32_t IntlTest::icu_rand::getSeed() {
|
||||
return (uint32_t) fLast;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Map a value to a hex digit code unit: values below 10 are offset from
// 0x30 ('0'), all other values from 0x41 ('A') minus 10.
// No range check is made on i.
static inline UChar toHex(int32_t i) {
    const int32_t base = (i < 10) ? 0x30 : (0x41 - 10);
    return (UChar)(i + base);
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2015, International Business Machines Corporation and
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2016, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -18,7 +18,7 @@
|
|||
|
||||
#if U_NO_DEFAULT_INCLUDE_UTF_HEADERS
|
||||
/* deprecated - make tests pass with U_NO_DEFAULT_INCLUDE_UTF_HEADERS */
|
||||
#include "unicode/utf_old.h"
|
||||
#include "unicode/utf_old.h"
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
@ -166,7 +166,7 @@ public:
|
|||
/**
|
||||
* Replaces isICUVersionAtLeast and isICUVersionBefore
|
||||
* log that an issue is known.
|
||||
* Usually used this way:
|
||||
* Usually used this way:
|
||||
* <code>if( ... && logKnownIssue("12345", "some bug")) continue; </code>
|
||||
* @param ticket ticket string, "12345" or "cldrbug:1234"
|
||||
* @param message optional message string
|
||||
|
@ -230,11 +230,11 @@ public:
|
|||
void errcheckln(UErrorCode status, const char *fmt, ...);
|
||||
|
||||
// Print ALL named errors encountered so far
|
||||
void printErrors();
|
||||
void printErrors();
|
||||
|
||||
// print known issues. return TRUE if there were any.
|
||||
UBool printKnownIssues();
|
||||
|
||||
|
||||
virtual void usage( void ) ;
|
||||
|
||||
/**
|
||||
|
@ -253,6 +253,30 @@ public:
|
|||
*/
|
||||
static float random();
|
||||
|
||||
|
||||
/**
|
||||
* Integer random numbers, similar to C++ std::minstd_rand, with the same algorithm
|
||||
* and constants. Allow additional access to internal state, for use by monkey tests,
|
||||
* which need to recreate previous random sequences beginning near a failure point.
|
||||
*/
|
||||
// Small integer pseudo-random generator whose whole state is one uint32_t (fLast).
class icu_rand {
|
||||
public:
|
||||
// Construct a generator from a seed; defaults to a seed of 1.
icu_rand(uint32_t seed = 1);
|
||||
~icu_rand();
|
||||
// Restart the generator from the given seed. A seed of 0 is replaced by 1.
void seed(uint32_t seed);
|
||||
// Advance the generator one step and return the next value.
uint32_t operator()();
|
||||
/**
|
||||
* Get a seed corresponding to the current state of the generator.
|
||||
* Seeding any generator with this value will cause it to produce the
|
||||
* same sequence as this one will from this point forward.
|
||||
*/
|
||||
uint32_t getSeed();
|
||||
private:
|
||||
// Most recently generated value; this is the generator's entire state.
uint32_t fLast;
|
||||
};
|
||||
|
||||
|
||||
|
||||
enum { kMaxProps = 16 };
|
||||
|
||||
virtual void setProperty(const char* propline);
|
||||
|
@ -320,7 +344,7 @@ private:
|
|||
int32_t dataErrorCount;
|
||||
IntlTest* caller;
|
||||
char* testPath; // specifies subtests
|
||||
|
||||
|
||||
char basePath[1024];
|
||||
char currName[1024]; // current test name
|
||||
|
||||
|
|
|
@ -238,6 +238,7 @@
|
|||
<DisableLanguageExtensions>false</DisableLanguageExtensions>
|
||||
</ClCompile>
|
||||
<ClCompile Include="rbbitst.cpp" />
|
||||
<ClCompile Include="rbbimonkeytest.cpp" />
|
||||
<ClCompile Include="itspoof.cpp" />
|
||||
<ClCompile Include="allcoll.cpp" />
|
||||
<ClCompile Include="alphaindextst.cpp" />
|
||||
|
@ -434,6 +435,7 @@
|
|||
<ClInclude Include="itrbbi.h" />
|
||||
<ClInclude Include="rbbiapts.h" />
|
||||
<ClInclude Include="rbbitst.h" />
|
||||
<ClInclude Include="rbbimonkeytest.h" />
|
||||
<ClInclude Include="itspoof.h" />
|
||||
<ClInclude Include="allcoll.h" />
|
||||
<ClInclude Include="alphaindextst.h" />
|
||||
|
|
|
@ -70,6 +70,9 @@
|
|||
<ClCompile Include="rbbitst.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="rbbimonkeytest.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="itspoof.cpp">
|
||||
<Filter>spoof detection</Filter>
|
||||
</ClCompile>
|
||||
|
@ -504,6 +507,9 @@
|
|||
<ClInclude Include="rbbitst.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="rbbimonkeytest.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="itspoof.h">
|
||||
<Filter>spoof detection</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1998-2012, International Business Machines Corporation
|
||||
* Copyright (C) 1998-2016, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -16,30 +16,23 @@
|
|||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "intltest.h"
|
||||
#include "itrbbi.h"
|
||||
#include "rbbiapts.h"
|
||||
#include "rbbitst.h"
|
||||
|
||||
#define TESTCLASS(n,classname) \
|
||||
case n: \
|
||||
name = #classname; \
|
||||
if (exec) { \
|
||||
logln(#classname "---"); \
|
||||
logln(""); \
|
||||
classname t; \
|
||||
callTest(t, par); \
|
||||
} \
|
||||
break
|
||||
#include "rbbimonkeytest.h"
|
||||
|
||||
|
||||
void IntlTestRBBI::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
|
||||
{
|
||||
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
|
||||
switch (index) {
|
||||
TESTCLASS(0, RBBIAPITest);
|
||||
TESTCLASS(1, RBBITest);
|
||||
default: name=""; break;
|
||||
if (exec) {
|
||||
logln("TestSuite RuleBasedBreakIterator: ");
|
||||
}
|
||||
TESTCASE_AUTO_BEGIN;
|
||||
TESTCASE_AUTO_CLASS(RBBIAPITest);
|
||||
TESTCASE_AUTO_CLASS(RBBITest);
|
||||
TESTCASE_AUTO_CLASS(RBBIMonkeyTest);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
976
icu4c/source/test/intltest/rbbimonkeytest.cpp
Normal file
976
icu4c/source/test/intltest/rbbimonkeytest.cpp
Normal file
|
@ -0,0 +1,976 @@
|
|||
/********************************************************************
|
||||
* Copyright (c) 2016, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "rbbimonkeytest.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstr.h"
|
||||
#include "uelement.h"
|
||||
#include "uhash.h"
|
||||
|
||||
#include "iostream"
|
||||
#include "string"
|
||||
|
||||
using namespace icu;
|
||||
|
||||
|
||||
// Test dispatch entry point for RBBIMonkeyTest.
// Saves the raw parameter string in fParams, then registers the single
// test function, testMonkey, through the TESTCASE_AUTO macros.
void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
|
||||
fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function.
|
||||
|
||||
TESTCASE_AUTO_BEGIN;
|
||||
TESTCASE_AUTO(testMonkey);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
//
|
||||
// class BreakRule implementation.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
|
||||
// Every field is default initialized; there is nothing further to set up.
BreakRule::BreakRule() {
}
|
||||
|
||||
// No explicit cleanup is performed here.
BreakRule::~BreakRule() {
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
//
|
||||
// class BreakRules implementation.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
// Set up the containers and the regular expressions used to parse a
// reference rule file.
//   monkeyImpl  the owning test implementation; only the pointer is stored.
//   status      ICU error code. If the hash table cannot be opened the
//               constructor returns early and the matchers are left unset.
BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status)  :
        fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {

    // Table of character classes, keyed by class name (UnicodeString).
    UHashtable *classTable = uhash_open(uhash_hashUnicodeString,
                                        uhash_compareUnicodeString,
                                        NULL,      // value comparator.
                                        &status);
    fCharClasses.adoptInstead(classTable);
    if (U_FAILURE(status)) {
        return;
    }

    // The table and the rule vector delete the objects they hold.
    uhash_setKeyDeleter(classTable, uprv_deleteUObject);
    uhash_setValueDeleter(classTable, uprv_deleteUObject);
    fBreakRules.setDeleter(uprv_deleteUObject);

    fCharClassList.adoptInstead(new UVector(status));

    // References to character classes: an identifier, captured as "ClassName".
    // NOTE(review): the guard in front of the identifier is meant to skip names
    //     that follow '{', '=' or '[:' (Unicode property names or values), but it
    //     is written as (?!...), which is a negative look-ahead; a look-behind
    //     would be spelled (?<!...). Confirm the intent before changing it.
    const UnicodeString setRefsPattern(
             "(?!(?:\\{|=|\\[:)[ \\t]{0,4})"
             "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)");
    fSetRefsMatcher.adoptInstead(new RegexMatcher(setRefsPattern, 0, status));

    // Comments and blank lines. A match is replaced with "" to strip it from the rules.
    //   Starts at the start of the line or just after a ';' (look-behind),
    //   then optional white space, an optional '#' with whatever follows,
    //   and the new-line at the end of the line.
    const UnicodeString commentsPattern(
            "(^|(?<=;))"
            "[ \\t]*+"
            "(#.*)?+"
            "\\R$"
            );
    fCommentsMatcher.adoptInstead(new RegexMatcher(commentsPattern, 0, status));

    // Initial parse of a character class definition line:
    //     <ClassName> = <ClassDef> ;
    const UnicodeString classDefPattern(
            "[ \\t]*"
            "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"
            "[ \\t]*=[ \\t]*"
            "(?<ClassDef>.*?)"
            "[ \\t]*;$");
    fClassDefMatcher.adoptInstead(new RegexMatcher(classDefPattern, 0, status));

    // Initial parse of a break rule line:
    //     <RuleName> : <RuleDef> ;
    const UnicodeString ruleDefPattern(
            "[ \\t]*"
            "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)"
            "[ \\t]*:[ \\t]*"
            "(?<RuleDef>.*?)"
            "[ \\t]*;$");
    fRuleDefMatcher.adoptInstead(new RegexMatcher(ruleDefPattern, 0, status));
}
|
||||
|
||||
|
||||
// No explicit cleanup is performed here.
BreakRules::~BreakRules() {
}
|
||||
|
||||
|
||||
// Define (or redefine) a named character class.
//   name        the class name, as written in the rule file.
//   definition  the UnicodeSet expression; it may refer to previously
//               defined classes by name.
//   status      ICU error code, set if the expanded definition is not a
//               valid UnicodeSet pattern or the class cannot be stored.
// Returns the new CharClass, which is owned by the fCharClasses table,
// or NULL on failure.
CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {

    // Create the expanded definition for this char class,
    // replacing any set references with the corresponding definition.
    // Identifiers that do not name a known class are copied through unchanged.

    UnicodeString expandedDef;
    UnicodeString emptyString;
    fSetRefsMatcher->reset(definition);
    while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
        // Named refName rather than name, so the 'name' parameter is not shadowed.
        const UnicodeString refName =
                fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
        CharClass *refClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &refName));
        const UnicodeString &expansionForName = refClass ? refClass->fExpandedDef : refName;

        fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
        expandedDef.append(expansionForName);
    }
    fSetRefsMatcher->appendTail(expandedDef);

    // Verify that the expanded set definition is valid.

    if (fMonkeyImpl->fDumpExpansions) {
        printf("epandedDef: %s\n", CStr(expandedDef)());
    }

    // Hold the set in a LocalPointer so that it is freed on the failure return.
    LocalPointer<UnicodeSet> s(new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status));
    if (U_FAILURE(status)) {
        IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
                               u_errorName(status), CStr(name)());
        return NULL;
    }
    CharClass *cclass = new CharClass(name, definition, expandedDef, s.orphan());
    CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
                                                        new UnicodeString(name),   // Key, owned by hash table.
                                                        cclass,                    // Value, owned by hash table.
                                                        &status));
    if (U_FAILURE(status)) {
        // The key and value were handed to the hash table, so when the put
        // fails cclass must not be passed back to the caller.
        return NULL;
    }

    if (previousClass != NULL) {
        // Duplicate class def.
        // These are legitimate, they are adjustments of an existing class.
        // TODO: will need to keep the old around when we handle tailorings.
        IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
        delete previousClass;
    }
    return cclass;
}
|
||||
|
||||
|
||||
void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
|
||||
LocalPointer<BreakRule> thisRule(new BreakRule);
|
||||
thisRule->fName = name;
|
||||
thisRule->fRule = definition;
|
||||
|
||||
// If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
|
||||
// This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
|
||||
UnicodeString emptyString;
|
||||
|
||||
// Expand the char class definitions within the rule.
|
||||
fSetRefsMatcher->reset(definition);
|
||||
while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
|
||||
const UnicodeString name =
|
||||
fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
|
||||
CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
|
||||
if (!nameClass) {
|
||||
IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
|
||||
__FILE__, __LINE__, CStr(name)(), CStr(definition)());
|
||||
}
|
||||
const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
|
||||
|
||||
fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
|
||||
thisRule->fExpandedRule.append(expansionForName);
|
||||
}
|
||||
fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
|
||||
|
||||
// Replace the divide sign (\u00f7) with a regular expression named capture.
|
||||
// When running the rules, a match that includes this group means we found a break position.
|
||||
|
||||
int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
|
||||
if (dividePos >= 0) {
|
||||
thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
|
||||
}
|
||||
if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message.
|
||||
}
|
||||
|
||||
// UAX break rule set definitions can be empty, just [].
|
||||
// Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
|
||||
// also matches nothing.
|
||||
|
||||
static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
|
||||
int32_t where = 0;
|
||||
while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
|
||||
thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
|
||||
}
|
||||
if (fMonkeyImpl->fDumpExpansions) {
|
||||
printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
|
||||
}
|
||||
|
||||
// Compile a regular expression for this rule.
|
||||
thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
|
||||
if (U_FAILURE(status)) {
|
||||
IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
|
||||
__FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
|
||||
return;
|
||||
}
|
||||
|
||||
// Put this new rule into the vector of all Rules.
|
||||
fBreakRules.addElement(thisRule.orphan(), status);
|
||||
}
|
||||
|
||||
|
||||
bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
|
||||
if (keyword == UnicodeString("locale")) {
|
||||
CharString localeName;
|
||||
localeName.append(CStr(value)(), -1, status);
|
||||
fLocale = Locale::createFromName(localeName.data());
|
||||
return true;
|
||||
}
|
||||
if (keyword == UnicodeString("type")) {
|
||||
if (value == UnicodeString("grapheme")) {
|
||||
fType = UBRK_CHARACTER;
|
||||
} else if (value == UnicodeString("word")) {
|
||||
fType = UBRK_WORD;
|
||||
} else if (value == UnicodeString("line")) {
|
||||
fType = UBRK_LINE;
|
||||
} else if (value == UnicodeString("sentence")) {
|
||||
fType = UBRK_SENTENCE;
|
||||
} else {
|
||||
IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// TODO: add tailoring base setting here.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create the ICU break iterator that corresponds to this rule set's
// break type (fType) and locale (fLocale).
//   status  ICU error code. Nothing is done if it already indicates failure.
//           Set to U_ILLEGAL_ARGUMENT_ERROR if fType is not one of the four
//           supported break types, and to U_UNSUPPORTED_ERROR if the iterator
//           that ICU returns is not a RuleBasedBreakIterator.
// Returns a new iterator owned by the caller, or NULL on failure.
RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Own the freshly created iterator until it is known to be usable, so that
    // it is deleted rather than leaked on every failure path.
    LocalPointer<BreakIterator> iter;
    switch(fType) {
        case UBRK_CHARACTER:
            iter.adoptInstead(BreakIterator::createCharacterInstance(fLocale, status));
            break;
        case UBRK_WORD:
            iter.adoptInstead(BreakIterator::createWordInstance(fLocale, status));
            break;
        case UBRK_LINE:
            iter.adoptInstead(BreakIterator::createLineInstance(fLocale, status));
            break;
        case UBRK_SENTENCE:
            iter.adoptInstead(BreakIterator::createSentenceInstance(fLocale, status));
            break;
        default:
            IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return NULL;
    }
    if (U_FAILURE(status)) {
        return NULL;
    }

    RuleBasedBreakIterator *bi = dynamic_cast<RuleBasedBreakIterator *>(iter.getAlias());
    if (bi == NULL) {
        // Previously this case leaked the iterator and returned NULL with a success status.
        IntlTest::gTest->errln("%s:%d Break iterator of type %d is not a RuleBasedBreakIterator",
                               __FILE__, __LINE__, fType);
        status = U_UNSUPPORTED_ERROR;
        return NULL;
    }
    iter.orphan();      // Ownership passes to the caller through bi.
    return bi;
}
|
||||
|
||||
|
||||
// Read and compile a complete file of reference break rules.
//   rules   the rule file, read one line at a time.
//   status  ICU error code. Nothing is done if it already indicates failure;
//           the function returns immediately if it fails between two lines.
// Each line is either a comment or blank line, a keyword setting
// ("type = ..." or "locale = ..."), a character class definition, or a rule.
// After all lines are read, fCharClassList is filled with every class except
// "dictionary", plus an "__Others" class for characters not in any class.
void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    UnicodeString emptyString;
    for (;;) {    // Loop once per input line.
        if (U_FAILURE(status)) {
            return;
        }
        int32_t lineLength = 0;
        const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
        if (lineBuf == NULL) {
            break;
        }
        UnicodeString line(lineBuf, lineLength);

        // Strip comment lines.
        fCommentsMatcher->reset(line);
        line = fCommentsMatcher->replaceFirst(emptyString, status);
        if (line.isEmpty()) {
            continue;
        }

        // Recognize character class definition and keyword lines
        fClassDefMatcher->reset(line);
        if (fClassDefMatcher->matches(status)) {
            UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
            UnicodeString classDef  = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
            if (fMonkeyImpl->fDumpExpansions) {
                printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
            }
            if (setKeywordParameter(className, classDef, status)) {
                // The scanned item was "type = ..." or "locale = ...", etc.
                //   which are not actual character classes.
                continue;
            }
            addCharClass(className, classDef, status);
            continue;
        }

        // Recognize rule lines.
        fRuleDefMatcher->reset(line);
        if (fRuleDefMatcher->matches(status)) {
            UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
            UnicodeString ruleDef  = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
            if (fMonkeyImpl->fDumpExpansions) {
                printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
            }
            addRule(ruleName, ruleDef, status);
            continue;
        }

        IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
            __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
    }

    // Build the vector of char classes, omitting the dictionary class if there is one.
    // This will be used when constructing the random text to be tested.

    // Also compute the "other" set, consisting of any characters not included in
    // one or more of the user defined sets.

    UnicodeSet otherSet((UChar32)0, 0x10ffff);
    int32_t pos = UHASH_FIRST;
    const UHashElement *el = NULL;
    while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
        const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
        CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
        if (*ccName != cclass->fName) {
            IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
                    __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
        }
        const UnicodeSet *set = cclass->fSet.getAlias();
        otherSet.removeAll(*set);
        if (*ccName == UnicodeString("dictionary")) {
            fDictionarySet = *set;
        } else {
            fCharClassList->addElement(cclass, status);
        }
    }

    if (!otherSet.isEmpty()) {
        UnicodeString pattern;
        CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
        // addCharClass returns NULL on failure. Users of fCharClassList
        // dereference every element, so never store a NULL entry.
        if (cclass != NULL) {
            fCharClassList->addElement(cclass, status);
        }
    }
}
|
||||
|
||||
|
||||
// Find a character class that contains a character.
//   c     the code point to look up.
//   iter  optional in/out index into fCharClassList. The search starts at
//         *iter, and *iter is left just past the last class examined, so
//         repeated calls with the same iter visit each class containing c
//         in turn. When NULL, the search starts from the first class.
// Returns the matching class, or NULL when no remaining class contains c.
const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
    int32_t scratchIndex = 0;
    int32_t *cursor = (iter != NULL) ? iter : &scratchIndex;

    const int32_t classCount = fCharClassList->size();
    while (*cursor < classCount) {
        const CharClass *candidate = static_cast<const CharClass *>(fCharClassList->elementAt(*cursor));
        ++(*cursor);
        if (candidate->fSet->contains(c)) {
            return candidate;
        }
    }
    return NULL;
}
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
//
|
||||
// class MonkeyTestData implementation.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
|
||||
void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
|
||||
const int32_t dataLength = 1000;
|
||||
|
||||
// Fill the test string with random characters.
|
||||
// First randomly pick a char class, then randomly pick a character from that class.
|
||||
// Exclude any characters from the dictionary set.
|
||||
|
||||
// std::cout << "Populating Test Data" << std::endl;
|
||||
fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages,
|
||||
// allowing recreation of failing data.
|
||||
fBkRules = rules;
|
||||
fString.remove();
|
||||
for (int32_t n=0; n<dataLength;) {
|
||||
int charClassIndex = rand() % rules->fCharClassList->size();
|
||||
const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
|
||||
if (cclass->fSet->size() == 0) {
|
||||
// Some rules or tailorings do end up with empty char classes.
|
||||
continue;
|
||||
}
|
||||
int32_t charIndex = rand() % cclass->fSet->size();
|
||||
UChar32 c = cclass->fSet->charAt(charIndex);
|
||||
if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
|
||||
// Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
|
||||
// Don't let random unpaired surrogates combine in the test data because they might
|
||||
// produce an unwanted dictionary character.
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!rules->fDictionarySet.contains(c)) {
|
||||
fString.append(c);
|
||||
++n;
|
||||
}
|
||||
}
|
||||
|
||||
// Reset each rule matcher regex with this new string.
|
||||
// (Although we are always using the same string object, ICU regular expressions
|
||||
// don't like the underlying string data changing without doing a reset).
|
||||
|
||||
for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
|
||||
BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
|
||||
rule->fRuleMatcher->reset(fString);
|
||||
}
|
||||
|
||||
// Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
|
||||
// Expected and Actual breaks are one longer than the input string; a non-zero value
|
||||
// will indicate a boundary preceding that position.
|
||||
|
||||
clearActualBreaks();
|
||||
fExpectedBreaks = fActualBreaks;
|
||||
fRuleForPosition = fActualBreaks;
|
||||
f2ndRuleForPos = fActualBreaks;
|
||||
|
||||
// Apply reference rules to find the expected breaks.
|
||||
|
||||
fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text.
|
||||
// ICU always reports a break there.
|
||||
// The reference rules do not have a means to do so.
|
||||
int32_t strIdx = 0;
|
||||
while (strIdx < fString.length()) {
|
||||
BreakRule *matchingRule = NULL;
|
||||
UBool hasBreak = FALSE;
|
||||
int32_t ruleNum = 0;
|
||||
int32_t matchStart = 0;
|
||||
int32_t matchEnd = 0;
|
||||
int32_t breakGroup = 0;
|
||||
for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
|
||||
BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
|
||||
rule->fRuleMatcher->reset();
|
||||
if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
|
||||
// A candidate rule match, check further to see if we take it or continue to check other rules.
|
||||
// Matches of zero or one codepoint count only if they also specify a break.
|
||||
matchStart = rule->fRuleMatcher->start(status);
|
||||
matchEnd = rule->fRuleMatcher->end(status);
|
||||
breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
|
||||
hasBreak = U_SUCCESS(status);
|
||||
if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
|
||||
status = U_ZERO_ERROR;
|
||||
}
|
||||
if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
|
||||
matchingRule = rule;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (matchingRule == NULL) {
|
||||
// No reference rule matched. This is an error in the rules that should never happen.
|
||||
IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
|
||||
__FILE__, __LINE__, strIdx);
|
||||
dump(strIdx);
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
if (matchingRule->fRuleMatcher->group(status).length() == 0) {
|
||||
// Zero length rule match. This is also an error in the rule expressions.
|
||||
IntlTest::gTest->errln("%s:%d Zero length rule match.",
|
||||
__FILE__, __LINE__);
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// Record which rule matched over the length of the match.
|
||||
for (int i = matchStart; i < matchEnd; i++) {
|
||||
if (fRuleForPosition.charAt(i) == 0) {
|
||||
fRuleForPosition.setCharAt(i, (UChar)ruleNum);
|
||||
} else {
|
||||
f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
|
||||
}
|
||||
}
|
||||
|
||||
// Break positions appear in rules as a matching named capture of zero length at the break position,
|
||||
// the adjusted pattern contains (?<BreakPosition>)
|
||||
if (hasBreak) {
|
||||
int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
|
||||
if (U_FAILURE(status) || breakPos < 0) {
|
||||
// Rule specified a break, but that break wasn't part of the match, even
|
||||
// though the rule as a whole matched.
|
||||
// Can't happen with regular expressions derived from (equivalent to) ICU break rules.
|
||||
// Shouldn't get here.
|
||||
IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
break;
|
||||
}
|
||||
fExpectedBreaks.setCharAt(breakPos, (UChar)1);
|
||||
// printf("recording break at %d\n", breakPos);
|
||||
// For the next iteration, pick up applying rules immediately after the break,
|
||||
// which may differ from end of the match. The matching rule may have included
|
||||
// context following the boundary that needs to be looked at again.
|
||||
strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
|
||||
} else {
|
||||
// Original rule didn't specify a break.
|
||||
// Continue applying rules starting on the last code point of this match.
|
||||
strIdx = fString.moveIndex32(matchEnd, -1);
|
||||
if (strIdx == matchStart) {
|
||||
// Match was only one code point, no progress if we continue.
|
||||
// Shouldn't get here, case is filtered out at top of loop.
|
||||
CharString ruleName;
|
||||
ruleName.appendInvariantChars(matchingRule->fName, status);
|
||||
IntlTest::gTest->errln("%s:%d Rule %s internal error",
|
||||
__FILE__, __LINE__, ruleName.data());
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
|
||||
__FILE__, __LINE__, u_errorName(status));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MonkeyTestData::clearActualBreaks() {
|
||||
fActualBreaks.remove();
|
||||
// Actual Breaks length is one longer than the data string length, allowing
|
||||
// for breaks before the first and after the last character in the data.
|
||||
for (int32_t i=0; i<=fString.length(); i++) {
|
||||
fActualBreaks.append((UChar)0);
|
||||
}
|
||||
}
|
||||
|
||||
// Print a table describing the test data, one row per code point: index, code point,
// character class name, expected ('R') and actual ('I') break flags, the name(s) of the
// reference rule(s) recorded for the position, and the Unicode character name.
//
//   around  -1 dumps the whole string; any other value dumps only the region obtained
//           by moving 30 positions (via moveIndex32) to either side of that index.
void MonkeyTestData::dump(int32_t around) const {
    printf("\n"
           " char break Rule Character\n"
           " pos code class R I name name\n"
           "---------------------------------------------------------------------------------------------\n");

    const UBool wholeString = (around == -1);
    const int32_t start = wholeString ? 0 : fString.moveIndex32(around, -30);
    const int32_t end   = wholeString ? fString.length() : fString.moveIndex32(around, +30);

    int charIdx = start;
    while (charIdx < end) {
        UErrorCode status = U_ZERO_ERROR;
        UChar32 c = fString.char32At(charIdx);

        // Name of the character class containing this code point.
        const CharClass *cc = fBkRules->getClassForChar(c);
        CharString ccName;
        ccName.appendInvariantChars(cc->fName, status);

        // Name of the first rule recorded for this position.
        CharString ruleName;
        const BreakRule *rule =
            static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
        ruleName.appendInvariantChars(rule->fName, status);

        // Name of the second rule, when one was recorded (non-zero index).
        CharString secondRuleName;
        if (f2ndRuleForPos.charAt(charIdx) > 0) {
            const BreakRule *secondRule =
                static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
            secondRuleName.appendInvariantChars(secondRule->fName, status);
        }

        char cName[200];
        u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);

        printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
               charIdx, c, ccName.data(),
               fExpectedBreaks.charAt(charIdx) ? '*' : '.',
               fActualBreaks.charAt(charIdx) ? '*' : '.',
               ruleName.data(), secondRuleName.data(), cName
               );

        charIdx = fString.moveIndex32(charIdx, 1);
    }
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
//
|
||||
// class RBBIMonkeyImpl
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
|
||||
// Constructor. Gives every plain data member a defined initial value, so that an object
// whose setup() was skipped or failed never exposes indeterminate values (for example to
// the failure-reporting code, which reads fRuleFileName and fVerbose).
// The initializers are listed in member declaration order.
RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) :
        fRuleFileName(NULL), fVerbose(FALSE), fLoopCount(0), fDumpExpansions(FALSE), fThread(this) {
    (void)status;    // suppress unused parameter compiler warning.
}
|
||||
|
||||
|
||||
// RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
|
||||
// reference rules and creating the icu breakiterator to test,
|
||||
// with its type and locale coming from the reference rules.
|
||||
|
||||
void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
|
||||
fRuleFileName = ruleFile;
|
||||
openBreakRules(ruleFile, status);
|
||||
if (U_FAILURE(status)) {
|
||||
IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
|
||||
return;
|
||||
}
|
||||
fRuleSet.adoptInstead(new BreakRules(this, status));
|
||||
fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
|
||||
return;
|
||||
}
|
||||
fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
|
||||
fTestData.adoptInstead(new MonkeyTestData());
|
||||
}
|
||||
|
||||
|
||||
// Destructor. Has no explicit work to do.
RBBIMonkeyImpl::~RBBIMonkeyImpl() {}
|
||||
|
||||
|
||||
void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
|
||||
CharString path;
|
||||
path.append(IntlTest::getSourceTestData(status), status);
|
||||
path.append("break_rules" U_FILE_SEP_STRING, status);
|
||||
path.appendPathPart(fileName, status);
|
||||
const char *codePage = "UTF-8";
|
||||
fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
|
||||
}
|
||||
|
||||
|
||||
// Start this object's worker thread; the thread's run() calls runTest().
void RBBIMonkeyImpl::startTest() {
    fThread.start();
}
|
||||
|
||||
// Wait for this object's worker thread to finish.
void RBBIMonkeyImpl::join() {
    fThread.join();
}
|
||||
|
||||
|
||||
// Report a monkey test failure, including the parameters needed to reproduce it,
// optionally dump the test data around the failing index, and set status to a failure.
//
// Must be used inside an RBBIMonkeyImpl member function that has a UErrorCode named
// 'status' in scope; it reads fRuleFileName, fTestData and fVerbose.
//
// Wrapped in do { } while (0) so that "MONKEY_ERROR(...);" is exactly one statement,
// which keeps it safe in un-braced if/else bodies. Arguments are parenthesized.
#define MONKEY_ERROR(msg, index) do { \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
            __FILE__, __LINE__, (msg), (index), fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { fTestData->dump(index); } \
    status = U_INVALID_STATE_ERROR; \
} while (0)
|
||||
|
||||
void RBBIMonkeyImpl::runTest() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t errorCount = 0;
|
||||
for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
|
||||
status = U_ZERO_ERROR;
|
||||
fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
|
||||
// fTestData->dump();
|
||||
testForwards(status);
|
||||
testPrevious(status);
|
||||
testFollowing(status);
|
||||
testPreceding(status);
|
||||
testIsBoundary(status);
|
||||
|
||||
if (fLoopCount < 0 && loopCount % 100 == 0) {
|
||||
fprintf(stderr, ".");
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
if (++errorCount > 10) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Iterate forwards over the test string with first()/next(), record every boundary
// reported in fActualBreaks, then compare against the expected breaks.
//
// Reports an error and returns if the iterator fails to advance (stall) or returns
// a boundary outside of the text. No-op if status is already a failure.
void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);
    int32_t previousBreak = -2;
    for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
        if (bk <= previousBreak) {
            MONKEY_ERROR("Break Iterator Stall", bk);
            return;
        }
        if (bk < 0 || bk > fTestData->fString.length()) {
            MONKEY_ERROR("Boundary out of bounds", bk);
            return;
        }
        fTestData->fActualBreaks.setCharAt(bk, 1);
        // Remember this boundary; without this the stall check above could never
        // trigger, and a stalled iterator would loop forever.
        previousBreak = bk;
    }
    checkResults("testForwards", FORWARD, status);
}
|
||||
|
||||
// Call following(i) for every index i from -1 up to (but not including) the text length,
// record each newly discovered boundary in fActualBreaks, then compare against the
// expected breaks. Any result that is neither "still the boundary already found" nor
// "the next boundary after the one we are standing on" is reported as an error.
// No-op if status is already a failure.
void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);

    int32_t lastBoundary = -1;
    for (int32_t pos = -1; pos < fTestData->fString.length(); ++pos) {
        const int32_t found = fBI->following(pos);
        if (found == BreakIterator::DONE && pos == fTestData->fString.length()) {
            // No boundary follows the end of the text.
        } else if (found == lastBoundary && found > pos) {
            // pos is in the gap between two breaks.
        } else if (pos == lastBoundary && found > lastBoundary) {
            // Standing on the previous boundary; following() produced the next one.
            fTestData->fActualBreaks.setCharAt(found, 1);
            lastBoundary = found;
        } else {
            MONKEY_ERROR("following(i)", pos);
            return;
        }
    }
    checkResults("testFollowing", FORWARD, status);
}
|
||||
|
||||
|
||||
|
||||
// Iterate backwards over the test string with last()/previous(), record every boundary
// reported in fActualBreaks, then compare against the expected breaks.
//
// Reports an error and returns if the iterator fails to move backwards (stall) or
// returns a boundary outside of the text. No-op if status is already a failure.
void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
    if (U_FAILURE(status)) {return;}

    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);
    int32_t previousBreak = INT32_MAX;
    for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
        if (bk >= previousBreak) {
            MONKEY_ERROR("Break Iterator Stall", bk);
            return;
        }
        if (bk < 0 || bk > fTestData->fString.length()) {
            MONKEY_ERROR("Boundary out of bounds", bk);
            return;
        }
        fTestData->fActualBreaks.setCharAt(bk, 1);
        // Remember this boundary; without this the stall check above could never
        // trigger, and a stalled iterator would loop forever.
        previousBreak = bk;
    }
    // Label corrected from the misspelled "testPrevius".
    checkResults("testPrevious", REVERSE, status);
}
|
||||
|
||||
|
||||
// Call preceding(i) for every index i from (text length + 1) down to 0, record each newly
// discovered boundary in fActualBreaks, then compare against the expected breaks.
// No-op if status is already a failure.
void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);

    int32_t lastBoundary = fTestData->fString.length()+1;
    for (int32_t pos = fTestData->fString.length()+1; pos >= 0; --pos) {
        const int32_t found = fBI->preceding(pos);
        if (found == BreakIterator::DONE && pos == 0) {
            // Nothing precedes the start of the text.
        } else if (found == lastBoundary && found < pos) {
            // pos is in the gap between two breaks.
        } else if (pos < fTestData->fString.length() && fTestData->fString.getChar32Start(pos) < pos) {
            // pos indexes to a trailing surrogate.
            // Break Iterators treat an index to either half as referring to the supplemental code point,
            // with preceding going to some preceding code point.
            // A mismatch is reported, but the scan deliberately keeps going.
            if (fBI->preceding(pos) != fBI->preceding(fTestData->fString.getChar32Start(pos))) {
                MONKEY_ERROR("preceding of trailing surrogate error", pos);
            }
        } else if (pos == lastBoundary && found < lastBoundary) {
            // Standing on the previously found boundary; preceding() produced the one before it.
            fTestData->fActualBreaks.setCharAt(found, 1);
            lastBoundary = found;
        } else {
            MONKEY_ERROR("preceding(i)", pos);
            return;
        }
    }
    checkResults("testPreceding", REVERSE, status);
}
|
||||
|
||||
|
||||
// Query isBoundary(i) for every index from the end of the text down to 0, record the
// positions reported as boundaries in fActualBreaks, then compare against the expected
// breaks. No-op if status is already a failure.
void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    fTestData->clearActualBreaks();
    fBI->setText(fTestData->fString);
    for (int i=fTestData->fString.length(); i>=0; --i) {
        if (fBI->isBoundary(i)) {
            fTestData->fActualBreaks.setCharAt(i, 1);
        }
    }
    // Failures were previously mislabeled as coming from "testForwards".
    checkResults("testIsBoundary", FORWARD, status);
}
|
||||
|
||||
void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (direction == FORWARD) {
|
||||
for (int i=0; i<=fTestData->fString.length(); ++i) {
|
||||
if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
|
||||
IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
|
||||
__FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
|
||||
if (fVerbose) {
|
||||
fTestData->dump(i);
|
||||
}
|
||||
status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely
|
||||
break; // produce many redundant errors.
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i=fTestData->fString.length(); i>=0; i--) {
|
||||
if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
|
||||
IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
|
||||
__FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
|
||||
if (fVerbose) {
|
||||
fTestData->dump(i);
|
||||
}
|
||||
status = U_INVALID_STATE_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
//
|
||||
// class RBBIMonkeyTest implementation.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
// Constructor. fParams starts out NULL so that it never holds an indeterminate pointer
// before the test framework supplies the user parameters.
RBBIMonkeyTest::RBBIMonkeyTest() : fParams(NULL) {
}
|
||||
|
||||
// Destructor. Has no explicit work to do.
RBBIMonkeyTest::~RBBIMonkeyTest() {}
|
||||
|
||||
|
||||
// params, taken from this->fParams.
|
||||
// rules=file_name Name of file containing the reference rules.
|
||||
// seed=nnnnn Random number starting seed.
|
||||
// Setting the seed allows errors to be reproduced.
|
||||
// loop=nnn Looping count. Controls running time.
|
||||
// -1: run forever.
|
||||
// 0 or greater: run length.
|
||||
// expansions debug option, show expansions of rules and sets.
|
||||
// verbose Display details of the failure.
|
||||
//
|
||||
// Parameters on the intltest command line follow the test name, and are preceded by '@'.
|
||||
// For example,
|
||||
// intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
|
||||
//
|
||||
void RBBIMonkeyTest::testMonkey() {
|
||||
// printf("Test parameters: %s\n", fParams);
|
||||
UnicodeString params(fParams);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
|
||||
"line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
|
||||
NULL };
|
||||
CharString testNameFromParams;
|
||||
if (getStringParam("rules", params, testNameFromParams, status)) {
|
||||
tests[0] = testNameFromParams.data();
|
||||
tests[1] = NULL;
|
||||
}
|
||||
|
||||
int64_t loopCount = quick? 100 : 5000;
|
||||
getIntParam("loop", params, loopCount, status);
|
||||
|
||||
UBool dumpExpansions = FALSE;
|
||||
getBoolParam("expansions", params, dumpExpansions, status);
|
||||
|
||||
UBool verbose = FALSE;
|
||||
getBoolParam("verbose", params, verbose, status);
|
||||
|
||||
int64_t seed = 0;
|
||||
getIntParam("seed", params, seed, status);
|
||||
|
||||
if (params.length() != 0) {
|
||||
// Options processing did not consume all of the parameters. Something unrecognized was present.
|
||||
CharString unrecognizedParameters;
|
||||
unrecognizedParameters.append(CStr(params)(), -1, status);
|
||||
errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
|
||||
return;
|
||||
}
|
||||
|
||||
UVector startedTests(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
|
||||
// Monkey testing is multi-threaded.
|
||||
// Each set of break rules to be tested is run in a separate thread.
|
||||
// Each thread/set of rules gets a separate RBBIMonkeyImpl object.
|
||||
int32_t i;
|
||||
for (i=0; tests[i] != NULL; ++i) {
|
||||
logln("beginning testing of %s", tests[i]);
|
||||
RBBIMonkeyImpl *test = new RBBIMonkeyImpl(status);
|
||||
test->fDumpExpansions = dumpExpansions;
|
||||
test->fVerbose = verbose;
|
||||
test->fRandomGenerator.seed((uint32_t)seed);
|
||||
test->fLoopCount = loopCount;
|
||||
test->setup(tests[i], status);
|
||||
test->startTest();
|
||||
startedTests.addElement(test, status);
|
||||
if (U_FAILURE(status)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
|
||||
}
|
||||
|
||||
for (i=0; i<startedTests.size(); ++i) {
|
||||
RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
|
||||
test->join();
|
||||
delete test;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Look for an integer option of the form "name=nnn" in the parameter string.
//
//   name    Option name; taken by value because it is extended into the match pattern.
//   params  Parameter string. When the option is found, its text (and any trailing
//           comma and spaces) is removed from this string.
//   val     Receives the parsed value when the option is found; otherwise unchanged.
//   status  Receives any error from the regular expression operations.
//   return  TRUE if the option was present.
UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
    name.append(" *= *(-?\\d+) *,? *");
    RegexMatcher m(name, params, 0, status);
    if (!m.find()) {
        return FALSE;
    }

    // The param exists.  Convert the string to an int.
    CharString str;
    str.append(CStr(m.group(1, status))(), -1, status);
    val = strtol(str.data(), NULL, 10);

    // Delete this parameter from the params string.
    m.reset();
    params = m.replaceFirst(UnicodeString(), status);
    return TRUE;
}
|
||||
|
||||
// Look for a string option of the form "name=value" in the parameter string.
// The value extends up to the next space or comma.
//
//   name    Option name; taken by value because it is extended into the match pattern.
//   params  Parameter string. When the option is found, its text (and any trailing
//           comma and spaces) is removed from this string.
//   dest    The option value is appended to this string when the option is found.
//   status  Receives any error from the regular expression operations.
//   return  TRUE if the option was present.
UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
    name.append(" *= *([^ ,]*) *,? *");
    RegexMatcher m(name, params, 0, status);
    if (!m.find()) {
        return FALSE;
    }

    // The param exists.
    dest.append(CStr(m.group(1, status))(), -1, status);

    // Delete this parameter from the params string.
    m.reset();
    params = m.replaceFirst(UnicodeString(), status);
    return TRUE;
}
|
||||
|
||||
// Look for a boolean option in the parameter string, written either as a bare "name"
// or as "name=true" / "name=false" (matched case-insensitively).
//
//   name    Option name; taken by value because it is extended into the match pattern.
//   params  Parameter string. When the option is found, its text (and any trailing
//           comma and spaces) is removed from this string.
//   dest    Receives the option value when the option is found; a bare name means TRUE.
//   status  Receives any error from the regular expression operations.
//   return  TRUE if the option was present.
UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
    name.append("(?: *= *(true|false))? *,? *");
    RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
    if (!m.find()) {
        return FALSE;
    }

    if (m.start(1, status) > 0) {
        // user option included a value.
        dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
    } else {
        // No explicit user value, implies true.
        dest = TRUE;
    }

    // Delete this parameter from the params string.
    m.reset();
    params = m.replaceFirst(UnicodeString(), status);
    return TRUE;
}
|
||||
|
208
icu4c/source/test/intltest/rbbimonkeytest.h
Normal file
208
icu4c/source/test/intltest/rbbimonkeytest.h
Normal file
|
@ -0,0 +1,208 @@
|
|||
/*************************************************************************
|
||||
* Copyright (c) 2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*************************************************************************
|
||||
*/
|
||||
#ifndef RBBIMONKEYTEST_H
|
||||
#define RBBIMONKEYTEST_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "intltest.h"
|
||||
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
#include "simplethread.h"
|
||||
#include "ucbuf.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
|
||||
//
|
||||
// TODO:
|
||||
// Develop a tailoring format.
|
||||
// Hook to old tests that use monkey impl to get expected data.
|
||||
// Remove old tests.
|
||||
|
||||
class BreakRules; // Forward declaration
|
||||
class RBBIMonkeyImpl;
|
||||
|
||||
/**
|
||||
* Test the RuleBasedBreakIterator class giving different rules
|
||||
*/
|
||||
class RBBIMonkeyTest: public IntlTest {
|
||||
public:
|
||||
RBBIMonkeyTest();
|
||||
virtual ~RBBIMonkeyTest();
|
||||
|
||||
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
|
||||
void testMonkey();
|
||||
|
||||
|
||||
private:
|
||||
const char *fParams; // Copy of user parameters passed in from IntlTest.
|
||||
|
||||
|
||||
void testRules(const char *ruleFile);
|
||||
static UBool getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status);
|
||||
static UBool getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status);
|
||||
static UBool getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status);
|
||||
|
||||
};
|
||||
|
||||
// The following classes are internal to the RBBI Monkey Test implementation.
|
||||
|
||||
|
||||
|
||||
// class CharClass Represents a single character class from the source break rules.
|
||||
// Inherits from UObject because instances are adopted by UHashtable, which ultimately
|
||||
// deletes them using hash's object deleter function.
|
||||
|
||||
class CharClass: public UObject {
|
||||
public:
|
||||
UnicodeString fName;
|
||||
UnicodeString fOriginalDef; // set definition as it appeared in user supplied rules.
|
||||
UnicodeString fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively.
|
||||
LocalPointer<const UnicodeSet> fSet;
|
||||
CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
|
||||
fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
|
||||
};
|
||||
|
||||
|
||||
// class BreakRule represents a single rule from a set of break rules.
|
||||
// Each rule has the set definitions expanded, and
|
||||
// is compiled to a regular expression.
|
||||
|
||||
class BreakRule: public UObject {
|
||||
public:
|
||||
BreakRule();
|
||||
~BreakRule();
|
||||
UnicodeString fName; // Name of the rule.
|
||||
UnicodeString fRule; // Rule expression, excluding the name, as written in user source.
|
||||
UnicodeString fExpandedRule; // Rule expression after expanding the set definitions.
|
||||
LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule.
|
||||
};
|
||||
|
||||
|
||||
// class BreakRules represents a complete set of break rules, possibly tailored,
|
||||
// compiled from testdata break rules.
|
||||
|
||||
class BreakRules: public UObject {
|
||||
public:
|
||||
BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
|
||||
~BreakRules();
|
||||
|
||||
void compileRules(UCHARBUF *rules, UErrorCode &status);
|
||||
|
||||
const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
|
||||
|
||||
|
||||
RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance.
|
||||
icu::UVector fBreakRules; // Contents are of type (BreakRule *).
|
||||
|
||||
LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString).
|
||||
// Value is (CharClass *)
|
||||
LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values,
|
||||
// but in a vector so they can be accessed by index.
|
||||
UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined.
|
||||
Locale fLocale;
|
||||
UBreakIteratorType fType;
|
||||
|
||||
CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
|
||||
void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
|
||||
bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
|
||||
RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
|
||||
|
||||
LocalPointer<RegexMatcher> fSetRefsMatcher;
|
||||
LocalPointer<RegexMatcher> fCommentsMatcher;
|
||||
LocalPointer<RegexMatcher> fClassDefMatcher;
|
||||
LocalPointer<RegexMatcher> fRuleDefMatcher;
|
||||
};
|
||||
|
||||
|
||||
// class MonkeyTestData represents a randomly synthesized test data string together
|
||||
// with the expected break positions obtained by applying
|
||||
// the test break rules.
|
||||
|
||||
class MonkeyTestData: public UObject {
|
||||
public:
|
||||
MonkeyTestData() {};
|
||||
~MonkeyTestData() {};
|
||||
void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
|
||||
void clearActualBreaks();
|
||||
void dump(int32_t around = -1) const;
|
||||
|
||||
uint32_t fRandomSeed; // The initial seed value from the random number genererator.
|
||||
const BreakRules *fBkRules; // The break rules used to generate this data.
|
||||
UnicodeString fString; // The text.
|
||||
UnicodeString fExpectedBreaks; // Breaks as found by the reference rules.
|
||||
// Parallel to fString. Non-zero if break preceding.
|
||||
UnicodeString fActualBreaks; // Breaks as found by ICU break iterator.
|
||||
UnicodeString fRuleForPosition; // Index into BreakRules.fBreakRules of rule that applied at each position.
|
||||
// Also parallel to fString.
|
||||
UnicodeString f2ndRuleForPos; // As above. A 2nd rule applies when the preceding rule
|
||||
// didn't cause a break, and a subsequent rule match starts
|
||||
// on the last code point of the preceding match.
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
// class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey
|
||||
// test for one set of break rules.
|
||||
//
|
||||
// When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
|
||||
// between instances of RBBIMonkeyImpl and threads.
|
||||
//
|
||||
class RBBIMonkeyImpl: public UObject {
|
||||
public:
|
||||
RBBIMonkeyImpl(UErrorCode &status);
|
||||
~RBBIMonkeyImpl();
|
||||
|
||||
void setup(const char *ruleFileName, UErrorCode &status);
|
||||
|
||||
void startTest();
|
||||
void runTest();
|
||||
void join();
|
||||
|
||||
LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules.
|
||||
LocalPointer<BreakRules> fRuleSet;
|
||||
LocalPointer<RuleBasedBreakIterator> fBI;
|
||||
LocalPointer<MonkeyTestData> fTestData;
|
||||
IntlTest::icu_rand fRandomGenerator;
|
||||
const char *fRuleFileName;
|
||||
UBool fVerbose; // True to do long dump of failing data.
|
||||
int32_t fLoopCount;
|
||||
|
||||
UBool fDumpExpansions; // Debug flag to output epananded form of rules and sets.
|
||||
|
||||
enum CheckDirection {
|
||||
FORWARD = 1,
|
||||
REVERSE = 2
|
||||
};
|
||||
void clearActualBreaks();
|
||||
void testForwards(UErrorCode &status);
|
||||
void testPrevious(UErrorCode &status);
|
||||
void testFollowing(UErrorCode &status);
|
||||
void testPreceding(UErrorCode &status);
|
||||
void testIsBoundary(UErrorCode &status);
|
||||
void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
|
||||
|
||||
class RBBIMonkeyThread: public SimpleThread {
|
||||
private:
|
||||
RBBIMonkeyImpl *fMonkeyImpl;
|
||||
public:
|
||||
RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {};
|
||||
void run() U_OVERRIDE { fMonkeyImpl->runTest(); };
|
||||
};
|
||||
private:
|
||||
void openBreakRules(const char *fileName, UErrorCode &status);
|
||||
RBBIMonkeyThread fThread;
|
||||
|
||||
};
|
||||
|
||||
#endif // RBBIMONKEYTEST_H
|
|
@ -9,36 +9,36 @@
|
|||
* 01/12/2000 Madhu Updated for changed API and added new tests
|
||||
************************************************************************/
|
||||
|
||||
#include "utypeinfo.h" // for 'typeid' to work
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/numfmt.h"
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "unicode/uniset.h"
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
#include "unicode/regex.h"
|
||||
#endif
|
||||
#include "unicode/schriter.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utext.h"
|
||||
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "intltest.h"
|
||||
#include "rbbitst.h"
|
||||
#include <string.h>
|
||||
#include "charstr.h"
|
||||
#include "utypeinfo.h" // for 'typeid' to work
|
||||
#include "uvector.h"
|
||||
#include "uvectr32.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/numfmt.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
|
||||
#include "unicode/filteredbrk.h"
|
||||
|
@ -56,7 +56,7 @@
|
|||
//---------------------------------------------
|
||||
|
||||
|
||||
// Note: Before adding new tests to this file, check whether the desired test data can
|
||||
// Note: Before adding new tests to this file, check whether the desired test data can
|
||||
// simply be added to the file testdata/rbbitest.txt. In most cases it can,
|
||||
// it's much less work than writing a new test, diagnostic output in the event of failures
|
||||
// is good, and the test data file is shared with ICU4J, so eventually the test
|
||||
|
@ -79,7 +79,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
break;
|
||||
case 2: name = "TestStatusReturn";
|
||||
if(exec) TestStatusReturn(); break;
|
||||
|
||||
|
||||
#if !UCONFIG_NO_FILE_IO
|
||||
case 3: name = "TestUnicodeFiles";
|
||||
if(exec) TestUnicodeFiles(); break;
|
||||
|
@ -117,7 +117,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
#endif
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
|
||||
case 16:
|
||||
case 16:
|
||||
name = "TestMonkey"; if(exec) TestMonkey(params); break;
|
||||
#else
|
||||
case 16:
|
||||
|
@ -323,7 +323,7 @@ void RBBITest::TestStatusReturn() {
|
|||
"$Numbers = [:N:];\n"
|
||||
"$Letters+{1};\n"
|
||||
"$Numbers+{2};\n"
|
||||
"Help\\ {4}/me\\!;\n"
|
||||
"Help\\ /me\\!{4};\n"
|
||||
"[^$Letters $Numbers];\n"
|
||||
"!.*;\n", -1, US_INV);
|
||||
UnicodeString testString1 = "abc123..abc Help me Help me!";
|
||||
|
@ -334,28 +334,27 @@ void RBBITest::TestStatusReturn() {
|
|||
UErrorCode status=U_ZERO_ERROR;
|
||||
UParseError parseError;
|
||||
|
||||
BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
|
||||
LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
|
||||
if(U_FAILURE(status)) {
|
||||
dataerrln("FAIL : in construction - %s", u_errorName(status));
|
||||
} else {
|
||||
int32_t pos;
|
||||
int32_t i = 0;
|
||||
bi->setText(testString1);
|
||||
for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
|
||||
if (pos != bounds1[i]) {
|
||||
errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
|
||||
break;
|
||||
}
|
||||
|
||||
int tag = bi->getRuleStatus();
|
||||
if (tag != brkStatus[i]) {
|
||||
errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
int32_t pos;
|
||||
int32_t i = 0;
|
||||
bi->setText(testString1);
|
||||
for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
|
||||
if (pos != bounds1[i]) {
|
||||
errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
|
||||
break;
|
||||
}
|
||||
|
||||
int tag = bi->getRuleStatus();
|
||||
if (tag != brkStatus[i]) {
|
||||
errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
delete bi;
|
||||
}
|
||||
|
||||
|
||||
|
@ -817,7 +816,7 @@ void RBBITest::TestBug5775() {
|
|||
if (bi == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
UnicodeString s("One.\\u00ad Two.", -1, US_INV);
|
||||
// 01234 56789
|
||||
s = s.unescape();
|
||||
|
@ -869,7 +868,7 @@ struct TestParams {
|
|||
utext_close(textToBreak);
|
||||
delete textMap;
|
||||
}
|
||||
|
||||
|
||||
int32_t getSrcLine(int32_t bp);
|
||||
int32_t getExpectedBreak(int32_t bp);
|
||||
int32_t getSrcCol(int32_t bp);
|
||||
|
@ -901,7 +900,7 @@ static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorC
|
|||
0xfffd, NULL, &status);
|
||||
dest.append(buffer, utf8Length, status);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void TestParams::setUTF16(UErrorCode &status) {
|
||||
textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
|
||||
|
@ -1578,7 +1577,7 @@ void RBBITest::TestDictRules() {
|
|||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// ReadAndConvertFile Read a text data file, convert it to UChars, and
|
||||
// return the datain one big UChar * buffer, which the caller must delete.
|
||||
// return the data in one big UChar * buffer, which the caller must delete.
|
||||
//
|
||||
// parameters:
|
||||
// fileName: the name of the file, with no directory part. The test data directory
|
||||
|
@ -1780,7 +1779,7 @@ void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *
|
|||
}
|
||||
strcpy(testFileName, testDataDirectory);
|
||||
strcat(testFileName, fileName);
|
||||
|
||||
|
||||
logln("Opening data file %s\n", fileName);
|
||||
|
||||
int len;
|
||||
|
@ -1858,7 +1857,7 @@ void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *
|
|||
else if (tokenMatcher.start(4, status) >= 0) {
|
||||
// Scanned to end of a line, possibly skipping over a comment in the process.
|
||||
// If the line from the file contained test data, run the test now.
|
||||
if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
|
||||
if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
|
||||
checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
|
||||
}
|
||||
|
||||
|
@ -2030,6 +2029,10 @@ private:
|
|||
UnicodeSet *fLVTSet;
|
||||
UnicodeSet *fHangulSet;
|
||||
UnicodeSet *fAnySet;
|
||||
UnicodeSet *fEmojiModifierSet;
|
||||
UnicodeSet *fEmojiBaseSet;
|
||||
UnicodeSet *fZWJSet;
|
||||
UnicodeSet *fGAZSet;
|
||||
|
||||
const UnicodeString *fText;
|
||||
};
|
||||
|
@ -2041,8 +2044,8 @@ RBBICharMonkey::RBBICharMonkey() {
|
|||
fText = NULL;
|
||||
|
||||
fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
|
||||
fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
|
||||
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
|
||||
fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
|
||||
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
|
||||
fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
|
||||
fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
|
||||
fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
|
||||
|
@ -2059,6 +2062,18 @@ RBBICharMonkey::RBBICharMonkey() {
|
|||
fHangulSet->addAll(*fLVTSet);
|
||||
fAnySet = new UnicodeSet(0, 0x10ffff);
|
||||
|
||||
|
||||
|
||||
fEmojiBaseSet = new UnicodeSet(UnicodeString(
|
||||
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
|
||||
"\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
|
||||
"\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
|
||||
"\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
|
||||
|
||||
fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
|
||||
fZWJSet = new UnicodeSet(0x200D, 0x200D);
|
||||
fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
|
||||
|
||||
fSets = new UVector(status);
|
||||
fSets->addElement(fCRLFSet, status);
|
||||
fSets->addElement(fControlSet, status);
|
||||
|
@ -2070,6 +2085,10 @@ RBBICharMonkey::RBBICharMonkey() {
|
|||
fSets->addElement(fSpacingSet, status);
|
||||
fSets->addElement(fHangulSet, status);
|
||||
fSets->addElement(fAnySet, status);
|
||||
fSets->addElement(fEmojiBaseSet, status);
|
||||
fSets->addElement(fEmojiModifierSet, status);
|
||||
fSets->addElement(fZWJSet, status);
|
||||
fSets->addElement(fGAZSet, status);
|
||||
if (U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
}
|
||||
|
@ -2090,7 +2109,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
|
|||
int breakPos = -1;
|
||||
|
||||
UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
|
||||
|
||||
|
||||
if (U_FAILURE(deferredStatus)) {
|
||||
return -1;
|
||||
}
|
||||
|
@ -2171,12 +2190,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
|
|||
}
|
||||
|
||||
// Rule (GB8a) Regional_Indicator x Regional_Indicator
|
||||
// Note: The first if condition is a little tricky. We only need to force
|
||||
// a break if there are three or more contiguous RIs. If there are
|
||||
// only two, a break following will occur via other rules, and will include
|
||||
// any trailing extend characters, which is needed behavior.
|
||||
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
|
||||
&& fRegionalIndicatorSet->contains(c2)) {
|
||||
break;
|
||||
}
|
||||
if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (GB9) Numeric x ALetter
|
||||
if (fExtendSet->contains(c2)) {
|
||||
// Rule (GB9) x Extend
|
||||
if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -2190,6 +2217,16 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
|
|||
continue;
|
||||
}
|
||||
|
||||
// Rule (GB9c) Emoji_Base x Emoji_Modifier
|
||||
if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (GB9d) ZWJ x Glue_After_Zwj
|
||||
if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (GB10) Any <break> Any
|
||||
break;
|
||||
}
|
||||
|
@ -2220,6 +2257,10 @@ RBBICharMonkey::~RBBICharMonkey() {
|
|||
delete fLVTSet;
|
||||
delete fHangulSet;
|
||||
delete fAnySet;
|
||||
delete fEmojiBaseSet;
|
||||
delete fEmojiModifierSet;
|
||||
delete fZWJSet;
|
||||
delete fGAZSet;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
|
@ -2245,7 +2286,7 @@ private:
|
|||
UnicodeSet *fKatakanaSet;
|
||||
UnicodeSet *fHebrew_LetterSet;
|
||||
UnicodeSet *fALetterSet;
|
||||
// TODO(jungshik): Do we still need this change?
|
||||
// TODO(jungshik): Do we still need this change?
|
||||
// UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
|
||||
UnicodeSet *fSingle_QuoteSet;
|
||||
UnicodeSet *fDouble_QuoteSet;
|
||||
|
@ -2258,6 +2299,10 @@ private:
|
|||
UnicodeSet *fExtendSet;
|
||||
UnicodeSet *fExtendNumLetSet;
|
||||
UnicodeSet *fDictionaryCjkSet;
|
||||
UnicodeSet *fEBaseSet;
|
||||
UnicodeSet *fEModifierSet;
|
||||
UnicodeSet *fZWSSet;
|
||||
UnicodeSet *fGAZSet;
|
||||
|
||||
const UnicodeString *fText;
|
||||
};
|
||||
|
@ -2275,7 +2320,7 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
|
||||
// Exclude Hangul syllables from ALetterSet during testing.
|
||||
// Leave CJK dictionary characters out from the monkey tests!
|
||||
#if 0
|
||||
#if 0
|
||||
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
|
||||
"[\\p{Line_Break = Complex_Context}"
|
||||
"-\\p{Grapheme_Cluster_Break = Extend}"
|
||||
|
@ -2300,6 +2345,18 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
|
||||
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
|
||||
|
||||
fEBaseSet = new UnicodeSet(UnicodeString(
|
||||
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
|
||||
"\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
|
||||
"\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
|
||||
"\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
|
||||
|
||||
fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
|
||||
fZWSSet = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
|
||||
fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
|
||||
fExtendSet->removeAll(*fZWSSet);
|
||||
|
||||
|
||||
fOtherSet = new UnicodeSet();
|
||||
if(U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
|
@ -2322,6 +2379,11 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fOtherSet->removeAll(*fFormatSet);
|
||||
fOtherSet->removeAll(*fExtendSet);
|
||||
fOtherSet->removeAll(*fRegionalIndicatorSet);
|
||||
fOtherSet->removeAll(*fEBaseSet);
|
||||
fOtherSet->removeAll(*fEModifierSet);
|
||||
fOtherSet->removeAll(*fZWSSet);
|
||||
fOtherSet->removeAll(*fGAZSet);
|
||||
|
||||
// Inhibit dictionary characters from being tested at all.
|
||||
fOtherSet->removeAll(*fDictionaryCjkSet);
|
||||
fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
|
||||
|
@ -2344,6 +2406,11 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fSets->addElement(fOtherSet, status);
|
||||
fSets->addElement(fExtendNumLetSet, status);
|
||||
|
||||
fSets->addElement(fEBaseSet, status);
|
||||
fSets->addElement(fEModifierSet, status);
|
||||
fSets->addElement(fZWSSet, status);
|
||||
fSets->addElement(fGAZSet, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
}
|
||||
|
@ -2362,7 +2429,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
int breakPos = -1;
|
||||
|
||||
UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
|
||||
|
||||
|
||||
if (U_FAILURE(deferredStatus)) {
|
||||
return -1;
|
||||
}
|
||||
|
@ -2392,7 +2459,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
break;
|
||||
};
|
||||
}
|
||||
while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
|
||||
while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
|
||||
|
||||
|
||||
if (p1 == p2) {
|
||||
|
@ -2411,7 +2478,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
if (c1==0x0D && c2==0x0A) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// Rule (3a) Break before and after newlines (including CR and LF)
|
||||
//
|
||||
if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
|
||||
|
@ -2421,6 +2488,15 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
break;
|
||||
};
|
||||
|
||||
// Rule (3c) ZWJ x GAZ (Glue after ZWJ).
|
||||
// Not ignoring extend chars, so peek into input text to
|
||||
// get the potential ZWJ, the character immediately preceding c2.
|
||||
// Sloppy UChar32 indexing: p2-1 may reference trail half
|
||||
// but char32At will get the full code point.
|
||||
if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
|
||||
if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
|
||||
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
|
||||
|
@ -2510,10 +2586,18 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
}
|
||||
|
||||
// Rule 13c
|
||||
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
|
||||
break;
|
||||
}
|
||||
if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule 13d
|
||||
if ((fEBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule 14. Break found here.
|
||||
break;
|
||||
}
|
||||
|
@ -2548,6 +2632,10 @@ RBBIWordMonkey::~RBBIWordMonkey() {
|
|||
delete fRegionalIndicatorSet;
|
||||
delete fDictionaryCjkSet;
|
||||
delete fOtherSet;
|
||||
delete fEBaseSet;
|
||||
delete fEModifierSet;
|
||||
delete fZWSSet;
|
||||
delete fGAZSet;
|
||||
}
|
||||
|
||||
|
||||
|
@ -2933,17 +3021,29 @@ private:
|
|||
UnicodeSet *fHL;
|
||||
UnicodeSet *fID;
|
||||
UnicodeSet *fRI;
|
||||
UnicodeSet *fSA;
|
||||
UnicodeSet *fXX;
|
||||
UnicodeSet *fEB;
|
||||
UnicodeSet *fEM;
|
||||
UnicodeSet *fZJ;
|
||||
|
||||
BreakIterator *fCharBI;
|
||||
const UnicodeString *fText;
|
||||
RegexMatcher *fNumberMatcher;
|
||||
};
|
||||
|
||||
RBBILineMonkey::RBBILineMonkey() :
|
||||
RBBIMonkeyKind(),
|
||||
fSets(NULL),
|
||||
|
||||
fCharBI(NULL),
|
||||
fText(NULL),
|
||||
fNumberMatcher(NULL)
|
||||
|
||||
RBBILineMonkey::RBBILineMonkey()
|
||||
{
|
||||
if (U_FAILURE(deferredStatus)) {
|
||||
return;
|
||||
}
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
fSets = new UVector(status);
|
||||
|
@ -2985,24 +3085,35 @@ RBBILineMonkey::RBBILineMonkey()
|
|||
fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
|
||||
fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
|
||||
fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
|
||||
fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
|
||||
fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
|
||||
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
|
||||
fEB = new UnicodeSet(UnicodeString(
|
||||
"[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
|
||||
"\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
|
||||
"\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
|
||||
"\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
|
||||
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
|
||||
fZJ = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
fCharBI = NULL;
|
||||
fNumberMatcher = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
fAL->addAll(*fXX); // Default behavior for XX is identical to AL
|
||||
fAL->addAll(*fAI); // Default behavior for AI is identical to AL
|
||||
fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
|
||||
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
|
||||
|
||||
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
|
||||
|
||||
fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID.
|
||||
fID->addAll(*fEM);
|
||||
fAL->removeAll(*fEM);
|
||||
|
||||
|
||||
fAL->remove((UChar32)0x2764); // Emoji Proposal: move u2764 from Al to Id
|
||||
fID->add((UChar32)0x2764);
|
||||
|
||||
fSets->addElement(fBK, status);
|
||||
fSets->addElement(fCR, status);
|
||||
fSets->addElement(fLF, status);
|
||||
|
@ -3040,10 +3151,12 @@ RBBILineMonkey::RBBILineMonkey()
|
|||
fSets->addElement(fID, status);
|
||||
fSets->addElement(fWJ, status);
|
||||
fSets->addElement(fRI, status);
|
||||
fSets->addElement(fSA, status);
|
||||
fSets->addElement(fSG, status);
|
||||
fSets->addElement(fEB, status);
|
||||
fSets->addElement(fEM, status);
|
||||
fSets->addElement(fZJ, status);
|
||||
|
||||
const char *rules =
|
||||
const char *rules =
|
||||
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
|
||||
"((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
|
||||
"\\p{Line_Break=NU}\\p{Line_Break=CM}*"
|
||||
|
@ -3228,6 +3341,18 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
break;
|
||||
}
|
||||
|
||||
// LB 8a ZJ x ID
|
||||
// The monkey test's way of ignoring combining characters doesn't work
|
||||
// for this rule. ZJ is also a CM. Need to get the actual character
|
||||
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
|
||||
{
|
||||
int32_t prevIdx = fText->moveIndex32(pos, -1);
|
||||
UChar32 prevC = fText->char32At(prevIdx);
|
||||
if (fZJ->contains(prevC) && fID->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// LB 9, 10 Already done, at top of loop.
|
||||
//
|
||||
|
||||
|
@ -3245,7 +3370,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
if (fGL->contains(prevChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 12a
|
||||
// [^SP BA HY] x GL
|
||||
if (!(fSP->contains(prevChar) ||
|
||||
|
@ -3368,7 +3493,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
|
||||
// LB 21a
|
||||
// HL (HY | BA) x
|
||||
if (fHL->contains(prevCharX2) &&
|
||||
if (fHL->contains(prevCharX2) &&
|
||||
(fHY->contains(prevChar) || fBA->contains(prevChar))) {
|
||||
continue;
|
||||
}
|
||||
|
@ -3495,12 +3620,20 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LB30a Do not break between regional indicators.
|
||||
// RI x RI
|
||||
// LB30a RI RI <break> RI
|
||||
// RI x RI
|
||||
if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
|
||||
break;
|
||||
}
|
||||
if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB30b Emoji Base x Emoji Modifier
|
||||
if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 31 Break everywhere else
|
||||
break;
|
||||
|
||||
|
@ -3555,9 +3688,10 @@ RBBILineMonkey::~RBBILineMonkey() {
|
|||
delete fHL;
|
||||
delete fID;
|
||||
delete fRI;
|
||||
delete fSA;
|
||||
delete fSG;
|
||||
delete fXX;
|
||||
delete fEB;
|
||||
delete fEM;
|
||||
delete fZJ;
|
||||
|
||||
delete fCharBI;
|
||||
delete fNumberMatcher;
|
||||
|
@ -3577,6 +3711,9 @@ RBBILineMonkey::~RBBILineMonkey() {
|
|||
//
|
||||
// type = char | word | line | sent | title
|
||||
//
|
||||
// Example:
|
||||
// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
|
||||
//
|
||||
//-------------------------------------------------------------------------------------------
|
||||
|
||||
static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
|
||||
|
@ -3853,7 +3990,6 @@ void RBBITest::TestLineBreaks(void)
|
|||
"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
|
||||
"\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
|
||||
"\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
|
||||
"\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
|
||||
"\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
|
||||
"\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
|
||||
"\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
|
||||
|
@ -3869,25 +4005,19 @@ void RBBITest::TestLineBreaks(void)
|
|||
"\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
|
||||
"\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
|
||||
"\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
|
||||
"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
|
||||
"\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
|
||||
"\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
|
||||
"\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
|
||||
"\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
|
||||
"\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
|
||||
"\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
|
||||
"\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
|
||||
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
|
||||
"\\u2014\\u0020\\u000a\\u17c5\\u24fc",
|
||||
"\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
|
||||
"\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
|
||||
"\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
|
||||
"\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
|
||||
"\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
|
||||
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
|
||||
"\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
|
||||
"\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
|
||||
"\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
|
||||
"\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
|
||||
"\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
|
||||
};
|
||||
|
@ -4175,9 +4305,15 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
|
|||
int32_t charIdx = m_rand() % classSet->size();
|
||||
UChar32 c = classSet->charAt(charIdx);
|
||||
if (c < 0) { // TODO: deal with sets containing strings.
|
||||
errln("c < 0");
|
||||
errln("%s:%d c < 0", __FILE__, __LINE__);
|
||||
break;
|
||||
}
|
||||
// Do not assemble a supplementary character from randomly generated separate surrogates.
|
||||
// (It could be a dictionary character)
|
||||
if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
testText.append(c);
|
||||
}
|
||||
|
||||
|
@ -4284,7 +4420,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
|
|||
} else {
|
||||
if (breakPos >= 0) {
|
||||
precedingBreaks[breakPos] = 1;
|
||||
}
|
||||
}
|
||||
lastBreakPos = breakPos;
|
||||
}
|
||||
}
|
||||
|
@ -4379,7 +4515,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
|
|||
errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
|
||||
charErrorTxt[sizeof(charErrorTxt)-1] = 0;
|
||||
const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
|
||||
|
||||
|
||||
errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
|
||||
name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
|
||||
errorType, seed, i, charErrorTxt);
|
||||
|
@ -4402,15 +4538,15 @@ void RBBITest::TestBug5532(void) {
|
|||
// Text includes a mixture of Thai and Latin.
|
||||
const unsigned char utf8Data[] = {
|
||||
0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
|
||||
0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
|
||||
0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
|
||||
0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
|
||||
0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
|
||||
0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
|
||||
0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
|
||||
0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
|
||||
0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
|
||||
0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
|
||||
0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
|
||||
0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
|
||||
0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
|
||||
0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
|
||||
0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
|
||||
0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
|
||||
0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
@ -4467,7 +4603,7 @@ void RBBITest::TestBug9983(void) {
|
|||
rstatus = brkiter->getRuleStatus();
|
||||
(void)rstatus; // Suppress set but not used warning.
|
||||
if (iterationCount >= 10) {
|
||||
break;
|
||||
break;
|
||||
}
|
||||
}
|
||||
TEST_ASSERT(iterationCount == 6);
|
||||
|
@ -4480,7 +4616,7 @@ void RBBITest::TestBug9983(void) {
|
|||
rstatus = brkiterPOSIX->getRuleStatus();
|
||||
(void)rstatus; // Suppress set but not used warning.
|
||||
if (iterationCount >= 10) {
|
||||
break;
|
||||
break;
|
||||
}
|
||||
}
|
||||
TEST_ASSERT(iterationCount == 6);
|
||||
|
|
13
icu4c/source/test/testdata/GraphemeBreakTest.txt
vendored
13
icu4c/source/test/testdata/GraphemeBreakTest.txt
vendored
|
@ -1,5 +1,6 @@
|
|||
# GraphemeBreakTest-8.0.0.txt
|
||||
# Date: 2015-02-13, 13:47:15 GMT [MD]
|
||||
# Hand patched for Emoji breaking proposal L2/16-011R3.
|
||||
#
|
||||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2015 Unicode, Inc.
|
||||
|
@ -9,9 +10,9 @@
|
|||
# Default Grapheme Break Test
|
||||
#
|
||||
# Format:
|
||||
# <string> (# <comment>)?
|
||||
# <string> contains hex Unicode code points, with
|
||||
# ÷ wherever there is a break opportunity, and
|
||||
# <string> (# <comment>)?
|
||||
# <string> contains hex Unicode code points, with
|
||||
# ÷ wherever there is a break opportunity, and
|
||||
# × wherever there is not.
|
||||
# <comment> the format can change, but currently it shows:
|
||||
# - the sample character name
|
||||
|
@ -414,10 +415,10 @@
|
|||
÷ D800 ÷ 0308 ÷ D800 ÷ # ÷ [0.2] <surrogate-D800> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <surrogate-D800> (Control) ÷ [0.3]
|
||||
÷ 0061 ÷ 1F1E6 ÷ 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) ÷ [999.0] LATIN SMALL LETTER B (Other) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA × 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA ÷ 200B ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [5.0] ZERO WIDTH SPACE (Control) ÷ [4.0] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1E6 × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1E6 × 200D ÷ 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [8.1] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 0020 × 200D ÷ 0646 ÷ # ÷ [0.2] SPACE (Other) × [9.0] ZERO WIDTH JOINER (Extend) ÷ [999.0] ARABIC LETTER NOON (Other) ÷ [0.3]
|
||||
|
|
12
icu4c/source/test/testdata/LineBreakTest.txt
vendored
12
icu4c/source/test/testdata/LineBreakTest.txt
vendored
|
@ -1,5 +1,6 @@
|
|||
# LineBreakTest-8.0.0.txt
|
||||
# Date: 2015-04-30, 09:40:15 GMT [MD]
|
||||
# Hand patched for Emoji break proposal L2/16-011R3
|
||||
#
|
||||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2015 Unicode, Inc.
|
||||
|
@ -6324,13 +6325,14 @@
|
|||
× 3057 × 3001 ÷ 0061 × 0062 ÷ 3068 ÷ # × [0.3] HIRAGANA LETTER SI (ID) × [13.02] IDEOGRAPHIC COMMA (CL) ÷ [999.0] LATIN SMALL LETTER A (AL) × [28.0] LATIN SMALL LETTER B (AL) ÷ [999.0] HIRAGANA LETTER TO (ID) ÷ [0.3]
|
||||
× 0061 ÷ 1F1E6 ÷ 0062 ÷ # × [0.3] LATIN SMALL LETTER A (AL) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [999.0] LATIN SMALL LETTER B (AL) ÷ [0.3]
|
||||
× 1F1F7 × 1F1FA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [0.3]
|
||||
× 1F1F7 × 1F1FA × 1F1F8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) ÷ [0.3]
|
||||
× 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
|
||||
× 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) ÷ [0.3]
|
||||
× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
|
||||
× 1F1F7 × 1F1FA × 200B ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [8.0] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]
|
||||
× 05D0 × 002D × 05D0 ÷ # × [0.3] HEBREW LETTER ALEF (HL) × [21.02] HYPHEN-MINUS (HY) × [21.1] HEBREW LETTER ALEF (HL) ÷ [0.3]
|
||||
× 1F1E6 × 1F1E7 × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
|
||||
× 1F1E6 × 200D × 1F1E7 × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
|
||||
× 1F1E6 × 1F1E7 × 200D × 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
|
||||
× 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
|
||||
# Patched the following two lines for RI pairing. Note ZWJ behaves as CM and logically disappears.
|
||||
× 1F1E6 × 200D × 1F1E7 ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
|
||||
× 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER A (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER B (RI) × [9.0] ZERO WIDTH JOINER (CM) × [30.11] REGIONAL INDICATOR SYMBOL LETTER C (RI) ÷ [0.3]
|
||||
× 0020 ÷ 200D × 0646 ÷ # × [0.3] SPACE (SP) ÷ [18.0] ZERO WIDTH JOINER (CM) × [28.0] ARABIC LETTER NOON (AL) ÷ [0.3]
|
||||
× 0646 × 200D × 0020 ÷ # × [0.3] ARABIC LETTER NOON (AL) × [9.0] ZERO WIDTH JOINER (CM) × [7.01] SPACE (SP) ÷ [0.3]
|
||||
#
|
||||
|
|
12
icu4c/source/test/testdata/WordBreakTest.txt
vendored
12
icu4c/source/test/testdata/WordBreakTest.txt
vendored
|
@ -1,5 +1,7 @@
|
|||
# WordBreakTest-8.0.0.txt
|
||||
# Date: 2015-05-02, 14:48:55 GMT [MD]
|
||||
|
||||
# Hand Patched for Emoji breaking proposal L2/16-011R3
|
||||
#
|
||||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2015 Unicode, Inc.
|
||||
|
@ -1392,13 +1394,13 @@
|
|||
÷ 2060 ÷ 0043 × 2060 × 002E × 2060 × 0044 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) ÷ [999.0] LATIN CAPITAL LETTER C (ALetter) × [4.0] WORD JOINER (Format_FE) × [6.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [7.0] LATIN CAPITAL LETTER D (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]
|
||||
÷ 0061 ÷ 1F1E6 ÷ 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) ÷ [999.0] LATIN SMALL LETTER B (ALetter) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA × 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA × 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA ÷ 1F1F8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1F7 × 1F1FA ÷ 200B ÷ 1F1F8 × 1F1EA ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER R (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER U (Regional_Indicator) ÷ [999.0] ZERO WIDTH SPACE (Other) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER S (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER E (Regional_Indicator) ÷ [0.3]
|
||||
÷ 05D0 × 0022 × 05D0 ÷ # ÷ [0.2] HEBREW LETTER ALEF (Hebrew_Letter) × [7.2] QUOTATION MARK (Double_Quote) × [7.3] HEBREW LETTER ALEF (Hebrew_Letter) ÷ [0.3]
|
||||
÷ 1F1E6 × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1E6 × 200D × 1F1E7 × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1E6 × 1F1E7 × 200D × 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1E6 × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1E6 × 200D × 1F1E7 ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 1F1E6 × 1F1E7 × 200D ÷ 1F1E8 ÷ # ÷ [0.2] REGIONAL INDICATOR SYMBOL LETTER A (Regional_Indicator) × [13.3] REGIONAL INDICATOR SYMBOL LETTER B (Regional_Indicator) × [4.0] ZERO WIDTH JOINER (Extend_FE) × [13.3] REGIONAL INDICATOR SYMBOL LETTER C (Regional_Indicator) ÷ [0.3]
|
||||
÷ 0020 × 200D ÷ 0646 ÷ # ÷ [0.2] SPACE (Other) × [4.0] ZERO WIDTH JOINER (Extend_FE) ÷ [999.0] ARABIC LETTER NOON (ALetter) ÷ [0.3]
|
||||
÷ 0646 × 200D ÷ 0020 ÷ # ÷ [0.2] ARABIC LETTER NOON (ALetter) × [4.0] ZERO WIDTH JOINER (Extend_FE) ÷ [999.0] SPACE (Other) ÷ [0.3]
|
||||
÷ 0031 ÷ 003A ÷ 003A ÷ 0031 ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [999.0] COLON (MidLetter) ÷ [999.0] DIGIT ONE (Numeric) ÷ [0.3]
|
||||
|
|
60
icu4c/source/test/testdata/break_rules/grapheme.txt
vendored
Normal file
60
icu4c/source/test/testdata/break_rules/grapheme.txt
vendored
Normal file
|
@ -0,0 +1,60 @@
|
|||
#
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: grapheme.txt
|
||||
#
|
||||
# Reference Grapheme Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
||||
type = grapheme; # one of grapheme | word | line | sentence
|
||||
locale = en;
|
||||
|
||||
CR = [\u000d];
|
||||
LF = [\u000a];
|
||||
|
||||
Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]];
|
||||
Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]];
|
||||
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
|
||||
Prepend = [];
|
||||
SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
|
||||
E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
E_Modifier = [\U0001F3FB-\U0001F3FF];
|
||||
GAZ = [\U0001F455-\U0001F469\U0001F48B\U0001F5E8\u2764];
|
||||
ZWJ = [\u200D];
|
||||
|
||||
#
|
||||
# Korean Syllable Definitions
|
||||
#
|
||||
L = [\p{Grapheme_Cluster_Break = L}];
|
||||
V = [\p{Grapheme_Cluster_Break = V}];
|
||||
T = [\p{Grapheme_Cluster_Break = T}];
|
||||
|
||||
LV = [\p{Grapheme_Cluster_Break = LV}];
|
||||
LVT = [\p{Grapheme_Cluster_Break = LVT}];
|
||||
|
||||
GB3: CR LF;
|
||||
GB4: (Control | CR | LF) ÷;
|
||||
GB5: . ÷ (Control | CR | LF);
|
||||
|
||||
GB6: L (L | V | LV | LVT);
|
||||
GB7: (LV | V) (V | T);
|
||||
GB8: (LVT | T) T;
|
||||
|
||||
# Regional Indicators, split into pairs.
|
||||
# Note that a pair of RIs that is not followed by a third RI will fall into
|
||||
# the normal rules for Extend, etc.
|
||||
#
|
||||
GB8a.1: Regional_Indicator Regional_Indicator ÷ Regional_Indicator;
|
||||
GB8a.2: Regional_Indicator Regional_Indicator;
|
||||
|
||||
GB9: . Extend;
|
||||
|
||||
GB9a: . SpacingMark;
|
||||
GB9b: Prepend .;
|
||||
GB9c: (E_Base | GAZ) E_Modifier;
|
||||
GB9d: ZWJ GAZ;
|
||||
|
||||
GB10: . ÷;
|
196
icu4c/source/test/testdata/break_rules/line.txt
vendored
Normal file
196
icu4c/source/test/testdata/break_rules/line.txt
vendored
Normal file
|
@ -0,0 +1,196 @@
|
|||
#
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: line.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
||||
|
||||
type = line;
|
||||
locale = en;
|
||||
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
|
||||
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [[:LineBreak = Ideographic:][\u2764]];
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NS = [[:LineBreak = Nonstarter:] CJ];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZJ = [\u200D];
|
||||
|
||||
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
|
||||
ID = [ID - EB];
|
||||
AL = [AL - EM];
|
||||
|
||||
dictionary = [:LineBreak = Complex_Context:];
|
||||
|
||||
# Redefine AL. LB1. TODO: refine according to latest UAX.
|
||||
AL = [ AL AI SA SG XX ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it,
|
||||
# "OP CM SP", matches LB7.1.
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZJ (ID | EB | EM);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
|
||||
# ZJ acts like a CM to the left, combining with CB.
|
||||
# ZJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZJ (ID | EB | EM);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: PO CM* (AL | HL);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZJ (ID | EB | EM);
|
||||
LB31.2: . CM* ÷;
|
204
icu4c/source/test/testdata/break_rules/line_loose.txt
vendored
Normal file
204
icu4c/source/test/testdata/break_rules/line_loose.txt
vendored
Normal file
|
@ -0,0 +1,204 @@
|
|||
#
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
|
||||
# Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN
|
||||
|
||||
type = line;
|
||||
locale = en@lb=loose;
|
||||
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
|
||||
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [[:LineBreak = Ideographic:] CJ [\u2764]];
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
|
||||
NS = [[:LineBreak = Nonstarter:] - NSX];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZJ = [\u200D];
|
||||
|
||||
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
|
||||
ID = [ID - EB];
|
||||
AL = [AL - EM];
|
||||
|
||||
dictionary = [:LineBreak = Complex_Context:];
|
||||
|
||||
# Redefine AL. LB1. TODO: refine according to latest UAX.
|
||||
AL = [ AL AI SA SG XX ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it,
|
||||
# "OP CM SP", matches LB7.1.
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZJ (ID | EB | EM);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
|
||||
# ZJ acts like a CM to the left, combining with CB.
|
||||
# ZJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZJ (ID | EB | EM);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: PO CM* (AL | HL);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZJ (ID | EB | EM);
|
||||
LB31.2: . CM* ÷;
|
225
icu4c/source/test/testdata/break_rules/line_loose_cj.txt
vendored
Normal file
225
icu4c/source/test/testdata/break_rules/line_loose_cj.txt
vendored
Normal file
|
@ -0,0 +1,225 @@
|
|||
#
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN such as 2026
|
||||
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
|
||||
# FF65 (all NS) and FF01, FF1F (both EX).
|
||||
# * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
|
||||
# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
|
||||
# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
|
||||
# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
|
||||
|
||||
|
||||
type = line;
|
||||
locale = ja@lb=loose;
|
||||
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
|
||||
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
EXX = [\uFF01 \uFF1F];
|
||||
EX = [[:LineBreak = Exclamation:] - EXX];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [[:LineBreak = Ideographic:][\u2764]CJ];
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
|
||||
NS = [[:LineBreak = Nonstarter:] - NSX];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
|
||||
PO = [[:LineBreak = Postfix_Numeric:] - POX];
|
||||
PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
|
||||
PR = [[:LineBreak = Prefix_Numeric:] - PRX];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZJ = [\u200D];
|
||||
|
||||
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
|
||||
ID = [ID - EB];
|
||||
AL = [AL - EM];
|
||||
|
||||
dictionary = [:LineBreak = Complex_Context:];
|
||||
|
||||
# Redefine AL. LB1. TODO: refine according to latest UAX.
|
||||
AL = [ AL AI SA SG XX ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it,
|
||||
# "OP CM SP", matches LB7.1.
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZJ (ID | EB | EM);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA BAX HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
|
||||
# ZJ acts like a CM to the left, combining with CB.
|
||||
# ZJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZJ (ID | EB | EM);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
|
||||
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: (PO | POX) CM* (AL | HL);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
# Loose_cj tailoring: do not include $PRX at the beginning or $POX at the end.
|
||||
LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PRX | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZJ (ID | EB | EM);
|
||||
LB31.2: . CM* ÷;
|
210
icu4c/source/test/testdata/break_rules/line_normal.txt
vendored
Normal file
210
icu4c/source/test/testdata/break_rules/line_normal.txt
vendored
Normal file
|
@ -0,0 +1,210 @@
|
|||
#
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
|
||||
# Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
|
||||
|
||||
type = line;
|
||||
locale = en@lb=normal;
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
|
||||
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [[:LineBreak = Ideographic:] CJ [\u2764]];
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NS = [:LineBreak = Nonstarter:];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZJ = [\u200D];
|
||||
|
||||
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
|
||||
ID = [ID - EB];
|
||||
AL = [AL - EM];
|
||||
|
||||
dictionary = [:LineBreak = Complex_Context:];
|
||||
|
||||
# Redefine AL. LB1. TODO: refine according to latest UAX.
|
||||
AL = [ AL AI SA SG XX ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14,
|
||||
# while only the prefix "OP CM SP" matches LB7.1
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZJ (ID | EB | EM);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
|
||||
# ZJ acts like a CM to the left, combining with CB.
|
||||
# ZJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZJ (ID | EB | EM);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: PO CM* (AL | HL);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZJ (ID | EB | EM);
|
||||
LB31.2: . CM* ÷;
|
218
icu4c/source/test/testdata/break_rules/line_normal_cj.txt
vendored
Normal file
218
icu4c/source/test/testdata/break_rules/line_normal_cj.txt
vendored
Normal file
|
@ -0,0 +1,218 @@
|
|||
# Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
|
||||
type = line;
|
||||
locale = ja@lb=normal;
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [[:LineBreak = Alphabetic:]-[\u2764]];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
|
||||
EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
EM = [\U0001F3FB-\U0001F3FF];
|
||||
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [[:LineBreak = Ideographic:] CJ [\u2764]];
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NSX = [\u301C \u30A0];
|
||||
NS = [[:LineBreak = Nonstarter:] - NSX];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZJ = [\u200D];
|
||||
|
||||
# TODO: adjustment to sets needed only until Unicode properties are updated for Emoji.
|
||||
ID = [ID - EB];
|
||||
AL = [AL - EM];
|
||||
|
||||
dictionary = [:LineBreak = Complex_Context:];
|
||||
|
||||
# Redefine AL. LB1. TODO: refine according to latest UAX.
|
||||
AL = [ AL AI SA SG XX ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14,
|
||||
# while only the prefix "OP CM SP" matches LB7.1
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
|
||||
# Do not break between closing punctuation and $NS, even with intervening spaces
|
||||
# But DO allow a break between closing punctuation and $NSX, don't include it here
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZJ (ID | EB | EM);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA BAX HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZJ x ID is tricky because CM includes ZJ.
|
||||
# ZJ acts like a CM to the left, combining with CB.
|
||||
# ZJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZJ (ID | EB | EM);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
|
||||
# should "HL BAX" not break when followed by a CB? That's what the current
|
||||
# rules do, which is why "[^CM CB]?" includes the ?.
|
||||
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
|
||||
|
||||
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (ID | EB | EM) CM* PO;
|
||||
LB23.2: (AL | HL | CM) CM* NU;
|
||||
LB23.3: NU CM* (AL | HL);
|
||||
|
||||
LB24.1: PR CM* (ID | EB | EM);
|
||||
LB24.2: PR CM* (AL | HL);
|
||||
LB24.3: PO CM* (AL | HL);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZJ (ID | EB | EM);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZJ (ID | EB | EM);
|
||||
LB31.2: . CM* ÷;
|
69
icu4c/source/test/testdata/break_rules/readme.txt
vendored
Normal file
69
icu4c/source/test/testdata/break_rules/readme.txt
vendored
Normal file
|
@ -0,0 +1,69 @@
|
|||
file: testdata/break_rules/readme.txt
|
||||
Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
This directory contains the break iterator reference rule files used by intltest rbbi/RBBIMonkeyTest/testMonkey.
|
||||
The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpreted
|
||||
to provide an expected set of boundary positions to compare with the results from ICU break iteration.
|
||||
|
||||
Each set of reference break rules lives in a separate file.
|
||||
The list of rule files to run by default is hardcoded into the test code, in rbbimonkeytest.cpp.
|
||||
|
||||
Each test file includes
|
||||
- The type of ICU break iterator to create (word, line, sentence, etc.)
|
||||
- The locale to use
|
||||
- Character Class definitions
|
||||
- Rule definitions
|
||||
|
||||
To Do
|
||||
- Syntax for tailoring.
|
||||
|
||||
|
||||
Character Class Definition:
|
||||
name = set_regular_expression;
|
||||
|
||||
Rule Definition:
|
||||
rule_regular_expression;
|
||||
|
||||
name:
|
||||
[A-Za-z_][A-Za-z0-9_]*
|
||||
|
||||
set_regular_expression:
|
||||
The intersection of an ICU regular expression [set] expression and a UnicodeSet pattern.
|
||||
(They are mostly the same)
|
||||
May include previously defined set names, which are logically expanded in-place.
|
||||
|
||||
rule_regular_expression:
|
||||
An ICU Regular Expression.
|
||||
May include set names, which are logically expanded in-place.
|
||||
May include a '÷', which defines a boundary position.
|
||||
|
||||
Application of the rules:
|
||||
Matching begins at the start of text, or after a previously identified boundary.
|
||||
The pseudo-code below finds the next boundary.
|
||||
|
||||
while position < end of text
|
||||
for each rule
|
||||
if the text at position matches this rule
|
||||
if the rule has a '÷'
|
||||
Boundary is found.
|
||||
return the position of the '÷' within the match.
|
||||
else
|
||||
position = last character of the rule match.
|
||||
break from the rule loop, continue the outer loop.
|
||||
|
||||
This differs from the Unicode UAX algorithm in that each position in the text is
|
||||
not tested separately. Instead, when a rule match is found, rule application restarts with the last
|
||||
character of the preceding rule match. ICU's break rules also operate this way.
|
||||
|
||||
Expressing rules this way simplifies UAX rules that have leading or trailing context; it
|
||||
is no longer necessary to write expressions that match the context starting from
|
||||
any position within it.
|
||||
|
||||
This rule form differs from ICU rules in that the rules are applied sequentially, as they
|
||||
are with the Unicode UAX rules. With the main ICU break rules, all are applied in parallel.
|
||||
|
||||
Word Dictionaries
|
||||
The monkey test does not test dictionary based breaking. The set named 'dictionary' is special,
|
||||
as it is in the main ICU rules. For the monkey test, no characters from the dictionary set are
|
||||
included in the randomly-generated test data.
|
||||
|
43
icu4c/source/test/testdata/break_rules/sentence.txt
vendored
Normal file
43
icu4c/source/test/testdata/break_rules/sentence.txt
vendored
Normal file
|
@ -0,0 +1,43 @@
|
|||
type = sentence; # one of grapheme | word | line | sentence
|
||||
locale = en;
|
||||
|
||||
CR = [\p{Sentence_Break = CR}];
|
||||
LF = [\p{Sentence_Break = LF}];
|
||||
Extend = [\p{Sentence_Break = Extend}];
|
||||
Sep = [\p{Sentence_Break = Sep}];
|
||||
Format = [\p{Sentence_Break = Format}];
|
||||
Sp = [\p{Sentence_Break = Sp}];
|
||||
Lower = [\p{Sentence_Break = Lower}];
|
||||
Upper = [\p{Sentence_Break = Upper}];
|
||||
OLetter = [\p{Sentence_Break = OLetter}];
|
||||
Numeric = [\p{Sentence_Break = Numeric}];
|
||||
ATerm = [\p{Sentence_Break = ATerm}];
|
||||
SContinue = [\p{Sentence_Break = SContinue}];
|
||||
STerm = [\p{Sentence_Break = STerm}];
|
||||
Close = [\p{Sentence_Break = Close}];
|
||||
|
||||
ParaSep = [Sep CR LF];
|
||||
SATerm = [STerm ATerm];
|
||||
ExtFmt = [Extend Format];
|
||||
|
||||
# SB2: ÷ eot
|
||||
# Conventional regular expression matching for '$' as end-of-text also matches
|
||||
# at a line separator just preceding the physical end of text.
|
||||
# Instead, use a look-ahead assertion that there is no following character.
|
||||
SB2: . ÷ (?!.);
|
||||
|
||||
SB3: CR LF;
|
||||
SB4: ParaSep ÷;
|
||||
|
||||
# SB5: ignore Format and Extend characters.
|
||||
|
||||
SB6: ATerm ExtFmt* Numeric;
|
||||
SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
|
||||
SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
|
||||
SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);
|
||||
|
||||
SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
|
||||
# Also covers SB10, SB11.
|
||||
|
||||
SB12: . ExtFmt* [^ExtFmt]?;
|
||||
|
97
icu4c/source/test/testdata/break_rules/word.txt
vendored
Normal file
97
icu4c/source/test/testdata/break_rules/word.txt
vendored
Normal file
|
@ -0,0 +1,97 @@
|
|||
#
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: word.txt
|
||||
#
|
||||
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
||||
|
||||
type = word; # one of grapheme | word | line | sentence
|
||||
locale = en;
|
||||
|
||||
E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
E_Modifier = [\U0001F3FB-\U0001F3FF];
|
||||
ZWJ = [\u200D];
|
||||
GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
|
||||
|
||||
CR = [\p{Word_Break = CR}];
|
||||
LF = [\p{Word_Break = LF}];
|
||||
Newline = [\p{Word_Break = Newline}];
|
||||
Extend = [[[\p{Word_Break = Extend}][:Block=Tags:]]-ZWJ];
|
||||
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
Format = [[\p{Word_Break = Format}]-[:Block=Tags:]];
|
||||
Katakana = [\p{Word_Break = Katakana}];
|
||||
Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
ALetter = [\p{Word_Break = ALetter}];
|
||||
Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
MidLetter = [\p{Word_Break = MidLetter}];
|
||||
MidNum = [\p{Word_Break = MidNum}];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
|
||||
# define dictionary, with the effect being that those characters don't appear in test data.
|
||||
|
||||
Han = [:Han:];
|
||||
Hiragana = [:Hiragana:];
|
||||
|
||||
Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
HangulSyllable = [\uac00-\ud7a3];
|
||||
ComplexContext = [:LineBreak = Complex_Context:];
|
||||
KanaKanji = [Han Hiragana Katakana];
|
||||
dictionaryCJK = [KanaKanji HangulSyllable];
|
||||
dictionary = [ComplexContext dictionaryCJK];
|
||||
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
# Tricky. Redefine a set.
|
||||
# For tailorings, if it modifies itself, do at end of sets ????
|
||||
# Tweak redefine to mean replace existing definition at its original location.
|
||||
# Insert defs without redefine just after last pre-existing def of that name.
|
||||
# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
|
||||
|
||||
ALetter = [ALetter - dictionary];
|
||||
|
||||
AHLetter = [ALetter Hebrew_Letter];
|
||||
MidNumLetQ = [MidNumLet Single_Quote];
|
||||
ExtFmt = [Extend Format ZWJ];
|
||||
|
||||
WB3: CR LF;
|
||||
WB3a: (Newline | CR | LF) ÷;
|
||||
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
|
||||
# (but needed with UAX treat-as scheme.)
|
||||
WB3c: ZWJ GAZ;
|
||||
|
||||
WB5: AHLetter ExtFmt* AHLetter;
|
||||
|
||||
# includes both WB6 and WB7
|
||||
WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
|
||||
|
||||
WB7a: Hebrew_Letter ExtFmt* Single_Quote;
|
||||
WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
|
||||
|
||||
WB8: Numeric ExtFmt* Numeric;
|
||||
WB9: AHLetter ExtFmt* Numeric;
|
||||
WB10: Numeric ExtFmt* AHLetter;
|
||||
|
||||
WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
|
||||
WB13: Katakana ExtFmt* Katakana;
|
||||
|
||||
WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
|
||||
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
|
||||
|
||||
# WB rule 13c, pairs of Regional Indicators stay unbroken.
|
||||
# Interacts with WB3c.
|
||||
WB13c.1: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ GAZ;
|
||||
WB13c.2: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
|
||||
|
||||
WB13d: (E_Base | GAZ) ExtFmt* E_Modifier;
|
||||
|
||||
# Rule WB 14 Any ÷ Any
|
||||
# Interacts with WB3c, do not break between ZWJ and GAZ.
|
||||
WB14.1: . ExtFmt* ZWJ GAZ;
|
||||
WB14.2: . ExtFmt* ÷;
|
||||
|
96
icu4c/source/test/testdata/break_rules/word_POSIX.txt
vendored
Normal file
96
icu4c/source/test/testdata/break_rules/word_POSIX.txt
vendored
Normal file
|
@ -0,0 +1,96 @@
|
|||
#
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: word_POSIX.txt
|
||||
#
|
||||
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
||||
type = word; # one of grapheme | word | line | sentence
|
||||
locale = en_US_POSIX;
|
||||
|
||||
E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
|
||||
E_Modifier = [\U0001F3FB-\U0001F3FF];
|
||||
ZWJ = [\u200D];
|
||||
GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
|
||||
|
||||
CR = [\p{Word_Break = CR}];
|
||||
LF = [\p{Word_Break = LF}];
|
||||
Newline = [\p{Word_Break = Newline}];
|
||||
Extend = [[[\p{Word_Break = Extend}][:Block=Tags:]]-ZWJ];
|
||||
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
Format = [[\p{Word_Break = Format}]-[:Block=Tags:]];
|
||||
Katakana = [\p{Word_Break = Katakana}];
|
||||
Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
ALetter = [\p{Word_Break = ALetter}];
|
||||
Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
|
||||
MidLetter = [\p{Word_Break = MidLetter} - [\:]];
|
||||
MidNum = [\p{Word_Break = MidNum} [.]];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
|
||||
# Define dictionary, with the effect being that those characters don't appear in test data.
|
||||
|
||||
Han = [:Han:];
|
||||
Hiragana = [:Hiragana:];
|
||||
|
||||
Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
HangulSyllable = [\uac00-\ud7a3];
|
||||
ComplexContext = [:LineBreak = Complex_Context:];
|
||||
KanaKanji = [Han Hiragana Katakana];
|
||||
dictionaryCJK = [KanaKanji HangulSyllable];
|
||||
dictionary = [ComplexContext dictionaryCJK];
|
||||
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
# Tricky. Redefine a set.
|
||||
# For tailorings, if it modifies itself, do at end of sets ????
|
||||
# Tweak redefine to mean replace existing definition at its original location.
|
||||
# Insert defs without redefine just after last pre-existing def of that name.
|
||||
# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
|
||||
|
||||
ALetter = [ALetter - dictionary];
|
||||
|
||||
AHLetter = [ALetter Hebrew_Letter];
|
||||
MidNumLetQ = [MidNumLet Single_Quote];
|
||||
ExtFmt = [Extend Format ZWJ];
|
||||
|
||||
WB3: CR LF;
|
||||
WB3a: (Newline | CR | LF) ÷;
|
||||
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
|
||||
# (but needed with UAX treat-as scheme.)
|
||||
WB3c: ZWJ GAZ;
|
||||
|
||||
WB5: AHLetter ExtFmt* AHLetter;
|
||||
|
||||
# includes both WB6 and WB7
|
||||
WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
|
||||
|
||||
WB7a: Hebrew_Letter ExtFmt* Single_Quote;
|
||||
WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
|
||||
|
||||
WB8: Numeric ExtFmt* Numeric;
|
||||
WB9: AHLetter ExtFmt* Numeric;
|
||||
WB10: Numeric ExtFmt* AHLetter;
|
||||
|
||||
WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
|
||||
WB13: Katakana ExtFmt* Katakana;
|
||||
|
||||
WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
|
||||
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
|
||||
|
||||
# WB rule 13c, pairs of Regional Indicators stay unbroken.
|
||||
# Interacts with WB3c.
|
||||
WB13c.1: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ GAZ;
|
||||
WB13c.2: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
|
||||
|
||||
WB13d: (E_Base | GAZ) ExtFmt* E_Modifier;
|
||||
|
||||
# Rule WB 14 Any ÷ Any
|
||||
# Interacts with WB3c, do not break between ZWJ and GAZ.
|
||||
WB14.1: . ExtFmt* ZWJ GAZ;
|
||||
WB14.2: . ExtFmt* ÷;
|
||||
|
14
icu4c/source/test/testdata/rbbitst.txt
vendored
14
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2001-2015 International Business Machines
|
||||
# Copyright (c) 2001-2016 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# RBBI Test Data
|
||||
|
@ -513,6 +513,18 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
|||
<data>• •\uF8FF\u2028<100>\uF8FF•</data>
|
||||
<data>• \u200B\u2028<100>\u200B•</data>
|
||||
|
||||
# Regional Indicator sequences. They group in pairs. The reverse rules are tricky.
|
||||
# Sequences are long enough that the non-exhaustive monkey test won't reliably pick up problems.
|
||||
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
|
||||
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
|
||||
|
||||
|
||||
# User Guide example
|
||||
|
||||
<data>•Parlez-•vous •français ?•</data>
|
||||
|
|
|
@ -271,7 +271,7 @@
|
|||
<ClCompile Include="toolutil.cpp">
|
||||
<DisableLanguageExtensions>false</DisableLanguageExtensions>
|
||||
</ClCompile>
|
||||
<ClCompile Include="ucbuf.c" />
|
||||
<ClCompile Include="ucbuf.cpp" />
|
||||
<ClCompile Include="ucm.c" />
|
||||
<ClCompile Include="ucmstate.c" />
|
||||
<ClCompile Include="unewdata.c" />
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1998-2014, International Business Machines
|
||||
* Copyright (C) 1998-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
*
|
||||
* File ucbuf.c
|
||||
* File ucbuf.cpp
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
|
@ -415,7 +415,7 @@ ucbuf_getcx32(UCHARBUF* buf,UErrorCode* error) {
|
|||
/* check if u_unescapeAt unescaped and converted
|
||||
* to c32 or not
|
||||
*/
|
||||
if(c32==0xFFFFFFFF){
|
||||
if(c32==(UChar32)0xFFFFFFFF){
|
||||
if(buf->showWarning) {
|
||||
char context[CONTEXT_LEN+1];
|
||||
int32_t len = CONTEXT_LEN;
|
|
@ -1,12 +1,12 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1998-2015, International Business Machines
|
||||
* Copyright (C) 1998-2016, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
*
|
||||
* File ucbuf.c
|
||||
* File ucbuf.h
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
|
@ -17,6 +17,7 @@
|
|||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "filestrm.h"
|
||||
|
||||
|
@ -45,11 +46,11 @@ struct ULine {
|
|||
/**
|
||||
* Opens the UCHARBUF with the given file stream and code page for conversion
|
||||
* @param fileName Name of the file to open.
|
||||
* @param codepage The encoding of the file stream to convert to Unicode.
|
||||
* @param codepage The encoding of the file stream to convert to Unicode.
|
||||
* If *codepage is NULL on input the API will try to autodetect
|
||||
* popular Unicode encodings
|
||||
* @param showWarning Flag to print out warnings to STDOUT
|
||||
* @param buffered If TRUE performs a buffered read of the input file. If FALSE reads
|
||||
* @param buffered If TRUE performs a buffered read of the input file. If FALSE reads
|
||||
* the whole file into memory and converts it.
|
||||
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
|
||||
* indicates a failure on entry, the function will immediately return.
|
||||
|
@ -82,7 +83,7 @@ U_CAPI int32_t U_EXPORT2
|
|||
ucbuf_getc32(UCHARBUF* buf,UErrorCode* err);
|
||||
|
||||
/**
|
||||
* Gets a UTF-16 code unit at the current position from the converted buffer after
|
||||
* Gets a UTF-16 code unit at the current position from the converted buffer after
|
||||
* unescaping and increments the current position. If the escape sequence is for UTF-32
|
||||
* code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned
|
||||
* @param buf Pointer to UCHARBUF structure
|
||||
|
@ -95,7 +96,7 @@ ucbuf_getcx32(UCHARBUF* buf,UErrorCode* err);
|
|||
|
||||
/**
|
||||
* Gets a pointer to the current position in the internal buffer and length of the line.
|
||||
* It imperative to make a copy of the returned buffere before performing operations on it.
|
||||
* It is imperative to make a copy of the returned buffer before performing operations on it.
|
||||
* @param buf Pointer to UCHARBUF structure
|
||||
* @param len Output param to receive the len of the buffer returned till end of the line
|
||||
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
|
||||
|
@ -141,6 +142,14 @@ ucbuf_close(UCHARBUF* buf);
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* \class LocalUCHARBUFPointer
|
||||
* "Smart pointer" class, closes a UCHARBUF via ucbuf_close().
|
||||
* For most methods see the LocalPointerBase base class.
|
||||
*
|
||||
* @see LocalPointerBase
|
||||
* @see LocalPointer
|
||||
*/
|
||||
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -155,7 +164,7 @@ ucbuf_ungetc(int32_t ungetChar,UCHARBUF* buf);
|
|||
|
||||
|
||||
/**
|
||||
* Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
|
||||
* Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
|
||||
* Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
|
||||
* the converter to correct state for converting the rest of the stream. So the UConverter parameter
|
||||
* is necessary.
|
||||
|
@ -175,7 +184,7 @@ ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv,
|
|||
int32_t* signatureLength, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
|
||||
* Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
|
||||
* Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
|
||||
* the converter to correct state for converting the rest of the stream. So the UConverter parameter
|
||||
* is necessary.
|
||||
|
|
Loading…
Add table
Reference in a new issue