mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-13770 RBBI LB8a rule update for ICU4J.
X-SVN-Rev: 41424
This commit is contained in:
parent
a91445266d
commit
c85229e1e4
8 changed files with 136 additions and 131 deletions
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4bd87b532fc7ad362740dde413999961c7f372cbbce5fb54160d201b783fec33
|
||||
size 12503004
|
||||
oid sha256:e9ffd3c1d1fa55ec8819eee15483f7f1b4c4520a62a9ae3d0b3f971b9d06e18c
|
||||
size 12500142
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:5c25f29e8e9f5b7244a63ddc48dd7d69f56612b310e1de8351c9ea80a84afc6f
|
||||
oid sha256:88f00fc2ffbd0fcae8531cffdcc5b405876d3d89036e5cb8e077b0e817b88d9f
|
||||
size 92867
|
||||
|
|
|
@ -646,8 +646,6 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
UnicodeSet fEB;
|
||||
UnicodeSet fEM;
|
||||
UnicodeSet fZWJ;
|
||||
UnicodeSet fExtendedPict;
|
||||
UnicodeSet fEmojiNRK;
|
||||
|
||||
StringBuffer fText;
|
||||
int fOrigPositions;
|
||||
|
@ -701,9 +699,6 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fEB = new UnicodeSet("[\\p{Line_break=EB}]");
|
||||
fEM = new UnicodeSet("[\\p{Line_break=EM}]");
|
||||
fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]");
|
||||
fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9©®™〰〽]]");
|
||||
fExtendedPict = new UnicodeSet("[:Extended_Pictographic:]");
|
||||
|
||||
|
||||
// Remove dictionary characters.
|
||||
// The monkey test reference implementation of line break does not replicate the dictionary behavior,
|
||||
|
@ -760,8 +755,6 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fSets.add(fEB);
|
||||
fSets.add(fEM);
|
||||
fSets.add(fZWJ);
|
||||
fSets.add(fExtendedPict);
|
||||
fSets.add(fEmojiNRK);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -897,13 +890,39 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
break;
|
||||
}
|
||||
|
||||
// LB 25 Numbers
|
||||
// Move this test up, before LB8a, because numbers can match a longer sequence that would
|
||||
// also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
|
||||
matchVals = LBNumberCheck(fText, prevPos, matchVals);
|
||||
if (matchVals[0] != -1) {
|
||||
// Matched a number. But could have been just a single digit, which would
|
||||
// not represent a "no break here" between prevChar and thisChar
|
||||
int numEndIdx = matchVals[1]; // idx of first char following num
|
||||
if (numEndIdx > pos) {
|
||||
// Number match includes at least the two chars being checked
|
||||
if (numEndIdx > nextPos) {
|
||||
// Number match includes additional chars. Update pos and nextPos
|
||||
// so that next loop iteration will continue at the end of the number,
|
||||
// checking for breaks between last char in number & whatever follows.
|
||||
nextPos = numEndIdx;
|
||||
pos = numEndIdx;
|
||||
do {
|
||||
pos = moveIndex32(fText, pos, -1);
|
||||
thisChar = UTF16.charAt(fText, pos);
|
||||
}
|
||||
while (fCM.contains(thisChar));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// LB 8a: ZWJ x (ID | Extended_Pictographic | Emoji)
|
||||
// The monkey test's way of ignoring combining characters doesn't work
|
||||
// for this rule. ZWJ is also a CM. Need to get the actual character
|
||||
// preceding "thisChar", not ignoring combining marks, possibly ZWJ.
|
||||
{
|
||||
int prevC = fText.codePointBefore(pos);
|
||||
if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) {
|
||||
if (fZWJ.contains(prevC)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -1088,31 +1107,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 25 Numbers
|
||||
matchVals = LBNumberCheck(fText, prevPos, matchVals);
|
||||
if (matchVals[0] != -1) {
|
||||
// Matched a number. But could have been just a single digit, which would
|
||||
// not represent a "no break here" between prevChar and thisChar
|
||||
int numEndIdx = matchVals[1]; // idx of first char following num
|
||||
if (numEndIdx > pos) {
|
||||
// Number match includes at least the two chars being checked
|
||||
if (numEndIdx > nextPos) {
|
||||
// Number match includes additional chars. Update pos and nextPos
|
||||
// so that next loop iteration will continue at the end of the number,
|
||||
// checking for breaks between last char in number & whatever follows.
|
||||
nextPos = numEndIdx;
|
||||
pos = numEndIdx;
|
||||
do {
|
||||
pos = moveIndex32(fText, pos, -1);
|
||||
thisChar = UTF16.charAt(fText, pos);
|
||||
}
|
||||
while (fCM.contains(thisChar));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// LB 25 Numbers match, moved up, before LB 8a.
|
||||
|
||||
// LB 26 Do not break Korean Syllables
|
||||
if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
|
||||
# file: line.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@ -24,7 +25,7 @@ B2 = [:LineBreak = Break_Both:];
|
|||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
|
@ -59,16 +60,13 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
Extended_Pict = [:ExtPict:];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CM_ ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
|
@ -97,8 +95,10 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
# the ZWJ behaves as just another generic CM.
|
||||
LB8a: ZWJ [^CM];
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -107,7 +107,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
|||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.1: [^SP] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
|
@ -133,12 +133,14 @@ LB19: . CM* QU;
|
|||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
# ZWJ acts independently to the right, no break after by LB8a.
|
||||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
|
@ -185,15 +187,15 @@ LB29: IS CM* (AL | HL);
|
|||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB31 keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.1: . CM* ZWJ [^CM];
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
#
|
||||
# file: line_loose.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@ -31,7 +32,7 @@ B2 = [:LineBreak = Break_Both:];
|
|||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
|
@ -67,16 +68,13 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
Extended_Pict = [:ExtPict:];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CM_ ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
|
@ -105,8 +103,10 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
# the ZWJ behaves as just another generic CM.
|
||||
LB8a: ZWJ [^CM];
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -115,7 +115,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
|||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.1: [^SP] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
|
@ -141,12 +141,14 @@ LB19: . CM* QU;
|
|||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
# ZWJ acts independently to the right, no break after by LB8a.
|
||||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
|
@ -193,15 +195,15 @@ LB29: IS CM* (AL | HL);
|
|||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB31 keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.1: . CM* ZWJ [^CM];
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
#
|
||||
# file: line_loose_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@ -45,7 +46,7 @@ B2 = [:LineBreak = Break_Both:];
|
|||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
|
@ -84,16 +85,13 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
Extended_Pict = [:ExtPict:];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CM_ ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
|
@ -122,8 +120,10 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
# the ZWJ behaves as just another generic CM.
|
||||
LB8a: ZWJ [^CM];
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -132,7 +132,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
|||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.1: [^SP] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
|
@ -158,12 +158,14 @@ LB19: . CM* QU;
|
|||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
# ZWJ acts independently to the right, no break after by LB8a.
|
||||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
|
@ -214,15 +216,15 @@ LB29: IS CM* (AL | HL);
|
|||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB31 keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.1: . CM* ZWJ [^CM];
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
#
|
||||
# file: line_normal.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@ -38,7 +39,7 @@ B2 = [:LineBreak = Break_Both:];
|
|||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
|
@ -73,16 +74,13 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
Extended_Pict = [:ExtPict:];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CM_ ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
|
@ -111,8 +109,10 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
# the ZWJ behaves as just another generic CM.
|
||||
LB8a: ZWJ [^CM];
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -121,7 +121,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
|||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.1: [^SP] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
|
@ -147,12 +147,14 @@ LB19: . CM* QU;
|
|||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
# ZWJ acts independently to the right, no break after by LB8a.
|
||||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
|
@ -199,15 +201,15 @@ LB29: IS CM* (AL | HL);
|
|||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB31 keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.1: . CM* ZWJ [^CM];
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
#
|
||||
# file: line_normal_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@ -39,7 +40,7 @@ B2 = [:LineBreak = Break_Both:];
|
|||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CM_ = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
|
@ -75,16 +76,13 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
Extended_Pict = [:ExtPict:];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
CM = [CM_ ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
|
@ -116,8 +114,10 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
# ZWJ x
|
||||
# Don't match a CM on the right - let other rules pick up CM sequences, where
|
||||
# the ZWJ behaves as just another generic CM.
|
||||
LB8a: ZWJ [^CM];
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -126,7 +126,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
|||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.1: [^SP] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
|
@ -152,12 +152,14 @@ LB19: . CM* QU;
|
|||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
# ZWJ acts independently to the right, no break after by LB8a.
|
||||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
|
@ -208,15 +210,15 @@ LB29: IS CM* (AL | HL);
|
|||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB31 keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
|
||||
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.1: . CM* ZWJ [^CM];
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
Loading…
Add table
Reference in a new issue