mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-22707 Patch tailored new monkeys, manually for the last hunk on line_(loose|normal)_cj
This commit is contained in:
parent
3a004d400f
commit
7d14e8db45
5 changed files with 130 additions and 25 deletions
|
@@ -6,7 +6,7 @@
|
|||
# file: line.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@@ -80,6 +80,12 @@ CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
|||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
|
||||
eaFWHminusCM = [ eaFWH - CMS ];
|
||||
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
|
||||
BAminuseaFWH = [BA - eaFWH ];
|
||||
|
||||
PiQU = [\p{Pi}&QU];
|
||||
PfQU = [\p{Pf}&QU];
|
||||
|
||||
|
@@ -116,15 +122,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
|
|||
# Rules LB14 - LB17.
|
||||
|
||||
# Moved before LB14, because it matches a supersequence.
|
||||
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
|
||||
|
||||
# Moved before LB14. These are really the cases where LB19a does not apply, but
|
||||
# the old LB19 would. This is to avoid many instances of chaining over two code
|
||||
# points.
|
||||
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
|
||||
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
|
||||
|
||||
# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
|
||||
# For example, the sequence "OP CM SP AL" matches LB14
|
||||
# while the prefix of it, "OP CM SP" matches LB7.1
|
||||
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
|
||||
LB14: OP CM* SP* .;
|
||||
|
||||
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.2: ^ (PiQU CM* SP*)+ .;
|
||||
# LB15b/LB15a chaining.
|
||||
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
|
||||
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
|
||||
|
||||
|
@@ -161,7 +179,9 @@ LB11.1: [^SP] CM* WJ;
|
|||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
|
@@ -183,15 +203,16 @@ LB19.1: QU CM* [^CM];
|
|||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
# LB 20a Do not break after a word-initial hyphen.
|
||||
LB20a.1: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
|
|
@@ -6,7 +6,7 @@
|
|||
# file: line_loose.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@@ -81,6 +81,12 @@ ZWJ = [:LineBreak = ZWJ:];
|
|||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
|
||||
eaFWHminusCM = [ eaFWH - CMS ];
|
||||
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
|
||||
BAminuseaFWH = [BA - eaFWH ];
|
||||
|
||||
PiQU = [\p{Pi}&QU];
|
||||
PfQU = [\p{Pf}&QU];
|
||||
|
||||
|
@@ -117,15 +123,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
|
|||
# Rules LB14 - LB17.
|
||||
|
||||
# Moved before LB14, because it matches a supersequence.
|
||||
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
|
||||
|
||||
# Moved before LB14. These are really the cases where LB19a does not apply, but
|
||||
# the old LB19 would. This is to avoid many instances of chaining over two code
|
||||
# points.
|
||||
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
|
||||
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
|
||||
|
||||
# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
|
||||
# For example, the sequence "OP CM SP AL" matches LB14
|
||||
# while the prefix of it, "OP CM SP" matches LB7.1
|
||||
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
|
||||
LB14: OP CM* SP* .;
|
||||
|
||||
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.2: ^ (PiQU CM* SP*)+ .;
|
||||
# LB15b/LB15a chaining.
|
||||
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
|
||||
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
|
||||
|
||||
|
@@ -162,7 +180,9 @@ LB11.1: [^SP] CM* WJ;
|
|||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
|
@@ -184,15 +204,16 @@ LB19.1: QU CM* [^CM];
|
|||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
# LB 20a Do not break after a word-initial hyphen.
|
||||
LB20a.1: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
|
|
@@ -6,7 +6,7 @@
|
|||
# file: line_loose_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@@ -99,6 +99,12 @@ ZWJ = [:LineBreak = ZWJ:];
|
|||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
|
||||
eaFWHminusCM = [ eaFWH - CMS ];
|
||||
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
|
||||
BAminuseaFWH = [BA - eaFWH ];
|
||||
|
||||
PiQU = [\p{Pi}&QU];
|
||||
PfQU = [\p{Pf}&QU];
|
||||
|
||||
|
@@ -136,15 +142,27 @@ LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | I
|
|||
# Rules LB14 - LB17.
|
||||
|
||||
# Moved before LB14, because it matches a supersequence.
|
||||
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
|
||||
|
||||
# Moved before LB14. These are really the cases where LB19a does not apply, but
|
||||
# the old LB19 would. This is to avoid many instances of chaining over two code
|
||||
# points.
|
||||
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
|
||||
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
|
||||
|
||||
# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
|
||||
# For example, the sequence "OP CM SP AL" matches LB14
|
||||
# while the prefix of it, "OP CM SP" matches LB7.1
|
||||
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
|
||||
LB14: OP CM* SP* .;
|
||||
|
||||
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.2: ^ (PiQU CM* SP*)+ .;
|
||||
# LB15b/LB15a chaining.
|
||||
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
|
||||
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
|
||||
|
||||
|
@@ -181,7 +199,9 @@ LB11.1: [^SP] CM* WJ;
|
|||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA BAX HY] CM* GL;
|
||||
|
||||
|
@@ -204,18 +224,19 @@ LB19.1: QU CM* [^CM];
|
|||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
# LB 20a Do not break after a word-initial hyphen.
|
||||
LB20a.1: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
|
||||
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
|
||||
LB21a: HL CM* (HY | BAminuseaFWH | BAX) CM* [^CM CB HL];
|
||||
|
||||
LB21.1: [^ID] CM* [BA BAX HY NS];
|
||||
LB21.2: ID CM* [BA HY NS];
|
||||
|
|
|
@@ -6,7 +6,7 @@
|
|||
# file: line_normal.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@@ -82,6 +82,12 @@ ZWJ = [:LineBreak = ZWJ:];
|
|||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
|
||||
eaFWHminusCM = [ eaFWH - CMS ];
|
||||
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
|
||||
BAminuseaFWH = [BA - eaFWH ];
|
||||
|
||||
PiQU = [\p{Pi}&QU];
|
||||
PfQU = [\p{Pf}&QU];
|
||||
|
||||
|
@@ -118,15 +124,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
|
|||
# Rules LB14 - LB17.
|
||||
|
||||
# Moved before LB14, because it matches a supersequence.
|
||||
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
|
||||
|
||||
# Moved before LB14. These are really the cases where LB19a does not apply, but
|
||||
# the old LB19 would. This is to avoid many instances of chaining over two code
|
||||
# points.
|
||||
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
|
||||
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
|
||||
|
||||
# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
|
||||
# For example, the sequence "OP CM SP AL" matches LB14
|
||||
# while the prefix of it, "OP CM SP" matches LB7.1
|
||||
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
|
||||
LB14: OP CM* SP* .;
|
||||
|
||||
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.2: ^ (PiQU CM* SP*)+ .;
|
||||
# LB15b/LB15a chaining.
|
||||
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
|
||||
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
|
||||
|
||||
|
@@ -163,7 +181,9 @@ LB11.1: [^SP] CM* WJ;
|
|||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
|
@@ -185,15 +205,16 @@ LB19.1: QU CM* [^CM];
|
|||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
# LB 20a Do not break after a word-initial hyphen.
|
||||
LB20a.1: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
|
|
@@ -6,7 +6,7 @@
|
|||
# file: line_normal_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
|
||||
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
@@ -84,6 +84,12 @@ ZWJ = [:LineBreak = ZWJ:];
|
|||
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
|
||||
eaFWHminusCM = [ eaFWH - CMS ];
|
||||
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
|
||||
BAminuseaFWH = [BA - eaFWH ];
|
||||
|
||||
PiQU = [\p{Pi}&QU];
|
||||
PfQU = [\p{Pf}&QU];
|
||||
|
||||
|
@@ -120,15 +126,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
|
|||
# Rules LB14 - LB17.
|
||||
|
||||
# Moved before LB14, because it matches a supersequence.
|
||||
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
|
||||
|
||||
# Moved before LB14. These are really the cases where LB19a does not apply, but
|
||||
# the old LB19 would. This is to avoid many instances of chaining over two code
|
||||
# points.
|
||||
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
|
||||
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
|
||||
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
|
||||
|
||||
# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
|
||||
# For example, the sequence "OP CM SP AL" matches LB14
|
||||
# while the prefix of it, "OP CM SP" matches LB7.1
|
||||
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
|
||||
LB14: OP CM* SP* .;
|
||||
|
||||
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15a.2: ^ (PiQU CM* SP*)+ .;
|
||||
# LB15b/LB15a chaining.
|
||||
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
|
||||
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
|
||||
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
|
||||
|
||||
|
@@ -167,7 +185,9 @@ LB11.1: [^SP] CM* WJ;
|
|||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
|
@@ -189,15 +209,16 @@ LB19.1: QU CM* [^CM];
|
|||
LB20.1: . CM* ZWJ CB;
|
||||
LB20.2: . CM* ÷ CB;
|
||||
|
||||
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
|
||||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
# LB 20a Do not break after a word-initial hyphen.
|
||||
LB20a.1: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;
|
||||
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];
|
||||
|
||||
# DO allow breaks here before $NSXcm, so don't include it
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
|
|
Loading…
Add table
Reference in a new issue