ICU-22707 Patch tailored new monkeys, manually for the last hunk on line_(loose|normal)_cj

This commit is contained in:
Robin Leroy 2024-07-15 18:27:47 +02:00 committed by Markus Scherer
parent 3a004d400f
commit 7d14e8db45
5 changed files with 130 additions and 25 deletions

View file

@ -6,7 +6,7 @@
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -80,6 +80,12 @@ CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
BAminuseaFWH = [BA - eaFWH ];
PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];
@ -116,15 +122,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
# Rules LB14 - LB17.
# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
# Moved up, before LB7, because they can match a longer sequence, a prefix of which would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14
# while the prefix of it, "OP CM SP" matches LB7.1
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
@ -161,7 +179,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
# LB20a.2 needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
@ -183,15 +203,16 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;
# Note: Rule 21a must come before 21. Otherwise 21.1 would match HL BA, and the
# continuing match after the BA from 21a would not be picked up.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

View file

@ -6,7 +6,7 @@
# file: line_loose.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -81,6 +81,12 @@ ZWJ = [:LineBreak = ZWJ:];
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
BAminuseaFWH = [BA - eaFWH ];
PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];
@ -117,15 +123,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
# Rules LB14 - LB17.
# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
# Moved up, before LB7, because they can match a longer sequence, a prefix of which would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14
# while the prefix of it, "OP CM SP" matches LB7.1
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
@ -162,7 +180,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
# LB20a.2 needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
@ -184,15 +204,16 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;
# Note: Rule 21a must come before 21. Otherwise 21.1 would match HL BA, and the
# continuing match after the BA from 21a would not be picked up.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

View file

@ -6,7 +6,7 @@
# file: line_loose_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -99,6 +99,12 @@ ZWJ = [:LineBreak = ZWJ:];
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
BAminuseaFWH = [BA - eaFWH ];
PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];
@ -136,15 +142,27 @@ LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | I
# Rules LB14 - LB17.
# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
# Moved up, before LB7, because they can match a longer sequence, a prefix of which would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14
# while the prefix of it, "OP CM SP" matches LB7.1
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
@ -181,7 +199,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
# LB20a.2 needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA BAX HY] CM* GL;
@ -204,18 +224,19 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;
# Note: Rule 21a must come before 21. Otherwise 21.1 would match HL BA, and the
# continuing match after the BA from 21a would not be picked up.
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21a: HL CM* (HY | BAminuseaFWH | BAX) CM* [^CM CB HL];
LB21.1: [^ID] CM* [BA BAX HY NS];
LB21.2: ID CM* [BA HY NS];

View file

@ -6,7 +6,7 @@
# file: line_normal.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -82,6 +82,12 @@ ZWJ = [:LineBreak = ZWJ:];
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
BAminuseaFWH = [BA - eaFWH ];
PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];
@ -118,15 +124,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
# Rules LB14 - LB17.
# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
# Moved up, before LB7, because they can match a longer sequence, a prefix of which would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14
# while the prefix of it, "OP CM SP" matches LB7.1
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
@ -163,7 +181,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
# LB20a.2 needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
@ -185,15 +205,16 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;
# Note: Rule 21a must come before 21. Otherwise 21.1 would match HL BA, and the
# continuing match after the BA from 21a would not be picked up.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

View file

@ -6,7 +6,7 @@
# file: line_normal_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0.
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -84,6 +84,12 @@ ZWJ = [:LineBreak = ZWJ:];
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusCMOPGL = [ eaFWH - [CMS OP GL] ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
BAminuseaFWH = [BA - eaFWH ];
PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];
@ -120,15 +126,27 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
# Rules LB14 - LB17.
# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
LB19a.1: eaFWHminusCMOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusCMOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWHminusCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWHminusCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
# Moved up, before LB7, because they can match a longer sequence, a prefix of which would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14
# while the prefix of it, "OP CM SP" matches LB7.1
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
@ -167,7 +185,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
# LB20a.2 needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
@ -189,15 +209,16 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;
# Note: Rule 21a must come before 21. Otherwise 21.1 would match HL BA, and the
# continuing match after the BA from 21a would not be picked up.
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];
# DO allow breaks here before $NSXcm, so don't include it
LB21.1: . CM* [BA HY NS];