ICU-22707 LB19 rules that seem to actually work with ea=W BA and CM. Update default new monkeys.

This commit is contained in:
Robin Leroy 2024-07-01 16:56:52 +02:00 committed by Markus Scherer
parent 6d3b988fd0
commit 600011eb7d
3 changed files with 82 additions and 12 deletions

View file

@ -284,28 +284,58 @@ $LB18Breaks = [$LB8Breaks $SP];
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# OP and GL are subtracted because of LB14 and LB12 (there is no break after them).
# BA is subtracted because of LB21a:
# We must not poke a hole into HL U+3000 × [\p{Pi} & QU] [\p{ea=F}\p{ea=W}\p{ea=H}],
# where U+3000 is lb=BA and ea=W.
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$QU $CM* .;
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$OP $CM* $SP+ [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$SP? $IS $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
# LB 20
# <break> $CB
# $CB <break>
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];
[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB20NonBreaks - [$HL $IS $RI]] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB20NonBreaks - [$HL $IS $RI]] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$CB $CM* $ZWJ [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
($RI $CM*)? $RI $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),

View file

@ -6,11 +6,7 @@
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 for Unicode 14.0,
# with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
# there is a boundary preceding the hyphen. See rule 20.9
# Rules derived from Unicode Standard Annex #14 for Unicode 16.0.
#
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
# It sets characters of class CJ to behave like NS.
@ -80,6 +76,15 @@ ZWJ = [:LineBreak = ZWJ:];
# OP and CP restricted to characters that are not East_Asian_Width F, W, or H.
OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
# Characters whose East_Asian_Width is F, W, or H.
eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
# eaFWH without the OP and GL classes; left-hand context of LB19a.1 and LB19a.2.
eaFWHminusOPGL = [ eaFWH - [OP GL] ];
# The members of CMS that are also in eaFWH.
eaFWHandCM = [ CMS & eaFWH ];
# eaFWH without the members of CMS.
eaFWHminusCM = [ eaFWH - CMS ];
# eaFWH without the classes listed below; this is the context that follows the
# break in the LB19a rules that break after PfQU, and in LB21a.1.
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];
# An annoying special case: U+3000 (IDEOGRAPHIC SPACE), which is in
# \p{lb=BA} & [\p{ea=F}\p{ea=W}\p{ea=H}].
ideographicSpace = [\u3000];
# Quotation marks (QU) whose General_Category is Pi (initial punctuation).
PiQU = [\p{Pi}&QU];
# Quotation marks (QU) whose General_Category is Pf (final punctuation).
PfQU = [\p{Pf}&QU];
@ -116,15 +121,31 @@ LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? (IS CM*)? NU (CM*(NU | SY | IS))* (
# Rules LB14 - LB17.
# Moved before LB14, because it matches a supersequence.
LB20a.3: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
# Moved before LB14. These are really the cases where LB19a does not apply, but
# the old LB19 would. This is to avoid many instances of chaining over two code
# points.
#
# How the eight variants are organised:
#   .1 - .4  put the break marker before a PiQU that is followed, after
#            optional CM, by an eaFWHminusCM character.
#   .5 - .8  put the break marker after a PfQU, before an
#            eaFWHBreakableAtLB19 character.
#   Even-numbered variants additionally allow "CM* CMS" immediately before the
#   break marker.
#   The variants written with a leading ^ begin with an eaFWHandCM character
#   instead of eaFWHminusOPGL / eaFWH.
#   NOTE(review): the exact meaning of ^ is defined by the monkey rule syntax,
#   not by this section - confirm there.
LB19a.1: eaFWHminusOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.3: ^eaFWHandCM ÷ PiQU CM* eaFWHminusCM;
LB19a.4: ^eaFWHandCM CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWH CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWH CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
LB19a.7: ^eaFWHandCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.8: ^eaFWHandCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14,
# while its prefix "OP CM SP" matches LB7.1.
LB20a.7: OP CM* SP+ (HY | HH) CM* AL;
LB14: OP CM* SP* .;
LB20a.4: ^ (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15a.2: ^ (PiQU CM* SP*)+ .;
# LB15b/LB15a chaining.
LB20a.5: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ SP (HY | HH) CM* AL;
LB15b.1: ([^SP] CM* | SP) PfQU CM* (PiQU CM* SP*)+ .;
LB15b.2: ([^SP] CM* | SP) PfQU CM* ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | $ );
@ -161,7 +182,9 @@ LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
@ -183,15 +206,18 @@ LB19.1: QU CM* [^CM];
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20a.6: CB CM* ZWJ (HY | HH) CM* AL;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
LB20.09: ^(HY | HH) CM* AL;
# LB 20a Do not break after a word-initial hyphen.
LB20a.1: ^(HY | HH) CM* AL;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
# Chains over two characters with the LB19a break rule.
LB21a.1: HL CM* ideographicSpace CM* PfQU (CM* CMS)? ÷ eaFWHBreakableAtLB19;
LB21a: HL CM* (HY | BA) CM* [^CM CB HL];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

View file

@ -2221,4 +2221,18 @@ Bangkok)•</data>
<data>•毕•士•悌•1901•年•—•1936•年•又•名•“杨•林”•朝•鲜•籍•红•军•将•领•</data>
<data>•Anmerkung: •„White“ •bzw. •‚白•人‘ •– •in •der •Amtlichen •Statistik•</data>
<data>•« Complex »« chaining » •</data>
<data>•« .618 »•</data> # Interaction with the ICU tailoring to break before such numbers.
<data>•« .618 »•</data> # Interaction with the ICU tailoring to break before such numbers.
# Non-breaking lb=SP (from LB14 and LB15a) followed by a lb=CM-as-AL that is
# ea=W, in a position that would match initial context for LB19a if it were not
# ea=W.
# See https://github.com/unicode-org/icu/pull/3028#issuecomment-2200259320.
<data>•︷ \U00016FF1•⸠ᅛᆅ•</data>
<data>•︷ « \U00016FF1•⸠ᅛᆅ•</data>
<data>•A »« \U00016FF1•⸠ᅛᆅ•</data>
<data>•︷ \U00016FF1\u302B•⸠ᅛᆅ•</data>
<data>•︷ « \U00016FF1\u302B•⸠ᅛᆅ•</data>
<data>•A »« \U00016FF1\u302B•⸠ᅛᆅ•</data>
<data>•❲ \u3035⸍•굼•</data>
<data>•❲ « \u3035⸍•굼•</data>
<data>•A »« \u3035⸍•굼•</data>