From 782d5cc33909c307022b0c763dcc18c2d4c602b5 Mon Sep 17 00:00:00 2001
From: Robin Leroy
Date: Sat, 6 Jul 2024 04:09:25 +0200
Subject: [PATCH] ICU-22707 UTC-179-A102 Consider using a macro throughout the rules for [\p{ea=F}\p{ea=W}\p{ea=H}].

---
Review notes (text in this section is not part of the commit message):
  * Defines $EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}] and substitutes it for
    every inline copy of that set: the four quotation-mark ($QU) rules that
    precede "# LB 20", and the LB 21a rule.
  * Removes the $OP30 / $CP30 helper sets and their explanatory comment;
    their uses in $AL_FOLLOW and in LB 30 are written out as
    [$OP - $EastAsian] and [$CP - $EastAsian].
  * NOTE(review): the line layout below was reconstructed from the hunk
    line counts. Placement of blank context lines and any column alignment
    inside lines should be confirmed against line.txt before applying.

 icu4c/source/data/brkitr/rules/line.txt | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt
index c780b80e1e8..9f3e44984ea 100644
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@@ -74,12 +74,7 @@ $XX = [:LineBreak = Unknown:];
 $ZW = [:LineBreak = ZWSpace:];
 $ZWJ = [:LineBreak = ZWJ:];
 
-# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
-# without a formal name. Because ICU rules require multiple uses of the expressions,
-# give them a single definition with a name
-
-$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
-$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
+$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
 
 $ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
 
@@ -122,7 +117,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
 # AL_FOLLOW set of chars that can unconditionally follow an AL
 # Needed in rules where stand-alone $CM s are treated as AL.
 #
-$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
+$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
 
 
 #
@@ -284,12 +279,12 @@ $LB18Breaks = [$LB8Breaks $SP];
 $LB18NonBreaks $CM* $QU;
 ^$CM+ $QU;
 
-[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
-[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
+[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
+[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
 $QU $CM* .;
 
-[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
-[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
+[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
+[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
 
 # LB 20
 # $CB
@@ -329,7 +324,7 @@ $BB $CM* $LB20NonBreaks;
 # LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
 # HL (HY | BA) x [^HL]
 #
-$HL $CM* ($HY | [ $BA - [\p{ea=F}\p{ea=W}\p{ea=H}] ] ) $CM* [^$CB $HL]?;
+$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
 
 # LB 21b (forward) Don't break between SY and HL
 # (break between HL and SY already disallowed by LB 13 above)
@@ -389,9 +384,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
 $IS $CM* ($ALPlus | $HL);
 
 # LB 30
-($ALPlus | $HL | $NU) $CM* $OP30;
-^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
-$CP30 $CM* ($ALPlus | $HL | $NU);
+($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
+^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
+[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
 
 # LB 30a Do not break between regional indicators. Break after pairs of them.
 # Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.