ICU-22707 Patch tailored rules (manually for hunks 1 and 6 on loose(_phrase)?_cj)

This commit is contained in:
Robin Leroy 2024-07-15 18:03:56 +02:00 committed by Markus Scherer
parent 782d5cc339
commit 3a004d400f
8 changed files with 288 additions and 152 deletions

View file

@ -75,12 +75,7 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
@ -123,7 +118,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -252,6 +247,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.
[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
@ -274,13 +270,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
# LB 19 and LB 19a.
# Instead of implementing both as keep-together rules as in UAX #14, we have an
# East_Asian_Width and General_Category-insensitive keep-together rule
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
# on context. This avoids having to do manual chaining over multiple characters
# with many other rules over multiple characters, as a keep-together LB19a would
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# QU x
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
$QU $CM* .;
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
# LB 20
# <break> $CB
@ -288,11 +293,23 @@ $QU $CM* .;
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# LB 21 x (BA | HY | NS)
# BB x
@ -305,10 +322,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
$BB $CM* [^$CB]; # $BB x
$BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | BA) x [^HL]
#
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
@ -368,9 +385,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.

View file

@ -81,12 +81,7 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
@ -129,7 +124,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -258,6 +253,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.
[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
@ -282,13 +278,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
# LB 19 and LB 19a.
# Instead of implementing both as keep-together rules as in UAX #14, we have an
# East_Asian_Width and General_Category-insensitive keep-together rule
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
# on context. This avoids having to do manual chaining over multiple characters
# with many other rules over multiple characters, as a keep-together LB19a would
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# QU x
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
$QU $CM* .;
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
# LB 20
# <break> $CB
@ -296,11 +301,23 @@ $QU $CM* .;
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# LB 21 x (BA | HY | NS)
# BB x
@ -314,10 +331,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
$BB $CM* [^$CB]; # $BB x
$BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | BA) x [^HL]
#
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
@ -378,9 +395,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.

View file

@ -93,12 +93,7 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
@ -141,7 +136,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
#
@ -270,6 +265,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.
[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
@ -294,13 +290,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
# LB 19 and LB 19a.
# Instead of implementing both as keep-together rules as in UAX #14, we have an
# East_Asian_Width and General_Category-insensitive keep-together rule
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
# on context. This avoids having to do manual chaining over multiple characters
# with many other rules over multiple characters, as a keep-together LB19a would
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# QU x
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
$QU $CM* .;
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
# LB 20
# <break> $CB
@ -308,11 +313,23 @@ $QU $CM* .;
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# LB 21 x (BA | HY | NS)
# BB x
@ -328,10 +345,10 @@ $ID $CM* ($BA | $HY | $NS);
$BB $CM* [^$CB]; # $BB x
$BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | BA) x [^HL]
#
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
$HL $CM* ($HY | [ $BA - $EastAsian ] | $BAX ) $CM* [^$CB $HL]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
@ -396,9 +413,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.

View file

@ -95,12 +95,7 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
@ -154,7 +149,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
#
@ -283,6 +278,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.
[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
@ -307,13 +303,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
# LB 19 and LB 19a.
# Instead of implementing both as keep-together rules as in UAX #14, we have an
# East_Asian_Width and General_Category-insensitive keep-together rule
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
# on context. This avoids having to do manual chaining over multiple characters
# with many other rules over multiple characters, as a keep-together LB19a would
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# QU x
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
$QU $CM* .;
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
# LB 20
# <break> $CB
@ -321,11 +326,23 @@ $QU $CM* .;
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# LB 21 x (BA | HY | NS)
# BB x
@ -341,10 +358,10 @@ $ID $CM* ($BA | $HY | $NS);
$BB $CM* [^$CB]; # $BB x
$BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | BA) x [^HL]
#
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
$HL $CM* ($HY | [ $BA - $EastAsian ] | $BAX ) $CM* [^$CB $HL]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
@ -409,9 +426,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.

View file

@ -76,12 +76,7 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
@ -124,7 +119,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -253,6 +248,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.
[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
@ -275,13 +271,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
# LB 19 and LB 19a.
# Instead of implementing both as keep-together rules as in UAX #14, we have an
# East_Asian_Width and General_Category-insensitive keep-together rule
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
# on context. This avoids having to do manual chaining over multiple characters
# with many other rules over multiple characters, as a keep-together LB19a would
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# QU x
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
$QU $CM* .;
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
# LB 20
# <break> $CB
@ -289,11 +294,23 @@ $QU $CM* .;
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# LB 21 x (BA | HY | NS)
# BB x
@ -306,10 +323,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
$BB $CM* [^$CB]; # $BB x
$BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | BA) x [^HL]
#
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
@ -369,9 +386,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.

View file

@ -79,12 +79,7 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
@ -127,7 +122,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -256,6 +251,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.
[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
@ -280,13 +276,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
# LB 19 and LB 19a.
# Instead of implementing both as keep-together rules as in UAX #14, we have an
# East_Asian_Width and General_Category-insensitive keep-together rule
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
# on context. This avoids having to do manual chaining over multiple characters
# with many other rules over multiple characters, as a keep-together LB19a would
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# QU x
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
$QU $CM* .;
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
# LB 20
# <break> $CB
@ -294,11 +299,23 @@ $QU $CM* .;
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# LB 21 x (BA | HY | NS)
# BB x
@ -312,10 +329,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
$BB $CM* [^$CB]; # $BB x
$BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | BA) x [^HL]
#
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
@ -375,9 +392,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.

View file

@ -81,12 +81,7 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
@ -140,7 +135,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -269,6 +264,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.
[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
@ -293,13 +289,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
# LB 19 and LB 19a.
# Instead of implementing both as keep-together rules as in UAX #14, we have an
# East_Asian_Width and General_Category-insensitive keep-together rule
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
# on context. This avoids having to do manual chaining over multiple characters
# with many other rules over multiple characters, as a keep-together LB19a would
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# QU x
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
$QU $CM* .;
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
# LB 20
# <break> $CB
@ -307,11 +312,23 @@ $QU $CM* .;
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# LB 21 x (BA | HY | NS)
# BB x
@ -325,10 +342,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
$BB $CM* [^$CB]; # $BB x
$BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | BA) x [^HL]
#
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
@ -388,9 +405,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.

View file

@ -76,12 +76,7 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
# without a formal name. Because ICU rules require multiple uses of the expressions,
# give them a single definition with a name
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
@ -135,7 +130,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
#
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
#
@ -264,6 +259,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
#
# LB 15d Do not break before numeric separators (IS), even after spaces.
# SP IS QU is handled below as part of LB 19.
[$LB8NonBreaks - $SP] $IS;
$SP $IS $CM* [$CanFollowIS {eof}];
@ -286,13 +282,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks = [$LB8Breaks $SP];
# LB 19
# x QU
# LB 19 and LB 19a.
# Instead of implementing both as keep-together rules as in UAX #14, we have an
# East_Asian_Width and General_Category-insensitive keep-together rule
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
# on context. This avoids having to do manual chaining over multiple characters
# with many other rules over multiple characters, as a keep-together LB19a would
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;
# QU x
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
$QU $CM* .;
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
# LB 20
# <break> $CB
@ -300,11 +305,23 @@ $QU $CM* .;
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behavior (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a:
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB15a following LB15b:
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
# LB 21 x (BA | HY | NS)
# BB x
@ -317,10 +334,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
$BB $CM* [^$CB]; # $BB x
$BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | BA) x [^HL]
#
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
@ -380,9 +397,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
$IS $CM* ($ALPlus | $HL);
# LB 30
($ALPlus | $HL | $NU) $CM* $OP30;
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CP30 $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.