mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-22707 Patch tailored rules (manually for hunks 1 and 6 on loose(_phrase)?_cj)
This commit is contained in:
parent
782d5cc339
commit
3a004d400f
8 changed files with 288 additions and 152 deletions
|
@ -75,12 +75,7 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
|
@ -123,7 +118,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
|
@ -252,6 +247,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
|||
|
||||
#
|
||||
# LB 15d Do not break before numeric separators (IS), even after spaces.
|
||||
# SP IS QU is handled below as part of LB 19.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
|
@ -274,13 +270,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
|||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
# LB 19 and LB 19a.
|
||||
# Instead of implementing both as keep-together rules as in UAX #14, we have an
|
||||
# East_Asian_Width and General_Category-insensitive keep-together rule
|
||||
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
|
||||
# on context. This avoids having to do manual chaining over multiple characters
|
||||
# with many other rules over multiple characters, as a keep-together LB19a would
|
||||
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
|
||||
$QU $CM* .;
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -288,11 +293,23 @@ $QU $CM* .;
|
|||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
|
||||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a:
|
||||
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a following LB15b:
|
||||
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
|
@ -305,10 +322,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
|
|||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
|
||||
# HL (HY | [BA - EastAsian]) x [^HL]
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
@ -368,9 +385,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
|||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
|
||||
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
|
|
|
@ -81,12 +81,7 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
|
@ -129,7 +124,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
|
@ -258,6 +253,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
|||
|
||||
#
|
||||
# LB 15d Do not break before numeric separators (IS), even after spaces.
|
||||
# SP IS QU is handled below as part of LB 19.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
|
@ -282,13 +278,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
|||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
# LB 19 and LB 19a.
|
||||
# Instead of implementing both as keep-together rules as in UAX #14, we have an
|
||||
# East_Asian_Width and General_Category-insensitive keep-together rule
|
||||
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
|
||||
# on context. This avoids having to do manual chaining over multiple characters
|
||||
# with many other rules over multiple characters, as a keep-together LB19a would
|
||||
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
|
||||
$QU $CM* .;
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -296,11 +301,23 @@ $QU $CM* .;
|
|||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
|
||||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a:
|
||||
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a following LB15b:
|
||||
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
|
@ -314,10 +331,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
|
|||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
|
||||
# HL (HY | [BA - EastAsian]) x [^HL]
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
@ -378,9 +395,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
|||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
|
||||
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
|
|
|
@ -93,12 +93,7 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
|
@ -141,7 +136,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
|
@ -270,6 +265,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
|||
|
||||
#
|
||||
# LB 15d Do not break before numeric separators (IS), even after spaces.
|
||||
# SP IS QU is handled below as part of LB 19.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
|
@ -294,13 +290,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
|||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
# LB 19 and LB 19a.
|
||||
# Instead of implementing both as keep-together rules as in UAX #14, we have an
|
||||
# East_Asian_Width and General_Category-insensitive keep-together rule
|
||||
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
|
||||
# on context. This avoids having to do manual chaining over multiple characters
|
||||
# with many other rules over multiple characters, as a keep-together LB19a would
|
||||
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
|
||||
$QU $CM* .;
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -308,11 +313,23 @@ $QU $CM* .;
|
|||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
|
||||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a:
|
||||
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a following LB15b:
|
||||
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
|
@ -328,10 +345,10 @@ $ID $CM* ($BA | $HY | $NS);
|
|||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
|
||||
# HL (HY | [BA - EastAsian] | BAX) x [^HL]
|
||||
#
|
||||
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | [ $BA - $EastAsian ] | $BAX ) $CM* [^$CB $HL]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
@ -396,9 +413,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
|||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
|
||||
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
|
|
|
@ -95,12 +95,7 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
|
@ -154,7 +149,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
|
@ -283,6 +278,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
|||
|
||||
#
|
||||
# LB 15d Do not break before numeric separators (IS), even after spaces.
|
||||
# SP IS QU is handled below as part of LB 19.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
|
@ -307,13 +303,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
|||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
# LB 19 and LB 19a.
|
||||
# Instead of implementing both as keep-together rules as in UAX #14, we have an
|
||||
# East_Asian_Width and General_Category-insensitive keep-together rule
|
||||
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
|
||||
# on context. This avoids having to do manual chaining over multiple characters
|
||||
# with many other rules over multiple characters, as a keep-together LB19a would
|
||||
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
|
||||
$QU $CM* .;
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -321,11 +326,23 @@ $QU $CM* .;
|
|||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
|
||||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a:
|
||||
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a following LB15b:
|
||||
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
|
@ -341,10 +358,10 @@ $ID $CM* ($BA | $HY | $NS);
|
|||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
|
||||
# HL (HY | [BA - EastAsian] | BAX) x [^HL]
|
||||
#
|
||||
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | [ $BA - $EastAsian ] | $BAX ) $CM* [^$CB $HL]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
@ -409,9 +426,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
|||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
|
||||
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
|
|
|
@ -76,12 +76,7 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
|
@ -124,7 +119,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
|
@ -253,6 +248,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
|||
|
||||
#
|
||||
# LB 15d Do not break before numeric separators (IS), even after spaces.
|
||||
# SP IS QU is handled below as part of LB 19.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
|
@ -275,13 +271,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
|||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
# LB 19 and LB 19a.
|
||||
# Instead of implementing both as keep-together rules as in UAX #14, we have an
|
||||
# East_Asian_Width and General_Category-insensitive keep-together rule
|
||||
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
|
||||
# on context. This avoids having to do manual chaining over multiple characters
|
||||
# with many other rules over multiple characters, as a keep-together LB19a would
|
||||
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
|
||||
$QU $CM* .;
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -289,11 +294,23 @@ $QU $CM* .;
|
|||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
|
||||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a:
|
||||
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a following LB15b:
|
||||
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
|
@ -306,10 +323,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
|
|||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
|
||||
# HL (HY | [BA - EastAsian]) x [^HL]
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
@ -369,9 +386,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
|||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
|
||||
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
|
|
|
@ -79,12 +79,7 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
|
@ -127,7 +122,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
|
@ -256,6 +251,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
|||
|
||||
#
|
||||
# LB 15d Do not break before numeric separators (IS), even after spaces.
|
||||
# SP IS QU is handled below as part of LB 19.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
|
@ -280,13 +276,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
|||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
# LB 19 and LB 19a.
|
||||
# Instead of implementing both as keep-together rules as in UAX #14, we have an
|
||||
# East_Asian_Width and General_Category-insensitive keep-together rule
|
||||
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
|
||||
# on context. This avoids having to do manual chaining over multiple characters
|
||||
# with many other rules over multiple characters, as a keep-together LB19a would
|
||||
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
|
||||
$QU $CM* .;
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -294,11 +299,23 @@ $QU $CM* .;
|
|||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
|
||||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a:
|
||||
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a following LB15b:
|
||||
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
|
@ -312,10 +329,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
|
|||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
|
||||
# HL (HY | [BA - EastAsian]) x [^HL]
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
@ -375,9 +392,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
|||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
|
||||
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
|
|
|
@ -81,12 +81,7 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
|
@ -140,7 +135,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
|
@ -269,6 +264,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
|||
|
||||
#
|
||||
# LB 15d Do not break before numeric separators (IS), even after spaces.
|
||||
# SP IS QU is handled below as part of LB 19.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
|
@ -293,13 +289,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
|||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
# LB 19 and LB 19a.
|
||||
# Instead of implementing both as keep-together rules as in UAX #14, we have an
|
||||
# East_Asian_Width and General_Category-insensitive keep-together rule
|
||||
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
|
||||
# on context. This avoids having to do manual chaining over multiple characters
|
||||
# with many other rules over multiple characters, as a keep-together LB19a would
|
||||
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
|
||||
$QU $CM* .;
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -307,11 +312,23 @@ $QU $CM* .;
|
|||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
|
||||
# and then to default UAX #14 behavior (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a:
|
||||
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a following LB15b:
|
||||
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
|
@ -325,10 +342,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
|
|||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
|
||||
# HL (HY | BA) x [^HL]
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
@ -388,9 +405,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
|||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
|
||||
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
|
|
|
@ -76,12 +76,7 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
|
@ -135,7 +130,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL [$OP - $EastAsian] $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
|
@ -264,6 +259,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
|||
|
||||
#
|
||||
# LB 15d Do not break before numeric separators (IS), even after spaces.
|
||||
# SP IS QU is handled below as part of LB 19.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
|
@ -286,13 +282,22 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
|||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
# LB 19 and LB 19a.
|
||||
# Instead of implementing both as keep-together rules as in UAX #14, we have an
|
||||
# East_Asian_Width and General_Category-insensitive keep-together rule
|
||||
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
|
||||
# on context. This avoids having to do manual chaining over multiple characters
|
||||
# with many other rules over multiple characters, as a keep-together LB19a would
|
||||
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
[$LB18NonBreaks & $EastAsian - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ $EastAsian - $CM];
|
||||
|
||||
$QU $CM* .;
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
[$LB18NonBreaks & $EastAsian] $CM* [\p{Pf} & $QU] $CM* $CMX / [ $EastAsian - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -300,11 +305,23 @@ $QU $CM* .;
|
|||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
|
||||
# and then to default UAX #14 behavior (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
$OP $CM* $SP+ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a:
|
||||
($OP $CM* $SP+ | [$OP $QU $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB15a following LB15b:
|
||||
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP ($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
|
@ -317,10 +334,10 @@ $LB20NonBreaks $CM* ($BA | $HY | $NS);
|
|||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
|
||||
# HL (HY | BA) x [^HL]
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | [ $BA - $EastAsian ] ) $CM* [^$CB $HL]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
@ -380,9 +397,9 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
|||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* [$OP - $EastAsian];
|
||||
^$CM+ [$OP - $EastAsian]; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
[$CP - $EastAsian] $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
|
|
Loading…
Add table
Reference in a new issue