ICU-21592 Update cj normal/loose linebreak per CSS

This commit is contained in:
Peter Edberg 2022-02-20 22:32:18 -08:00 committed by Peter Edberg
parent 335c403618
commit 4cfe96c508
13 changed files with 65 additions and 51 deletions

View file

@ -17,7 +17,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@ -238,7 +239,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
@ -294,8 +295,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
$LB20NonBreaks $CM* ($BA | $HY | $NS);
# DO allow breaks here before $NSX, so don't include it.
# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them.
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
$ID $CM* ($BA | $HY | $NS);
^$CM+ ($BA | $HY | $NS);

View file

@ -15,7 +15,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@ -251,7 +252,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
@ -307,8 +308,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
$LB20NonBreaks $CM* ($BA | $HY | $NS);
# DO allow breaks here before $NSX, so don't include it.
# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them.
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
$ID $CM* ($BA | $HY | $NS);
^$CM+ ($BA | $HY | $NS);

View file

@ -17,7 +17,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
#
@ -29,8 +29,7 @@
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
@ -184,7 +183,7 @@ $GL $CM* .;
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;
@ -282,7 +281,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
# DO allow breaks here before $NSX, so don't include it
$LB20NonBreaks $CM* ($BA | $HY | $NS);
@ -294,7 +293,7 @@ $BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)

View file

@ -15,7 +15,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
#
# The content is the same as line_normal_cj.txt except the following
@ -31,8 +31,7 @@
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
@ -197,7 +196,7 @@ $GL $CM* .;
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;
@ -295,7 +294,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
# DO allow breaks here before $NSX, so don't include it
$LB20NonBreaks $CM* ($BA | $HY | $NS);
@ -307,7 +306,7 @@ $BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)

View file

@ -20,7 +20,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL;
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
LB21.2: ID CM* [BA HY NS];
LB21.3: CM+ [BA HY NS];
LB21.4: BB CM* [^CM CB];
LB21b: SY CM* HL;

View file

@ -20,7 +20,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
type = line;
@ -28,8 +28,7 @@ locale = ja@lb=normal;
AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BA = [:LineBreak = Break_After:];
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA BAX HY] CM* GL;
LB12a: [^SP BA HY] CM* GL;
# LB 13 Do not break before ] or ! or /, even after spaces.
LB13.1: [^SP] CM* [CL CP EX SY];
@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
# should "HL BAX" not break when followed by a CB? Thats what the current
# rules do, which is why "[^CM CB]?" includes the ?.
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
# DO allow breaks here before $NSXcm, so don't include it
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

View file

@ -1646,11 +1646,17 @@ Bangkok)•</data>
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
# •no brk before 2010 •
<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
<locale ja@lb=loose>
<line>
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>
# •no brk before 2010 except ok after ID •
<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
<locale en@lb=strict>
<line>
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:44e45ef8b72f3a5aaa1dd9e2a7db91d3a4612dc40198d63aa39e099d7a3e4755
size 13873068
oid sha256:8c831eb36e00ffcf96d0fecd1076ed8adb1c95ed1733196eb1bde6fb3d41dcc1
size 13873113

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3465755959812f1cccb9cff48bfa34fd0f7e56388c886bc344dfc2b4c5168f01
size 96440
oid sha256:41b28caf401ac7baa8a7cea7a903a7c4b5b5d07c6f90dbfd96341fe6969a8eeb
size 96439

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4aa5bcc6d593b17ca69ab237e7347c69142d9bcab3fdca66956dad8a0e17c4bb
size 826074
oid sha256:6f2c9cabe519fa9d30169b730104f2d4d8a18b2c92ec1f257e60b2830bd6b0b8
size 826073

View file

@ -20,7 +20,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL;
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
LB21.2: ID CM* [BA HY NS];
LB21.3: CM+ [BA HY NS];
LB21.4: BB CM* [^CM CB];
LB21b: SY CM* HL;

View file

@ -20,7 +20,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
type = line;
@ -28,8 +28,7 @@ locale = ja@lb=normal;
AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BA = [:LineBreak = Break_After:];
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA BAX HY] CM* GL;
LB12a: [^SP BA HY] CM* GL;
# LB 13 Do not break before ] or ! or /, even after spaces.
LB13.1: [^SP] CM* [CL CP EX SY];
@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
# should "HL BAX" not break when followed by a CB? Thats what the current
# rules do, which is why "[^CM CB]?" includes the ?.
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
# DO allow breaks here before $NSXcm, so don't include it
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

View file

@ -1646,11 +1646,17 @@ Bangkok)•</data>
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
# •no brk before 2010 •
<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
<locale ja@lb=loose>
<line>
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>
# •no brk before 2010 except ok after ID •
<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
<locale en@lb=strict>
<line>
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
@ -1888,7 +1894,7 @@ Bangkok)•</data>
<line>
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>