mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-21592 Update cj normal/loose linebreak per CSS
This commit is contained in:
parent
335c403618
commit
4cfe96c508
13 changed files with 65 additions and 51 deletions
|
@ -17,7 +17,8 @@
|
|||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * between ID and hyphens 2010 & 2013 (both BA)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN such as 2026
|
||||
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
|
||||
|
@ -238,7 +239,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
|||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
|
@ -294,8 +295,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $BAX and $NSX, so don't include them
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
# DO allow breaks here before $NSX, so don't include it.
|
||||
# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them.
|
||||
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
|
||||
$ID $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
|
|
@ -15,7 +15,8 @@
|
|||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * between ID and hyphens 2010 & 2013 (both BA)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN such as 2026
|
||||
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
|
||||
|
@ -251,7 +252,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
|||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
|
@ -307,8 +308,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $BAX and $NSX, so don't include them
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
# DO allow breaks here before $NSX, so don't include it.
|
||||
# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them.
|
||||
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
|
||||
$ID $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
|
||||
#
|
||||
|
@ -29,8 +29,7 @@
|
|||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BAX = [\u2010 \u2013];
|
||||
$BA = [[:LineBreak = Break_After:] - $BAX];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
|
@ -184,7 +183,7 @@ $GL $CM* .;
|
|||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
@ -282,7 +281,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $BAX and $NSX, so don't include them
|
||||
# DO allow breaks here before $NSX, so don't include it
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
|
@ -294,7 +293,7 @@ $BB $CM* $LB20NonBreaks;
|
|||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
#
|
||||
# The content is the same as line_normal_cj.txt except the following
|
||||
|
@ -31,8 +31,7 @@
|
|||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BAX = [\u2010 \u2013];
|
||||
$BA = [[:LineBreak = Break_After:] - $BAX];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
|
@ -197,7 +196,7 @@ $GL $CM* .;
|
|||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
@ -295,7 +294,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $BAX and $NSX, so don't include them
|
||||
# DO allow breaks here before $NSX, so don't include it
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
|
@ -307,7 +306,7 @@ $BB $CM* $LB20NonBreaks;
|
|||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
|
|
@ -20,7 +20,8 @@
|
|||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * between ID and hyphens 2010 & 2013 (both BA)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN such as 2026
|
||||
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
|
||||
|
@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL;
|
|||
|
||||
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
|
||||
LB21.2: ID CM* [BA HY NS];
|
||||
LB21.3: CM+ [BA HY NS];
|
||||
LB21.4: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
|
||||
type = line;
|
||||
|
@ -28,8 +28,7 @@ locale = ja@lb=normal;
|
|||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
|
@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM];
|
|||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA BAX HY] CM* GL;
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
|
||||
LB13.1: [^SP] CM* [CL CP EX SY];
|
||||
|
@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL;
|
|||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
|
||||
# should "HL BAX" not break when followed by a CB? That's what the current
|
||||
# rules do, which is why "[^CM CB]?" includes the ?.
|
||||
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;
|
||||
|
||||
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
|
||||
# DO allow breaks here before $NSXcm, so don't include it
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
|
|
6
icu4c/source/test/testdata/rbbitst.txt
vendored
6
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -1646,11 +1646,17 @@ Bangkok)•</data>
|
|||
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
|
||||
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
|
||||
|
||||
# •no brk before 2010 •
|
||||
<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
|
||||
|
||||
<locale ja@lb=loose>
|
||||
<line>
|
||||
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
|
||||
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>
|
||||
|
||||
# •no brk before 2010 except ok after ID •
|
||||
<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
|
||||
|
||||
<locale en@lb=strict>
|
||||
<line>
|
||||
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:44e45ef8b72f3a5aaa1dd9e2a7db91d3a4612dc40198d63aa39e099d7a3e4755
|
||||
size 13873068
|
||||
oid sha256:8c831eb36e00ffcf96d0fecd1076ed8adb1c95ed1733196eb1bde6fb3d41dcc1
|
||||
size 13873113
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:3465755959812f1cccb9cff48bfa34fd0f7e56388c886bc344dfc2b4c5168f01
|
||||
size 96440
|
||||
oid sha256:41b28caf401ac7baa8a7cea7a903a7c4b5b5d07c6f90dbfd96341fe6969a8eeb
|
||||
size 96439
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4aa5bcc6d593b17ca69ab237e7347c69142d9bcab3fdca66956dad8a0e17c4bb
|
||||
size 826074
|
||||
oid sha256:6f2c9cabe519fa9d30169b730104f2d4d8a18b2c92ec1f257e60b2830bd6b0b8
|
||||
size 826073
|
||||
|
|
|
@ -20,7 +20,8 @@
|
|||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * between ID and hyphens 2010 & 2013 (both BA)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN such as 2026
|
||||
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
|
||||
|
@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL;
|
|||
|
||||
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
|
||||
LB21.2: ID CM* [BA HY NS];
|
||||
LB21.3: CM+ [BA HY NS];
|
||||
LB21.4: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
|
||||
type = line;
|
||||
|
@ -28,8 +28,7 @@ locale = ja@lb=normal;
|
|||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
|
@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM];
|
|||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA BAX HY] CM* GL;
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
|
||||
LB13.1: [^SP] CM* [CL CP EX SY];
|
||||
|
@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL;
|
|||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
|
||||
# should "HL BAX" not break when followed by a CB? That's what the current
|
||||
# rules do, which is why "[^CM CB]?" includes the ?.
|
||||
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;
|
||||
|
||||
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
|
||||
# DO allow breaks here before $NSXcm, so don't include it
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
|
|
|
@ -1646,11 +1646,17 @@ Bangkok)•</data>
|
|||
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
|
||||
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
|
||||
|
||||
# •no brk before 2010 •
|
||||
<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
|
||||
|
||||
<locale ja@lb=loose>
|
||||
<line>
|
||||
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
|
||||
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>
|
||||
|
||||
# •no brk before 2010 except ok after ID •
|
||||
<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
|
||||
|
||||
<locale en@lb=strict>
|
||||
<line>
|
||||
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
|
||||
|
@ -1888,7 +1894,7 @@ Bangkok)•</data>
|
|||
<line>
|
||||
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
|
||||
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
|
||||
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
|
||||
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
|
||||
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
|
||||
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
|
||||
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
|
||||
|
|
Loading…
Add table
Reference in a new issue