From 4cfe96c5081e66e5e7062f8eecff007afe23f5cb Mon Sep 17 00:00:00 2001 From: Peter Edberg Date: Sun, 20 Feb 2022 22:32:18 -0800 Subject: [PATCH] ICU-21592 Update cj normal/loose linebreak per CSS --- icu4c/source/data/brkitr/rules/line_loose_cj.txt | 11 +++++++---- .../data/brkitr/rules/line_loose_phrase_cj.txt | 11 +++++++---- icu4c/source/data/brkitr/rules/line_normal_cj.txt | 11 +++++------ .../data/brkitr/rules/line_normal_phrase_cj.txt | 11 +++++------ .../test/testdata/break_rules/line_loose_cj.txt | 9 ++++++--- .../test/testdata/break_rules/line_normal_cj.txt | 14 +++++--------- icu4c/source/test/testdata/rbbitst.txt | 6 ++++++ icu4j/main/shared/data/icudata.jar | 4 ++-- icu4j/main/shared/data/icutzdata.jar | 4 ++-- icu4j/main/shared/data/testdata.jar | 4 ++-- .../dev/test/rbbi/break_rules/line_loose_cj.txt | 9 ++++++--- .../dev/test/rbbi/break_rules/line_normal_cj.txt | 14 +++++--------- .../core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 8 +++++++- 13 files changed, 65 insertions(+), 51 deletions(-) diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 880d558d65b..e921a94c290 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -17,7 +17,8 @@ # line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. # It sets characters of class CJ to behave like ID. # In addition, it allows breaks: -# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) +# * between ID and hyphens 2010 & 2013 (both BA) +# * before 301C, 30A0 (both NS) # * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS) # * between characters of LineBreak class IN such as 2026 # * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B, @@ -238,7 +239,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # See issue ICU-20303 -$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN]; +$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN]; $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; @@ -294,8 +295,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # LB 21 x (BA | HY | NS) # BB x # -# DO allow breaks here before $BAX and $NSX, so don't include them -$LB20NonBreaks $CM* ($BA | $HY | $NS); +# DO allow breaks here before $NSX, so don't include it. +# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them. +[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS); +$ID $CM* ($BA | $HY | $NS); ^$CM+ ($BA | $HY | $NS); diff --git a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt index 3eab1f73bb1..43d116a98c9 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt @@ -15,7 +15,8 @@ # line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. # It sets characters of class CJ to behave like ID. # In addition, it allows breaks: -# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) +# * between ID and hyphens 2010 & 2013 (both BA) +# * before 301C, 30A0 (both NS) # * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS) # * between characters of LineBreak class IN such as 2026 # * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B, @@ -251,7 +252,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # See issue ICU-20303 -$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN]; +$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN]; $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; @@ -307,8 +308,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # LB 21 x (BA | HY | NS) # BB x # -# DO allow breaks here before $BAX and $NSX, so don't include them -$LB20NonBreaks $CM* ($BA | $HY | $NS); +# DO allow breaks here before $NSX, so don't include it. +# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them. +[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS); +$ID $CM* ($BA | $HY | $NS); ^$CM+ ($BA | $HY | $NS); diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index be5ce1d3302..7ed8b35081a 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -17,7 +17,7 @@ # line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese. # It sets characters of class CJ to behave like ID. # In addition, it allows breaks: -# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) +# * before 301C, 30A0 (both NS) # It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja. # @@ -29,8 +29,7 @@ $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; -$BAX = [\u2010 \u2013]; -$BA = [[:LineBreak = Break_After:] - $BAX]; +$BA = [:LineBreak = Break_After:]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; @@ -184,7 +183,7 @@ $GL $CM* .; # LB 12a Do not break before NBSP and related characters ... # [^SP BA HY] x GL # -[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL; +[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL; ^$CM+ $GL; @@ -282,7 +281,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # LB 21 x (BA | HY | NS) # BB x # -# DO allow breaks here before $BAX and $NSX, so don't include them +# DO allow breaks here before $NSX, so don't include it $LB20NonBreaks $CM* ($BA | $HY | $NS); @@ -294,7 +293,7 @@ $BB $CM* $LB20NonBreaks; # LB 21a Don't break after Hebrew + Hyphen # HL (HY | BA) x # -$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?; +$HL $CM* ($HY | $BA) $CM* [^$CB]?; # LB 21b (forward) Don't break between SY and HL # (break between HL and SY already disallowed by LB 13 above) diff --git a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt index 55a12ff2bd7..1aeafdf8028 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt @@ -15,7 +15,7 @@ # line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese. # It sets characters of class CJ to behave like ID. # In addition, it allows breaks: -# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) +# * before 301C, 30A0 (both NS) # It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja. # # The content is the same as line_normal_cj.txt except the following @@ -31,8 +31,7 @@ $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; -$BAX = [\u2010 \u2013]; -$BA = [[:LineBreak = Break_After:] - $BAX]; +$BA = [:LineBreak = Break_After:]; $HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; @@ -197,7 +196,7 @@ $GL $CM* .; # LB 12a Do not break before NBSP and related characters ... # [^SP BA HY] x GL # -[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL; +[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL; ^$CM+ $GL; @@ -295,7 +294,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # LB 21 x (BA | HY | NS) # BB x # -# DO allow breaks here before $BAX and $NSX, so don't include them +# DO allow breaks here before $NSX, so don't include it $LB20NonBreaks $CM* ($BA | $HY | $NS); @@ -307,7 +306,7 @@ $BB $CM* $LB20NonBreaks; # LB 21a Don't break after Hebrew + Hyphen # HL (HY | BA) x # -$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?; +$HL $CM* ($HY | $BA) $CM* [^$CB]?; # LB 21b (forward) Don't break between SY and HL # (break between HL and SY already disallowed by LB 13 above) diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt index d0693d567cd..7d1a02570b5 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -20,7 +20,8 @@ # line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. # It sets characters of class CJ to behave like ID. # In addition, it allows breaks: -# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) +# * between ID and hyphens 2010 & 2013 (both BA) +# * before 301C, 30A0 (both NS) # * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS) # * between characters of LineBreak class IN such as 2026 # * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B, @@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL; LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?; -LB21.1: . CM* [BA HY NS]; -LB21.2: BB CM* [^CM CB]; +LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS]; +LB21.2: ID CM* [BA HY NS]; +LB21.3: CM+ [BA HY NS]; +LB21.4: BB CM* [^CM CB]; LB21b: SY CM* HL; diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt index a2704821441..a4e1428c2b2 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt @@ -20,7 +20,7 @@ # line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese. # It sets characters of class CJ to behave like ID. # In addition, it allows breaks: -# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) +# * before 301C, 30A0 (both NS) # It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja. type = line; @@ -28,8 +28,7 @@ locale = ja@lb=normal; AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; -BAX = [\u2010 \u2013]; -BA = [[:LineBreak = Break_After:] - BAX]; +BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; @@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM]; LB12: GL CM* [^CM]; -LB12a: [^SP BA BAX HY] CM* GL; +LB12a: [^SP BA HY] CM* GL; # LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces. LB13.1: [^SP] CM* [CL CP EX SY]; @@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. -# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so, -# should "HL BAX" not break when followed by a CB? Thats what the current -# rules do, which is why "[^CM CB]?" includes the ?. -LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?; +LB21a: HL CM* (HY | BA) CM* [^CM CB]?; -# DO allow breaks here before $BAXcm and $NSXcm, so don't include them +# DO allow breaks here before $NSXcm, so don't include it LB21.1: . CM* [BA HY NS]; LB21.2: BB CM* [^CM CB]; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 8a889749a9e..efe8a326379 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1646,11 +1646,17 @@ Bangkok)• # •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01• •\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020• +# •no brk before 2010 • +•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020• + # •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01• •\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•u30A2•\uFF01\u0020• +# •no brk before 2010 except ok after ID • +•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020• + # •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01• diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 2e55bab128d..1183ece6b51 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44e45ef8b72f3a5aaa1dd9e2a7db91d3a4612dc40198d63aa39e099d7a3e4755 -size 13873068 +oid sha256:8c831eb36e00ffcf96d0fecd1076ed8adb1c95ed1733196eb1bde6fb3d41dcc1 +size 13873113 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 426473f65ed..45fc6cb0801 100644 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3465755959812f1cccb9cff48bfa34fd0f7e56388c886bc344dfc2b4c5168f01 -size 96440 +oid sha256:41b28caf401ac7baa8a7cea7a903a7c4b5b5d07c6f90dbfd96341fe6969a8eeb +size 96439 diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index ca53c266f6f..cd0adffe8f2 100644 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4aa5bcc6d593b17ca69ab237e7347c69142d9bcab3fdca66956dad8a0e17c4bb -size 826074 +oid sha256:6f2c9cabe519fa9d30169b730104f2d4d8a18b2c92ec1f257e60b2830bd6b0b8 +size 826073 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt index d0693d567cd..7d1a02570b5 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt @@ -20,7 +20,8 @@ # line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. # It sets characters of class CJ to behave like ID. # In addition, it allows breaks: -# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) +# * between ID and hyphens 2010 & 2013 (both BA) +# * before 301C, 30A0 (both NS) # * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS) # * between characters of LineBreak class IN such as 2026 # * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B, @@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL; LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?; -LB21.1: . CM* [BA HY NS]; -LB21.2: BB CM* [^CM CB]; +LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS]; +LB21.2: ID CM* [BA HY NS]; +LB21.3: CM+ [BA HY NS]; +LB21.4: BB CM* [^CM CB]; LB21b: SY CM* HL; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt index a2704821441..a4e1428c2b2 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt @@ -20,7 +20,7 @@ # line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese. # It sets characters of class CJ to behave like ID. # In addition, it allows breaks: -# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) +# * before 301C, 30A0 (both NS) # It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja. type = line; @@ -28,8 +28,7 @@ locale = ja@lb=normal; AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; -BAX = [\u2010 \u2013]; -BA = [[:LineBreak = Break_After:] - BAX]; +BA = [:LineBreak = Break_After:]; HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; @@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM]; LB12: GL CM* [^CM]; -LB12a: [^SP BA BAX HY] CM* GL; +LB12a: [^SP BA HY] CM* GL; # LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces. LB13.1: [^SP] CM* [CL CP EX SY]; @@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. -# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so, -# should "HL BAX" not break when followed by a CB? Thats what the current -# rules do, which is why "[^CM CB]?" includes the ?. -LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?; +LB21a: HL CM* (HY | BA) CM* [^CM CB]?; -# DO allow breaks here before $BAXcm and $NSXcm, so don't include them +# DO allow breaks here before $NSXcm, so don't include it LB21.1: . CM* [BA HY NS]; LB21.2: BB CM* [^CM CB]; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index e226e341865..efe8a326379 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1646,11 +1646,17 @@ Bangkok)• # •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01• •\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020• +# •no brk before 2010 • +•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020• + # •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01• •\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•u30A2•\uFF01\u0020• +# •no brk before 2010 except ok after ID • +•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020• + # •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01• @@ -1888,7 +1894,7 @@ Bangkok)• #[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。• •\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002• -#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た +#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た• •\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f• #る文字「そうだ、京都」-> る•文字•「そうだ、•京都」• •\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•