From c882c94d838578b5efe40e68019b6335fc7bd6cf Mon Sep 17 00:00:00 2001 From: allenwtsu Date: Fri, 28 Jan 2022 11:28:17 +0800 Subject: [PATCH] ICU-21699 Revise rule file 1. \u30fc doesn't belong to Hira, Kana nor Han. Add it into CJK dictionary 2. Include fullwidth char into ALPlus --- .../data/brkitr/rules/line_loose_phrase_cj.txt | 11 ++++++----- .../data/brkitr/rules/line_normal_phrase_cj.txt | 11 ++++++----- icu4c/source/data/brkitr/rules/line_phrase_cj.txt | 11 ++++++----- icu4c/source/test/testdata/rbbitst.txt | 10 ++++++++++ icu4j/main/shared/data/icudata.jar | 4 ++-- icu4j/main/shared/data/icutzdata.jar | 4 ++-- icu4j/main/shared/data/testdata.jar | 4 ++-- .../core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 12 +++++++++++- 8 files changed, 45 insertions(+), 22 deletions(-) diff --git a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt index a10b16897b7..3eab1f73bb1 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt @@ -93,9 +93,8 @@ $ZWJ = [:LineBreak = ZWJ:]; # without a formal name. 
Because ICU rules require multiple uses of the expressions, # give them a single definition with a name -$EAFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; -$OP30 = [$OP - $EAFWH]; -$CP30 = [$CP - $EAFWH]; +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; $ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}]; @@ -115,7 +114,7 @@ $Katakana = [:Katakana:]; $Hiragana = [:Hiragana:]; $HangulSyllable = [\uac00-\ud7a3]; $ComplexContext = [:LineBreak = Complex_Context:]; -$KanaKanji = [$Han $Hiragana $Katakana]; +$KanaKanji = [$Han $Hiragana $Katakana \u30fc]; $dictionaryCJK = [$KanaKanji $HangulSyllable]; $dictionary = [$ComplexContext $dictionaryCJK]; @@ -127,7 +126,9 @@ $dictionary = [$ComplexContext $dictionaryCJK]; # XX (Unknown, unassigned) # as $AL (Alphabetic) # -$ALPlus = [$AL $AI $SG $XX $EAFWH [$dictionary-[[:Mn:][:Mc:]]]]; +# Let fullwidth-ASCII digits and letters be part of words. +$FW_alphanum = [\uff10-\uff19\uff21-\uff3a\uff41-\uff5a]; +$ALPlus = [$AL $AI $SG $XX $FW_alphanum [$dictionary-[[:Mn:][:Mc:]]]]; ## ------------------------------------------------- diff --git a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt index eb51cdff54c..55a12ff2bd7 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt @@ -81,9 +81,8 @@ $ZWJ = [:LineBreak = ZWJ:]; # without a formal name. 
Because ICU rules require multiple uses of the expressions, # give them a single definition with a name -$EAFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; -$OP30 = [$OP - $EAFWH]; -$CP30 = [$CP - $EAFWH]; +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; $ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}]; @@ -103,7 +102,7 @@ $Katakana = [:Katakana:]; $Hiragana = [:Hiragana:]; $HangulSyllable = [\uac00-\ud7a3]; $ComplexContext = [:LineBreak = Complex_Context:]; -$KanaKanji = [$Han $Hiragana $Katakana]; +$KanaKanji = [$Han $Hiragana $Katakana \u30fc]; $dictionaryCJK = [$KanaKanji $HangulSyllable]; $dictionary = [$ComplexContext $dictionaryCJK]; @@ -115,7 +114,9 @@ $dictionary = [$ComplexContext $dictionaryCJK]; # XX (Unknown, unassigned) # as $AL (Alphabetic) # -$ALPlus = [$AL $AI $SG $XX $EAFWH [$dictionary-[[:Mn:][:Mc:]]]]; +# Let fullwidth-ASCII digits and letters be part of words. +$FW_alphanum = [\uff10-\uff19\uff21-\uff3a\uff41-\uff5a]; +$ALPlus = [$AL $AI $SG $XX $FW_alphanum [$dictionary-[[:Mn:][:Mc:]]]]; ## ------------------------------------------------- diff --git a/icu4c/source/data/brkitr/rules/line_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_phrase_cj.txt index b9c713f13a1..290b9b8c83a 100644 --- a/icu4c/source/data/brkitr/rules/line_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_phrase_cj.txt @@ -75,9 +75,8 @@ $ZWJ = [:LineBreak = ZWJ:]; # without a formal name. 
Because ICU rules require multiple uses of the expressions, # give them a single definition with a name -$EAFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; -$OP30 = [$OP - $EAFWH]; -$CP30 = [$CP - $EAFWH]; +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; $ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}]; @@ -97,7 +96,7 @@ $Katakana = [:Katakana:]; $Hiragana = [:Hiragana:]; $HangulSyllable = [\uac00-\ud7a3]; $ComplexContext = [:LineBreak = Complex_Context:]; -$KanaKanji = [$Han $Hiragana $Katakana]; +$KanaKanji = [$Han $Hiragana $Katakana \u30fc]; $dictionaryCJK = [$KanaKanji $HangulSyllable]; $dictionary = [$ComplexContext $dictionaryCJK]; @@ -109,7 +108,9 @@ $dictionary = [$ComplexContext $dictionaryCJK]; # XX (Unknown, unassigned) # as $AL (Alphabetic) # -$ALPlus = [$AL $AI $SG $XX $EAFWH [$dictionary-[[:Mn:][:Mc:]]]]; +# Let fullwidth-ASCII digits and letters be part of words. +$FW_alphanum = [\uff10-\uff19\uff21-\uff3a\uff41-\uff5a]; +$ALPlus = [$AL $AI $SG $XX $FW_alphanum [$dictionary-[[:Mn:][:Mc:]]]]; ## ------------------------------------------------- diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 702bb479038..1b74e31a810 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1901,6 +1901,16 @@ Bangkok)• #Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19 #𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あ•かよろし) •\uD82C\uDC48\u308B\uD82C\uDC38•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\uD82C\uDC19\u3088\u308D\u3057•\uFF08\u3042•\u304B\u3088\u308D\u3057\uFF09• +#最初に目に入るのは、「許諾なき写真禁止」のサインである。 -> 最初に▁目に▁入るのは、▁「許諾なき▁写真▁禁止」▁の▁サインで▁ある。 +•\u6700\u521D\u306B•\u76EE\u306B•\u5165\u308B\u306E\u306F\u3001•\u300C\u8A31\u8AFE\u306A\u304D•\u5199\u771F•\u7981\u6B62\u300D•\u306E•\u30B5\u30A4\u30F3\u3067•\u3042\u308B\u3002• +# docomoのサイト情報によると、78000パケット以上▁使うならパケ放題がいいとか -> 
docomoの▁サイト▁情報▁によると、▁78000パケット▁以上▁使う▁なら▁パケ▁放題が▁いい▁とか +•\uFF44\uFF4F\uFF43\uFF4F\uFF4D\uFF4F\u306E•\u30B5\u30A4\u30C8•\u60C5\u5831•\u306B\u3088\u308B\u3068\u3001•\uFF17\uFF18\uFF10\uFF10\uFF10\u30D1\u30B1\u30C3\u30C8•\u4EE5\u4E0A•\u4F7F\u3046•\u306A\u3089•\u30D1\u30B1•\u653E\u984C\u304C•\u3044\u3044•\u3068\u304B• +#日本の携帯はCDMAの形だといわれています -> 日本の▁携帯はCDMAの▁形だと▁いわれ▁ています +•\u65E5\u672C\u306E•\u643A\u5E2F\u306F\uFF23\uFF24\uFF2D\uFF21\u306E•\u5F62\u3060\u3068•\u3044\u308F\u308C•\u3066\u3044\u307E\u3059• +#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です +•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059• +#プリペイドカード携帯布教 -> プリペイドカード▁携帯▁布教 +•\u30D7\u30EA\u30DA\u30A4\u30C9\u30AB\u30FC\u30C9•\u643A\u5E2F•\u5E03\u6559• #################################################################################### # diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 1466f080d91..0dc31673b4c 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a53ab1162a9a25cf8f1e3abef3d4aa24acb11cb7677652be8d76e6843cbe5b35 -size 13871000 +oid sha256:658d41e63f73089111e306f0d90b2b3aa4757146d4eda963f8e181c0acf2b9ab +size 13866832 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 78ae065238d..7ce295810ed 100644 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57ada8a76a3d3d728a4ffd58fbcbe6aba7de2cf3b36d08ee95b2a9a222403880 -size 96439 +oid sha256:79e4e94b0fc31ed37701b94512b985c6bf80429bb3420adb645abc4f26c913d7 +size 96440 diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index 5d0255244ba..a504f688321 100644 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 
@@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccde7f7d4e6bf0a92d2c3d6999afabbb063a154b7d66b3fd79d6ab0707e1fad7 -size 826063 +oid sha256:c20f10955116ac255c504e8743a9c4b2eab9c0bac7dc2250f1fbe775100f3b8a +size 826064 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 2a238a80f9c..e8b07361ff8 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1901,7 +1901,17 @@ Bangkok)• #Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19 #𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あ•かよろし) •\uD82C\uDC48\u308B\uD82C\uDC38•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\uD82C\uDC19\u3088\u308D\u3057•\uFF08\u3042•\u304B\u3088\u308D\u3057\uFF09• - +•\uD82C\uDC48\u308B\uD82C\uDC38•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\uD82C\uDC19\u3088\u308D\u3057•\uFF08\u3042•\u304B\u3088\u308D\u3057\uFF09• +#最初に目に入るのは、「許諾なき写真禁止」のサインである。 -> 最初に▁目に▁入るのは、▁「許諾なき▁写真▁禁止」▁の▁サインで▁ある。 +•\u6700\u521D\u306B•\u76EE\u306B•\u5165\u308B\u306E\u306F\u3001•\u300C\u8A31\u8AFE\u306A\u304D•\u5199\u771F•\u7981\u6B62\u300D•\u306E•\u30B5\u30A4\u30F3\u3067•\u3042\u308B\u3002• +# docomoのサイト情報によると、78000パケット以上▁使うならパケ放題がいいとか -> docomoの▁サイト▁情報▁によると、▁78000パケット▁以上▁使う▁なら▁パケ▁放題が▁いい▁とか +•\uFF44\uFF4F\uFF43\uFF4F\uFF4D\uFF4F\u306E•\u30B5\u30A4\u30C8•\u60C5\u5831•\u306B\u3088\u308B\u3068\u3001•\uFF17\uFF18\uFF10\uFF10\uFF10\u30D1\u30B1\u30C3\u30C8•\u4EE5\u4E0A•\u4F7F\u3046•\u306A\u3089•\u30D1\u30B1•\u653E\u984C\u304C•\u3044\u3044•\u3068\u304B• +#日本の携帯はCDMAの形だといわれています -> 日本の▁携帯はCDMAの▁形だと▁いわれ▁ています +•\u65E5\u672C\u306E•\u643A\u5E2F\u306F\uFF23\uFF24\uFF2D\uFF21\u306E•\u5F62\u3060\u3068•\u3044\u308F\u308C•\u3066\u3044\u307E\u3059• +#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です 
+•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059• +#プリペイドカード携帯布教 -> プリペイドカード▁携帯▁布教 +•\u30D7\u30EA\u30DA\u30A4\u30C9\u30AB\u30FC\u30C9•\u643A\u5E2F•\u5E03\u6559• #################################################################################### #