ICU-22285 omit the gb2312 & big5han collation tailorings by default

This commit is contained in:
Markus Scherer 2023-03-13 16:57:10 -07:00
parent 97510de5d4
commit 2d9fa3fa99
5 changed files with 56 additions and 13 deletions

View file

@ -78,6 +78,32 @@ To build ICU4J with custom data, you must first build ICU4C with custom data
and then generate the JAR file. For more information on building ICU4J, read the
[ICU4J Readme](../icu4j/).
### Default Configuration
By default (without a configuration file and without option flags),
the ICU data file includes all of the data in the ICU source tree.
Since ICU 73 (2023q2), there is an exception:
By default, the "big5han" and "gb2312han" collation tailorings are omitted.
These tailorings mimic the order of their respective charsets, are relatively large, and are rarely used.
(See [ICU-22285](https://unicode-org.atlassian.net/browse/ICU-22285).)
The default configuration is equivalent to a filter file like this:
{
"resourceFilters": [
{
"categories": [
"coll_tree"
],
"rules": [
"-/collations/big5han",
"-/collations/gb2312han"
]
}
]
}
### Locale Slicing
The simplest way to slice ICU data is by locale. The ICU Data Build Tool

View file

@ -159,6 +159,23 @@ class Config(object):
if "usePoolBundle" in self.filters_json_data:
self.use_pool_bundle = self.filters_json_data["usePoolBundle"]
# By default, exclude collation data that mimics the order of some large legacy charsets.
# With the "subtractive" strategy, we do this by inserting a resourceFilter
# at the front of the list, so that later rules from an explicit filter file
# may override this default behavior. (With the "additive" strategy this is unnecessary.)
if self.strategy == "subtractive":
filters = self.filters_json_data.setdefault("resourceFilters", [])
omit_charset_collations = {
"categories": [
"coll_tree"
],
"rules": [
"-/collations/big5han",
"-/collations/gb2312han"
]
}
filters.insert(0, omit_charset_collations)
def _parse_filter_file(self, f):
# Use the Hjson parser if it is available; otherwise, use vanilla JSON.
try:

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d73b2718b26be3283b897d2780e6c36b64efba9c0d20a04397ade3fa354d21a1
size 14443319
oid sha256:11da259e78948bcd4daf70a381eed008ac3d9a02aaefeeb904ea2b4b89f29dc1
size 14329200

View file

@ -49,19 +49,19 @@ public final class ICUResourceBundleCollationTest extends TestFmwk {
"f", "zh_MO", "zh@collation=stroke", /* alias of zh_Hant_MO */
"t", "zh_Hant_MO", "zh@collation=stroke",
"f", "zh_TW_STROKE", "zh@collation=stroke",
"f", "zh_TW_STROKE@collation=big5han", "zh@collation=big5han",
"f", "zh_TW_STROKE@collation=zhuyin", "zh@collation=zhuyin",
"f", "sv_CN@calendar=japanese", "sv",
"t", "sv@calendar=japanese", "sv",
"f", "zh_TW@collation=big5han", "zh@collation=big5han", /* alias of zh_Hant_TW */
"t", "zh_Hant_TW@collation=big5han", "zh@collation=big5han",
"f", "zh_TW@collation=gb2312han", "zh@collation=gb2312han", /* alias of zh_Hant_TW */
"t", "zh_Hant_TW@collation=gb2312han", "zh@collation=gb2312han",
"f", "zh_CN@collation=big5han", "zh@collation=big5han", /* alias of zh_Hans_CN */
"t", "zh_Hans_CN@collation=big5han", "zh@collation=big5han",
"f", "zh_CN@collation=gb2312han", "zh@collation=gb2312han", /* alias of zh_Hans_CN */
"t", "zh_Hans_CN@collation=gb2312han", "zh@collation=gb2312han",
"t", "zh@collation=big5han", "zh@collation=big5han",
"t", "zh@collation=gb2312han", "zh@collation=gb2312han",
"f", "zh_TW@collation=zhuyin", "zh@collation=zhuyin", /* alias of zh_Hant_TW */
"t", "zh_Hant_TW@collation=zhuyin", "zh@collation=zhuyin",
"f", "zh_TW@collation=unihan", "zh@collation=unihan", /* alias of zh_Hant_TW */
"t", "zh_Hant_TW@collation=unihan", "zh@collation=unihan",
"f", "zh_CN@collation=zhuyin", "zh@collation=zhuyin", /* alias of zh_Hans_CN */
"t", "zh_Hans_CN@collation=zhuyin", "zh@collation=zhuyin",
"f", "zh_CN@collation=unihan", "zh@collation=unihan", /* alias of zh_Hans_CN */
"t", "zh_Hans_CN@collation=unihan", "zh@collation=unihan",
"t", "zh@collation=zhuyin", "zh@collation=zhuyin",
"t", "zh@collation=unihan", "zh@collation=unihan",
"t", "hi@collation=standard", "hi",
"f", "hi_AU@collation=standard;currency=CHF;calendar=buddhist", "hi",
"f", "sv_SE@collation=pinyin", "sv", /* bug 4582 tests */