ICU-10923 Adding initial version of data file filtering.

- Reads a filter file (JSON or Hjson) from the path given in the ICU_DATA_FILTER_FILE environment variable.
- Adds DepTarget for dependency semantics and warns about missing deps.
- Fixes certain tests that crash with sliced locale data.
- Includes support for four filter types: file-stem, language, regex, and exclude (see the example filter file below).
Shane Carr 2018-12-06 00:33:10 -08:00 committed by Shane F. Carr
parent a4dd8b3702
commit a2a39e69d0
10 changed files with 405 additions and 26 deletions
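
Before the per-file diffs, a quick orientation: the filter file named by ICU_DATA_FILTER_FILE is plain JSON, or Hjson when the optional hjson package is installed. A minimal sketch follows; the top-level keys featureFilters and localeFilter and the four filterType values come from the new filtration.py below, while the category names and file stems shown are illustrative only.

```hjson
{
    // Hjson-style comments require the optional hjson package;
    // the category names and file stems here are illustrative.
    "featureFilters": {
        "confusables": {"filterType": "exclude"},
        "brkitr_rules": {
            "filterType": "file-stem",
            "blacklist": ["word_POSIX"]
        }
    },
    // Applied to every category ending in "_tree" (locale data).
    "localeFilter": {
        "filterType": "language",
        "whitelist": ["en", "de", "zh"]
    }
}
```

Each filter carries either a whitelist or a blacklist; filterType defaults to file-stem when omitted, and the exclude type drops the whole category.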

configure

@@ -9118,6 +9118,7 @@ else
--format gnumake \
--seqmode parallel \
--glob_dir "$srcdir/data" \
--filter_file "$ICU_DATA_FILTER_FILE" \
> data/rules.mk
echo "Spawning Python to generate test/testdata/rules.mk..."
PYTHONPATH="$srcdir/test/testdata:$srcdir/data" $PYTHON3 -m buildtool \

configure.ac

@@ -1394,6 +1394,7 @@ else
--format gnumake \
--seqmode parallel \
--glob_dir "$srcdir/data" \
--filter_file "$ICU_DATA_FILTER_FILE" \
> data/rules.mk
echo "Spawning Python to generate test/testdata/rules.mk..."
PYTHONPATH="$srcdir/test/testdata:$srcdir/data" $PYTHON3 -m buildtool \

data/BUILDRULES.py

@@ -54,16 +54,6 @@ def generate(config, glob, common_vars):
requests += generate_curr_supplemental(config, glob, common_vars)
requests += generate_translit(config, glob, common_vars)
# FIXME: Clean this up (duplicated logic)
brkitr_brk_files = []
input_files = [InFile(filename) for filename in glob("brkitr/rules/*.txt")]
output_files = [OutFile("brkitr/%s.brk" % v.filename[13:-4]) for v in input_files]
brkitr_brk_files += output_files
dict_files = []
input_files = [InFile(filename) for filename in glob("brkitr/dictionaries/*.txt")]
output_files = [OutFile("brkitr/%s.dict" % v.filename[20:-4]) for v in input_files]
dict_files += output_files
# Res Tree Files
# (input dirname, output dirname, resfiles.mk path, mk version var, mk source var, use pool file, dep files)
requests += generate_tree(config, glob, common_vars,
@@ -120,8 +110,6 @@ def generate(config, glob, common_vars):
True,
[])
# TODO: We should not need timezoneTypes.res to build collation resource bundles.
# TODO: Maybe keyTypeData.res should be baked into the common library.
requests += generate_tree(config, glob, common_vars,
"coll",
"coll",
@@ -129,7 +117,10 @@ def generate(config, glob, common_vars):
"COLLATION_CLDR_VERSION",
"COLLATION_SOURCE",
False,
[OutFile("coll/ucadata.icu"), OutFile("timezoneTypes.res"), OutFile("keyTypeData.res")])
# Depends on timezoneTypes.res and keyTypeData.res.
# TODO: We should not need this dependency to build collation.
# TODO: Bake keyTypeData.res into the common library?
[DepTarget("coll_ucadata"), DepTarget("misc")])
requests += generate_tree(config, glob, common_vars,
"brkitr",
@@ -138,7 +129,7 @@ def generate(config, glob, common_vars):
"BRK_RES_CLDR_VERSION",
"BRK_RES_SOURCE",
False,
brkitr_brk_files + dict_files)
[DepTarget("brkitr_brk"), DepTarget("brkitr_dictionaries")])
requests += generate_tree(config, glob, common_vars,
"rbnf",
@@ -189,7 +180,7 @@ def generate_confusables(config, glob, common_vars):
SingleExecutionRequest(
name = "confusables",
category = "confusables",
dep_files = [OutFile("cnvalias.icu")],
dep_files = [DepTarget("cnvalias")],
input_files = [txt1, txt2],
output_files = [cfu],
tool = IcuTool("gencfu"),
@@ -231,7 +222,7 @@ def generate_brkitr_brk(config, glob, common_vars):
RepeatedExecutionRequest(
name = "brkitr_brk",
category = "brkitr_rules",
dep_files = [OutFile("cnvalias.icu")],
dep_files = [DepTarget("cnvalias")],
input_files = input_files,
output_files = output_files,
tool = IcuTool("genbrk"),
@@ -459,13 +450,14 @@ def generate_tree(
# Generate Pool Bundle
if use_pool_bundle:
input_pool_files = [OutFile("%spool.res" % out_prefix)]
pool_target_name = "%s_pool_write" % sub_dir
use_pool_bundle_option = "--usePoolBundle {OUT_DIR}/{OUT_PREFIX}".format(
OUT_PREFIX = out_prefix,
**common_vars
)
requests += [
SingleExecutionRequest(
name = "%s_pool_write" % sub_dir,
name = pool_target_name,
category = category,
dep_files = dep_files,
input_files = input_files,
@@ -481,8 +473,8 @@
}
),
]
dep_files = dep_files + [DepTarget(pool_target_name)]
else:
input_pool_files = []
use_pool_bundle_option = ""
# Generate Res File Tree
@@ -490,7 +482,7 @@
RepeatedOrSingleExecutionRequest(
name = "%s_res" % sub_dir,
category = category,
dep_files = dep_files + input_pool_files,
dep_files = dep_files,
input_files = input_files,
output_files = output_files,
tool = IcuTool("genrb"),
@@ -545,7 +537,7 @@
requests += [
SingleExecutionRequest(
name = "%s_index_res" % sub_dir,
category = category,
category = "%s_index" % sub_dir,
dep_files = [],
input_files = [index_file_txt],
output_files = [index_res_file],

data/buildtool/__init__.py

@@ -32,6 +32,8 @@ PkgFile = namedtuple("PkgFile", ["filename"])
IcuTool = namedtuple("IcuTool", ["name"])
SystemTool = namedtuple("SystemTool", ["name"])
DepTarget = namedtuple("DepTarget", ["name"])
SingleExecutionRequest = namedtuple("SingleExecutionRequest", [
# Used for identification purposes
"name",
@@ -39,7 +41,8 @@ SingleExecutionRequest = namedtuple("SingleExecutionRequest", [
# The filter category that applies to this request
"category",
# Dependency files; usually generated
# Names of targets (requests) or files that this request depends on;
# targets are of type DepTarget
"dep_files",
# Primary input files
@@ -66,7 +69,8 @@ RepeatedExecutionRequest = namedtuple("RepeatedExecutionRequest", [
# The filter category that applies to this request
"category",
# Dependency files; usually generated
# Names of targets (requests) or files that this request depends on;
# targets are of type DepTarget
"dep_files",
# Primary input files
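
As a side-by-side sketch of the new dependency semantics (self-contained; the namedtuple shapes mirror the ones above):

```python
from collections import namedtuple

OutFile = namedtuple("OutFile", ["filename"])
DepTarget = namedtuple("DepTarget", ["name"])

# Before: a request depended on concrete generated files.
deps_old = [OutFile("coll/ucadata.icu"), OutFile("timezoneTypes.res")]

# After: it names the targets that produce those files; the build tool
# resolves each name to whatever files that target actually emits.
deps_new = [DepTarget("coll_ucadata"), DepTarget("misc")]
```

flatten_requests resolves each DepTarget name back to that target's output files; see the dep_targets_to_files helper in utils.py below.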

data/buildtool/__main__.py

@@ -3,11 +3,12 @@
import argparse
import glob as pyglob
import json
import sys
from . import *
from .renderers import makefile, windirect
from . import utils
from . import filtration, utils
import BUILDRULES
flag_parser = argparse.ArgumentParser(
@@ -84,11 +85,17 @@ features_group.add_argument(
nargs = "+",
choices = AVAILABLE_FEATURES
)
features_group.add_argument(
"--filter_file",
help = "A path to a filter file.",
default = None
)
class Config(object):
def __init__(self, args):
# Process arguments
if args.whitelist:
self._feature_set = set(args.whitelist)
elif args.blacklist:
@@ -99,6 +106,23 @@ class Config(object):
# Either "unihan" or "implicithan"
self.coll_han_type = args.collation_ucadata
# Default fields before processing filter file
self.filters_json_data = {}
# Process filter file
if args.filter_file:
try:
with open(args.filter_file, "r") as f:
print("Note: Applying filters from %s." % args.filter_file, file=sys.stderr)
try:
import hjson
self.filters_json_data = hjson.load(f)
except ImportError:
self.filters_json_data = json.load(f)
except FileNotFoundError:
print("Error: Filter file not found at %s." % args.filter_file, file=sys.stderr)
exit(1)
def has_feature(self, feature_name):
assert feature_name in AVAILABLE_FEATURES
return feature_name in self._feature_set
@@ -139,6 +163,7 @@ def main():
return [v.replace("\\", "/")[len(args.glob_dir)+1:] for v in sorted(result_paths)]
build_dirs, requests = BUILDRULES.generate(config, glob, common)
requests = filtration.apply_filters(requests, config)
requests = utils.flatten_requests(requests, config, common)
if args.format == "gnumake":

data/buildtool/filtration.py

@@ -0,0 +1,273 @@
# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
from abc import ABC, abstractmethod
from collections import defaultdict
import re
import sys
from . import *
from . import utils
class Filter(ABC):
@staticmethod
def create_from_json(json_data):
if "filterType" in json_data:
filter_type = json_data["filterType"]
else:
filter_type = "file-stem"
if filter_type == "file-stem":
return FileStemFilter(json_data)
elif filter_type == "language":
return LanguageFilter(json_data)
elif filter_type == "regex":
return RegexFilter(json_data)
elif filter_type == "exclude":
return ExclusionFilter()
else:
print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
return None
@abstractmethod
def filter(self, request):
pass
class ExclusionFilter(Filter):
def filter(self, request):
return []
class WhitelistBlacklistFilter(Filter):
def __init__(self, json_data):
if "whitelist" in json_data:
self.is_whitelist = True
self.whitelist = json_data["whitelist"]
elif "blacklist" in json_data:
self.is_whitelist = False
self.blacklist = json_data["blacklist"]
def filter(self, request):
if isinstance(request, SingleExecutionRequest):
return self._filter_single(request)
elif isinstance(request, RepeatedExecutionRequest):
return self._filter_repeated(request)
elif isinstance(request, RepeatedOrSingleExecutionRequest):
return self._filter_repeated_or_single(request)
elif isinstance(request, IndexTxtRequest):
return self._filter_index_txt(request)
else:
# Assert that no other types are needed
for file in utils.get_input_files(request):
file_stem = self._file_to_file_stem(file)
assert self._should_include(file_stem), request
return [request]
def _filter_single(self, request):
new_input_files = []
new_format_with = defaultdict(utils.SpaceSeparatedList)
for i in range(len(request.input_files)):
file_stem = self._file_to_file_stem(request.input_files[i])
if self._should_include(file_stem):
new_input_files.append(request.input_files[i])
for k,v in request.format_with.items():
if isinstance(v, list):
new_format_with[k].append(v[i])
# Return a new request if there are still >= 1 input files.
if new_input_files:
return [
SingleExecutionRequest(
name = request.name,
category = request.category,
dep_files = request.dep_files,
input_files = new_input_files,
output_files = request.output_files,
tool = request.tool,
args = request.args,
format_with = utils.concat_dicts(request.format_with, new_format_with)
)
]
return []
def _filter_repeated(self, request):
new_input_files = []
new_output_files = []
new_format_with = defaultdict(utils.SpaceSeparatedList)
new_repeat_with = defaultdict(utils.SpaceSeparatedList)
for i in range(len(request.input_files)):
file_stem = self._file_to_file_stem(request.input_files[i])
if self._should_include(file_stem):
new_input_files.append(request.input_files[i])
new_output_files.append(request.output_files[i])
for k,v in request.format_with.items():
if isinstance(v, list):
new_format_with[k].append(v[i])
for k,v in request.repeat_with.items():
assert isinstance(v, list)
new_repeat_with[k].append(v[i])
# Return a new request if there are still >= 1 input files.
if new_input_files:
return [
RepeatedExecutionRequest(
name = request.name,
category = request.category,
dep_files = request.dep_files,
input_files = new_input_files,
output_files = new_output_files,
tool = request.tool,
args = request.args,
format_with = utils.concat_dicts(request.format_with, new_format_with),
repeat_with = utils.concat_dicts(request.repeat_with, new_repeat_with)
)
]
else:
return []
def _filter_repeated_or_single(self, request):
new_input_files = []
new_output_files = []
new_format_with = defaultdict(utils.SpaceSeparatedList)
new_repeat_with = defaultdict(utils.SpaceSeparatedList)
for i in range(len(request.input_files)):
file_stem = self._file_to_file_stem(request.input_files[i])
if self._should_include(file_stem):
new_input_files.append(request.input_files[i])
new_output_files.append(request.output_files[i])
for k,v in request.format_with.items():
if isinstance(v, list):
new_format_with[k].append(v[i])
for k,v in request.repeat_with.items():
assert isinstance(v, list)
new_repeat_with[k].append(v[i])
# Return a new request if there are still >= 1 input files.
if new_input_files:
return [
RepeatedOrSingleExecutionRequest(
name = request.name,
category = request.category,
dep_files = request.dep_files,
input_files = new_input_files,
output_files = new_output_files,
tool = request.tool,
args = request.args,
format_with = utils.concat_dicts(request.format_with, new_format_with),
repeat_with = utils.concat_dicts(request.repeat_with, new_repeat_with)
)
]
else:
return []
def _filter_index_txt(self, request):
new_input_files = []
for file in request.input_files:
file_stem = self._file_to_file_stem(file)
if self._should_include(file_stem):
new_input_files.append(file)
# Return a new request if there are still >= 1 input files.
if new_input_files:
return [
IndexTxtRequest(
name = request.name,
category = request.category,
input_files = new_input_files,
output_file = request.output_file,
cldr_version = request.cldr_version
)
]
else:
return []
@classmethod
def _file_to_file_stem(cls, file):
start = file.filename.rfind("/")
limit = file.filename.rfind(".")
return file.filename[start+1:limit]
@abstractmethod
def _should_include(self, file_stem):
pass
class FileStemFilter(WhitelistBlacklistFilter):
def _should_include(self, file_stem):
if self.is_whitelist:
return file_stem in self.whitelist
else:
return file_stem not in self.blacklist
class LanguageFilter(WhitelistBlacklistFilter):
def _should_include(self, file_stem):
language = file_stem.split("_")[0]
if language == "root":
# Always include root.txt
return True
if self.is_whitelist:
return language in self.whitelist
else:
return language not in self.blacklist
class RegexFilter(WhitelistBlacklistFilter):
def __init__(self, *args):
super().__init__(*args)
if self.is_whitelist:
self.whitelist = [re.compile(pat) for pat in self.whitelist]
else:
self.blacklist = [re.compile(pat) for pat in self.blacklist]
def _should_include(self, file_stem):
if self.is_whitelist:
for pattern in self.whitelist:
if pattern.match(file_stem):
return True
return False
else:
for pattern in self.blacklist:
if pattern.match(file_stem):
return False
return True
def apply_filters(old_requests, config):
"""Runs the filters and returns a new list of requests."""
filters = _preprocess_filters(old_requests, config)
new_requests = []
for request in old_requests:
category = utils.get_category(request)
if category in filters:
new_requests += filters[category].filter(request)
else:
new_requests.append(request)
return new_requests
def _preprocess_filters(requests, config):
all_categories = set(
utils.get_category(request)
for request in requests
)
all_categories.remove(None)
all_categories = list(sorted(all_categories))
json_data = config.filters_json_data
filters = {}
for category in all_categories:
if "featureFilters" in json_data and category in json_data["featureFilters"]:
filters[category] = Filter.create_from_json(
json_data["featureFilters"][category]
)
elif "localeFilter" in json_data and category[-5:] == "_tree":
filters[category] = Filter.create_from_json(
json_data["localeFilter"]
)
if "featureFilters" in json_data:
for category in json_data["featureFilters"]:
if category not in all_categories:
print("Warning: category %s is not known" % category, file=sys.stderr)
return filters
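
A quick usage sketch of the factory above; the import path assumes the buildtool package layout in this commit, and calling the private _should_include directly is for illustration only:

```python
from buildtool.filtration import Filter  # assumes data/ is on PYTHONPATH

locale_filter = Filter.create_from_json({
    "filterType": "language",
    "whitelist": ["de"],
})
assert locale_filter._should_include("de_CH")      # language prefix matches
assert locale_filter._should_include("root")       # root is always included
assert not locale_filter._should_include("en_US")  # filtered out
```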

data/buildtool/utils.py

@@ -1,6 +1,8 @@
# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
import sys
from . import *
@@ -57,6 +59,23 @@ def format_repeated_request_command(request, cmd_template, loop_vars, common_vars):
)
def dep_targets_to_files(this_request, all_requests):
if not this_request.dep_files:
return []
dep_files = []
for dep_target in this_request.dep_files:
for request in all_requests:
if request.name == dep_target.name:
dep_files += get_output_files(request)
break
else:
print("Warning: Unable to find target %s, a dependency of %s" % (
dep_target.name,
this_request.name
), file=sys.stderr)
return dep_files
def flatten_requests(raw_requests, config, common_vars):
"""Post-processes "meta" requests into normal requests.
@@ -73,7 +92,9 @@ def flatten_requests(raw_requests, config, common_vars):
flattened_requests.append(RepeatedExecutionRequest(
name = request.name,
category = request.category,
dep_files = request.dep_files,
dep_files = dep_targets_to_files(
request, raw_requests
),
input_files = request.input_files,
output_files = request.output_files,
tool = request.tool,
@@ -85,12 +106,45 @@
flattened_requests.append(SingleExecutionRequest(
name = request.name,
category = request.category,
input_files = request.dep_files + request.input_files,
input_files = request.input_files + dep_targets_to_files(
request, raw_requests
),
output_files = request.output_files,
tool = request.tool,
args = request.args,
format_with = concat_dicts(request.format_with, request.repeat_with)
))
elif isinstance(request, SingleExecutionRequest):
flattened_requests += [
SingleExecutionRequest(
name = request.name,
category = request.category,
dep_files = dep_targets_to_files(
request, raw_requests
),
input_files = request.input_files,
output_files = request.output_files,
tool = request.tool,
args = request.args,
format_with = request.format_with
)
]
elif isinstance(request, RepeatedExecutionRequest):
flattened_requests += [
RepeatedExecutionRequest(
name = request.name,
category = request.category,
dep_files = dep_targets_to_files(
request, raw_requests
),
input_files = request.input_files,
output_files = request.output_files,
tool = request.tool,
args = request.args,
format_with = request.format_with,
repeat_with = request.repeat_with
)
]
elif isinstance(request, ListRequest):
list_files = list(sorted(get_all_output_files(raw_requests)))
if request.include_tmp:
@@ -182,6 +236,19 @@ def get_output_files(request):
assert False
def get_category(request):
if isinstance(request, SingleExecutionRequest):
return request.category
elif isinstance(request, RepeatedExecutionRequest):
return request.category
elif isinstance(request, RepeatedOrSingleExecutionRequest):
return request.category
elif isinstance(request, IndexTxtRequest):
return request.category
else:
return None
def get_all_output_files(requests, include_tmp=False):
files = []
for request in requests:
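
The dep_targets_to_files helper above relies on Python's for/else: the else block runs only when the inner loop finishes without hitting break, i.e. when no request matched the target name. A self-contained sketch with simplified request tuples (resolve_deps is a name invented here):

```python
import sys
from collections import namedtuple

OutFile = namedtuple("OutFile", ["filename"])
DepTarget = namedtuple("DepTarget", ["name"])
Request = namedtuple("Request", ["name", "dep_files", "output_files"])

def resolve_deps(this_request, all_requests):
    dep_files = []
    for dep_target in this_request.dep_files:
        for request in all_requests:
            if request.name == dep_target.name:
                dep_files += request.output_files
                break
        else:  # no break: no request carries this name
            print("Warning: Unable to find target %s" % dep_target.name,
                  file=sys.stderr)
    return dep_files

requests = [
    Request("cnvalias", [], [OutFile("cnvalias.icu")]),
    Request("confusables", [DepTarget("cnvalias")], [OutFile("confusables.cfu")]),
]
print(resolve_deps(requests[1], requests))  # [OutFile(filename='cnvalias.icu')]
```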

test/cintltst/cdattst.c

@@ -1634,6 +1634,7 @@ static void TestOverrideNumberFormat(void) {
// loop 5 times to check getter/setter
for (i = 0; i < 5; i++){
status = U_ZERO_ERROR;
UNumberFormat* overrideFmt;
overrideFmt = unum_open(UNUM_DEFAULT, NULL, 0, localeString, NULL, &status);
assertSuccess("unum_open()", &status);
@@ -1647,15 +1648,19 @@ static void TestOverrideNumberFormat(void) {
}
}
{
status = U_ZERO_ERROR;
UNumberFormat* overrideFmt;
overrideFmt = unum_open(UNUM_DEFAULT, NULL, 0, localeString, NULL, &status);
assertSuccess("unum_open()", &status);
udat_setNumberFormat(fmt, overrideFmt); // test the same override NF will not crash
if (U_SUCCESS(status)) {
udat_setNumberFormat(fmt, overrideFmt); // test the same override NF will not crash
}
unum_close(overrideFmt);
}
udat_close(fmt);
for (i=0; i<UPRV_LENGTHOF(overrideNumberFormat); i++){
status = U_ZERO_ERROR;
UChar ubuf[kUbufMax];
UDateFormat* fmt2;
UNumberFormat* overrideFmt2;
@@ -1666,6 +1671,10 @@ static void TestOverrideNumberFormat(void) {
overrideFmt2 = unum_open(UNUM_DEFAULT, NULL, 0, localeString, NULL, &status);
assertSuccess("unum_open() in loop", &status);
if (U_FAILURE(status)) {
continue;
}
u_uastrcpy(fields, overrideNumberFormat[i][0]);
u_unescape(overrideNumberFormat[i][1], expected, UPRV_LENGTHOF(expected));

test/intltest/dtfmttst.cpp

@@ -4758,6 +4758,10 @@ void DateFormatTest::TestNumberFormatOverride() {
assertSuccess("SimpleDateFormat with pattern MM d", status);
NumberFormat* overrideNF = NumberFormat::createInstance(Locale::createFromName("zh@numbers=hanidays"),status);
assertSuccess("NumberFormat zh@numbers=hanidays", status);
if (U_FAILURE(status)) {
status = U_ZERO_ERROR;
continue;
}
if (fields == (UnicodeString) "") { // use the one w/o fields
fmt->adoptNumberFormat(overrideNF);

test/intltest/numfmtst.cpp

@@ -8335,6 +8335,9 @@ void NumberFormatTest::Test11739_ParseLongCurrency() {
ParsePosition ppos(0);
LocalPointer<CurrencyAmount> result(nf->parseCurrency(u"1.500 амерички долар", ppos));
assertEquals("Should parse to 1500 USD", -1, ppos.getErrorIndex());
if (ppos.getErrorIndex() != -1) {
return;
}
assertEquals("Should parse to 1500 USD", 1500LL, result->getNumber().getInt64(status));
assertEquals("Should parse to 1500 USD", u"USD", result->getISOCurrency());
}