From a2a39e69d030e440c7304d8607c31da41c7ead97 Mon Sep 17 00:00:00 2001 From: Shane Carr Date: Thu, 6 Dec 2018 00:33:10 -0800 Subject: [PATCH] ICU-10923 Adding initial version of data file filtering. - Reads filters.json or .hjson from ICU_DATA_FILTER_FILE environment variable - Adds DepTarget for dependency semantics, and warns for missing deps. - Fixes certain tests that crash with sliced locale data. - Includes support for 4 filter types. --- icu4c/source/configure | 1 + icu4c/source/configure.ac | 1 + icu4c/source/data/BUILDRULES.py | 32 +-- icu4c/source/data/buildtool/__init__.py | 8 +- icu4c/source/data/buildtool/__main__.py | 27 ++- icu4c/source/data/buildtool/filtration.py | 273 ++++++++++++++++++++++ icu4c/source/data/buildtool/utils.py | 71 +++++- icu4c/source/test/cintltst/cdattst.c | 11 +- icu4c/source/test/intltest/dtfmttst.cpp | 4 + icu4c/source/test/intltest/numfmtst.cpp | 3 + 10 files changed, 405 insertions(+), 26 deletions(-) create mode 100644 icu4c/source/data/buildtool/filtration.py diff --git a/icu4c/source/configure b/icu4c/source/configure index 420350c844c..441e30e6bdc 100755 --- a/icu4c/source/configure +++ b/icu4c/source/configure @@ -9118,6 +9118,7 @@ else --format gnumake \ --seqmode parallel \ --glob_dir "$srcdir/data" \ + --filter_file "$ICU_DATA_FILTER_FILE" \ > data/rules.mk echo "Spawning Python to generate test/testdata/rules.mk..." PYTHONPATH="$srcdir/test/testdata:$srcdir/data" $PYTHON3 -m buildtool \ diff --git a/icu4c/source/configure.ac b/icu4c/source/configure.ac index 60c59d45541..81f7e10ca15 100644 --- a/icu4c/source/configure.ac +++ b/icu4c/source/configure.ac @@ -1394,6 +1394,7 @@ else --format gnumake \ --seqmode parallel \ --glob_dir "$srcdir/data" \ + --filter_file "$ICU_DATA_FILTER_FILE" \ > data/rules.mk echo "Spawning Python to generate test/testdata/rules.mk..." PYTHONPATH="$srcdir/test/testdata:$srcdir/data" $PYTHON3 -m buildtool \ diff --git a/icu4c/source/data/BUILDRULES.py b/icu4c/source/data/BUILDRULES.py index 6de0b3edd3c..c1e45a74921 100644 --- a/icu4c/source/data/BUILDRULES.py +++ b/icu4c/source/data/BUILDRULES.py @@ -54,16 +54,6 @@ def generate(config, glob, common_vars): requests += generate_curr_supplemental(config, glob, common_vars) requests += generate_translit(config, glob, common_vars) - # FIXME: Clean this up (duplicated logic) - brkitr_brk_files = [] - input_files = [InFile(filename) for filename in glob("brkitr/rules/*.txt")] - output_files = [OutFile("brkitr/%s.brk" % v.filename[13:-4]) for v in input_files] - brkitr_brk_files += output_files - dict_files = [] - input_files = [InFile(filename) for filename in glob("brkitr/dictionaries/*.txt")] - output_files = [OutFile("brkitr/%s.dict" % v.filename[20:-4]) for v in input_files] - dict_files += output_files - # Res Tree Files # (input dirname, output dirname, resfiles.mk path, mk version var, mk source var, use pool file, dep files) requests += generate_tree(config, glob, common_vars, @@ -120,8 +110,6 @@ def generate(config, glob, common_vars): True, []) - # TODO: We should not need timezoneTypes.res to build collation resource bundles. - # TODO: Maybe keyTypeData.res should be baked into the common library. requests += generate_tree(config, glob, common_vars, "coll", "coll", @@ -129,7 +117,10 @@ def generate(config, glob, common_vars): "COLLATION_CLDR_VERSION", "COLLATION_SOURCE", False, - [OutFile("coll/ucadata.icu"), OutFile("timezoneTypes.res"), OutFile("keyTypeData.res")]) + # Depends on timezoneTypes.res and keyTypeData.res. + # TODO: We should not need this dependency to build collation. + # TODO: Bake keyTypeData.res into the common library? + [DepTarget("coll_ucadata"), DepTarget("misc")]) requests += generate_tree(config, glob, common_vars, "brkitr", @@ -138,7 +129,7 @@ def generate(config, glob, common_vars): "BRK_RES_CLDR_VERSION", "BRK_RES_SOURCE", False, - brkitr_brk_files + dict_files) + [DepTarget("brkitr_brk"), DepTarget("brkitr_dictionaries")]) requests += generate_tree(config, glob, common_vars, "rbnf", @@ -189,7 +180,7 @@ def generate_confusables(config, glob, common_vars): SingleExecutionRequest( name = "confusables", category = "confusables", - dep_files = [OutFile("cnvalias.icu")], + dep_files = [DepTarget("cnvalias")], input_files = [txt1, txt2], output_files = [cfu], tool = IcuTool("gencfu"), @@ -231,7 +222,7 @@ def generate_brkitr_brk(config, glob, common_vars): RepeatedExecutionRequest( name = "brkitr_brk", category = "brkitr_rules", - dep_files = [OutFile("cnvalias.icu")], + dep_files = [DepTarget("cnvalias")], input_files = input_files, output_files = output_files, tool = IcuTool("genbrk"), @@ -459,13 +450,14 @@ def generate_tree( # Generate Pool Bundle if use_pool_bundle: input_pool_files = [OutFile("%spool.res" % out_prefix)] + pool_target_name = "%s_pool_write" % sub_dir use_pool_bundle_option = "--usePoolBundle {OUT_DIR}/{OUT_PREFIX}".format( OUT_PREFIX = out_prefix, **common_vars ) requests += [ SingleExecutionRequest( - name = "%s_pool_write" % sub_dir, + name = pool_target_name, category = category, dep_files = dep_files, input_files = input_files, @@ -481,8 +473,8 @@ def generate_tree( } ), ] + dep_files = dep_files + [DepTarget(pool_target_name)] else: - input_pool_files = [] use_pool_bundle_option = "" # Generate Res File Tree @@ -490,7 +482,7 @@ def generate_tree( RepeatedOrSingleExecutionRequest( name = "%s_res" % sub_dir, category = category, - dep_files = dep_files + input_pool_files, + dep_files = dep_files, input_files = input_files, output_files = output_files, tool = IcuTool("genrb"), @@ -545,7 +537,7 @@ def generate_tree( requests += [ SingleExecutionRequest( name = "%s_index_res" % sub_dir, - category = category, + category = "%s_index" % sub_dir, dep_files = [], input_files = [index_file_txt], output_files = [index_res_file], diff --git a/icu4c/source/data/buildtool/__init__.py b/icu4c/source/data/buildtool/__init__.py index cc95cb89712..09d10ad43d6 100644 --- a/icu4c/source/data/buildtool/__init__.py +++ b/icu4c/source/data/buildtool/__init__.py @@ -32,6 +32,8 @@ PkgFile = namedtuple("PkgFile", ["filename"]) IcuTool = namedtuple("IcuTool", ["name"]) SystemTool = namedtuple("SystemTool", ["name"]) +DepTarget = namedtuple("DepTarget", ["name"]) + SingleExecutionRequest = namedtuple("SingleExecutionRequest", [ # Used for identification purposes "name", @@ -39,7 +41,8 @@ SingleExecutionRequest = namedtuple("SingleExecutionRequest", [ # The filter category that applies to this request "category", - # Dependency files; usually generated + # Names of targets (requests) or files that this request depends on; + # targets are of type DepTarget "dep_files", # Primary input files @@ -66,7 +69,8 @@ RepeatedExecutionRequest = namedtuple("RepeatedExecutionRequest", [ # The filter category that applies to this request "category", - # Dependency files; usually generated + # Names of targets (requests) or files that this request depends on; + # targets are of type DepTarget "dep_files", # Primary input files diff --git a/icu4c/source/data/buildtool/__main__.py b/icu4c/source/data/buildtool/__main__.py index 54fc351cb55..88b1387dd78 100644 --- a/icu4c/source/data/buildtool/__main__.py +++ b/icu4c/source/data/buildtool/__main__.py @@ -3,11 +3,12 @@ import argparse import glob as pyglob +import json import sys from . import * from .renderers import makefile, windirect -from . import utils +from . import filtration, utils import BUILDRULES flag_parser = argparse.ArgumentParser( @@ -84,11 +85,17 @@ features_group.add_argument( nargs = "+", choices = AVAILABLE_FEATURES ) +features_group.add_argument( + "--filter_file", + help = "A path to a filter file.", + default = None +) class Config(object): def __init__(self, args): + # Process arguments if args.whitelist: self._feature_set = set(args.whitelist) elif args.blacklist: @@ -99,6 +106,23 @@ class Config(object): # Either "unihan" or "implicithan" self.coll_han_type = args.collation_ucadata + # Default fields before processing filter file + self.filters_json_data = {} + + # Process filter file + if args.filter_file: + try: + with open(args.filter_file, "r") as f: + print("Note: Applying filters from %s." % args.filter_file, file=sys.stderr) + try: + import hjson + self.filters_json_data = hjson.load(f) + except ImportError: + self.filters_json_data = json.load(f) + except FileNotFoundError: + print("Error: Filter file not found at %s." % args.filter_file, file=sys.stderr) + exit(1) + def has_feature(self, feature_name): assert feature_name in AVAILABLE_FEATURES return feature_name in self._feature_set @@ -139,6 +163,7 @@ def main(): return [v.replace("\\", "/")[len(args.glob_dir)+1:] for v in sorted(result_paths)] build_dirs, requests = BUILDRULES.generate(config, glob, common) + requests = filtration.apply_filters(requests, config) requests = utils.flatten_requests(requests, config, common) if args.format == "gnumake": diff --git a/icu4c/source/data/buildtool/filtration.py b/icu4c/source/data/buildtool/filtration.py new file mode 100644 index 00000000000..b86466c596a --- /dev/null +++ b/icu4c/source/data/buildtool/filtration.py @@ -0,0 +1,273 @@ +# Copyright (C) 2018 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html + +from abc import ABC, abstractmethod +from collections import defaultdict +import re +import sys + +from . import * +from . import utils + + +class Filter(ABC): + @staticmethod + def create_from_json(json_data): + if "filterType" in json_data: + filter_type = json_data["filterType"] + else: + filter_type = "file-stem" + + if filter_type == "file-stem": + return FileStemFilter(json_data) + elif filter_type == "language": + return LanguageFilter(json_data) + elif filter_type == "regex": + return RegexFilter(json_data) + elif filter_type == "exclude": + return ExclusionFilter() + else: + print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr) + return None + + @abstractmethod + def filter(self, request): + pass + + +class ExclusionFilter(Filter): + def filter(self, request): + return [] + + +class WhitelistBlacklistFilter(Filter): + def __init__(self, json_data): + if "whitelist" in json_data: + self.is_whitelist = True + self.whitelist = json_data["whitelist"] + elif "blacklist" in json_data: + self.is_whitelist = False + self.blacklist = json_data["blacklist"] + + def filter(self, request): + if isinstance(request, SingleExecutionRequest): + return self._filter_single(request) + elif isinstance(request, RepeatedExecutionRequest): + return self._filter_repeated(request) + elif isinstance(request, RepeatedOrSingleExecutionRequest): + return self._filter_repeated_or_single(request) + elif isinstance(request, IndexTxtRequest): + return self._filter_index_txt(request) + else: + # Assert that no other types are needed + for file in utils.get_input_files(request): + file_stem = self._file_to_file_stem(file) + assert self._should_include(file_stem), request + return [request] + + def _filter_single(self, request): + new_input_files = [] + new_format_with = defaultdict(utils.SpaceSeparatedList) + for i in range(len(request.input_files)): + file_stem = self._file_to_file_stem(request.input_files[i]) + if self._should_include(file_stem): + new_input_files.append(request.input_files[i]) + for k,v in request.format_with.items(): + if isinstance(v, list): + new_format_with[k].append(v[i]) + + # Return a new request if there are still >= 1 input files. + if new_input_files: + return [ + SingleExecutionRequest( + name = request.name, + category = request.category, + dep_files = request.dep_files, + input_files = new_input_files, + output_files = request.output_files, + tool = request.tool, + args = request.args, + format_with = utils.concat_dicts(request.format_with, new_format_with) + ) + ] + return [] + + def _filter_repeated(self, request): + new_input_files = [] + new_output_files = [] + new_format_with = defaultdict(utils.SpaceSeparatedList) + new_repeat_with = defaultdict(utils.SpaceSeparatedList) + for i in range(len(request.input_files)): + file_stem = self._file_to_file_stem(request.input_files[i]) + if self._should_include(file_stem): + new_input_files.append(request.input_files[i]) + new_output_files.append(request.output_files[i]) + for k,v in request.format_with.items(): + if isinstance(v, list): + new_format_with[k].append(v[i]) + for k,v in request.repeat_with.items(): + assert isinstance(v, list) + new_repeat_with[k].append(v[i]) + + # Return a new request if there are still >= 1 input files. + if new_input_files: + return [ + RepeatedExecutionRequest( + name = request.name, + category = request.category, + dep_files = request.dep_files, + input_files = new_input_files, + output_files = new_output_files, + tool = request.tool, + args = request.args, + format_with = utils.concat_dicts(request.format_with, new_format_with), + repeat_with = utils.concat_dicts(request.repeat_with, new_repeat_with) + ) + ] + else: + return [] + + def _filter_repeated_or_single(self, request): + new_input_files = [] + new_output_files = [] + new_format_with = defaultdict(utils.SpaceSeparatedList) + new_repeat_with = defaultdict(utils.SpaceSeparatedList) + for i in range(len(request.input_files)): + file_stem = self._file_to_file_stem(request.input_files[i]) + if self._should_include(file_stem): + new_input_files.append(request.input_files[i]) + new_output_files.append(request.output_files[i]) + for k,v in request.format_with.items(): + if isinstance(v, list): + new_format_with[k].append(v[i]) + for k,v in request.repeat_with.items(): + assert isinstance(v, list) + new_repeat_with[k].append(v[i]) + + # Return a new request if there are still >= 1 input files. + if new_input_files: + return [ + RepeatedOrSingleExecutionRequest( + name = request.name, + category = request.category, + dep_files = request.dep_files, + input_files = new_input_files, + output_files = new_output_files, + tool = request.tool, + args = request.args, + format_with = utils.concat_dicts(request.format_with, new_format_with), + repeat_with = utils.concat_dicts(request.repeat_with, new_repeat_with) + ) + ] + else: + return [] + + def _filter_index_txt(self, request): + new_input_files = [] + for file in request.input_files: + file_stem = self._file_to_file_stem(file) + if self._should_include(file_stem): + new_input_files.append(file) + + # Return a new request if there are still >= 1 input files. + if new_input_files: + return [ + IndexTxtRequest( + name = request.name, + category = request.category, + input_files = new_input_files, + output_file = request.output_file, + cldr_version = request.cldr_version + ) + ] + else: + return [] + + @classmethod + def _file_to_file_stem(cls, file): + start = file.filename.rfind("/") + limit = file.filename.rfind(".") + return file.filename[start+1:limit] + + @abstractmethod + def _should_include(self, file_stem): + pass + + +class FileStemFilter(WhitelistBlacklistFilter): + def _should_include(self, file_stem): + if self.is_whitelist: + return file_stem in self.whitelist + else: + return file_stem not in self.blacklist + + +class LanguageFilter(WhitelistBlacklistFilter): + def _should_include(self, file_stem): + language = file_stem.split("_")[0] + if language == "root": + # Always include root.txt + return True + if self.is_whitelist: + return language in self.whitelist + else: + return language not in self.blacklist + + +class RegexFilter(WhitelistBlacklistFilter): + def __init__(self, *args): + super().__init__(*args) + if self.is_whitelist: + self.whitelist = [re.compile(pat) for pat in self.whitelist] + else: + self.blacklist = [re.compile(pat) for pat in self.blacklist] + + def _should_include(self, file_stem): + if self.is_whitelist: + for pattern in self.whitelist: + if pattern.match(file_stem): + return True + return False + else: + for pattern in self.blacklist: + if pattern.match(file_stem): + return False + return True + + +def apply_filters(old_requests, config): + """Runs the filters and returns a new list of requests.""" + filters = _preprocess_filters(old_requests, config) + new_requests = [] + for request in old_requests: + category = utils.get_category(request) + if category in filters: + new_requests += filters[category].filter(request) + else: + new_requests.append(request) + return new_requests + + +def _preprocess_filters(requests, config): + all_categories = set( + utils.get_category(request) + for request in requests + ) + all_categories.remove(None) + all_categories = list(sorted(all_categories)) + json_data = config.filters_json_data + filters = {} + for category in all_categories: + if "featureFilters" in json_data and category in json_data["featureFilters"]: + filters[category] = Filter.create_from_json( + json_data["featureFilters"][category] + ) + elif "localeFilter" in json_data and category[-5:] == "_tree": + filters[category] = Filter.create_from_json( + json_data["localeFilter"] + ) + if "featureFilters" in json_data: + for category in json_data["featureFilters"]: + if category not in all_categories: + print("Warning: category %s is not known" % category, file=sys.stderr) + return filters diff --git a/icu4c/source/data/buildtool/utils.py b/icu4c/source/data/buildtool/utils.py index e01a9c60aa0..fb5cf6d747a 100644 --- a/icu4c/source/data/buildtool/utils.py +++ b/icu4c/source/data/buildtool/utils.py @@ -1,6 +1,8 @@ # Copyright (C) 2018 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html +import sys + from . import * @@ -57,6 +59,23 @@ def format_repeated_request_command(request, cmd_template, loop_vars, common_var ) +def dep_targets_to_files(this_request, all_requests): + if not this_request.dep_files: + return [] + dep_files = [] + for dep_target in this_request.dep_files: + for request in all_requests: + if request.name == dep_target.name: + dep_files += get_output_files(request) + break + else: + print("Warning: Unable to find target %s, a dependency of %s" % ( + dep_target.name, + this_request.name + ), file=sys.stderr) + return dep_files + + def flatten_requests(raw_requests, config, common_vars): """Post-processes "meta" requests into normal requests. @@ -73,7 +92,9 @@ def flatten_requests(raw_requests, config, common_vars): flattened_requests.append(RepeatedExecutionRequest( name = request.name, category = request.category, - dep_files = request.dep_files, + dep_files = dep_targets_to_files( + request, raw_requests + ), input_files = request.input_files, output_files = request.output_files, tool = request.tool, @@ -85,12 +106,45 @@ def flatten_requests(raw_requests, config, common_vars): flattened_requests.append(SingleExecutionRequest( name = request.name, category = request.category, - input_files = request.dep_files + request.input_files, + input_files = request.input_files + dep_targets_to_files( + request, raw_requests + ), output_files = request.output_files, tool = request.tool, args = request.args, format_with = concat_dicts(request.format_with, request.repeat_with) )) + elif isinstance(request, SingleExecutionRequest): + flattened_requests += [ + SingleExecutionRequest( + name = request.name, + category = request.category, + dep_files = dep_targets_to_files( + request, raw_requests + ), + input_files = request.input_files, + output_files = request.output_files, + tool = request.tool, + args = request.args, + format_with = request.format_with + ) + ] + elif isinstance(request, RepeatedExecutionRequest): + flattened_requests += [ + RepeatedExecutionRequest( + name = request.name, + category = request.category, + dep_files = dep_targets_to_files( + request, raw_requests + ), + input_files = request.input_files, + output_files = request.output_files, + tool = request.tool, + args = request.args, + format_with = request.format_with, + repeat_with = request.repeat_with + ) + ] elif isinstance(request, ListRequest): list_files = list(sorted(get_all_output_files(raw_requests))) if request.include_tmp: @@ -182,6 +236,19 @@ def get_output_files(request): assert False +def get_category(request): + if isinstance(request, SingleExecutionRequest): + return request.category + elif isinstance(request, RepeatedExecutionRequest): + return request.category + elif isinstance(request, RepeatedOrSingleExecutionRequest): + return request.category + elif isinstance(request, IndexTxtRequest): + return request.category + else: + return None + + def get_all_output_files(requests, include_tmp=False): files = [] for request in requests: diff --git a/icu4c/source/test/cintltst/cdattst.c b/icu4c/source/test/cintltst/cdattst.c index acdfa6c1f18..70461b0b2af 100644 --- a/icu4c/source/test/cintltst/cdattst.c +++ b/icu4c/source/test/cintltst/cdattst.c @@ -1634,6 +1634,7 @@ static void TestOverrideNumberFormat(void) { // loop 5 times to check getter/setter for (i = 0; i < 5; i++){ + status = U_ZERO_ERROR; UNumberFormat* overrideFmt; overrideFmt = unum_open(UNUM_DEFAULT, NULL, 0, localeString, NULL, &status); assertSuccess("unum_open()", &status); @@ -1647,15 +1648,19 @@ static void TestOverrideNumberFormat(void) { } } { + status = U_ZERO_ERROR; UNumberFormat* overrideFmt; overrideFmt = unum_open(UNUM_DEFAULT, NULL, 0, localeString, NULL, &status); assertSuccess("unum_open()", &status); - udat_setNumberFormat(fmt, overrideFmt); // test the same override NF will not crash + if (U_SUCCESS(status)) { + udat_setNumberFormat(fmt, overrideFmt); // test the same override NF will not crash + } unum_close(overrideFmt); } udat_close(fmt); for (i=0; iadoptNumberFormat(overrideNF); diff --git a/icu4c/source/test/intltest/numfmtst.cpp b/icu4c/source/test/intltest/numfmtst.cpp index 52ed34d9105..3594c730d18 100644 --- a/icu4c/source/test/intltest/numfmtst.cpp +++ b/icu4c/source/test/intltest/numfmtst.cpp @@ -8335,6 +8335,9 @@ void NumberFormatTest::Test11739_ParseLongCurrency() { ParsePosition ppos(0); LocalPointer result(nf->parseCurrency(u"1.500 амерички долар", ppos)); assertEquals("Should parse to 1500 USD", -1, ppos.getErrorIndex()); + if (ppos.getErrorIndex() != -1) { + return; + } assertEquals("Should parse to 1500 USD", 1500LL, result->getNumber().getInt64(status)); assertEquals("Should parse to 1500 USD", u"USD", result->getISOCurrency()); }