From 949f4d6dfc15701a506d75ddaef179b4724ad9b3 Mon Sep 17 00:00:00 2001 From: Maksim Andrianov Date: Tue, 7 May 2019 13:52:09 +0300 Subject: [PATCH] Review fixes --- tools/python/post_generation/__main__.py | 19 ++-- .../post_generation/hierarchy_to_countries.py | 101 +++++++++--------- .../post_generation/localads_mwm_to_csv.py | 7 +- tools/unix/generate_planet.sh | 4 +- 4 files changed, 64 insertions(+), 67 deletions(-) diff --git a/tools/python/post_generation/__main__.py b/tools/python/post_generation/__main__.py index 6f7d4cf111..c004c7deb9 100644 --- a/tools/python/post_generation/__main__.py +++ b/tools/python/post_generation/__main__.py @@ -9,9 +9,9 @@ from .localads_mwm_to_csv import create_csv class PostGeneration: def __init__(self): parser = argparse.ArgumentParser( - description="Post generation instruments", + description="Post-generation instruments", usage="""post_generation [] -The most commonly used post_generation commands are: +The post_generation commands are: localads_mwm_to_csv Prepares CSV files for uploading to localads database from mwm files. hierarchy_to_countries Produces countries.txt from hierarchy.txt. 
""") @@ -42,17 +42,16 @@ The most commonly used post_generation commands are: help="path to omim/data/types.txt") parser.add_argument("--threads", type=int, + default=1, help="number of threads to process files") - parser.add_argument("--version", type=int, help="override mwm version") - parser.add_argument("--debug", - action="store_true", - help="debug parse_mwm call") + parser.add_argument("--mwm_version", type=int, required=True, + help="Mwm version") args = parser.parse_args(sys.argv[2:]) if not args.osm2ft: args.osm2ft = args.mwm create_csv(args.output, args.mwm, args.osm2ft, args.types, - args.version, args.threads, args.debug) + args.mwm_version, args.threads) @staticmethod def hierarchy_to_countries(): @@ -67,15 +66,15 @@ The most commonly used post_generation commands are: help="old_vs_new.csv file") parser.add_argument("--osm", required=True, help="borders_vs_osm.csv file") - parser.add_argument("--version", type=int, default=151231, - help="Version") + parser.add_argument("--mwm_version", type=int, required=True, + help="Mwm version") parser.add_argument("-o", "--output", required=True, help="Output countries.txt file (default is stdout)") args = parser.parse_args(sys.argv[2:]) countries_json = hierarchy_to_countries_(args.old, args.osm, args.hierarchy, args.target, - args.version) + args.mwm_version) if args.output: with open(args.output, "w") as f: f.write(countries_json) diff --git a/tools/python/post_generation/hierarchy_to_countries.py b/tools/python/post_generation/hierarchy_to_countries.py index 66dddbb92b..c5683b6252 100755 --- a/tools/python/post_generation/hierarchy_to_countries.py +++ b/tools/python/post_generation/hierarchy_to_countries.py @@ -4,7 +4,7 @@ # # Sample lines: # Iran;Q794;ir;fa -# Iran_South;Q794-South +# Iran_South;Q794-South # # Number of leading spaces mean hierarchy depth. In above case, Iran_South is inside Iran. # Then follows a semicolon-separated list: @@ -12,6 +12,7 @@ # 2. 
Region name template using wikidata Qxxx codes and predefined strings # 3. Country ISO code (used for flags in the legacy format) # 4. Comma-separated list of language ISO codes for the region + import base64 import hashlib import json @@ -22,7 +23,7 @@ import re class CountryDict(dict): def __init__(self, *args, **kwargs): dict.__init__(self, *args, **kwargs) - self.order = ["id", "n", "f", "v", "c", "s", "sha1_base64", "rs", "g"] + self.order = ["id", "n", "v", "c", "s", "sha1_base64", "rs", "g"] def __iter__(self): for key in self.order: @@ -37,7 +38,7 @@ class CountryDict(dict): yield (key, self.__getitem__(key)) -def get_hash(path, name): +def get_mwm_hash(path, name): filename = os.path.join(path, f"{name}.mwm") h = hashlib.sha1() with open(filename, "rb") as f: @@ -46,7 +47,7 @@ def get_hash(path, name): return str(base64.b64encode(h.digest()), "utf-8") -def get_size(path, name): +def get_mwm_size(path, name): filename = os.path.join(path, f"{name}.mwm") return os.path.getsize(filename) @@ -82,14 +83,14 @@ def parse_old_vs_new(old_vs_new_csv_path): if not old_vs_new_csv_path: return oldvs - with open(old_vs_new_csv_path, "r") as f: + with open(old_vs_new_csv_path) as f: for line in f: m = re.match(r"(.+?)\t(.+)", line.strip()) - if m: - if m.group(2) in oldvs: - oldvs[m.group(2)].append(m.group(1)) - else: - oldvs[m.group(2)] = [m.group(1)] + assert m + if m.group(2) in oldvs: + oldvs[m.group(2)].append(m.group(1)) + else: + oldvs[m.group(2)] = [m.group(1)] return oldvs @@ -98,60 +99,60 @@ def parse_borders_vs_osm(borders_vs_osm_csv_path): if not borders_vs_osm_csv_path: return vsosm - with open(borders_vs_osm_csv_path, "r") as f: + with open(borders_vs_osm_csv_path) as f: for line in f: - m = re.match(r"^(.+?)\t(\d)\t(.+?)$", line.strip()) - if m: - if m.group(1) in vsosm: - vsosm[m.group(1)].append(m.group(3)) - else: - vsosm[m.group(1)] = [m.group(3)] + m = re.match(r"(.+)\t(\d)\t(.+)", line.strip()) + assert m + if m.group(1) in vsosm: + 
vsosm[m.group(1)].append(m.group(3)) + else: + vsosm[m.group(1)] = [m.group(3)] return vsosm def hierarchy_to_countries(old_vs_new_csv_path, borders_vs_osm_csv_path, hierarchy_path, target_path, version): + + def fill_last(last, stack): + name = last["id"] + last["s"] = get_mwm_size(target_path, name) + last["sha1_base64"] = get_mwm_hash(target_path, name) + if last["s"] >= 0: + stack[-1]["g"].append(last) + oldvs = parse_old_vs_new(old_vs_new_csv_path) vsosm = parse_borders_vs_osm(borders_vs_osm_csv_path) stack = [CountryDict(v=version, nameattr="Countries", g=[])] last = None - with open(hierarchy_path, "r") as f: + with open(hierarchy_path) as f: for line in f: - m = re.match("( *)(.+?)\n", line) - if m: - depth = len(m.group(1)) - if last is not None: - lastd = last["d"] - del last["d"] - if lastd < depth: - # last is a group - last["g"] = [] - stack.append(last) - else: - name = last["f" if "f" in last else "id"] - last["s"] = get_size(target_path, name) - last["sha1_base64"] = get_hash(target_path, name) - if last["s"] >= 0: - stack[-1]["g"].append(last) - while depth < len(stack) - 1: - # group ended, add it to higher group - g = stack.pop() - if len(g["g"]) > 0: - stack[-1]["g"].append(g) - items = m.group(2).split(";") - last = CountryDict({"id": items[0], "d": depth}) - if items[0] in oldvs: - last["old"] = oldvs[items[0]] - if items[0] in vsosm: - last["affiliations"] = vsosm[items[0]] + m = re.match("( *)(.+)", line) + assert m + depth = len(m.group(1)) + if last is not None: + lastd = last["d"] + del last["d"] + if lastd < depth: + # last is a group + last["g"] = [] + stack.append(last) + else: + fill_last(last, stack) + while depth < len(stack) - 1: + # group ended, add it to higher group + g = stack.pop() + if len(g["g"]) > 0: + stack[-1]["g"].append(g) + items = m.group(2).split(";") + last = CountryDict({"id": items[0], "d": depth}) + if items[0] in oldvs: + last["old"] = oldvs[items[0]] + if items[0] in vsosm: + last["affiliations"] = 
vsosm[items[0]] # the last line is always a file del last["d"] - name = last["f" if "f" in last else "id"] - last["s"] = get_size(target_path, name) - last["sha1_base64"] = get_hash(target_path, name) - if last["s"] >= 0: - stack[-1]["g"].append(last) + fill_last(last, stack) while len(stack) > 1: g = stack.pop() if len(g["g"]) > 0: diff --git a/tools/python/post_generation/localads_mwm_to_csv.py b/tools/python/post_generation/localads_mwm_to_csv.py index caa0b77d7a..af779a09c5 100755 --- a/tools/python/post_generation/localads_mwm_to_csv.py +++ b/tools/python/post_generation/localads_mwm_to_csv.py @@ -80,7 +80,7 @@ def write_csv(output_dir, qtype): mapping = QUEUES[qtype].get() -def create_csv(output, mwm_path, osm2ft_path, types, version, threads, debug=False): +def create_csv(output, mwm_path, osm2ft_path, types, version, threads): if not os.path.isdir(output): os.mkdir(output) @@ -97,10 +97,7 @@ def create_csv(output, mwm_path, osm2ft_path, types, version, threads, debug=Fal logging.error("Cannot find %s", osm2ft_name) sys.exit(2) parse_mwm_args = (os.path.join(mwm_path, mwm_name), osm2ft_name, version, types) - if debug: - parse_mwm(*parse_mwm_args) - else: - pool.apply_async(parse_mwm, parse_mwm_args) + pool.apply_async(parse_mwm, parse_mwm_args) pool.close() pool.join() for queue in QUEUES.values(): diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh index 0d388b98de..1f5e867bf7 100755 --- a/tools/unix/generate_planet.sh +++ b/tools/unix/generate_planet.sh @@ -635,7 +635,7 @@ fi if [ "$MODE" == "resources" ]; then putmode "Step 8: Updating resource lists" # Update countries list - $PYTHON36 -m $POST_GENERATION_MODULE hierarchy_to_countries --target "$TARGET" --hierarchy "$DATA_PATH/hierarchy.txt" --version "$COUNTRIES_VERSION" \ + $PYTHON36 -m $POST_GENERATION_MODULE hierarchy_to_countries --target "$TARGET" --hierarchy "$DATA_PATH/hierarchy.txt" --mwm_version "$COUNTRIES_VERSION" \ --old "$DATA_PATH/old_vs_new.csv" --osm 
"$DATA_PATH/borders_vs_osm.csv" --output "$TARGET/countries.txt" >> "$PLANET_LOG" 2>&1 # A quick fix: chmodding to a+rw all generated files @@ -674,7 +674,7 @@ if [ -n "${LOCALADS-}" ]; then LOCALADS_LOG="$LOG_PATH/localads.log" LOCALADS_PATH="$INTDIR/localads" mkdir -p "$LOCALADS_PATH" - $PYTHON36 -m "$POST_GENERATION_MODULE" localads_mwm_to_csv "$TARGET" --osm2ft "$INTDIR" --version "$COUNTRIES_VERSION" --types "$DATA_PATH/types.txt" --output "$LOCALADS_PATH" >> "$LOCALADS_LOG" 2>&1 + $PYTHON36 -m "$POST_GENERATION_MODULE" localads_mwm_to_csv "$TARGET" --osm2ft "$INTDIR" --mwm_version "$COUNTRIES_VERSION" --types "$DATA_PATH/types.txt" --output "$LOCALADS_PATH" >> "$LOCALADS_LOG" 2>&1 LOCALADS_ARCHIVE="localads_$COUNTRIES_VERSION.tgz" cd "$LOCALADS_PATH" tar -czf "$LOCALADS_ARCHIVE" *.csv >> "$LOCALADS_LOG" 2>&1