From 949f4d6dfc15701a506d75ddaef179b4724ad9b3 Mon Sep 17 00:00:00 2001
From: Maksim Andrianov <maksimandrianov1@gmail.com>
Date: Tue, 7 May 2019 13:52:09 +0300
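Subject: [PATCH] Review fixes

Rename --version to --mwm_version and make it required, default
--threads to 1, drop the --debug flag, rename get_hash/get_size to
get_mwm_hash/get_mwm_size, factor out fill_last(), and assert on
malformed input lines instead of silently skipping them.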
---
 tools/python/post_generation/__main__.py      |  19 ++--
 .../post_generation/hierarchy_to_countries.py | 101 +++++++++---------
 .../post_generation/localads_mwm_to_csv.py    |   7 +-
 tools/unix/generate_planet.sh                 |   4 +-
 4 files changed, 64 insertions(+), 67 deletions(-)
diff --git a/tools/python/post_generation/__main__.py b/tools/python/post_generation/__main__.py
index 6f7d4cf111..c004c7deb9 100644
--- a/tools/python/post_generation/__main__.py
+++ b/tools/python/post_generation/__main__.py
@@ -9,9 +9,9 @@ from .localads_mwm_to_csv import create_csv
 class PostGeneration:
     def __init__(self):
         parser = argparse.ArgumentParser(
-            description="Post generation instruments",
+            description="Post-generation instruments",
             usage="""post_generation <command> [<args>]
-The most commonly used post_generation commands are:
+The post_generation commands are:
     localads_mwm_to_csv    Prepares CSV files for uploading to localads database from mwm files.
     hierarchy_to_countries Produces countries.txt from hierarchy.txt.
     """)
@@ -42,17 +42,16 @@ The most commonly used post_generation commands are:
                             help="path to omim/data/types.txt")
         parser.add_argument("--threads",
                             type=int,
+                            default=1,
                             help="number of threads to process files")
-        parser.add_argument("--version", type=int, help="override mwm version")
-        parser.add_argument("--debug",
-                            action="store_true",
-                            help="debug parse_mwm call")
+        parser.add_argument("--mwm_version", type=int, required=True,
+                            help="Mwm version")
         args = parser.parse_args(sys.argv[2:])
         if not args.osm2ft:
             args.osm2ft = args.mwm
 
         create_csv(args.output, args.mwm, args.osm2ft, args.types,
-                   args.version, args.threads, args.debug)
+                   args.mwm_version, args.threads)
 
     @staticmethod
     def hierarchy_to_countries():
@@ -67,15 +66,15 @@ The most commonly used post_generation commands are:
                             help="old_vs_new.csv file")
         parser.add_argument("--osm", required=True,
                             help="borders_vs_osm.csv file")
-        parser.add_argument("--version", type=int, default=151231,
-                            help="Version")
+        parser.add_argument("--mwm_version", type=int, required=True,
+                            help="Mwm version")
         parser.add_argument("-o", "--output", required=True,
                             help="Output countries.txt file (default is stdout)")
         args = parser.parse_args(sys.argv[2:])
         countries_json = hierarchy_to_countries_(args.old, args.osm,
                                                  args.hierarchy,
                                                  args.target,
-                                                 args.version)
+                                                 args.mwm_version)
         if args.output:
             with open(args.output, "w") as f:
                 f.write(countries_json)
diff --git a/tools/python/post_generation/hierarchy_to_countries.py b/tools/python/post_generation/hierarchy_to_countries.py
index 66dddbb92b..c5683b6252 100755
--- a/tools/python/post_generation/hierarchy_to_countries.py
+++ b/tools/python/post_generation/hierarchy_to_countries.py
@@ -4,7 +4,7 @@
 #
 # Sample lines:
 # Iran;Q794;ir;fa
-# Iran_South;Q794-South
+#  Iran_South;Q794-South
 #
 # Number of leading spaces mean hierarchy depth. In above case, Iran_South is inside Iran.
 # Then follows a semicolon-separated list:
@@ -12,6 +12,7 @@
 # 2. Region name template using wikidata Qxxx codes and predefined strings
 # 3. Country ISO code (used for flags in the legacy format)
 # 4. Comma-separated list of language ISO codes for the region
+
 import base64
 import hashlib
 import json
@@ -22,7 +23,7 @@ import re
 class CountryDict(dict):
     def __init__(self, *args, **kwargs):
         dict.__init__(self, *args, **kwargs)
-        self.order = ["id", "n", "f", "v", "c", "s", "sha1_base64", "rs", "g"]
+        self.order = ["id", "n", "v", "c", "s", "sha1_base64", "rs", "g"]
 
     def __iter__(self):
         for key in self.order:
@@ -37,7 +38,7 @@ class CountryDict(dict):
             yield (key, self.__getitem__(key))
 
 
-def get_hash(path, name):
+def get_mwm_hash(path, name):
     filename = os.path.join(path, f"{name}.mwm")
     h = hashlib.sha1()
     with open(filename, "rb") as f:
@@ -46,7 +47,7 @@ def get_hash(path, name):
     return str(base64.b64encode(h.digest()), "utf-8")
 
 
-def get_size(path, name):
+def get_mwm_size(path, name):
     filename = os.path.join(path, f"{name}.mwm")
     return os.path.getsize(filename)
 
@@ -82,14 +83,14 @@ def parse_old_vs_new(old_vs_new_csv_path):
     if not old_vs_new_csv_path:
         return oldvs
 
-    with open(old_vs_new_csv_path, "r") as f:
+    with open(old_vs_new_csv_path) as f:
         for line in f:
             m = re.match(r"(.+?)\t(.+)", line.strip())
-            if m:
-                if m.group(2) in oldvs:
-                    oldvs[m.group(2)].append(m.group(1))
-                else:
-                    oldvs[m.group(2)] = [m.group(1)]
+            assert m
+            if m.group(2) in oldvs:
+                oldvs[m.group(2)].append(m.group(1))
+            else:
+                oldvs[m.group(2)] = [m.group(1)]
     return oldvs
 
 
@@ -98,60 +99,60 @@ def parse_borders_vs_osm(borders_vs_osm_csv_path):
     if not borders_vs_osm_csv_path:
         return vsosm
 
-    with open(borders_vs_osm_csv_path, "r") as f:
+    with open(borders_vs_osm_csv_path) as f:
         for line in f:
-            m = re.match(r"^(.+?)\t(\d)\t(.+?)$", line.strip())
-            if m:
-                if m.group(1) in vsosm:
-                    vsosm[m.group(1)].append(m.group(3))
-                else:
-                    vsosm[m.group(1)] = [m.group(3)]
+            m = re.match(r"(.+)\t(\d)\t(.+)", line.strip())
+            assert m
+            if m.group(1) in vsosm:
+                vsosm[m.group(1)].append(m.group(3))
+            else:
+                vsosm[m.group(1)] = [m.group(3)]
     return vsosm
 
 
 def hierarchy_to_countries(old_vs_new_csv_path, borders_vs_osm_csv_path,
                            hierarchy_path, target_path, version):
+
+    def fill_last(last, stack):
+        name = last["id"]
+        last["s"] = get_mwm_size(target_path, name)
+        last["sha1_base64"] = get_mwm_hash(target_path, name)
+        if last["s"] >= 0:
+            stack[-1]["g"].append(last)
+
     oldvs = parse_old_vs_new(old_vs_new_csv_path)
     vsosm = parse_borders_vs_osm(borders_vs_osm_csv_path)
     stack = [CountryDict(v=version, nameattr="Countries", g=[])]
     last = None
-    with open(hierarchy_path, "r") as f:
+    with open(hierarchy_path) as f:
         for line in f:
-            m = re.match("( *)(.+?)\n", line)
-            if m:
-                depth = len(m.group(1))
-                if last is not None:
-                    lastd = last["d"]
-                    del last["d"]
-                    if lastd < depth:
-                        # last is a group
-                        last["g"] = []
-                        stack.append(last)
-                    else:
-                        name = last["f" if "f" in last else "id"]
-                        last["s"] = get_size(target_path, name)
-                        last["sha1_base64"] = get_hash(target_path, name)
-                        if last["s"] >= 0:
-                            stack[-1]["g"].append(last)
-                while depth < len(stack) - 1:
-                    # group ended, add it to higher group
-                    g = stack.pop()
-                    if len(g["g"]) > 0:
-                        stack[-1]["g"].append(g)
-                items = m.group(2).split(";")
-                last = CountryDict({"id": items[0], "d": depth})
-                if items[0] in oldvs:
-                    last["old"] = oldvs[items[0]]
-                if items[0] in vsosm:
-                    last["affiliations"] = vsosm[items[0]]
+            m = re.match("( *)(.+)", line)
+            assert m
+            depth = len(m.group(1))
+            if last is not None:
+                lastd = last["d"]
+                del last["d"]
+                if lastd < depth:
+                    # last is a group
+                    last["g"] = []
+                    stack.append(last)
+                else:
+                    fill_last(last, stack)
+            while depth < len(stack) - 1:
+                # group ended, add it to higher group
+                g = stack.pop()
+                if len(g["g"]) > 0:
+                    stack[-1]["g"].append(g)
+            items = m.group(2).split(";")
+            last = CountryDict({"id": items[0], "d": depth})
+            if items[0] in oldvs:
+                last["old"] = oldvs[items[0]]
+            if items[0] in vsosm:
+                last["affiliations"] = vsosm[items[0]]
 
     # the last line is always a file
     del last["d"]
-    name = last["f" if "f" in last else "id"]
-    last["s"] = get_size(target_path, name)
-    last["sha1_base64"] = get_hash(target_path, name)
-    if last["s"] >= 0:
-        stack[-1]["g"].append(last)
+    fill_last(last, stack)
     while len(stack) > 1:
         g = stack.pop()
         if len(g["g"]) > 0:
diff --git a/tools/python/post_generation/localads_mwm_to_csv.py b/tools/python/post_generation/localads_mwm_to_csv.py
index caa0b77d7a..af779a09c5 100755
--- a/tools/python/post_generation/localads_mwm_to_csv.py
+++ b/tools/python/post_generation/localads_mwm_to_csv.py
@@ -80,7 +80,7 @@ def write_csv(output_dir, qtype):
             mapping = QUEUES[qtype].get()
 
 
-def create_csv(output, mwm_path, osm2ft_path, types, version, threads, debug=False):
+def create_csv(output, mwm_path, osm2ft_path, types, version, threads):
     if not os.path.isdir(output):
         os.mkdir(output)
 
@@ -97,10 +97,7 @@ def create_csv(output, mwm_path, osm2ft_path, types, version, threads, debug=Fal
             logging.error("Cannot find %s", osm2ft_name)
             sys.exit(2)
         parse_mwm_args = (os.path.join(mwm_path, mwm_name), osm2ft_name, version, types)
-        if debug:
-            parse_mwm(*parse_mwm_args)
-        else:
-            pool.apply_async(parse_mwm, parse_mwm_args)
+        pool.apply_async(parse_mwm, parse_mwm_args)
     pool.close()
     pool.join()
     for queue in QUEUES.values():
diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh
index 0d388b98de..1f5e867bf7 100755
--- a/tools/unix/generate_planet.sh
+++ b/tools/unix/generate_planet.sh
@@ -635,7 +635,7 @@ fi
 if [ "$MODE" == "resources" ]; then
   putmode "Step 8: Updating resource lists"
   # Update countries list
-  $PYTHON36 -m $POST_GENERATION_MODULE hierarchy_to_countries --target "$TARGET" --hierarchy "$DATA_PATH/hierarchy.txt" --version "$COUNTRIES_VERSION" \
+  $PYTHON36 -m $POST_GENERATION_MODULE hierarchy_to_countries --target "$TARGET" --hierarchy "$DATA_PATH/hierarchy.txt" --mwm_version "$COUNTRIES_VERSION" \
     --old "$DATA_PATH/old_vs_new.csv" --osm "$DATA_PATH/borders_vs_osm.csv" --output "$TARGET/countries.txt" >> "$PLANET_LOG" 2>&1
 
   # A quick fix: chmodding to a+rw all generated files
@@ -674,7 +674,7 @@ if [ -n "${LOCALADS-}" ]; then
     LOCALADS_LOG="$LOG_PATH/localads.log"
     LOCALADS_PATH="$INTDIR/localads"
     mkdir -p "$LOCALADS_PATH"
-    $PYTHON36 -m "$POST_GENERATION_MODULE" localads_mwm_to_csv "$TARGET" --osm2ft "$INTDIR" --version "$COUNTRIES_VERSION" --types "$DATA_PATH/types.txt" --output "$LOCALADS_PATH" >> "$LOCALADS_LOG" 2>&1
+    $PYTHON36 -m "$POST_GENERATION_MODULE" localads_mwm_to_csv "$TARGET" --osm2ft "$INTDIR" --mwm_version "$COUNTRIES_VERSION" --types "$DATA_PATH/types.txt" --output "$LOCALADS_PATH" >> "$LOCALADS_LOG" 2>&1
     LOCALADS_ARCHIVE="localads_$COUNTRIES_VERSION.tgz"
     cd "$LOCALADS_PATH"
     tar -czf "$LOCALADS_ARCHIVE" *.csv >> "$LOCALADS_LOG" 2>&1