From fc60bf56fac1604c9aa3356e0a9fc3798efa95b1 Mon Sep 17 00:00:00 2001
From: Alexey Zakharenkov <a-zakh@yandex.ru>
Date: Wed, 11 Jan 2023 16:13:29 +0300
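Subject: [PATCH] Add --cities-info-url CLI parameter to the main script and
 utilities

Replace subway_structure.download_cities() with get_cities_info() and
prepare_cities() in process_subways.py, which read the cities CSV from a
configurable URL (file:// URLs are accepted as well).

make_all_metro_poly.py, mapsme_json_to_cities.py and validation_to_html.py
accept the same --cities-info-url option, and scripts/process_subways.sh
forwards the CITIES_INFO_URL environment variable to them when it is set.
validation_to_html.py is also converted to argparse with a main() function.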

---
 make_all_metro_poly.py     |  35 ++++--
 mapsme_json_to_cities.py   |  21 +++-
 process_subways.py         |  82 ++++++++++++--
 scripts/process_subways.sh |   9 +-
 subway_structure.py        |  52 ---------
 v2h_templates.py           |   7 +-
 validation_to_html.py      | 217 ++++++++++++++++++++-----------------
 7 files changed, 242 insertions(+), 181 deletions(-)

diff --git a/make_all_metro_poly.py b/make_all_metro_poly.py
index 05a01b1..00281a7 100644
--- a/make_all_metro_poly.py
+++ b/make_all_metro_poly.py
@@ -1,20 +1,23 @@
+import argparse
+
 import shapely.geometry
 import shapely.ops
 
-from process_subways import download_cities
+from process_subways import DEFAULT_CITIES_INFO_URL, get_cities_info
 
 
-def make_disjoint_metro_polygons():
-    cities = download_cities()
+def make_disjoint_metro_polygons(cities_info_url: str) -> None:
+    cities_info = get_cities_info(cities_info_url)
 
     polygons = []
-    for c in cities:
+    for ci in cities_info:
+        bbox = tuple(map(float, ci["bbox"].split(",")))
         polygon = shapely.geometry.Polygon(
             [
-                (c.bbox[1], c.bbox[0]),
-                (c.bbox[1], c.bbox[2]),
-                (c.bbox[3], c.bbox[2]),
-                (c.bbox[3], c.bbox[0]),
+                (bbox[0], bbox[1]),
+                (bbox[0], bbox[3]),
+                (bbox[2], bbox[3]),
+                (bbox[2], bbox[1]),
             ]
         )
         polygons.append(polygon)
@@ -31,5 +34,19 @@ def make_disjoint_metro_polygons():
     print("END")
 
 
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--cities-info-url",
+        default=DEFAULT_CITIES_INFO_URL,
+        help=(
+            "URL of CSV file with reference information about rapid transit "
+            "networks. file:// protocol is also supported."
+        ),
+    )
+    options = parser.parse_args()
+    make_disjoint_metro_polygons(options.cities_info_url)
+
+
 if __name__ == "__main__":
-    make_disjoint_metro_polygons()
+    main()
diff --git a/mapsme_json_to_cities.py b/mapsme_json_to_cities.py
index 043d0b6..1c69a77 100644
--- a/mapsme_json_to_cities.py
+++ b/mapsme_json_to_cities.py
@@ -1,7 +1,7 @@
 import argparse
 import json
 
-from process_subways import download_cities
+from process_subways import DEFAULT_CITIES_INFO_URL, get_cities_info
 
 
 if __name__ == "__main__":
@@ -25,6 +25,15 @@ if __name__ == "__main__":
         ),
     )
 
+    arg_parser.add_argument(
+        "--cities-info-url",
+        default=DEFAULT_CITIES_INFO_URL,
+        help=(
+            "URL of CSV file with reference information about rapid transit "
+            "networks. file:// protocol is also supported."
+        ),
+    )
+
     arg_parser.add_argument(
         "--with-bad",
         action="store_true",
@@ -40,14 +49,14 @@ if __name__ == "__main__":
     good_cities = set(
         n.get("network", n.get("title")) for n in subway_json["networks"]
     )
-    cities = download_cities()
+    cities_info = get_cities_info(args.cities_info_url)
 
     lines = []
-    for c in cities:
-        if c.name in good_cities:
-            lines.append(f"{c.name}, {c.country}")
+    for ci in cities_info:
+        if ci["name"] in good_cities:
+            lines.append(f"{ci['name']}, {ci['country']}")
         elif with_bad:
-            lines.append(f"{c.name}, {c.country} (Bad)")
+            lines.append(f"{ci['name']}, {ci['country']} (Bad)")
 
     for line in sorted(lines):
         print(line)
diff --git a/process_subways.py b/process_subways.py
index 21a2fb4..89e1021 100755
--- a/process_subways.py
+++ b/process_subways.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 import argparse
+import csv
 import inspect
 import json
 import logging
@@ -9,6 +10,7 @@ import sys
 import time
 import urllib.parse
 import urllib.request
+from functools import partial
 from typing import Dict, List, Optional, Tuple
 
 import processors
@@ -20,8 +22,8 @@ from subway_io import (
     write_recovery_data,
 )
 from subway_structure import (
+    City,
     CriticalValidationError,
-    download_cities,
     find_transfers,
     get_unused_entrances_geojson,
     MODES_OVERGROUND,
@@ -29,6 +31,12 @@ from subway_structure import (
 )
 
 
+DEFAULT_SPREADSHEET_ID = "1SEW1-NiNOnA2qDwievcxYV1FOaQl1mb1fdeyqAxHu3k"
+DEFAULT_CITIES_INFO_URL = (
+    "https://docs.google.com/spreadsheets/d/"
+    f"{DEFAULT_SPREADSHEET_ID}/export?format=csv"
+)
+
 Point = Tuple[float, float]
 
 
@@ -49,13 +57,11 @@ def overpass_request(overground, overpass_api, bboxes):
             "rel(br)[type=public_transport][public_transport=stop_area_group];"
         )
     query += ");(._;>>;);out body center qt;"
-    logging.info("Query: %s", query)
+    logging.debug("Query: %s", query)
     url = "{}?data={}".format(overpass_api, urllib.parse.quote(query))
     response = urllib.request.urlopen(url, timeout=1000)
-    if response.getcode() != 200:
-        raise Exception(
-            "Failed to query Overpass API: HTTP {}".format(response.getcode())
-        )
+    if (r_code := response.getcode()) != 200:
+        raise Exception(f"Failed to query Overpass API: HTTP {r_code}")
     return json.load(response)["elements"]
 
 
@@ -258,8 +264,69 @@ def validate_cities(cities):
     return good_cities
 
 
+def get_cities_info(
+    cities_info_url: str = DEFAULT_CITIES_INFO_URL,
+) -> List[dict]:
+    response = urllib.request.urlopen(cities_info_url)
+    if (
+        not cities_info_url.startswith("file://")
+        and (r_code := response.getcode()) != 200
+    ):
+        raise Exception(
+            f"Failed to download cities spreadsheet: HTTP {r_code}"
+        )
+    data = response.read().decode("utf-8")
+    reader = csv.DictReader(
+        data.splitlines(),
+        fieldnames=(
+            "id",
+            "name",
+            "country",
+            "continent",
+            "num_stations",
+            "num_lines",
+            "num_light_lines",
+            "num_interchanges",
+            "bbox",
+            "networks",
+        ),
+    )
+
+    cities_info = list()
+    names = set()
+    next(reader)  # skipping the header
+    for city_info in reader:
+        if city_info["id"] and city_info["bbox"]:
+            cities_info.append(city_info)
+            name = city_info["name"].strip()
+            if name in names:
+                logging.warning(
+                    "Duplicate city name in city list: %s",
+                    city_info,
+                )
+            names.add(name)
+    return cities_info
+
+
+def prepare_cities(
+    cities_info_url: str = DEFAULT_CITIES_INFO_URL, overground: bool = False
+) -> List[City]:
+    if overground:
+        raise NotImplementedError("Overground transit not implemented yet")
+    cities_info = get_cities_info(cities_info_url)
+    return list(map(partial(City, overground=overground), cities_info))
+
+
 def main():
     parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--cities-info-url",
+        default=DEFAULT_CITIES_INFO_URL,
+        help=(
+            "URL of CSV file with reference information about rapid transit "
+            "networks. file:// protocol is also supported."
+        ),
+    )
     parser.add_argument(
         "-i",
         "--source",
@@ -340,8 +407,7 @@ def main():
         format="%(asctime)s %(levelname)-7s  %(message)s",
     )
 
-    # Downloading cities from Google Spreadsheets
-    cities = download_cities(options.overground)
+    cities = prepare_cities(options.cities_info_url, options.overground)
     if options.city:
         cities = [
             c
diff --git a/scripts/process_subways.sh b/scripts/process_subways.sh
index 2ee69ab..2068643 100755
--- a/scripts/process_subways.sh
+++ b/scripts/process_subways.sh
@@ -32,6 +32,7 @@ Environment variable reference:
   - PLANET_METRO: path to a local o5m file with extract of cities having metro
     It's used instead of \$PLANET if exists otherwise it's created first
   - PLANET_UPDATE_SERVER: server to get replication data from. Defaults to https://planet.openstreetmap.org/replication/
+  - CITIES_INFO_URL: http(s) or "file://" URL of a CSV file with reference information about rapid transit systems. The default value is hard-coded in the Python code.
   - CITY: name of a city/country to process
   - BBOX: bounding box of an extract; x1,y1,x2,y2. Has precedence over \$POLY
   - POLY: *.poly file with [multi]polygon comprising cities with metro
@@ -92,7 +93,8 @@ function check_poly() {
         if [ -n "$("$PYTHON" -c "import shapely" 2>&1)" ]; then
           "$PYTHON" -m pip install shapely
         fi
-        "$PYTHON" "$SUBWAYS_PATH"/make_all_metro_poly.py > "$POLY"
+        "$PYTHON" "$SUBWAYS_PATH"/make_all_metro_poly.py \
+            ${CITIES_INFO_URL:+--cities-info-url "$CITIES_INFO_URL"} > "$POLY"
       fi
     fi
     POLY_CHECKED=1
@@ -244,6 +246,7 @@ fi
 VALIDATION="$TMPDIR/validation.json"
 "$PYTHON" "$SUBWAYS_PATH/process_subways.py" ${QUIET:+-q} \
     -x "$FILTERED_DATA" -l "$VALIDATION" \
+    ${CITIES_INFO_URL:+--cities-info-url "$CITIES_INFO_URL"} \
     ${MAPSME:+--output-mapsme "$MAPSME"} \
     ${GTFS:+--output-gtfs "$GTFS"} \
     ${CITY:+-c "$CITY"} ${DUMP:+-d "$DUMP"} ${GEOJSON:+-j "$GEOJSON"} \
@@ -264,7 +267,9 @@ fi
 
 mkdir -p $HTML_DIR
 rm -f "$HTML_DIR"/*.html
-"$PYTHON" "$SUBWAYS_PATH/validation_to_html.py" "$VALIDATION" "$HTML_DIR"
+"$PYTHON" "$SUBWAYS_PATH/validation_to_html.py" \
+    ${CITIES_INFO_URL:+--cities-info-url "$CITIES_INFO_URL"} \
+    "$VALIDATION" "$HTML_DIR"
 
 # Uploading files to the server
 
diff --git a/subway_structure.py b/subway_structure.py
index 739d9c5..ef8b5eb 100644
--- a/subway_structure.py
+++ b/subway_structure.py
@@ -1,15 +1,10 @@
-import csv
-import logging
 import math
 import re
-import urllib.parse
-import urllib.request
 from collections import Counter, defaultdict
 
 from css_colours import normalize_colour
 
 
-SPREADSHEET_ID = "1SEW1-NiNOnA2qDwievcxYV1FOaQl1mb1fdeyqAxHu3k"
 MAX_DISTANCE_TO_ENTRANCES = 300  # in meters
 MAX_DISTANCE_STOP_TO_LINE = 50  # in meters
 ALLOWED_STATIONS_MISMATCH = 0.02  # part of total station count
@@ -2108,50 +2103,3 @@ def get_unused_entrances_geojson(elements):
                     }
                 )
     return {"type": "FeatureCollection", "features": features}
-
-
-def download_cities(overground=False):
-    assert not overground, "Overground transit not implemented yet"
-    url = (
-        "https://docs.google.com/spreadsheets/d/{}/export?format=csv{}".format(
-            SPREADSHEET_ID, "&gid=1881416409" if overground else ""
-        )
-    )
-    response = urllib.request.urlopen(url)
-    if response.getcode() != 200:
-        raise Exception(
-            "Failed to download cities spreadsheet: HTTP {}".format(
-                response.getcode()
-            )
-        )
-    data = response.read().decode("utf-8")
-    reader = csv.DictReader(
-        data.splitlines(),
-        fieldnames=(
-            "id",
-            "name",
-            "country",
-            "continent",
-            "num_stations",
-            "num_lines",
-            "num_light_lines",
-            "num_interchanges",
-            "bbox",
-            "networks",
-        ),
-    )
-
-    next(reader)  # skipping the header
-    names = set()
-    cities = []
-    for city_data in reader:
-        if city_data["id"] and city_data["bbox"]:
-            cities.append(City(city_data, overground))
-            name = city_data["name"].strip()
-            if name in names:
-                logging.warning(
-                    "Duplicate city name in the google spreadsheet: %s",
-                    city_data,
-                )
-            names.add(name)
-    return cities
diff --git a/v2h_templates.py b/v2h_templates.py
index 3162180..a1102b4 100644
--- a/v2h_templates.py
+++ b/v2h_templates.py
@@ -3,7 +3,7 @@ validator_osm_wiki_url = (
 )
 github_url = "https://github.com/alexey-zakharenkov/subways"
 produced_by = f"""Produced by
-<a href="{github_url}">Subway Preprocessor</a> on {{date}}."""
+<a href="{github_url}">Subway Preprocessor</a> on {{date}}"""
 metro_mapping_osm_article = "https://wiki.openstreetmap.org/wiki/Metro_Mapping"
 list_of_metro_systems_url = (
     "https://en.wikipedia.org/wiki/List_of_metro_systems#List"
@@ -191,8 +191,7 @@ INDEX_FOOTER = f"""
 </table>
 </main>
 <footer>{produced_by}
-See <a href="{{google}}">this spreadsheet</a> for the reference
-metro statistics and
+from <a href="{{cities_info_url}}">these reference metro statistics</a>. See
 <a href="{list_of_metro_systems_url}">
 this wiki page</a> for a list of all metro systems.</footer>
 </body>
@@ -292,7 +291,7 @@ COUNTRY_CITY = """
 COUNTRY_FOOTER = f"""
 </table>
 </main>
-<footer>{produced_by}</footer>
+<footer>{produced_by}.</footer>
 </body>
 </html>
 """
diff --git a/validation_to_html.py b/validation_to_html.py
index 9eca75c..9858d2b 100755
--- a/validation_to_html.py
+++ b/validation_to_html.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
+import argparse
 import datetime
 import json
 import os
 import re
-import sys
 
-from subway_structure import SPREADSHEET_ID
+from process_subways import DEFAULT_SPREADSHEET_ID
 from v2h_templates import (
     COUNTRY_CITY,
     COUNTRY_FOOTER,
@@ -105,13 +105,6 @@ def tmpl(s, data=None, **kwargs):
                 s,
                 flags=re.DOTALL,
             )
-    s = s.replace("{date}", date)
-    google_url = (
-        "https://docs.google.com/spreadsheets/d/{}/edit?usp=sharing".format(
-            SPREADSHEET_ID
-        )
-    )
-    s = s.replace("{google}", google_url)
     return s
 
 
@@ -143,104 +136,128 @@ def esc(s):
     return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
 
 
-if len(sys.argv) < 2:
-    print("Reads a log from subway validator and prepares HTML files.")
-    print(
-        "Usage: {} <validation.log> [<target_directory>]".format(sys.argv[0])
+def main():
+    parser = argparse.ArgumentParser(
+        description=(
+            "Reads a log from subway validator and prepares HTML files."
+        )
     )
-    sys.exit(1)
+    parser.add_argument("validation_log")
+    parser.add_argument("target_directory", nargs="?", default=".")
+    parser.add_argument(
+        "--cities-info-url",
+        default=(
+            "https://docs.google.com/spreadsheets/d/"
+            f"{DEFAULT_SPREADSHEET_ID}/edit?usp=sharing"
+        ),
+    )
+    options = parser.parse_args()
+    target_dir = options.target_directory
+    cities_info_url = options.cities_info_url
 
-with open(sys.argv[1], "r", encoding="utf-8") as f:
-    data = {c["name"]: CityData(c) for c in json.load(f)}
+    with open(options.validation_log, "r", encoding="utf-8") as f:
+        data = {c["name"]: CityData(c) for c in json.load(f)}
 
-countries = {}
-continents = {}
-c_by_c = {}  # continent → set of countries
-for c in data.values():
-    countries[c.country] = c + countries.get(c.country, CityData())
-    continents[c.continent] = c + continents.get(c.continent, CityData())
-    if c.continent not in c_by_c:
-        c_by_c[c.continent] = set()
-    c_by_c[c.continent].add(c.country)
-world = sum(continents.values(), CityData())
+    countries = {}
+    continents = {}
+    c_by_c = {}  # continent → set of countries
+    for c in data.values():
+        countries[c.country] = c + countries.get(c.country, CityData())
+        continents[c.continent] = c + continents.get(c.continent, CityData())
+        if c.continent not in c_by_c:
+            c_by_c[c.continent] = set()
+        c_by_c[c.continent].add(c.country)
+    world = sum(continents.values(), CityData())
 
-overground = "traml_expected" in next(iter(data.values())).data
-date = datetime.datetime.utcnow().strftime("%d.%m.%Y %H:%M UTC")
-path = "." if len(sys.argv) < 3 else sys.argv[2]
-index = open(os.path.join(path, "index.html"), "w", encoding="utf-8")
-index.write(tmpl(INDEX_HEADER, world))
+    overground = "traml_expected" in next(iter(data.values())).data
+    date = datetime.datetime.utcnow().strftime("%d.%m.%Y %H:%M UTC")
+    index = open(os.path.join(target_dir, "index.html"), "w", encoding="utf-8")
+    index.write(tmpl(INDEX_HEADER, world))
 
-for continent in sorted(continents.keys()):
-    content = ""
-    for country in sorted(c_by_c[continent]):
-        country_file_name = country.lower().replace(" ", "-") + ".html"
-        content += tmpl(
-            INDEX_COUNTRY,
-            countries[country],
-            file=country_file_name,
-            country=country,
-            continent=continent,
-        )
-        country_file = open(
-            os.path.join(path, country_file_name), "w", encoding="utf-8"
-        )
-        country_file.write(
-            tmpl(
-                COUNTRY_HEADER,
+    for continent in sorted(continents.keys()):
+        content = ""
+        for country in sorted(c_by_c[continent]):
+            country_file_name = country.lower().replace(" ", "-") + ".html"
+            content += tmpl(
+                INDEX_COUNTRY,
+                countries[country],
+                file=country_file_name,
                 country=country,
                 continent=continent,
-                overground=overground,
-                subways=not overground,
+            )
+            country_file = open(
+                os.path.join(target_dir, country_file_name),
+                "w",
+                encoding="utf-8",
+            )
+            country_file.write(
+                tmpl(
+                    COUNTRY_HEADER,
+                    country=country,
+                    continent=continent,
+                    overground=overground,
+                    subways=not overground,
+                )
+            )
+            for name, city in sorted(data.items()):
+                if city.country == country:
+                    file_base = os.path.join(target_dir, city.slug)
+                    yaml_file = (
+                        city.slug + ".yaml"
+                        if os.path.exists(file_base + ".yaml")
+                        else None
+                    )
+                    json_file = (
+                        city.slug + ".geojson"
+                        if os.path.exists(file_base + ".geojson")
+                        else None
+                    )
+                    errors = "<br>".join(
+                        [osm_links(esc(e)) for e in city.errors]
+                    )
+                    warnings = "<br>".join(
+                        [osm_links(esc(w)) for w in city.warnings]
+                    )
+                    notices = "<br>".join(
+                        [osm_links(esc(n)) for n in city.notices]
+                    )
+                    country_file.write(
+                        tmpl(
+                            COUNTRY_CITY,
+                            city,
+                            city=name,
+                            country=country,
+                            continent=continent,
+                            yaml=yaml_file,
+                            json=json_file,
+                            subways=not overground,
+                            errors=errors,
+                            warnings=warnings,
+                            notices=notices,
+                            overground=overground,
+                        )
+                    )
+            country_file.write(
+                tmpl(
+                    COUNTRY_FOOTER,
+                    country=country,
+                    continent=continent,
+                    date=date,
+                )
+            )
+            country_file.close()
+        index.write(
+            tmpl(
+                INDEX_CONTINENT,
+                continents[continent],
+                content=content,
+                continent=continent,
             )
         )
-        for name, city in sorted(data.items()):
-            if city.country == country:
-                file_base = os.path.join(path, city.slug)
-                yaml_file = (
-                    city.slug + ".yaml"
-                    if os.path.exists(file_base + ".yaml")
-                    else None
-                )
-                json_file = (
-                    city.slug + ".geojson"
-                    if os.path.exists(file_base + ".geojson")
-                    else None
-                )
-                errors = "<br>".join([osm_links(esc(e)) for e in city.errors])
-                warnings = "<br>".join(
-                    [osm_links(esc(w)) for w in city.warnings]
-                )
-                notices = "<br>".join(
-                    [osm_links(esc(n)) for n in city.notices]
-                )
-                country_file.write(
-                    tmpl(
-                        COUNTRY_CITY,
-                        city,
-                        city=name,
-                        country=country,
-                        continent=continent,
-                        yaml=yaml_file,
-                        json=json_file,
-                        subways=not overground,
-                        errors=errors,
-                        warnings=warnings,
-                        notices=notices,
-                        overground=overground,
-                    )
-                )
-        country_file.write(
-            tmpl(COUNTRY_FOOTER, country=country, continent=continent)
-        )
-        country_file.close()
-    index.write(
-        tmpl(
-            INDEX_CONTINENT,
-            continents[continent],
-            content=content,
-            continent=continent,
-        )
-    )
 
-index.write(tmpl(INDEX_FOOTER))
-index.close()
+    index.write(tmpl(INDEX_FOOTER, date=date, cities_info_url=cities_info_url))
+    index.close()
+
+
+if __name__ == "__main__":
+    main()