From 483429129c8d3db541f42a5746e579ff735daea9 Mon Sep 17 00:00:00 2001 From: Alexey Zakharenkov Date: Mon, 27 Mar 2023 11:32:19 +0300 Subject: [PATCH] Check CSV numeric values on reading network information --- .github/workflows/python-app.yml | 4 +- README.md | 2 +- process_subways.py | 48 +++++++++------- subway_structure.py | 70 ++++++++++++++++------- tests/assets/networks_with_bad_values.csv | 12 ++++ tests/test_prepare_cities.py | 36 ++++++++++++ 6 files changed, 126 insertions(+), 46 deletions(-) create mode 100644 tests/assets/networks_with_bad_values.csv create mode 100644 tests/test_prepare_cities.py diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index fbf35d3..2c5434e 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -19,10 +19,10 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.11 uses: actions/setup-python@v3 with: - python-version: "3.8" + python-version: "3.11" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/README.md b/README.md index 92d7d9d..e259087 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ a city's bbox has been extended. A single city or a country with few metro networks can be validated much faster if you allow the `process_subway.py` to fetch data from Overpass API. Here are the steps: -1. Python3 interpreter required (3.8+) +1. Python3 interpreter required (3.11+) 2. Clone the repo ``` git clone https://github.com/alexey-zakharenkov/subways.git subways_validator diff --git a/process_subways.py b/process_subways.py index 89e1021..c81a21c 100755 --- a/process_subways.py +++ b/process_subways.py @@ -11,7 +11,6 @@ import time import urllib.parse import urllib.request from functools import partial -from typing import Dict, List, Optional, Tuple import processors from subway_io import ( @@ -30,17 +29,18 @@ from subway_structure import ( MODES_RAPID, ) - DEFAULT_SPREADSHEET_ID = "1SEW1-NiNOnA2qDwievcxYV1FOaQl1mb1fdeyqAxHu3k" DEFAULT_CITIES_INFO_URL = ( "https://docs.google.com/spreadsheets/d/" f"{DEFAULT_SPREADSHEET_ID}/export?format=csv" ) -Point = Tuple[float, float] +Point = tuple[float, float] -def overpass_request(overground, overpass_api, bboxes): +def overpass_request( + overground: bool, overpass_api: str, bboxes: list[list[float]] +) -> list[dict]: query = "[out:json][timeout:1000];(" modes = MODES_OVERGROUND if overground else MODES_RAPID for bbox in bboxes: @@ -65,7 +65,9 @@ def overpass_request(overground, overpass_api, bboxes): return json.load(response)["elements"] -def multi_overpass(overground, overpass_api, bboxes): +def multi_overpass( + overground: bool, overpass_api: str, bboxes: list[list[float]] +) -> list[dict]: SLICE_SIZE = 10 INTERREQUEST_WAIT = 5 # in seconds result = [] @@ -77,13 +79,13 @@ def multi_overpass(overground, overpass_api, bboxes): return result -def slugify(name): +def slugify(name: str) -> str: return re.sub(r"[^a-z0-9_-]+", "", name.lower().replace(" ", "_")) def get_way_center( - element: dict, node_centers: Dict[int, Point] -) -> Optional[Point]: + element: dict, node_centers: dict[int, Point] +) -> Point | None: """ :param element: dict describing OSM element :param node_centers: osm_id => (lat, lon) @@ -123,11 +125,11 @@ def get_way_center( def get_relation_center( element: dict, - node_centers: Dict[int, Point], - way_centers: Dict[int, Point], - relation_centers: Dict[int, Point], + node_centers: dict[int, Point], + way_centers: dict[int, Point], + relation_centers: dict[int, Point], ignore_unlocalized_child_relations: bool = False, -) -> Optional[Point]: +) -> Point | None: """ :param element: dict describing OSM element :param node_centers: osm_id => (lat, lon) @@ -176,14 +178,14 @@ def get_relation_center( return element["center"]["lat"], element["center"]["lon"] -def calculate_centers(elements): +def calculate_centers(elements: list[dict]) -> None: """Adds 'center' key to each way/relation in elements, except for empty ways or relations. Relies on nodes-ways-relations order in the elements list. """ - nodes: Dict[int, Point] = {} # id => (lat, lon) - ways: Dict[int, Point] = {} # id => (lat, lon) - relations: Dict[int, Point] = {} # id => (lat, lon) + nodes: dict[int, Point] = {} # id => (lat, lon) + ways: dict[int, Point] = {} # id => (lat, lon) + relations: dict[int, Point] = {} # id => (lat, lon) unlocalized_relations = [] # 'unlocalized' means the center of the # relation has not been calculated yet @@ -202,7 +204,7 @@ def calculate_centers(elements): def iterate_relation_centers_calculation( ignore_unlocalized_child_relations: bool, - ) -> List[int]: + ) -> list[dict]: unlocalized_relations_upd = [] for rel in unlocalized_relations: if center := get_relation_center( @@ -229,14 +231,16 @@ def calculate_centers(elements): unlocalized_relations = unlocalized_relations_upd -def add_osm_elements_to_cities(osm_elements, cities): +def add_osm_elements_to_cities( + osm_elements: list[dict], cities: list[City] +) -> None: for el in osm_elements: for c in cities: if c.contains(el): c.add(el) -def validate_cities(cities): +def validate_cities(cities: list[City]) -> list[City]: """Validate cities. Return list of good cities.""" good_cities = [] for c in cities: @@ -266,7 +270,7 @@ def validate_cities(cities): def get_cities_info( cities_info_url: str = DEFAULT_CITIES_INFO_URL, -) -> List[dict]: +) -> list[dict]: response = urllib.request.urlopen(cities_info_url) if ( not cities_info_url.startswith("file://") @@ -310,14 +314,14 @@ def get_cities_info( def prepare_cities( cities_info_url: str = DEFAULT_CITIES_INFO_URL, overground: bool = False -) -> List[City]: +) -> list[City]: if overground: raise NotImplementedError("Overground transit not implemented yet") cities_info = get_cities_info(cities_info_url) return list(map(partial(City, overground=overground), cities_info)) -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "--cities-info-url", diff --git a/subway_structure.py b/subway_structure.py index 63bf63f..96cf6e8 100644 --- a/subway_structure.py +++ b/subway_structure.py @@ -1517,25 +1517,23 @@ class City: self.errors = [] self.warnings = [] self.notices = [] - self.id = int(city_data["id"]) + self.try_fill_int_attribute(city_data, "id") self.name = city_data["name"] self.country = city_data["country"] self.continent = city_data["continent"] self.overground = overground if not overground: - self.num_stations = int(city_data["num_stations"]) - self.num_lines = int(city_data["num_lines"] or "0") - self.num_light_lines = int(city_data["num_light_lines"] or "0") - self.num_interchanges = int(city_data["num_interchanges"] or "0") + self.try_fill_int_attribute(city_data, "num_stations") + self.try_fill_int_attribute(city_data, "num_lines", "0") + self.try_fill_int_attribute(city_data, "num_light_lines", "0") + self.try_fill_int_attribute(city_data, "num_interchanges", "0") else: - self.num_tram_lines = int(city_data["num_tram_lines"] or "0") - self.num_trolleybus_lines = int( - city_data["num_trolleybus_lines"] or "0" - ) - self.num_bus_lines = int(city_data["num_bus_lines"] or "0") - self.num_other_lines = int(city_data["num_other_lines"] or "0") + self.try_fill_int_attribute(city_data, "num_tram_lines", "0") + self.try_fill_int_attribute(city_data, "num_trolleybus_lines", "0") + self.try_fill_int_attribute(city_data, "num_bus_lines", "0") + self.try_fill_int_attribute(city_data, "num_other_lines", "0") - # Aquiring list of networks and modes + # Acquiring list of networks and modes networks = ( None if not city_data["networks"] @@ -1574,6 +1572,33 @@ class City: self.stops_and_platforms = set() # Set of stops and platforms el_id self.recovery_data = None + def try_fill_int_attribute( + self, city_data: dict, attr: str, default: str | None = None + ) -> None: + """Try to convert string value to int. Conversion is considered + to fail if one of the following is true: + * attr is not empty and data type casting fails; + * attr is empty and no default value is given. + In such cases the city is marked as bad by adding an error + to the city validation log. + """ + attr_value = city_data[attr] + if not attr_value and default is not None: + attr_value = default + + try: + attr_int = int(attr_value) + except ValueError: + print_value = ( + f"{city_data[attr]}" if city_data[attr] else "" + ) + self.error( + f"Configuration error: wrong value for {attr}: {print_value}" + ) + setattr(self, attr, 0) + else: + setattr(self, attr, attr_int) + @staticmethod def log_message(message, el): if el: @@ -1814,12 +1839,12 @@ class City: if not self.overground: result.update( { - "subwayl_expected": self.num_lines, - "lightrl_expected": self.num_light_lines, + "subwayl_expected": getattr(self, "num_lines", 0), + "lightrl_expected": getattr(self, "num_light_lines", 0), "subwayl_found": getattr(self, "found_lines", 0), "lightrl_found": getattr(self, "found_light_lines", 0), - "stations_expected": self.num_stations, - "transfers_expected": self.num_interchanges, + "stations_expected": getattr(self, "num_stations", 0), + "transfers_expected": getattr(self, "num_interchanges", 0), } ) else: @@ -1827,10 +1852,12 @@ class City: { "stations_expected": 0, "transfers_expected": 0, - "busl_expected": self.num_bus_lines, - "trolleybusl_expected": self.num_trolleybus_lines, - "traml_expected": self.num_tram_lines, - "otherl_expected": self.num_other_lines, + "busl_expected": getattr(self, "num_bus_lines", 0), + "trolleybusl_expected": getattr( + self, "num_trolleybus_lines", 0 + ), + "traml_expected": getattr(self, "num_tram_lines", 0), + "otherl_expected": getattr(self, "num_other_lines", 0), "busl_found": getattr(self, "found_bus_lines", 0), "trolleybusl_found": getattr( self, "found_trolleybus_lines", 0 @@ -2006,7 +2033,8 @@ class City: ) log_function = ( self.error - if not ( + if self.num_stations > 0 + and not ( 0 <= (self.num_stations - self.found_stations) / self.num_stations diff --git a/tests/assets/networks_with_bad_values.csv b/tests/assets/networks_with_bad_values.csv new file mode 100644 index 0000000..0d0b528 --- /dev/null +++ b/tests/assets/networks_with_bad_values.csv @@ -0,0 +1,12 @@ +#,City,Country,Region,Stations,Subway Lines,Light Rail +Monorail,Interchanges,"BBox (lon, lat)",Networks (opt.),Comment,Source, +291,Moscow,Russia,Europe,335,14,3,66,,"subway,train:Московский метрополитен;МЦК;МЦД","No bbox - skip the city", +,Moscow - Aeroexpress,Russia,Europe,22,0,3,0,"37.170,55.385,38.028,56.022",train:Аэроэкспресс,"No id - skip the city",https://aeroexpress.ru +292,Nizhny Novgorod,Russia,Europe,16,2,0,,"43.759918,56.1662,44.13208,56.410862",,"No configuration errors",,, +NBS,Novosibirsk,Russia,Europe,13,2,0,1,"82.774773,54.926747,83.059044,55.127864",,"Non-numeric ID",,, +294,Saint Petersburg,Russia,Europe,72,,,,"30.0648,59.7509,30.5976,60.1292",,"Empty line counts - no problem at CSV parsing stage",, +,,,,,,,,,,,, +295,Samara,Russia,Europe,10x,1,0,0,"50.011826,53.094024,50.411453,53.384147",,"Non-numeric station count",, +296,Volgograd,Russia,Europe,40,zero,2zero,0,"44.366704,48.636024,44.62302,48.81765",,"Non-numbers in subway and light_rail line counts",, +297,Yekaterinburg,Russia,Europe,,1,0,0,"60.460854,56.730505,60.727272,56.920997",,"Empty station count",, +,,,,,,,,,,,, + diff --git a/tests/test_prepare_cities.py b/tests/test_prepare_cities.py new file mode 100644 index 0000000..e74505f --- /dev/null +++ b/tests/test_prepare_cities.py @@ -0,0 +1,36 @@ +import inspect +from pathlib import Path +from unittest import TestCase + +from process_subways import prepare_cities + + +class TestPrepareCities(TestCase): + def test_prepare_cities(self) -> None: + csv_path = ( + Path(inspect.getfile(self.__class__)).parent + / "assets" + / "networks_with_bad_values.csv" + ) + + cities = prepare_cities(cities_info_url=f"file://{csv_path}") + + city_errors = {city.name: sorted(city.errors) for city in cities} + + expected_errors = { + "Nizhny Novgorod": [], + "Novosibirsk": ["Configuration error: wrong value for id: NBS"], + "Saint Petersburg": [], + "Samara": [ + "Configuration error: wrong value for num_stations: 10x" + ], + "Volgograd": [ + "Configuration error: wrong value for num_light_lines: 2zero", + "Configuration error: wrong value for num_lines: zero", + ], + "Yekaterinburg": [ + "Configuration error: wrong value for num_stations: " + ], + } + + self.assertDictEqual(city_errors, expected_errors)