Defer CSV-compliant data transformation to the write-to-CSV phase

Alexey Zakharenkov 2022-10-18 17:42:12 +03:00 committed by Alexey Zakharenkov
parent 314941ea28
commit a6b76068f8
4 changed files with 102 additions and 80 deletions
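
In short: GTFS records are now collected as plain dicts, and the dict-to-CSV-row conversion happens only once, at write time in `make_gtfs()`. Below is a minimal sketch of the resulting pattern, not the actual module; the `agency` column list here is illustrative.

```python
import csv
import io
from functools import partial

# Illustrative column registry; the real GTFS_COLUMNS is much larger.
GTFS_COLUMNS = {"agency": ["agency_id", "agency_name", "agency_url"]}

def dict_to_row(dict_data: dict, record_type: str) -> list:
    # None values and absent keys become "", zeros are preserved.
    return [
        "" if (v := dict_data.get(column)) is None else v
        for column in GTFS_COLUMNS[record_type]
    ]

# Collection phase: append raw dicts, no per-record conversion.
gtfs_data = {"agency": [{"agency_id": 1, "agency_name": "Metro"}]}

# Write phase: convert every dict to a row in one place.
with io.StringIO(newline="") as string_io:
    writer = csv.writer(string_io, delimiter=",")
    writer.writerow(GTFS_COLUMNS["agency"])
    writer.writerows(
        map(partial(dict_to_row, record_type="agency"), gtfs_data["agency"])
    )
    print(string_io.getvalue())  # header row, then "1,Metro," (url empty)
```

Note the assignment expression (`:=`) in the reworked `dict_to_row()`: it requires Python 3.8, which is why the CI workflow and README below raise the interpreter floor from 3.6.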

View file

@@ -19,10 +19,10 @@ jobs:
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.6
- name: Set up Python 3.8
uses: actions/setup-python@v3
with:
python-version: "3.6"
python-version: "3.8"
- name: Install dependencies
run: |
python -m pip install --upgrade pip

View file

@@ -51,7 +51,7 @@ a city's bbox has been extended.
A single city or a country with few metro networks can be validated much faster
if you allow `process_subway.py` to fetch data from the Overpass API. Here are the steps:
1. Python3 interpreter required (3.6+)
1. Python3 interpreter required (3.8+)
2. Clone the repo
```
git clone https://github.com/alexey-zakharenkov/subways.git subways_validator

View file

@@ -2,6 +2,7 @@ import csv
import io
import zipfile
from functools import partial
from itertools import permutations
from ._common import (
@@ -129,19 +130,6 @@ GTFS_COLUMNS = {
}
def dict_to_row(dict_data, record_type):
"""Given object stored in a dict and an array of columns,
returns a row to use in CSV.
"""
row = []
for column in GTFS_COLUMNS[record_type]:
value = dict_data.get(column)
if value is None:
value = ""
row.append(value)
return row
def round_coords(coords_tuple):
return tuple(
map(lambda coord: round(coord, COORDINATE_PRECISION), coords_tuple)
@@ -162,21 +150,18 @@ def process(cities, transfers, filename, cache_path):
gtfs_data = {key: [] for key in GTFS_COLUMNS.keys()}
gtfs_data["calendar"].append(
dict_to_row(
{
"service_id": "always",
"monday": 1,
"tuesday": 1,
"wednesday": 1,
"thursday": 1,
"friday": 1,
"saturday": 1,
"sunday": 1,
"start_date": "19700101",
"end_date": "30000101",
},
"calendar",
)
{
"service_id": "always",
"monday": 1,
"tuesday": 1,
"wednesday": 1,
"thursday": 1,
"friday": 1,
"saturday": 1,
"sunday": 1,
"start_date": "19700101",
"end_date": "30000101",
}
)
all_stops = {} # stop (stop area center or station) el_id -> stop data
@@ -262,7 +247,7 @@ def process(cities, transfers, filename, cache_path):
# agency, routes, trips, stop_times, frequencies, shapes
for city in good_cities:
agency = {"agency_id": city.id, "agency_name": city.name}
gtfs_data["agency"].append(dict_to_row(agency, "agency"))
gtfs_data["agency"].append(agency)
for city_route in city:
route = {
@@ -273,7 +258,7 @@ def process(cities, transfers, filename, cache_path):
"route_long_name": city_route.name,
"route_color": format_colour(city_route.colour),
}
gtfs_data["routes"].append(dict_to_row(route, "routes"))
gtfs_data["routes"].append(route)
for variant in city_route:
shape_id = variant.id[1:] # truncate leading 'r'
@@ -283,7 +268,7 @@ def process(cities, transfers, filename, cache_path):
"service_id": "always",
"shape_id": shape_id,
}
gtfs_data["trips"].append(dict_to_row(trip, "trips"))
gtfs_data["trips"].append(trip)
tracks = variant.get_extended_tracks()
tracks = variant.get_truncated_tracks(tracks)
@@ -291,16 +276,13 @@ def process(cities, transfers, filename, cache_path):
for i, (lon, lat) in enumerate(tracks):
lon, lat = round_coords((lon, lat))
gtfs_data["shapes"].append(
dict_to_row(
{
"shape_id": shape_id,
"trip_id": variant.id,
"shape_pt_lat": lat,
"shape_pt_lon": lon,
"shape_pt_sequence": i,
},
"shapes",
)
{
"shape_id": shape_id,
"trip_id": variant.id,
"shape_pt_lat": lat,
"shape_pt_lon": lon,
"shape_pt_sequence": i,
}
)
start_time = variant.start_time or DEFAULT_TRIP_START_TIME
@@ -311,37 +293,29 @@ def process(cities, transfers, filename, cache_path):
end_time = f"{end_time[0]:02d}:{end_time[1]:02d}:00"
gtfs_data["frequencies"].append(
dict_to_row(
{
"trip_id": variant.id,
"start_time": start_time,
"end_time": end_time,
"headway_secs": variant.interval
or DEFAULT_INTERVAL,
},
"frequencies",
)
{
"trip_id": variant.id,
"start_time": start_time,
"end_time": end_time,
"headway_secs": variant.interval
or DEFAULT_INTERVAL,
}
)
for stop_sequence, route_stop in enumerate(variant):
gtfs_platform_id = add_stop_gtfs(route_stop, city)
gtfs_data["stop_times"].append(
dict_to_row(
{
"trip_id": variant.id,
"stop_sequence": stop_sequence,
"shape_dist_traveled": route_stop.distance,
"stop_id": gtfs_platform_id,
},
"stop_times",
)
{
"trip_id": variant.id,
"stop_sequence": stop_sequence,
"shape_dist_traveled": route_stop.distance,
"stop_id": gtfs_platform_id,
}
)
# stops
gtfs_data["stops"].extend(
map(lambda row: dict_to_row(row, "stops"), all_stops.values())
)
gtfs_data["stops"].extend(all_stops.values())
# transfers
for stoparea_set in transfers:
@@ -358,20 +332,27 @@ def process(cities, transfers, filename, cache_path):
)
for id1, id2 in permutations((stop1_id, stop2_id)):
gtfs_data["transfers"].append(
dict_to_row(
{
"from_stop_id": id1,
"to_stop_id": id2,
"transfer_type": 0,
"min_transfer_time": transfer_time,
},
"transfers",
)
{
"from_stop_id": id1,
"to_stop_id": id2,
"transfer_type": 0,
"min_transfer_time": transfer_time,
}
)
make_gtfs(filename, gtfs_data)
def dict_to_row(dict_data: dict, record_type: str) -> list:
"""Given object stored in a dict and an array of columns,
return a row to use in CSV.
"""
return [
"" if (v := dict_data.get(column)) is None else v
for column in GTFS_COLUMNS[record_type]
]
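
The conversion semantics at a glance: zeros survive because the check is `is None`, not truthiness, while `None` values and absent keys both become empty strings. The `shapes` column order below is taken from the test added in this commit.

```python
# Assuming GTFS_COLUMNS["shapes"] = ["shape_id", "shape_pt_lat",
# "shape_pt_lon", "shape_pt_sequence", "shape_dist_traveled"],
# as the new test implies:
dict_to_row({"shape_id": 0, "shape_pt_lat": None}, "shapes")
# -> [0, "", "", "", ""]  (zero preserved; None and absent keys -> "")
```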
def make_gtfs(filename, gtfs_data):
if not filename.lower().endswith("zip"):
filename = f"{filename}.zip"
@@ -381,5 +362,10 @@ def make_gtfs(filename, gtfs_data):
with io.StringIO(newline="") as string_io:
writer = csv.writer(string_io, delimiter=",")
writer.writerow(columns)
writer.writerows(gtfs_data[gtfs_feature])
writer.writerows(
map(
partial(dict_to_row, record_type=gtfs_feature),
gtfs_data[gtfs_feature]
)
)
zf.writestr(f"{gtfs_feature}.txt", string_io.getvalue())
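
The `map(partial(...))` passed to `writerows()` above is a lazy form of the loop below; `partial` pins `record_type` so each stored dict is converted as it is written:

```python
# Equivalent to the writerows(map(partial(...), ...)) call above:
for record in gtfs_data[gtfs_feature]:
    writer.writerow(dict_to_row(record, record_type=gtfs_feature))
```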

View file

@@ -1,4 +1,4 @@
import unittest
from unittest import TestCase
from processors.gtfs import (
dict_to_row,
@@ -6,10 +6,10 @@ from processors.gtfs import (
)
class TestGTFS(unittest.TestCase):
class TestGTFS(TestCase):
"""Test processors/gtfs.py"""
def test_dict_to_row(self):
def test__dict_to_row__Nones_and_absent_keys(self) -> None:
"""Test that absent or None values in a GTFS feature item
are converted by dict_to_row() function to empty strings
in right amount.
@@ -55,6 +55,42 @@ class TestGTFS(unittest.TestCase):
for test_trip in test_trips:
with self.subTest(msg=test_trip["description"]):
self.assertEqual(
self.assertListEqual(
dict_to_row(test_trip["trip_data"], "trips"), answer
)
def test__dict_to_row__numeric_values(self) -> None:
"""Test that zero numeric values remain zeros in dict_to_row() function,
and not empty strings or None.
"""
shapes = [
{
"description": "Numeric non-zeroes",
"shape_data": {
"shape_id": 1,
"shape_pt_lat": 55.3242425,
"shape_pt_lon": -179.23242,
"shape_pt_sequence": 133,
"shape_dist_traveled": 1.2345,
},
"answer": [1, 55.3242425, -179.23242, 133, 1.2345],
},
{
"description": "Numeric zeroes and None keys",
"shape_data": {
"shape_id": 0,
"shape_pt_lat": 0.0,
"shape_pt_lon": 0,
"shape_pt_sequence": 0,
"shape_dist_traveled": None,
},
"answer": [0, 0.0, 0, 0, ""],
},
]
for shape in shapes:
with self.subTest(shape["description"]):
self.assertListEqual(
dict_to_row(shape["shape_data"], "shapes"), shape["answer"]
)
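
The new tests use only the standard library, so the suite should run with the stock unittest runner; the exact invocation depends on the repository layout, but discovery from the project root is a reasonable guess:

```
python -m unittest discover
```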