#!/usr/bin/env python3
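"""Validator of subway and overground networks mapped in OpenStreetMap.

The script obtains OSM data from the Overpass API, from a cached JSON file
(-i/--source) or from an XML extract (-x/--xml), distributes the elements
among the configured cities, builds and validates the routes of every city,
and writes the requested outputs: a validation log (-l), YAML (-d) and
GeoJSON (-j) dumps, unused subway entrances (-e) and the processed metro
systems (-o).

A typical invocation might look like this (the city name is only an
illustration; valid names come from the spreadsheet fetched by
download_cities()):

    ./process_subways.py -c "Moscow" -o processed.json -l validation.json
"""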
import argparse
import json
import logging
import os
import re
import sys
import time
import urllib.parse
import urllib.request

from processors import processor
from subway_io import (
    dump_yaml,
    load_xml,
    make_geojson,
    read_recovery_data,
    write_recovery_data,
)
from subway_structure import (
    CriticalValidationError,
    download_cities,
    find_transfers,
    get_unused_entrances_geojson,
    MODES_OVERGROUND,
    MODES_RAPID,
)

# Hack to always use IPv4.
import socket

old_getaddrinfo = socket.getaddrinfo


def new_getaddrinfo(*args, **kwargs):
    responses = old_getaddrinfo(*args, **kwargs)
    return [
        response
        for response in responses
        if response[0] == socket.AF_INET
    ]


socket.getaddrinfo = new_getaddrinfo
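

# A sketch (illustration only, not taken from the code's output) of the query
# that overpass_request() builds for a single bbox (south, west, north, east)
# in the default rapid-transit mode:
#
#   [out:json][timeout:1000];(
#     (rel[route="subway"](S,W,N,E); ...one clause per mode in MODES_RAPID...);
#     rel(br)[type=route_master];
#     node[railway=subway_entrance](S,W,N,E);
#     rel[public_transport=stop_area](S,W,N,E);
#     rel(br)[type=public_transport][public_transport=stop_area_group];
#   );(._;>>;);out body center qt;
#
# "(._;>>;)" recursively adds all members of the collected relations to the
# result set, and "out ... center" makes Overpass attach a precomputed center
# point to ways and relations (see calculate_centers below).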
def overpass_request(overground, overpass_api, bboxes):
    query = '[out:json][timeout:1000];('
    modes = MODES_OVERGROUND if overground else MODES_RAPID
    for bbox in bboxes:
        bbox_part = '({})'.format(','.join(str(coord) for coord in bbox))
        query += '('
        for mode in modes:
            query += 'rel[route="{}"]{};'.format(mode, bbox_part)
        query += ');'
        query += 'rel(br)[type=route_master];'
        if not overground:
            query += 'node[railway=subway_entrance]{};'.format(bbox_part)
        query += 'rel[public_transport=stop_area]{};'.format(bbox_part)
        query += (
            'rel(br)[type=public_transport][public_transport=stop_area_group];'
        )
    query += ');(._;>>;);out body center qt;'
    logging.debug('Query: %s', query)
    url = '{}?data={}'.format(overpass_api, urllib.parse.quote(query))
    response = urllib.request.urlopen(url, timeout=1000)
    if response.getcode() != 200:
        raise Exception(
            'Failed to query Overpass API: HTTP {}'.format(response.getcode())
        )
    return json.load(response)['elements']


def multi_overpass(overground, overpass_api, bboxes):
    SLICE_SIZE = 10
    INTERREQUEST_WAIT = 5  # in seconds
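    # Query the bboxes in chunks of SLICE_SIZE, pausing INTERREQUEST_WAIT
    # seconds between consecutive requests to avoid overloading the
    # Overpass API.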
    result = []
    for i in range(0, len(bboxes), SLICE_SIZE):
        if i > 0:
            time.sleep(INTERREQUEST_WAIT)
        result.extend(
            overpass_request(
                overground, overpass_api, bboxes[i : i + SLICE_SIZE]
            )
        )
    return result


def slugify(name):
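    # E.g. slugify('New York') -> 'new_york': spaces become underscores and
    # any character outside [a-z0-9_-] is dropped.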
    return re.sub(r'[^a-z0-9_-]+', '', name.lower().replace(' ', '_'))


def calculate_centers(elements):
    """Add a 'center' key to each way/relation in elements,
    except for empty ways or relations.
    Relies on the nodes-ways-relations order in the elements list.
    """
    nodes = {}  # id(int) => (lat, lon)
    ways = {}  # id(int) => (lat, lon)
    relations = {}  # id(int) => (lat, lon)
    empty_relations = set()  # ids(int) of relations without members
    # or containing only empty relations

    def calculate_way_center(el):
        # If element has been queried via overpass-api with 'out center;'
        # clause then ways already have 'center' attribute
        if 'center' in el:
            ways[el['id']] = (el['center']['lat'], el['center']['lon'])
            return
        center = [0, 0]
        count = 0
        for nd in el['nodes']:
            if nd in nodes:
                center[0] += nodes[nd][0]
                center[1] += nodes[nd][1]
                count += 1
        if count > 0:
            el['center'] = {
                'lat': center[0] / count,
                'lon': center[1] / count,
            }
            ways[el['id']] = (el['center']['lat'], el['center']['lon'])

    def calculate_relation_center(el):
        # If element has been queried via overpass-api with 'out center;'
        # clause then some relations already have 'center' attribute
        if 'center' in el:
            relations[el['id']] = (el['center']['lat'], el['center']['lon'])
            return True
        center = [0, 0]
        count = 0
        for m in el.get('members', []):
            if m['type'] == 'relation' and m['ref'] not in relations:
                if m['ref'] in empty_relations:
                    # Ignore empty child relations
                    continue
                else:
                    # Center of child relation is not known yet
                    return False
            member_container = (
                nodes
                if m['type'] == 'node'
                else ways
                if m['type'] == 'way'
                else relations
            )
            if m['ref'] in member_container:
                center[0] += member_container[m['ref']][0]
                center[1] += member_container[m['ref']][1]
                count += 1
        if count == 0:
            empty_relations.add(el['id'])
        else:
            el['center'] = {
                'lat': center[0] / count,
                'lon': center[1] / count,
            }
            relations[el['id']] = (el['center']['lat'], el['center']['lon'])
        return True

    relations_without_center = []
    for el in elements:
        if el['type'] == 'node':
            nodes[el['id']] = (el['lat'], el['lon'])
        elif el['type'] == 'way':
            if 'nodes' in el:
                calculate_way_center(el)
        elif el['type'] == 'relation':
            if not calculate_relation_center(el):
                relations_without_center.append(el)
    # Calculate centers for relations that have no center yet
    while relations_without_center:
        new_relations_without_center = []
        for rel in relations_without_center:
            if not calculate_relation_center(rel):
                new_relations_without_center.append(rel)
        if len(new_relations_without_center) == len(relations_without_center):
            break
        relations_without_center = new_relations_without_center

    if relations_without_center:
        logging.error(
            "Cannot calculate center for the relations (%d in total): %s%s",
            len(relations_without_center),
            ', '.join(str(rel['id']) for rel in relations_without_center[:20]),
            ", ..." if len(relations_without_center) > 20 else "",
        )
    if empty_relations:
        logging.warning(
            "Empty relations (%d in total): %s%s",
            len(empty_relations),
            ', '.join(str(x) for x in list(empty_relations)[:20]),
            ", ..." if len(empty_relations) > 20 else "",
        )


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i',
        '--source',
        help='File to write backup of OSM data, or to read data from',
    )
    parser.add_argument(
        '-x', '--xml', help='OSM extract with routes, to read data from'
    )
    parser.add_argument(
        '--overpass-api',
        default='http://overpass-api.de/api/interpreter',
        help="Overpass API URL",
    )
    parser.add_argument(
        '-q',
        '--quiet',
        action='store_true',
        help='Show only warnings and errors',
    )
    parser.add_argument(
        '-c', '--city', help='Validate only a single city or a country'
    )
    parser.add_argument(
        '-t',
        '--overground',
        action='store_true',
        help='Process overground transport instead of subways',
    )
    parser.add_argument(
        '-e',
        '--entrances',
        type=argparse.FileType('w', encoding='utf-8'),
        help='Export unused subway entrances as GeoJSON here',
    )
    parser.add_argument(
        '-l',
        '--log',
        type=argparse.FileType('w', encoding='utf-8'),
        help='Validation JSON file name',
    )
    parser.add_argument(
        '-o',
        '--output',
        type=argparse.FileType('w', encoding='utf-8'),
        help='Processed metro systems output',
    )
    parser.add_argument('--cache', help='Cache file name for processed data')
    parser.add_argument(
        '-r', '--recovery-path', help='Cache file name for error recovery'
    )
    parser.add_argument(
        '-d', '--dump', help='Make a YAML file for city data'
    )
    parser.add_argument(
        '-j', '--geojson', help='Make a GeoJSON file for city data'
    )
    parser.add_argument(
        '--crude',
        action='store_true',
        help='Do not use OSM railway geometry for GeoJSON',
    )
    options = parser.parse_args()

    if options.quiet:
        log_level = logging.WARNING
    else:
        log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        datefmt='%H:%M:%S',
        format='%(asctime)s %(levelname)-7s %(message)s',
    )

    # Downloading cities from Google Spreadsheets
    cities = download_cities(options.overground)
    if options.city:
        cities = [
            c
            for c in cities
            if c.name == options.city or c.country == options.city
        ]
    if not cities:
        logging.error('No cities to process')
        sys.exit(2)

    # Augment cities with recovery data
    recovery_data = None
    if options.recovery_path:
        recovery_data = read_recovery_data(options.recovery_path)
        for city in cities:
            city.recovery_data = recovery_data.get(city.name, None)

    logging.info('Read %s metro networks', len(cities))

    # Reading cached json, loading XML or querying Overpass API
    if options.source and os.path.exists(options.source):
        logging.info('Reading %s', options.source)
        with open(options.source, 'r') as f:
            osm = json.load(f)
            if 'elements' in osm:
                osm = osm['elements']
            calculate_centers(osm)
    elif options.xml:
        logging.info('Reading %s', options.xml)
        osm = load_xml(options.xml)
        calculate_centers(osm)
        if options.source:
            with open(options.source, 'w', encoding='utf-8') as f:
                json.dump(osm, f)
    else:
        if len(cities) > 10:
            logging.error(
                'Would not download that many cities from Overpass API, '
                'choose a smaller set'
            )
            sys.exit(3)
        bboxes = [c.bbox for c in cities]
        logging.info('Downloading data from Overpass API')
        osm = multi_overpass(options.overground, options.overpass_api, bboxes)
        calculate_centers(osm)
        if options.source:
            with open(options.source, 'w', encoding='utf-8') as f:
                json.dump(osm, f)
    logging.info('Downloaded %s elements, sorting by city', len(osm))

    # Sort elements by city
    for el in osm:
        for c in cities:
            if c.contains(el):
                c.add(el)

    logging.info('Building routes for each city')
    good_cities = []
    for c in cities:
        try:
            c.extract_routes()
        except CriticalValidationError as e:
            logging.error(
                "Critical validation error while processing %s: %s",
                c.name,
                str(e),
            )
            c.error(str(e))
        except AssertionError as e:
            logging.error(
                "Validation logic error while processing %s: %s",
                c.name,
                str(e),
            )
            c.error("Validation logic error: {}".format(str(e)))
        else:
            c.validate()
            if c.is_good():
                good_cities.append(c)

    logging.info('Finding transfer stations')
    transfers = find_transfers(osm, cities)

    good_city_names = set(c.name for c in good_cities)
    logging.info(
        '%s good cities: %s',
        len(good_city_names),
        ', '.join(sorted(good_city_names)),
    )
    bad_city_names = set(c.name for c in cities) - good_city_names
    logging.info(
        '%s bad cities: %s',
        len(bad_city_names),
        ', '.join(sorted(bad_city_names)),
    )

    if options.recovery_path:
        write_recovery_data(options.recovery_path, recovery_data, cities)

    if options.entrances:
        json.dump(get_unused_entrances_geojson(osm), options.entrances)

    if options.dump:
        if os.path.isdir(options.dump):
            for c in cities:
                with open(
                    os.path.join(options.dump, slugify(c.name) + '.yaml'),
                    'w',
                    encoding='utf-8',
                ) as f:
                    dump_yaml(c, f)
        elif len(cities) == 1:
            with open(options.dump, 'w', encoding='utf-8') as f:
                dump_yaml(cities[0], f)
        else:
            logging.error('Cannot dump %s cities at once', len(cities))

    if options.geojson:
        if os.path.isdir(options.geojson):
            for c in cities:
                with open(
                    os.path.join(
                        options.geojson, slugify(c.name) + '.geojson'
                    ),
                    'w',
                    encoding='utf-8',
                ) as f:
                    json.dump(make_geojson(c, not options.crude), f)
        elif len(cities) == 1:
            with open(options.geojson, 'w', encoding='utf-8') as f:
                json.dump(make_geojson(cities[0], not options.crude), f)
        else:
            logging.error(
                'Cannot make a geojson of %s cities at once', len(cities)
            )

    if options.log:
        res = []
        for c in cities:
            v = c.get_validation_result()
            v['slug'] = slugify(c.name)
            res.append(v)
        json.dump(res, options.log, indent=2, ensure_ascii=False)

    if options.output:
        json.dump(
            processor.process(cities, transfers, options.cache),
            options.output,
            indent=1,
            ensure_ascii=False,
        )