osm_conflate/conflate/osm.py

import logging
import requests
import re
from .data import OSMPoint
from . import etree


# Base URL of the Overpass API instance used by OsmDownloader.download();
# OsmDownloader.set_overpass() rebinds this module-level name at runtime.
OVERPASS_SERVER = 'https://overpass-api.de/api/'
# Alternative Overpass instance, selected with set_overpass('alt').
ALT_OVERPASS_SERVER = 'https://overpass.kumi.systems/api/'
# Base URL of the OSM editing API (version 0.6), used by check_moveability().
OSM_API_SERVER = 'https://api.openstreetmap.org/api/0.6/'
# Default margin added around dataset points when computing bounding boxes;
# a profile can override it with the 'bbox_padding' setting.
BBOX_PADDING = 0.003  # in degrees, ~330 m default


class OsmDownloader:
    def __init__(self, profile):
        """Keep a reference to the profile that supplies every setting read by this class."""
        self.profile = profile

    def set_overpass(self, server='alt'):
        """Redirect subsequent Overpass requests to another server.

        The literal value 'alt' selects ALT_OVERPASS_SERVER; any other value
        is used as the base URL itself. The choice is stored in the
        module-level OVERPASS_SERVER, so it affects the whole module and not
        only this instance.
        """
        global OVERPASS_SERVER
        OVERPASS_SERVER = ALT_OVERPASS_SERVER if server == 'alt' else server

    def construct_overpass_query(self, bboxes):
        """Constructs an Overpass API query from the "query" list in the profile.

        Each condition tuple becomes one tag filter:
        (k, v) turns into ["k"="v"], (k,) into ["k"], (k, None) or (k, '') into
        [!"k"], (k, "~v") into ["k"~"v",i] and (k, v1, v2, ...) into
        ["k"~"^(v1|v2|...)$"]. The profile value may also be a raw filter
        string, or a list whose items are condition lists or raw strings; every
        item produces its own "nwr" statement per bounding box.

        Unless the profile sets "no_dataset_id", objects carrying the
        "ref:<dataset_id>" key are requested as well: everywhere by default,
        or only inside the boxes when "bounded_update" is set.

        :param bboxes: iterable of 4-item boxes, rendered as given; a None item
            means "no bounding box".
        :return: the query text.
        """
        tags = self.profile.get(
            'query', required="a list of tuples. E.g. [('amenity', 'cafe'), ('name', '~Mc.*lds')]")
        if isinstance(tags, str):
            tag_strs = [tags]
        else:
            # A flat list of condition tuples is a single query: wrap it so the
            # loop below always walks a list of queries.
            if not isinstance(tags[0], str) and isinstance(tags[0][0], str):
                tags = [tags]
            tag_strs = []
            for tags_q in tags:
                if isinstance(tags_q, str):
                    # Raw filter text is passed through untouched.
                    tag_strs.append(tags_q)
                    continue
                filters = []
                for t in tags_q:
                    if len(t) == 1:
                        q = '"{}"'.format(t[0])
                    elif t[1] is None or len(t[1]) == 0:
                        # The negation mark belongs outside the quotes:
                        # ["!k"] would mean "has a key literally named !k".
                        q = '!"{}"'.format(t[0])
                    elif t[1][0] == '~':
                        q = '"{}"~"{}",i'.format(t[0], t[1][1:])
                    elif len(t) > 2:
                        q = '"{}"~"^({})$"'.format(t[0], '|'.join(t[1:]))
                    else:
                        q = '"{}"="{}"'.format(t[0], t[1])
                    filters.append('[' + q + ']')
                tag_strs.append(''.join(filters))

        if self.profile.get('no_dataset_id', False):
            ref = None
        else:
            ref = 'nwr["ref:' + self.profile.get(
                'dataset_id', required='A fairly unique id of the dataset to query OSM') + '"]'
        timeout = self.profile.get('overpass_timeout', 120)
        header = '[out:xml]{};('.format('' if timeout is None else '[timeout:{}]'.format(timeout))

        # Render every bbox suffix once; it is shared by the tag statements
        # and by the bounded "ref:" statements.
        bbox_strs = [
            '' if bbox is None else '(' + ','.join(str(x) for x in bbox) + ')'
            for bbox in bboxes
        ]
        statements = [
            'nwr' + tag_str + bbox_str + ';'
            for bbox_str in bbox_strs
            for tag_str in tag_strs
        ]
        if ref is not None:
            if not self.profile.get('bounded_update', False):
                statements.append(ref + ';')
            else:
                statements.extend(ref + bbox_str + ';' for bbox_str in bbox_strs)
        return header + ''.join(statements) + '); out meta qt center;'

    def get_bbox(self, points):
        """Return [min_lat, min_lon, max_lat, max_lon] enclosing all points.

        Every point is widened by the "bbox_padding" profile setting
        (BBOX_PADDING when unset) before it is merged into the running bounds.
        With no points the initial inverted bounds are returned unchanged.
        """
        pad = self.profile.get('bbox_padding', BBOX_PADDING)
        south, west, north, east = 90.0, 180.0, -90.0, -180.0
        for pt in points:
            south = min(south, pt.lat - pad)
            west = min(west, pt.lon - pad)
            north = max(north, pt.lat + pad)
            east = max(east, pt.lon + pad)
        return [south, west, north, east]

    def split_into_bboxes(self, points):
        """
        Splits the dataset into multiple bboxes to lower load on the overpass api.

        Starting from one box around all points, repeatedly cuts the box/axis
        pair whose cut frees the largest area, until "max_request_boxes"
        (profile setting, default 4) boxes exist, the best cut would free less
        than 1% of the initial area, or no cut frees any area at all.

        :param points: sized collection of objects with "lat" and "lon".
        :return: a list of [minlat, minlon, maxlat, maxlon] lists, each widened
            by the "bbox_padding" profile setting (BBOX_PADDING by default).
        """
        max_bboxes = self.profile.get('max_request_boxes', 4)
        if max_bboxes <= 1 or len(points) <= 1:
            return [self.get_bbox(points)]

        # coord, alt coord, total w/h to the left/bottom, total w/h to the right/top
        lons = sorted([[d.lon, d.lat, 0, 0] for d in points])
        lats = sorted([[d.lat, d.lon, 0, 0] for d in points])

        def update_side_dimensions(ar):
            """For each point, calculates the maximum and
            minimum bound for all points left and right."""
            fwd_top = fwd_bottom = ar[0][1]
            back_top = back_bottom = ar[-1][1]
            for i in range(len(ar)):
                # Slot 2: alt-axis extent of points 0..i (prefix).
                fwd_top = max(fwd_top, ar[i][1])
                fwd_bottom = min(fwd_bottom, ar[i][1])
                ar[i][2] = fwd_top - fwd_bottom
                # Slot 3: alt-axis extent of the mirrored suffix, filled from the end.
                back_top = max(back_top, ar[-i-1][1])
                back_bottom = min(back_bottom, ar[-i-1][1])
                ar[-i-1][3] = back_top - back_bottom

        def find_max_gap(ar, h):
            """Select an interval between points, which would give
            the maximum area if split there. Returns (None, 0) when
            no interval frees a positive area."""
            max_id = None
            max_gap = 0
            for i in range(len(ar) - 1):
                # "Extra" variables are for area to the left and right
                # that would be freed after splitting.
                extra_left = (ar[i][0]-ar[0][0]) * (h-ar[i][2])
                extra_right = (ar[-1][0]-ar[i+1][0]) * (h-ar[i+1][3])
                # Gap is the area of the column between points i and i+1
                # plus extra areas to the left and right.
                gap = (ar[i+1][0] - ar[i][0]) * h + extra_left + extra_right
                if gap > max_gap:
                    max_id = i
                    max_gap = gap
            return max_id, max_gap

        def get_bbox(b, pad=0):
            """Returns a list of [min_lat, min_lon, max_lat, max_lon] for a box."""
            return [b[2][0][0]-pad, b[3][0][0]-pad, b[2][-1][0]+pad, b[3][-1][0]+pad]

        def split(box, point_array, point_id):
            """Split the box over axis point_array at point point_id...point_id+1.
            Modifies the box in-place and returns a new box."""
            alt_array = 5 - point_array  # 3->2, 2->3
            points = box[point_array][point_id+1:]
            del box[point_array][point_id+1:]
            alt = {True: [], False: []}  # True means point is in new box
            for p in box[alt_array]:
                # Entries of the other axis store (alt coord, coord), so swap
                # them to compare in the sort order of the axis being cut.
                alt[(p[1], p[0]) >= (points[0][0], points[0][1])].append(p)

            new_box = [None] * 4
            new_box[point_array] = points
            new_box[alt_array] = alt[True]
            box[alt_array] = alt[False]
            # Refresh height (slot 0) and width (slot 1) of both halves.
            for i in range(2):
                box[i] = box[i+2][-1][0] - box[i+2][0][0]
                new_box[i] = new_box[i+2][-1][0] - new_box[i+2][0][0]
            return new_box

        # height, width, lats, lons
        boxes = [[lats[-1][0]-lats[0][0], lons[-1][0]-lons[0][0], lats, lons]]
        initial_area = boxes[0][0] * boxes[0][1]
        while len(boxes) < max_bboxes and len(boxes) <= len(points):
            candidate_box = None
            area = 0
            point_id = None
            point_array = None
            for box in boxes:
                for ar in (2, 3):
                    # Find a box and an axis for splitting that would decrease the area the most
                    update_side_dimensions(box[ar])
                    max_id, max_area = find_max_gap(box[ar], box[3-ar])
                    if max_area > area:
                        area = max_area
                        candidate_box = box
                        point_id = max_id
                        point_array = ar
            if candidate_box is None or area * 100 < initial_area:
                # Stop splitting when nothing can be gained (e.g. all points
                # share a latitude or longitude, which also makes the initial
                # area zero) or when the area decrease is less than 1%.
                break
            logging.debug('Splitting bbox %s at %s %s..%s; area decrease %s%%',
                          get_bbox(candidate_box),
                          'longs' if point_array == 3 else 'lats',
                          candidate_box[point_array][point_id][0],
                          candidate_box[point_array][point_id+1][0],
                          round(100*area/initial_area))
            boxes.append(split(candidate_box, point_array, point_id))

        padding = self.profile.get('bbox_padding', BBOX_PADDING)
        return [get_bbox(b, padding) for b in boxes]

    def get_categories(self, tags):
        """Return the set of categories that an object with these tags belongs to.

        None in the result stands for the default category: the profile's
        "qualifies" callback approved the object, or its "query" matched.
        If "qualifies" yields anything other than None, its verdict is final
        and neither the query nor the categories are consulted. Otherwise
        each entry of the "categories" profile setting is tested through its
        "query" (a list of condition tuples) or, failing that, its "tags"
        dict; names of the matching entries are added to the result.

        :param tags: dict of tag keys to values of one OSM object.
        :raises ValueError: when a category has neither "tags" nor "query".
        """
        def value_matches(value, pattern):
            """Test one allowed value: '~re' is a regex search, '!text' is
            a case-insensitive substring test, anything else is equality."""
            # NOTE(review): the regex here is case-sensitive, whereas
            # construct_overpass_query sends '~' patterns with the ",i" flag.
            if pattern.startswith('~'):
                return re.search(pattern[1:], value) is not None
            if pattern.startswith('!'):
                return pattern[1:].lower() in value.lower()
            return pattern == value

        def match_query(tags, query):
            """True when the tags satisfy every condition of the query."""
            for tag in query:
                if len(tag) == 1:
                    # (k,): the key must be present. Keep checking the rest.
                    if tag[0] not in tags:
                        return False
                    continue
                value = tags.get(tag[0], None)
                if tag[1] is None or tag[1] == '':
                    # (k, None): the key must be absent. Keep checking the rest.
                    if value is not None:
                        return False
                    continue
                if value is None:
                    return False
                if not any(value_matches(value, t2) for t2 in tag[1:]):
                    return False
            return True

        def tags_to_query(tags):
            return list(tags.items())

        result = set()
        qualifies = self.profile.get('qualifies', args=tags)
        if qualifies is not None:
            if qualifies:
                result.add(None)
            return result

        # First check default query
        query = self.profile.get('query', None)
        if query is not None:
            if isinstance(query, str):
                # Raw Overpass filters cannot be evaluated here: accept.
                result.add(None)
            else:
                # Same normalisation as in construct_overpass_query: a flat
                # list of condition tuples is a single query.
                if not isinstance(query[0], str) and isinstance(query[0][0], str):
                    query = [query]
                for q in query:
                    if isinstance(q, str) or match_query(tags, q):
                        result.add(None)
                        break

        # Then check each category if we got these
        categories = self.profile.get('categories', {})
        for name, params in categories.items():
            if 'tags' not in params and 'query' not in params:
                raise ValueError('No tags and query attributes for category "{}"'.format(name))
            # Build the fallback only when needed: a category that has just
            # "query" has no "tags" dict to convert.
            if 'query' in params:
                category_query = params['query']
            else:
                category_query = tags_to_query(params['tags'])
            if match_query(tags, category_query):
                result.add(name)

        return result

    def calc_boxes(self, dataset_points):
        """Choose the bounding boxes to query, driven by the "bbox" profile setting.

        A falsy setting yields [None] (no bounding box), a 4-item value is used
        as the single box, and anything else (True by default) derives the
        boxes from the dataset points.
        """
        configured = self.profile.get('bbox', True)
        if not configured:
            return [None]
        if hasattr(configured, '__len__') and len(configured) == 4:
            return [configured]
        return self.split_into_bboxes(dataset_points)

    def download(self, bboxes=None):
        """Constructs an Overpass API query and requests objects
        to match from a server.

        :param bboxes: list of boxes for the query. When empty or None, the
            4-item "bbox" profile setting is used if there is one, otherwise
            the query is not bounded.
        :return: whatever parse_xml() produces for the response body.
        :raises IOError: when the server answers with a non-200 status or
            the response reports an Overpass runtime error. The message
            carries the reason, which is also logged.
        """
        if not bboxes:
            pbbox = self.profile.get('bbox', True)
            if pbbox and hasattr(pbbox, '__len__') and len(pbbox) == 4:
                bboxes = [pbbox]
            else:
                bboxes = [None]

        query = self.construct_overpass_query(bboxes)
        logging.debug('Overpass query: %s', query)
        r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query})
        if r.encoding is None:
            r.encoding = 'utf-8'
        if r.status_code != 200:
            logging.error('Failed to download data from Overpass API: %s', r.status_code)
            if 'rate_limited' in r.text:
                # Keep the failed response in "r": the status page gets its
                # own name so the exception below reports the real failure.
                status = requests.get(OVERPASS_SERVER + 'status')
                logging.warning('Seems like you are rate limited. API status:\n%s', status.text)
            else:
                logging.error('Error message: %s', r.text)
            raise IOError(
                'Overpass API request failed with HTTP status {}'.format(r.status_code))
        if 'runtime error: ' in r.text:
            m = re.search(r'runtime error: ([^<]+)', r.text)
            error = 'unknown' if not m else m.group(1)
            if 'Query timed out' in error:
                logging.error(
                    'Query timed out, try increasing the "overpass_timeout" profile variable')
            else:
                logging.error('Runtime error: %s', error)
            raise IOError('Overpass API runtime error: {}'.format(error))
        return self.parse_xml(r.content)

    def parse_xml(self, fileobj):
        """Parses OSM XML into a dict of OSMPoint objects keyed by their "id".

        Ways and relations are placed at their <center> child when present;
        otherwise at the average of the member coordinates found in the same
        document. Objects whose position cannot be determined are skipped, as
        are objects for which get_categories() returns nothing and objects
        for which OSMPoint.is_poi() is false.

        If the profile has a callable "weight", its result for a point is
        stored in "dist_offset": as is when its absolute value exceeds 3,
        multiplied by the profile's max_distance otherwise.

        :param fileobj: XML document as bytes, or a source accepted by
            etree.parse().
        :return: dict mapping point ids to OSMPoint instances.
        """
        if isinstance(fileobj, bytes):
            xml = etree.fromstring(fileobj)
        else:
            xml = etree.parse(fileobj).getroot()

        nodes = {}
        for nd in xml.findall('node'):
            nodes[nd.get('id')] = (float(nd.get('lat')), float(nd.get('lon')))

        # Ways with an unknown position are left out of this dict, so they
        # neither crash the averaging nor skew relation centroids.
        ways = {}
        for way in xml.findall('way'):
            center = way.find('center')
            if center is not None:
                ways[way.get('id')] = [float(center.get('lat')), float(center.get('lon'))]
                continue
            logging.debug('Way %s does not have a center', way.get('id'))
            coord = [0, 0]
            count = 0
            for nd in way.findall('nd'):
                if nd.get('ref') in nodes:
                    count += 1
                    for i in range(len(coord)):
                        coord[i] += nodes[nd.get('ref')][i]
            if count > 0:
                ways[way.get('id')] = [coord[0] / count, coord[1] / count]

        # For calculating weight of OSM objects
        weight_fn = self.profile.get_raw('weight')
        osmdata = {}

        for el in xml:
            tags = {}
            for tag in el.findall('tag'):
                tags[tag.get('k')] = tag.get('v')
            categories = self.get_categories(tags)
            if categories is False or categories is None or len(categories) == 0:
                continue

            if el.tag == 'node':
                coord = nodes[el.get('id')]
                members = None
            elif el.tag == 'way':
                # None when the way could not be positioned above.
                coord = ways.get(el.get('id'))
                members = [nd.get('ref') for nd in el.findall('nd')]
            elif el.tag == 'relation':
                center = el.find('center')
                if center is not None:
                    coord = [float(center.get('lat')), float(center.get('lon'))]
                else:
                    logging.debug('Relation %s does not have a center', el.get('id'))
                    coord = [0, 0]
                    count = 0
                    for m in el.findall('member'):
                        if m.get('type') == 'node' and m.get('ref') in nodes:
                            count += 1
                            for i in range(len(coord)):
                                coord[i] += nodes[m.get('ref')][i]
                        elif m.get('type') == 'way' and m.get('ref') in ways:
                            count += 1
                            for i in range(len(coord)):
                                coord[i] += ways[m.get('ref')][i]
                    if count > 0:
                        coord = [coord[0] / count, coord[1] / count]
                members = [
                    (m.get('type'), m.get('ref'), m.get('role'))
                    for m in el.findall('member')
                ]
            else:
                continue
            # [0, 0] is the "no members found" marker of the relation branch.
            if not coord or coord == [0, 0]:
                continue
            pt = OSMPoint(
                el.tag, int(el.get('id')), int(el.get('version')),
                coord[0], coord[1], tags, categories)
            pt.members = members
            if pt.is_poi():
                if callable(weight_fn):
                    weight = weight_fn(pt)
                    if weight:
                        if abs(weight) > 3:
                            pt.dist_offset = weight
                        else:
                            pt.dist_offset = weight * self.profile.max_distance
                osmdata[pt.id] = pt
        return osmdata


def check_moveability(changes):
    """Flag each modified node in the change list with whether it may be moved.

    For every change whose properties say osm_type 'node' and action
    'modify', the ways using that node are requested from the OSM API.
    properties['can_move'] becomes True only when the request succeeds and
    the answer lists no way; otherwise it stays False. Other changes are
    left untouched.
    """
    to_check = []
    for change in changes:
        props = change['properties']
        if props['osm_type'] == 'node' and props['action'] == 'modify':
            to_check.append(change)
    logging.info('Checking moveability of %s modified nodes', len(to_check))

    for change in to_check:
        props = change['properties']
        # Pessimistic default, kept when the request does not succeed.
        props['can_move'] = False
        response = requests.get('{}node/{}/ways'.format(OSM_API_SERVER, props['osm_id']))
        if response.status_code != 200:
            continue
        props['can_move'] = etree.fromstring(response.content).find('way') is None