From 0f19afbd018c7bccd00903431798c1a03bef1a27 Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Mon, 28 May 2018 13:15:53 +0300 Subject: [PATCH] Mighty refactoring --- CHANGELOG.md | 1 + conflate/__init__.py | 10 +- conflate/__main__.py | 3 + conflate/conflate.py | 1307 +--------------------------------- conflate/conflator.py | 441 ++++++++++++ conflate/data.py | 104 +++ conflate/dataset.py | 213 ++++++ conflate/geocoder.py | 120 ++++ conflate/osm.py | 383 ++++++++++ conflate/profile.py | 62 ++ profiles/auchan_moscow.py | 6 +- profiles/burgerking.py | 7 +- profiles/minkult.py | 11 +- profiles/moscow_addr.py | 10 +- profiles/moscow_parkomats.py | 43 +- profiles/velobike.py | 8 +- 16 files changed, 1392 insertions(+), 1337 deletions(-) create mode 100644 conflate/__main__.py create mode 100644 conflate/conflator.py create mode 100644 conflate/data.py create mode 100644 conflate/dataset.py create mode 100644 conflate/geocoder.py create mode 100644 conflate/osm.py create mode 100644 conflate/profile.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 099f568..d4a5f92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## master branch (1.4.0) +* Refactored `conflate.py` into seven smaller files. * Added a simple kd-tree based geocoder for countries and regions. Controlled by the `regions` parameter in a profile. * You can filter by regions using `-r` argument or `"regions"` list in an audit file. * Using the new `nwr` query type of Overpass API. diff --git a/conflate/__init__.py b/conflate/__init__.py index 9b0ed07..3d0e6a4 100644 --- a/conflate/__init__.py +++ b/conflate/__init__.py @@ -1 +1,9 @@ -from .conflate import SourcePoint, OSMPoint, OsmConflator, Profile, ProfileException, run, __version__ +try: + from lxml import etree +except ImportError: + import xml.etree.ElementTree as etree +from .data import SourcePoint +from .conflate import run +from .version import __version__ +from .profile import Profile, ProfileException +from .conflator import OsmConflator diff --git a/conflate/__main__.py b/conflate/__main__.py new file mode 100644 index 0000000..7c15dec --- /dev/null +++ b/conflate/__main__.py @@ -0,0 +1,3 @@ +from . import run + +run() diff --git a/conflate/conflate.py b/conflate/conflate.py index 29d97de..fff7730 100755 --- a/conflate/conflate.py +++ b/conflate/conflate.py @@ -1,1289 +1,20 @@ #!/usr/bin/env python3 import argparse -import codecs import csv import json -import kdtree import logging -import math -import requests -import re import os -import struct import sys -from io import BytesIO -from collections import defaultdict -try: - from .version import __version__ -except ImportError: - from version import __version__ -try: - from lxml import etree -except ImportError: - import xml.etree.ElementTree as etree - -TITLE = 'OSM Conflator ' + __version__ -OVERPASS_SERVER = 'https://overpass-api.de/api/' -ALT_OVERPASS_SERVER = 'https://overpass.kumi.systems/api/' -OSM_API_SERVER = 'https://api.openstreetmap.org/api/0.6/' -BBOX_PADDING = 0.003 # in degrees, ~330 m default -MAX_DISTANCE = 100 # how far can object be to be considered a match, in meters -CONTACT_KEYS = set(('phone', 'website', 'email', 'fax', 'facebook', 'twitter', 'instagram')) -LIFECYCLE_KEYS = set(('amenity', 'shop', 'tourism', 'craft', 'office')) -LIFECYCLE_PREFIXES = ('proposed', 'construction', 'disused', 'abandoned', 'was', 'removed') - - -class SourcePoint: - """A common class for points. Has an id, latitude and longitude, - and a dict of tags. 
Remarks are optional for reviewers hints only.""" - def __init__(self, pid, lat, lon, tags=None, category=None, remarks=None, region=None): - self.id = str(pid) - self.lat = lat - self.lon = lon - self.tags = {} if tags is None else { - k.lower(): str(v).strip() for k, v in tags.items() if v is not None} - self.category = category - self.dist_offset = 0 - self.remarks = remarks - self.region = region - self.exclusive_group = None - - def distance(self, other): - """Calculate distance in meters.""" - dx = math.radians(self.lon - other.lon) * math.cos(0.5 * math.radians(self.lat + other.lat)) - dy = math.radians(self.lat - other.lat) - return 6378137 * math.sqrt(dx*dx + dy*dy) - self.dist_offset - - def __len__(self): - return 2 - - def __getitem__(self, i): - if i == 0: - return self.lat - elif i == 1: - return self.lon - else: - raise ValueError('A SourcePoint has only lat and lon in a list') - - def __eq__(self, other): - return self.id == other.id - - def __hash__(self): - return hash(self.id) - - def __repr__(self): - return 'SourcePoint({}, {}, {}, offset={}, tags={})'.format( - self.id, self.lat, self.lon, self.dist_offset, self.tags) - - -class OSMPoint(SourcePoint): - """An OSM points is a SourcePoint with a few extra fields. - Namely, version, members (for ways and relations), and an action. - The id is compound and created from object type and object id.""" - def __init__(self, ptype, pid, version, lat, lon, tags=None, categories=None): - super().__init__('{}{}'.format(ptype[0], pid), lat, lon, tags) - self.tags = {k: v for k, v in self.tags.items() if v is not None and len(v) > 0} - self.osm_type = ptype - self.osm_id = pid - self.version = version - self.members = None - self.action = None - self.categories = categories or set() - self.remarks = None - - def copy(self): - """Returns a copy of this object, except for members field.""" - c = OSMPoint(self.osm_type, self.osm_id, self.version, self.lat, self.lon, self.tags.copy()) - c.action = self.action - c.remarks = self.remarks - c.categories = self.categories.copy() - return c - - def is_area(self): - return self.osm_type != 'node' - - def is_poi(self): - if self.osm_type == 'node': - return True - if self.osm_type == 'way' and len(self.members) > 2: - return self.members[0] == self.members[-1] - if self.osm_type == 'relation' and len(self.members) > 0: - return self.tags.get('type', None) == 'multipolygon' - return False - - def to_xml(self): - """Produces an XML out of the point data. Disregards the "action" field.""" - el = etree.Element(self.osm_type, id=str(self.osm_id), version=str(self.version)) - for tag, value in self.tags.items(): - etree.SubElement(el, 'tag', k=tag, v=value) - - if self.osm_type == 'node': - el.set('lat', str(self.lat)) - el.set('lon', str(self.lon)) - elif self.osm_type == 'way': - for node_id in self.members: - etree.SubElement(el, 'nd', ref=str(node_id)) - elif self.osm_type == 'relation': - for member in self.members: - m = etree.SubElement(el, 'member') - for i, n in enumerate(('type', 'ref', 'role')): - m.set(n, str(member[i])) - return el - - def __repr__(self): - return 'OSMPoint({} {} v{}, {}, {}, action={}, tags={})'.format( - self.osm_type, self.osm_id, self.version, self.lat, self.lon, self.action, self.tags) - - -class ProfileException(Exception): - """An exception class for the Profile instance.""" - def __init__(self, attr, desc): - super().__init__('Field missing in profile: {} ({})'.format(attr, desc)) - - -class Profile: - """A wrapper for a profile. 
- - A profile is a python script that sets a few local variables. - These variables become properties of the profile, accessible with - a "get" method. If something is a function, it will be called, - optional parameters might be passed to it. - - You can compile a list of all supported variables by grepping through - this code, or by looking at a few example profiles. If something - is required, you will be notified of that. - """ - def __init__(self, fileobj): - if isinstance(fileobj, dict): - self.profile = fileobj - elif hasattr(fileobj, 'read'): - s = fileobj.read().replace('\r', '') - if s[0] == '{': - self.profile = json.loads(s) - else: - self.profile = {} - exec(s, globals(), self.profile) - else: - # Got a class - self.profile = {name: getattr(fileobj, name) - for name in dir(fileobj) if not name.startswith('_')} - - def has(self, attr): - return attr in self.profile - - def get(self, attr, default=None, required=None, args=None): - if attr in self.profile: - value = self.profile[attr] - if callable(value): - if args is None: - return value() - else: - return value(*args) - else: - return value - if required is not None: - raise ProfileException(attr, required) - return default - - def get_raw(self, attr, default=None): - if attr in self.profile: - return self.profile[attr] - return default - - -class Geocoder: - def __init__(self, profile): - profile_regions = profile.get_raw('regions') - self.filter = None - self.enabled = bool(profile_regions) - if self.enabled: - logging.info('Initializing geocoder (this will take a minute)') - self.regions = self.parse_regions(profile_regions) - self.tree = self.load_places_tree() - if not self.tree: - if callable(profile_regions): - logging.warn('Could not read the geocoding file') - else: - logging.error('Could not read the geocoding file, no regions will be added') - self.enabled = False - - def set_filter(self, opt_regions): - if isinstance(opt_regions, str): - self.f_negate = opt_regions[0] in ('-', '^') - if self.f_negate: - opt_regions = opt_regions[1:] - self.filter = set([r.strip() for r in opt_regions.split(',')]) - elif isinstance(opt_regions, list): - self.f_negate = False - self.filter = set(opt_regions) - - def load_places_tree(self): - class PlacePoint: - def __init__(self, lon, lat, country, region): - self.coord = (lon, lat) - self.country = country - self.region = region - - def __len__(self): - return len(self.coord) - - def __getitem__(self, i): - return self.coord[i] - - filename = os.path.join(os.getcwd(), os.path.dirname(__file__), 'places.bin') - if not os.path.exists(filename): - return None - places = [] - with open(filename, 'rb') as f: - countries = [] - cnt = struct.unpack('B', f.read(1))[0] - for i in range(cnt): - countries.append(struct.unpack('2s', f.read(2))[0].decode('ascii')) - regions = [] - cnt = struct.unpack(' 2: - q = '"{}"~"^({})$"'.format(t[0], '|'.join(t[1:])) - else: - q = '"{}"="{}"'.format(t[0], t[1]) - tag_str += '[' + q + ']' - tag_strs.append(tag_str) - - timeout = self.profile.get('overpass_timeout', 120) - query = '[out:xml]{};('.format('' if timeout is None else '[timeout:{}]'.format(timeout)) - for bbox in bboxes: - bbox_str = '' if bbox is None else '(' + ','.join([str(x) for x in bbox]) + ')' - for tag_str in tag_strs: - query += 'nwr' + tag_str + bbox_str + ';' - if self.ref is not None: - if not self.profile.get('bounded_update', False): - query += 'nwr["' + self.ref + '"];' - else: - for bbox in bboxes: - bbox_str = '' if bbox is None else '(' + ','.join( - [str(x) for x in bbox]) + 
')' - query += 'nwr["' + self.ref + '"]' + bbox_str + ';' - query += '); out meta qt center;' - return query - - def get_dataset_bbox(self): - """Plain iterates over the dataset and returns the bounding box - that encloses it.""" - padding = self.profile.get('bbox_padding', BBOX_PADDING) - bbox = [90.0, 180.0, -90.0, -180.0] - for p in self.dataset.values(): - bbox[0] = min(bbox[0], p.lat - padding) - bbox[1] = min(bbox[1], p.lon - padding) - bbox[2] = max(bbox[2], p.lat + padding) - bbox[3] = max(bbox[3], p.lon + padding) - return bbox - - def split_into_bboxes(self): - """ - Splits the dataset into multiple bboxes to lower load on the overpass api. - - Returns a list of tuples (minlat, minlon, maxlat, maxlon). - """ - if len(self.dataset) <= 1: - return [self.get_dataset_bbox()] - - # coord, alt coord, total w/h to the left/bottom, total w/h to the right/top - lons = sorted([[d.lon, d.lat, 0, 0] for d in self.dataset.values()]) - lats = sorted([[d.lat, d.lon, 0, 0] for d in self.dataset.values()]) - - def update_side_dimensions(ar): - """For each point, calculates the maximum and - minimum bound for all points left and right.""" - fwd_top = fwd_bottom = ar[0][1] - back_top = back_bottom = ar[-1][1] - for i in range(len(ar)): - fwd_top = max(fwd_top, ar[i][1]) - fwd_bottom = min(fwd_bottom, ar[i][1]) - ar[i][2] = fwd_top - fwd_bottom - back_top = max(back_top, ar[-i-1][1]) - back_bottom = min(back_bottom, ar[-i-1][1]) - ar[-i-1][3] = back_top - back_bottom - - def find_max_gap(ar, h): - """Select an interval between points, which would give - the maximum area if split there.""" - max_id = None - max_gap = 0 - for i in range(len(ar) - 1): - # "Extra" variables are for area to the left and right - # that would be freed after splitting. - extra_left = (ar[i][0]-ar[0][0]) * (h-ar[i][2]) - extra_right = (ar[-1][0]-ar[i+1][0]) * (h-ar[i+1][3]) - # Gap is the area of the column between points i and i+1 - # plus extra areas to the left and right. - gap = (ar[i+1][0] - ar[i][0]) * h + extra_left + extra_right - if gap > max_gap: - max_id = i - max_gap = gap - return max_id, max_gap - - def get_bbox(b, pad=0): - """Returns a list of [min_lat, min_lon, max_lat, max_lon] for a box.""" - return [b[2][0][0]-pad, b[3][0][0]-pad, b[2][-1][0]+pad, b[3][-1][0]+pad] - - def split(box, point_array, point_id): - """Split the box over axis point_array at point point_id...point_id+1. 
- Modifies the box in-place and returns a new box.""" - alt_array = 5 - point_array # 3->2, 2->3 - points = box[point_array][point_id+1:] - del box[point_array][point_id+1:] - alt = {True: [], False: []} # True means point is in new box - for p in box[alt_array]: - alt[(p[1], p[0]) >= (points[0][0], points[0][1])].append(p) - - new_box = [None] * 4 - new_box[point_array] = points - new_box[alt_array] = alt[True] - box[alt_array] = alt[False] - for i in range(2): - box[i] = box[i+2][-1][0] - box[i+2][0][0] - new_box[i] = new_box[i+2][-1][0] - new_box[i+2][0][0] - return new_box - - # height, width, lats, lons - max_bboxes = self.profile.get('max_request_boxes', 4) - boxes = [[lats[-1][0]-lats[0][0], lons[-1][0]-lons[0][0], lats, lons]] - initial_area = boxes[0][0] * boxes[0][1] - while len(boxes) < max_bboxes and len(boxes) <= len(self.dataset): - candidate_box = None - area = 0 - point_id = None - point_array = None - for box in boxes: - for ar in (2, 3): - # Find a box and an axis for splitting that would decrease the area the most - update_side_dimensions(box[ar]) - max_id, max_area = find_max_gap(box[ar], box[3-ar]) - if max_area > area: - area = max_area - candidate_box = box - point_id = max_id - point_array = ar - if area * 100 < initial_area: - # Stop splitting when the area decrease is less than 1% - break - logging.debug('Splitting bbox %s at %s %s..%s; area decrease %s%%', - get_bbox(candidate_box), - 'longs' if point_array == 3 else 'lats', - candidate_box[point_array][point_id][0], - candidate_box[point_array][point_id+1][0], - round(100*area/initial_area)) - boxes.append(split(candidate_box, point_array, point_id)) - - padding = self.profile.get('bbox_padding', BBOX_PADDING) - return [get_bbox(b, padding) for b in boxes] - - def get_categories(self, tags): - def match_query(tags, query): - for tag in query: - if len(tag) == 1: - return tag[0] in tags - else: - value = tags.get(tag[0], None) - if tag[1] is None or tag[1] == '': - return value is None - if value is None: - return False - found = False - for t2 in tag[1:]: - if t2[0] == '~': - m = re.search(t2[1:], value) - if not m: - return False - elif t2[0] == '!': - if t2[1:].lower() in value.lower(): - found = True - elif t2 == value: - found = True - if found: - break - if not found: - return False - return True - - def tags_to_query(tags): - return [(k, v) for k, v in tags.items()] - - result = set() - qualifies = self.profile.get('qualifies', args=tags) - if qualifies is not None: - if qualifies: - result.add(None) - return result - - # First check default query - query = self.profile.get('query', None) - if query is not None: - if isinstance(query, str): - result.add(None) - else: - if isinstance(query[0][0], str): - query = [query] - for q in query: - if match_query(tags, q): - result.add(None) - break - - # Then check each category if we got these - categories = self.profile.get('categories', {}) - for name, params in categories.items(): - if 'tags' not in params and 'query' not in params: - raise ValueError('No tags and query attributes for category "{}"'.format(name)) - if match_query(tags, params.get('query', tags_to_query(params.get('tags')))): - result.add(name) - - return result - - def download_osm(self): - """Constructs an Overpass API query and requests objects - to match from a server.""" - profile_bbox = self.profile.get('bbox', True) - if not profile_bbox: - bboxes = [None] - elif hasattr(profile_bbox, '__len__') and len(profile_bbox) == 4: - bboxes = [profile_bbox] - else: - bboxes = 
self.split_into_bboxes() - - query = self.construct_overpass_query(bboxes) - logging.debug('Overpass query: %s', query) - r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query}) - if r.encoding is None: - r.encoding = 'utf-8' - if r.status_code != 200: - logging.error('Failed to download data from Overpass API: %s', r.status_code) - if 'rate_limited' in r.text: - r = requests.get(OVERPASS_SERVER + 'status') - logging.warning('Seems like you are rate limited. API status:\n%s', r.text) - else: - logging.error('Error message: %s', r.text) - raise IOError() - if 'runtime error: ' in r.text: - m = re.search(r'runtime error: ([^<]+)', r.text) - error = 'unknown' if not m else m.group(1) - if 'Query timed out' in error: - logging.error( - 'Query timed out, try increasing the "overpass_timeout" profile variable') - else: - logging.error('Runtime error: %s', error) - raise IOError() - self.parse_osm(r.content) - - def parse_osm(self, fileobj): - """Parses an OSM XML file into the "osmdata" field. For ways and relations, - finds the center. Drops objects that do not match the overpass query tags - (see "check_against_profile_tags" method).""" - if isinstance(fileobj, bytes): - xml = etree.fromstring(fileobj) - else: - xml = etree.parse(fileobj).getroot() - nodes = {} - for nd in xml.findall('node'): - nodes[nd.get('id')] = (float(nd.get('lat')), float(nd.get('lon'))) - ways = {} - for way in xml.findall('way'): - center = way.find('center') - if center is not None: - ways[way.get('id')] = [float(center.get('lat')), float(center.get('lon'))] - else: - logging.debug('Way %s does not have a center', way.get('id')) - coord = [0, 0] - count = 0 - for nd in way.findall('nd'): - if nd.get('ref') in nodes: - count += 1 - for i in range(len(coord)): - coord[i] += nodes[nd.get('ref')][i] - ways[way.get('id')] = [coord[0] / count, coord[1] / count] - - # For calculating weight of OSM objects - weight_fn = self.profile.get_raw('weight') - max_distance = self.profile.get('max_distance', MAX_DISTANCE) - - for el in xml: - tags = {} - for tag in el.findall('tag'): - tags[tag.get('k')] = tag.get('v') - categories = self.get_categories(tags) - if categories is False or categories is None or len(categories) == 0: - continue - - if el.tag == 'node': - coord = nodes[el.get('id')] - members = None - elif el.tag == 'way': - coord = ways[el.get('id')] - members = [nd.get('ref') for nd in el.findall('nd')] - elif el.tag == 'relation': - center = el.find('center') - if center is not None: - coord = [float(center.get('lat')), float(center.get('lon'))] - else: - logging.debug('Relation %s does not have a center', el.get('id')) - coord = [0, 0] - count = 0 - for m in el.findall('member'): - if m.get('type') == 'node' and m.get('ref') in nodes: - count += 1 - for i in range(len(coord)): - coord[i] += nodes[m.get('ref')][i] - elif m.get('type') == 'way' and m.get('ref') in ways: - count += 1 - for i in range(len(coord)): - coord[i] += ways[m.get('ref')][i] - if count > 0: - coord = [coord[0] / count, coord[1] / count] - members = [ - (m.get('type'), m.get('ref'), m.get('role')) - for m in el.findall('member') - ] - else: - continue - if not coord or coord == [0, 0]: - continue - pt = OSMPoint( - el.tag, int(el.get('id')), int(el.get('version')), - coord[0], coord[1], tags, categories) - pt.members = members - if pt.is_poi(): - if callable(weight_fn): - weight = weight_fn(pt) - if weight: - pt.dist_offset = weight if abs(weight) > 3 else weight * max_distance - self.osmdata[pt.id] = pt - - def register_match(self, 
dataset_key, osmdata_key, keep=False, retag=None): - """Registers a match between an OSM point and a dataset point. - - Merges tags from an OSM Point and a dataset point, and add the result to the - self.matched list. - If dataset_key is None, deletes or retags the OSM point. - If osmdata_key is None, adds a new OSM point for the dataset point. - """ - def get_osm_key(k, osm_tags): - """Conflating contact: namespace.""" - if k in CONTACT_KEYS and k not in osm_tags and 'contact:'+k in osm_tags: - return 'contact:'+k - elif k.startswith('contact:') and k not in osm_tags and k[8:] in osm_tags: - return k[8:] - - # Now conflating lifecycle prefixes, only forward - if k in LIFECYCLE_KEYS and k not in osm_tags: - for prefix in LIFECYCLE_PREFIXES: - if prefix+':'+k in osm_tags: - return prefix+':'+k - return k - - def update_tags(tags, source, master_tags=None, retagging=False, audit=None): - """Updates tags dictionary with tags from source, - returns True is something was changed.""" - keep = set() - override = set() - changed = False - if source: - if audit: - keep = set(audit.get('keep', [])) - override = set(audit.get('override', [])) - for k, v in source.items(): - osm_key = get_osm_key(k, tags) - - if k in keep or osm_key in keep: - continue - if k in override or osm_key in override: - if not v and osm_key in tags: - del tags[osm_key] - changed = True - elif v and tags.get(osm_key, None) != v: - tags[osm_key] = v - changed = True - continue - - if osm_key not in tags or retagging or ( - tags[osm_key] != v and (master_tags and k in master_tags)): - if v is not None and len(v) > 0: - # Not setting addr:full when the object has addr:housenumber - if k == 'addr:full' and 'addr:housenumber' in tags: - continue - tags[osm_key] = v - changed = True - elif osm_key in tags and (v == '' or retagging): - del tags[osm_key] - changed = True - return changed - - def format_change(before, after, ref): - MARKER_COLORS = { - 'delete': '#ee2211', # deleting feature from OSM - 'create': '#11dd11', # creating a new node - 'update': '#0000ee', # changing tags on an existing feature - 'retag': '#660000', # cannot delete unmatched feature, changing tags - 'move': '#110055', # moving an existing node - } - marker_action = None - geometry = {'type': 'Point', 'coordinates': [after.lon, after.lat]} - props = { - 'osm_type': after.osm_type, - 'osm_id': after.osm_id, - 'action': after.action - } - if after.action in ('create', 'delete'): - # Red if deleted, green if added - marker_action = after.action - for k, v in after.tags.items(): - props['tags.{}'.format(k)] = v - if ref: - props['ref_id'] = ref.id - else: # modified - # Blue if updated from dataset, dark red if retagged, dark blue if moved - marker_action = 'update' if ref else 'retag' - if ref: - props['ref_id'] = ref.id - props['ref_distance'] = round(10 * ref.distance(before)) / 10.0 - props['ref_coords'] = [ref.lon, ref.lat] - if before.lon != after.lon or before.lat != after.lat: - # The object was moved - props['were_coords'] = [before.lon, before.lat] - marker_action = 'move' - # Find tags that were superseeded by OSM tags - for k, v in ref.tags.items(): - osm_key = get_osm_key(k, after.tags) - if osm_key not in after.tags or after.tags[osm_key] != v: - props['ref_unused_tags.{}'.format(osm_key)] = v - # Now compare old and new OSM tags - for k in set(after.tags.keys()).union(set(before.tags.keys())): - v0 = before.tags.get(k, None) - v1 = after.tags.get(k, None) - if v0 == v1: - props['tags.{}'.format(k)] = v0 - elif v0 is None: - 
props['tags_new.{}'.format(k)] = v1 - elif v1 is None: - props['tags_deleted.{}'.format(k)] = v0 - else: - props['tags_changed.{}'.format(k)] = '{} -> {}'.format(v0, v1) - props['marker-color'] = MARKER_COLORS[marker_action] - if ref and ref.remarks: - props['remarks'] = ref.remarks - if ref and ref.region: - props['region'] = ref.region - elif self.geocoder: - region, present = self.geocoder.find(after) - if not present: - return None - if region is not None: - props['region'] = region - return {'type': 'Feature', 'geometry': geometry, 'properties': props} - - max_distance = self.profile.get('max_distance', MAX_DISTANCE) - p = self.osmdata.pop(osmdata_key, None) - p0 = None if p is None else p.copy() - sp = self.dataset.pop(dataset_key, None) - audit = self.audit.get(sp.id if sp else '{}{}'.format(p.osm_type, p.osm_id), {}) - if audit.get('skip', False): - return - - if sp is not None: - if p is None: - p = OSMPoint('node', -1-len(self.matched), 1, sp.lat, sp.lon, sp.tags) - p.action = 'create' - else: - master_tags = set(self.profile.get('master_tags', [])) - if update_tags(p.tags, sp.tags, master_tags, audit=audit): - p.action = 'modify' - # Move a node if it is too far from the dataset point - if not p.is_area() and sp.distance(p) > max_distance: - p.lat = sp.lat - p.lon = sp.lon - p.action = 'modify' - if self.add_source_tag: - if 'source' in p.tags: - if self.source not in p.tags['source']: - p.tags['source'] = ';'.join([p.tags['source'], self.source]) - else: - p.tags['source'] = self.source - if self.ref is not None: - p.tags[self.ref] = sp.id - if 'fixme' in audit and audit['fixme'] != p.tags.get('fixme'): - p.tags['fixme'] = audit['fixme'] - if p.action is None: - p.action = 'modify' - if 'move' in audit and not p.is_area(): - if p0 and audit['move'] == 'osm': - p.lat = p0.lat - p.lon = p0.lon - elif audit['move'] == 'dataset': - p.lat = sp.lat - p.lon = sp.lon - elif len(audit['move']) == 2: - p.lat = audit['move'][1] - p.lon = audit['move'][0] - if p.action is None and p0.distance(p) > 0.1: - p.action = 'modify' - if p.action != 'create': - self.matches.append([sp.id, p.osm_type, p.osm_id, p.lat, p.lon, p.action]) - else: - self.matches.append([sp.id, '', '', p.lat, p.lon, p.action]) - elif keep or p.is_area(): - if update_tags(p.tags, retag, retagging=True, audit=audit): - p.action = 'modify' - else: - p.action = 'delete' - - if p.action is not None: - change = format_change(p0, p, sp) - if change is not None: - self.matched.append(p) - self.changes.append(change) - - def match_dataset_points_smart(self): - """Smart matching for dataset <-> OSM points. - - We find a shortest link between a dataset and an OSM point. - Then we match these and remove both from dicts. - Then find another link and so on, until the length of a link - becomes larger than "max_distance". - - Currently the worst case complexity is around O(n^2*log^2 n). - But given the small number of objects to match, and that - the average case complexity is ~O(n*log^2 n), this is fine. 
- """ - def search_nn_fix(kd, point): - nearest = kd.search_knn(point, self.profile.get('nearest_points', 10)) - if not nearest: - return None, None - match_func = self.profile.get_raw('matches') - if match_func: - nearest = [p for p in nearest if match_func(p[0].data.tags, point.tags)] - if not nearest: - return None, None - nearest = [(n[0], n[0].data.distance(point)) - for n in nearest if point.category in n[0].data.categories] - return sorted(nearest, key=lambda kv: kv[1])[0] - - if not self.osmdata: - return - max_distance = self.profile.get('max_distance', MAX_DISTANCE) - osm_kd = kdtree.create(list(self.osmdata.values())) - count_matched = 0 - - # Process overridden features first - for override, osm_find in self.profile.get('override', {}).items(): - override = str(override) - if override not in self.dataset: - continue - found = None - if len(osm_find) > 2 and osm_find[0] in 'nwr' and osm_find[1].isdigit(): - if osm_find in self.osmdata: - found = self.osmdata[osm_find] - # Search nearest 100 points - nearest = osm_kd.search_knn(self.dataset[override], 100) - if nearest: - for p in nearest: - if 'name' in p[0].data.tags and p[0].data.tags['name'] == osm_find: - found = p[0].data - if found: - count_matched += 1 - self.register_match(override, found.id) - osm_kd = osm_kd.remove(found) - - # Prepare distance list: match OSM points to each of the dataset points - dist = [] - for sp, v in self.dataset.items(): - osm_point, distance = search_nn_fix(osm_kd, v) - if osm_point is not None and distance <= max_distance: - dist.append((distance, sp, osm_point.data)) - - # The main matching loop: sort dist list if needed, - # register the closes match, update the list - needs_sorting = True - while dist: - if needs_sorting: - dist.sort(key=lambda x: x[0]) - needs_sorting = False - count_matched += 1 - osm_point = dist[0][2] - self.register_match(dist[0][1], osm_point.id) - osm_kd = osm_kd.remove(osm_point) - del dist[0] - for i in reversed(range(len(dist))): - if dist[i][2] == osm_point: - nearest, distance = search_nn_fix(osm_kd, self.dataset[dist[i][1]]) - if nearest and distance <= max_distance: - dist[i] = (distance, dist[i][1], nearest.data) - needs_sorting = i == 0 or distance < dist[0][0] - else: - del dist[i] - needs_sorting = i == 0 - logging.info('Matched %s points', count_matched) - - def match(self): - """Matches each osm object with a SourcePoint, or marks it as obsolete. 
- The resulting list of OSM Points are written to the "matched" field.""" - find_ref = self.profile.get_raw('find_ref') - if self.ref is not None or callable(find_ref): - # First match all objects with ref:whatever tag set - count_ref = 0 - for k, p in list(self.osmdata.items()): - ref = None - if self.ref and self.ref in p.tags: - ref = p.tags[self.ref] - elif find_ref: - ref = find_ref(p.tags) - if ref is not None: - if ref in self.dataset: - count_ref += 1 - self.register_match(ref, k) - logging.info('Updated %s OSM objects with %s tag', count_ref, self.ref) - - # Add points for which audit specifically mentioned creating - count_created = 0 - for ref, a in self.audit.items(): - if ref in self.dataset: - if a.get('create', None): - count_created += 1 - self.register_match(ref, None) - elif a.get('skip', None): - # If we skip an object here, it would affect the conflation order - pass - if count_created > 0: - logging.info('Created %s audit-overridden dataset points', count_created) - - # Prepare exclusive groups dict - exclusive_groups = defaultdict(set) - for p, v in self.dataset.items(): - if v.exclusive_group is not None: - exclusive_groups[v.exclusive_group].add(p) - - # Then find matches for unmatched dataset points - self.match_dataset_points_smart() - - # Remove unmatched duplicates - count_duplicates = 0 - for ids in exclusive_groups.values(): - found = False - for p in ids: - if p not in self.dataset: - found = True - break - for p in ids: - if p in self.dataset: - if found: - count_duplicates += 1 - del self.dataset[p] - else: - # Leave one element when not matched any - found = True - if count_duplicates > 0: - logging.info('Removed %s unmatched duplicates', count_duplicates) - - # Add unmatched dataset points - logging.info('Adding %s unmatched dataset points', len(self.dataset)) - for k in sorted(list(self.dataset.keys())): - self.register_match(k, None) - - # And finally delete some or all of the remaining osm objects - if len(self.osmdata) > 0: - count_deleted = 0 - count_retagged = 0 - delete_unmatched = self.profile.get('delete_unmatched', False) - retag = self.profile.get('tag_unmatched') - for k, p in list(self.osmdata.items()): - ref = None - if self.ref and self.ref in p.tags: - ref = p.tags[self.ref] - elif find_ref: - ref = find_ref(p.tags) - if ref is not None: - # When ref:whatever is present, we can delete that object safely - count_deleted += 1 - self.register_match(None, k, retag=retag) - elif delete_unmatched or retag: - if not delete_unmatched or p.is_area(): - count_retagged += 1 - else: - count_deleted += 1 - self.register_match(None, k, keep=not delete_unmatched, retag=retag) - logging.info( - 'Deleted %s and retagged %s unmatched objects from OSM', - count_deleted, count_retagged) - - def backup_osm(self): - """Writes OSM data as-is.""" - osm = etree.Element('osm', version='0.6', generator=TITLE) - for osmel in self.osmdata.values(): - el = osmel.to_xml() - if osmel.osm_type != 'node': - etree.SubElement(el, 'center', lat=str(osmel.lat), lon=str(osmel.lon)) - osm.append(el) - return ("\n" + - etree.tostring(osm, encoding='utf-8').decode('utf-8')) - - def to_osc(self, josm=False): - """Returns a string with osmChange or JOSM XML.""" - osc = etree.Element('osm' if josm else 'osmChange', version='0.6', generator=TITLE) - if josm: - neg_id = -1 - changeset = etree.SubElement(osc, 'changeset') - ch_tags = { - 'source': self.source, - 'created_by': TITLE, - 'type': 'import' - } - for k, v in ch_tags.items(): - etree.SubElement(changeset, 'tag', k=k, v=v) - 
for osmel in self.matched: - if osmel.action is not None: - el = osmel.to_xml() - if josm: - if osmel.action == 'create': - el.set('id', str(neg_id)) - neg_id -= 1 - else: - el.set('action', osmel.action) - osc.append(el) - else: - etree.SubElement(osc, osmel.action).append(el) - return ("\n" + - etree.tostring(osc, encoding='utf-8').decode('utf-8')) - - -def check_moveability(changes): - to_check = [x for x in changes if x['properties']['osm_type'] == 'node' and - x['properties']['action'] == 'modify'] - logging.info('Checking moveability of %s modified nodes', len(to_check)) - for c in to_check: - p = c['properties'] - p['can_move'] = False - r = requests.get('{}node/{}/ways'.format(OSM_API_SERVER, p['osm_id'])) - if r.status_code == 200: - xml = etree.fromstring(r.content) - p['can_move'] = xml.find('way') is None - - -def read_dataset(profile, fileobj): - """A helper function to call a "dataset" function in the profile. - If the fileobj is not specified, tries to download a dataset from - an URL specified in "download_url" profile variable.""" - if not fileobj: - url = profile.get('download_url') - if url is None: - logging.error('No download_url specified in the profile, ' - 'please provide a dataset file with --source') - return None - r = requests.get(url) - if r.status_code != 200: - logging.error('Could not download source data: %s %s', r.status_code, r.text) - return None - if len(r.content) == 0: - logging.error('Empty response from %s', url) - return None - fileobj = BytesIO(r.content) - if not profile.has('dataset'): - # The default option is to parse the source as a JSON - try: - data = [] - reader = codecs.getreader('utf-8') - json_src = json.load(reader(fileobj)) - if 'features' in json_src: - # Parse GeoJSON - for item in json_src['features']: - if item['geometry'].get('type') != 'Point' or 'properties' not in item: - continue - # Get the identifier from "id", "ref", "ref*" - iid = item['properties'].get('id', item['properties'].get('ref')) - if not iid: - for k, v in item['properties'].items(): - if k.startswith('ref'): - iid = v - break - if not iid: - continue - data.append(SourcePoint( - iid, - item['geometry']['coordinates'][1], - item['geometry']['coordinates'][0], - {k: v for k, v in item['properties'].items() if k != 'id'})) - else: - for item in json_src: - data.append(SourcePoint(item['id'], item['lat'], item['lon'], item['tags'])) - return data - except Exception: - logging.error('Failed to parse the source as a JSON') - return list(profile.get( - 'dataset', args=(fileobj,), - required='returns a list of SourcePoints with the dataset')) - - -def add_categories_to_dataset(profile, dataset): - categories = profile.get('categories') - if not categories: - return - tag = profile.get('category_tag') - other = categories.get('other', {}) - for d in dataset: - if tag and tag in d.tags: - d.category = d.tags[tag] - del d.tags[tag] - if d.category: - cat_tags = categories.get(d.category, other).get('tags', None) - if cat_tags: - d.tags.update(cat_tags) - - -def transform_dataset(profile, dataset): - """Transforms tags in the dataset using the "transform" method in the profile - or the instructions in that field in string or dict form.""" - transform = profile.get_raw('transform') - if not transform: - return - if callable(transform): - for d in dataset: - transform(d.tags) - return - if isinstance(transform, str): - # Convert string of "key=value|rule1|rule2" lines to a dict - lines = [line.split('=', 1) for line in transform.splitlines()] - transform = {l[0].strip(): 
l[1].strip() for l in lines} - if not transform or not isinstance(transform, dict): - return - for key in transform: - if isinstance(transform[key], str): - transform[key] = [x.strip() for x in transform[key].split('|')] - - for d in dataset: - for key, rules in transform.items(): - if not rules: - continue - value = None - if callable(rules): - # The value can be generated - value = rules(None if key not in d.tags else d.tags[key]) - if value is None and key in d.tags: - del d.tags[key] - elif not rules[0]: - # Use the value of the tag - if key in d.tags: - value = d.tags[key] - elif not isinstance(rules[0], str): - # If the value is not a string, use it - value = str(rules[0]) - elif rules[0][0] == '.': - # Use the value from another tag - alt_key = rules[0][1:] - if alt_key in d.tags: - value = d.tags[alt_key] - elif rules[0][0] == '>': - # Replace the key - if key in d.tags: - d.tags[rules[0][1:]] = d.tags[key] - del d.tags[key] - elif rules[0][0] == '<': - # Replace the key, the same but backwards - alt_key = rules[0][1:] - if alt_key in d.tags: - d.tags[key] = d.tags[alt_key] - del d.tags[alt_key] - elif rules[0] == '-': - # Delete the tag - if key in d.tags: - del d.tags[key] - else: - # Take the value as written - value = rules[0] - if value is None: - continue - if isinstance(rules, list): - for rule in rules[1:]: - if rule == 'lower': - value = value.lower() - d.tags[key] = value - - -def check_dataset_for_duplicates(profile, dataset, print_all=False): - # First checking for duplicate ids and collecting tags with varying values - ids = set() - tags = {} - found_duplicate_ids = False - for d in dataset: - if d.id in ids: - found_duplicate_ids = True - logging.error('Duplicate id {} in the dataset'.format(d.id)) - ids.add(d.id) - for k, v in d.tags.items(): - if k not in tags: - tags[k] = v - elif tags[k] != '---' and tags[k] != v: - tags[k] = '---' - - # And then for near-duplicate points with similar tags - max_distance = profile.get('max_distance', MAX_DISTANCE) - uncond_distance = profile.get('duplicate_distance', 1) - diff_tags = [k for k in tags if tags[k] == '---'] - kd = kdtree.create(list(dataset)) - duplicates = set() - group = 0 - for d in dataset: - if d.id in duplicates: - continue - group += 1 - dups = kd.search_knn(d, 2) # The first one will be equal to d - if len(dups) < 2 or dups[1][0].data.distance(d) > max_distance: - continue - for alt, _ in kd.search_knn(d, 20): - dist = alt.data.distance(d) - if alt.data.id != d.id and dist <= max_distance: - tags_differ = 0 - if dist > uncond_distance: - for k in diff_tags: - if alt.data.tags.get(k) != d.tags.get(k): - tags_differ += 1 - if tags_differ <= len(diff_tags) / 3: - duplicates.add(alt.data.id) - d.exclusive_group = group - alt.data.exclusive_group = group - if print_all or len(duplicates) <= 5: - is_duplicate = tags_differ <= 1 - logging.error('Dataset points %s: %s and %s', - 'duplicate each other' if is_duplicate else 'are too similar', - d.id, alt.data.id) - if duplicates: - logging.error('Found %s duplicates in the dataset', len(duplicates)) - if found_duplicate_ids: - raise KeyError('Cannot continue with duplicate ids') - - -def add_regions(dataset, geocoder): - if not geocoder.enabled: - return - if geocoder.filter: - logging.info('Geocoding and filtering points') - else: - logging.info('Geocoding points') - for i in reversed(range(len(dataset))): - region, present = geocoder.find(dataset[i]) - if not present: - del dataset[i] - else: - dataset[i].region = region +from .geocoder import Geocoder +from .profile 
import Profile +from .conflator import OsmConflator, TITLE +from .dataset import ( + read_dataset, + add_categories_to_dataset, + transform_dataset, + check_dataset_for_duplicates, + add_regions, +) def write_for_filter(profile, dataset, f): @@ -1394,15 +125,13 @@ def run(profile=None): if not profile: logging.debug('Loading profile %s', options.profile) - global param - param = options.param - profile = Profile(profile or options.profile) + profile = Profile(profile or options.profile, options.param) audit = None if options.audit: audit = json.load(options.audit) - geocoder = Geocoder(profile) + geocoder = Geocoder(profile.get_raw('regions')) if options.regions: geocoder.set_filter(options.regions) elif audit and audit.get('regions'): @@ -1423,12 +152,10 @@ def run(profile=None): logging.info('Prepared data for filtering, exitting') return - if options.alt_overpass: - global OVERPASS_SERVER - OVERPASS_SERVER = ALT_OVERPASS_SERVER - conflator = OsmConflator(profile, dataset, audit) conflator.geocoder = geocoder + if options.alt_overpass: + conflator.set_overpass('alt') if options.osm and os.path.exists(options.osm): with open(options.osm, 'r') as f: conflator.parse_osm(f) @@ -1447,7 +174,7 @@ def run(profile=None): if options.changes: if options.check_move: - check_moveability(conflator.changes) + conflator.check_moveability() fc = {'type': 'FeatureCollection', 'features': conflator.changes} json.dump(fc, options.changes, ensure_ascii=False, sort_keys=True, indent=1) @@ -1458,7 +185,3 @@ def run(profile=None): writer.writerow(row) logging.info('Done') - - -if __name__ == '__main__': - run() diff --git a/conflate/conflator.py b/conflate/conflator.py new file mode 100644 index 0000000..f099555 --- /dev/null +++ b/conflate/conflator.py @@ -0,0 +1,441 @@ +import logging +import kdtree +from collections import defaultdict +from .data import OSMPoint +from .version import __version__ +from .osm import OsmDownloader, check_moveability +from . import etree + + +TITLE = 'OSM Conflator ' + __version__ +CONTACT_KEYS = set(('phone', 'website', 'email', 'fax', 'facebook', 'twitter', 'instagram')) +LIFECYCLE_KEYS = set(('amenity', 'shop', 'tourism', 'craft', 'office')) +LIFECYCLE_PREFIXES = ('proposed', 'construction', 'disused', 'abandoned', 'was', 'removed') + + +class OsmConflator: + """The main class for the conflator. + + It receives a dataset, after which one must call either + "download_osm" or "parse_osm" methods. Then it is ready to match: + call the "match" method and get results with "to_osc". + """ + def __init__(self, profile, dataset, audit=None): + self.dataset = {p.id: p for p in dataset} + self.audit = audit or {} + self.osmdata = {} + self.matched = [] + self.changes = [] + self.matches = [] + self.profile = profile + self.geocoder = None + self.downloader = OsmDownloader(profile) + self.source = self.profile.get( + 'source', required='value of "source" tag for uploaded OSM objects') + self.add_source_tag = self.profile.get('add_source', False) + if self.profile.get('no_dataset_id', False): + self.ref = None + else: + self.ref = 'ref:' + self.profile.get( + 'dataset_id', required='A fairly unique id of the dataset to query OSM') + + def download_osm(self): + self.osmdata = self.downloader.download(self.dataset.values()) + + def parse_osm(self, fileobj): + self.osmdata = self.downloader.parse_xml(fileobj) + + def register_match(self, dataset_key, osmdata_key, keep=False, retag=None): + """Registers a match between an OSM point and a dataset point. 
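+        When dataset_key is None, the keep flag preserves the OSM point
+        (it is retagged instead of deleted), and retag is the dict of
+        tags to apply to such unmatched objects.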
+ + Merges tags from an OSM Point and a dataset point, and add the result to the + self.matched list. + If dataset_key is None, deletes or retags the OSM point. + If osmdata_key is None, adds a new OSM point for the dataset point. + """ + def get_osm_key(k, osm_tags): + """Conflating contact: namespace.""" + if k in CONTACT_KEYS and k not in osm_tags and 'contact:'+k in osm_tags: + return 'contact:'+k + elif k.startswith('contact:') and k not in osm_tags and k[8:] in osm_tags: + return k[8:] + + # Now conflating lifecycle prefixes, only forward + if k in LIFECYCLE_KEYS and k not in osm_tags: + for prefix in LIFECYCLE_PREFIXES: + if prefix+':'+k in osm_tags: + return prefix+':'+k + return k + + def update_tags(tags, source, master_tags=None, retagging=False, audit=None): + """Updates tags dictionary with tags from source, + returns True is something was changed.""" + keep = set() + override = set() + changed = False + if source: + if audit: + keep = set(audit.get('keep', [])) + override = set(audit.get('override', [])) + for k, v in source.items(): + osm_key = get_osm_key(k, tags) + + if k in keep or osm_key in keep: + continue + if k in override or osm_key in override: + if not v and osm_key in tags: + del tags[osm_key] + changed = True + elif v and tags.get(osm_key, None) != v: + tags[osm_key] = v + changed = True + continue + + if osm_key not in tags or retagging or ( + tags[osm_key] != v and (master_tags and k in master_tags)): + if v is not None and len(v) > 0: + # Not setting addr:full when the object has addr:housenumber + if k == 'addr:full' and 'addr:housenumber' in tags: + continue + tags[osm_key] = v + changed = True + elif osm_key in tags and (v == '' or retagging): + del tags[osm_key] + changed = True + return changed + + def format_change(before, after, ref): + MARKER_COLORS = { + 'delete': '#ee2211', # deleting feature from OSM + 'create': '#11dd11', # creating a new node + 'update': '#0000ee', # changing tags on an existing feature + 'retag': '#660000', # cannot delete unmatched feature, changing tags + 'move': '#110055', # moving an existing node + } + marker_action = None + geometry = {'type': 'Point', 'coordinates': [after.lon, after.lat]} + props = { + 'osm_type': after.osm_type, + 'osm_id': after.osm_id, + 'action': after.action + } + if after.action in ('create', 'delete'): + # Red if deleted, green if added + marker_action = after.action + for k, v in after.tags.items(): + props['tags.{}'.format(k)] = v + if ref: + props['ref_id'] = ref.id + else: # modified + # Blue if updated from dataset, dark red if retagged, dark blue if moved + marker_action = 'update' if ref else 'retag' + if ref: + props['ref_id'] = ref.id + props['ref_distance'] = round(10 * ref.distance(before)) / 10.0 + props['ref_coords'] = [ref.lon, ref.lat] + if before.lon != after.lon or before.lat != after.lat: + # The object was moved + props['were_coords'] = [before.lon, before.lat] + marker_action = 'move' + # Find tags that were superseeded by OSM tags + for k, v in ref.tags.items(): + osm_key = get_osm_key(k, after.tags) + if osm_key not in after.tags or after.tags[osm_key] != v: + props['ref_unused_tags.{}'.format(osm_key)] = v + # Now compare old and new OSM tags + for k in set(after.tags.keys()).union(set(before.tags.keys())): + v0 = before.tags.get(k, None) + v1 = after.tags.get(k, None) + if v0 == v1: + props['tags.{}'.format(k)] = v0 + elif v0 is None: + props['tags_new.{}'.format(k)] = v1 + elif v1 is None: + props['tags_deleted.{}'.format(k)] = v0 + else: + 
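+                        # Key present in both versions with different
+                        # values: report the change as "old -> new".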
props['tags_changed.{}'.format(k)] = '{} -> {}'.format(v0, v1) + props['marker-color'] = MARKER_COLORS[marker_action] + if ref and ref.remarks: + props['remarks'] = ref.remarks + if ref and ref.region: + props['region'] = ref.region + elif self.geocoder: + region, present = self.geocoder.find(after) + if not present: + return None + if region is not None: + props['region'] = region + return {'type': 'Feature', 'geometry': geometry, 'properties': props} + + p = self.osmdata.pop(osmdata_key, None) + p0 = None if p is None else p.copy() + sp = self.dataset.pop(dataset_key, None) + audit = self.audit.get(sp.id if sp else '{}{}'.format(p.osm_type, p.osm_id), {}) + if audit.get('skip', False): + return + + if sp is not None: + if p is None: + p = OSMPoint('node', -1-len(self.matched), 1, sp.lat, sp.lon, sp.tags) + p.action = 'create' + else: + master_tags = set(self.profile.get('master_tags', [])) + if update_tags(p.tags, sp.tags, master_tags, audit=audit): + p.action = 'modify' + # Move a node if it is too far from the dataset point + if not p.is_area() and sp.distance(p) > self.profile.max_distance: + p.lat = sp.lat + p.lon = sp.lon + p.action = 'modify' + if self.add_source_tag: + if 'source' in p.tags: + if self.source not in p.tags['source']: + p.tags['source'] = ';'.join([p.tags['source'], self.source]) + else: + p.tags['source'] = self.source + if self.ref is not None: + p.tags[self.ref] = sp.id + if 'fixme' in audit and audit['fixme'] != p.tags.get('fixme'): + p.tags['fixme'] = audit['fixme'] + if p.action is None: + p.action = 'modify' + if 'move' in audit and not p.is_area(): + if p0 and audit['move'] == 'osm': + p.lat = p0.lat + p.lon = p0.lon + elif audit['move'] == 'dataset': + p.lat = sp.lat + p.lon = sp.lon + elif len(audit['move']) == 2: + p.lat = audit['move'][1] + p.lon = audit['move'][0] + if p.action is None and p0.distance(p) > 0.1: + p.action = 'modify' + if p.action != 'create': + self.matches.append([sp.id, p.osm_type, p.osm_id, p.lat, p.lon, p.action]) + else: + self.matches.append([sp.id, '', '', p.lat, p.lon, p.action]) + elif keep or p.is_area(): + if update_tags(p.tags, retag, retagging=True, audit=audit): + p.action = 'modify' + else: + p.action = 'delete' + + if p.action is not None: + change = format_change(p0, p, sp) + if change is not None: + self.matched.append(p) + self.changes.append(change) + + def match_dataset_points_smart(self): + """Smart matching for dataset <-> OSM points. + + We find a shortest link between a dataset and an OSM point. + Then we match these and remove both from dicts. + Then find another link and so on, until the length of a link + becomes larger than "max_distance". + + Currently the worst case complexity is around O(n^2*log^2 n). + But given the small number of objects to match, and that + the average case complexity is ~O(n*log^2 n), this is fine. 
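+
+        Roughly, with max_distance = 100 m: given dataset points a, b
+        and OSM points x, y with links (a,x) = 20, (b,x) = 30 and
+        (b,y) = 90, the pair a-x is fixed first as the shortest link;
+        b is then re-matched against the remaining OSM points and ends
+        up paired with y at 90 m.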
+ """ + def search_nn_fix(kd, point): + nearest = kd.search_knn(point, self.profile.get('nearest_points', 10)) + if not nearest: + return None, None + match_func = self.profile.get_raw('matches') + if match_func: + nearest = [p for p in nearest if match_func(p[0].data.tags, point.tags)] + if not nearest: + return None, None + nearest = [(n[0], n[0].data.distance(point)) + for n in nearest if point.category in n[0].data.categories] + return sorted(nearest, key=lambda kv: kv[1])[0] + + if not self.osmdata: + return + osm_kd = kdtree.create(list(self.osmdata.values())) + count_matched = 0 + + # Process overridden features first + for override, osm_find in self.profile.get('override', {}).items(): + override = str(override) + if override not in self.dataset: + continue + found = None + if len(osm_find) > 2 and osm_find[0] in 'nwr' and osm_find[1].isdigit(): + if osm_find in self.osmdata: + found = self.osmdata[osm_find] + # Search nearest 100 points + nearest = osm_kd.search_knn(self.dataset[override], 100) + if nearest: + for p in nearest: + if 'name' in p[0].data.tags and p[0].data.tags['name'] == osm_find: + found = p[0].data + if found: + count_matched += 1 + self.register_match(override, found.id) + osm_kd = osm_kd.remove(found) + + # Prepare distance list: match OSM points to each of the dataset points + dist = [] + for sp, v in self.dataset.items(): + osm_point, distance = search_nn_fix(osm_kd, v) + if osm_point is not None and distance <= self.profile.max_distance: + dist.append((distance, sp, osm_point.data)) + + # The main matching loop: sort dist list if needed, + # register the closes match, update the list + needs_sorting = True + while dist: + if needs_sorting: + dist.sort(key=lambda x: x[0]) + needs_sorting = False + count_matched += 1 + osm_point = dist[0][2] + self.register_match(dist[0][1], osm_point.id) + osm_kd = osm_kd.remove(osm_point) + del dist[0] + for i in reversed(range(len(dist))): + if dist[i][2] == osm_point: + nearest, distance = search_nn_fix(osm_kd, self.dataset[dist[i][1]]) + if nearest and distance <= self.profile.max_distance: + dist[i] = (distance, dist[i][1], nearest.data) + needs_sorting = i == 0 or distance < dist[0][0] + else: + del dist[i] + needs_sorting = i == 0 + logging.info('Matched %s points', count_matched) + + def match(self): + """Matches each osm object with a SourcePoint, or marks it as obsolete. 
+ The resulting list of OSM Points are written to the "matched" field.""" + find_ref = self.profile.get_raw('find_ref') + if self.ref is not None or callable(find_ref): + # First match all objects with ref:whatever tag set + count_ref = 0 + for k, p in list(self.osmdata.items()): + ref = None + if self.ref and self.ref in p.tags: + ref = p.tags[self.ref] + elif find_ref: + ref = find_ref(p.tags) + if ref is not None: + if ref in self.dataset: + count_ref += 1 + self.register_match(ref, k) + logging.info('Updated %s OSM objects with %s tag', count_ref, self.ref) + + # Add points for which audit specifically mentioned creating + count_created = 0 + for ref, a in self.audit.items(): + if ref in self.dataset: + if a.get('create', None): + count_created += 1 + self.register_match(ref, None) + elif a.get('skip', None): + # If we skip an object here, it would affect the conflation order + pass + if count_created > 0: + logging.info('Created %s audit-overridden dataset points', count_created) + + # Prepare exclusive groups dict + exclusive_groups = defaultdict(set) + for p, v in self.dataset.items(): + if v.exclusive_group is not None: + exclusive_groups[v.exclusive_group].add(p) + + # Then find matches for unmatched dataset points + self.match_dataset_points_smart() + + # Remove unmatched duplicates + count_duplicates = 0 + for ids in exclusive_groups.values(): + found = False + for p in ids: + if p not in self.dataset: + found = True + break + for p in ids: + if p in self.dataset: + if found: + count_duplicates += 1 + del self.dataset[p] + else: + # Leave one element when not matched any + found = True + if count_duplicates > 0: + logging.info('Removed %s unmatched duplicates', count_duplicates) + + # Add unmatched dataset points + logging.info('Adding %s unmatched dataset points', len(self.dataset)) + for k in sorted(list(self.dataset.keys())): + self.register_match(k, None) + + # And finally delete some or all of the remaining osm objects + if len(self.osmdata) > 0: + count_deleted = 0 + count_retagged = 0 + delete_unmatched = self.profile.get('delete_unmatched', False) + retag = self.profile.get('tag_unmatched') + for k, p in list(self.osmdata.items()): + ref = None + if self.ref and self.ref in p.tags: + ref = p.tags[self.ref] + elif find_ref: + ref = find_ref(p.tags) + if ref is not None: + # When ref:whatever is present, we can delete that object safely + count_deleted += 1 + self.register_match(None, k, retag=retag) + elif delete_unmatched or retag: + if not delete_unmatched or p.is_area(): + count_retagged += 1 + else: + count_deleted += 1 + self.register_match(None, k, keep=not delete_unmatched, retag=retag) + logging.info( + 'Deleted %s and retagged %s unmatched objects from OSM', + count_deleted, count_retagged) + + def backup_osm(self): + """Writes OSM data as-is.""" + osm = etree.Element('osm', version='0.6', generator=TITLE) + for osmel in self.osmdata.values(): + el = osmel.to_xml() + if osmel.osm_type != 'node': + etree.SubElement(el, 'center', lat=str(osmel.lat), lon=str(osmel.lon)) + osm.append(el) + return ("\n" + + etree.tostring(osm, encoding='utf-8').decode('utf-8')) + + def to_osc(self, josm=False): + """Returns a string with osmChange or JOSM XML.""" + osc = etree.Element('osm' if josm else 'osmChange', version='0.6', generator=TITLE) + if josm: + neg_id = -1 + changeset = etree.SubElement(osc, 'changeset') + ch_tags = { + 'source': self.source, + 'created_by': TITLE, + 'type': 'import' + } + for k, v in ch_tags.items(): + etree.SubElement(changeset, 'tag', k=k, v=v) + 
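+        # Each object is serialized in one of two shapes, e.g. for a
+        # modified node:
+        #   osmChange: <modify><node id="1" ... /></modify>
+        #   JOSM:      <node id="1" action="modify" ... />
+        # JOSM files have no <create> wrapper, so newly created objects
+        # get a fresh negative id instead of a wrapper element.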
for osmel in self.matched: + if osmel.action is not None: + el = osmel.to_xml() + if josm: + if osmel.action == 'create': + el.set('id', str(neg_id)) + neg_id -= 1 + else: + el.set('action', osmel.action) + osc.append(el) + else: + etree.SubElement(osc, osmel.action).append(el) + return ("\n" + + etree.tostring(osc, encoding='utf-8').decode('utf-8')) + + def check_moveability(self): + check_moveability(self.changes) diff --git a/conflate/data.py b/conflate/data.py new file mode 100644 index 0000000..73d5964 --- /dev/null +++ b/conflate/data.py @@ -0,0 +1,104 @@ +import math +from . import etree + + +class SourcePoint: + """A common class for points. Has an id, latitude and longitude, + and a dict of tags. Remarks are optional for reviewers hints only.""" + def __init__(self, pid, lat, lon, tags=None, category=None, remarks=None, region=None): + self.id = str(pid) + self.lat = lat + self.lon = lon + self.tags = {} if tags is None else { + k.lower(): str(v).strip() for k, v in tags.items() if v is not None} + self.category = category + self.dist_offset = 0 + self.remarks = remarks + self.region = region + self.exclusive_group = None + + def distance(self, other): + """Calculate distance in meters.""" + dx = math.radians(self.lon - other.lon) * math.cos(0.5 * math.radians(self.lat + other.lat)) + dy = math.radians(self.lat - other.lat) + return 6378137 * math.sqrt(dx*dx + dy*dy) - self.dist_offset + + def __len__(self): + return 2 + + def __getitem__(self, i): + if i == 0: + return self.lat + elif i == 1: + return self.lon + else: + raise ValueError('A SourcePoint has only lat and lon in a list') + + def __eq__(self, other): + return self.id == other.id + + def __hash__(self): + return hash(self.id) + + def __repr__(self): + return 'SourcePoint({}, {}, {}, offset={}, tags={})'.format( + self.id, self.lat, self.lon, self.dist_offset, self.tags) + + +class OSMPoint(SourcePoint): + """An OSM points is a SourcePoint with a few extra fields. + Namely, version, members (for ways and relations), and an action. + The id is compound and created from object type and object id.""" + def __init__(self, ptype, pid, version, lat, lon, tags=None, categories=None): + super().__init__('{}{}'.format(ptype[0], pid), lat, lon, tags) + self.tags = {k: v for k, v in self.tags.items() if v is not None and len(v) > 0} + self.osm_type = ptype + self.osm_id = pid + self.version = version + self.members = None + self.action = None + self.categories = categories or set() + self.remarks = None + + def copy(self): + """Returns a copy of this object, except for members field.""" + c = OSMPoint(self.osm_type, self.osm_id, self.version, self.lat, self.lon, self.tags.copy()) + c.action = self.action + c.remarks = self.remarks + c.categories = self.categories.copy() + return c + + def is_area(self): + return self.osm_type != 'node' + + def is_poi(self): + if self.osm_type == 'node': + return True + if self.osm_type == 'way' and len(self.members) > 2: + return self.members[0] == self.members[-1] + if self.osm_type == 'relation' and len(self.members) > 0: + return self.tags.get('type', None) == 'multipolygon' + return False + + def to_xml(self): + """Produces an XML out of the point data. 
+
+class OSMPoint(SourcePoint):
+    """An OSMPoint is a SourcePoint with a few extra fields.
+    Namely, version, members (for ways and relations), and an action.
+    The id is compound and created from the object type and the object id."""
+    def __init__(self, ptype, pid, version, lat, lon, tags=None, categories=None):
+        super().__init__('{}{}'.format(ptype[0], pid), lat, lon, tags)
+        self.tags = {k: v for k, v in self.tags.items() if v is not None and len(v) > 0}
+        self.osm_type = ptype
+        self.osm_id = pid
+        self.version = version
+        self.members = None
+        self.action = None
+        self.categories = categories or set()
+        self.remarks = None
+
+    def copy(self):
+        """Returns a copy of this object, except for the members field."""
+        c = OSMPoint(self.osm_type, self.osm_id, self.version, self.lat, self.lon, self.tags.copy())
+        c.action = self.action
+        c.remarks = self.remarks
+        c.categories = self.categories.copy()
+        return c
+
+    def is_area(self):
+        return self.osm_type != 'node'
+
+    def is_poi(self):
+        if self.osm_type == 'node':
+            return True
+        if self.osm_type == 'way' and len(self.members) > 2:
+            return self.members[0] == self.members[-1]
+        if self.osm_type == 'relation' and len(self.members) > 0:
+            return self.tags.get('type', None) == 'multipolygon'
+        return False
+
+    def to_xml(self):
+        """Produces an XML element out of the point data.
+        Disregards the "action" field."""
+        el = etree.Element(self.osm_type, id=str(self.osm_id), version=str(self.version))
+        for tag, value in self.tags.items():
+            etree.SubElement(el, 'tag', k=tag, v=value)
+
+        if self.osm_type == 'node':
+            el.set('lat', str(self.lat))
+            el.set('lon', str(self.lon))
+        elif self.osm_type == 'way':
+            for node_id in self.members:
+                etree.SubElement(el, 'nd', ref=str(node_id))
+        elif self.osm_type == 'relation':
+            for member in self.members:
+                m = etree.SubElement(el, 'member')
+                for i, n in enumerate(('type', 'ref', 'role')):
+                    m.set(n, str(member[i]))
+        return el
+
+    def __repr__(self):
+        return 'OSMPoint({} {} v{}, {}, {}, action={}, tags={})'.format(
+            self.osm_type, self.osm_id, self.version, self.lat, self.lon, self.action, self.tags)
diff --git a/conflate/dataset.py b/conflate/dataset.py
new file mode 100644
index 0000000..1bce623
--- /dev/null
+++ b/conflate/dataset.py
@@ -0,0 +1,213 @@
+import logging
+import json
+import codecs
+import requests
+import kdtree
+from io import BytesIO
+from .data import SourcePoint
+
+
+def read_dataset(profile, fileobj):
+    """A helper function that calls the "dataset" function in the profile.
+    If fileobj is not specified, tries to download the dataset from
+    a URL specified in the "download_url" profile variable."""
+    if not fileobj:
+        url = profile.get('download_url')
+        if url is None:
+            logging.error('No download_url specified in the profile, '
+                          'please provide a dataset file with --source')
+            return None
+        r = requests.get(url)
+        if r.status_code != 200:
+            logging.error('Could not download source data: %s %s', r.status_code, r.text)
+            return None
+        if len(r.content) == 0:
+            logging.error('Empty response from %s', url)
+            return None
+        fileobj = BytesIO(r.content)
+    if not profile.has('dataset'):
+        # The default option is to parse the source as JSON
+        try:
+            data = []
+            reader = codecs.getreader('utf-8')
+            json_src = json.load(reader(fileobj))
+            if 'features' in json_src:
+                # Parse GeoJSON
+                for item in json_src['features']:
+                    if item['geometry'].get('type') != 'Point' or 'properties' not in item:
+                        continue
+                    # Take the identifier from the "id", "ref" or "ref*" property
+                    iid = item['properties'].get('id', item['properties'].get('ref'))
+                    if not iid:
+                        for k, v in item['properties'].items():
+                            if k.startswith('ref'):
+                                iid = v
+                                break
+                    if not iid:
+                        continue
+                    data.append(SourcePoint(
+                        iid,
+                        item['geometry']['coordinates'][1],
+                        item['geometry']['coordinates'][0],
+                        {k: v for k, v in item['properties'].items() if k != 'id'}))
+            else:
+                for item in json_src:
+                    data.append(SourcePoint(item['id'], item['lat'], item['lon'], item['tags']))
+            return data
+        except Exception:
+            logging.error('Failed to parse the source as JSON')
+    return list(profile.get(
+        'dataset', args=(fileobj,),
+        required='returns a list of SourcePoints with the dataset'))
+
+
+def add_categories_to_dataset(profile, dataset):
+    categories = profile.get('categories')
+    if not categories:
+        return
+    tag = profile.get('category_tag')
+    other = categories.get('other', {})
+    for d in dataset:
+        if tag and tag in d.tags:
+            d.category = d.tags[tag]
+            del d.tags[tag]
+        if d.category:
+            cat_tags = categories.get(d.category, other).get('tags', None)
+            if cat_tags:
+                d.tags.update(cat_tags)
+
+
+def transform_dataset(profile, dataset):
+    """Transforms tags in the dataset using the "transform" function
+    in the profile, or the instructions in that field in string or dict form."""
+    transform = profile.get_raw('transform')
+    if not transform:
+        return
+    if callable(transform):
+        for
d in dataset: + transform(d.tags) + return + if isinstance(transform, str): + # Convert string of "key=value|rule1|rule2" lines to a dict + lines = [line.split('=', 1) for line in transform.splitlines()] + transform = {l[0].strip(): l[1].strip() for l in lines} + if not transform or not isinstance(transform, dict): + return + for key in transform: + if isinstance(transform[key], str): + transform[key] = [x.strip() for x in transform[key].split('|')] + + for d in dataset: + for key, rules in transform.items(): + if not rules: + continue + value = None + if callable(rules): + # The value can be generated + value = rules(None if key not in d.tags else d.tags[key]) + if value is None and key in d.tags: + del d.tags[key] + elif not rules[0]: + # Use the value of the tag + if key in d.tags: + value = d.tags[key] + elif not isinstance(rules[0], str): + # If the value is not a string, use it + value = str(rules[0]) + elif rules[0][0] == '.': + # Use the value from another tag + alt_key = rules[0][1:] + if alt_key in d.tags: + value = d.tags[alt_key] + elif rules[0][0] == '>': + # Replace the key + if key in d.tags: + d.tags[rules[0][1:]] = d.tags[key] + del d.tags[key] + elif rules[0][0] == '<': + # Replace the key, the same but backwards + alt_key = rules[0][1:] + if alt_key in d.tags: + d.tags[key] = d.tags[alt_key] + del d.tags[alt_key] + elif rules[0] == '-': + # Delete the tag + if key in d.tags: + del d.tags[key] + else: + # Take the value as written + value = rules[0] + if value is None: + continue + if isinstance(rules, list): + for rule in rules[1:]: + if rule == 'lower': + value = value.lower() + d.tags[key] = value + + +def check_dataset_for_duplicates(profile, dataset, print_all=False): + # First checking for duplicate ids and collecting tags with varying values + ids = set() + tags = {} + found_duplicate_ids = False + for d in dataset: + if d.id in ids: + found_duplicate_ids = True + logging.error('Duplicate id {} in the dataset'.format(d.id)) + ids.add(d.id) + for k, v in d.tags.items(): + if k not in tags: + tags[k] = v + elif tags[k] != '---' and tags[k] != v: + tags[k] = '---' + + # And then for near-duplicate points with similar tags + uncond_distance = profile.get('duplicate_distance', 1) + diff_tags = [k for k in tags if tags[k] == '---'] + kd = kdtree.create(list(dataset)) + duplicates = set() + group = 0 + for d in dataset: + if d.id in duplicates: + continue + group += 1 + dups = kd.search_knn(d, 2) # The first one will be equal to d + if len(dups) < 2 or dups[1][0].data.distance(d) > profile.max_distance: + continue + for alt, _ in kd.search_knn(d, 20): + dist = alt.data.distance(d) + if alt.data.id != d.id and dist <= profile.max_distance: + tags_differ = 0 + if dist > uncond_distance: + for k in diff_tags: + if alt.data.tags.get(k) != d.tags.get(k): + tags_differ += 1 + if tags_differ <= len(diff_tags) / 3: + duplicates.add(alt.data.id) + d.exclusive_group = group + alt.data.exclusive_group = group + if print_all or len(duplicates) <= 5: + is_duplicate = tags_differ <= 1 + logging.error('Dataset points %s: %s and %s', + 'duplicate each other' if is_duplicate else 'are too similar', + d.id, alt.data.id) + if duplicates: + logging.error('Found %s duplicates in the dataset', len(duplicates)) + if found_duplicate_ids: + raise KeyError('Cannot continue with duplicate ids') + + +def add_regions(dataset, geocoder): + if not geocoder.enabled: + return + if geocoder.filter: + logging.info('Geocoding and filtering points') + else: + logging.info('Geocoding points') + for i in 
reversed(range(len(dataset))): + region, present = geocoder.find(dataset[i]) + if not present: + del dataset[i] + else: + dataset[i].region = region diff --git a/conflate/geocoder.py b/conflate/geocoder.py new file mode 100644 index 0000000..76ce723 --- /dev/null +++ b/conflate/geocoder.py @@ -0,0 +1,120 @@ +import struct +import logging +import os +import kdtree + + +class Geocoder: + def __init__(self, profile_regions='all'): + self.filter = None + self.enabled = bool(profile_regions) + if self.enabled: + logging.info('Initializing geocoder (this will take a minute)') + self.regions = self.parse_regions(profile_regions) + self.tree = self.load_places_tree() + if not self.tree: + if callable(profile_regions): + logging.warn('Could not read the geocoding file') + else: + logging.error('Could not read the geocoding file, no regions will be added') + self.enabled = False + + def set_filter(self, opt_regions): + if isinstance(opt_regions, str): + self.f_negate = opt_regions[0] in ('-', '^') + if self.f_negate: + opt_regions = opt_regions[1:] + self.filter = set([r.strip() for r in opt_regions.split(',')]) + elif isinstance(opt_regions, list): + self.f_negate = False + self.filter = set(opt_regions) + + def load_places_tree(self): + class PlacePoint: + def __init__(self, lon, lat, country, region): + self.coord = (lon, lat) + self.country = country + self.region = region + + def __len__(self): + return len(self.coord) + + def __getitem__(self, i): + return self.coord[i] + + filename = os.path.join(os.getcwd(), os.path.dirname(__file__), 'places.bin') + if not os.path.exists(filename): + return None + places = [] + with open(filename, 'rb') as f: + countries = [] + cnt = struct.unpack('B', f.read(1))[0] + for i in range(cnt): + countries.append(struct.unpack('2s', f.read(2))[0].decode('ascii')) + regions = [] + cnt = struct.unpack(' 2: + q = '"{}"~"^({})$"'.format(t[0], '|'.join(t[1:])) + else: + q = '"{}"="{}"'.format(t[0], t[1]) + tag_str += '[' + q + ']' + tag_strs.append(tag_str) + + if self.profile.get('no_dataset_id', False): + ref = None + else: + ref = 'nwr["ref:' + self.profile.get( + 'dataset_id', required='A fairly unique id of the dataset to query OSM') + '"]' + timeout = self.profile.get('overpass_timeout', 120) + query = '[out:xml]{};('.format('' if timeout is None else '[timeout:{}]'.format(timeout)) + for bbox in bboxes: + bbox_str = '' if bbox is None else '(' + ','.join([str(x) for x in bbox]) + ')' + for tag_str in tag_strs: + query += 'nwr' + tag_str + bbox_str + ';' + if ref is not None: + if not self.profile.get('bounded_update', False): + query += ref + ';' + else: + for bbox in bboxes: + bbox_str = '' if bbox is None else '(' + ','.join( + [str(x) for x in bbox]) + ')' + query += ref + bbox_str + ';' + query += '); out meta qt center;' + return query + + def get_bbox(self, points): + """Plain iterates over the dataset and returns the bounding box + that encloses it.""" + padding = self.profile.get('bbox_padding', BBOX_PADDING) + bbox = [90.0, 180.0, -90.0, -180.0] + for p in points: + bbox[0] = min(bbox[0], p.lat - padding) + bbox[1] = min(bbox[1], p.lon - padding) + bbox[2] = max(bbox[2], p.lat + padding) + bbox[3] = max(bbox[3], p.lon + padding) + return bbox + + def split_into_bboxes(self, points): + """ + Splits the dataset into multiple bboxes to lower load on the overpass api. + + Returns a list of tuples (minlat, minlon, maxlat, maxlon). 
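+        The number of boxes is capped by the "max_request_boxes" profile
+        variable (4 by default), and splitting stops early once the best
+        possible cut would free less than 1% of the initial bounding area.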
+ """ + max_bboxes = self.profile.get('max_request_boxes', 4) + if max_bboxes <= 1 or len(points) <= 1: + return [self.get_bbox(points)] + + # coord, alt coord, total w/h to the left/bottom, total w/h to the right/top + lons = sorted([[d.lon, d.lat, 0, 0] for d in points]) + lats = sorted([[d.lat, d.lon, 0, 0] for d in points]) + + def update_side_dimensions(ar): + """For each point, calculates the maximum and + minimum bound for all points left and right.""" + fwd_top = fwd_bottom = ar[0][1] + back_top = back_bottom = ar[-1][1] + for i in range(len(ar)): + fwd_top = max(fwd_top, ar[i][1]) + fwd_bottom = min(fwd_bottom, ar[i][1]) + ar[i][2] = fwd_top - fwd_bottom + back_top = max(back_top, ar[-i-1][1]) + back_bottom = min(back_bottom, ar[-i-1][1]) + ar[-i-1][3] = back_top - back_bottom + + def find_max_gap(ar, h): + """Select an interval between points, which would give + the maximum area if split there.""" + max_id = None + max_gap = 0 + for i in range(len(ar) - 1): + # "Extra" variables are for area to the left and right + # that would be freed after splitting. + extra_left = (ar[i][0]-ar[0][0]) * (h-ar[i][2]) + extra_right = (ar[-1][0]-ar[i+1][0]) * (h-ar[i+1][3]) + # Gap is the area of the column between points i and i+1 + # plus extra areas to the left and right. + gap = (ar[i+1][0] - ar[i][0]) * h + extra_left + extra_right + if gap > max_gap: + max_id = i + max_gap = gap + return max_id, max_gap + + def get_bbox(b, pad=0): + """Returns a list of [min_lat, min_lon, max_lat, max_lon] for a box.""" + return [b[2][0][0]-pad, b[3][0][0]-pad, b[2][-1][0]+pad, b[3][-1][0]+pad] + + def split(box, point_array, point_id): + """Split the box over axis point_array at point point_id...point_id+1. + Modifies the box in-place and returns a new box.""" + alt_array = 5 - point_array # 3->2, 2->3 + points = box[point_array][point_id+1:] + del box[point_array][point_id+1:] + alt = {True: [], False: []} # True means point is in new box + for p in box[alt_array]: + alt[(p[1], p[0]) >= (points[0][0], points[0][1])].append(p) + + new_box = [None] * 4 + new_box[point_array] = points + new_box[alt_array] = alt[True] + box[alt_array] = alt[False] + for i in range(2): + box[i] = box[i+2][-1][0] - box[i+2][0][0] + new_box[i] = new_box[i+2][-1][0] - new_box[i+2][0][0] + return new_box + + # height, width, lats, lons + boxes = [[lats[-1][0]-lats[0][0], lons[-1][0]-lons[0][0], lats, lons]] + initial_area = boxes[0][0] * boxes[0][1] + while len(boxes) < max_bboxes and len(boxes) <= len(points): + candidate_box = None + area = 0 + point_id = None + point_array = None + for box in boxes: + for ar in (2, 3): + # Find a box and an axis for splitting that would decrease the area the most + update_side_dimensions(box[ar]) + max_id, max_area = find_max_gap(box[ar], box[3-ar]) + if max_area > area: + area = max_area + candidate_box = box + point_id = max_id + point_array = ar + if area * 100 < initial_area: + # Stop splitting when the area decrease is less than 1% + break + logging.debug('Splitting bbox %s at %s %s..%s; area decrease %s%%', + get_bbox(candidate_box), + 'longs' if point_array == 3 else 'lats', + candidate_box[point_array][point_id][0], + candidate_box[point_array][point_id+1][0], + round(100*area/initial_area)) + boxes.append(split(candidate_box, point_array, point_id)) + + padding = self.profile.get('bbox_padding', BBOX_PADDING) + return [get_bbox(b, padding) for b in boxes] + + def get_categories(self, tags): + def match_query(tags, query): + for tag in query: + if len(tag) == 1: + return tag[0] in tags 
+ else: + value = tags.get(tag[0], None) + if tag[1] is None or tag[1] == '': + return value is None + if value is None: + return False + found = False + for t2 in tag[1:]: + if t2[0] == '~': + m = re.search(t2[1:], value) + if not m: + return False + elif t2[0] == '!': + if t2[1:].lower() in value.lower(): + found = True + elif t2 == value: + found = True + if found: + break + if not found: + return False + return True + + def tags_to_query(tags): + return [(k, v) for k, v in tags.items()] + + result = set() + qualifies = self.profile.get('qualifies', args=tags) + if qualifies is not None: + if qualifies: + result.add(None) + return result + + # First check default query + query = self.profile.get('query', None) + if query is not None: + if isinstance(query, str): + result.add(None) + else: + if isinstance(query[0][0], str): + query = [query] + for q in query: + if match_query(tags, q): + result.add(None) + break + + # Then check each category if we got these + categories = self.profile.get('categories', {}) + for name, params in categories.items(): + if 'tags' not in params and 'query' not in params: + raise ValueError('No tags and query attributes for category "{}"'.format(name)) + if match_query(tags, params.get('query', tags_to_query(params.get('tags')))): + result.add(name) + + return result + + def download(self, dataset_points): + """Constructs an Overpass API query and requests objects + to match from a server.""" + profile_bbox = self.profile.get('bbox', True) + if not profile_bbox: + bboxes = [None] + elif hasattr(profile_bbox, '__len__') and len(profile_bbox) == 4: + bboxes = [profile_bbox] + else: + bboxes = self.split_into_bboxes(dataset_points) + + query = self.construct_overpass_query(bboxes) + logging.debug('Overpass query: %s', query) + r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query}) + if r.encoding is None: + r.encoding = 'utf-8' + if r.status_code != 200: + logging.error('Failed to download data from Overpass API: %s', r.status_code) + if 'rate_limited' in r.text: + r = requests.get(OVERPASS_SERVER + 'status') + logging.warning('Seems like you are rate limited. API status:\n%s', r.text) + else: + logging.error('Error message: %s', r.text) + raise IOError() + if 'runtime error: ' in r.text: + m = re.search(r'runtime error: ([^<]+)', r.text) + error = 'unknown' if not m else m.group(1) + if 'Query timed out' in error: + logging.error( + 'Query timed out, try increasing the "overpass_timeout" profile variable') + else: + logging.error('Runtime error: %s', error) + raise IOError() + return self.parse_xml(r.content) + + def parse_xml(self, fileobj): + """Parses an OSM XML file into the "osmdata" field. For ways and relations, + finds the center. 
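+        When the response carries no <center> element for an object,
+        a crude center is computed by averaging member node coordinates.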
Drops objects that do not match the overpass query tags + (see "check_against_profile_tags" method).""" + if isinstance(fileobj, bytes): + xml = etree.fromstring(fileobj) + else: + xml = etree.parse(fileobj).getroot() + nodes = {} + for nd in xml.findall('node'): + nodes[nd.get('id')] = (float(nd.get('lat')), float(nd.get('lon'))) + ways = {} + for way in xml.findall('way'): + center = way.find('center') + if center is not None: + ways[way.get('id')] = [float(center.get('lat')), float(center.get('lon'))] + else: + logging.debug('Way %s does not have a center', way.get('id')) + coord = [0, 0] + count = 0 + for nd in way.findall('nd'): + if nd.get('ref') in nodes: + count += 1 + for i in range(len(coord)): + coord[i] += nodes[nd.get('ref')][i] + ways[way.get('id')] = [coord[0] / count, coord[1] / count] + + # For calculating weight of OSM objects + weight_fn = self.profile.get_raw('weight') + osmdata = {} + + for el in xml: + tags = {} + for tag in el.findall('tag'): + tags[tag.get('k')] = tag.get('v') + categories = self.get_categories(tags) + if categories is False or categories is None or len(categories) == 0: + continue + + if el.tag == 'node': + coord = nodes[el.get('id')] + members = None + elif el.tag == 'way': + coord = ways[el.get('id')] + members = [nd.get('ref') for nd in el.findall('nd')] + elif el.tag == 'relation': + center = el.find('center') + if center is not None: + coord = [float(center.get('lat')), float(center.get('lon'))] + else: + logging.debug('Relation %s does not have a center', el.get('id')) + coord = [0, 0] + count = 0 + for m in el.findall('member'): + if m.get('type') == 'node' and m.get('ref') in nodes: + count += 1 + for i in range(len(coord)): + coord[i] += nodes[m.get('ref')][i] + elif m.get('type') == 'way' and m.get('ref') in ways: + count += 1 + for i in range(len(coord)): + coord[i] += ways[m.get('ref')][i] + if count > 0: + coord = [coord[0] / count, coord[1] / count] + members = [ + (m.get('type'), m.get('ref'), m.get('role')) + for m in el.findall('member') + ] + else: + continue + if not coord or coord == [0, 0]: + continue + pt = OSMPoint( + el.tag, int(el.get('id')), int(el.get('version')), + coord[0], coord[1], tags, categories) + pt.members = members + if pt.is_poi(): + if callable(weight_fn): + weight = weight_fn(pt) + if weight: + if abs(weight) > 3: + pt.dist_offset = weight + else: + pt.dist_offset = weight * self.profile.max_distance + osmdata[pt.id] = pt + return osmdata + + +def check_moveability(changes): + to_check = [x for x in changes if x['properties']['osm_type'] == 'node' and + x['properties']['action'] == 'modify'] + logging.info('Checking moveability of %s modified nodes', len(to_check)) + for c in to_check: + p = c['properties'] + p['can_move'] = False + r = requests.get('{}node/{}/ways'.format(OSM_API_SERVER, p['osm_id'])) + if r.status_code == 200: + xml = etree.fromstring(r.content) + p['can_move'] = xml.find('way') is None diff --git a/conflate/profile.py b/conflate/profile.py new file mode 100644 index 0000000..a2793db --- /dev/null +++ b/conflate/profile.py @@ -0,0 +1,62 @@ +import json +from .data import SourcePoint # So we don't have to import this in profiles +from . import etree + + +class ProfileException(Exception): + """An exception class for the Profile instance.""" + def __init__(self, attr, desc): + super().__init__('Field missing in profile: {} ({})'.format(attr, desc)) + + +class Profile: + """A wrapper for a profile. + + A profile is a python script that sets a few local variables. 
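+    (Typical ones are "source", "dataset_id", "query", "master_tags"
+    and a "dataset" function; see the bundled profiles for examples.)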
+ These variables become properties of the profile, accessible with + a "get" method. If something is a function, it will be called, + optional parameters might be passed to it. + + You can compile a list of all supported variables by grepping through + this code, or by looking at a few example profiles. If something + is required, you will be notified of that. + """ + def __init__(self, fileobj, par=None): + global param + param = par + if isinstance(fileobj, dict): + self.profile = fileobj + elif hasattr(fileobj, 'read'): + s = fileobj.read().replace('\r', '') + if s[0] == '{': + self.profile = json.loads(s) + else: + self.profile = {} + exec(s, globals(), self.profile) + else: + # Got a class + self.profile = {name: getattr(fileobj, name) + for name in dir(fileobj) if not name.startswith('_')} + self.max_distance = self.get('max_distance', 100) + + def has(self, attr): + return attr in self.profile + + def get(self, attr, default=None, required=None, args=None): + if attr in self.profile: + value = self.profile[attr] + if callable(value): + if args is None: + return value() + else: + return value(*args) + else: + return value + if required is not None: + raise ProfileException(attr, required) + return default + + def get_raw(self, attr, default=None): + if attr in self.profile: + return self.profile[attr] + return default diff --git a/profiles/auchan_moscow.py b/profiles/auchan_moscow.py index 735e089..c7fd7db 100644 --- a/profiles/auchan_moscow.py +++ b/profiles/auchan_moscow.py @@ -1,7 +1,3 @@ -# This profile requires lxml package -import logging -import re - # A web page with a list of shops in Moscow. You can replace it with one for another city download_url = 'https://www.auchan.ru/ru/moscow/' source = 'auchan.ru' @@ -44,6 +40,8 @@ def dataset(fileobj): # We are parsing HTML, and for that we need an lxml package from lxml import html + import logging + import re global download_url_copy, re h = html.fromstring(fileobj.read().decode('utf-8')) shops = h.find_class('shops-in-the-city-holder')[0] diff --git a/profiles/burgerking.py b/profiles/burgerking.py index e5d76fa..65275b9 100644 --- a/profiles/burgerking.py +++ b/profiles/burgerking.py @@ -2,10 +2,6 @@ # and does not contain any useful data now. # So this profile is here solely for demonstration purposes. -import json -import codecs -import re - download_url = 'https://burgerking.ru/restaurant-locations-json-reply-new' source = 'Burger King' dataset_id = 'burger_king' @@ -60,6 +56,9 @@ def dataset(fileobj): s = s.replace(' доб. 
', '-') return s + import json + import codecs + import re notes = { 172: 'Подвинуть на второй терминал', 25: 'Подвинуть в ЮниМолл', diff --git a/profiles/minkult.py b/profiles/minkult.py index 98dcb1a..bcbd12c 100644 --- a/profiles/minkult.py +++ b/profiles/minkult.py @@ -1,8 +1,3 @@ -import json -import logging -import requests -import codecs - source = 'opendata.mkrf.ru' dataset_id = 'mkrf_theaters' query = [('amenity', 'theatre')] @@ -12,6 +7,9 @@ master_tags = ('official_name', 'phone', 'opening_hours', 'website') # Reading the dataset passport to determine an URL of the latest dataset version def download_url(): + import logging + import requests + dataset_id = '7705851331-' + (param or 'museums') r = requests.get('http://opendata.mkrf.ru/opendata/{}/meta.json'.format(dataset_id)) if r.status_code != 200 or len(r.content) == 0: @@ -41,6 +39,9 @@ master_tags = ('official_name', 'phone', 'opening_hours', 'website') def dataset(fileobj): + import json + import codecs + def make_wd_ranges(r): """Converts e.g. [0,1,4] into 'Mo-Tu, Fr'.""" wd = ['Mo', 'Tu', 'We', 'Th', 'Fr', 'Sa', 'Su'] diff --git a/profiles/moscow_addr.py b/profiles/moscow_addr.py index a2796bd..92764c1 100644 --- a/profiles/moscow_addr.py +++ b/profiles/moscow_addr.py @@ -1,6 +1,3 @@ -import json -import logging - source = 'dit.mos.ru' no_dataset_id = True query = [('building',)] @@ -35,6 +32,11 @@ if param: def dataset(fileobj): + import zipfile + import json + import logging + global COMPLEX, ADM + def find_center(geodata): if not geodata: return None @@ -63,9 +65,7 @@ def dataset(fileobj): return [lonlat[0]/cnt, lonlat[1]/cnt] return None - global COMPLEX, ADM logging.info('Экспортируем %s (%s)', ADM, 'строения' if COMPLEX else 'без строений') - import zipfile zf = zipfile.ZipFile(fileobj) data = [] no_geodata = 0 diff --git a/profiles/moscow_parkomats.py b/profiles/moscow_parkomats.py index 98752b8..0d44ad9 100644 --- a/profiles/moscow_parkomats.py +++ b/profiles/moscow_parkomats.py @@ -1,25 +1,3 @@ -# Available modules: codecs, logging, requests, json, etree. But importing these helps catch other errors -import json -import logging - - -def download_url(mos_dataset_id=1421): - import requests - r = requests.get('https://data.mos.ru/api/datasets/expformats/?datasetId={}'.format(mos_dataset_id)) - if r.status_code != 200 or len(r.content) == 0: - logging.error('Could not get URL for dataset: %s %s', r.status_code, r.text) - logging.error('Please check http://data.mos.ru/opendata/{}/passport'.format(mos_dataset_id)) - return None - url = [x for x in r.json() if x['Format'] == 'json'][0] - version = '?' - title = 'dataset' - r = requests.get('https://data.mos.ru/apiproxy/opendata/{}/meta.json'.format(mos_dataset_id)) - if r.status_code == 200: - title = r.json()['Title'] - version = r.json()['VersionNumber'] - logging.info('Downloading %s %s from %s', title, version, url['GenerationStart']) - return 'https://op.mos.ru/EHDWSREST/catalog/export/get?id=' + url['EhdId'] - # What will be put into "source" tags. 
Lower case please source = 'dit.mos.ru' # A fairly unique id of the dataset to query OSM, used for "ref:mos_parking" tags @@ -46,8 +24,29 @@ tag_unmatched = None master_tags = ('zone:parking', 'ref', 'contact:phone', 'contact:website', 'operator') +def download_url(mos_dataset_id=1421): + import requests + import logging + r = requests.get('https://data.mos.ru/api/datasets/expformats/?datasetId={}'.format(mos_dataset_id)) + if r.status_code != 200 or len(r.content) == 0: + logging.error('Could not get URL for dataset: %s %s', r.status_code, r.text) + logging.error('Please check http://data.mos.ru/opendata/{}/passport'.format(mos_dataset_id)) + return None + url = [x for x in r.json() if x['Format'] == 'json'][0] + version = '?' + title = 'dataset' + r = requests.get('https://data.mos.ru/apiproxy/opendata/{}/meta.json'.format(mos_dataset_id)) + if r.status_code == 200: + title = r.json()['Title'] + version = r.json()['VersionNumber'] + logging.info('Downloading %s %s from %s', title, version, url['GenerationStart']) + return 'https://op.mos.ru/EHDWSREST/catalog/export/get?id=' + url['EhdId'] + + # A list of SourcePoint objects. Initialize with (id, lat, lon, {tags}). def dataset(fileobj): + import json + import logging import zipfile import re zf = zipfile.ZipFile(fileobj) diff --git a/profiles/velobike.py b/profiles/velobike.py index 231d563..af5468c 100644 --- a/profiles/velobike.py +++ b/profiles/velobike.py @@ -1,7 +1,3 @@ -import codecs -import json -import logging - # Where to get the latest feed download_url = 'http://www.velobike.ru/proxy/parkings/' # What to write for the changeset's source tag @@ -29,6 +25,10 @@ master_tags = ('ref', 'capacity', 'capacity:electric', 'contact:email', def dataset(fileobj): + import codecs + import json + import logging + # Specifying utf-8 is important, otherwise you'd get "bytes" instead of "str" source = json.load(codecs.getreader('utf-8')(fileobj)) data = []
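+    # Each element appended to "data" below is a SourcePoint(id, lat, lon, {tags}),
+    # the same structure every profile's dataset() function returns.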