Compare commits
No commits in common. "master" and "v1.3.2" have entirely different histories.
27 changed files with 1183 additions and 65220 deletions
10
.gitignore
vendored
10
.gitignore
vendored
|
@ -1,9 +1,17 @@
|
|||
*.swp
|
||||
*.osc
|
||||
*.osm
|
||||
*.zip
|
||||
*.json
|
||||
*.gz
|
||||
*.csv
|
||||
*.pyc
|
||||
*.pbf
|
||||
*.lst
|
||||
*.user
|
||||
*.log
|
||||
private/
|
||||
data/
|
||||
dist/
|
||||
__pycache__/
|
||||
*.egg*
|
||||
build/
|
||||
|
|
30
CHANGELOG.md
30
CHANGELOG.md
|
@ -2,36 +2,6 @@
|
|||
|
||||
## master branch
|
||||
|
||||
## 1.4.1
|
||||
|
||||
_Released 2019-06-04_
|
||||
|
||||
* Fixed an error when the query is pure regexp and it did not match anything.
|
||||
|
||||
## 1.4.0
|
||||
|
||||
_Released 2018-05-30_
|
||||
|
||||
* Refactored `conflate.py` into seven smaller files.
|
||||
* Added a simple kd-tree based geocoder for countries and regions. Controlled by the `regions` parameter in a profile.
|
||||
* You can filter by regions using `-r` argument or `"regions"` list in an audit file.
|
||||
* Using the new `nwr` query type of Overpass API.
|
||||
* Reduced default `max_request_boxes` to four.
|
||||
* New argument `--alt-overpass` to use Kumi Systems' server (since the main one is blocked in Russia).
|
||||
* Better handling of server runtime errors.
|
||||
* Find matches in OSM with `--list <result.csv>`.
|
||||
* Control number of nearest points to check for matches with `nearest_points` profile parameter.
|
||||
* When you have dataset ID in an URL or other tag, use `find_ref` profile function to match on it.
|
||||
|
||||
## 1.3.3
|
||||
|
||||
_Released 2018-04-26_
|
||||
|
||||
* Fixed processing of `''` tag value.
|
||||
* More that 3 duplicate points in a single place are processed correctly.
|
||||
* Now you can `yield` points from a profile instead of making a list.
|
||||
* Not marking nodes with `move` in the audit file as modified, unless we move them.
|
||||
|
||||
## 1.3.2
|
||||
|
||||
_Released 2018-04-19_
|
||||
|
|
|
@ -1,9 +1 @@
|
|||
try:
|
||||
from lxml import etree
|
||||
except ImportError:
|
||||
import xml.etree.ElementTree as etree
|
||||
from .data import SourcePoint
|
||||
from .conflate import run
|
||||
from .version import __version__
|
||||
from .profile import Profile, ProfileException
|
||||
from .conflator import OsmConflator
|
||||
from .conflate import SourcePoint, OSMPoint, OsmConflator, Profile, ProfileException, run, __version__
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
from . import run
|
||||
|
||||
run()
|
1162
conflate/conflate.py
Normal file → Executable file
1162
conflate/conflate.py
Normal file → Executable file
File diff suppressed because it is too large
Load diff
|
@ -1,445 +0,0 @@
|
|||
import logging
|
||||
import kdtree
|
||||
from collections import defaultdict
|
||||
from .data import OSMPoint
|
||||
from .version import __version__
|
||||
from .osm import OsmDownloader, check_moveability
|
||||
from . import etree
|
||||
|
||||
|
||||
TITLE = 'OSM Conflator ' + __version__
|
||||
CONTACT_KEYS = set(('phone', 'website', 'email', 'fax', 'facebook', 'twitter', 'instagram'))
|
||||
LIFECYCLE_KEYS = set(('amenity', 'shop', 'tourism', 'craft', 'office'))
|
||||
LIFECYCLE_PREFIXES = ('proposed', 'construction', 'disused', 'abandoned', 'was', 'removed')
|
||||
|
||||
|
||||
class OsmConflator:
|
||||
"""The main class for the conflator.
|
||||
|
||||
It receives a dataset, after which one must call either
|
||||
"download_osm" or "parse_osm" methods. Then it is ready to match:
|
||||
call the "match" method and get results with "to_osc".
|
||||
"""
|
||||
def __init__(self, profile, dataset, audit=None):
|
||||
self.dataset = {p.id: p for p in dataset}
|
||||
self.audit = audit or {}
|
||||
self.osmdata = {}
|
||||
self.matched = []
|
||||
self.changes = []
|
||||
self.matches = []
|
||||
self.profile = profile
|
||||
self.geocoder = None
|
||||
self.downloader = OsmDownloader(profile)
|
||||
self.source = self.profile.get(
|
||||
'source', required='value of "source" tag for uploaded OSM objects')
|
||||
self.add_source_tag = self.profile.get('add_source', False)
|
||||
if self.profile.get('no_dataset_id', False):
|
||||
self.ref = None
|
||||
else:
|
||||
self.ref = 'ref:' + self.profile.get(
|
||||
'dataset_id', required='A fairly unique id of the dataset to query OSM')
|
||||
|
||||
def set_overpass(self, server='alt'):
|
||||
self.downloader.set_overpass(server)
|
||||
|
||||
def download_osm(self):
|
||||
bboxes = self.downloader.calc_boxes(self.dataset.values())
|
||||
self.osmdata = self.downloader.download(bboxes)
|
||||
|
||||
def parse_osm(self, fileobj):
|
||||
self.osmdata = self.downloader.parse_xml(fileobj)
|
||||
|
||||
def register_match(self, dataset_key, osmdata_key, keep=False, retag=None):
|
||||
"""Registers a match between an OSM point and a dataset point.
|
||||
|
||||
Merges tags from an OSM Point and a dataset point, and add the result to the
|
||||
self.matched list.
|
||||
If dataset_key is None, deletes or retags the OSM point.
|
||||
If osmdata_key is None, adds a new OSM point for the dataset point.
|
||||
"""
|
||||
def get_osm_key(k, osm_tags):
|
||||
"""Conflating contact: namespace."""
|
||||
if k in CONTACT_KEYS and k not in osm_tags and 'contact:'+k in osm_tags:
|
||||
return 'contact:'+k
|
||||
elif k.startswith('contact:') and k not in osm_tags and k[8:] in osm_tags:
|
||||
return k[8:]
|
||||
|
||||
# Now conflating lifecycle prefixes, only forward
|
||||
if k in LIFECYCLE_KEYS and k not in osm_tags:
|
||||
for prefix in LIFECYCLE_PREFIXES:
|
||||
if prefix+':'+k in osm_tags:
|
||||
return prefix+':'+k
|
||||
return k
|
||||
|
||||
def update_tags(tags, source, master_tags=None, retagging=False, audit=None):
|
||||
"""Updates tags dictionary with tags from source,
|
||||
returns True is something was changed."""
|
||||
keep = set()
|
||||
override = set()
|
||||
changed = False
|
||||
if source:
|
||||
if audit:
|
||||
keep = set(audit.get('keep', []))
|
||||
override = set(audit.get('override', []))
|
||||
for k, v in source.items():
|
||||
osm_key = get_osm_key(k, tags)
|
||||
|
||||
if k in keep or osm_key in keep:
|
||||
continue
|
||||
if k in override or osm_key in override:
|
||||
if not v and osm_key in tags:
|
||||
del tags[osm_key]
|
||||
changed = True
|
||||
elif v and tags.get(osm_key, None) != v:
|
||||
tags[osm_key] = v
|
||||
changed = True
|
||||
continue
|
||||
|
||||
if osm_key not in tags or retagging or (
|
||||
tags[osm_key] != v and (master_tags and k in master_tags)):
|
||||
if v is not None and len(v) > 0:
|
||||
# Not setting addr:full when the object has addr:housenumber
|
||||
if k == 'addr:full' and 'addr:housenumber' in tags:
|
||||
continue
|
||||
tags[osm_key] = v
|
||||
changed = True
|
||||
elif osm_key in tags and (v == '' or retagging):
|
||||
del tags[osm_key]
|
||||
changed = True
|
||||
return changed
|
||||
|
||||
def format_change(before, after, ref):
|
||||
MARKER_COLORS = {
|
||||
'delete': '#ee2211', # deleting feature from OSM
|
||||
'create': '#11dd11', # creating a new node
|
||||
'update': '#0000ee', # changing tags on an existing feature
|
||||
'retag': '#660000', # cannot delete unmatched feature, changing tags
|
||||
'move': '#110055', # moving an existing node
|
||||
}
|
||||
marker_action = None
|
||||
geometry = {'type': 'Point', 'coordinates': [after.lon, after.lat]}
|
||||
props = {
|
||||
'osm_type': after.osm_type,
|
||||
'osm_id': after.osm_id,
|
||||
'action': after.action
|
||||
}
|
||||
if after.action in ('create', 'delete'):
|
||||
# Red if deleted, green if added
|
||||
marker_action = after.action
|
||||
for k, v in after.tags.items():
|
||||
props['tags.{}'.format(k)] = v
|
||||
if ref:
|
||||
props['ref_id'] = ref.id
|
||||
else: # modified
|
||||
# Blue if updated from dataset, dark red if retagged, dark blue if moved
|
||||
marker_action = 'update' if ref else 'retag'
|
||||
if ref:
|
||||
props['ref_id'] = ref.id
|
||||
props['ref_distance'] = round(10 * ref.distance(before)) / 10.0
|
||||
props['ref_coords'] = [ref.lon, ref.lat]
|
||||
if before.lon != after.lon or before.lat != after.lat:
|
||||
# The object was moved
|
||||
props['were_coords'] = [before.lon, before.lat]
|
||||
marker_action = 'move'
|
||||
# Find tags that were superseeded by OSM tags
|
||||
for k, v in ref.tags.items():
|
||||
osm_key = get_osm_key(k, after.tags)
|
||||
if osm_key not in after.tags or after.tags[osm_key] != v:
|
||||
props['ref_unused_tags.{}'.format(osm_key)] = v
|
||||
# Now compare old and new OSM tags
|
||||
for k in set(after.tags.keys()).union(set(before.tags.keys())):
|
||||
v0 = before.tags.get(k, None)
|
||||
v1 = after.tags.get(k, None)
|
||||
if v0 == v1:
|
||||
props['tags.{}'.format(k)] = v0
|
||||
elif v0 is None:
|
||||
props['tags_new.{}'.format(k)] = v1
|
||||
elif v1 is None:
|
||||
props['tags_deleted.{}'.format(k)] = v0
|
||||
else:
|
||||
props['tags_changed.{}'.format(k)] = '{} -> {}'.format(v0, v1)
|
||||
props['marker-color'] = MARKER_COLORS[marker_action]
|
||||
if ref and ref.remarks:
|
||||
props['remarks'] = ref.remarks
|
||||
if ref and ref.region:
|
||||
props['region'] = ref.region
|
||||
elif self.geocoder:
|
||||
region, present = self.geocoder.find(after)
|
||||
if not present:
|
||||
return None
|
||||
if region is not None:
|
||||
props['region'] = region
|
||||
return {'type': 'Feature', 'geometry': geometry, 'properties': props}
|
||||
|
||||
p = self.osmdata.pop(osmdata_key, None)
|
||||
p0 = None if p is None else p.copy()
|
||||
sp = self.dataset.pop(dataset_key, None)
|
||||
audit = self.audit.get(sp.id if sp else '{}{}'.format(p.osm_type, p.osm_id), {})
|
||||
if audit.get('skip', False):
|
||||
return
|
||||
|
||||
if sp is not None:
|
||||
if p is None:
|
||||
p = OSMPoint('node', -1-len(self.matched), 1, sp.lat, sp.lon, sp.tags)
|
||||
p.action = 'create'
|
||||
else:
|
||||
master_tags = set(self.profile.get('master_tags', []))
|
||||
if update_tags(p.tags, sp.tags, master_tags, audit=audit):
|
||||
p.action = 'modify'
|
||||
# Move a node if it is too far from the dataset point
|
||||
if not p.is_area() and sp.distance(p) > self.profile.max_distance:
|
||||
p.lat = sp.lat
|
||||
p.lon = sp.lon
|
||||
p.action = 'modify'
|
||||
if self.add_source_tag:
|
||||
if 'source' in p.tags:
|
||||
if self.source not in p.tags['source']:
|
||||
p.tags['source'] = ';'.join([p.tags['source'], self.source])
|
||||
else:
|
||||
p.tags['source'] = self.source
|
||||
if self.ref is not None:
|
||||
p.tags[self.ref] = sp.id
|
||||
if 'fixme' in audit and audit['fixme'] != p.tags.get('fixme'):
|
||||
p.tags['fixme'] = audit['fixme']
|
||||
if p.action is None:
|
||||
p.action = 'modify'
|
||||
if 'move' in audit and not p.is_area():
|
||||
if p0 and audit['move'] == 'osm':
|
||||
p.lat = p0.lat
|
||||
p.lon = p0.lon
|
||||
elif audit['move'] == 'dataset':
|
||||
p.lat = sp.lat
|
||||
p.lon = sp.lon
|
||||
elif len(audit['move']) == 2:
|
||||
p.lat = audit['move'][1]
|
||||
p.lon = audit['move'][0]
|
||||
if p.action is None and p0.distance(p) > 0.1:
|
||||
p.action = 'modify'
|
||||
if p.action != 'create':
|
||||
self.matches.append([sp.id, p.osm_type, p.osm_id, p.lat, p.lon, p.action])
|
||||
else:
|
||||
self.matches.append([sp.id, '', '', p.lat, p.lon, p.action])
|
||||
elif keep or p.is_area():
|
||||
if update_tags(p.tags, retag, retagging=True, audit=audit):
|
||||
p.action = 'modify'
|
||||
else:
|
||||
p.action = 'delete'
|
||||
|
||||
if p.action is not None:
|
||||
change = format_change(p0, p, sp)
|
||||
if change is not None:
|
||||
self.matched.append(p)
|
||||
self.changes.append(change)
|
||||
|
||||
def match_dataset_points_smart(self):
|
||||
"""Smart matching for dataset <-> OSM points.
|
||||
|
||||
We find a shortest link between a dataset and an OSM point.
|
||||
Then we match these and remove both from dicts.
|
||||
Then find another link and so on, until the length of a link
|
||||
becomes larger than "max_distance".
|
||||
|
||||
Currently the worst case complexity is around O(n^2*log^2 n).
|
||||
But given the small number of objects to match, and that
|
||||
the average case complexity is ~O(n*log^2 n), this is fine.
|
||||
"""
|
||||
def search_nn_fix(kd, point):
|
||||
nearest = kd.search_knn(point, self.profile.get('nearest_points', 10))
|
||||
if not nearest:
|
||||
return None, None
|
||||
match_func = self.profile.get_raw('matches')
|
||||
if match_func:
|
||||
nearest = [p for p in nearest if match_func(p[0].data.tags, point.tags)]
|
||||
if not nearest:
|
||||
return None, None
|
||||
nearest = [(n[0], n[0].data.distance(point))
|
||||
for n in nearest if point.category in n[0].data.categories]
|
||||
return sorted(nearest, key=lambda kv: kv[1])[0]
|
||||
|
||||
if not self.osmdata:
|
||||
return
|
||||
osm_kd = kdtree.create(list(self.osmdata.values()))
|
||||
count_matched = 0
|
||||
|
||||
# Process overridden features first
|
||||
for override, osm_find in self.profile.get('override', {}).items():
|
||||
override = str(override)
|
||||
if override not in self.dataset:
|
||||
continue
|
||||
found = None
|
||||
if len(osm_find) > 2 and osm_find[0] in 'nwr' and osm_find[1].isdigit():
|
||||
if osm_find in self.osmdata:
|
||||
found = self.osmdata[osm_find]
|
||||
# Search nearest 100 points
|
||||
nearest = osm_kd.search_knn(self.dataset[override], 100)
|
||||
if nearest:
|
||||
for p in nearest:
|
||||
if 'name' in p[0].data.tags and p[0].data.tags['name'] == osm_find:
|
||||
found = p[0].data
|
||||
if found:
|
||||
count_matched += 1
|
||||
self.register_match(override, found.id)
|
||||
osm_kd = osm_kd.remove(found)
|
||||
|
||||
# Prepare distance list: match OSM points to each of the dataset points
|
||||
dist = []
|
||||
for sp, v in self.dataset.items():
|
||||
osm_point, distance = search_nn_fix(osm_kd, v)
|
||||
if osm_point is not None and distance <= self.profile.max_distance:
|
||||
dist.append((distance, sp, osm_point.data))
|
||||
|
||||
# The main matching loop: sort dist list if needed,
|
||||
# register the closes match, update the list
|
||||
needs_sorting = True
|
||||
while dist:
|
||||
if needs_sorting:
|
||||
dist.sort(key=lambda x: x[0])
|
||||
needs_sorting = False
|
||||
count_matched += 1
|
||||
osm_point = dist[0][2]
|
||||
self.register_match(dist[0][1], osm_point.id)
|
||||
osm_kd = osm_kd.remove(osm_point)
|
||||
del dist[0]
|
||||
for i in reversed(range(len(dist))):
|
||||
if dist[i][2] == osm_point:
|
||||
nearest, distance = search_nn_fix(osm_kd, self.dataset[dist[i][1]])
|
||||
if nearest and distance <= self.profile.max_distance:
|
||||
dist[i] = (distance, dist[i][1], nearest.data)
|
||||
needs_sorting = i == 0 or distance < dist[0][0]
|
||||
else:
|
||||
del dist[i]
|
||||
needs_sorting = i == 0
|
||||
logging.info('Matched %s points', count_matched)
|
||||
|
||||
def match(self):
|
||||
"""Matches each osm object with a SourcePoint, or marks it as obsolete.
|
||||
The resulting list of OSM Points are written to the "matched" field."""
|
||||
find_ref = self.profile.get_raw('find_ref')
|
||||
if self.ref is not None or callable(find_ref):
|
||||
# First match all objects with ref:whatever tag set
|
||||
count_ref = 0
|
||||
for k, p in list(self.osmdata.items()):
|
||||
ref = None
|
||||
if self.ref and self.ref in p.tags:
|
||||
ref = p.tags[self.ref]
|
||||
elif find_ref:
|
||||
ref = find_ref(p.tags)
|
||||
if ref is not None:
|
||||
if ref in self.dataset:
|
||||
count_ref += 1
|
||||
self.register_match(ref, k)
|
||||
logging.info('Updated %s OSM objects with %s tag', count_ref, self.ref)
|
||||
|
||||
# Add points for which audit specifically mentioned creating
|
||||
count_created = 0
|
||||
for ref, a in self.audit.items():
|
||||
if ref in self.dataset:
|
||||
if a.get('create', None):
|
||||
count_created += 1
|
||||
self.register_match(ref, None)
|
||||
elif a.get('skip', None):
|
||||
# If we skip an object here, it would affect the conflation order
|
||||
pass
|
||||
if count_created > 0:
|
||||
logging.info('Created %s audit-overridden dataset points', count_created)
|
||||
|
||||
# Prepare exclusive groups dict
|
||||
exclusive_groups = defaultdict(set)
|
||||
for p, v in self.dataset.items():
|
||||
if v.exclusive_group is not None:
|
||||
exclusive_groups[v.exclusive_group].add(p)
|
||||
|
||||
# Then find matches for unmatched dataset points
|
||||
self.match_dataset_points_smart()
|
||||
|
||||
# Remove unmatched duplicates
|
||||
count_duplicates = 0
|
||||
for ids in exclusive_groups.values():
|
||||
found = False
|
||||
for p in ids:
|
||||
if p not in self.dataset:
|
||||
found = True
|
||||
break
|
||||
for p in ids:
|
||||
if p in self.dataset:
|
||||
if found:
|
||||
count_duplicates += 1
|
||||
del self.dataset[p]
|
||||
else:
|
||||
# Leave one element when not matched any
|
||||
found = True
|
||||
if count_duplicates > 0:
|
||||
logging.info('Removed %s unmatched duplicates', count_duplicates)
|
||||
|
||||
# Add unmatched dataset points
|
||||
logging.info('Adding %s unmatched dataset points', len(self.dataset))
|
||||
for k in sorted(list(self.dataset.keys())):
|
||||
self.register_match(k, None)
|
||||
|
||||
# And finally delete some or all of the remaining osm objects
|
||||
if len(self.osmdata) > 0:
|
||||
count_deleted = 0
|
||||
count_retagged = 0
|
||||
delete_unmatched = self.profile.get('delete_unmatched', False)
|
||||
retag = self.profile.get('tag_unmatched')
|
||||
for k, p in list(self.osmdata.items()):
|
||||
ref = None
|
||||
if self.ref and self.ref in p.tags:
|
||||
ref = p.tags[self.ref]
|
||||
elif find_ref:
|
||||
ref = find_ref(p.tags)
|
||||
if ref is not None:
|
||||
# When ref:whatever is present, we can delete that object safely
|
||||
count_deleted += 1
|
||||
self.register_match(None, k, retag=retag)
|
||||
elif delete_unmatched or retag:
|
||||
if not delete_unmatched or p.is_area():
|
||||
count_retagged += 1
|
||||
else:
|
||||
count_deleted += 1
|
||||
self.register_match(None, k, keep=not delete_unmatched, retag=retag)
|
||||
logging.info(
|
||||
'Deleted %s and retagged %s unmatched objects from OSM',
|
||||
count_deleted, count_retagged)
|
||||
|
||||
def backup_osm(self):
|
||||
"""Writes OSM data as-is."""
|
||||
osm = etree.Element('osm', version='0.6', generator=TITLE)
|
||||
for osmel in self.osmdata.values():
|
||||
el = osmel.to_xml()
|
||||
if osmel.osm_type != 'node':
|
||||
etree.SubElement(el, 'center', lat=str(osmel.lat), lon=str(osmel.lon))
|
||||
osm.append(el)
|
||||
return ("<?xml version='1.0' encoding='utf-8'?>\n" +
|
||||
etree.tostring(osm, encoding='utf-8').decode('utf-8'))
|
||||
|
||||
def to_osc(self, josm=False):
|
||||
"""Returns a string with osmChange or JOSM XML."""
|
||||
osc = etree.Element('osm' if josm else 'osmChange', version='0.6', generator=TITLE)
|
||||
if josm:
|
||||
neg_id = -1
|
||||
changeset = etree.SubElement(osc, 'changeset')
|
||||
ch_tags = {
|
||||
'source': self.source,
|
||||
'created_by': TITLE,
|
||||
'type': 'import'
|
||||
}
|
||||
for k, v in ch_tags.items():
|
||||
etree.SubElement(changeset, 'tag', k=k, v=v)
|
||||
for osmel in self.matched:
|
||||
if osmel.action is not None:
|
||||
el = osmel.to_xml()
|
||||
if josm:
|
||||
if osmel.action == 'create':
|
||||
el.set('id', str(neg_id))
|
||||
neg_id -= 1
|
||||
else:
|
||||
el.set('action', osmel.action)
|
||||
osc.append(el)
|
||||
else:
|
||||
etree.SubElement(osc, osmel.action).append(el)
|
||||
return ("<?xml version='1.0' encoding='utf-8'?>\n" +
|
||||
etree.tostring(osc, encoding='utf-8').decode('utf-8'))
|
||||
|
||||
def check_moveability(self):
|
||||
check_moveability(self.changes)
|
104
conflate/data.py
104
conflate/data.py
|
@ -1,104 +0,0 @@
|
|||
import math
|
||||
from . import etree
|
||||
|
||||
|
||||
class SourcePoint:
|
||||
"""A common class for points. Has an id, latitude and longitude,
|
||||
and a dict of tags. Remarks are optional for reviewers hints only."""
|
||||
def __init__(self, pid, lat, lon, tags=None, category=None, remarks=None, region=None):
|
||||
self.id = str(pid)
|
||||
self.lat = lat
|
||||
self.lon = lon
|
||||
self.tags = {} if tags is None else {
|
||||
k.lower(): str(v).strip() for k, v in tags.items() if v is not None}
|
||||
self.category = category
|
||||
self.dist_offset = 0
|
||||
self.remarks = remarks
|
||||
self.region = region
|
||||
self.exclusive_group = None
|
||||
|
||||
def distance(self, other):
|
||||
"""Calculate distance in meters."""
|
||||
dx = math.radians(self.lon - other.lon) * math.cos(0.5 * math.radians(self.lat + other.lat))
|
||||
dy = math.radians(self.lat - other.lat)
|
||||
return 6378137 * math.sqrt(dx*dx + dy*dy) - self.dist_offset
|
||||
|
||||
def __len__(self):
|
||||
return 2
|
||||
|
||||
def __getitem__(self, i):
|
||||
if i == 0:
|
||||
return self.lon
|
||||
elif i == 1:
|
||||
return self.lat
|
||||
else:
|
||||
raise ValueError('A SourcePoint has only lat and lon in a list')
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.id == other.id
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self.id)
|
||||
|
||||
def __repr__(self):
|
||||
return 'SourcePoint({}, {}, {}, offset={}, tags={})'.format(
|
||||
self.id, self.lat, self.lon, self.dist_offset, self.tags)
|
||||
|
||||
|
||||
class OSMPoint(SourcePoint):
|
||||
"""An OSM points is a SourcePoint with a few extra fields.
|
||||
Namely, version, members (for ways and relations), and an action.
|
||||
The id is compound and created from object type and object id."""
|
||||
def __init__(self, ptype, pid, version, lat, lon, tags=None, categories=None):
|
||||
super().__init__('{}{}'.format(ptype[0], pid), lat, lon, tags)
|
||||
self.tags = {k: v for k, v in self.tags.items() if v is not None and len(v) > 0}
|
||||
self.osm_type = ptype
|
||||
self.osm_id = pid
|
||||
self.version = version
|
||||
self.members = None
|
||||
self.action = None
|
||||
self.categories = categories or set()
|
||||
self.remarks = None
|
||||
|
||||
def copy(self):
|
||||
"""Returns a copy of this object, except for members field."""
|
||||
c = OSMPoint(self.osm_type, self.osm_id, self.version, self.lat, self.lon, self.tags.copy())
|
||||
c.action = self.action
|
||||
c.remarks = self.remarks
|
||||
c.categories = self.categories.copy()
|
||||
return c
|
||||
|
||||
def is_area(self):
|
||||
return self.osm_type != 'node'
|
||||
|
||||
def is_poi(self):
|
||||
if self.osm_type == 'node':
|
||||
return True
|
||||
if self.osm_type == 'way' and len(self.members) > 2:
|
||||
return self.members[0] == self.members[-1]
|
||||
if self.osm_type == 'relation' and len(self.members) > 0:
|
||||
return self.tags.get('type', None) == 'multipolygon'
|
||||
return False
|
||||
|
||||
def to_xml(self):
|
||||
"""Produces an XML out of the point data. Disregards the "action" field."""
|
||||
el = etree.Element(self.osm_type, id=str(self.osm_id), version=str(self.version))
|
||||
for tag, value in self.tags.items():
|
||||
etree.SubElement(el, 'tag', k=tag, v=value)
|
||||
|
||||
if self.osm_type == 'node':
|
||||
el.set('lat', str(self.lat))
|
||||
el.set('lon', str(self.lon))
|
||||
elif self.osm_type == 'way':
|
||||
for node_id in self.members:
|
||||
etree.SubElement(el, 'nd', ref=str(node_id))
|
||||
elif self.osm_type == 'relation':
|
||||
for member in self.members:
|
||||
m = etree.SubElement(el, 'member')
|
||||
for i, n in enumerate(('type', 'ref', 'role')):
|
||||
m.set(n, str(member[i]))
|
||||
return el
|
||||
|
||||
def __repr__(self):
|
||||
return 'OSMPoint({} {} v{}, {}, {}, action={}, tags={})'.format(
|
||||
self.osm_type, self.osm_id, self.version, self.lat, self.lon, self.action, self.tags)
|
|
@ -1,213 +0,0 @@
|
|||
import logging
|
||||
import json
|
||||
import codecs
|
||||
import requests
|
||||
import kdtree
|
||||
from io import BytesIO
|
||||
from .data import SourcePoint
|
||||
|
||||
|
||||
def read_dataset(profile, fileobj):
|
||||
"""A helper function to call a "dataset" function in the profile.
|
||||
If the fileobj is not specified, tries to download a dataset from
|
||||
an URL specified in "download_url" profile variable."""
|
||||
if not fileobj:
|
||||
url = profile.get('download_url')
|
||||
if url is None:
|
||||
logging.error('No download_url specified in the profile, '
|
||||
'please provide a dataset file with --source')
|
||||
return None
|
||||
r = requests.get(url)
|
||||
if r.status_code != 200:
|
||||
logging.error('Could not download source data: %s %s', r.status_code, r.text)
|
||||
return None
|
||||
if len(r.content) == 0:
|
||||
logging.error('Empty response from %s', url)
|
||||
return None
|
||||
fileobj = BytesIO(r.content)
|
||||
if not profile.has('dataset'):
|
||||
# The default option is to parse the source as a JSON
|
||||
try:
|
||||
data = []
|
||||
reader = codecs.getreader('utf-8')
|
||||
json_src = json.load(reader(fileobj))
|
||||
if 'features' in json_src:
|
||||
# Parse GeoJSON
|
||||
for item in json_src['features']:
|
||||
if item['geometry'].get('type') != 'Point' or 'properties' not in item:
|
||||
continue
|
||||
# Get the identifier from "id", "ref", "ref*"
|
||||
iid = item['properties'].get('id', item['properties'].get('ref'))
|
||||
if not iid:
|
||||
for k, v in item['properties'].items():
|
||||
if k.startswith('ref'):
|
||||
iid = v
|
||||
break
|
||||
if not iid:
|
||||
continue
|
||||
data.append(SourcePoint(
|
||||
iid,
|
||||
item['geometry']['coordinates'][1],
|
||||
item['geometry']['coordinates'][0],
|
||||
{k: v for k, v in item['properties'].items() if k != 'id'}))
|
||||
else:
|
||||
for item in json_src:
|
||||
data.append(SourcePoint(item['id'], item['lat'], item['lon'], item['tags']))
|
||||
return data
|
||||
except Exception:
|
||||
logging.error('Failed to parse the source as a JSON')
|
||||
return list(profile.get(
|
||||
'dataset', args=(fileobj,),
|
||||
required='returns a list of SourcePoints with the dataset'))
|
||||
|
||||
|
||||
def add_categories_to_dataset(profile, dataset):
|
||||
categories = profile.get('categories')
|
||||
if not categories:
|
||||
return
|
||||
tag = profile.get('category_tag')
|
||||
other = categories.get('other', {})
|
||||
for d in dataset:
|
||||
if tag and tag in d.tags:
|
||||
d.category = d.tags[tag]
|
||||
del d.tags[tag]
|
||||
if d.category:
|
||||
cat_tags = categories.get(d.category, other).get('tags', None)
|
||||
if cat_tags:
|
||||
d.tags.update(cat_tags)
|
||||
|
||||
|
||||
def transform_dataset(profile, dataset):
|
||||
"""Transforms tags in the dataset using the "transform" method in the profile
|
||||
or the instructions in that field in string or dict form."""
|
||||
transform = profile.get_raw('transform')
|
||||
if not transform:
|
||||
return
|
||||
if callable(transform):
|
||||
for d in dataset:
|
||||
transform(d.tags)
|
||||
return
|
||||
if isinstance(transform, str):
|
||||
# Convert string of "key=value|rule1|rule2" lines to a dict
|
||||
lines = [line.split('=', 1) for line in transform.splitlines()]
|
||||
transform = {l[0].strip(): l[1].strip() for l in lines}
|
||||
if not transform or not isinstance(transform, dict):
|
||||
return
|
||||
for key in transform:
|
||||
if isinstance(transform[key], str):
|
||||
transform[key] = [x.strip() for x in transform[key].split('|')]
|
||||
|
||||
for d in dataset:
|
||||
for key, rules in transform.items():
|
||||
if not rules:
|
||||
continue
|
||||
value = None
|
||||
if callable(rules):
|
||||
# The value can be generated
|
||||
value = rules(None if key not in d.tags else d.tags[key])
|
||||
if value is None and key in d.tags:
|
||||
del d.tags[key]
|
||||
elif not rules[0]:
|
||||
# Use the value of the tag
|
||||
if key in d.tags:
|
||||
value = d.tags[key]
|
||||
elif not isinstance(rules[0], str):
|
||||
# If the value is not a string, use it
|
||||
value = str(rules[0])
|
||||
elif rules[0][0] == '.':
|
||||
# Use the value from another tag
|
||||
alt_key = rules[0][1:]
|
||||
if alt_key in d.tags:
|
||||
value = d.tags[alt_key]
|
||||
elif rules[0][0] == '>':
|
||||
# Replace the key
|
||||
if key in d.tags:
|
||||
d.tags[rules[0][1:]] = d.tags[key]
|
||||
del d.tags[key]
|
||||
elif rules[0][0] == '<':
|
||||
# Replace the key, the same but backwards
|
||||
alt_key = rules[0][1:]
|
||||
if alt_key in d.tags:
|
||||
d.tags[key] = d.tags[alt_key]
|
||||
del d.tags[alt_key]
|
||||
elif rules[0] == '-':
|
||||
# Delete the tag
|
||||
if key in d.tags:
|
||||
del d.tags[key]
|
||||
else:
|
||||
# Take the value as written
|
||||
value = rules[0]
|
||||
if value is None:
|
||||
continue
|
||||
if isinstance(rules, list):
|
||||
for rule in rules[1:]:
|
||||
if rule == 'lower':
|
||||
value = value.lower()
|
||||
d.tags[key] = value
|
||||
|
||||
|
||||
def check_dataset_for_duplicates(profile, dataset, print_all=False):
|
||||
# First checking for duplicate ids and collecting tags with varying values
|
||||
ids = set()
|
||||
tags = {}
|
||||
found_duplicate_ids = False
|
||||
for d in dataset:
|
||||
if d.id in ids:
|
||||
found_duplicate_ids = True
|
||||
logging.error('Duplicate id {} in the dataset'.format(d.id))
|
||||
ids.add(d.id)
|
||||
for k, v in d.tags.items():
|
||||
if k not in tags:
|
||||
tags[k] = v
|
||||
elif tags[k] != '---' and tags[k] != v:
|
||||
tags[k] = '---'
|
||||
|
||||
# And then for near-duplicate points with similar tags
|
||||
uncond_distance = profile.get('duplicate_distance', 1)
|
||||
diff_tags = [k for k in tags if tags[k] == '---']
|
||||
kd = kdtree.create(list(dataset))
|
||||
duplicates = set()
|
||||
group = 0
|
||||
for d in dataset:
|
||||
if d.id in duplicates:
|
||||
continue
|
||||
group += 1
|
||||
dups = kd.search_knn(d, 2) # The first one will be equal to d
|
||||
if len(dups) < 2 or dups[1][0].data.distance(d) > profile.max_distance:
|
||||
continue
|
||||
for alt, _ in kd.search_knn(d, 20):
|
||||
dist = alt.data.distance(d)
|
||||
if alt.data.id != d.id and dist <= profile.max_distance:
|
||||
tags_differ = 0
|
||||
if dist > uncond_distance:
|
||||
for k in diff_tags:
|
||||
if alt.data.tags.get(k) != d.tags.get(k):
|
||||
tags_differ += 1
|
||||
if tags_differ <= len(diff_tags) / 3:
|
||||
duplicates.add(alt.data.id)
|
||||
d.exclusive_group = group
|
||||
alt.data.exclusive_group = group
|
||||
if print_all or len(duplicates) <= 5:
|
||||
is_duplicate = tags_differ <= 1
|
||||
logging.error('Dataset points %s: %s and %s',
|
||||
'duplicate each other' if is_duplicate else 'are too similar',
|
||||
d.id, alt.data.id)
|
||||
if duplicates:
|
||||
logging.error('Found %s duplicates in the dataset', len(duplicates))
|
||||
if found_duplicate_ids:
|
||||
raise KeyError('Cannot continue with duplicate ids')
|
||||
|
||||
|
||||
def add_regions(dataset, geocoder):
|
||||
if not geocoder.enabled:
|
||||
return
|
||||
if geocoder.filter:
|
||||
logging.info('Geocoding and filtering points')
|
||||
else:
|
||||
logging.info('Geocoding points')
|
||||
for i in reversed(range(len(dataset))):
|
||||
region, present = geocoder.find(dataset[i])
|
||||
if not present:
|
||||
del dataset[i]
|
||||
else:
|
||||
dataset[i].region = region
|
|
@ -1,120 +0,0 @@
|
|||
import struct
|
||||
import logging
|
||||
import os
|
||||
import kdtree
|
||||
|
||||
|
||||
class Geocoder:
|
||||
def __init__(self, profile_regions='all'):
|
||||
self.filter = None
|
||||
self.enabled = bool(profile_regions)
|
||||
if self.enabled:
|
||||
logging.info('Initializing geocoder (this will take a minute)')
|
||||
self.regions = self.parse_regions(profile_regions)
|
||||
self.tree = self.load_places_tree()
|
||||
if not self.tree:
|
||||
if callable(profile_regions):
|
||||
logging.warn('Could not read the geocoding file')
|
||||
else:
|
||||
logging.error('Could not read the geocoding file, no regions will be added')
|
||||
self.enabled = False
|
||||
|
||||
def set_filter(self, opt_regions):
|
||||
if isinstance(opt_regions, str):
|
||||
self.f_negate = opt_regions[0] in ('-', '^')
|
||||
if self.f_negate:
|
||||
opt_regions = opt_regions[1:]
|
||||
self.filter = set([r.strip() for r in opt_regions.split(',')])
|
||||
elif isinstance(opt_regions, list):
|
||||
self.f_negate = False
|
||||
self.filter = set(opt_regions)
|
||||
|
||||
def load_places_tree(self):
|
||||
class PlacePoint:
|
||||
def __init__(self, lon, lat, country, region):
|
||||
self.coord = (lon, lat)
|
||||
self.country = country
|
||||
self.region = region
|
||||
|
||||
def __len__(self):
|
||||
return len(self.coord)
|
||||
|
||||
def __getitem__(self, i):
|
||||
return self.coord[i]
|
||||
|
||||
def unpack_coord(data):
|
||||
if data[-1] > 0x7f:
|
||||
data += b'\xFF'
|
||||
else:
|
||||
data += b'\0'
|
||||
return struct.unpack('<l', data)[0] / 10000
|
||||
|
||||
filename = os.path.join(os.getcwd(), os.path.dirname(__file__), 'places.bin')
|
||||
if not os.path.exists(filename):
|
||||
return None
|
||||
places = []
|
||||
with open(filename, 'rb') as f:
|
||||
countries = []
|
||||
cnt = struct.unpack('B', f.read(1))[0]
|
||||
for i in range(cnt):
|
||||
countries.append(struct.unpack('2s', f.read(2))[0].decode('ascii'))
|
||||
regions = []
|
||||
cnt = struct.unpack('<h', f.read(2))[0]
|
||||
for i in range(cnt):
|
||||
l = struct.unpack('B', f.read(1))[0]
|
||||
regions.append(f.read(l).decode('ascii'))
|
||||
dlon = f.read(3)
|
||||
while len(dlon) == 3:
|
||||
dlat = f.read(3)
|
||||
country = struct.unpack('B', f.read(1))[0]
|
||||
region = struct.unpack('<h', f.read(2))[0]
|
||||
places.append(PlacePoint(unpack_coord(dlon), unpack_coord(dlat),
|
||||
countries[country], regions[region]))
|
||||
dlon = f.read(3)
|
||||
if not places:
|
||||
return None
|
||||
return kdtree.create(places)
|
||||
|
||||
def parse_regions(self, profile_regions):
|
||||
if not profile_regions or callable(profile_regions):
|
||||
return profile_regions
|
||||
regions = profile_regions
|
||||
if regions is True or regions == 4:
|
||||
regions = 'all'
|
||||
elif regions is False or regions == 2:
|
||||
regions = []
|
||||
if isinstance(regions, str):
|
||||
regions = regions.lower()
|
||||
if regions[:3] == 'reg' or '4' in regions:
|
||||
regions = 'all'
|
||||
elif regions[:3] == 'cou' or '2' in regions:
|
||||
regions = []
|
||||
elif regions == 'some':
|
||||
regions = ['US', 'RU']
|
||||
if isinstance(regions, set):
|
||||
regions = list(regions)
|
||||
if isinstance(regions, dict):
|
||||
regions = list(regions.keys())
|
||||
if isinstance(regions, list):
|
||||
for i in regions:
|
||||
regions[i] = regions[i].upper()
|
||||
regions = set(regions)
|
||||
return regions
|
||||
|
||||
def find(self, pt):
|
||||
"""Returns a tuple of (region, present). A point should be skipped if not present."""
|
||||
region = pt.region
|
||||
if self.enabled:
|
||||
if not self.tree:
|
||||
if callable(self.regions):
|
||||
region = self.regions(pt, region)
|
||||
elif region is None:
|
||||
reg, _ = self.tree.search_nn(pt)
|
||||
if callable(self.regions):
|
||||
region = self.regions(pt, reg.data.region)
|
||||
elif self.regions == 'all' or reg.data.country in self.regions:
|
||||
region = reg.data.region
|
||||
else:
|
||||
region = reg.data.country
|
||||
|
||||
return region, not self.filter or (self.negate != (region not in self.filter))
|
391
conflate/osm.py
391
conflate/osm.py
|
@ -1,391 +0,0 @@
|
|||
import logging
|
||||
import requests
|
||||
import re
|
||||
from .data import OSMPoint
|
||||
from . import etree
|
||||
|
||||
|
||||
OVERPASS_SERVER = 'https://overpass-api.de/api/'
|
||||
ALT_OVERPASS_SERVER = 'https://overpass.kumi.systems/api/'
|
||||
OSM_API_SERVER = 'https://api.openstreetmap.org/api/0.6/'
|
||||
BBOX_PADDING = 0.003 # in degrees, ~330 m default
|
||||
|
||||
|
||||
class OsmDownloader:
|
||||
def __init__(self, profile):
|
||||
self.profile = profile
|
||||
|
||||
def set_overpass(self, server='alt'):
|
||||
global OVERPASS_SERVER
|
||||
if server == 'alt':
|
||||
OVERPASS_SERVER = ALT_OVERPASS_SERVER
|
||||
else:
|
||||
OVERPASS_SERVER = server
|
||||
|
||||
def construct_overpass_query(self, bboxes):
|
||||
"""Constructs an Overpass API query from the "query" list in the profile.
|
||||
(k, v) turns into [k=v], (k,) into [k], (k, None) into [!k], (k, "~v") into [k~v]."""
|
||||
tags = self.profile.get(
|
||||
'query', required="a list of tuples. E.g. [('amenity', 'cafe'), ('name', '~Mc.*lds')]")
|
||||
tag_strs = []
|
||||
if isinstance(tags, str):
|
||||
tag_strs = [tags]
|
||||
else:
|
||||
if not isinstance(tags[0], str) and isinstance(tags[0][0], str):
|
||||
tags = [tags]
|
||||
for tags_q in tags:
|
||||
if isinstance(tags_q, str):
|
||||
tag_strs.append(tags_q)
|
||||
continue
|
||||
tag_str = ''
|
||||
for t in tags_q:
|
||||
if len(t) == 1:
|
||||
q = '"{}"'.format(t[0])
|
||||
elif t[1] is None or len(t[1]) == 0:
|
||||
q = '"!{}"'.format(t[0])
|
||||
elif t[1][0] == '~':
|
||||
q = '"{}"~"{}",i'.format(t[0], t[1][1:])
|
||||
elif len(t) > 2:
|
||||
q = '"{}"~"^({})$"'.format(t[0], '|'.join(t[1:]))
|
||||
else:
|
||||
q = '"{}"="{}"'.format(t[0], t[1])
|
||||
tag_str += '[' + q + ']'
|
||||
tag_strs.append(tag_str)
|
||||
|
||||
if self.profile.get('no_dataset_id', False):
|
||||
ref = None
|
||||
else:
|
||||
ref = 'nwr["ref:' + self.profile.get(
|
||||
'dataset_id', required='A fairly unique id of the dataset to query OSM') + '"]'
|
||||
timeout = self.profile.get('overpass_timeout', 120)
|
||||
query = '[out:xml]{};('.format('' if timeout is None else '[timeout:{}]'.format(timeout))
|
||||
for bbox in bboxes:
|
||||
bbox_str = '' if bbox is None else '(' + ','.join([str(x) for x in bbox]) + ')'
|
||||
for tag_str in tag_strs:
|
||||
query += 'nwr' + tag_str + bbox_str + ';'
|
||||
if ref is not None:
|
||||
if not self.profile.get('bounded_update', False):
|
||||
query += ref + ';'
|
||||
else:
|
||||
for bbox in bboxes:
|
||||
bbox_str = '' if bbox is None else '(' + ','.join(
|
||||
[str(x) for x in bbox]) + ')'
|
||||
query += ref + bbox_str + ';'
|
||||
query += '); out meta qt center;'
|
||||
return query
|
||||
|
||||
def get_bbox(self, points):
|
||||
"""Plain iterates over the dataset and returns the bounding box
|
||||
that encloses it."""
|
||||
padding = self.profile.get('bbox_padding', BBOX_PADDING)
|
||||
bbox = [90.0, 180.0, -90.0, -180.0]
|
||||
for p in points:
|
||||
bbox[0] = min(bbox[0], p.lat - padding)
|
||||
bbox[1] = min(bbox[1], p.lon - padding)
|
||||
bbox[2] = max(bbox[2], p.lat + padding)
|
||||
bbox[3] = max(bbox[3], p.lon + padding)
|
||||
return bbox
|
||||
|
||||
def split_into_bboxes(self, points):
|
||||
"""
|
||||
Splits the dataset into multiple bboxes to lower load on the overpass api.
|
||||
|
||||
Returns a list of tuples (minlat, minlon, maxlat, maxlon).
|
||||
"""
|
||||
max_bboxes = self.profile.get('max_request_boxes', 4)
|
||||
if max_bboxes <= 1 or len(points) <= 1:
|
||||
return [self.get_bbox(points)]
|
||||
|
||||
# coord, alt coord, total w/h to the left/bottom, total w/h to the right/top
|
||||
lons = sorted([[d.lon, d.lat, 0, 0] for d in points])
|
||||
lats = sorted([[d.lat, d.lon, 0, 0] for d in points])
|
||||
|
||||
def update_side_dimensions(ar):
|
||||
"""For each point, calculates the maximum and
|
||||
minimum bound for all points left and right."""
|
||||
fwd_top = fwd_bottom = ar[0][1]
|
||||
back_top = back_bottom = ar[-1][1]
|
||||
for i in range(len(ar)):
|
||||
fwd_top = max(fwd_top, ar[i][1])
|
||||
fwd_bottom = min(fwd_bottom, ar[i][1])
|
||||
ar[i][2] = fwd_top - fwd_bottom
|
||||
back_top = max(back_top, ar[-i-1][1])
|
||||
back_bottom = min(back_bottom, ar[-i-1][1])
|
||||
ar[-i-1][3] = back_top - back_bottom
|
||||
|
||||
def find_max_gap(ar, h):
|
||||
"""Select an interval between points, which would give
|
||||
the maximum area if split there."""
|
||||
max_id = None
|
||||
max_gap = 0
|
||||
for i in range(len(ar) - 1):
|
||||
# "Extra" variables are for area to the left and right
|
||||
# that would be freed after splitting.
|
||||
extra_left = (ar[i][0]-ar[0][0]) * (h-ar[i][2])
|
||||
extra_right = (ar[-1][0]-ar[i+1][0]) * (h-ar[i+1][3])
|
||||
# Gap is the area of the column between points i and i+1
|
||||
# plus extra areas to the left and right.
|
||||
gap = (ar[i+1][0] - ar[i][0]) * h + extra_left + extra_right
|
||||
if gap > max_gap:
|
||||
max_id = i
|
||||
max_gap = gap
|
||||
return max_id, max_gap
|
||||
|
||||
def get_bbox(b, pad=0):
|
||||
"""Returns a list of [min_lat, min_lon, max_lat, max_lon] for a box."""
|
||||
return [b[2][0][0]-pad, b[3][0][0]-pad, b[2][-1][0]+pad, b[3][-1][0]+pad]
|
||||
|
||||
def split(box, point_array, point_id):
|
||||
"""Split the box over axis point_array at point point_id...point_id+1.
|
||||
Modifies the box in-place and returns a new box."""
|
||||
alt_array = 5 - point_array # 3->2, 2->3
|
||||
points = box[point_array][point_id+1:]
|
||||
del box[point_array][point_id+1:]
|
||||
alt = {True: [], False: []} # True means point is in new box
|
||||
for p in box[alt_array]:
|
||||
alt[(p[1], p[0]) >= (points[0][0], points[0][1])].append(p)
|
||||
|
||||
new_box = [None] * 4
|
||||
new_box[point_array] = points
|
||||
new_box[alt_array] = alt[True]
|
||||
box[alt_array] = alt[False]
|
||||
for i in range(2):
|
||||
box[i] = box[i+2][-1][0] - box[i+2][0][0]
|
||||
new_box[i] = new_box[i+2][-1][0] - new_box[i+2][0][0]
|
||||
return new_box
|
||||
|
||||
# height, width, lats, lons
|
||||
boxes = [[lats[-1][0]-lats[0][0], lons[-1][0]-lons[0][0], lats, lons]]
|
||||
initial_area = boxes[0][0] * boxes[0][1]
|
||||
while len(boxes) < max_bboxes and len(boxes) <= len(points):
|
||||
candidate_box = None
|
||||
area = 0
|
||||
point_id = None
|
||||
point_array = None
|
||||
for box in boxes:
|
||||
for ar in (2, 3):
|
||||
# Find a box and an axis for splitting that would decrease the area the most
|
||||
update_side_dimensions(box[ar])
|
||||
max_id, max_area = find_max_gap(box[ar], box[3-ar])
|
||||
if max_area > area:
|
||||
area = max_area
|
||||
candidate_box = box
|
||||
point_id = max_id
|
||||
point_array = ar
|
||||
if area * 100 < initial_area:
|
||||
# Stop splitting when the area decrease is less than 1%
|
||||
break
|
||||
logging.debug('Splitting bbox %s at %s %s..%s; area decrease %s%%',
|
||||
get_bbox(candidate_box),
|
||||
'longs' if point_array == 3 else 'lats',
|
||||
candidate_box[point_array][point_id][0],
|
||||
candidate_box[point_array][point_id+1][0],
|
||||
round(100*area/initial_area))
|
||||
boxes.append(split(candidate_box, point_array, point_id))
|
||||
|
||||
padding = self.profile.get('bbox_padding', BBOX_PADDING)
|
||||
return [get_bbox(b, padding) for b in boxes]
|
||||
|
||||
def get_categories(self, tags):
|
||||
def match_query(tags, query):
|
||||
for tag in query:
|
||||
if len(tag) == 1:
|
||||
return tag[0] in tags
|
||||
else:
|
||||
value = tags.get(tag[0], None)
|
||||
if tag[1] is None or tag[1] == '':
|
||||
return value is None
|
||||
if value is None:
|
||||
return False
|
||||
found = False
|
||||
for t2 in tag[1:]:
|
||||
if t2[0] == '~':
|
||||
if re.search(t2[1:], value):
|
||||
found = True
|
||||
elif t2[0] == '!':
|
||||
if t2[1:].lower() in value.lower():
|
||||
found = True
|
||||
elif t2 == value:
|
||||
found = True
|
||||
if found:
|
||||
break
|
||||
if not found:
|
||||
return False
|
||||
return True
|
||||
|
||||
def tags_to_query(tags):
|
||||
return [(k, v) for k, v in tags.items()]
|
||||
|
||||
result = set()
|
||||
qualifies = self.profile.get('qualifies', args=tags)
|
||||
if qualifies is not None:
|
||||
if qualifies:
|
||||
result.add(None)
|
||||
return result
|
||||
|
||||
# First check default query
|
||||
query = self.profile.get('query', None)
|
||||
if query is not None:
|
||||
if isinstance(query, str):
|
||||
result.add(None)
|
||||
else:
|
||||
if isinstance(query[0][0], str):
|
||||
query = [query]
|
||||
for q in query:
|
||||
if match_query(tags, q):
|
||||
result.add(None)
|
||||
break
|
||||
|
||||
# Then check each category if we got these
|
||||
categories = self.profile.get('categories', {})
|
||||
for name, params in categories.items():
|
||||
if 'tags' not in params and 'query' not in params:
|
||||
raise ValueError('No tags and query attributes for category "{}"'.format(name))
|
||||
if match_query(tags, params.get('query', tags_to_query(params.get('tags')))):
|
||||
result.add(name)
|
||||
|
||||
return result
|
||||
|
||||
def calc_boxes(self, dataset_points):
|
||||
profile_bbox = self.profile.get('bbox', True)
|
||||
if not profile_bbox:
|
||||
bboxes = [None]
|
||||
elif hasattr(profile_bbox, '__len__') and len(profile_bbox) == 4:
|
||||
bboxes = [profile_bbox]
|
||||
else:
|
||||
bboxes = self.split_into_bboxes(dataset_points)
|
||||
return bboxes
|
||||
|
||||
def download(self, bboxes=None):
|
||||
"""Constructs an Overpass API query and requests objects
|
||||
to match from a server."""
|
||||
if not bboxes:
|
||||
pbbox = self.profile.get('bbox', True)
|
||||
if pbbox and hasattr(pbbox, '__len__') and len(pbbox) == 4:
|
||||
bboxes = [pbbox]
|
||||
else:
|
||||
bboxes = [None]
|
||||
|
||||
query = self.construct_overpass_query(bboxes)
|
||||
logging.debug('Overpass query: %s', query)
|
||||
r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query})
|
||||
if r.encoding is None:
|
||||
r.encoding = 'utf-8'
|
||||
if r.status_code != 200:
|
||||
logging.error('Failed to download data from Overpass API: %s', r.status_code)
|
||||
if 'rate_limited' in r.text:
|
||||
r = requests.get(OVERPASS_SERVER + 'status')
|
||||
logging.warning('Seems like you are rate limited. API status:\n%s', r.text)
|
||||
else:
|
||||
logging.error('Error message: %s', r.text)
|
||||
raise IOError()
|
||||
if 'runtime error: ' in r.text:
|
||||
m = re.search(r'runtime error: ([^<]+)', r.text)
|
||||
error = 'unknown' if not m else m.group(1)
|
||||
if 'Query timed out' in error:
|
||||
logging.error(
|
||||
'Query timed out, try increasing the "overpass_timeout" profile variable')
|
||||
else:
|
||||
logging.error('Runtime error: %s', error)
|
||||
raise IOError()
|
||||
return self.parse_xml(r.content)
|
||||
|
||||
def parse_xml(self, fileobj):
|
||||
"""Parses an OSM XML file into the "osmdata" field. For ways and relations,
|
||||
finds the center. Drops objects that do not match the overpass query tags
|
||||
(see "check_against_profile_tags" method)."""
|
||||
if isinstance(fileobj, bytes):
|
||||
xml = etree.fromstring(fileobj)
|
||||
else:
|
||||
xml = etree.parse(fileobj).getroot()
|
||||
nodes = {}
|
||||
for nd in xml.findall('node'):
|
||||
nodes[nd.get('id')] = (float(nd.get('lat')), float(nd.get('lon')))
|
||||
ways = {}
|
||||
for way in xml.findall('way'):
|
||||
center = way.find('center')
|
||||
if center is not None:
|
||||
ways[way.get('id')] = [float(center.get('lat')), float(center.get('lon'))]
|
||||
else:
|
||||
logging.debug('Way %s does not have a center', way.get('id'))
|
||||
coord = [0, 0]
|
||||
count = 0
|
||||
for nd in way.findall('nd'):
|
||||
if nd.get('ref') in nodes:
|
||||
count += 1
|
||||
for i in range(len(coord)):
|
||||
coord[i] += nodes[nd.get('ref')][i]
|
||||
ways[way.get('id')] = [coord[0] / count, coord[1] / count]
|
||||
|
||||
# For calculating weight of OSM objects
|
||||
weight_fn = self.profile.get_raw('weight')
|
||||
osmdata = {}
|
||||
|
||||
for el in xml:
|
||||
tags = {}
|
||||
for tag in el.findall('tag'):
|
||||
tags[tag.get('k')] = tag.get('v')
|
||||
categories = self.get_categories(tags)
|
||||
if categories is False or categories is None or len(categories) == 0:
|
||||
continue
|
||||
|
||||
if el.tag == 'node':
|
||||
coord = nodes[el.get('id')]
|
||||
members = None
|
||||
elif el.tag == 'way':
|
||||
coord = ways[el.get('id')]
|
||||
members = [nd.get('ref') for nd in el.findall('nd')]
|
||||
elif el.tag == 'relation':
|
||||
center = el.find('center')
|
||||
if center is not None:
|
||||
coord = [float(center.get('lat')), float(center.get('lon'))]
|
||||
else:
|
||||
logging.debug('Relation %s does not have a center', el.get('id'))
|
||||
coord = [0, 0]
|
||||
count = 0
|
||||
for m in el.findall('member'):
|
||||
if m.get('type') == 'node' and m.get('ref') in nodes:
|
||||
count += 1
|
||||
for i in range(len(coord)):
|
||||
coord[i] += nodes[m.get('ref')][i]
|
||||
elif m.get('type') == 'way' and m.get('ref') in ways:
|
||||
count += 1
|
||||
for i in range(len(coord)):
|
||||
coord[i] += ways[m.get('ref')][i]
|
||||
if count > 0:
|
||||
coord = [coord[0] / count, coord[1] / count]
|
||||
members = [
|
||||
(m.get('type'), m.get('ref'), m.get('role'))
|
||||
for m in el.findall('member')
|
||||
]
|
||||
else:
|
||||
continue
|
||||
if not coord or coord == [0, 0]:
|
||||
continue
|
||||
pt = OSMPoint(
|
||||
el.tag, int(el.get('id')), int(el.get('version')),
|
||||
coord[0], coord[1], tags, categories)
|
||||
pt.members = members
|
||||
if pt.is_poi():
|
||||
if callable(weight_fn):
|
||||
weight = weight_fn(pt)
|
||||
if weight:
|
||||
if abs(weight) > 3:
|
||||
pt.dist_offset = weight
|
||||
else:
|
||||
pt.dist_offset = weight * self.profile.max_distance
|
||||
osmdata[pt.id] = pt
|
||||
return osmdata
|
||||
|
||||
|
||||
def check_moveability(changes):
|
||||
to_check = [x for x in changes if x['properties']['osm_type'] == 'node' and
|
||||
x['properties']['action'] == 'modify']
|
||||
logging.info('Checking moveability of %s modified nodes', len(to_check))
|
||||
for c in to_check:
|
||||
p = c['properties']
|
||||
p['can_move'] = False
|
||||
r = requests.get('{}node/{}/ways'.format(OSM_API_SERVER, p['osm_id']))
|
||||
if r.status_code == 200:
|
||||
xml = etree.fromstring(r.content)
|
||||
p['can_move'] = xml.find('way') is None
|
63303
conflate/places.bin
63303
conflate/places.bin
File diff suppressed because one or more lines are too long
|
@ -1,62 +0,0 @@
|
|||
import json
|
||||
from .data import SourcePoint # So we don't have to import this in profiles
|
||||
from . import etree
|
||||
|
||||
|
||||
class ProfileException(Exception):
|
||||
"""An exception class for the Profile instance."""
|
||||
def __init__(self, attr, desc):
|
||||
super().__init__('Field missing in profile: {} ({})'.format(attr, desc))
|
||||
|
||||
|
||||
class Profile:
|
||||
"""A wrapper for a profile.
|
||||
|
||||
A profile is a python script that sets a few local variables.
|
||||
These variables become properties of the profile, accessible with
|
||||
a "get" method. If something is a function, it will be called,
|
||||
optional parameters might be passed to it.
|
||||
|
||||
You can compile a list of all supported variables by grepping through
|
||||
this code, or by looking at a few example profiles. If something
|
||||
is required, you will be notified of that.
|
||||
"""
|
||||
def __init__(self, fileobj, par=None):
|
||||
global param
|
||||
param = par
|
||||
if isinstance(fileobj, dict):
|
||||
self.profile = fileobj
|
||||
elif hasattr(fileobj, 'read'):
|
||||
s = fileobj.read().replace('\r', '')
|
||||
if s[0] == '{':
|
||||
self.profile = json.loads(s)
|
||||
else:
|
||||
self.profile = {}
|
||||
exec(s, globals(), self.profile)
|
||||
else:
|
||||
# Got a class
|
||||
self.profile = {name: getattr(fileobj, name)
|
||||
for name in dir(fileobj) if not name.startswith('_')}
|
||||
self.max_distance = self.get('max_distance', 100)
|
||||
|
||||
def has(self, attr):
|
||||
return attr in self.profile
|
||||
|
||||
def get(self, attr, default=None, required=None, args=None):
|
||||
if attr in self.profile:
|
||||
value = self.profile[attr]
|
||||
if callable(value):
|
||||
if args is None:
|
||||
return value()
|
||||
else:
|
||||
return value(*args)
|
||||
else:
|
||||
return value
|
||||
if required is not None:
|
||||
raise ProfileException(attr, required)
|
||||
return default
|
||||
|
||||
def get_raw(self, attr, default=None):
|
||||
if attr in self.profile:
|
||||
return self.profile[attr]
|
||||
return default
|
|
@ -1 +1 @@
|
|||
__version__ = '1.4.1'
|
||||
__version__ = '1.3.2'
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
# This profile requires lxml package
|
||||
import logging
|
||||
import re
|
||||
|
||||
# A web page with a list of shops in Moscow. You can replace it with one for another city
|
||||
download_url = 'https://www.auchan.ru/ru/moscow/'
|
||||
source = 'auchan.ru'
|
||||
|
@ -40,8 +44,6 @@ def dataset(fileobj):
|
|||
|
||||
# We are parsing HTML, and for that we need an lxml package
|
||||
from lxml import html
|
||||
import logging
|
||||
import re
|
||||
global download_url_copy, re
|
||||
h = html.fromstring(fileobj.read().decode('utf-8'))
|
||||
shops = h.find_class('shops-in-the-city-holder')[0]
|
||||
|
|
|
@ -1,67 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
import conflate
|
||||
import requests
|
||||
import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
from yandex_parser import parse_feed
|
||||
|
||||
|
||||
class Profile:
|
||||
source = 'Азбука Вкуса'
|
||||
dataset_id = 'av'
|
||||
query = [('shop', 'convenience', 'supermarket', 'wine', 'alcohol')]
|
||||
master_tags = ('operator', 'shop', 'opening_hours', 'name', 'contact:website', 'contact:phone')
|
||||
download_url = 'https://av.ru/yandex/supermarket.xml'
|
||||
bounded_update = True
|
||||
|
||||
def matches(osmtags, avtags):
|
||||
if 'Энотека' in avtags['name']:
|
||||
return osmtags.get('shop') in ('wine', 'alcohol')
|
||||
name = osmtags.get('name')
|
||||
if osmtags.get('shop') not in ('convenience', 'supermarket'):
|
||||
return False
|
||||
if not name or re.search(r'AB|АВ|Азбука|Daily', name, re.I):
|
||||
return True
|
||||
if name.upper() in ('SPAR', 'СПАР') or 'континент' in name.lower():
|
||||
return True
|
||||
return False
|
||||
|
||||
def dataset(fileobj):
|
||||
data = []
|
||||
other_urls = [
|
||||
None,
|
||||
'http://av.ru/yandex/market.xml',
|
||||
'http://av.ru/yandex/daily.xml',
|
||||
'http://av.ru/yandex/enoteka.xml',
|
||||
]
|
||||
for url in other_urls:
|
||||
if url:
|
||||
r = requests.get(url)
|
||||
if r.status_code != 200:
|
||||
logging.error('Could not download source data: %s %s', r.status_code, r.text)
|
||||
return None
|
||||
f = BytesIO(r.content)
|
||||
else:
|
||||
f = fileobj
|
||||
for c in parse_feed(f):
|
||||
name = next(iter(c.name.values()))
|
||||
tags = {
|
||||
'name': name,
|
||||
'operator': 'ООО «Городской супермаркет»',
|
||||
'contact:phone': '; '.join(c.phones) or None,
|
||||
'contact:website': c.url_add,
|
||||
'opening_hours': c.opening_hours,
|
||||
}
|
||||
if 'Энотека' in name:
|
||||
tags['shop'] = 'wine'
|
||||
elif 'Daily' in name:
|
||||
tags['shop'] = 'convenience'
|
||||
else:
|
||||
tags['shop'] = 'supermarket'
|
||||
data.append(conflate.SourcePoint(c.id, c.lat, c.lon, tags))
|
||||
return data
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
conflate.run(Profile)
|
|
@ -2,6 +2,10 @@
|
|||
# and does not contain any useful data now.
|
||||
# So this profile is here solely for demonstration purposes.
|
||||
|
||||
import json
|
||||
import codecs
|
||||
import re
|
||||
|
||||
download_url = 'https://burgerking.ru/restaurant-locations-json-reply-new'
|
||||
source = 'Burger King'
|
||||
dataset_id = 'burger_king'
|
||||
|
@ -56,9 +60,6 @@ def dataset(fileobj):
|
|||
s = s.replace(' доб. ', '-')
|
||||
return s
|
||||
|
||||
import json
|
||||
import codecs
|
||||
import re
|
||||
notes = {
|
||||
172: 'Подвинуть на второй терминал',
|
||||
25: 'Подвинуть в ЮниМолл',
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
import json
|
||||
import logging
|
||||
import requests
|
||||
import codecs
|
||||
|
||||
source = 'opendata.mkrf.ru'
|
||||
dataset_id = 'mkrf_theaters'
|
||||
query = [('amenity', 'theatre')]
|
||||
|
@ -7,9 +12,6 @@ master_tags = ('official_name', 'phone', 'opening_hours', 'website')
|
|||
|
||||
# Reading the dataset passport to determine an URL of the latest dataset version
|
||||
def download_url():
|
||||
import logging
|
||||
import requests
|
||||
|
||||
dataset_id = '7705851331-' + (param or 'museums')
|
||||
r = requests.get('http://opendata.mkrf.ru/opendata/{}/meta.json'.format(dataset_id))
|
||||
if r.status_code != 200 or len(r.content) == 0:
|
||||
|
@ -39,9 +41,6 @@ master_tags = ('official_name', 'phone', 'opening_hours', 'website')
|
|||
|
||||
|
||||
def dataset(fileobj):
|
||||
import json
|
||||
import codecs
|
||||
|
||||
def make_wd_ranges(r):
|
||||
"""Converts e.g. [0,1,4] into 'Mo-Tu, Fr'."""
|
||||
wd = ['Mo', 'Tu', 'We', 'Th', 'Fr', 'Sa', 'Su']
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
import json
|
||||
import logging
|
||||
|
||||
source = 'dit.mos.ru'
|
||||
no_dataset_id = True
|
||||
query = [('building',)]
|
||||
query = [[('addr:housenumber',)], [('building',)]]
|
||||
max_distance = 50
|
||||
max_request_boxes = 2
|
||||
master_tags = ('addr:housenumber', 'addr:street')
|
||||
|
@ -27,16 +30,9 @@ if param:
    param = param[1:]
    if param in ADMS:
        ADM = ADMS[param]
    if param == '5':
        query = [[('addr:housenumber',)], [('building',)]]


def dataset(fileobj):
    import zipfile
    import json
    import logging
    global COMPLEX, ADM

    def find_center(geodata):
        if not geodata:
            return None
@ -65,7 +61,8 @@ def dataset(fileobj):
            return [lonlat[0]/cnt, lonlat[1]/cnt]
        return None

    logging.info('Экспортируем %s (%s)', ADM, 'строения' if COMPLEX else 'без строений')
    global COMPLEX, ADM
    import zipfile
    zf = zipfile.ZipFile(fileobj)
    data = []
    no_geodata = 0
@ -90,7 +87,7 @@ def dataset(fileobj):
        ctype = el.get('L2_TYPE')
        stroenie = el.get('L3_VALUE')
        stype = el.get('L3_TYPE')
        if not street or not house or 'Б/Н' in house:
        if not street or not house:
            no_addr += 1
            continue
        if not lonlat:
@ -1,3 +1,25 @@
# Available modules: codecs, logging, requests, json, etree. But importing these helps catch other errors
import json
import logging


def download_url(mos_dataset_id=1421):
    import requests
    r = requests.get('https://data.mos.ru/api/datasets/expformats/?datasetId={}'.format(mos_dataset_id))
    if r.status_code != 200 or len(r.content) == 0:
        logging.error('Could not get URL for dataset: %s %s', r.status_code, r.text)
        logging.error('Please check http://data.mos.ru/opendata/{}/passport'.format(mos_dataset_id))
        return None
    url = [x for x in r.json() if x['Format'] == 'json'][0]
    version = '?'
    title = 'dataset'
    r = requests.get('https://data.mos.ru/apiproxy/opendata/{}/meta.json'.format(mos_dataset_id))
    if r.status_code == 200:
        title = r.json()['Title']
        version = r.json()['VersionNumber']
    logging.info('Downloading %s %s from %s', title, version, url['GenerationStart'])
    return 'https://op.mos.ru/EHDWSREST/catalog/export/get?id=' + url['EhdId']

# What will be put into "source" tags. Lower case please
source = 'dit.mos.ru'
# A fairly unique id of the dataset to query OSM, used for "ref:mos_parking" tags
@ -24,29 +46,8 @@ tag_unmatched = None
master_tags = ('zone:parking', 'ref', 'contact:phone', 'contact:website', 'operator')


def download_url(mos_dataset_id=1421):
    import requests
    import logging
    r = requests.get('https://data.mos.ru/api/datasets/expformats/?datasetId={}'.format(mos_dataset_id))
    if r.status_code != 200 or len(r.content) == 0:
        logging.error('Could not get URL for dataset: %s %s', r.status_code, r.text)
        logging.error('Please check http://data.mos.ru/opendata/{}/passport'.format(mos_dataset_id))
        return None
    url = [x for x in r.json() if x['Format'] == 'json'][0]
    version = '?'
    title = 'dataset'
    r = requests.get('https://data.mos.ru/apiproxy/opendata/{}/meta.json'.format(mos_dataset_id))
    if r.status_code == 200:
        title = r.json()['Title']
        version = r.json()['VersionNumber']
    logging.info('Downloading %s %s from %s', title, version, url['GenerationStart'])
    return 'https://op.mos.ru/EHDWSREST/catalog/export/get?id=' + url['EhdId']


# A list of SourcePoint objects. Initialize with (id, lat, lon, {tags}).
def dataset(fileobj):
    import json
    import logging
    import zipfile
    import re
    zf = zipfile.ZipFile(fileobj)
@ -1,3 +1,8 @@
import json
import codecs
import re
from collections import defaultdict

source = 'Navads'
dataset_id = 'navads_shell'
query = [('amenity', 'fuel')]
@ -7,11 +12,6 @@ max_request_boxes = 3


def dataset(fileobj):
    import json
    import codecs
    import re
    from collections import defaultdict

    def format_phone(ph):
        if ph and len(ph) == 13 and ph[:3] == '+44':
            if (ph[3] == '1' and ph[4] != '1' and ph[5] != '1') or ph[3:7] == '7624':
@ -1,78 +0,0 @@
download_url = 'http://www.rosinter.ru/locator/RestaurantsFeed.aspx?city=all&location=&lang=ru&brand=all&cuisine=all&metro=&hasDelivery=&isCorporate='
source = 'Rosinter'
no_dataset_id = True
max_distance = 500
query = [('amenity', 'restaurant', 'cafe', 'bar', 'pub', 'fast_food')]
overpass_timeout = 1000
duplicate_distance = -1
nearest_points = 30
master_tags = ('name', 'phone', 'amenity')

types = {
    # substr: osm_substr, amenity, cuisine
    'Costa': ['costa', 'cafe', 'coffee_shop'],
    'IL': [('patio', 'патио'), 'restaurant', 'italian'],
    'TGI': [('tgi', 'friday'), 'restaurant', 'american'],
    'Бар и': ['гриль', 'restaurant', 'american'],
    'Макд': ['мак', 'fast_food', None],
    'Раша': ['мама', 'fast_food', 'russian'],
    'Планета': ['планета', 'restaurant', 'japanese'],
    'Шика': ['шика', 'restaurant', 'asian'],
    'Свои': ['сво', 'restaurant', None],
}


def matches(osmtags, ritags):
    global types
    rname = ritags['name']
    name = osmtags.get('name', '').lower()
    for k, v in types.items():
        if k in rname:
            if isinstance(v[0], str):
                return v[0] in name
            for n in v[0]:
                if n in name:
                    return True
            return False
    logging.error('Unknown rname value: %s', rname)
    return False


def dataset(f):
    global types
    from lxml import etree
    root = etree.parse(f).getroot()
    for el in root.find('Restaurants'):
        rid = el.find('id').text
        city = el.find('city').text
        if city in ('Прага', 'Будапешт', 'Варшава', 'Баку', 'Рига'):
            continue
        brand = el.find('brand').text
        if 'TGI' in brand:
            brand = 'TGI Fridays'
        elif 'СВОИ' in brand:
            brand = 'Свои'
        phone = el.find('telephone').text
        if phone:
            phone = phone.replace('(', '').replace(')', '')
        website = el.find('siteurl').text
        if website and 'il-patio' in website:
            website = 'http://ilpatio.ru'
        if 'Свои' in brand:
            website = 'http://restoransvoi.by'
        lat = float(el.find('latitude').text)
        lon = float(el.find('longitude').text)
        tags = {
            'amenity': 'restaurant',
            'name': brand,
            'phone': phone,
            'website': website,
        }
        address = el.find('address').text
        for k, v in types.items():
            if k in brand:
                tags['amenity'] = v[1]
                tags['cuisine'] = v[2]
        yield SourcePoint(
            rid, lat, lon, tags,
            remarks='Обязательно подвиньте точку!\nАдрес: ' + str(address))
@ -1,104 +0,0 @@
download_url = 'http://new.shoko.ru/addresses/'
source = 'Шоколадница'
no_dataset_id = True
overpass_timeout = 600
max_distance = 250
max_request_boxes = 6
query = [('amenity',), ('name', '~Шоколадница')]
master_tags = ['amenity', 'name', 'name:ru', 'name:en', 'website', 'phone', 'opening_hours']


def dataset(fileobj):
    def parse_oh(s):
        if not s:
            return None
        olds = s
        if s.strip().lower() == 'круглосуточно':
            return '24/7'
        trans = {
            'будни': 'Mo-Fr',
            'суббота': 'Sa',
            'воскресенье': 'Su',
            'ежедневно': 'Mo-Su',
            'выходные': 'Sa-Su',
            'восерсенье': 'Su',
            'ежеденевно': 'Mo-Su',
            'пн-чтивс': 'Mo-Th,Su',
            'пн-чт,вс': 'Mo-Th,Su',
            'пт.-сб': 'Fr-Sa',
            'вск.-чт': 'Su-Th',
            'смаяпооктябрь': 'May-Oct',
            'ч.смаяпооктябрь': 'May-Oct',
            'сентября': 'May-Sep',
        }
        weekdays = {'пн': 'Mo', 'вт': 'Tu', 'ср': 'We', 'чт': 'Th', 'пт': 'Fr', 'сб': 'Sa', 'вс': 'Su'}
        if s == 'с 10 до 22' or s == 'с 10.00-22.00':
            s = '10:00 - 22:00'
        s = s.replace('круглосуточно', '00:00-24:00')
        s = s.replace('23,', '23:00')
        parts = []
        for m in re.finditer(r'([а-яА-Я ,.:\(\)-]+?)?(?:\sс)?\s*(\d?\d[:.]\d\d)(?: до |[^\w\d]+)(\d\d[:.]\d\d)', s):
            days = (m[1] or '').strip(' -.,:()').lower().replace(' ', '')
            m2 = re.match(r'^([б-ч]{2})\s?[,и-]\s?([б-ч]{2})$', days)
            if not days:
                days = 'Mo-Su'
            elif days in weekdays:
                days = weekdays[days]
            elif m2 and m2[1] in weekdays and m2[2] in weekdays:
                days = weekdays[m2[1]] + '-' + weekdays[m2[2]]
            else:
                if days not in trans:
                    logging.warn('Unknown days: %s', days)
                    continue
                days = trans[days]
            parts.append('{} {:0>5}-{}'.format(days, m[2].replace('.', ':'), m[3].replace('.', ':')))
        # logging.info('%s -> %s', olds, '; '.join(parts))
        if parts:
            return '; '.join(parts)
        return None

    from lxml import html
    import re
    import logging
    import phonenumbers
    h = html.fromstring(fileobj.read().decode('utf-8'))
    markers = h.get_element_by_id('markers')
    i = 0
    for m in markers:
        lat = m.get('data-lat')
        lon = m.get('data-lng')
        if not lat or not lon:
            continue
        oh = parse_oh(m.get('data-time'))
        phone = m.get('data-phone')
        if phone[:3] == '812':
            phone = '+7' + phone
        if ' 891' in phone:
            phone = phone[:phone.index(' 891')]
        if ' 8-91' in phone:
            phone = phone[:phone.index(' 8-91')]
        try:
            if phone == 'отключен' or not phone:
                phone = None
            else:
                parsed_phone = phonenumbers.parse(phone.replace(';', ',').split(',')[0], "RU")
        except:
            logging.info(phone)
            raise
        if phone is None:
            fphone = None
        else:
            fphone = phonenumbers.format_number(
                parsed_phone, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
        tags = {
            'amenity': 'cafe',
            'name': 'Шоколадница',
            'name:ru': 'Шоколадница',
            'name:en': 'Shokoladnitsa',
            'website': 'http://shoko.ru',
            'cuisine': 'coffee_shop',
            'phone': fphone,
            'opening_hours': oh
        }
        i += 1
        yield SourcePoint(i, float(lat), float(lon), tags, remarks=m.get('data-title'))
@ -1,3 +1,7 @@
import codecs
import json
import logging

# Where to get the latest feed
download_url = 'http://www.velobike.ru/proxy/parkings/'
# What to write for the changeset's source tag
@ -25,10 +29,6 @@ master_tags = ('ref', 'capacity', 'capacity:electric', 'contact:email',


def dataset(fileobj):
    import codecs
    import json
    import logging

    # Specifying utf-8 is important, otherwise you'd get "bytes" instead of "str"
    source = json.load(codecs.getreader('utf-8')(fileobj))
    data = []
@ -1,139 +0,0 @@
from lxml import etree
import logging
import re
import phonenumbers # https://pypi.python.org/pypi/phonenumberslite


class Company:
    def __init__(self, cid):
        self.id = cid
        self.name = {}
        self.alt_name = {}
        self.address = {}
        self.country = {}
        self.address_add = {}
        self.opening_hours = None
        self.url = None
        self.url_add = None
        self.url_ext = None
        self.email = None
        self.rubric = []
        self.phones = []
        self.faxes = []
        self.photos = []
        self.lat = None
        self.lon = None
        self.other = {}


def parse_feed(f):
    def multilang(c, name):
        for el in company.findall(name):
            lang = el.get('lang', 'default')
            value = el.text
            if value and len(value.strip()) > 0:
                c[lang] = value.strip()

    def parse_subels(el):
        res = {}
        if el is None:
            return res
        for subel in el:
            name = subel.tag
            text = subel.text
            if text and text.strip():
                res[name] = text
        return res

    def parse_opening_hours(s):
        if 'углосуточн' in s:
            return '24/7'
        m = re.search(r'([01]?\d:\d\d).*?([12]?\d:\d\d)', s)
        if m:
            # TODO: parse weekdays
            start = m.group(1)
            start = re.sub(r'^(\d:)', r'0\1', start)
            end = m.group(2)
            end = re.sub(r'0?0:', '24:', end)
            return 'Mo-Su {}-{}'.format(start, end)
        # TODO
        return None

    xml = etree.parse(f).getroot()
    if xml.tag != 'companies':
        logging.error('Root node must be named "companies", not %s', xml.tag)
    for company in xml:
        if company.tag != 'company':
            logging.warn('Non-company in yandex xml: %s', company.tag)
            continue
        cid = company.find('company-id')
        if cid is None or not cid.text:
            logging.error('No id for a company')
            continue
        c = Company(cid.text.strip())
        multilang(c.name, 'name')
        multilang(c.alt_name, 'name-other')
        multilang(c.address, 'address')
        loc = {}
        multilang(loc, 'locality-name')
        if loc:
            for lng, place in loc.items():
                if lng in c.address:
                    c.address[lng] = place + ', ' + c.address[lng]
        multilang(c.address_add, 'address-add')
        multilang(c.country, 'country')
        coord = parse_subels(company.find('coordinates'))
        if 'lat' in coord and 'lon' in coord:
            c.lat = float(coord['lat'])
            c.lon = float(coord['lon'])
        else:
            logging.warn('No coordinates for %s', c.id)
            continue
        for ph in company.findall('phone'):
            phone = parse_subels(ph)
            if 'number' not in phone:
                continue
            parsed_phone = phonenumbers.parse(phone['number'], 'RU')
            number = phonenumbers.format_number(
                parsed_phone, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
            if 'ext' in phone:
                number += ' ext. ' + phone['ext']
            typ = phone.get('type', 'phone')
            if typ == 'fax':
                c.faxes.append(number)
            else:
                c.phones.append(number)
        email = company.find('email')
        if email is not None and email.text:
            c.email = email.text.strip()
        url = company.find('url')
        if url is not None and url.text:
            c.url = url.text.strip()
        url_add = company.find('add-url')
        if url_add is not None and url_add.text:
            c.url_add = url_add.text.strip()
        url_ext = company.find('info-page')
        if url_ext is not None and url_ext.text:
            c.url_ext = url_ext.text.strip()
        for rub in company.findall('rubric-rd'):
            if rub.text:
                c.rubric.append(int(rub.text.strip()))
        coh = company.find('working-time')
        if coh is not None and coh.text:
            c.opening_hours = parse_opening_hours(coh.text)
        photos = company.find('photos')
        if photos is not None:
            for photo in photos:
                if photo.get('type', 'interior') != 'food':
                    c.photos.append(photo.get('url'))
        for feat in company:
            if feat.tag.startswith('feature-'):
                name = feat.get('name', None)
                value = feat.get('value', None)
                if name is not None and value is not None:
                    if feat.tag == 'feature-boolean':
                        value = value == '1'
                    elif '-numeric' in feat.tag:
                        value = float(value)
                    c.other[name] = value
        yield c
@ -1,16 +0,0 @@
# Scripts

Here are some (one at the moment) scripts to prepare data for the conflator
or do stuff after conflating.

## pack_places.py

Prepares `places.bin` file for the geocoder. Requires three JSON files:

* places.json
* regions.json
* countries.json

These comprise the "places feed" and can be prepared using
[these scripts](https://github.com/mapsme/geocoding_data). You can
find a link to a ready-made feed in that repository.
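
If it helps to see the shapes involved, here is a tiny, made-up feed that matches what `pack_places.py` (shown below) reads; the identifiers and coordinates are illustrative assumptions, not real data.

```python
# A made-up, minimal "places feed" that only illustrates the expected shapes;
# real feeds come from the geocoding_data scripts linked above.
import json

with open('countries.json', 'w') as f:
    json.dump({'10': {'iso': 'US'}}, f)      # two-letter ISO code per country id
with open('regions.json', 'w') as f:
    json.dump({'1': {'iso': 'US-CA'}}, f)    # region ISO code per region id
with open('places.json', 'w') as f:
    json.dump({'100': {'lat': 37.77, 'lon': -122.42,
                       'country': 10,        # country id from countries.json
                       'region': 1}}, f)     # optional region id
# Then run: python3 pack_places.py .  (writes places.bin to the current directory)
```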
@ -1,41 +0,0 @@
#!/usr/bin/env python3
import json
import struct
import os
import sys


def pack_coord(coord):
    data = struct.pack('<l', round(coord * 10000))
    return data[:-1]

if len(sys.argv) < 2:
    path = '.'
else:
    path = sys.argv[1]

with open(os.path.join(path, 'regions.json'), 'r') as f:
    regions = [(r, int(rid)) for rid, r in json.load(f).items() if r.get('iso')]
reg_idx = {regions[i][1]: i for i in range(len(regions))}
with open(os.path.join(path, 'countries.json'), 'r') as f:
    countries = [(r, int(rid)) for rid, r in json.load(f).items() if r.get('iso')]
c_idx = {countries[i][1]: i for i in range(len(countries))}
with open(os.path.join(path, 'places.json'), 'r') as f:
    places = json.load(f)

out = open('places.bin', 'wb')
out.write(struct.pack('B', len(countries)))
for c, _ in countries:
    out.write(struct.pack('2s', c['iso'].encode('ascii')))
out.write(struct.pack('<h', len(regions)))
for r, _ in regions:
    rname = r['iso'].encode('ascii')
    out.write(struct.pack('B', len(rname)))
    out.write(rname)
for pl in places.values():
    if pl['country'] not in c_idx:
        continue
    out.write(pack_coord(pl['lon']))
    out.write(pack_coord(pl['lat']))
    out.write(struct.pack('B', c_idx[pl['country']]))
    out.write(struct.pack('<h', reg_idx.get(pl.get('region'), -1)))
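
Not from the repository: a rough sketch of reading `places.bin` back, with the record layout inferred from the packing calls above (3-byte little-endian coordinates, one country index byte, a 2-byte region index); useful only as a sanity check of the output.

import struct

def unpack_coord(b):
    # pack_coord() keeps the low 3 bytes of a little-endian int32 of coord*10000,
    # so read them back sign-extended and divide again.
    return int.from_bytes(b, 'little', signed=True) / 10000

with open('places.bin', 'rb') as f:
    countries = [f.read(2).decode('ascii') for _ in range(f.read(1)[0])]
    nregions = struct.unpack('<h', f.read(2))[0]
    regions = [f.read(f.read(1)[0]).decode('ascii') for _ in range(nregions)]
    while True:
        rec = f.read(9)  # 3 + 3 + 1 + 2 bytes per place
        if len(rec) < 9:
            break
        lon, lat = unpack_coord(rec[0:3]), unpack_coord(rec[3:6])
        region_idx = struct.unpack('<h', rec[7:9])[0]
        print(lat, lon, countries[rec[6]], regions[region_idx] if region_idx >= 0 else None)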
1
setup.py
@ -10,7 +10,6 @@ setup(
    author='Ilya Zverev',
    author_email='ilya@zverev.info',
    packages=['conflate'],
    package_data={'conflate': ['places.bin']},
    install_requires=[
        'kdtree',
        'requests',