Initial commit, transferred from the omim repository

This commit is contained in:
Ilya Zverev 2017-04-11 12:56:10 +03:00
parent 7114208d8d
commit 631f828134
8 changed files with 687 additions and 0 deletions

31
README.md Normal file
View file

@ -0,0 +1,31 @@
# mwm.py
It is a Python library for reading the contents of MAPS.ME mwm files. Not everything
is supported, but you can get at least all the features and their attributes.
We at MAPS.ME use this library for analytics and maintenance.
## Installation
pip install mwm
## Usage
Just add `import mwm` to your script, and read an mwm file with:
```python
with open('file.mwm', 'rb') as f:
    data = mwm.MWM(f)
```
## Tools
There are some useful tools in the `tools` directory, which can also serve as
examples of using the library:
* `dump_mwm.py` prints the header and some statistics on an mwm file.
* `find_feature.py` can find features inside an mwm by type or name.
* `ft2osm.py` converts a feature id to an OSM website link.
## License
Written by Ilya Zverev for MAPS.ME. Published under the Apache License 2.0.

2
mwm/__init__.py Normal file
View file

@ -0,0 +1,2 @@
from .mwm import MWM, Osm2Ft
from .mwmfile import MWMFile

305
mwm/mwm.py Normal file
View file

@ -0,0 +1,305 @@
# MWM Reader Module
from .mwmfile import MWMFile
from datetime import datetime
# Unprocessed sections: geomN, trgN, idx, sdx (search index), addr (search address), offs (feature offsets - succinct)
# Routing sections: mercedes (matrix), daewoo (edge data), infinity (edge id), skoda (shortcuts), chrysler (cross context), ftseg, node2ftseg
# (these mostly are succinct structures, except chrysler and node2ftseg, so no use trying to load them here)
# TODO:
# - Predictive reading of LineStrings
# - Find why polygon geometry is incorrect in iter_features()
# - Find feature ids in the 'dat' section, or find a way to read the 'offs' section
class MWM(MWMFile):
    """Reader for the sections of a MAPS.ME ``.mwm`` file.

    The constructor loads the section directory and the header; the other
    ``read_*`` methods and ``iter_features()`` seek to the section they need,
    so they can be called in any order on the same open file.
    """

    # indexer/feature_meta.hpp
    metadata = ["0",
                "cuisine", "open_hours", "phone_number", "fax_number", "stars",
                "operator", "url", "website", "internet", "ele",
                "turn_lanes", "turn_lanes_forward", "turn_lanes_backward", "email", "postcode",
                "wikipedia", "maxspeed", "flats", "height", "min_height",
                "denomination", "building_levels", "test_id", "ref:sponsored", "price_rate",
                "rating", "fuel", "routes"]

    regiondata = ["languages", "driving", "timezone", "addr_fmt", "phone_fmt", "postcode_fmt", "holidays", "housenames"]

    # Values of the map type field in the 'header' section.
    _MAP_TYPES = {0: 'world', 1: 'worldcoasts', 2: 'country'}

    def __init__(self, f):
        """Open an mwm from *f*, a binary file object positioned anywhere."""
        MWMFile.__init__(self, f)
        # Filled by read_types(); created first so it exists whatever happens
        # while reading the file below.
        self.type_mapping = []
        self.read_tags()
        # Called for its side effects: sets coord_size and base_point.
        self.read_header()

    def _language(self, code):
        """Return the language name for *code*, or *code* itself if unknown."""
        if code < len(self.languages):
            return self.languages[code]
        return code

    def _meta_key(self, code):
        """Return the metadata key for *code*, or the code as a string."""
        if code < len(self.metadata):
            return self.metadata[code]
        return str(code)

    def read_types(self, filename):
        """Append type names from a types.txt file to ``self.type_mapping``.

        Blank lines are skipped and '|' separators are replaced with '-'.
        """
        with open(filename, 'r') as ft:
            for line in ft:
                name = line.strip()
                if name:
                    self.type_mapping.append(name.replace('|', '-'))

    def read_version(self):
        """Reads 'version' section.

        Returns a dict with 'fmt' (format number) and 'version' (datetime).
        """
        self.seek_tag('version')
        self.f.read(4)  # skip prolog
        fmt = self.read_varuint() + 1
        version = self.read_varuint()
        if version < 161231:
            # Small values are a date written as the number YYMMDD.
            version = datetime(2000 + version // 10000, version // 100 % 100, version % 100)
        else:
            # Larger values are treated as a unix timestamp.
            version = datetime.fromtimestamp(version)
        return {'fmt': fmt, 'version': version}

    def read_header(self):
        """Reads 'header' section.

        Sets ``self.coord_size`` and ``self.base_point`` as a side effect and
        returns a dict with 'basePoint', 'bounds', 'scales', 'langs' and
        'mapType'. Files without a header get a default coord_size and an
        empty dict.
        """
        if not self.has_tag('header'):
            # Stub for routing files
            self.coord_size = (1 << 30) - 1
            return {}
        self.seek_tag('header')
        result = {}
        coord_bits = self.read_varuint()
        self.coord_size = (1 << coord_bits) - 1
        self.base_point = self.mwm_bitwise_split(self.read_varuint())
        result['basePoint'] = self.to_4326(self.base_point)
        result['bounds'] = self.read_bounds()
        result['scales'] = self.read_uint_array()
        # Translate each language code (not its position in the array) to a
        # name; codes outside the table are kept as numbers.
        result['langs'] = [self._language(code) for code in self.read_uint_array()]
        map_type = self.read_varint()
        if map_type in self._MAP_TYPES:
            result['mapType'] = self._MAP_TYPES[map_type]
        else:
            result['mapType'] = 'unknown: {0}'.format(map_type)
        return result

    # COMPLEX READERS

    def read_region_info(self):
        """Reads 'rgninfo' section into a dict keyed by ``regiondata`` names.

        The 'languages' value is converted to a list of language names, one
        per character of the stored string.
        """
        if not self.has_tag('rgninfo'):
            return {}
        fields = {}
        self.seek_tag('rgninfo')
        for _ in range(self.read_varuint()):
            t = self.read_varuint()
            key = self.regiondata[t] if t < len(self.regiondata) else str(t)
            value = self.read_string()
            if key == 'languages':
                value = [self._language(ord(x)) for x in value]
            fields[key] = value
        return fields

    def _read_meta_record(self):
        """Read one metadata record in the format used since v8."""
        fields = {}
        for _ in range(self.read_varuint()):
            key = self._meta_key(self.read_varuint())
            value = self.read_string()
            if key == 'fuel':
                value = value.split('\x01')
            fields[key] = value
        return fields

    def _read_meta_record_legacy(self):
        """Read one metadata record in the pre-v8 format.

        Every entry is a key byte (high bit marks the last entry), a length
        byte and that many bytes of UTF-8 text.
        """
        fields = {}
        while True:
            t = self.read_uint(1)
            is_last = t & 0x80 > 0
            key = self._meta_key(t & 0x7f)
            length = self.read_uint(1)
            fields[key] = self.f.read(length).decode('utf-8')
            if is_last:
                return fields

    def read_metadata(self):
        """Reads 'meta' and 'metaidx' sections.

        Returns a dict mapping feature id to a dict of metadata fields.
        Returns an empty dict when either section is missing.
        """
        if not self.has_tag('metaidx') or not self.has_tag('meta'):
            return {}
        # Metadata format is different since v8
        fmt = self.read_version()['fmt']

        # First, read metaidx, to match featureId <-> metadata
        self.seek_tag('metaidx')
        ftid_by_offset = {}
        while self.inside_tag('metaidx'):
            ftid = self.read_uint(4)
            moffs = self.read_uint(4)
            # When several features point at one offset keep the first one.
            ftid_by_offset.setdefault(moffs, ftid)

        # Now read metadata
        self.seek_tag('meta')
        metadatar = {}
        while self.inside_tag('meta'):
            # Offset of this record inside 'meta'; metaidx refers to it.
            tag_pos = self.tag_offset('meta')
            if fmt >= 8:
                fields = self._read_meta_record()
            else:
                fields = self._read_meta_record_legacy()
            if fields and tag_pos in ftid_by_offset:
                metadatar[ftid_by_offset[tag_pos]] = fields
        return metadatar

    def read_crossmwm(self):
        """Reads 'chrysler' section (cross-mwm routing table).

        Returns a dict with 'in', 'out', 'matrix' and 'neighbours', or an
        empty dict when the section is missing.
        """
        if not self.has_tag('chrysler'):
            return {}
        self.seek_tag('chrysler')

        # Ingoing nodes: array of (nodeId, coord) tuples
        incoming_count = self.read_uint(4)
        incoming = []
        for _ in range(incoming_count):
            node_id = self.read_uint(4)
            point = self.read_coord(False)
            incoming.append((node_id, point))

        # Outgoing nodes: array of (nodeId, coord, outIndex) tuples
        # outIndex is an index in neighbours array
        outgoing_count = self.read_uint(4)
        outgoing = []
        for _ in range(outgoing_count):
            node_id = self.read_uint(4)
            point = self.read_coord(False)
            out_index = self.read_uint(1)
            outgoing.append((node_id, point, out_index))

        # Adjacency matrix: costs of routes for each (incoming, outgoing) tuple
        matrix = []
        for _ in range(incoming_count):
            matrix.append([self.read_uint(4) for _ in range(outgoing_count)])

        # List of mwms to which leads each outgoing node
        neighbours = []
        for _ in range(self.read_uint(4)):
            size = self.read_uint(4)
            neighbours.append(self.f.read(size).decode('utf-8'))

        return {'in': incoming, 'out': outgoing, 'matrix': matrix, 'neighbours': neighbours}

    def _read_feature_header(self):
        """Read a feature header; returns ``(header dict, geometry type)``."""
        header = {}
        header_bits = self.read_uint(1)
        types_count = (header_bits & 0x07) + 1
        has_name = header_bits & 0x08 > 0
        has_layer = header_bits & 0x10 > 0
        has_addinfo = header_bits & 0x80 > 0
        geom_type = header_bits & 0x60

        types = []
        for _ in range(types_count):
            type_id = self.read_varuint()
            if type_id < len(self.type_mapping):
                types.append(self.type_mapping[type_id])
            else:
                types.append(str(type_id + 1))  # So the numbers match with mapcss-mapping.csv
        header['types'] = types

        if has_name:
            header['name'] = self.read_multilang()
        if has_layer:
            header['layer'] = self.read_uint(1)
        if has_addinfo:
            # The meaning of the additional info depends on the geometry type.
            if geom_type == MWM.GeomType.POINT:
                header['rank'] = self.read_uint(1)
            elif geom_type == MWM.GeomType.LINE:
                header['ref'] = self.read_string()
            elif geom_type == MWM.GeomType.AREA or geom_type == MWM.GeomType.POINT_EX:
                header['house'] = self.read_numeric_string()
        return header, geom_type

    def iter_features(self, metadata=False):
        """Reads 'dat' section.

        Yields one dict per feature with 'id', 'size', 'header' and
        'geometry'; 'metadata' is added when *metadata* is true and the
        feature has some. Only POINT geometries get coordinates.
        """
        if not self.has_tag('dat'):
            return
        # TODO: read 'offs'?
        md = {}
        if metadata:
            md = self.read_metadata()
        self.seek_tag('dat')
        ftid = -1
        while self.inside_tag('dat'):
            ftid += 1
            feature = {'id': ftid}
            feature_size = self.read_varuint()
            # Remember where the next record starts: the geometry is not
            # parsed, so the position is restored after every feature.
            next_feature = self.f.tell() + feature_size
            feature['size'] = feature_size

            # Header
            header, geom_type = self._read_feature_header()
            feature['header'] = header

            # Metadata
            if ftid in md:
                feature['metadata'] = md[ftid]

            # Geometry
            geometry = {}
            if geom_type == MWM.GeomType.POINT or geom_type == MWM.GeomType.POINT_EX:
                geometry['type'] = 'Point'
            elif geom_type == MWM.GeomType.LINE:
                geometry['type'] = 'LineString'
            elif geom_type == MWM.GeomType.AREA:
                geometry['type'] = 'Polygon'
            if geom_type == MWM.GeomType.POINT:
                geometry['coordinates'] = list(self.read_coord())
            feature['geometry'] = geometry

            # Disabled: geometry of lines and polygons is not decoded yet.
            # NOTE(review): the extent of this disabled block is reconstructed.
            # The OSM ids are kept inside it because reading them without
            # first consuming the geometry bytes would misparse the record.
            # Confirm against the upstream file.
            if False:
                if geom_type != MWM.GeomType.POINT:
                    polygon_count = self.read_varuint()
                    polygons = []
                    for i in range(polygon_count):
                        count = self.read_varuint()
                        buf = self.f.read(count)
                        # TODO: decode
                    geometry['coordinates'] = polygons
                    feature['coastCell'] = self.read_varint()

                # OSM IDs
                count = self.read_varuint()
                osmids = []
                for i in range(count):
                    osmid = self.read_osmid()
                    osmids.append('{0}{1}'.format(osmid[0], osmid[1]))
                feature['osmIds'] = osmids

            if self.f.tell() > next_feature:
                raise Exception('Feature parsing error, read too much')
            yield feature
            self.f.seek(next_feature)
class Osm2Ft(MWMFile):
    """Mapping between OSM ids and feature ids read from an ``.osm2ft`` file.

    Behaves like a read-only dict: keys are OSM ids and values are feature
    ids, or the other way round when *ft2osm* is true.
    """

    def __init__(self, f, ft2osm=False, tuples=True):
        """Read the whole mapping from the binary file object *f*.

        ft2osm -- key the mapping by feature id instead of OSM id.
        tuples -- store OSM ids as (type, id) tuples instead of raw numbers.
        """
        MWMFile.__init__(self, f)
        self.read(ft2osm, tuples)

    def read(self, ft2osm=False, tuples=True):
        """Reads mwm.osm2ft file, returning a dict of feature id <-> osm way id.

        The result is stored in ``self.data``. Records whose OSM id cannot
        be unpacked (None) are skipped.
        """
        count = self.read_varuint()
        self.data = {}
        self.ft2osm = ft2osm
        for _ in range(count):
            osmid = self.read_osmid(tuples)
            fid = self.read_uint(4)
            self.read_uint(4)  # filler
            if osmid is None:
                continue
            if ft2osm:
                self.data[fid] = osmid
            else:
                self.data[osmid] = fid

    def __getitem__(self, k):
        """Return the mapped value, or None when *k* is unknown."""
        return self.data.get(k)

    def __repr__(self):
        # Explicit field indices keep this working on Python 2.6.
        direction = 'ft2osm' if self.ft2osm else 'osm2ft'
        return '{0} with {1} items'.format(direction, len(self.data))

    def __len__(self):
        return len(self.data)

    def __contains__(self, k):
        return k in self.data

    def __iter__(self):
        return iter(self.data)

218
mwm/mwmfile.py Normal file
View file

@ -0,0 +1,218 @@
# MWM Reader Module
import struct
import math
class MWMFile(object):
    """Low-level binary reader shared by the MAPS.ME file parsers.

    Wraps a binary file object and provides primitives for the section
    directory ("tags"), fixed and variable length integers, strings, packed
    coordinates and OSM ids.
    """

    # coding/multilang_utf8_string.cpp
    languages = ["default",
                 "en", "ja", "fr", "ko_rm", "ar", "de", "int_name", "ru", "sv", "zh", "fi", "be", "ka", "ko",
                 "he", "nl", "ga", "ja_rm", "el", "it", "es", "zh_pinyin", "th", "cy", "sr", "uk", "ca", "hu",
                 "hsb", "eu", "fa", "br", "pl", "hy", "kn", "sl", "ro", "sq", "am", "fy", "cs", "gd", "sk",
                 "af", "ja_kana", "lb", "pt", "hr", "fur", "vi", "tr", "bg", "eo", "lt", "la", "kk", "gsw",
                 "et", "ku", "mn", "mk", "lv", "hi"]

    # struct formats per integer width. The '<' prefix pins little-endian
    # byte order and standard sizes, so the result does not depend on the
    # host (native order gave the same values on little-endian machines).
    _UINT_FORMATS = {1: '<B', 2: '<H', 4: '<I', 8: '<Q'}

    # (bit pattern, bytes to skip) for lead bytes inside a multilang string,
    # widest pattern first.
    _UTF8_SKIP = ((0xFE, 6), (0xFC, 5), (0xF8, 4), (0xF0, 3), (0xE0, 2), (0xC0, 1))

    class GeomType:
        """Geometry type values as stored in bits 5-6 of a feature header."""
        POINT = 0
        LINE = 1 << 5
        AREA = 1 << 6
        POINT_EX = 3 << 5

    class OsmIdCode:
        """Bit flags that mark the OSM object type inside a 64-bit id."""
        NODE = 0x4000000000000000
        WAY = 0x8000000000000000
        RELATION = 0xC000000000000000
        # Mask that clears the type bits and leaves the numeric id.
        RESET = ~(NODE | WAY | RELATION)

    def __init__(self, f):
        """Wrap *f*, a binary file object supporting read(), seek(), tell()."""
        self.f = f
        self.tags = {}
        self.coord_size = None
        self.base_point = (0, 0)

    # SECTION DIRECTORY

    def read_tags(self):
        """Load the section directory into ``self.tags``.

        The first 8 bytes of the file hold the offset of the directory. Each
        entry is a name followed by the offset and length of the section;
        they are stored as ``name -> (offset, length)``.
        """
        self.f.seek(0)
        self.f.seek(self.read_uint(8))
        cnt = self.read_varuint()
        for _ in range(cnt):
            name = self.read_string(plain=True)
            offset = self.read_varuint()
            length = self.read_varuint()
            self.tags[name] = (offset, length)

    def has_tag(self, tag):
        """Tell whether section *tag* exists and is not empty."""
        return tag in self.tags and self.tags[tag][1] > 0

    def seek_tag(self, tag):
        """Move the file position to the start of section *tag*."""
        self.f.seek(self.tags[tag][0])

    def tag_offset(self, tag):
        """Return the current file position relative to the start of *tag*."""
        return self.f.tell() - self.tags[tag][0]

    def inside_tag(self, tag):
        """Tell whether the current file position lies inside section *tag*."""
        return 0 <= self.tag_offset(tag) < self.tags[tag][1]

    # INTEGERS

    def read_uint(self, bytelen=1):
        """Read a little-endian unsigned integer of 1, 2, 4 or 8 bytes.

        Raises Exception for any other *bytelen* and struct.error when the
        file ends before *bytelen* bytes could be read.
        """
        fmt = self._UINT_FORMATS.get(bytelen)
        if fmt is None:
            raise Exception('Bytelen {0} is not supported'.format(bytelen))
        return struct.unpack(fmt, self.f.read(bytelen))[0]

    def read_varuint(self):
        """Read a variable-length unsigned integer, 7 bits per byte.

        The high bit of a byte says that another byte follows. If the file
        ends in the middle of a number, what was read so far is returned.
        """
        res = 0
        shift = 0
        while True:
            b = self.f.read(1)
            if not b:
                return res
            bc = ord(b)
            res |= (bc & 0x7F) << shift
            shift += 7
            if bc < 0x80:
                return res

    @staticmethod
    def zigzag_decode(uint):
        """Decode a zigzag-encoded integer: 0, 1, 2, 3 -> 0, -1, 1, -2."""
        # -(uint & 1) is 0 for even and -1 (all ones) for odd input, so the
        # XOR either keeps the halved value or turns it into -(value) - 1.
        return (uint >> 1) ^ -(uint & 1)

    def read_varint(self):
        """Read a zigzag-encoded signed variable-length integer."""
        return self.zigzag_decode(self.read_varuint())

    # OSM IDS

    @staticmethod
    def unpack_osmid(num):
        """Split a 64-bit OSM id into ``(type letter, numeric id)``.

        The letter is 'n', 'w' or 'r'. Returns None when no type bit is set.
        """
        codes = MWMFile.OsmIdCode
        # RELATION has both type bits set, so it has to be tested first.
        if num & codes.RELATION == codes.RELATION:
            typ = 'r'
        elif num & codes.WAY == codes.WAY:
            typ = 'w'
        elif num & codes.NODE == codes.NODE:
            typ = 'n'
        else:
            return None
        return typ, num & codes.RESET

    def read_osmid(self, as_tuple=True):
        """Read an 8-byte OSM id; unpacked to a tuple unless *as_tuple* is false."""
        osmid = self.read_uint(8)
        return self.unpack_osmid(osmid) if as_tuple else osmid

    # COORDINATES

    def mwm_unshuffle(self, x):
        """De-interleave a 32-bit value: even bits to the low half, odd bits to the high half."""
        x = ((x & 0x22222222) << 1) | ((x >> 1) & 0x22222222) | (x & 0x99999999)
        x = ((x & 0x0C0C0C0C) << 2) | ((x >> 2) & 0x0C0C0C0C) | (x & 0xC3C3C3C3)
        x = ((x & 0x00F000F0) << 4) | ((x >> 4) & 0x00F000F0) | (x & 0xF00FF00F)
        x = ((x & 0x0000FF00) << 8) | ((x >> 8) & 0x0000FF00) | (x & 0xFF0000FF)
        return x

    def mwm_bitwise_split(self, v):
        """Split a bit-interleaved 64-bit value into an ``(x, y)`` pair."""
        hi = self.mwm_unshuffle(v >> 32)
        lo = self.mwm_unshuffle(v & 0xFFFFFFFF)
        x = ((hi & 0xFFFF) << 16) | (lo & 0xFFFF)
        y = (hi & 0xFFFF0000) | (lo >> 16)
        return (x, y)

    def mwm_decode_delta(self, v, ref):
        """Decode *v* as a zigzag-encoded (dx, dy) offset from the point *ref*."""
        dx, dy = self.mwm_bitwise_split(v)
        return ref[0] + self.zigzag_decode(dx), ref[1] + self.zigzag_decode(dy)

    def read_point(self, ref, packed=True):
        """Reads an unsigned point, returns (x, y).

        The value is a varuint when *packed*, a fixed 8-byte integer
        otherwise, and is decoded as a delta from *ref*.
        """
        if packed:
            u = self.read_varuint()
        else:
            u = self.read_uint(8)
        return self.mwm_decode_delta(u, ref)

    def to_4326(self, point):
        """Convert a point in maps.me-mercator CS to WGS-84 (EPSG:4326).

        Raises Exception when ``coord_size`` has not been set yet.
        """
        if self.coord_size is None:
            raise Exception('Call read_header() first.')
        merc_bounds = (-180.0, -180.0, 180.0, 180.0)  # Xmin, Ymin, Xmax, Ymax
        # Scale the integer grid coordinates into the mercator bounds.
        x = point[0] * (merc_bounds[2] - merc_bounds[0]) / self.coord_size + merc_bounds[0]
        y = point[1] * (merc_bounds[3] - merc_bounds[1]) / self.coord_size + merc_bounds[1]
        # Mercator y (in degrees) to latitude.
        y = 360.0 * math.atan(math.tanh(y * math.pi / 360.0)) / math.pi
        return (x, y)

    def read_coord(self, packed=True):
        """Reads a pair of coords in degrees mercator, returns (lon, lat)."""
        point = self.read_point(self.base_point, packed)
        return self.to_4326(point)

    def read_bounds(self):
        """Reads mercator bounds, returns (min_lon, min_lat, max_lon, max_lat)."""
        rmin = self.mwm_bitwise_split(self.read_varint())
        rmax = self.mwm_bitwise_split(self.read_varint())
        pmin = self.to_4326(rmin)
        pmax = self.to_4326(rmax)
        return (pmin[0], pmin[1], pmax[0], pmax[1])

    # STRINGS AND ARRAYS

    def read_string(self, plain=False, decode=True):
        """Read a length-prefixed string.

        The stored length is one less than the real one unless *plain* is
        true. Returns text decoded as UTF-8, or raw bytes when *decode* is
        false.
        """
        length = self.read_varuint() + (0 if plain else 1)
        s = self.f.read(length)
        return s.decode('utf-8') if decode else s

    def read_uint_array(self):
        """Read a varuint count followed by that many varuints; returns a list."""
        length = self.read_varuint()
        return [self.read_varuint() for _ in range(length)]

    def read_numeric_string(self):
        """Read a string that may be stored as a plain number.

        An odd prefix carries the number itself in its upper bits; an even
        prefix carries the length (minus one) of the UTF-8 text that follows.
        """
        sz = self.read_varuint()
        if sz & 1 != 0:
            return str(sz >> 1)
        sz = (sz >> 1) + 1
        return self.f.read(sz).decode('utf-8')

    @classmethod
    def _multilang_next(cls, data, i):
        """Return the index of the first language marker after position *i*.

        *data* is a bytearray. Markers are bytes of the form 10xxxxxx; bytes
        belonging to multi-byte UTF-8 characters are skipped so that they are
        not mistaken for markers. Returns len(data) or more when there is no
        further marker.
        """
        i += 1
        while i < len(data):
            c = data[i]
            if c & 0xC0 == 0x80:
                break
            # Bytes below 0x80 match none of the patterns and skip nothing.
            for mask, extra in cls._UTF8_SKIP:
                if c & mask == mask:
                    i += extra
                    break
            i += 1
        return i

    def read_multilang(self):
        """Read a multilingual string; returns a dict language -> text.

        Chunks whose language code is not in ``languages`` are dropped.
        """
        # A bytearray yields integers on both Python 2 and 3.
        data = bytearray(self.read_string(decode=False))
        langs = {}
        i = 0
        while i < len(data):
            n = self._multilang_next(data, i)
            lng = data[i] & 0x3F
            if lng < len(self.languages):
                langs[self.languages[lng]] = data[i + 1:n].decode('utf-8')
            i = n
        return langs

28
setup.py Normal file
View file

@ -0,0 +1,28 @@
import io
from os import path

from setuptools import setup

here = path.abspath(path.dirname(__file__))

# Read the README explicitly as UTF-8 (the platform default encoding may
# differ) and close the file handle as soon as the text is loaded.
with io.open(path.join(here, 'README.md'), encoding='utf-8') as readme:
    long_description = readme.read()

setup(
    name='mwm',
    version='0.9.0',
    author='Ilya Zverev',
    author_email='ilya@zverev.info',
    packages=['mwm'],
    url='http://pypi.python.org/pypi/mwm/',
    license='Apache License 2.0',
    description='Library to read binary MAPS.ME files.',
    long_description=long_description,
    # The README is Markdown; without this PyPI renders it as reStructuredText.
    long_description_content_type='text/markdown',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Utilities',
        'Environment :: Console',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
    ],
)

40
tools/dump_mwm.py Executable file
View file

@ -0,0 +1,40 @@
#!/usr/bin/python
import sys, os.path, random
import json
from mwm import MWM
if len(sys.argv) < 2:
    print('Dumps some MWM structures.')
    print('Usage: {0} <country.mwm>'.format(sys.argv[0]))
    sys.exit(1)

# The file has to stay open for the whole run: every reader below seeks in it.
with open(sys.argv[1], 'rb') as f:
    mwm = MWM(f)
    mwm.read_types(os.path.join(os.path.dirname(sys.argv[0]),
                                '..', '..', '..', '..', 'data', 'types.txt'))

    print('Tags:')
    # Sections sorted by their offset in the file.
    tvv = sorted(((k, v[0], v[1]) for k, v in mwm.tags.items()), key=lambda x: x[1])
    for tv in tvv:
        print('  {0:<8}: offs {1:9} len {2:8}'.format(tv[0], tv[1], tv[2]))

    v = mwm.read_version()
    print('Format: {0}, version: {1}'.format(v['fmt'], v['version'].strftime('%Y-%m-%d %H:%M')))
    print('Header: {0}'.format(mwm.read_header()))
    print('Region Info: {0}'.format(mwm.read_region_info()))
    print('Metadata count: {0}'.format(len(mwm.read_metadata())))

    cross = mwm.read_crossmwm()
    if cross:
        print('Outgoing points: {0}, incoming: {1}'.format(len(cross['out']), len(cross['in'])))
        print('Outgoing regions: {0}'.format(set(cross['neighbours'])))

    # Print some random features using reservoir sampling
    count = 5
    sample = []
    # Tracked separately from the loop index, so that an empty map prints 0
    # and a non-empty one prints the number of features, not the last index.
    feature_count = 0
    for i, feature in enumerate(mwm.iter_features()):
        feature_count = i + 1
        if i < count:
            sample.append(feature)
        elif random.randint(0, i) < count:
            sample[random.randint(0, count - 1)] = feature

    print('Feature count: {0}'.format(feature_count))
    print('Sample features:')
    for feature in sample:
        print(json.dumps(feature, ensure_ascii=False))

41
tools/find_feature.py Executable file
View file

@ -0,0 +1,41 @@
#!/usr/bin/env python
import sys, os.path, json
from mwm import MWM
if len(sys.argv) < 4:
    print('Finds features in an mwm file based on a query')
    print('Usage: {0} <country.mwm> <type> <string>'.format(sys.argv[0]))
    print('')
    print('Type:')
    print('  t for inside types ("t hwtag" will find all hwtags-*)')
    print('  et for exact type ("et shop" won\'t find shop-chemist)')
    print('  n for names, case-sensitive ("n Starbucks" for all starbucks)')
    print('  m for metadata keys ("m flats" for features with flats)')
    print('  id for feature id ("id 1234" for feature #1234)')
    sys.exit(1)

typ = sys.argv[2].lower()
find = sys.argv[3]
# Command line arguments are byte strings on Python 2 and text on Python 3;
# only the former need decoding.
if isinstance(find, bytes):
    find = find.decode('utf-8')
# Parse the requested id once instead of for every feature.
find_id = int(find) if typ == 'id' else None

with open(sys.argv[1], 'rb') as f:
    # The constructor already reads the tags and the header.
    mwm = MWM(f)
    mwm.read_types(os.path.join(os.path.dirname(sys.argv[0]),
                                '..', '..', '..', '..', 'data', 'types.txt'))

    for i, feature in enumerate(mwm.iter_features(metadata=True)):
        found = False
        if typ == 'n' and 'name' in feature['header']:
            found = any(find in value for value in feature['header']['name'].values())
        elif typ in ('t', 'et'):
            # 'et' needs an exact match, 't' also accepts a substring.
            found = any(t == find or (typ == 't' and find in t)
                        for t in feature['header']['types'])
        elif typ == 'm' and 'metadata' in feature:
            found = find in feature['metadata']
        elif typ == 'id' and i == find_id:
            found = True

        if found:
            line = json.dumps(feature, ensure_ascii=False, sort_keys=True)
            if sys.version_info[0] < 3:
                # Python 2 needs bytes to print non-ASCII text reliably.
                line = line.encode('utf-8')
            print(line)
            if typ == 'id':
                # Feature ids are unique, nothing more to find.
                break

22
tools/ft2osm.py Executable file
View file

@ -0,0 +1,22 @@
#!/usr/bin/env python
import sys
import mwm
if len(sys.argv) < 3:
    print('Finds an OSM object for a given feature id.')
    print('Usage: {0} <mwm.osm2ft> <ftid>'.format(sys.argv[0]))
    sys.exit(1)

with open(sys.argv[1], 'rb') as f:
    # True: key the mapping by feature id.
    ft2osm = mwm.Osm2Ft(f, True)

code = 0
type_abbr = {'n': 'node', 'w': 'way', 'r': 'relation'}
for ftid in sys.argv[2:]:
    ftid = int(ftid)
    # Indexing returns None for unknown feature ids.
    osmid = ft2osm[ftid]
    if osmid is not None:
        print('https://www.openstreetmap.org/{0}/{1}'.format(type_abbr[osmid[0]], osmid[1]))
    else:
        print('Could not find osm id for feature {0}'.format(ftid))
        # Exit status 2 tells the caller that at least one id was not found.
        code = 2
sys.exit(code)