Merge pull request #1 from mapsme/newcode2

Initial commit, transferred from the omim repository
This commit is contained in:
Ilya Zverev 2017-06-08 11:50:11 +03:00 committed by GitHub
commit 0a1fba1d4c
8 changed files with 731 additions and 0 deletions

2
.gitignore vendored
View file

@ -1,3 +1,5 @@
build/
__pycache__/
*.pyc
dist/
*.egg*

9
CHANGELOG.md Normal file
View file

@ -0,0 +1,9 @@
# mwm.py Change Log
## master branch
## 0.9.0
_Released 2017-06-08_
The initial release with some features.

39
README.rst Normal file
View file

@ -0,0 +1,39 @@
mwm.py
======
This is a Python library for reading the contents of MAPS.ME mwm files.
Not everything is supported, but you can at least get all the features
and their attributes. We at MAPS.ME use this script for analytics and
maintenance.
Installation
------------
::

    pip install mwm

Usage
-----
Just add ``import mwm`` to your script, and read an mwm file with:

.. code:: python

    with open('file.mwm', 'rb') as f:
        data = mwm.MWM(f)

Tools
-----
The package installs the ``mwmtool`` command-line script. It shows
statistics about an MWM file, can search for features or convert ids.
Run it with ``-h`` to see a list of options.
The script source can serve as a library usage example.
License
-------
Written by Ilya Zverev for MAPS.ME. Published under the Apache License
2.0.

2
mwm/__init__.py Normal file
View file

@ -0,0 +1,2 @@
from .mwm import MWM, Osm2Ft, __version__
from .mwmfile import MWMFile

306
mwm/mwm.py Normal file
View file

@ -0,0 +1,306 @@
# MWM Reader Module
from .mwmfile import MWMFile
from datetime import datetime
# Package version string; re-exported from mwm/__init__.py and imported by
# setup.py to fill in the distribution version.
__version__ = '0.9.0'
# Unprocessed sections: geomN, trgN, idx, sdx (search index),
# addr (search address), offs (feature offsets - succinct)
# TODO:
# - Predictive reading of LineStrings
# - Find why polygon geometry is incorrect in iter_features()
class MWM(MWMFile):
    """Reader for the sections of a MAPS.ME ``.mwm`` file.

    The constructor reads the tag table and the header, so coordinates can
    be decoded right away. The other sections are read on demand by the
    ``read_*`` methods and by ``iter_features()``.
    """

    # indexer/feature_meta.hpp
    metadata = ["0",
                "cuisine", "open_hours", "phone_number", "fax_number", "stars",
                "operator", "url", "website", "internet", "ele",
                "turn_lanes", "turn_lanes_forward", "turn_lanes_backward", "email", "postcode",
                "wikipedia", "maxspeed", "flats", "height", "min_height",
                "denomination", "building_levels", "test_id", "ref:sponsored", "price_rate",
                "rating", "fuel", "routes"]

    regiondata = ["languages", "driving", "timezone", "addr_fmt", "phone_fmt",
                  "postcode_fmt", "holidays", "housenames"]

    def __init__(self, f):
        """Wrap the binary file object *f* and read its tags and header."""
        MWMFile.__init__(self, f)
        # Filled by read_types(); indexes are the type ids stored in features.
        self.type_mapping = []
        self.read_tags()
        self.read_header()

    @staticmethod
    def _name_for(table, index):
        """Return table[index], or the index as a string when out of range."""
        return table[index] if index < len(table) else str(index)

    def read_types(self, filename):
        """Append type names from a text file to ``type_mapping``.

        Each non-blank line is one type; surrounding whitespace is stripped
        and every ``|`` is replaced with ``-``.
        """
        with open(filename, 'r') as ft:
            for line in ft:
                name = line.strip()
                if name:
                    self.type_mapping.append(name.replace('|', '-'))

    def read_version(self):
        """Reads 'version' section.

        Returns a dict with 'fmt' (stored format number plus one),
        'version' (an integer in YYMMDD form) and 'date' (a datetime).
        """
        self.seek_tag('version')
        self.f.read(4)  # skip prolog
        fmt = self.read_varuint() + 1
        version = self.read_varuint()
        if version < 161231:
            # Small values are already YYMMDD numbers.
            vdate = datetime(2000 + version // 10000, (version // 100) % 100, version % 100)
        else:
            # Larger values are treated as a timestamp and converted back
            # to the YYMMDD form.
            vdate = datetime.fromtimestamp(version)
            version = int(vdate.strftime('%y%m%d'))
        return {'fmt': fmt, 'version': version, 'date': vdate}

    def read_header(self):
        """Reads 'header' section.

        Sets ``coord_size`` and ``base_point`` as a side effect and returns
        a dict with 'basePoint', 'bounds', 'scales', 'langs' and 'mapType'.
        When the file has no header, ``coord_size`` gets a 30-bit default
        and an empty dict is returned.
        """
        if not self.has_tag('header'):
            # Stub for routing files
            self.coord_size = (1 << 30) - 1
            return {}
        self.seek_tag('header')
        result = {}
        coord_bits = self.read_varuint()
        self.coord_size = (1 << coord_bits) - 1
        self.base_point = self.mwm_bitwise_split(self.read_varuint())
        result['basePoint'] = self.to_4326(self.base_point)
        result['bounds'] = self.read_bounds()
        result['scales'] = self.read_uint_array()
        # Translate language codes to names. The bound is checked against the
        # code itself (not its position in the array), so an unknown code is
        # left as a number instead of raising IndexError.
        result['langs'] = [
            self.languages[code] if code < len(self.languages) else code
            for code in self.read_uint_array()
        ]
        map_type = self.read_varint()
        map_types = {0: 'world', 1: 'worldcoasts', 2: 'country'}
        result['mapType'] = map_types.get(map_type, 'unknown: {0}'.format(map_type))
        return result

    # COMPLEX READERS

    def read_region_info(self):
        """Reads 'rgninfo' section into a dict of field name -> value.

        Field ids outside ``regiondata`` are keyed by their number as a
        string. The 'languages' value is converted to a list of language
        names, one per character of the stored string. Returns an empty
        dict when the section is missing.
        """
        if not self.has_tag('rgninfo'):
            return {}
        fields = {}
        self.seek_tag('rgninfo')
        sz = self.read_varuint()
        for _ in range(sz):
            t = self._name_for(self.regiondata, self.read_varuint())
            fields[t] = self.read_string()
            if t == 'languages':
                fields[t] = [self.languages[ord(x)] for x in fields[t]]
        return fields

    def read_metadata(self):
        """Reads 'meta' and 'metaidx' sections.

        Returns a dict of feature id -> dict of metadata fields, or an
        empty dict when there is no 'metaidx' section.
        """
        if not self.has_tag('metaidx'):
            return {}
        # Metadata format is different since v8
        fmt = self.read_version()['fmt']
        # First, read metaidx, to match featureId <-> metadata
        self.seek_tag('metaidx')
        ftid_meta = []
        while self.inside_tag('metaidx'):
            ftid = self.read_uint(4)
            moffs = self.read_uint(4)
            ftid_meta.append((moffs, ftid))
        # Sort by offset, so the index can be walked in step with the
        # sequential read of the 'meta' section below.
        ftid_meta.sort(key=lambda x: x[0])
        ftpos = 0
        # Now read metadata
        self.seek_tag('meta')
        metadatar = {}
        while self.inside_tag('meta'):
            # Offset of this record inside the section, taken before reading.
            tag_pos = self.tag_offset('meta')
            fields = {}
            if fmt >= 8:
                sz = self.read_varuint()
                for _ in range(sz):
                    t = self._name_for(self.metadata, self.read_varuint())
                    fields[t] = self.read_string()
                    if t == 'fuel':
                        fields[t] = fields[t].split('\x01')
            else:
                while True:
                    t = self.read_uint(1)
                    # The high bit marks the last field of the record.
                    is_last = (t & 0x80) > 0
                    t = self._name_for(self.metadata, t & 0x7f)
                    length = self.read_uint(1)
                    fields[t] = self.f.read(length).decode('utf-8')
                    if is_last:
                        break
            if fields:
                while ftpos < len(ftid_meta) and ftid_meta[ftpos][0] < tag_pos:
                    ftpos += 1
                if ftpos < len(ftid_meta) and ftid_meta[ftpos][0] == tag_pos:
                    metadatar[ftid_meta[ftpos][1]] = fields
        return metadatar

    def read_crossmwm(self):
        """Reads 'chrysler' section (cross-mwm routing table).

        Returns a dict with 'in', 'out', 'matrix' and 'neighbours' keys,
        or an empty dict when the section is missing.
        """
        if not self.has_tag('chrysler'):
            return {}
        self.seek_tag('chrysler')
        # Ingoing nodes: array of (nodeId, coord) tuples
        incomingCount = self.read_uint(4)
        incoming = []
        for _ in range(incomingCount):
            nodeId = self.read_uint(4)
            point = self.read_coord(False)
            incoming.append((nodeId, point))
        # Outgoing nodes: array of (nodeId, coord, outIndex) tuples
        # outIndex is an index in neighbours array
        outgoingCount = self.read_uint(4)
        outgoing = []
        for _ in range(outgoingCount):
            nodeId = self.read_uint(4)
            point = self.read_coord(False)
            outIndex = self.read_uint(1)
            outgoing.append((nodeId, point, outIndex))
        # Adjacency matrix: costs of routes for each (incoming, outgoing) tuple
        matrix = []
        for _ in range(incomingCount):
            matrix.append([self.read_uint(4) for _ in range(outgoingCount)])
        # List of mwms to which leads each outgoing node
        neighboursCount = self.read_uint(4)
        neighbours = []
        for _ in range(neighboursCount):
            size = self.read_uint(4)
            neighbours.append(self.f.read(size).decode('utf-8'))
        return {'in': incoming, 'out': outgoing, 'matrix': matrix, 'neighbours': neighbours}

    def iter_features(self, metadata=False):
        """Reads 'dat' section, yielding one dict per feature.

        Each dict has 'id' (sequential, starting at 0), 'size', 'header'
        and 'geometry'; 'metadata' is added when *metadata* is true and the
        feature has some. Coordinates are only decoded for plain points.
        Raises Exception when parsing runs past the end of a feature.
        """
        if not self.has_tag('dat'):
            return
        # TODO: read 'offs'?
        md = {}
        if metadata:
            md = self.read_metadata()
        self.seek_tag('dat')
        ftid = -1
        while self.inside_tag('dat'):
            ftid += 1
            feature = {'id': ftid}
            feature_size = self.read_varuint()
            # Remember where the next feature starts, so partially parsed
            # features can be skipped.
            next_feature = self.f.tell() + feature_size
            feature['size'] = feature_size

            # Header
            header = {}
            header_bits = self.read_uint(1)
            types_count = (header_bits & 0x07) + 1
            has_name = (header_bits & 0x08) > 0
            has_layer = (header_bits & 0x10) > 0
            has_addinfo = (header_bits & 0x80) > 0
            geom_type = header_bits & 0x60
            types = []
            for _ in range(types_count):
                type_id = self.read_varuint()
                if type_id < len(self.type_mapping):
                    types.append(self.type_mapping[type_id])
                else:
                    types.append(str(type_id + 1))  # So the numbers match with mapcss-mapping.csv
            header['types'] = types
            if has_name:
                header['name'] = self.read_multilang()
            if has_layer:
                header['layer'] = self.read_uint(1)
            if has_addinfo:
                if geom_type == MWM.GeomType.POINT:
                    header['rank'] = self.read_uint(1)
                elif geom_type == MWM.GeomType.LINE:
                    header['ref'] = self.read_string()
                elif geom_type == MWM.GeomType.AREA or geom_type == MWM.GeomType.POINT_EX:
                    header['house'] = self.read_numeric_string()
            feature['header'] = header

            # Metadata
            if ftid in md:
                feature['metadata'] = md[ftid]

            # Geometry
            geometry = {}
            if geom_type == MWM.GeomType.POINT or geom_type == MWM.GeomType.POINT_EX:
                geometry['type'] = 'Point'
            elif geom_type == MWM.GeomType.LINE:
                geometry['type'] = 'LineString'
            elif geom_type == MWM.GeomType.AREA:
                geometry['type'] = 'Polygon'
            if geom_type == MWM.GeomType.POINT:
                geometry['coordinates'] = list(self.read_coord())

            # (flipping table emoticon)
            feature['geometry'] = geometry
            # Deliberately disabled: decoding of the rest of the feature
            # payload is unfinished, so it is kept only as a starting point.
            if False:
                if geom_type != MWM.GeomType.POINT:
                    polygon_count = self.read_varuint()
                    polygons = []
                    for i in range(polygon_count):
                        count = self.read_varuint()
                        buf = self.f.read(count)
                        # TODO: decode
                    geometry['coordinates'] = polygons
                feature['coastCell'] = self.read_varint()

                # OSM IDs
                count = self.read_varuint()
                osmids = []
                for i in range(count):
                    osmid = self.read_osmid()
                    osmids.append('{0}{1}'.format(osmid[0], osmid[1]))
                feature['osmIds'] = osmids

            if self.f.tell() > next_feature:
                raise Exception('Feature parsing error, read too much')
            yield feature
            self.f.seek(next_feature)
class Osm2Ft(MWMFile):
    """In-memory mapping loaded from an ``.mwm.osm2ft`` file.

    Maps OSM ids to feature ids, or feature ids to OSM ids when created
    with ``ft2osm=True``. Supports ``len()``, ``in``, iteration over keys
    and item access (which yields None for a missing key).
    """

    def __init__(self, f, ft2osm=False, tuples=True):
        MWMFile.__init__(self, f)
        self.read(ft2osm, tuples)

    def read(self, ft2osm=False, tuples=True):
        """Reads mwm.osm2ft file into self.data (feature id <-> osm id).

        With *tuples* true, OSM ids are unpacked by read_osmid() and records
        for which it yields None are dropped; otherwise raw numbers are kept.
        """
        total = self.read_varuint()
        self.data = {}
        self.ft2osm = ft2osm
        for _ in range(total):
            osm_id = self.read_osmid(tuples)
            feature_id = self.read_uint(4)
            self.read_uint(4)  # filler
            if osm_id is None:
                continue
            if ft2osm:
                key, value = feature_id, osm_id
            else:
                key, value = osm_id, feature_id
            self.data[key] = value

    def __getitem__(self, k):
        return self.data.get(k)

    def __repr__(self):
        direction = 'ft2osm' if self.ft2osm else 'osm2ft'
        size = len(self.data)
        return '{} with {} items'.format(direction, size)

    def __len__(self):
        return len(self.data)

    def __contains__(self, k):
        return k in self.data

    def __iter__(self):
        return iter(self.data)

218
mwm/mwmfile.py Normal file
View file

@ -0,0 +1,218 @@
# MWM Reader Module
import struct
import math
class MWMFile(object):
    """Low-level reader for the binary primitives used in MAPS.ME files.

    Wraps a seekable binary file object and decodes fixed-width and
    variable-length integers, strings, OSM ids, packed coordinates and
    multilingual names. It also keeps the table of tagged sections and
    helps to navigate between them.
    """

    # coding/multilang_utf8_string.cpp
    languages = ["default",
                 "en", "ja", "fr", "ko_rm", "ar", "de", "int_name", "ru", "sv", "zh", "fi", "be", "ka", "ko",
                 "he", "nl", "ga", "ja_rm", "el", "it", "es", "zh_pinyin", "th", "cy", "sr", "uk", "ca", "hu",
                 "hsb", "eu", "fa", "br", "pl", "hy", "kn", "sl", "ro", "sq", "am", "fy", "cs", "gd", "sk",
                 "af", "ja_kana", "lb", "pt", "hr", "fur", "vi", "tr", "bg", "eo", "lt", "la", "kk", "gsw",
                 "et", "ku", "mn", "mk", "lv", "hi"]

    # struct formats for read_uint(). The '<' prefix pins little-endian byte
    # order and standard sizes, so results do not depend on the host CPU.
    _UINT_FORMATS = {1: '<B', 2: '<H', 4: '<I', 8: '<Q'}

    def __init__(self, f):
        """Wrap *f*, a file object opened in binary mode."""
        self.f = f
        self.tags = {}  # section name -> (offset, length)
        self.coord_size = None  # has to be set before to_4326() can be used
        self.base_point = (0, 0)  # reference point for read_coord()

    def read_tags(self):
        """Read the table of sections into ``self.tags``.

        The first 8 bytes of the file hold the offset of the table; each
        entry is a name, an offset and a length.
        """
        self.f.seek(0)
        self.f.seek(self.read_uint(8))
        cnt = self.read_varuint()
        for _ in range(cnt):
            name = self.read_string(plain=True)
            offset = self.read_varuint()
            length = self.read_varuint()
            self.tags[name] = (offset, length)

    def has_tag(self, tag):
        """Return True if the section is listed and is not empty."""
        return tag in self.tags and self.tags[tag][1] > 0

    def seek_tag(self, tag):
        """Move the file position to the start of the section."""
        self.f.seek(self.tags[tag][0])

    def tag_offset(self, tag):
        """Return the current file position relative to the section start."""
        return self.f.tell() - self.tags[tag][0]

    def inside_tag(self, tag):
        """Return True while the file position is within the section."""
        return 0 <= self.tag_offset(tag) < self.tags[tag][1]

    def read_uint(self, bytelen=1):
        """Read a little-endian unsigned integer of 1, 2, 4 or 8 bytes.

        Raises Exception for any other *bytelen*, and struct.error when
        fewer than *bytelen* bytes are left in the file.
        """
        fmt = self._UINT_FORMATS.get(bytelen)
        if fmt is None:
            raise Exception('Bytelen {0} is not supported'.format(bytelen))
        return struct.unpack(fmt, self.f.read(bytelen))[0]

    def read_varuint(self):
        """Read a variable-length unsigned integer.

        Every byte carries 7 bits of the value, least significant group
        first; the high bit is set on all bytes except the last one. At the
        end of file, whatever has been accumulated so far is returned.
        """
        res = 0
        shift = 0
        more = True
        while more:
            b = self.f.read(1)
            if not b:
                return res
            try:
                bc = ord(b)
            except TypeError:
                bc = b
            res |= (bc & 0x7F) << shift
            shift += 7
            more = bc >= 0x80
        return res

    @staticmethod
    def zigzag_decode(uint):
        """Decode a ZigZag-encoded integer: 0, 1, 2, 3... -> 0, -1, 1, -2...

        Odd inputs map to ``-(n + 1) / 2``. Negating the halved value
        instead would be off by one and could never produce -1.
        """
        return (uint >> 1) ^ -(uint & 1)

    def read_varint(self):
        """Read a signed integer stored as a ZigZag-encoded varuint."""
        return self.zigzag_decode(self.read_varuint())

    class GeomType:
        # Values of the geometry type bits (mask 0x60) in a feature header.
        POINT = 0
        LINE = 1 << 5
        AREA = 1 << 6
        POINT_EX = 3 << 5

    class OsmIdCode:
        # The two highest bits of a 64-bit id encode the OSM object type.
        NODE = 0x4000000000000000
        WAY = 0x8000000000000000
        RELATION = 0xC000000000000000
        RESET = ~(NODE | WAY | RELATION)

    @staticmethod
    def unpack_osmid(num):
        """Split a packed id into a ('n' | 'w' | 'r', number) tuple.

        Returns None when neither of the type bits is set.
        """
        codes = MWMFile.OsmIdCode
        # RELATION has both bits set, so it must be tested first.
        if num & codes.RELATION == codes.RELATION:
            typ = 'r'
        elif num & codes.WAY == codes.WAY:
            typ = 'w'
        elif num & codes.NODE == codes.NODE:
            typ = 'n'
        else:
            return None
        return typ, num & codes.RESET

    def read_osmid(self, as_tuple=True):
        """Read an 8-byte OSM id; unpack it unless *as_tuple* is false."""
        osmid = self.read_uint(8)
        return self.unpack_osmid(osmid) if as_tuple else osmid

    def mwm_unshuffle(self, x):
        """Gather the even bits of a 32-bit value into its lower half and
        the odd bits into its upper half."""
        x = ((x & 0x22222222) << 1) | ((x >> 1) & 0x22222222) | (x & 0x99999999)
        x = ((x & 0x0C0C0C0C) << 2) | ((x >> 2) & 0x0C0C0C0C) | (x & 0xC3C3C3C3)
        x = ((x & 0x00F000F0) << 4) | ((x >> 4) & 0x00F000F0) | (x & 0xF00FF00F)
        x = ((x & 0x0000FF00) << 8) | ((x >> 8) & 0x0000FF00) | (x & 0xFF0000FF)
        return x

    def mwm_bitwise_split(self, v):
        """Split a 64-bit value with interleaved bits into an (x, y) pair."""
        hi = self.mwm_unshuffle(v >> 32)
        lo = self.mwm_unshuffle(v & 0xFFFFFFFF)
        x = ((hi & 0xFFFF) << 16) | (lo & 0xFFFF)
        y = (hi & 0xFFFF0000) | (lo >> 16)
        return (x, y)

    def mwm_decode_delta(self, v, ref):
        """Decode a packed, ZigZag-encoded offset *v* from the point *ref*."""
        x, y = self.mwm_bitwise_split(v)
        return ref[0] + self.zigzag_decode(x), ref[1] + self.zigzag_decode(y)

    def read_point(self, ref, packed=True):
        """Reads an unsigned point, returns (x, y).

        The point is stored as a delta from *ref*, either as a varuint
        (*packed*) or as a fixed 8-byte integer.
        """
        if packed:
            u = self.read_varuint()
        else:
            u = self.read_uint(8)
        return self.mwm_decode_delta(u, ref)

    def to_4326(self, point):
        """Convert a point in maps.me-mercator CS to WGS-84 (EPSG:4326).

        Raises Exception when ``coord_size`` has not been set yet.
        """
        if self.coord_size is None:
            raise Exception('Call read_header() first.')
        merc_bounds = (-180.0, -180.0, 180.0, 180.0)  # Xmin, Ymin, Xmax, Ymax
        x = point[0] * (merc_bounds[2] - merc_bounds[0]) / self.coord_size + merc_bounds[0]
        y = point[1] * (merc_bounds[3] - merc_bounds[1]) / self.coord_size + merc_bounds[1]
        y = 360.0 * math.atan(math.tanh(y * math.pi / 360.0)) / math.pi
        return (x, y)

    def read_coord(self, packed=True):
        """Reads a pair of coords in degrees mercator, returns (lon, lat)."""
        point = self.read_point(self.base_point, packed)
        return self.to_4326(point)

    def read_bounds(self):
        """Reads mercator bounds, returns (min_lon, min_lat, max_lon, max_lat)."""
        rmin = self.mwm_bitwise_split(self.read_varint())
        rmax = self.mwm_bitwise_split(self.read_varint())
        pmin = self.to_4326(rmin)
        pmax = self.to_4326(rmax)
        return (pmin[0], pmin[1], pmax[0], pmax[1])

    def read_string(self, plain=False, decode=True):
        """Read a length-prefixed string.

        The stored length is one less than the actual one unless *plain*
        is true. Returns raw bytes when *decode* is false, text decoded
        from UTF-8 otherwise.
        """
        length = self.read_varuint() + (0 if plain else 1)
        s = self.f.read(length)
        return s.decode('utf-8') if decode else s

    def read_uint_array(self):
        """Read a varuint count followed by that many varuints."""
        length = self.read_varuint()
        return [self.read_varuint() for _ in range(length)]

    def read_numeric_string(self):
        """Read a string that may be stored as a number.

        When the lowest bit of the leading varuint is set, the remaining
        bits are the number itself; otherwise they are the length (minus
        one) of a UTF-8 string that follows.
        """
        sz = self.read_varuint()
        if sz & 1 != 0:
            return str(sz >> 1)
        sz = (sz >> 1) + 1
        return self.f.read(sz).decode('utf-8')

    def read_multilang(self):
        """Read a multilingual string, returning a dict of language -> text.

        The string is a sequence of chunks. Each one starts with a byte of
        the form 10xxxxxx, where the lower six bits index ``languages``,
        followed by UTF-8 text. Chunks with unknown languages are skipped.
        """
        def find_multilang_next(s, i):
            # Returns the index of the next language byte after position i,
            # or an index at or past len(s) when there is none. Multi-byte
            # UTF-8 sequences are jumped over as a whole, since their
            # continuation bytes look exactly like language bytes.
            i += 1
            while i < len(s):
                c = s[i]
                if (c & 0xC0) == 0x80:
                    break
                if (c & 0x80) == 0:
                    pass
                elif (c & 0xFE) == 0xFE:
                    i += 6
                elif (c & 0xFC) == 0xFC:
                    i += 5
                elif (c & 0xF8) == 0xF8:
                    i += 4
                elif (c & 0xF0) == 0xF0:
                    i += 3
                elif (c & 0xE0) == 0xE0:
                    i += 2
                elif (c & 0xC0) == 0xC0:
                    i += 1
                i += 1
            return i

        # A bytearray yields integers when indexed on both Python 2 and 3,
        # which removes the need for ord() with an exception fallback.
        s = bytearray(self.read_string(decode=False))
        langs = {}
        i = 0
        while i < len(s):
            n = find_multilang_next(s, i)
            lng = s[i] & 0x3F
            if lng < len(self.languages):
                langs[self.languages[lng]] = s[i + 1:n].decode('utf-8')
            i = n
        return langs

123
mwm/mwmtool.py Executable file
View file

@ -0,0 +1,123 @@
#!/usr/bin/env python
import sys
import os.path
import random
import json
import argparse
from mwm import MWM, Osm2Ft
def dump_mwm(args):
    """Print an overview of an mwm file.

    Shows the section table, format and version, header, region info,
    metadata count, cross-mwm routing summary, the number of features and
    up to five randomly chosen features as JSON.

    *args* is the parsed command line: ``args.mwm`` is the open binary file
    and ``args.types`` is a path to types.txt, which is used if it exists.
    """
    mwm = MWM(args.mwm)
    if os.path.exists(args.types):
        mwm.read_types(args.types)
    print('Tags:')
    tvv = sorted([(k, v[0], v[1]) for k, v in mwm.tags.items()], key=lambda x: x[1])
    for tv in tvv:
        print('  {0:<8}: offs {1:9} len {2:8}'.format(tv[0], tv[1], tv[2]))
    v = mwm.read_version()
    print('Format: {0}, version: {1}'.format(v['fmt'], v['date'].strftime('%Y-%m-%d %H:%M')))
    print('Header: {0}'.format(mwm.read_header()))
    print('Region Info: {0}'.format(mwm.read_region_info()))
    print('Metadata count: {0}'.format(len(mwm.read_metadata())))

    cross = mwm.read_crossmwm()
    if cross:
        print('Outgoing points: {0}, incoming: {1}'.format(len(cross['out']), len(cross['in'])))
        print('Outgoing regions: {0}'.format(set(cross['neighbours'])))

    # Print some random features using reservoir sampling
    count = 5
    sample = []
    # Counted separately from the loop index: the index is one less than the
    # number of features, and is never assigned when there are none.
    total = 0
    for i, feature in enumerate(mwm.iter_features()):
        total = i + 1
        if i < count:
            sample.append(feature)
        elif random.randint(0, i) < count:
            sample[random.randint(0, count - 1)] = feature

    print('Feature count: {0}'.format(total))
    print('Sample features:')
    for feature in sample:
        print(json.dumps(feature, ensure_ascii=False))
def find_feature(args):
    """Print every feature of an mwm file that passes all given filters.

    Filters are taken from the parsed command line *args*: ``fid`` (exact
    feature id), ``name`` / ``iname`` (case-sensitive / case-insensitive
    substring of any name), ``type`` (substring of a type), ``exact_type``
    (whole type) and ``meta`` (metadata key that must be present). Filters
    that are not set are ignored. Matches are printed as JSON, one per line.
    """
    mwm = MWM(args.mwm)
    mwm.read_header()
    if os.path.exists(args.types):
        mwm.read_types(args.types)
    # Lower-case the pattern once, without modifying the caller's namespace.
    iname = args.iname.lower() if args.iname else args.iname

    for i, feature in enumerate(mwm.iter_features(metadata=True)):
        # Compare with None explicitly: 0 is a valid feature id, but a plain
        # truth test would treat "-id 0" as if no id was given.
        if args.fid is not None and i != args.fid:
            continue

        if args.name or iname:
            if 'name' not in feature['header']:
                continue
            names = feature['header']['name'].values()
            if not any((args.name and args.name in value) or
                       (iname and iname in value.lower())
                       for value in names):
                continue

        if args.type or args.exact_type:
            types = feature['header']['types']
            if not any(t == args.type or t == args.exact_type or
                       (args.type and args.type in t)
                       for t in types):
                continue

        if args.meta and ('metadata' not in feature or args.meta not in feature['metadata']):
            continue

        print(json.dumps(feature, ensure_ascii=False, sort_keys=True))
def ft2osm(args):
    """Print an OpenStreetMap link for each feature id in ``args.ftid``.

    Ids are looked up in the ``args.osm2ft`` file. Returns 0 when every id
    was found and 2 when at least one was missing.
    """
    type_abbr = {'n': 'node', 'w': 'way', 'r': 'relation'}
    mapping = Osm2Ft(args.osm2ft, True)
    exit_code = 0
    for feature_id in args.ftid:
        if feature_id not in mapping:
            print('Could not find osm id for feature {}'.format(feature_id))
            exit_code = 2
            continue
        osm_type, osm_number = mapping[feature_id][0], mapping[feature_id][1]
        print('https://www.openstreetmap.org/{}/{}'.format(type_abbr[osm_type], osm_number))
    return exit_code
def main():
    """Parse the command line and run the chosen subcommand.

    Subcommands are ``dump``, ``find`` and ``osm``. When the handler
    returns a value other than None, it becomes the process exit code.
    """
    parser = argparse.ArgumentParser(description='Toolbox for MWM files.')
    parser.add_argument(
        '--types',
        default=os.path.join(os.path.dirname(sys.argv[0]), '..', '..', '..', '..', 'data', 'types.txt'),
        help='path to types.txt')
    subparsers = parser.add_subparsers(dest='cmd')
    # Without this, Python 3 accepts a command line with no subcommand and
    # fails later for the lack of args.func.
    subparsers.required = True

    parser_dump = subparsers.add_parser('dump', help='Dumps some structures.')
    parser_dump.add_argument('mwm', type=argparse.FileType('rb'), help='file to browse')
    parser_dump.set_defaults(func=dump_mwm)

    parser_find = subparsers.add_parser('find', help='Finds features in a file.')
    parser_find.add_argument('mwm', type=argparse.FileType('rb'), help='file to search')
    parser_find.add_argument('-t', dest='type',
                             help='look inside types ("-t hwtag" will find all hwtags-*)')
    parser_find.add_argument('-et', dest='exact_type',
                             help='look for a type ("-et shop" won\'t find shop-chemist)')
    parser_find.add_argument('-n', dest='name',
                             help='look inside names, case-sensitive ("-n Starbucks" for all starbucks)')
    parser_find.add_argument('-in', dest='iname',
                             help='look inside names, case-insensitive ("-in star" will find Starbucks)')
    parser_find.add_argument('-m', dest='meta',
                             help='look for a metadata key ("-m flats" for features with flats)')
    parser_find.add_argument('-id', dest='fid', type=int,
                             help='look for a feature id ("-id 1234" for feature #1234)')
    parser_find.set_defaults(func=find_feature)

    parser_osm = subparsers.add_parser('osm', help='Displays an OpenStreetMap link for a feature id.')
    parser_osm.add_argument('osm2ft', type=argparse.FileType('rb'), help='.mwm.osm2ft file')
    parser_osm.add_argument('ftid', type=int, nargs='+', help='feature id')
    parser_osm.set_defaults(func=ft2osm)

    args = parser.parse_args()
    code = args.func(args)
    if code is not None:
        sys.exit(code)


if __name__ == '__main__':
    main()

32
setup.py Normal file
View file

@ -0,0 +1,32 @@
from setuptools import setup
from os import path
from mwm import __version__
here = path.abspath(path.dirname(__file__))

# Read the long description inside a context manager, so the file handle is
# closed right away instead of being left open until garbage collection.
with open(path.join(here, 'README.rst')) as readme:
    long_description = readme.read()

setup(
    name='mwm',
    version=__version__,
    author='Ilya Zverev',
    author_email='ilya@zverev.info',
    packages=['mwm'],
    url='https://github.com/mapsme/mwm.py',
    license='Apache License 2.0',
    description='Library to read binary MAPS.ME files.',
    long_description=long_description,
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Utilities',
        'Environment :: Console',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
    ],
    entry_points={
        'console_scripts': ['mwmtool = mwm.mwmtool:main']
    },
)