From e6152683c48681bc02048f17e6c225056d5626d6 Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Tue, 26 Jan 2016 17:18:09 +0300 Subject: [PATCH 1/9] Started work on mwm.py --- tools/python/mwm/dump_mwm.py | 24 +++ tools/python/mwm/mwm.py | 382 +++++++++++++++++++++++++++++++++++ 2 files changed, 406 insertions(+) create mode 100755 tools/python/mwm/dump_mwm.py create mode 100644 tools/python/mwm/mwm.py diff --git a/tools/python/mwm/dump_mwm.py b/tools/python/mwm/dump_mwm.py new file mode 100755 index 0000000000..bc86ba53cd --- /dev/null +++ b/tools/python/mwm/dump_mwm.py @@ -0,0 +1,24 @@ +#!/usr/bin/python +import sys, os.path +import itertools +from mwm import MWM + +if len(sys.argv) < 2: + print 'Dumps some MWM structures.' + print 'Usage: {0} '.format(sys.argv[0]) + sys.exit(1) + +mwm = MWM(open(sys.argv[1], 'rb')) +mwm.read_types(os.path.join(os.path.dirname(sys.argv[0]), '..', '..', '..', 'data', 'types.txt')) +print 'Tags:' +for tag, value in mwm.tags.iteritems(): + print ' {0:<8}: offs {1:9} len {2:8}'.format(tag, value[0], value[1]) +print 'Version:', mwm.read_version() +print 'Header:', mwm.read_header() +print 'Metadata count:', len(mwm.read_metadata()) +cross = mwm.read_crossmwm() +if cross: + print 'Outgoing points:', len(cross['out']), 'incoming:', len(cross['in']) + print 'Outgoing regions:', set(cross['neighbours']) +for feature in itertools.islice(mwm.iter_features(), 10): + print feature diff --git a/tools/python/mwm/mwm.py b/tools/python/mwm/mwm.py new file mode 100644 index 0000000000..1ad6851331 --- /dev/null +++ b/tools/python/mwm/mwm.py @@ -0,0 +1,382 @@ +# MWM Reader Module +import struct +import math + +# Unprocessed sections: geomN, trgN, idx, sdx (search index), addr (search address), offs (feature offsets), dat (!) +# Routing sections: mercedes (matrix), daewoo (edge data), infinity (edge id), skoda (shortcuts), chrysler (cross context), ftseg, node2ftseg +# (these mostly are succinct structures, except chrysler and node2ftseg, so no use trying to load them here) + +class MWM: + languages = ["default", + "en", "ja", "fr", "ko_rm", "ar", "de", "int_name", "ru", "sv", "zh", "fi", "be", "ka", "ko", + "he", "nl", "ga", "ja_rm", "el", "it", "es", "zh_pinyin", "th", "cy", "sr", "uk", "ca", "hu", + "hsb", "eu", "fa", "br", "pl", "hy", "kn", "sl", "ro", "sq", "am", "fy", "cs", "gd", "sk", + "af", "ja_kana", "lb", "pt", "hr", "fur", "vi", "tr", "bg", "eo", "lt", "la", "kk", "gsw", + "et", "ku", "mn", "mk", "lv", "hi"] + + metadata = ["0", + "cuisine", "open_hours", "phone_number", "fax_number", "stars", + "operator", "url", "website", "internet", "ele", + "turn_lanes", "turn_lanes_forward", "turn_lanes_backward", "email", "postcode", + "wikipedia", "maxspeed", "flats", "height", "min_height", + "denomination", "building_levels" + ] + + def __init__(self, f): + self.f = f + self.coord_size = None + self.base_point = (0, 0) + self.read_info() + self.type_mapping = [] + + def read_types(self, filename): + with open(filename, 'r') as ft: + for line in ft: + if len(line.strip()) > 0: + self.type_mapping.append(line.strip()) + + def read_info(self): + self.f.seek(0) + self.f.seek(self.read_uint(8)) + cnt = self.read_varuint() + self.tags = {} + for i in range(cnt): + name = self.read_string(True) + offset = self.read_varuint() + length = self.read_varuint() + self.tags[name] = (offset, length) + + def has_tag(self, tag): + return tag in self.tags and self.tags[tag][1] > 0 + + def seek_tag(self, tag): + self.f.seek(self.tags[tag][0]) + + def inside_tag(self, tag): + pos = self.tag_position(tag) + return pos >= 0 and pos < self.tags[tag][1] + + def tag_position(self, tag): + return self.f.tell() - self.tags[tag][0] + + def read_version(self): + """Reads 'version' section.""" + self.seek_tag('version') + self.f.read(4) # skip prolog + fmt = self.read_varuint() + 1 + version = self.read_varuint() + return { 'fmt': fmt, 'version': version } + + def read_header(self): + """Reads 'header' section.""" + if not self.has_tag('header'): + # Stub for routing files + self.coord_size = (1 << 30) - 1 + return {} + self.seek_tag('header') + result = {} + coord_bits = self.read_varuint() + self.coord_size = (1 << coord_bits) - 1 + self.base_point = self.read_coord(convert=False) + result['basePoint'] = self.to_4326(self.base_point) + result['bounds'] = self.read_bounds() + result['scales'] = self.read_uint_array() + langs = self.read_uint_array() + for i in range(len(langs)): + if i < len(self.languages): + langs[i] = self.languages[langs[i]] + result['langs'] = langs + map_type = self.read_varint() + if map_type == 0: + result['mapType'] = 'world' + elif map_type == 1: + result['mapType'] = 'worldcoasts' + elif map_type == 2: + result['mapType'] = 'country' + else: + result['mapType'] = 'unknown: {0}'.format(map_type) + return result + + # COMPLEX READERS + + def read_metadata(self): + """Reads 'meta' and 'metaidx' sections.""" + if not self.has_tag('metaidx'): + return {} + # Metadata format is different since v8 + fmt = self.read_version()['fmt'] + # First, read metaidx, to match featureId <-> metadata + self.seek_tag('metaidx') + ftid_meta = [] + while self.inside_tag('metaidx'): + ftid = self.read_uint(4) + moffs = self.read_uint(4) + ftid_meta.append((moffs, ftid)) + # Sort ftid_meta array + ftid_meta.sort(key=lambda x: x[0]) + ftpos = 0 + # Now read metadata + self.seek_tag('meta') + metadatar = {} + while self.inside_tag('meta'): + tag_pos = self.tag_position('meta') + fields = {} + if fmt >= 8: + sz = self.read_varuint() + if sz: + for i in range(sz): + t = self.read_varuint() + t = self.metadata[t] if t < len(self.metadata) else str(t) + fields[t] = self.read_string() + else: + while True: + t = self.read_uint(1) + is_last = t & 0x80 > 0 + t = t & 0x7f + t = self.metadata[t] if t < len(self.metadata) else str(t) + l = self.read_uint(1) + fields[t] = self.f.read(l) + if is_last: + break + + if len(fields): + while ftpos < len(ftid_meta) and ftid_meta[ftpos][0] < tag_pos: + ftpos += 1 + if ftpos < len(ftid_meta): + if ftid_meta[ftpos][0] == tag_pos: + metadatar[ftid_meta[ftpos][1]] = fields + return metadatar + + def read_crossmwm(self): + """Reads 'chrysler' section (cross-mwm routing table).""" + if not self.has_tag('chrysler'): + return {} + self.seek_tag('chrysler') + # Ingoing nodes: array of (nodeId, coord) tuples + incomingCount = self.read_uint(4) + incoming = [] + for i in range(incomingCount): + nodeId = self.read_uint(4) + point = self.read_coord(False) + incoming.append((nodeId, point)) + # Outgoing nodes: array of (nodeId, coord, outIndex) tuples + # outIndex is an index in neighbours array + outgoingCount = self.read_uint(4) + outgoing = [] + for i in range(outgoingCount): + nodeId = self.read_uint(4) + point = self.read_coord(False) + outIndex = self.read_uint(1) + outgoing.append((nodeId, point, outIndex)) + # Adjacency matrix: costs of routes for each (incoming, outgoing) tuple + matrix = [] + for i in range(incomingCount): + sub = [] + for j in range(outgoingCount): + sub.append(self.read_uint(4)) + matrix.append(sub) + # List of mwms to which leads each outgoing node + neighboursCount = self.read_uint(4) + neighbours = [] + for i in range(neighboursCount): + size = self.read_uint(4) + neighbours.append(self.f.read(size)) + return { 'in': incoming, 'out': outgoing, 'matrix': matrix, 'neighbours': neighbours } + + class GeomType: + POINT = 0 + LINE = 1 << 5 + AREA = 1 << 6 + POINT_EX = 3 << 5 + + class OsmIdCode: + NODE = 0x4000000000000000; + WAY = 0x8000000000000000; + RELATION = 0xC000000000000000; + RESET = ~(NODE | WAY | RELATION); + + def iter_features(self): + """Reads 'dat' section.""" + if not self.has_tag('dat'): + return + # TODO: read 'offs'? + self.seek_tag('dat') + while self.inside_tag('dat'): + feature = {} + feature_size = self.read_varuint() + next_feature = self.f.tell() + feature_size + feature['size'] = feature_size + + # Header + header = {} + header_bits = self.read_uint(1) + types_count = (header_bits & 0x07) + 1 + has_name = header_bits & 0x08 > 0 + has_layer = header_bits & 0x10 > 0 + has_addinfo = header_bits & 0x80 > 0 + geom_type = header_bits & 0x60 + types = [] + for i in range(types_count): + type_id = self.read_varuint() + if type_id < len(self.type_mapping): + types.append(self.type_mapping[type_id]) + else: + types.append(str(type_id)) + header['types'] = types + if has_name: + header['name'] = self.read_multilang() + if has_layer: + header['layer'] = self.read_uint(1) + if has_addinfo: + if geom_type == MWM.GeomType.POINT: + header['rank'] = self.read_uint(1) + elif geom_type == MWM.GeomType.LINE: + header['ref'] = self.read_string() + elif geom_type == MWM.GeomType.AREA or geom_type == MWM.GeomType.POINT_EX: + header['house'] = self.read_numeric_string() + feature['header'] = header + + # Geometry + geometry = {} + if geom_type == MWM.GeomType.POINT or geom_type == MWM.GeomType.POINT_EX: + geometry['type'] = 'Point' + elif geom_type == MWM.GeomType.LINE: + geometry['type'] = 'LineString' + elif geom_type == MWM.GeomType.AREA: + geometry['type'] = 'Polygon' + if geom_type == MWM.GeomType.POINT: + geometry['coordinates'] = list(self.read_coord()) + + # (flipping table emoticon) + feature['geometry'] = geometry + if False: + if geom_type != MWM.GeomType.POINT: + polygon_count = self.read_varuint() + polygons = [] + for i in range(polygon_count): + count = self.read_varuint() + buf = self.f.read(count) + # TODO: decode + geometry['coordinates'] = polygons + feature['coastCell'] = self.read_varint() + + # OSM IDs + count = self.read_varuint() + osmids = [] + for i in range(count): + encid = self.read_uint(8) + if encid & MWM.OsmIdCode.NODE == MWM.OsmIdCode.NODE: + typ = 'n' + elif encid & MWM.OsmIdCode.WAY == MWM.OsmIdCode.WAY: + typ = 'w' + elif encid & MWM.OsmIdCode.RELATION == MWM.OsmIdCode.RELATION: + typ = 'r' + else: + typ = '' + osmids.append('{0}{1}'.format(typ, encid & MWM.OsmIdCode.RESET)) + feature['osmIds'] = osmids + + if self.f.tell() > next_feature: + raise Exception('Feature parsing error, read too much') + yield feature + self.f.seek(next_feature) + + # BITWISE READERS + + def read_uint(self, bytelen=1): + if bytelen == 1: + fmt = 'B' + elif bytelen == 2: + fmt = 'H' + elif bytelen == 4: + fmt = 'I' + elif bytelen == 8: + fmt = 'Q' + else: + raise Exception('Bytelen {0} is not supported'.format(bytelen)) + res = struct.unpack(fmt, self.f.read(bytelen)) + return res[0] + + def read_varuint(self): + res = 0 + shift = 0 + more = True + while more: + b = self.f.read(1) + if not b: + return res + res |= (ord(b[0]) & 0x7F) << shift + shift += 7 + more = ord(b[0]) >= 0x80 + return res + + def read_varint(self): + uint = self.read_varuint() + res = uint >> 1 + return res if uint & 1 == 0 else -res + + def mwm_unshuffle(self, x): + x = ((x & 0x22222222) << 1) | ((x >> 1) & 0x22222222) | (x & 0x99999999) + x = ((x & 0x0C0C0C0C) << 2) | ((x >> 2) & 0x0C0C0C0C) | (x & 0xC3C3C3C3) + x = ((x & 0x00F000F0) << 4) | ((x >> 4) & 0x00F000F0) | (x & 0xF00FF00F) + x = ((x & 0x0000FF00) << 8) | ((x >> 8) & 0x0000FF00) | (x & 0xFF0000FF) + return x + + def mwm_bitwise_split(self, v): + hi = self.mwm_unshuffle(v >> 32) + lo = self.mwm_unshuffle(v & 0xFFFFFFFF) + x = ((hi & 0xFFFF) << 16) | (lo & 0xFFFF); + y = (hi & 0xFFFF0000) | (lo >> 16); + return (x, y) + + def read_point(self, packed=True): + """Reads an unsigned point, returns (x, y).""" + if packed: + u = self.read_varuint() + else: + u = self.read_uint(8) + return self.mwm_bitwise_split(u) + + def to_4326(self, point): + if self.coord_size is None: + raise Exception('Call read_header() first.') + merc_bounds = (-180, -180, 180, 180) # Xmin, Ymin, Xmax, Ymax + x = point[0] * (merc_bounds[2] - merc_bounds[0]) / self.coord_size + merc_bounds[0] + y = point[1] * (merc_bounds[3] - merc_bounds[1]) / self.coord_size + merc_bounds[1] + y = 360.0 * math.atan(math.tanh(y * math.pi / 360.0)) / math.pi + return (x, y) + + def read_coord(self, packed=True, convert=True): + """Reads a pair of coords in degrees mercator, returns (lon, lat).""" + upoint = self.read_point(packed) + point = (upoint[0] + self.base_point[0], upoint[1] + self.base_point[1]) + return self.to_4326(point) if convert else point + + def read_bounds(self): + """Reads mercator bounds, returns (min_lon, min_lat, max_lon, max_lat).""" + rmin = self.read_coord() + rmax = self.read_coord() + return (rmin[0], rmin[1], rmax[0], rmax[1]) + + def read_string(self, plain=False): + length = self.read_varuint() + (0 if plain else 1) + return self.f.read(length) + + def read_uint_array(self): + length = self.read_varuint() + result = [] + for i in range(length): + result.append(self.read_varuint()) + return result + + def read_numeric_string(self): + sz = self.read_varuint() + if sz & 1 != 0: + return str(sz >> 1) + sz = (sz >> 1) + 1 + return self.f.read(sz) + + def read_multilang(self): + s = self.read_string() + # TODO! + return s From a0109c229e77372300487c31cd51e92fc59403c7 Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Wed, 27 Jan 2016 15:41:31 +0300 Subject: [PATCH 2/9] More mwm magic --- tools/python/mwm/dump_mwm.py | 16 ++++++++++++---- tools/python/mwm/mwm.py | 9 ++++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/tools/python/mwm/dump_mwm.py b/tools/python/mwm/dump_mwm.py index bc86ba53cd..828303d6ad 100755 --- a/tools/python/mwm/dump_mwm.py +++ b/tools/python/mwm/dump_mwm.py @@ -1,6 +1,5 @@ #!/usr/bin/python -import sys, os.path -import itertools +import sys, os.path, random from mwm import MWM if len(sys.argv) < 2: @@ -16,9 +15,18 @@ for tag, value in mwm.tags.iteritems(): print 'Version:', mwm.read_version() print 'Header:', mwm.read_header() print 'Metadata count:', len(mwm.read_metadata()) + cross = mwm.read_crossmwm() if cross: print 'Outgoing points:', len(cross['out']), 'incoming:', len(cross['in']) print 'Outgoing regions:', set(cross['neighbours']) -for feature in itertools.islice(mwm.iter_features(), 10): - print feature + +print 'Sample features:' +count = 5 +probability = 1.0 / 1000 +for feature in mwm.iter_features(): + if random.random() < probability: + print feature + count -= 1 + if count <= 0: + break diff --git a/tools/python/mwm/mwm.py b/tools/python/mwm/mwm.py index 1ad6851331..af40135add 100644 --- a/tools/python/mwm/mwm.py +++ b/tools/python/mwm/mwm.py @@ -2,10 +2,17 @@ import struct import math -# Unprocessed sections: geomN, trgN, idx, sdx (search index), addr (search address), offs (feature offsets), dat (!) +# Unprocessed sections: geomN, trgN, idx, sdx (search index), addr (search address), offs (feature offsets - succinct) # Routing sections: mercedes (matrix), daewoo (edge data), infinity (edge id), skoda (shortcuts), chrysler (cross context), ftseg, node2ftseg # (these mostly are succinct structures, except chrysler and node2ftseg, so no use trying to load them here) +# TODO: +# - Fix bounds reading in the header +# - Fix delta point encoding (coords are plausible, but incorrect) +# - Find why polygon geometry is incorrect in iter_features() +# - Multilang string reading +# - Find feature ids in the 'dat' section, or find a way to read the 'offs' section + class MWM: languages = ["default", "en", "ja", "fr", "ko_rm", "ar", "de", "int_name", "ru", "sv", "zh", "fi", "be", "ka", "ko", From 95c95d9befe029be7e861b98156120bf329bb531 Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Fri, 25 Mar 2016 14:01:31 +0300 Subject: [PATCH 3/9] Sorting and correct timestamp in mwm_dump --- tools/python/mwm/dump_mwm.py | 5 +++-- tools/python/mwm/mwm.py | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/python/mwm/dump_mwm.py b/tools/python/mwm/dump_mwm.py index 828303d6ad..7493f1c423 100755 --- a/tools/python/mwm/dump_mwm.py +++ b/tools/python/mwm/dump_mwm.py @@ -10,8 +10,9 @@ if len(sys.argv) < 2: mwm = MWM(open(sys.argv[1], 'rb')) mwm.read_types(os.path.join(os.path.dirname(sys.argv[0]), '..', '..', '..', 'data', 'types.txt')) print 'Tags:' -for tag, value in mwm.tags.iteritems(): - print ' {0:<8}: offs {1:9} len {2:8}'.format(tag, value[0], value[1]) +tvv = sorted([(k, v[0], v[1]) for k, v in mwm.tags.iteritems()], key=lambda x: x[1]) +for tv in tvv: + print ' {0:<8}: offs {1:9} len {2:8}'.format(tv[0], tv[1], tv[2]) print 'Version:', mwm.read_version() print 'Header:', mwm.read_header() print 'Metadata count:', len(mwm.read_metadata()) diff --git a/tools/python/mwm/mwm.py b/tools/python/mwm/mwm.py index af40135add..27df5817a7 100644 --- a/tools/python/mwm/mwm.py +++ b/tools/python/mwm/mwm.py @@ -1,6 +1,7 @@ # MWM Reader Module import struct import math +from datetime import datetime # Unprocessed sections: geomN, trgN, idx, sdx (search index), addr (search address), offs (feature offsets - succinct) # Routing sections: mercedes (matrix), daewoo (edge data), infinity (edge id), skoda (shortcuts), chrysler (cross context), ftseg, node2ftseg @@ -72,6 +73,10 @@ class MWM: self.f.read(4) # skip prolog fmt = self.read_varuint() + 1 version = self.read_varuint() + if version < 161231: + version = datetime(2000 + int(version / 10000), int(version / 100) % 100, version % 100) + else: + version = datetime.fromtimestamp(version) return { 'fmt': fmt, 'version': version } def read_header(self): From 543ef4f0931cf9931fb173be20996873bbc91724 Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Sat, 4 Jun 2016 11:53:13 +0300 Subject: [PATCH 4/9] [mwm.py] PEP8 --- tools/python/mwm/mwm.py | 697 ++++++++++++++++++++-------------------- 1 file changed, 348 insertions(+), 349 deletions(-) diff --git a/tools/python/mwm/mwm.py b/tools/python/mwm/mwm.py index 27df5817a7..739872abb3 100644 --- a/tools/python/mwm/mwm.py +++ b/tools/python/mwm/mwm.py @@ -15,380 +15,379 @@ from datetime import datetime # - Find feature ids in the 'dat' section, or find a way to read the 'offs' section class MWM: - languages = ["default", - "en", "ja", "fr", "ko_rm", "ar", "de", "int_name", "ru", "sv", "zh", "fi", "be", "ka", "ko", - "he", "nl", "ga", "ja_rm", "el", "it", "es", "zh_pinyin", "th", "cy", "sr", "uk", "ca", "hu", - "hsb", "eu", "fa", "br", "pl", "hy", "kn", "sl", "ro", "sq", "am", "fy", "cs", "gd", "sk", - "af", "ja_kana", "lb", "pt", "hr", "fur", "vi", "tr", "bg", "eo", "lt", "la", "kk", "gsw", - "et", "ku", "mn", "mk", "lv", "hi"] + languages = ["default", + "en", "ja", "fr", "ko_rm", "ar", "de", "int_name", "ru", "sv", "zh", "fi", "be", "ka", "ko", + "he", "nl", "ga", "ja_rm", "el", "it", "es", "zh_pinyin", "th", "cy", "sr", "uk", "ca", "hu", + "hsb", "eu", "fa", "br", "pl", "hy", "kn", "sl", "ro", "sq", "am", "fy", "cs", "gd", "sk", + "af", "ja_kana", "lb", "pt", "hr", "fur", "vi", "tr", "bg", "eo", "lt", "la", "kk", "gsw", + "et", "ku", "mn", "mk", "lv", "hi"] - metadata = ["0", - "cuisine", "open_hours", "phone_number", "fax_number", "stars", - "operator", "url", "website", "internet", "ele", - "turn_lanes", "turn_lanes_forward", "turn_lanes_backward", "email", "postcode", - "wikipedia", "maxspeed", "flats", "height", "min_height", - "denomination", "building_levels" - ] + metadata = ["0", + "cuisine", "open_hours", "phone_number", "fax_number", "stars", + "operator", "url", "website", "internet", "ele", + "turn_lanes", "turn_lanes_forward", "turn_lanes_backward", "email", "postcode", + "wikipedia", "maxspeed", "flats", "height", "min_height", + "denomination", "building_levels"] - def __init__(self, f): - self.f = f - self.coord_size = None - self.base_point = (0, 0) - self.read_info() - self.type_mapping = [] + def __init__(self, f): + self.f = f + self.coord_size = None + self.base_point = (0, 0) + self.read_info() + self.type_mapping = [] - def read_types(self, filename): - with open(filename, 'r') as ft: - for line in ft: - if len(line.strip()) > 0: - self.type_mapping.append(line.strip()) + def read_types(self, filename): + with open(filename, 'r') as ft: + for line in ft: + if len(line.strip()) > 0: + self.type_mapping.append(line.strip()) - def read_info(self): - self.f.seek(0) - self.f.seek(self.read_uint(8)) - cnt = self.read_varuint() - self.tags = {} - for i in range(cnt): - name = self.read_string(True) - offset = self.read_varuint() - length = self.read_varuint() - self.tags[name] = (offset, length) + def read_info(self): + self.f.seek(0) + self.f.seek(self.read_uint(8)) + cnt = self.read_varuint() + self.tags = {} + for i in range(cnt): + name = self.read_string(True) + offset = self.read_varuint() + length = self.read_varuint() + self.tags[name] = (offset, length) - def has_tag(self, tag): - return tag in self.tags and self.tags[tag][1] > 0 + def has_tag(self, tag): + return tag in self.tags and self.tags[tag][1] > 0 - def seek_tag(self, tag): - self.f.seek(self.tags[tag][0]) + def seek_tag(self, tag): + self.f.seek(self.tags[tag][0]) - def inside_tag(self, tag): - pos = self.tag_position(tag) - return pos >= 0 and pos < self.tags[tag][1] + def inside_tag(self, tag): + pos = self.tag_position(tag) + return pos >= 0 and pos < self.tags[tag][1] - def tag_position(self, tag): - return self.f.tell() - self.tags[tag][0] + def tag_position(self, tag): + return self.f.tell() - self.tags[tag][0] - def read_version(self): - """Reads 'version' section.""" - self.seek_tag('version') - self.f.read(4) # skip prolog - fmt = self.read_varuint() + 1 - version = self.read_varuint() - if version < 161231: - version = datetime(2000 + int(version / 10000), int(version / 100) % 100, version % 100) - else: - version = datetime.fromtimestamp(version) - return { 'fmt': fmt, 'version': version } - - def read_header(self): - """Reads 'header' section.""" - if not self.has_tag('header'): - # Stub for routing files - self.coord_size = (1 << 30) - 1 - return {} - self.seek_tag('header') - result = {} - coord_bits = self.read_varuint() - self.coord_size = (1 << coord_bits) - 1 - self.base_point = self.read_coord(convert=False) - result['basePoint'] = self.to_4326(self.base_point) - result['bounds'] = self.read_bounds() - result['scales'] = self.read_uint_array() - langs = self.read_uint_array() - for i in range(len(langs)): - if i < len(self.languages): - langs[i] = self.languages[langs[i]] - result['langs'] = langs - map_type = self.read_varint() - if map_type == 0: - result['mapType'] = 'world' - elif map_type == 1: - result['mapType'] = 'worldcoasts' - elif map_type == 2: - result['mapType'] = 'country' - else: - result['mapType'] = 'unknown: {0}'.format(map_type) - return result - - # COMPLEX READERS - - def read_metadata(self): - """Reads 'meta' and 'metaidx' sections.""" - if not self.has_tag('metaidx'): - return {} - # Metadata format is different since v8 - fmt = self.read_version()['fmt'] - # First, read metaidx, to match featureId <-> metadata - self.seek_tag('metaidx') - ftid_meta = [] - while self.inside_tag('metaidx'): - ftid = self.read_uint(4) - moffs = self.read_uint(4) - ftid_meta.append((moffs, ftid)) - # Sort ftid_meta array - ftid_meta.sort(key=lambda x: x[0]) - ftpos = 0 - # Now read metadata - self.seek_tag('meta') - metadatar = {} - while self.inside_tag('meta'): - tag_pos = self.tag_position('meta') - fields = {} - if fmt >= 8: - sz = self.read_varuint() - if sz: - for i in range(sz): - t = self.read_varuint() - t = self.metadata[t] if t < len(self.metadata) else str(t) - fields[t] = self.read_string() - else: - while True: - t = self.read_uint(1) - is_last = t & 0x80 > 0 - t = t & 0x7f - t = self.metadata[t] if t < len(self.metadata) else str(t) - l = self.read_uint(1) - fields[t] = self.f.read(l) - if is_last: - break - - if len(fields): - while ftpos < len(ftid_meta) and ftid_meta[ftpos][0] < tag_pos: - ftpos += 1 - if ftpos < len(ftid_meta): - if ftid_meta[ftpos][0] == tag_pos: - metadatar[ftid_meta[ftpos][1]] = fields - return metadatar - - def read_crossmwm(self): - """Reads 'chrysler' section (cross-mwm routing table).""" - if not self.has_tag('chrysler'): - return {} - self.seek_tag('chrysler') - # Ingoing nodes: array of (nodeId, coord) tuples - incomingCount = self.read_uint(4) - incoming = [] - for i in range(incomingCount): - nodeId = self.read_uint(4) - point = self.read_coord(False) - incoming.append((nodeId, point)) - # Outgoing nodes: array of (nodeId, coord, outIndex) tuples - # outIndex is an index in neighbours array - outgoingCount = self.read_uint(4) - outgoing = [] - for i in range(outgoingCount): - nodeId = self.read_uint(4) - point = self.read_coord(False) - outIndex = self.read_uint(1) - outgoing.append((nodeId, point, outIndex)) - # Adjacency matrix: costs of routes for each (incoming, outgoing) tuple - matrix = [] - for i in range(incomingCount): - sub = [] - for j in range(outgoingCount): - sub.append(self.read_uint(4)) - matrix.append(sub) - # List of mwms to which leads each outgoing node - neighboursCount = self.read_uint(4) - neighbours = [] - for i in range(neighboursCount): - size = self.read_uint(4) - neighbours.append(self.f.read(size)) - return { 'in': incoming, 'out': outgoing, 'matrix': matrix, 'neighbours': neighbours } - - class GeomType: - POINT = 0 - LINE = 1 << 5 - AREA = 1 << 6 - POINT_EX = 3 << 5 - - class OsmIdCode: - NODE = 0x4000000000000000; - WAY = 0x8000000000000000; - RELATION = 0xC000000000000000; - RESET = ~(NODE | WAY | RELATION); - - def iter_features(self): - """Reads 'dat' section.""" - if not self.has_tag('dat'): - return - # TODO: read 'offs'? - self.seek_tag('dat') - while self.inside_tag('dat'): - feature = {} - feature_size = self.read_varuint() - next_feature = self.f.tell() + feature_size - feature['size'] = feature_size - - # Header - header = {} - header_bits = self.read_uint(1) - types_count = (header_bits & 0x07) + 1 - has_name = header_bits & 0x08 > 0 - has_layer = header_bits & 0x10 > 0 - has_addinfo = header_bits & 0x80 > 0 - geom_type = header_bits & 0x60 - types = [] - for i in range(types_count): - type_id = self.read_varuint() - if type_id < len(self.type_mapping): - types.append(self.type_mapping[type_id]) + def read_version(self): + """Reads 'version' section.""" + self.seek_tag('version') + self.f.read(4) # skip prolog + fmt = self.read_varuint() + 1 + version = self.read_varuint() + if version < 161231: + version = datetime(2000 + int(version / 10000), int(version / 100) % 100, version % 100) else: - types.append(str(type_id)) - header['types'] = types - if has_name: - header['name'] = self.read_multilang() - if has_layer: - header['layer'] = self.read_uint(1) - if has_addinfo: - if geom_type == MWM.GeomType.POINT: - header['rank'] = self.read_uint(1) - elif geom_type == MWM.GeomType.LINE: - header['ref'] = self.read_string() - elif geom_type == MWM.GeomType.AREA or geom_type == MWM.GeomType.POINT_EX: - header['house'] = self.read_numeric_string() - feature['header'] = header + version = datetime.fromtimestamp(version) + return { 'fmt': fmt, 'version': version } - # Geometry - geometry = {} - if geom_type == MWM.GeomType.POINT or geom_type == MWM.GeomType.POINT_EX: - geometry['type'] = 'Point' - elif geom_type == MWM.GeomType.LINE: - geometry['type'] = 'LineString' - elif geom_type == MWM.GeomType.AREA: - geometry['type'] = 'Polygon' - if geom_type == MWM.GeomType.POINT: - geometry['coordinates'] = list(self.read_coord()) + def read_header(self): + """Reads 'header' section.""" + if not self.has_tag('header'): + # Stub for routing files + self.coord_size = (1 << 30) - 1 + return {} + self.seek_tag('header') + result = {} + coord_bits = self.read_varuint() + self.coord_size = (1 << coord_bits) - 1 + self.base_point = self.read_coord(convert=False) + result['basePoint'] = self.to_4326(self.base_point) + result['bounds'] = self.read_bounds() + result['scales'] = self.read_uint_array() + langs = self.read_uint_array() + for i in range(len(langs)): + if i < len(self.languages): + langs[i] = self.languages[langs[i]] + result['langs'] = langs + map_type = self.read_varint() + if map_type == 0: + result['mapType'] = 'world' + elif map_type == 1: + result['mapType'] = 'worldcoasts' + elif map_type == 2: + result['mapType'] = 'country' + else: + result['mapType'] = 'unknown: {0}'.format(map_type) + return result - # (flipping table emoticon) - feature['geometry'] = geometry - if False: - if geom_type != MWM.GeomType.POINT: - polygon_count = self.read_varuint() - polygons = [] - for i in range(polygon_count): - count = self.read_varuint() - buf = self.f.read(count) - # TODO: decode - geometry['coordinates'] = polygons - feature['coastCell'] = self.read_varint() - - # OSM IDs - count = self.read_varuint() - osmids = [] - for i in range(count): - encid = self.read_uint(8) - if encid & MWM.OsmIdCode.NODE == MWM.OsmIdCode.NODE: - typ = 'n' - elif encid & MWM.OsmIdCode.WAY == MWM.OsmIdCode.WAY: - typ = 'w' - elif encid & MWM.OsmIdCode.RELATION == MWM.OsmIdCode.RELATION: - typ = 'r' - else: - typ = '' - osmids.append('{0}{1}'.format(typ, encid & MWM.OsmIdCode.RESET)) - feature['osmIds'] = osmids + # COMPLEX READERS - if self.f.tell() > next_feature: - raise Exception('Feature parsing error, read too much') - yield feature - self.f.seek(next_feature) + def read_metadata(self): + """Reads 'meta' and 'metaidx' sections.""" + if not self.has_tag('metaidx'): + return {} + # Metadata format is different since v8 + fmt = self.read_version()['fmt'] + # First, read metaidx, to match featureId <-> metadata + self.seek_tag('metaidx') + ftid_meta = [] + while self.inside_tag('metaidx'): + ftid = self.read_uint(4) + moffs = self.read_uint(4) + ftid_meta.append((moffs, ftid)) + # Sort ftid_meta array + ftid_meta.sort(key=lambda x: x[0]) + ftpos = 0 + # Now read metadata + self.seek_tag('meta') + metadatar = {} + while self.inside_tag('meta'): + tag_pos = self.tag_position('meta') + fields = {} + if fmt >= 8: + sz = self.read_varuint() + if sz: + for i in range(sz): + t = self.read_varuint() + t = self.metadata[t] if t < len(self.metadata) else str(t) + fields[t] = self.read_string() + else: + while True: + t = self.read_uint(1) + is_last = t & 0x80 > 0 + t = t & 0x7f + t = self.metadata[t] if t < len(self.metadata) else str(t) + l = self.read_uint(1) + fields[t] = self.f.read(l) + if is_last: + break - # BITWISE READERS + if len(fields): + while ftpos < len(ftid_meta) and ftid_meta[ftpos][0] < tag_pos: + ftpos += 1 + if ftpos < len(ftid_meta): + if ftid_meta[ftpos][0] == tag_pos: + metadatar[ftid_meta[ftpos][1]] = fields + return metadatar - def read_uint(self, bytelen=1): - if bytelen == 1: - fmt = 'B' - elif bytelen == 2: - fmt = 'H' - elif bytelen == 4: - fmt = 'I' - elif bytelen == 8: - fmt = 'Q' - else: - raise Exception('Bytelen {0} is not supported'.format(bytelen)) - res = struct.unpack(fmt, self.f.read(bytelen)) - return res[0] + def read_crossmwm(self): + """Reads 'chrysler' section (cross-mwm routing table).""" + if not self.has_tag('chrysler'): + return {} + self.seek_tag('chrysler') + # Ingoing nodes: array of (nodeId, coord) tuples + incomingCount = self.read_uint(4) + incoming = [] + for i in range(incomingCount): + nodeId = self.read_uint(4) + point = self.read_coord(False) + incoming.append((nodeId, point)) + # Outgoing nodes: array of (nodeId, coord, outIndex) tuples + # outIndex is an index in neighbours array + outgoingCount = self.read_uint(4) + outgoing = [] + for i in range(outgoingCount): + nodeId = self.read_uint(4) + point = self.read_coord(False) + outIndex = self.read_uint(1) + outgoing.append((nodeId, point, outIndex)) + # Adjacency matrix: costs of routes for each (incoming, outgoing) tuple + matrix = [] + for i in range(incomingCount): + sub = [] + for j in range(outgoingCount): + sub.append(self.read_uint(4)) + matrix.append(sub) + # List of mwms to which leads each outgoing node + neighboursCount = self.read_uint(4) + neighbours = [] + for i in range(neighboursCount): + size = self.read_uint(4) + neighbours.append(self.f.read(size)) + return { 'in': incoming, 'out': outgoing, 'matrix': matrix, 'neighbours': neighbours } - def read_varuint(self): - res = 0 - shift = 0 - more = True - while more: - b = self.f.read(1) - if not b: + class GeomType: + POINT = 0 + LINE = 1 << 5 + AREA = 1 << 6 + POINT_EX = 3 << 5 + + class OsmIdCode: + NODE = 0x4000000000000000 + WAY = 0x8000000000000000 + RELATION = 0xC000000000000000 + RESET = ~(NODE | WAY | RELATION) + + def iter_features(self): + """Reads 'dat' section.""" + if not self.has_tag('dat'): + return + # TODO: read 'offs'? + self.seek_tag('dat') + while self.inside_tag('dat'): + feature = {} + feature_size = self.read_varuint() + next_feature = self.f.tell() + feature_size + feature['size'] = feature_size + + # Header + header = {} + header_bits = self.read_uint(1) + types_count = (header_bits & 0x07) + 1 + has_name = header_bits & 0x08 > 0 + has_layer = header_bits & 0x10 > 0 + has_addinfo = header_bits & 0x80 > 0 + geom_type = header_bits & 0x60 + types = [] + for i in range(types_count): + type_id = self.read_varuint() + if type_id < len(self.type_mapping): + types.append(self.type_mapping[type_id]) + else: + types.append(str(type_id)) + header['types'] = types + if has_name: + header['name'] = self.read_multilang() + if has_layer: + header['layer'] = self.read_uint(1) + if has_addinfo: + if geom_type == MWM.GeomType.POINT: + header['rank'] = self.read_uint(1) + elif geom_type == MWM.GeomType.LINE: + header['ref'] = self.read_string() + elif geom_type == MWM.GeomType.AREA or geom_type == MWM.GeomType.POINT_EX: + header['house'] = self.read_numeric_string() + feature['header'] = header + + # Geometry + geometry = {} + if geom_type == MWM.GeomType.POINT or geom_type == MWM.GeomType.POINT_EX: + geometry['type'] = 'Point' + elif geom_type == MWM.GeomType.LINE: + geometry['type'] = 'LineString' + elif geom_type == MWM.GeomType.AREA: + geometry['type'] = 'Polygon' + if geom_type == MWM.GeomType.POINT: + geometry['coordinates'] = list(self.read_coord()) + + # (flipping table emoticon) + feature['geometry'] = geometry + if False: + if geom_type != MWM.GeomType.POINT: + polygon_count = self.read_varuint() + polygons = [] + for i in range(polygon_count): + count = self.read_varuint() + buf = self.f.read(count) + # TODO: decode + geometry['coordinates'] = polygons + feature['coastCell'] = self.read_varint() + + # OSM IDs + count = self.read_varuint() + osmids = [] + for i in range(count): + encid = self.read_uint(8) + if encid & MWM.OsmIdCode.NODE == MWM.OsmIdCode.NODE: + typ = 'n' + elif encid & MWM.OsmIdCode.WAY == MWM.OsmIdCode.WAY: + typ = 'w' + elif encid & MWM.OsmIdCode.RELATION == MWM.OsmIdCode.RELATION: + typ = 'r' + else: + typ = '' + osmids.append('{0}{1}'.format(typ, encid & MWM.OsmIdCode.RESET)) + feature['osmIds'] = osmids + + if self.f.tell() > next_feature: + raise Exception('Feature parsing error, read too much') + yield feature + self.f.seek(next_feature) + + # BITWISE READERS + + def read_uint(self, bytelen=1): + if bytelen == 1: + fmt = 'B' + elif bytelen == 2: + fmt = 'H' + elif bytelen == 4: + fmt = 'I' + elif bytelen == 8: + fmt = 'Q' + else: + raise Exception('Bytelen {0} is not supported'.format(bytelen)) + res = struct.unpack(fmt, self.f.read(bytelen)) + return res[0] + + def read_varuint(self): + res = 0 + shift = 0 + more = True + while more: + b = self.f.read(1) + if not b: + return res + res |= (ord(b[0]) & 0x7F) << shift + shift += 7 + more = ord(b[0]) >= 0x80 return res - res |= (ord(b[0]) & 0x7F) << shift - shift += 7 - more = ord(b[0]) >= 0x80 - return res - def read_varint(self): - uint = self.read_varuint() - res = uint >> 1 - return res if uint & 1 == 0 else -res + def read_varint(self): + uint = self.read_varuint() + res = uint >> 1 + return res if uint & 1 == 0 else -res - def mwm_unshuffle(self, x): - x = ((x & 0x22222222) << 1) | ((x >> 1) & 0x22222222) | (x & 0x99999999) - x = ((x & 0x0C0C0C0C) << 2) | ((x >> 2) & 0x0C0C0C0C) | (x & 0xC3C3C3C3) - x = ((x & 0x00F000F0) << 4) | ((x >> 4) & 0x00F000F0) | (x & 0xF00FF00F) - x = ((x & 0x0000FF00) << 8) | ((x >> 8) & 0x0000FF00) | (x & 0xFF0000FF) - return x + def mwm_unshuffle(self, x): + x = ((x & 0x22222222) << 1) | ((x >> 1) & 0x22222222) | (x & 0x99999999) + x = ((x & 0x0C0C0C0C) << 2) | ((x >> 2) & 0x0C0C0C0C) | (x & 0xC3C3C3C3) + x = ((x & 0x00F000F0) << 4) | ((x >> 4) & 0x00F000F0) | (x & 0xF00FF00F) + x = ((x & 0x0000FF00) << 8) | ((x >> 8) & 0x0000FF00) | (x & 0xFF0000FF) + return x - def mwm_bitwise_split(self, v): - hi = self.mwm_unshuffle(v >> 32) - lo = self.mwm_unshuffle(v & 0xFFFFFFFF) - x = ((hi & 0xFFFF) << 16) | (lo & 0xFFFF); - y = (hi & 0xFFFF0000) | (lo >> 16); - return (x, y) + def mwm_bitwise_split(self, v): + hi = self.mwm_unshuffle(v >> 32) + lo = self.mwm_unshuffle(v & 0xFFFFFFFF) + x = ((hi & 0xFFFF) << 16) | (lo & 0xFFFF) + y = (hi & 0xFFFF0000) | (lo >> 16) + return (x, y) - def read_point(self, packed=True): - """Reads an unsigned point, returns (x, y).""" - if packed: - u = self.read_varuint() - else: - u = self.read_uint(8) - return self.mwm_bitwise_split(u) + def read_point(self, packed=True): + """Reads an unsigned point, returns (x, y).""" + if packed: + u = self.read_varuint() + else: + u = self.read_uint(8) + return self.mwm_bitwise_split(u) - def to_4326(self, point): - if self.coord_size is None: - raise Exception('Call read_header() first.') - merc_bounds = (-180, -180, 180, 180) # Xmin, Ymin, Xmax, Ymax - x = point[0] * (merc_bounds[2] - merc_bounds[0]) / self.coord_size + merc_bounds[0] - y = point[1] * (merc_bounds[3] - merc_bounds[1]) / self.coord_size + merc_bounds[1] - y = 360.0 * math.atan(math.tanh(y * math.pi / 360.0)) / math.pi - return (x, y) + def to_4326(self, point): + if self.coord_size is None: + raise Exception('Call read_header() first.') + merc_bounds = (-180, -180, 180, 180) # Xmin, Ymin, Xmax, Ymax + x = point[0] * (merc_bounds[2] - merc_bounds[0]) / self.coord_size + merc_bounds[0] + y = point[1] * (merc_bounds[3] - merc_bounds[1]) / self.coord_size + merc_bounds[1] + y = 360.0 * math.atan(math.tanh(y * math.pi / 360.0)) / math.pi + return (x, y) - def read_coord(self, packed=True, convert=True): - """Reads a pair of coords in degrees mercator, returns (lon, lat).""" - upoint = self.read_point(packed) - point = (upoint[0] + self.base_point[0], upoint[1] + self.base_point[1]) - return self.to_4326(point) if convert else point + def read_coord(self, packed=True, convert=True): + """Reads a pair of coords in degrees mercator, returns (lon, lat).""" + upoint = self.read_point(packed) + point = (upoint[0] + self.base_point[0], upoint[1] + self.base_point[1]) + return self.to_4326(point) if convert else point - def read_bounds(self): - """Reads mercator bounds, returns (min_lon, min_lat, max_lon, max_lat).""" - rmin = self.read_coord() - rmax = self.read_coord() - return (rmin[0], rmin[1], rmax[0], rmax[1]) + def read_bounds(self): + """Reads mercator bounds, returns (min_lon, min_lat, max_lon, max_lat).""" + rmin = self.read_coord() + rmax = self.read_coord() + return (rmin[0], rmin[1], rmax[0], rmax[1]) - def read_string(self, plain=False): - length = self.read_varuint() + (0 if plain else 1) - return self.f.read(length) + def read_string(self, plain=False): + length = self.read_varuint() + (0 if plain else 1) + return self.f.read(length) - def read_uint_array(self): - length = self.read_varuint() - result = [] - for i in range(length): - result.append(self.read_varuint()) - return result + def read_uint_array(self): + length = self.read_varuint() + result = [] + for i in range(length): + result.append(self.read_varuint()) + return result - def read_numeric_string(self): - sz = self.read_varuint() - if sz & 1 != 0: - return str(sz >> 1) - sz = (sz >> 1) + 1 - return self.f.read(sz) + def read_numeric_string(self): + sz = self.read_varuint() + if sz & 1 != 0: + return str(sz >> 1) + sz = (sz >> 1) + 1 + return self.f.read(sz) - def read_multilang(self): - s = self.read_string() - # TODO! - return s + def read_multilang(self): + s = self.read_string() + # TODO! + return s From effb90042c6975009260361e120e7309f36866fa Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Sat, 4 Jun 2016 12:21:32 +0300 Subject: [PATCH 5/9] [mwm.py] Multilang strings --- tools/python/mwm/dump_mwm.py | 23 ++++++++++++----------- tools/python/mwm/mwm.py | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/tools/python/mwm/dump_mwm.py b/tools/python/mwm/dump_mwm.py index 7493f1c423..251c5cd03c 100755 --- a/tools/python/mwm/dump_mwm.py +++ b/tools/python/mwm/dump_mwm.py @@ -1,33 +1,34 @@ #!/usr/bin/python import sys, os.path, random +import json from mwm import MWM if len(sys.argv) < 2: - print 'Dumps some MWM structures.' - print 'Usage: {0} '.format(sys.argv[0]) - sys.exit(1) + print 'Dumps some MWM structures.' + print 'Usage: {0} '.format(sys.argv[0]) + sys.exit(1) mwm = MWM(open(sys.argv[1], 'rb')) mwm.read_types(os.path.join(os.path.dirname(sys.argv[0]), '..', '..', '..', 'data', 'types.txt')) print 'Tags:' tvv = sorted([(k, v[0], v[1]) for k, v in mwm.tags.iteritems()], key=lambda x: x[1]) for tv in tvv: - print ' {0:<8}: offs {1:9} len {2:8}'.format(tv[0], tv[1], tv[2]) + print ' {0:<8}: offs {1:9} len {2:8}'.format(tv[0], tv[1], tv[2]) print 'Version:', mwm.read_version() print 'Header:', mwm.read_header() print 'Metadata count:', len(mwm.read_metadata()) cross = mwm.read_crossmwm() if cross: - print 'Outgoing points:', len(cross['out']), 'incoming:', len(cross['in']) - print 'Outgoing regions:', set(cross['neighbours']) + print 'Outgoing points:', len(cross['out']), 'incoming:', len(cross['in']) + print 'Outgoing regions:', set(cross['neighbours']) print 'Sample features:' count = 5 probability = 1.0 / 1000 for feature in mwm.iter_features(): - if random.random() < probability: - print feature - count -= 1 - if count <= 0: - break + if random.random() < probability: + print json.dumps(feature, ensure_ascii=False) + count -= 1 + if count <= 0: + break diff --git a/tools/python/mwm/mwm.py b/tools/python/mwm/mwm.py index 739872abb3..568a2a6d01 100644 --- a/tools/python/mwm/mwm.py +++ b/tools/python/mwm/mwm.py @@ -11,7 +11,6 @@ from datetime import datetime # - Fix bounds reading in the header # - Fix delta point encoding (coords are plausible, but incorrect) # - Find why polygon geometry is incorrect in iter_features() -# - Multilang string reading # - Find feature ids in the 'dat' section, or find a way to read the 'offs' section class MWM: @@ -388,6 +387,36 @@ class MWM: return self.f.read(sz) def read_multilang(self): + def find_multilang_next(s, i): + i += 1 + while i < len(s): + c = struct.unpack('B', s[i])[0] + if c & 0xC0 == 0x80: + break + if c & 0x80 == 0: + pass + elif c & 0xFE == 0xFE: + i += 6 + elif c & 0xFC == 0xFC: + i += 5 + elif c & 0xF8 == 0xF8: + i += 4 + elif c & 0xF0 == 0xF0: + i += 3 + elif c & 0xE0 == 0xE0: + i += 2 + elif c & 0xC0 == 0xC0: + i += 1 + i += 1 + return i + s = self.read_string() - # TODO! - return s + langs = {} + i = 0 + while i < len(s): + n = find_multilang_next(s, i) + lng = struct.unpack('B', s[i])[0] & 0x3F + if lng < len(self.languages): + langs[self.languages[lng]] = s[i+1:n] + i = n + return langs From b86735478487d7f9e1784e8a25ee6d259bb6b576 Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Sat, 4 Jun 2016 12:59:01 +0300 Subject: [PATCH 6/9] [mwm.py] Fix reading of coordinates --- tools/python/mwm/mwm.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/tools/python/mwm/mwm.py b/tools/python/mwm/mwm.py index 568a2a6d01..abc3552382 100644 --- a/tools/python/mwm/mwm.py +++ b/tools/python/mwm/mwm.py @@ -8,8 +8,7 @@ from datetime import datetime # (these mostly are succinct structures, except chrysler and node2ftseg, so no use trying to load them here) # TODO: -# - Fix bounds reading in the header -# - Fix delta point encoding (coords are plausible, but incorrect) +# - Predictive reading of LineStrings # - Find why polygon geometry is incorrect in iter_features() # - Find feature ids in the 'dat' section, or find a way to read the 'offs' section @@ -87,7 +86,7 @@ class MWM: result = {} coord_bits = self.read_varuint() self.coord_size = (1 << coord_bits) - 1 - self.base_point = self.read_coord(convert=False) + self.base_point = self.mwm_bitwise_split(self.read_varuint()) result['basePoint'] = self.to_4326(self.base_point) result['bounds'] = self.read_bounds() result['scales'] = self.read_uint_array() @@ -320,11 +319,13 @@ class MWM: more = ord(b[0]) >= 0x80 return res - def read_varint(self): - uint = self.read_varuint() + def zigzag_decode(self, uint): res = uint >> 1 return res if uint & 1 == 0 else -res + def read_varint(self): + return self.zigzag_decode(self.read_varuint()) + def mwm_unshuffle(self, x): x = ((x & 0x22222222) << 1) | ((x >> 1) & 0x22222222) | (x & 0x99999999) x = ((x & 0x0C0C0C0C) << 2) | ((x >> 2) & 0x0C0C0C0C) | (x & 0xC3C3C3C3) @@ -339,34 +340,39 @@ class MWM: y = (hi & 0xFFFF0000) | (lo >> 16) return (x, y) - def read_point(self, packed=True): + def mwm_decode_delta(self, v, ref): + x, y = self.mwm_bitwise_split(v) + return ref[0] + self.zigzag_decode(x), ref[1] + self.zigzag_decode(y) + + def read_point(self, ref, packed=True): """Reads an unsigned point, returns (x, y).""" if packed: u = self.read_varuint() else: u = self.read_uint(8) - return self.mwm_bitwise_split(u) + return self.mwm_decode_delta(u, ref) def to_4326(self, point): if self.coord_size is None: raise Exception('Call read_header() first.') - merc_bounds = (-180, -180, 180, 180) # Xmin, Ymin, Xmax, Ymax + merc_bounds = (-180.0, -180.0, 180.0, 180.0) # Xmin, Ymin, Xmax, Ymax x = point[0] * (merc_bounds[2] - merc_bounds[0]) / self.coord_size + merc_bounds[0] y = point[1] * (merc_bounds[3] - merc_bounds[1]) / self.coord_size + merc_bounds[1] y = 360.0 * math.atan(math.tanh(y * math.pi / 360.0)) / math.pi return (x, y) - def read_coord(self, packed=True, convert=True): + def read_coord(self, packed=True): """Reads a pair of coords in degrees mercator, returns (lon, lat).""" - upoint = self.read_point(packed) - point = (upoint[0] + self.base_point[0], upoint[1] + self.base_point[1]) - return self.to_4326(point) if convert else point + point = self.read_point(self.base_point, packed) + return self.to_4326(point) def read_bounds(self): """Reads mercator bounds, returns (min_lon, min_lat, max_lon, max_lat).""" - rmin = self.read_coord() - rmax = self.read_coord() - return (rmin[0], rmin[1], rmax[0], rmax[1]) + rmin = self.mwm_bitwise_split(self.read_varint()) + rmax = self.mwm_bitwise_split(self.read_varint()) + pmin = self.to_4326(rmin) + pmax = self.to_4326(rmax) + return (pmin[0], pmin[1], pmax[0], pmax[1]) def read_string(self, plain=False): length = self.read_varuint() + (0 if plain else 1) From 114ae064698c5f4c76b0c2df7fe6402b7584c93f Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Sat, 4 Jun 2016 14:35:01 +0300 Subject: [PATCH 7/9] [mwm.py] Universal feature finder --- tools/python/mwm/find_feature.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100755 tools/python/mwm/find_feature.py diff --git a/tools/python/mwm/find_feature.py b/tools/python/mwm/find_feature.py new file mode 100755 index 0000000000..4071e7dc08 --- /dev/null +++ b/tools/python/mwm/find_feature.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +import sys, os.path, json +from mwm import MWM + +if len(sys.argv) < 4: + print 'Finds features in an mwm file' + print 'Usage: {0} '.format(sys.argv[0]) + print 'Type: t for inside types, et for exact type, n for names' + sys.exit(1) + +typ = sys.argv[2].lower() +find = sys.argv[3] + +mwm = MWM(open(sys.argv[1], 'rb')) +mwm.read_header() +mwm.read_types(os.path.join(os.path.dirname(sys.argv[0]), '..', '..', '..', 'data', 'types.txt')) +for feature in mwm.iter_features(): + found = False + if typ == 'n' and 'name' in feature['header']: + for value in feature['header']['name'].values(): + if find in value: + found = True + elif typ in ('t', 'et'): + for t in feature['header']['types']: + if t == find: + found = True + elif typ == 't' and find in t: + found = True + if found: + print json.dumps(feature, ensure_ascii=False) From 8bcaac7b8e7b0e85172d4dc2906f64eb6f286a20 Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Mon, 6 Jun 2016 15:55:56 +0300 Subject: [PATCH 8/9] [mwm.py] Review fixes and python3 support --- tools/python/mwm/dump_mwm.py | 28 ++++++++++--------- tools/python/mwm/find_feature.py | 14 ++++++---- tools/python/mwm/mwm.py | 48 ++++++++++++++++++++------------ 3 files changed, 54 insertions(+), 36 deletions(-) diff --git a/tools/python/mwm/dump_mwm.py b/tools/python/mwm/dump_mwm.py index 251c5cd03c..dfa0203aa2 100755 --- a/tools/python/mwm/dump_mwm.py +++ b/tools/python/mwm/dump_mwm.py @@ -4,31 +4,33 @@ import json from mwm import MWM if len(sys.argv) < 2: - print 'Dumps some MWM structures.' - print 'Usage: {0} '.format(sys.argv[0]) + print('Dumps some MWM structures.') + print('Usage: {0} '.format(sys.argv[0])) sys.exit(1) mwm = MWM(open(sys.argv[1], 'rb')) mwm.read_types(os.path.join(os.path.dirname(sys.argv[0]), '..', '..', '..', 'data', 'types.txt')) -print 'Tags:' -tvv = sorted([(k, v[0], v[1]) for k, v in mwm.tags.iteritems()], key=lambda x: x[1]) +print('Tags:') +tvv = sorted([(k, v[0], v[1]) for k, v in mwm.tags.items()], key=lambda x: x[1]) for tv in tvv: - print ' {0:<8}: offs {1:9} len {2:8}'.format(tv[0], tv[1], tv[2]) -print 'Version:', mwm.read_version() -print 'Header:', mwm.read_header() -print 'Metadata count:', len(mwm.read_metadata()) + print(' {0:<8}: offs {1:9} len {2:8}'.format(tv[0], tv[1], tv[2])) +v = mwm.read_version() +print('Format: {0}, version: {1}'.format(v['fmt'], v['version'].strftime('%Y-%m-%d %H:%M'))) +print('Header: {0}'.format(mwm.read_header())) +print('Metadata count: {0}'.format(len(mwm.read_metadata()))) cross = mwm.read_crossmwm() if cross: - print 'Outgoing points:', len(cross['out']), 'incoming:', len(cross['in']) - print 'Outgoing regions:', set(cross['neighbours']) + print('Outgoing points: {0}, incoming: {1}'.format(len(cross['out']), len(cross['in']))) + print('Outgoing regions: {0}'.format(set(cross['neighbours']))) -print 'Sample features:' +print('Sample features:') +# Print 5 random features ~10000 features apart count = 5 -probability = 1.0 / 1000 +probability = 1.0 / 10000 for feature in mwm.iter_features(): if random.random() < probability: - print json.dumps(feature, ensure_ascii=False) + print(json.dumps(feature, ensure_ascii=False)) count -= 1 if count <= 0: break diff --git a/tools/python/mwm/find_feature.py b/tools/python/mwm/find_feature.py index 4071e7dc08..87b88066d5 100755 --- a/tools/python/mwm/find_feature.py +++ b/tools/python/mwm/find_feature.py @@ -3,13 +3,17 @@ import sys, os.path, json from mwm import MWM if len(sys.argv) < 4: - print 'Finds features in an mwm file' - print 'Usage: {0} '.format(sys.argv[0]) - print 'Type: t for inside types, et for exact type, n for names' + print('Finds features in an mwm file based on a query') + print('Usage: {0} '.format(sys.argv[0])) + print('') + print('Type:') + print(' t for inside types ("t hwtag" will find all hwtags-*)') + print(' et for exact type ("et shop" won\'t find shop-chemist)') + print(' n for names, case-sensitive ("n Starbucks" for all starbucks)') sys.exit(1) typ = sys.argv[2].lower() -find = sys.argv[3] +find = sys.argv[3].decode('utf-8') mwm = MWM(open(sys.argv[1], 'rb')) mwm.read_header() @@ -27,4 +31,4 @@ for feature in mwm.iter_features(): elif typ == 't' and find in t: found = True if found: - print json.dumps(feature, ensure_ascii=False) + print(json.dumps(feature, ensure_ascii=False)) diff --git a/tools/python/mwm/mwm.py b/tools/python/mwm/mwm.py index abc3552382..99addccf8d 100644 --- a/tools/python/mwm/mwm.py +++ b/tools/python/mwm/mwm.py @@ -46,7 +46,7 @@ class MWM: cnt = self.read_varuint() self.tags = {} for i in range(cnt): - name = self.read_string(True) + name = self.read_string(plain=True) offset = self.read_varuint() length = self.read_varuint() self.tags[name] = (offset, length) @@ -57,13 +57,13 @@ class MWM: def seek_tag(self, tag): self.f.seek(self.tags[tag][0]) - def inside_tag(self, tag): - pos = self.tag_position(tag) - return pos >= 0 and pos < self.tags[tag][1] - - def tag_position(self, tag): + def tag_offset(self, tag): return self.f.tell() - self.tags[tag][0] + def inside_tag(self, tag): + pos = self.tag_offset(tag) + return pos >= 0 and pos < self.tags[tag][1] + def read_version(self): """Reads 'version' section.""" self.seek_tag('version') @@ -128,7 +128,7 @@ class MWM: self.seek_tag('meta') metadatar = {} while self.inside_tag('meta'): - tag_pos = self.tag_position('meta') + tag_pos = self.tag_offset('meta') fields = {} if fmt >= 8: sz = self.read_varuint() @@ -144,7 +144,7 @@ class MWM: t = t & 0x7f t = self.metadata[t] if t < len(self.metadata) else str(t) l = self.read_uint(1) - fields[t] = self.f.read(l) + fields[t] = self.f.read(l).decode('utf-8') if is_last: break @@ -189,7 +189,7 @@ class MWM: neighbours = [] for i in range(neighboursCount): size = self.read_uint(4) - neighbours.append(self.f.read(size)) + neighbours.append(self.f.read(size).decode('utf-8')) return { 'in': incoming, 'out': outgoing, 'matrix': matrix, 'neighbours': neighbours } class GeomType: @@ -314,9 +314,13 @@ class MWM: b = self.f.read(1) if not b: return res - res |= (ord(b[0]) & 0x7F) << shift + try: + bc = ord(b) + except TypeError: + bc = b + res |= (bc & 0x7F) << shift shift += 7 - more = ord(b[0]) >= 0x80 + more = bc >= 0x80 return res def zigzag_decode(self, uint): @@ -353,6 +357,7 @@ class MWM: return self.mwm_decode_delta(u, ref) def to_4326(self, point): + """Convert a point in maps.me-mercator CS to WGS-84 (EPSG:4326).""" if self.coord_size is None: raise Exception('Call read_header() first.') merc_bounds = (-180.0, -180.0, 180.0, 180.0) # Xmin, Ymin, Xmax, Ymax @@ -374,9 +379,10 @@ class MWM: pmax = self.to_4326(rmax) return (pmin[0], pmin[1], pmax[0], pmax[1]) - def read_string(self, plain=False): + def read_string(self, plain=False, decode=True): length = self.read_varuint() + (0 if plain else 1) - return self.f.read(length) + s = self.f.read(length) + return s.decode('utf-8') if decode else s def read_uint_array(self): length = self.read_varuint() @@ -390,13 +396,16 @@ class MWM: if sz & 1 != 0: return str(sz >> 1) sz = (sz >> 1) + 1 - return self.f.read(sz) + return self.f.read(sz).decode('utf-8') def read_multilang(self): def find_multilang_next(s, i): i += 1 while i < len(s): - c = struct.unpack('B', s[i])[0] + try: + c = ord(s[i]) + except: + c = s[i] if c & 0xC0 == 0x80: break if c & 0x80 == 0: @@ -416,13 +425,16 @@ class MWM: i += 1 return i - s = self.read_string() + s = self.read_string(decode=False) langs = {} i = 0 while i < len(s): n = find_multilang_next(s, i) - lng = struct.unpack('B', s[i])[0] & 0x3F + try: + lng = ord(s[i]) & 0x3F + except TypeError: + lng = s[i] & 0x3F if lng < len(self.languages): - langs[self.languages[lng]] = s[i+1:n] + langs[self.languages[lng]] = s[i+1:n].decode('utf-8') i = n return langs From b93f85fa3e315346ef2149ff60be975919369e0d Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Mon, 6 Jun 2016 20:23:17 +0300 Subject: [PATCH 9/9] [mwm.py] Use reservoir sampling for printing out features --- tools/python/mwm/dump_mwm.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/python/mwm/dump_mwm.py b/tools/python/mwm/dump_mwm.py index dfa0203aa2..0147c8dc19 100755 --- a/tools/python/mwm/dump_mwm.py +++ b/tools/python/mwm/dump_mwm.py @@ -25,12 +25,13 @@ if cross: print('Outgoing regions: {0}'.format(set(cross['neighbours']))) print('Sample features:') -# Print 5 random features ~10000 features apart +# Print some random features using reservoir sampling count = 5 -probability = 1.0 / 10000 -for feature in mwm.iter_features(): - if random.random() < probability: - print(json.dumps(feature, ensure_ascii=False)) - count -= 1 - if count <= 0: - break +sample = [] +for i, feature in enumerate(mwm.iter_features()): + if i < count: + sample.append(feature) + elif random.randint(0, i) < count: + sample[random.randint(0, count-1)] = feature +for feature in sample: + print(json.dumps(feature, ensure_ascii=False))