From 105babafb822d743f5bedcc56924aef966a2f4f4 Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Fri, 5 May 2017 20:23:08 +0300 Subject: [PATCH 1/2] [localads] Rewrite mapping script --- .../python/local_ads/mwm_to_csv_4localads.py | 117 ++++++++++++++++++ tools/python/mwm/dump_mwm.py | 2 +- tools/python/mwm/mwm.py | 7 +- 3 files changed, 122 insertions(+), 4 deletions(-) create mode 100755 tools/python/local_ads/mwm_to_csv_4localads.py diff --git a/tools/python/local_ads/mwm_to_csv_4localads.py b/tools/python/local_ads/mwm_to_csv_4localads.py new file mode 100755 index 0000000000..384142f8db --- /dev/null +++ b/tools/python/local_ads/mwm_to_csv_4localads.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python2.7 +import os +import sys + +# TODO: Make mwm an installable module. +sys.path.append( + os.path.join( + os.path.dirname(__file__), '..', 'mwm' + ) +) + +import argparse +import csv +import mwm +import logging +import ctypes +from zlib import adler32 +from multiprocessing import Pool, Queue, Process + + +HEADERS = { + 'mapping': 'osmid fid mwm_id mwm_version'.split(), + 'sponsored': 'stype sid fid mwm_id mwm_version'.split(), + 'mwm': 'mwm_id name mwm_version'.split(), +} +QUEUES = {name: Queue() for name in HEADERS} +GOOD_TYPES = ("amenity", "shop", "tourism", "leisure", "sport", + "craft", "man_made", "office", "historic") + + +def generate_id_from_name_and_version(name, version): + return ctypes.c_long((adler32(name) << 32) | version).value + + +def parse_mwm(mwm_name, osm2ft_name, override_version, types_name): + with open(osm2ft_name, 'rb') as f: + ft2osm = mwm.read_osm2ft(f, ft2osm=True, tuples=False) + with open(mwm_name, 'rb') as f: + mwm_file = mwm.MWM(f) + version = override_version or mwm_file.read_version()['version'] + region_name = os.path.splitext(mwm_name)[0] + mwm_id = generate_id_from_name_and_version(region_name, version) + QUEUES['mwm'].put((mwm_id, region_name, version)) + mwm_file.read_header() + mwm_file.read_types(types_name) + for feature in mwm_file.iter_features(metadata=True): + osm_id = ft2osm.get(feature['id'], None) + if osm_id is None: + if 'ref:sponsored' in feature['metadata']: + for t in feature['header']['types']: + if t.startswith('sponsored-'): + QUEUES['sponsored'].put((t[t.find('-')+1:], + feature['metadata']['ref:sponsored'], + feature['id'], + mwm_id, + version)) + break + else: + for t in feature['header']['types']: + if t.startswith(GOOD_TYPES): + QUEUES['mapping'].put((ctypes.c_long(osm_id).value, + feature['id'], + mwm_id, + version)) + break + + +def write_csv(output_dir, qtype): + with open(os.path.join(output_dir, qtype + '.csv'), 'w') as f: + mapping = QUEUES[qtype].get() + w = csv.writer(f) + w.writerow(HEADERS[qtype]) + while mapping is not None: + w.writerow(mapping) + mapping = QUEUES[qtype].get() + + +def main(): + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%H:%M:%S') + parser = argparse.ArgumentParser( + description='Prepares CSV files for uploading to localads database from mwm files.') + parser.add_argument('mwm', help='path to mwm files') + parser.add_argument('--osm2ft', help='path to osm2ft files (default is the same as mwm)') + parser.add_argument('--output', default='.', help='path to generated files ("." by default)') + types_default = os.path.join(os.path.dirname(sys.argv[0]), + '..', '..', '..', 'data', 'types.txt') + parser.add_argument('--types', default=types_default, help='path to omim/data/types.txt') + parser.add_argument('--threads', type=int, help='number of threads to process files') + parser.add_argument('--version', type=int, help='override mwm version') + args = parser.parse_args() + if not args.osm2ft: + args.osm2ft = args.mwm + + # Create CSV writer processes for each queue and a pool of MWM readers. + writers = [Process(target=write_csv, args=(args.output, qtype)) for qtype in QUEUES] + for w in writers: + w.start() + pool = Pool(processes=args.threads) + for mwm_name in os.listdir(args.mwm): + if 'World' in mwm_name or 'minsk_pass' in mwm_name or not mwm_name.endswith('.mwm'): + continue + osm2ft_name = os.path.join(args.osm2ft, os.path.basename(mwm_name) + '.osm2ft') + if not os.path.exists(osm2ft_name): + logging.error('Cannot find %s', osm2ft_name) + sys.exit(2) + logging.info(mwm_name) + pool.apply_async(parse_mwm, (mwm_name, osm2ft_name, args.version, args.types)) + pool.close() + pool.join() + for queue in QUEUES.values(): + queue.put(None) + for w in writers: + w.join() + + +if __name__ == '__main__': + main() diff --git a/tools/python/mwm/dump_mwm.py b/tools/python/mwm/dump_mwm.py index 29fb1c6562..98e240eebb 100755 --- a/tools/python/mwm/dump_mwm.py +++ b/tools/python/mwm/dump_mwm.py @@ -15,7 +15,7 @@ tvv = sorted([(k, v[0], v[1]) for k, v in mwm.tags.items()], key=lambda x: x[1]) for tv in tvv: print(' {0:<8}: offs {1:9} len {2:8}'.format(tv[0], tv[1], tv[2])) v = mwm.read_version() -print('Format: {0}, version: {1}'.format(v['fmt'], v['version'].strftime('%Y-%m-%d %H:%M'))) +print('Format: {0}, version: {1}'.format(v['fmt'], v['date'].strftime('%Y-%m-%d %H:%M'))) print('Header: {0}'.format(mwm.read_header())) print('Region Info: {0}'.format(mwm.read_region_info())) print('Metadata count: {0}'.format(len(mwm.read_metadata()))) diff --git a/tools/python/mwm/mwm.py b/tools/python/mwm/mwm.py index e67ae8def0..4784f98b67 100644 --- a/tools/python/mwm/mwm.py +++ b/tools/python/mwm/mwm.py @@ -77,10 +77,11 @@ class MWM: fmt = self.read_varuint() + 1 version = self.read_varuint() if version < 161231: - version = datetime(2000 + int(version / 10000), int(version / 100) % 100, version % 100) + vdate = datetime(2000 + int(version / 10000), int(version / 100) % 100, version % 100) else: - version = datetime.fromtimestamp(version) - return { 'fmt': fmt, 'version': version } + vdate = datetime.fromtimestamp(version) + version = int(vdate.strftime('%y%m%d')) + return {'fmt': fmt, 'version': version, 'date': vdate} def read_header(self): """Reads 'header' section.""" From 5944d06a901bc171de114139b9c7d8406c7743be Mon Sep 17 00:00:00 2001 From: Ilya Zverev Date: Thu, 11 May 2017 15:01:01 +0300 Subject: [PATCH 2/2] [localads] Remove the old script --- tools/python/local_ads/features_db_updater.py | 147 ------------------ 1 file changed, 147 deletions(-) delete mode 100755 tools/python/local_ads/features_db_updater.py diff --git a/tools/python/local_ads/features_db_updater.py b/tools/python/local_ads/features_db_updater.py deleted file mode 100755 index 127e995a54..0000000000 --- a/tools/python/local_ads/features_db_updater.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python2.7 - -from __future__ import print_function - -import os -import sys - -# TODO(mgsergio, zveric, yershov): Make mwm an installable module. -sys.path.append( - os.path.join( - os.path.dirname(__file__), '..', 'mwm' - ) -) - -import argparse -import csv -# c_long is used to get signed int64. Postgres can't handle uint64. -import ctypes -import logging -import mwm - -from itertools import islice -from zlib import adler32 - - -def get_mapping(mapping_name): - with open(mapping_name, 'rb') as f: - osm2ft = mwm.read_osm2ft(f, tuples=False) - - for osmid, fid in osm2ft.iteritems(): - yield ctypes.c_long(osmid).value, fid - - -def print_mapping(mapping, count): - for osmid, fid in islice(mapping, count): - print('{}\t{}'.format(osmid, fid)) - - -def generate_id_from_name_and_version(name, version): - return ctypes.c_long((adler32(name) << 32) | version).value - - -def generate_csvs(mapping, mapping_name, version, output_path): - mwm_id = generate_id_from_name_and_version( - mapping_name, - version - ) - - with open(os.path.join(output_path, 'mwm.csv'), 'ab') as f: - w = csv.writer(f) - # TODO(mgsergio): Either remove or make so this is will write only one header. - # w.writerow(['id', 'name', 'version']) - w.writerow([ - mwm_id, - mapping_name, - version, - ]) - - with open(os.path.join(output_path, 'mapping.csv'), 'ab') as f: - w = csv.writer(f) - # TODO(mgsergio): Either remove or make so this is will write only one header. - # w.writerow(['osmid', 'fid', 'mwm_id', mwm_version]) - for row in mapping: - w.writerow(row + (mwm_id, version)) - - -def get_args(): - parser = argparse.ArgumentParser() - src = parser.add_mutually_exclusive_group(required=True) - dst = parser.add_mutually_exclusive_group(required=True) - - src.add_argument( - '--mapping_names', - nargs='+', - help='osm2ft files to handle.' - ) - src.add_argument( - '--mapping_path', - nargs=1, - action=AppendOsm2FidAction, - dest='mapping_names', - help='Path to folder with .osm2ft. Each file whould be handled.' - ) - - dst.add_argument( - '--output_path', - help='A path to an output folder.' - ) - dst.add_argument( - '--head', - type=int, - help='Write that much lines of osmid <-> fid to stdout.' - ) - - parser.add_argument( - '--version', - required=True, - type=int, - help='The version of mwm for which a mapping is generated.' - ) - - return parser.parse_args(); - - -def main(): - args = get_args() - for mapping_name in args.mapping_names: - mapping = get_mapping(mapping_name) - if args.head: - print('{}:'.format(mapping_name)) - print_mapping(mapping, args.head) - exit(0) - mwm_name = ( - os.path.basename(mapping_name) - .split('.', 1) - )[0] - logging.info('Writing mapping for {}'.format(mapping_name)) - generate_csvs( - mapping, - mwm_name, - args.version, - args.output_path - ) - - -class AppendOsm2FidAction(argparse.Action): - def __init__(self, option_strings, dest, nargs=None, **kwargs): - assert nargs == 1, 'nargs should equals to 1.' - super(AppendOsm2FidAction, self).__init__( - option_strings, - dest, - nargs=1, - **kwargs - ) - - def __call__(self, parser, namespace, values, option_string=None): - values = [ - os.path.join(values[0], mapping_name) - for mapping_name in os.listdir(values[0]) - if mapping_name.endswith('.osm2ft') - ] - setattr(namespace, self.dest, values) - - -if __name__ == '__main__': - logging.basicConfig(level=logging.INFO) - main()