forked from organicmaps/organicmaps
Merge pull request #6032 from Zverik/new_localads_csv
[localads] Rewrite feature→osm mapping script
This commit is contained in:
commit
b02a967ae1
4 changed files with 122 additions and 151 deletions
|
@ -1,147 +0,0 @@
|
|||
#!/usr/bin/env python2.7
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
# TODO(mgsergio, zveric, yershov): Make mwm an installable module.
# Until then, make the sibling ``mwm`` folder importable by extending sys.path.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'mwm'))
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
# c_long is used to get signed int64. Postgres can't handle uint64.
|
||||
import ctypes
|
||||
import logging
|
||||
import mwm
|
||||
|
||||
from itertools import islice
|
||||
from zlib import adler32
|
||||
|
||||
|
||||
def get_mapping(mapping_name):
    """Yield (osm id, feature id) pairs read from an .osm2ft file.

    The file at ``mapping_name`` is read completely with
    ``mwm.read_osm2ft(f, tuples=False)`` before the first pair is yielded,
    so the file is already closed while the caller iterates.

    Each osm id is reinterpreted as a signed 64-bit integer, because the
    database that receives the rows cannot store unsigned 64-bit values.
    """
    with open(mapping_name, 'rb') as f:
        osm2ft = mwm.read_osm2ft(f, tuples=False)

    for osmid, fid in osm2ft.iteritems():
        # c_int64 is exactly 64 bits wide everywhere; c_long is only 32 bits
        # on LLP64 platforms and would truncate the id there.
        yield ctypes.c_int64(osmid).value, fid
|
||||
|
||||
|
||||
def print_mapping(mapping, count):
    """Print the first ``count`` pairs of ``mapping`` to stdout.

    Every pair is written on its own line as two tab-separated values.
    """
    row_template = '{}\t{}'
    for osm_id, feature_id in islice(mapping, count):
        print(row_template.format(osm_id, feature_id))
|
||||
|
||||
|
||||
def generate_id_from_name_and_version(name, version):
    """Build a signed 64-bit mwm id from a region name and an mwm version.

    The Adler-32 checksum of ``name`` is shifted into the upper 32 bits and
    ``version`` is OR-ed into the lower bits. The result is reinterpreted as
    a signed 64-bit integer, because Postgres can't handle uint64.

    ``name`` may be a byte string or text; text is encoded as UTF-8 first.
    """
    if not isinstance(name, bytes):
        name = name.encode('utf-8')
    # Python 2 may return the checksum as a signed value; the mask makes it
    # the same unsigned number on every version and platform.
    checksum = adler32(name) & 0xffffffff
    # c_int64 is 64 bits everywhere, unlike c_long (32 bits on LLP64).
    return ctypes.c_int64((checksum << 32) | version).value
|
||||
|
||||
|
||||
def generate_csvs(mapping, mapping_name, version, output_path):
    """Append one mwm and all of its mapping rows to the CSVs in ``output_path``.

    ``mwm.csv`` receives a single ``(mwm_id, mapping_name, version)`` row.
    ``mapping.csv`` receives every tuple of ``mapping`` extended with
    ``(mwm_id, version)``. Both files are opened in append mode and no
    header row is written.
    """
    mwm_id = generate_id_from_name_and_version(mapping_name, version)

    # TODO(mgsergio): Either remove the header or make sure it is written
    # only once. Columns would be: id, name, version.
    with open(os.path.join(output_path, 'mwm.csv'), 'ab') as mwm_out:
        csv.writer(mwm_out).writerow([mwm_id, mapping_name, version])

    # TODO(mgsergio): Same header question as above.
    # Columns would be: osmid, fid, mwm_id, mwm_version.
    with open(os.path.join(output_path, 'mapping.csv'), 'ab') as mapping_out:
        csv.writer(mapping_out).writerows(
            pair + (mwm_id, version) for pair in mapping
        )
|
||||
|
||||
|
||||
def get_args():
    """Parse and return the command-line arguments of the script.

    Exactly one source (``--mapping_names`` or ``--mapping_path``) and
    exactly one destination (``--output_path`` or ``--head``) must be given;
    ``--version`` is always required. Both source options store their result
    in ``mapping_names``.
    """
    parser = argparse.ArgumentParser()
    source = parser.add_mutually_exclusive_group(required=True)
    target = parser.add_mutually_exclusive_group(required=True)

    source.add_argument('--mapping_names', nargs='+',
                        help='osm2ft files to handle.')
    source.add_argument('--mapping_path', nargs=1,
                        action=AppendOsm2FidAction, dest='mapping_names',
                        help='Path to folder with .osm2ft. Each file whould be handled.')

    target.add_argument('--output_path',
                        help='A path to an output folder.')
    target.add_argument('--head', type=int,
                        help='Write that much lines of osmid <-> fid to stdout.')

    parser.add_argument('--version', required=True, type=int,
                        help='The version of mwm for which a mapping is generated.')

    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Run the script: preview a mapping or append mappings to the CSV files.

    With ``--head N`` the first N pairs of the first mapping file are printed
    and the process exits with status 0; later files are not looked at.
    Otherwise every mapping file is appended to the CSVs in ``--output_path``
    under the name taken from the file name up to its first dot.
    """
    args = get_args()
    for mapping_name in args.mapping_names:
        mapping = get_mapping(mapping_name)
        # Compare with None so that an explicit ``--head 0`` is still treated
        # as "preview mode" instead of falling through with no output path.
        if args.head is not None:
            print('{}:'.format(mapping_name))
            print_mapping(mapping, args.head)
            # sys.exit is always available; the bare ``exit`` helper is only
            # installed by the site module.
            sys.exit(0)
        mwm_name = os.path.basename(mapping_name).split('.', 1)[0]
        logging.info('Writing mapping for %s', mapping_name)
        generate_csvs(mapping, mwm_name, args.version, args.output_path)
|
||||
|
||||
|
||||
class AppendOsm2FidAction(argparse.Action):
    """argparse action that expands a folder into the .osm2ft files inside it.

    The option takes exactly one value, a folder path. The destination
    attribute is set (not appended) to the list of paths of all entries in
    that folder whose name ends with ``.osm2ft``, sorted by name.
    """

    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        # Raise instead of assert: asserts are stripped when running with -O.
        if nargs != 1:
            raise ValueError('nargs should be equal to 1.')
        super(AppendOsm2FidAction, self).__init__(
            option_strings,
            dest,
            nargs=1,
            **kwargs
        )

    def __call__(self, parser, namespace, values, option_string=None):
        folder = values[0]
        # os.listdir returns entries in arbitrary order; sort them so that
        # files are always handled in the same order.
        mapping_names = [
            os.path.join(folder, mapping_name)
            for mapping_name in sorted(os.listdir(folder))
            if mapping_name.endswith('.osm2ft')
        ]
        setattr(namespace, self.dest, mapping_names)
|
||||
|
||||
|
||||
# Script entry point: enable INFO-level logging, then run main().
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
|
117
tools/python/local_ads/mwm_to_csv_4localads.py
Executable file
117
tools/python/local_ads/mwm_to_csv_4localads.py
Executable file
|
@ -0,0 +1,117 @@
|
|||
#!/usr/bin/env python2.7
|
||||
import os
|
||||
import sys
|
||||
|
||||
# TODO: Make mwm an installable module.
# Until then, make the sibling ``mwm`` folder importable by extending sys.path.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'mwm'))
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import mwm
|
||||
import logging
|
||||
import ctypes
|
||||
from zlib import adler32
|
||||
from multiprocessing import Pool, Queue, Process
|
||||
|
||||
|
||||
# Column names of every generated CSV file, keyed by the file's base name.
HEADERS = {
    'mapping': ['osmid', 'fid', 'mwm_id', 'mwm_version'],
    'sponsored': ['stype', 'sid', 'fid', 'mwm_id', 'mwm_version'],
    'mwm': ['mwm_id', 'name', 'mwm_version'],
}
# One queue per CSV file; rows are put here and consumed by write_csv().
QUEUES = dict((name, Queue()) for name in HEADERS)
# Only features having a type that starts with one of these prefixes are
# written to the mapping CSV.
GOOD_TYPES = (
    "amenity",
    "shop",
    "tourism",
    "leisure",
    "sport",
    "craft",
    "man_made",
    "office",
    "historic",
)
|
||||
|
||||
|
||||
def generate_id_from_name_and_version(name, version):
    """Build a signed 64-bit mwm id from a region name and an mwm version.

    The Adler-32 checksum of ``name`` is shifted into the upper 32 bits and
    ``version`` is OR-ed into the lower bits; the result is then
    reinterpreted as a signed 64-bit integer.

    ``name`` may be a byte string or text; text is encoded as UTF-8 first.
    """
    if not isinstance(name, bytes):
        name = name.encode('utf-8')
    # Python 2 may return the checksum as a signed value; the mask makes it
    # the same unsigned number on every version and platform.
    checksum = adler32(name) & 0xffffffff
    # c_int64 is 64 bits everywhere, unlike c_long (32 bits on LLP64).
    return ctypes.c_int64((checksum << 32) | version).value
|
||||
|
||||
|
||||
def parse_mwm(mwm_name, osm2ft_name, override_version, types_name):
    """Read one mwm file and queue the CSV rows derived from it.

    Args:
        mwm_name: path to the .mwm file. Only its base name without the
            extension is used as the region name, so the generated mwm id
            does not depend on the folder the file is stored in.
        osm2ft_name: path to the matching .osm2ft file.
        override_version: version to use instead of the one stored in the
            mwm file; a falsy value means "read it from the file".
        types_name: path to types.txt, passed to ``MWM.read_types``.

    Rows are put into the module-level QUEUES:
        * 'mwm': one ``(mwm_id, region_name, version)`` row per file.
        * 'sponsored': for a feature without an osm id whose metadata has
          'ref:sponsored', one row for its first 'sponsored-*' type.
        * 'mapping': for a feature with an osm id, one row if any of its
          types starts with a GOOD_TYPES prefix.
    """
    region_name = os.path.splitext(os.path.basename(mwm_name))[0]
    with open(osm2ft_name, 'rb') as f:
        ft2osm = mwm.read_osm2ft(f, ft2osm=True, tuples=False)
    with open(mwm_name, 'rb') as f:
        mwm_file = mwm.MWM(f)
        version = override_version or mwm_file.read_version()['version']
        mwm_id = generate_id_from_name_and_version(region_name, version)
        QUEUES['mwm'].put((mwm_id, region_name, version))
        mwm_file.read_header()
        mwm_file.read_types(types_name)
        for feature in mwm_file.iter_features(metadata=True):
            osm_id = ft2osm.get(feature['id'], None)
            if osm_id is None:
                if 'ref:sponsored' in feature['metadata']:
                    for t in feature['header']['types']:
                        if t.startswith('sponsored-'):
                            # The part after the first dash is the sponsor type.
                            QUEUES['sponsored'].put((t[t.find('-') + 1:],
                                                     feature['metadata']['ref:sponsored'],
                                                     feature['id'],
                                                     mwm_id,
                                                     version))
                            break
            else:
                for t in feature['header']['types']:
                    if t.startswith(GOOD_TYPES):
                        # Store the osm id as a signed 64-bit value. c_int64
                        # is 64 bits everywhere, unlike c_long.
                        QUEUES['mapping'].put((ctypes.c_int64(osm_id).value,
                                               feature['id'],
                                               mwm_id,
                                               version))
                        break


def write_csv(output_dir, qtype):
    """Drain the ``qtype`` queue into ``<output_dir>/<qtype>.csv``.

    Blocks until the first queue item arrives, writes the header row from
    HEADERS, then writes every item as a row until a ``None`` sentinel is
    received.
    """
    with open(os.path.join(output_dir, qtype + '.csv'), 'w') as f:
        mapping = QUEUES[qtype].get()
        w = csv.writer(f)
        w.writerow(HEADERS[qtype])
        while mapping is not None:
            w.writerow(mapping)
            mapping = QUEUES[qtype].get()


def main():
    """Convert every regional mwm file of a folder into localads CSV files.

    Files whose name contains 'World' or 'minsk_pass', or does not end with
    '.mwm', are skipped. Exits with status 2 if an .osm2ft file is missing
    and with status 1 if any mwm file could not be processed.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%H:%M:%S')
    parser = argparse.ArgumentParser(
        description='Prepares CSV files for uploading to localads database from mwm files.')
    parser.add_argument('mwm', help='path to mwm files')
    parser.add_argument('--osm2ft', help='path to osm2ft files (default is the same as mwm)')
    parser.add_argument('--output', default='.', help='path to generated files ("." by default)')
    types_default = os.path.join(os.path.dirname(sys.argv[0]),
                                 '..', '..', '..', 'data', 'types.txt')
    parser.add_argument('--types', default=types_default, help='path to omim/data/types.txt')
    parser.add_argument('--threads', type=int, help='number of threads to process files')
    parser.add_argument('--version', type=int, help='override mwm version')
    args = parser.parse_args()
    if not args.osm2ft:
        args.osm2ft = args.mwm

    # Resolve and validate all inputs before any process is started: exiting
    # while the writer processes are blocked on their queues would hang,
    # because non-daemonic children are joined at interpreter exit.
    jobs = []
    for mwm_name in os.listdir(args.mwm):
        if 'World' in mwm_name or 'minsk_pass' in mwm_name or not mwm_name.endswith('.mwm'):
            continue
        osm2ft_name = os.path.join(args.osm2ft, mwm_name + '.osm2ft')
        if not os.path.exists(osm2ft_name):
            logging.error('Cannot find %s', osm2ft_name)
            sys.exit(2)
        # os.listdir returns bare names; join with the folder so that the
        # file can be opened regardless of the current working directory.
        jobs.append((mwm_name, os.path.join(args.mwm, mwm_name), osm2ft_name))

    # Create CSV writer processes for each queue and a pool of MWM readers.
    writers = [Process(target=write_csv, args=(args.output, qtype)) for qtype in QUEUES]
    for w in writers:
        w.start()
    pool = Pool(processes=args.threads)
    results = []
    for mwm_name, mwm_path, osm2ft_name in jobs:
        logging.info(mwm_name)
        result = pool.apply_async(parse_mwm, (mwm_path, osm2ft_name, args.version, args.types))
        results.append((mwm_path, result))
    pool.close()
    pool.join()
    # A None in each queue tells its writer that no more rows will come.
    for queue in QUEUES.values():
        queue.put(None)
    for w in writers:
        w.join()

    # apply_async keeps worker exceptions inside the result object; fetch
    # every result so that a failed file is reported instead of ignored.
    failed = False
    for mwm_path, result in results:
        try:
            result.get()
        except Exception:
            logging.exception('Failed to process %s', mwm_path)
            failed = True
    if failed:
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
|
@ -15,7 +15,7 @@ tvv = sorted([(k, v[0], v[1]) for k, v in mwm.tags.items()], key=lambda x: x[1])
|
|||
for tv in tvv:
|
||||
print(' {0:<8}: offs {1:9} len {2:8}'.format(tv[0], tv[1], tv[2]))
|
||||
v = mwm.read_version()
|
||||
print('Format: {0}, version: {1}'.format(v['fmt'], v['version'].strftime('%Y-%m-%d %H:%M')))
|
||||
print('Format: {0}, version: {1}'.format(v['fmt'], v['date'].strftime('%Y-%m-%d %H:%M')))
|
||||
print('Header: {0}'.format(mwm.read_header()))
|
||||
print('Region Info: {0}'.format(mwm.read_region_info()))
|
||||
print('Metadata count: {0}'.format(len(mwm.read_metadata())))
|
||||
|
|
|
@ -77,10 +77,11 @@ class MWM:
|
|||
fmt = self.read_varuint() + 1
|
||||
version = self.read_varuint()
|
||||
if version < 161231:
|
||||
version = datetime(2000 + int(version / 10000), int(version / 100) % 100, version % 100)
|
||||
vdate = datetime(2000 + int(version / 10000), int(version / 100) % 100, version % 100)
|
||||
else:
|
||||
version = datetime.fromtimestamp(version)
|
||||
return { 'fmt': fmt, 'version': version }
|
||||
vdate = datetime.fromtimestamp(version)
|
||||
version = int(vdate.strftime('%y%m%d'))
|
||||
return {'fmt': fmt, 'version': version, 'date': vdate}
|
||||
|
||||
def read_header(self):
|
||||
"""Reads 'header' section."""
|
||||
|
|
Loading…
Add table
Reference in a new issue