forked from organicmaps/organicmaps
[python] Refactoring: changed format.
parent c08ce405d3
commit d17c7ed26b
2 changed files with 174 additions and 145 deletions
@@ -26,164 +26,181 @@ from optparse import OptionParser
class CountryDict(dict):
    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.order = ['id', 'n', 'f', 'v', 'c', 's', 'sha1_base64', 'rs', 'g']

    def __iter__(self):
        for key in self.order:
            if key in self:
                yield key
        for key in dict.__iter__(self):
            if key not in self.order:
                yield key

    def iteritems(self):
        for key in self.__iter__():
            yield (key, self.__getitem__(key))
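An illustrative aside, not part of the commit: CountryDict overrides iteration so that the keys named in self.order come out first, which keeps the serialized country tree stable. With made-up values:

    d = CountryDict({'s': 123, 'id': 'Austria', 'g': [], 'extra': 1})
    list(d)               # ['id', 's', 'g', 'extra'] -- known keys first, the rest after
    dict(d.iteritems())   # same pairs, visited in that order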

def get_hash(path, name):
    if path == '0':
        return ''
    filename = os.path.join(path, '{0}.mwm'.format(name))
    h = hashlib.sha1()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            h.update(chunk)
    return base64.b64encode(h.digest())
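A side note on the read loop above (illustrative, not from the commit): iter(callable, sentinel) keeps calling f.read(4096) until it returns the empty-bytes sentinel, so the file is hashed in 4 KB chunks instead of being loaded whole. The same idiom on an in-memory buffer:

    from io import BytesIO

    buf = BytesIO(b'a' * 10000)
    chunks = list(iter(lambda: buf.read(4096), b''))
    [len(c) for c in chunks]   # [4096, 4096, 1808] -- reading stops at the b'' sentinel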

def get_size(path, name):
    if path == '0':
        return 0
    filename = os.path.join(path, '{0}.mwm'.format(name))
    try:
        return os.path.getsize(filename)
    except OSError:
        sys.stderr.write('Missing file: {0}\n'.format(filename))
        return -1

def collapse_single(root):
    for i in range(len(root['g'])):
        if 'g' in root['g'][i]:
            if len(root['g'][i]['g']) == 1:
                # replace group by a leaf
                if 'c' in root['g'][i]:
                    root['g'][i]['g'][0]['c'] = root['g'][i]['c']
                root['g'][i] = root['g'][i]['g'][0]
            else:
                collapse_single(root['g'][i])
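A sketch of what collapse_single does (illustrative names and sizes, not from the commit): a group that contains exactly one child is replaced by that child, and the child takes over the parent's 'c' flag if the parent had one.

    tree = {'g': [
        {'id': 'Estonia', 'c': 'ee', 'g': [{'id': 'Estonia_mainland', 's': 1}]},
        {'id': 'Latvia', 's': 2},
    ]}
    collapse_single(tree)
    # tree['g'][0] is now {'id': 'Estonia_mainland', 's': 1, 'c': 'ee'}
    # tree['g'][1] is untouched because it is already a leaf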

def get_name(leaf):
    if 'n' in leaf:
        return leaf['n'].lower()
    else:
        return leaf['id'].lower()

def sort_tree(root):
    root['g'].sort(key=get_name)
    for leaf in root['g']:
        if 'g' in leaf:
            sort_tree(leaf)

parser = OptionParser(add_help_option=False)
parser.add_option('-t', '--target', help='Path to mwm files')
parser.add_option('-h', '--hierarchy', default='hierarchy.txt',
                  help='Hierarchy file')
parser.add_option('--old', help='old_vs_new.csv file')
parser.add_option('--osm', help='borders_vs_osm.csv file')
parser.add_option('-v', '--version', type='int', default=151231, help='Version')
parser.add_option('-o', '--output',
                  help='Output countries.txt file (default is stdout)')
parser.add_option('-m', '--help', action='store_true', help='Display this help')
parser.add_option('--flag', action='store_true',
                  help='Add flags ("c") to countries')
parser.add_option('--lang', action='store_true',
                  help='Add languages ("lang") to countries')
parser.add_option('-l', '--legacy', action='store_true',
                  help='Produce a legacy format file')
parser.add_option('-n', '--names',
                  help='Translations for file names (for legacy format)')
parser.add_option('-s', '--sort', action='store_true',
                  help='Sort leaves by name (useful for legacy)')
(options, args) = parser.parse_args()

if options.help:
    parser.print_help()
    sys.exit(0)

if not os.path.isfile(options.hierarchy):
    parser.error('Hierarchy file is required.')

oldvs = {}
if options.old:
    with open(options.old, 'r') as f:
        for line in f:
            m = re.match(r'(.+?)\t(.+)', line.strip())
            if m:
                if m.group(2) in oldvs:
                    oldvs[m.group(2)].append(m.group(1))
                else:
                    oldvs[m.group(2)] = [m.group(1)]
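For orientation (the sample rows are made up, not from the commit): old_vs_new.csv is read as tab-separated 'old name<TAB>new name' pairs, and oldvs maps each new name to the list of old names that fold into it, later emitted as the 'old' field. A condensed equivalent of the loop above:

    import re

    oldvs = {}
    for row in ['Czech Republic\tCzech_East\n', 'Czech Republic East\tCzech_East\n']:
        m = re.match(r'(.+?)\t(.+)', row.strip())
        if m:
            oldvs.setdefault(m.group(2), []).append(m.group(1))
    # oldvs == {'Czech_East': ['Czech Republic', 'Czech Republic East']}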

vsosm = {}
if options.osm:
    with codecs.open(options.osm, 'r', 'utf-8') as f:
        for line in f:
            m = re.match(r'^(.+?)\t(\d)\t(.+?)$', line.strip())
            if m:
                if m.group(1) in vsosm:
                    vsosm[m.group(1)].append(m.group(3))
                else:
                    vsosm[m.group(1)] = [m.group(3)]

names = {}
if options.names:
    with codecs.open(options.names, 'r', 'utf-8') as f:
        for line in f:
            pair = [x.strip() for x in line.split('=', 1)]
            if len(pair) == 2 and pair[0] != pair[1]:
                try:
                    names[pair[0]] = pair[1]
                except Error:
                    sys.stderr.write(
                        'Could not read translation for {0}\n'.format(pair[0]))

nameattr = 'n' if options.legacy else 'id'
mwmpath = '0' if not options.target else options.target
stack = [CountryDict(
    {"v": options.version, nameattr: "World" if options.legacy else "Countries",
     "g": []})]
last = None
with open(options.hierarchy, 'r') as f:
    for line in f:
        m = re.match('( *)(.+?)\n', line)
        if m:
            depth = len(m.group(1))
            if last is not None:
                lastd = last['d']
                del last['d']
                if lastd < depth:
                    # last is a group
                    last['g'] = []
                    if options.legacy and 'f' in last:
                        del last['f']
                    stack.append(last)
                else:
                    name = last['f' if 'f' in last else nameattr]
                    last['s'] = get_size(mwmpath, name)
                    last['sha1_base64'] = get_hash(mwmpath, name)
                    if options.legacy:
                        last['rs'] = 0
                    if last['s'] >= 0:
                        stack[-1]['g'].append(last)
                while depth < len(stack) - 1:
                    # group ended, add it to higher group
                    g = stack.pop()
                    if len(g['g']) > 0:
                        stack[-1]['g'].append(g)
            items = m.group(2).split(';')
            last = CountryDict({nameattr: items[0], "d": depth})
            if not options.legacy and items[0] in oldvs:
                last['old'] = oldvs[items[0]]
            if not options.legacy and items[0] in vsosm:
                last['affiliations'] = vsosm[items[0]]
            if (options.legacy or options.flag) and len(items) > 2 and len(
                    items[2]) > 0:
                last['c'] = items[2]
            if options.lang and len(items) > 3 and len(items[3]) > 0:
                last['lang'] = items[3].split(',')
            if options.legacy and items[0] in names:
                last['f'] = last[nameattr]
                last[nameattr] = names[items[0]]

# the last line is always a file
del last['d']
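To make the parsing loop easier to follow (the sample line is hypothetical, not from the commit): each hierarchy.txt line is indented by one space per nesting level, and the record is split on ';', with items[0] the region id, items[2] an optional flag code ('c') and items[3] an optional comma-separated language list:

    line = '  Austria_Salzburg;xx;at;de\n'   # two leading spaces -> depth 2; items[1] is not used here
    m = re.match('( *)(.+?)\n', line)
    depth = len(m.group(1))                  # 2
    items = m.group(2).split(';')            # ['Austria_Salzburg', 'xx', 'at', 'de']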

@@ -191,19 +208,19 @@ name = last['f' if 'f' in last else nameattr]
last['s'] = get_size(mwmpath, name)
last['sha1_base64'] = get_hash(mwmpath, name)
if options.legacy:
    last['rs'] = 0
if last['s'] >= 0:
    stack[-1]['g'].append(last)
while len(stack) > 1:
    g = stack.pop()
    if len(g['g']) > 0:
        stack[-1]['g'].append(g)

collapse_single(stack[-1])
if options.sort:
    sort_tree(stack[-1])
if options.output:
    with codecs.open(options.output, 'w', 'utf-8') as f:
        json.dump(stack[-1], f, ensure_ascii=True, indent=1)
else:
    print(json.dumps(stack[-1], ensure_ascii=True, indent=1))
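For orientation (schematic, made-up values, not from the commit): the resulting countries.txt is a JSON tree whose root carries 'v' and a 'g' list of groups; leaves get the 's' size and 'sha1_base64' hash computed above plus optional 'old', 'affiliations', 'c' and 'lang' fields:

    {
     "v": 151231,
     "id": "Countries",
     "g": [
      {"id": "Austria", "g": [
       {"id": "Austria_Salzburg", "s": 123456, "sha1_base64": "...", "lang": ["de"]}
      ]}
     ]
    }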

@@ -47,14 +47,16 @@ def parse_mwm(mwm_name, osm2ft_name, override_version, types_name):
for feature in mwm_file.iter_features(metadata=True):
    osm_id = ft2osm.get(feature['id'], None)
    if osm_id is None:
        if 'metadata' in feature and 'ref:sponsored' in feature[
                'metadata']:
            for t in feature['header']['types']:
                if t.startswith('sponsored-'):
                    QUEUES['sponsored'].put(
                        (feature['metadata']['ref:sponsored'],
                         feature['id'],
                         mwm_id,
                         version,
                         SOURCE_TYPES[t[t.find('-') + 1:]]))
                    break
    else:
        for t in feature['header']['types']:

@@ -83,18 +85,24 @@ def write_csv(output_dir, qtype):
def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s',
                        datefmt='%H:%M:%S')
    parser = argparse.ArgumentParser(
        description='Prepares CSV files for uploading to localads database from mwm files.')
    parser.add_argument('mwm', help='path to mwm files')
    parser.add_argument('--osm2ft',
                        help='path to osm2ft files (default is the same as mwm)')
    parser.add_argument('--output', default='.',
                        help='path to generated files ("." by default)')
    types_default = os.path.join(os.path.dirname(sys.argv[0]),
                                 '..', '..', '..', 'data', 'types.txt')
    parser.add_argument('--types', default=types_default,
                        help='path to omim/data/types.txt')
    parser.add_argument('--threads', type=int,
                        help='number of threads to process files')
    parser.add_argument('--version', type=int, help='override mwm version')
    parser.add_argument('--debug', action='store_true',
                        help='debug parse_mwm call')
    args = parser.parse_args()
    if not args.osm2ft:
        args.osm2ft = args.mwm

@@ -103,18 +111,22 @@ def main():
    os.mkdir(args.output)

    # Create CSV writer processes for each queue and a pool of MWM readers.
    writers = [Process(target=write_csv, args=(args.output, qtype)) for qtype in
               QUEUES]
    for w in writers:
        w.start()
    pool = Pool(processes=args.threads)
    for mwm_name in os.listdir(args.mwm):
        if 'World' in mwm_name or 'minsk_pass' in mwm_name or not mwm_name.endswith(
                '.mwm'):
            continue
        osm2ft_name = os.path.join(args.osm2ft,
                                   os.path.basename(mwm_name) + '.osm2ft')
        if not os.path.exists(osm2ft_name):
            logging.error('Cannot find %s', osm2ft_name)
            sys.exit(2)
        parse_mwm_args = (
            os.path.join(args.mwm, mwm_name), osm2ft_name, args.version, args.types)
        if args.debug:
            parse_mwm(*parse_mwm_args)
        else:
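A minimal sketch of the process layout used above (generic stand-in code; the real script's QUEUES, write_csv and parse_mwm are assumed to follow this shape): dedicated writer processes drain shared queues into files while a Pool of workers parses inputs and feeds the queues.

    from multiprocessing import Manager, Pool, Process

    def writer(q):
        # drain the queue until the None sentinel arrives
        while True:
            item = q.get()
            if item is None:
                break
            print('row:', item)

    def worker(args):
        q, n = args
        q.put(n * n)           # stand-in for a parser pushing result rows

    if __name__ == '__main__':
        q = Manager().Queue()  # a queue that Pool workers can receive as an argument
        w = Process(target=writer, args=(q,))
        w.start()
        with Pool(processes=2) as pool:
            pool.map(worker, [(q, n) for n in range(5)])
        q.put(None)            # tell the writer to stop
        w.join()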