From 334c5d55340b86864303b04df468efb8f5b41809 Mon Sep 17 00:00:00 2001 From: Arsentiy Milchakov Date: Wed, 6 Nov 2019 16:45:57 +0300 Subject: [PATCH] [generator][promo] inject countries osm ids into countries.txt --- tools/python/maps_generator/generator/env.py | 5 + .../maps_generator/generator/settings.py | 2 + tools/python/maps_generator/maps_generator.py | 8 +- .../var/etc/map_generator.ini.default | 1 + tools/python/post_generation/__main__.py | 12 +- .../post_generation/inject_promo_ids.py | 172 ++++++++++-------- 6 files changed, 114 insertions(+), 86 deletions(-) diff --git a/tools/python/maps_generator/generator/env.py b/tools/python/maps_generator/generator/env.py index b8681d0a79..0b16394621 100644 --- a/tools/python/maps_generator/generator/env.py +++ b/tools/python/maps_generator/generator/env.py @@ -202,6 +202,11 @@ class Env: def promo_catalog_cities_path(self): return os.path.join(self.intermediate_path, "promo_catalog_cities.json") + @property + def promo_catalog_countries_path(self): + return os.path.join(self.intermediate_path, + "promo_catalog_countries.json") + @property def popularity_path(self): return os.path.join(self.intermediate_path, "popular_places.csv") diff --git a/tools/python/maps_generator/generator/settings.py b/tools/python/maps_generator/generator/settings.py index f03c931c0a..a6f216502b 100644 --- a/tools/python/maps_generator/generator/settings.py +++ b/tools/python/maps_generator/generator/settings.py @@ -46,6 +46,7 @@ PLANET_COASTS_URL = "" UGC_URL = "" HOTELS_URL = "" PROMO_CATALOG_CITIES_URL = "" +PROMO_CATALOG_COUNTRIES_URL = "" POPULARITY_URL= "" SUBWAY_URL = "" FOOD_URL = "" @@ -120,6 +121,7 @@ PLANET_COASTS_URL = _get_opt_path(config, "External", "PLANET_COASTS_URL", PLANE UGC_URL = _get_opt_path(config, "External", "UGC_URL", UGC_URL) HOTELS_URL = _get_opt_path(config, "External", "HOTELS_URL", HOTELS_URL) PROMO_CATALOG_CITIES_URL = _get_opt_path(config, "External", "PROMO_CATALOG_CITIES_URL", 
PROMO_CATALOG_CITIES_URL) +PROMO_CATALOG_COUNTRIES_URL = _get_opt_path(config, "External", "PROMO_CATALOG_COUNTRIES_URL", PROMO_CATALOG_COUNTRIES_URL) POPULARITY_URL = _get_opt_path(config, "External", "POPULARITY_URL", POPULARITY_URL) SUBWAY_URL = _get_opt(config, "External", "SUBWAY_URL", SUBWAY_URL) FOOD_URL = _get_opt(config, "External", "FOOD_URL", FOOD_URL) diff --git a/tools/python/maps_generator/maps_generator.py b/tools/python/maps_generator/maps_generator.py index 1857235711..97fd903a5c 100644 --- a/tools/python/maps_generator/maps_generator.py +++ b/tools/python/maps_generator/maps_generator.py @@ -14,7 +14,7 @@ from descriptions.descriptions_downloader import (check_and_get_checker, download_from_wikidata_tags) from filelock import FileLock from post_generation.hierarchy_to_countries import hierarchy_to_countries -from post_generation.inject_promo_cities import inject_promo_cities +from post_generation.inject_promo_ids import inject_promo_ids from post_generation.localads_mwm_to_csv import create_csv from .generator import stages @@ -63,6 +63,7 @@ def stage_download_production_external(env): settings.UGC_URL: env.ugc_path, settings.HOTELS_URL: env.hotels_path, settings.PROMO_CATALOG_CITIES_URL: env.promo_catalog_cities_path, + settings.PROMO_CATALOG_COUNTRIES_URL: env.promo_catalog_countries_path, settings.POPULARITY_URL: env.popularity_path, settings.FOOD_URL: env.food_paths, settings.FOOD_TRANSLATIONS_URL: env.food_translations_path, @@ -235,8 +236,9 @@ def stage_countries_txt(env): env.mwm_version) if env.is_accepted_stage(stage_download_production_external): countries_json = json.loads(countries) - inject_promo_cities(countries_json, env.promo_catalog_cities_path, - env.mwm_path, env.types_path, env.mwm_path) + inject_promo_ids(countries_json, env.promo_catalog_cities_path, + env.promo_catalog_countries_path, env.mwm_path, + env.types_path, env.mwm_path) countries = json.dumps(countries_json, ensure_ascii=True, indent=1) with 
open(env.counties_txt_path, "w") as f: diff --git a/tools/python/maps_generator/var/etc/map_generator.ini.default b/tools/python/maps_generator/var/etc/map_generator.ini.default index 98d23b97a0..8d7bc3f1cd 100644 --- a/tools/python/maps_generator/var/etc/map_generator.ini.default +++ b/tools/python/maps_generator/var/etc/map_generator.ini.default @@ -31,6 +31,7 @@ SUBWAY_URL: http://osm-subway.maps.me/mapsme/latest.json # UGC_URL: # HOTELS_URL: # PROMO_CATALOG_CITIES_URL: +# PROMO_CATALOG_COUNTRIES_URL: # POPULARITY_URL: # FOOD_URL: # FOOD_TRANSLATIONS_URL: diff --git a/tools/python/post_generation/__main__.py b/tools/python/post_generation/__main__.py index 484446c45d..833417deb8 100644 --- a/tools/python/post_generation/__main__.py +++ b/tools/python/post_generation/__main__.py @@ -4,7 +4,7 @@ import os import sys from .hierarchy_to_countries import hierarchy_to_countries as hierarchy_to_countries_ -from .inject_promo_ids import inject_promo_cities +from .inject_promo_ids import inject_promo_ids from .localads_mwm_to_csv import create_csv @@ -16,7 +16,7 @@ class PostGeneration: The post_generation commands are: localads_mwm_to_csv Prepares CSV files for uploading to localads database from mwm files. hierarchy_to_countries Produces countries.txt from hierarchy.txt. 
- inject_promo_cities Injects promo cities osm ids into countries.txt + inject_promo_ids Injects promo osm ids into countries.txt """) parser.add_argument("command", help="Subcommand to run") args = parser.parse_args(sys.argv[1:2]) @@ -88,7 +88,7 @@ The post_generation commands are: print(countries_json) @staticmethod - def inject_promo_cities(): + def inject_promo_ids(): parser = argparse.ArgumentParser( description="Injects promo cities osm ids into countries.txt") parser.add_argument("--mwm", required=True, help="path to mwm files") @@ -96,6 +96,8 @@ The post_generation commands are: help="path to omim/data/types.txt") parser.add_argument("--promo_cities", required=True, help="Path to promo cities file") + parser.add_argument("--promo_countries", required=True, + help="Path to promo countries file") parser.add_argument("--osm2ft", help="path to osm2ft files (default is the same as mwm)") parser.add_argument("--countries", @@ -114,8 +116,8 @@ The post_generation commands are: with open(args.countries) as f: countries = json.load(f) - inject_promo_cities(countries, args.promo_cities, args.mwm, args.types, - args.osm2ft) + inject_promo_ids(countries, args.promo_cities, args.promo_countries, + args.mwm, args.types, args.osm2ft) with open(args.output, "w") as f: json.dump(countries, f, indent=1) diff --git a/tools/python/post_generation/inject_promo_ids.py b/tools/python/post_generation/inject_promo_ids.py index 2edb60172b..8eb31af5b7 100644 --- a/tools/python/post_generation/inject_promo_ids.py +++ b/tools/python/post_generation/inject_promo_ids.py @@ -4,20 +4,45 @@ import os import re import sys +from collections import defaultdict from multiprocessing import Pool from mwm import mwm -class PromoCities(object): - def __init__(self, cities, mwm_path, types_path, osm2ft_path): +class PromoIds(object): + def __init__(self, countries, cities, mwm_path, types_path, osm2ft_path): + self.countries = countries self.cities = cities self.mwm_path = mwm_path self.types_path = 
types_path self.osm2ft_path = osm2ft_path - def find(self, leaf_id): - result = [] + def inject_into_country(self, country): + nodes = self._get_nodes(country) + with Pool() as pool: + proposed_ids = pool.map(self._find, (n["id"] for n in nodes), + chunksize=1) + + countries_ids = [ids for node_ids in proposed_ids for ids in + node_ids["countries"]] + if countries_ids: + country["top_countries_geo_ids"] = countries_ids + + for idx, node_ids in enumerate(proposed_ids): + if not node_ids["cities"]: + continue + node = nodes[idx] + best = self._choose_best_city(node_ids["cities"]) + node["top_city_geo_id"] = best["id"] + if best["id"] < 0: + node["top_city_geo_id"] += (1 << 64) + + def _find(self, leaf_id): + result = { + "countries": [], + "cities": [] + } ft2osm = load_osm2ft(self.osm2ft_path, leaf_id) with open(os.path.join(self.mwm_path, leaf_id + ".mwm"), "rb") as f: mwm_file = mwm.MWM(f) @@ -27,63 +52,80 @@ class PromoCities(object): osm_id = ft2osm.get(feature["id"], None) types = feature["header"]["types"] - if "sponsored-promo_catalog" not in types or osm_id not in self.cities: - continue + if "sponsored-promo_catalog" in types and osm_id in self.cities: + city = self._get_city(osm_id, types) + result["cities"].append(city) - city = { - "id": osm_id, - "count_of_guides": self.cities[osm_id], - "types": [] - } - - for t in types: - if t.startswith("place"): - city["types"].append(t) - - if not city["types"]: - logging.error(f"Incorrect types for sponsored-promo_catalog " - f"feature osm_id {osm_id}") - sys.exit(3) - - result.append(city) + if "place-country" in types and osm_id in self.countries: + result["countries"].append(osm_id) return result @staticmethod - def choose_best_city(proposed_cities): + def _get_nodes(root): + def __get_nodes(node, mwm_nodes): + if "g" in node: + for item in node["g"]: + __get_nodes(item, mwm_nodes) + else: + mwm_nodes.append(node) + + mwm_nodes = [] + __get_nodes(root, mwm_nodes) + return mwm_nodes + + def _get_city(self, 
osm_id, types): + city = { + "id": osm_id, + "count_of_guides": self.cities[osm_id], + "types": [] + } + + for t in types: + if t.startswith("place"): + city["types"].append(t) + + if not city["types"]: + logging.error(f"Incorrect types for sponsored-promo_catalog " + f"feature osm_id {osm_id}") + sys.exit(3) + + return city + + def _choose_best_city(self, proposed_cities): def key_compare(city): - return city["count_of_guides"], score_types(city["types"]) + return city["count_of_guides"], self._score_city_types( + city["types"]) return max(proposed_cities, key=key_compare) + def _score_city_types(self, types): + return max([self._city_type_to_int(t) for t in types]) -def place_type_to_int(t): - if t == "place-town": - return 1 - if t == "place-city": - return 2 + @staticmethod + def _city_type_to_int(t): + if t == "place-town": + return 1 + if t == "place-city": + return 2 - m = re.match(r"^place-city-capital?(-(?P<admin_level>\d+)|)$", t) - if m: - admin_level = int(m.groupdict("1")["admin_level"]) - if 1 <= admin_level <= 12: - return 14 - admin_level - return 0 + m = re.match(r"^place-city-capital?(-(?P<admin_level>\d+)|)$", t) + if m: + admin_level = int(m.groupdict("1")["admin_level"]) + if 1 <= admin_level <= 12: + return 14 - admin_level + return 0 -def score_types(types): - return max([place_type_to_int(t) for t in types]) - - -def load_cities(path): +def load_promo_ids(path): with open(path) as f: - cities_list = json.load(f) + root = json.load(f) - cities = {} - for city in cities_list["data"]: - cities[city["osmid"]] = city["paid_bundles_count"] + ids = {} + for item in root["data"]: + ids[item["osmid"]] = item["paid_bundles_count"] - return cities + return ids def load_osm2ft(osm2ft_path, mwm_id): @@ -95,36 +137,10 @@ def load_osm2ft(osm2ft_path, mwm_id): return mwm.read_osm2ft(f, ft2osm=True, tuples=False) -def get_nodes(node): - def _get_nodes(node, mwm_nodes): - if "g" in node: - for item in node["g"]: - _get_nodes(item, mwm_nodes) - else: - mwm_nodes.append(node) - - 
mwm_nodes = [] - _get_nodes(node, mwm_nodes) - return mwm_nodes - - -def inject_into_leafs(node, cities): - nodes = get_nodes(node) - with Pool() as pool: - proposed_cities_list = pool.map(cities.find, (n["id"] for n in nodes), - chunksize=1) - for idx, proposed_cities in enumerate(proposed_cities_list): - if not proposed_cities: - continue - node = nodes[idx] - best = cities.choose_best_city(proposed_cities) - node["top_city_geo_id"] = best["id"] - if best["id"] < 0: - node["top_city_geo_id"] += (1 << 64) - - -def inject_promo_cities(countries_json, promo_cities_path, mwm_path, types_path, - osm2ft_path): - cities = PromoCities(load_cities(promo_cities_path), mwm_path, types_path, - osm2ft_path) - inject_into_leafs(countries_json, cities) +def inject_promo_ids(countries_json, promo_cities_path, promo_countries_path, + mwm_path, types_path, osm2ft_path): + promo_ids = PromoIds(load_promo_ids(promo_countries_path), + load_promo_ids(promo_cities_path), mwm_path, + types_path, osm2ft_path) + for country in countries_json["g"]: + promo_ids.inject_into_country(country)