[generator][promo] inject countries osm ids into countries.txt

This commit is contained in:
Arsentiy Milchakov 2019-11-06 16:45:57 +03:00 committed by Maksim Andrianov
parent 3211f04247
commit 334c5d5534
6 changed files with 114 additions and 86 deletions

View file

@ -202,6 +202,11 @@ class Env:
def promo_catalog_cities_path(self):
return os.path.join(self.intermediate_path, "promo_catalog_cities.json")
@property
def promo_catalog_countries_path(self):
    """Location of the promo catalog countries JSON in the intermediate dir."""
    file_name = "promo_catalog_countries.json"
    return os.path.join(self.intermediate_path, file_name)
@property
def popularity_path(self):
    """Location of the popular places CSV in the intermediate dir."""
    file_name = "popular_places.csv"
    return os.path.join(self.intermediate_path, file_name)

View file

@ -46,6 +46,7 @@ PLANET_COASTS_URL = ""
UGC_URL = ""
HOTELS_URL = ""
PROMO_CATALOG_CITIES_URL = ""
PROMO_CATALOG_COUNTRIES_URL = ""
POPULARITY_URL= ""
SUBWAY_URL = ""
FOOD_URL = ""
@ -120,6 +121,7 @@ PLANET_COASTS_URL = _get_opt_path(config, "External", "PLANET_COASTS_URL", PLANE
UGC_URL = _get_opt_path(config, "External", "UGC_URL", UGC_URL)
HOTELS_URL = _get_opt_path(config, "External", "HOTELS_URL", HOTELS_URL)
PROMO_CATALOG_CITIES_URL = _get_opt_path(config, "External", "PROMO_CATALOG_CITIES_URL", PROMO_CATALOG_CITIES_URL)
PROMO_CATALOG_COUNTRIES_URL = _get_opt_path(config, "External", "PROMO_CATALOG_COUNTRIES_URL", PROMO_CATALOG_COUNTRIES_URL)
POPULARITY_URL = _get_opt_path(config, "External", "POPULARITY_URL", POPULARITY_URL)
SUBWAY_URL = _get_opt(config, "External", "SUBWAY_URL", SUBWAY_URL)
FOOD_URL = _get_opt(config, "External", "FOOD_URL", FOOD_URL)

View file

@ -14,7 +14,7 @@ from descriptions.descriptions_downloader import (check_and_get_checker,
download_from_wikidata_tags)
from filelock import FileLock
from post_generation.hierarchy_to_countries import hierarchy_to_countries
from post_generation.inject_promo_cities import inject_promo_cities
from post_generation.inject_promo_ids import inject_promo_ids
from post_generation.localads_mwm_to_csv import create_csv
from .generator import stages
@ -63,6 +63,7 @@ def stage_download_production_external(env):
settings.UGC_URL: env.ugc_path,
settings.HOTELS_URL: env.hotels_path,
settings.PROMO_CATALOG_CITIES_URL: env.promo_catalog_cities_path,
settings.PROMO_CATALOG_COUNTRIES_URL: env.promo_catalog_countries_path,
settings.POPULARITY_URL: env.popularity_path,
settings.FOOD_URL: env.food_paths,
settings.FOOD_TRANSLATIONS_URL: env.food_translations_path,
@ -235,8 +236,9 @@ def stage_countries_txt(env):
env.mwm_version)
if env.is_accepted_stage(stage_download_production_external):
countries_json = json.loads(countries)
inject_promo_cities(countries_json, env.promo_catalog_cities_path,
env.mwm_path, env.types_path, env.mwm_path)
inject_promo_ids(countries_json, env.promo_catalog_cities_path,
env.promo_catalog_countries_path, env.mwm_path,
env.types_path, env.mwm_path)
countries = json.dumps(countries_json, ensure_ascii=True, indent=1)
with open(env.counties_txt_path, "w") as f:

View file

@ -31,6 +31,7 @@ SUBWAY_URL: http://osm-subway.maps.me/mapsme/latest.json
# UGC_URL:
# HOTELS_URL:
# PROMO_CATALOG_CITIES_URL:
# PROMO_CATALOG_COUNTRIES_URL:
# POPULARITY_URL:
# FOOD_URL:
# FOOD_TRANSLATIONS_URL:

View file

@ -4,7 +4,7 @@ import os
import sys
from .hierarchy_to_countries import hierarchy_to_countries as hierarchy_to_countries_
from .inject_promo_ids import inject_promo_cities
from .inject_promo_ids import inject_promo_ids
from .localads_mwm_to_csv import create_csv
@ -16,7 +16,7 @@ class PostGeneration:
The post_generation commands are:
localads_mwm_to_csv Prepares CSV files for uploading to localads database from mwm files.
hierarchy_to_countries Produces countries.txt from hierarchy.txt.
inject_promo_cities Injects promo cities osm ids into countries.txt
inject_promo_ids Injects promo osm ids into countries.txt
""")
parser.add_argument("command", help="Subcommand to run")
args = parser.parse_args(sys.argv[1:2])
@ -88,7 +88,7 @@ The post_generation commands are:
print(countries_json)
@staticmethod
def inject_promo_cities():
def inject_promo_ids():
parser = argparse.ArgumentParser(
description="Injects promo cities osm ids into countries.txt")
parser.add_argument("--mwm", required=True, help="path to mwm files")
@ -96,6 +96,8 @@ The post_generation commands are:
help="path to omim/data/types.txt")
parser.add_argument("--promo_cities", required=True,
help="Path to promo cities file")
parser.add_argument("--promo_countries", required=True,
help="Path to promo countries file")
parser.add_argument("--osm2ft",
help="path to osm2ft files (default is the same as mwm)")
parser.add_argument("--countries",
@ -114,8 +116,8 @@ The post_generation commands are:
with open(args.countries) as f:
countries = json.load(f)
inject_promo_cities(countries, args.promo_cities, args.mwm, args.types,
args.osm2ft)
inject_promo_ids(countries, args.promo_cities, args.promo_countries,
args.mwm, args.types, args.osm2ft)
with open(args.output, "w") as f:
json.dump(countries, f, indent=1)

View file

@ -4,20 +4,45 @@ import os
import re
import sys
from collections import defaultdict
from multiprocessing import Pool
from mwm import mwm
class PromoCities(object):
def __init__(self, cities, mwm_path, types_path, osm2ft_path):
class PromoIds(object):
def __init__(self, countries, cities, mwm_path, types_path, osm2ft_path):
self.countries = countries
self.cities = cities
self.mwm_path = mwm_path
self.types_path = types_path
self.osm2ft_path = osm2ft_path
def find(self, leaf_id):
result = []
def inject_into_country(self, country):
    """Annotate a country subtree with the promo ids found in its mwm files.

    The leaves of ``country`` are collected with ``self._get_nodes`` and
    ``self._find`` is run for every leaf id in a process pool.

    * All ``"countries"`` ids reported by the leaves are concatenated in
      leaf order and, when there is at least one, stored on ``country``
      under ``"top_countries_geo_ids"``.
    * For every leaf that reported cities, the best one (see
      ``self._choose_best_city``) has its id stored on the leaf under
      ``"top_city_geo_id"``.

    ``country`` and its leaves are modified in place; nothing is returned.
    """
    leaves = self._get_nodes(country)
    leaf_ids = (leaf["id"] for leaf in leaves)
    with Pool() as pool:
        # chunksize=1: each leaf (one mwm file) is dispatched as its own task.
        found = pool.map(self._find, leaf_ids, chunksize=1)

    # NOTE(review): country ids are stored exactly as found, while city ids
    # below get the 2**64 adjustment when negative -- confirm this is
    # intended.
    top_countries = []
    for per_leaf in found:
        top_countries.extend(per_leaf["countries"])
    if top_countries:
        country["top_countries_geo_ids"] = top_countries

    # pool.map keeps input order, so results line up with ``leaves``.
    for leaf, per_leaf in zip(leaves, found):
        candidates = per_leaf["cities"]
        if not candidates:
            continue
        best_id = self._choose_best_city(candidates)["id"]
        leaf["top_city_geo_id"] = best_id
        if best_id < 0:
            # Negative ids are shifted by 2**64 (presumably to store the
            # id as an unsigned 64-bit value -- confirm).
            leaf["top_city_geo_id"] = best_id + (1 << 64)
def _find(self, leaf_id):
result = {
"countries": [],
"cities": []
}
ft2osm = load_osm2ft(self.osm2ft_path, leaf_id)
with open(os.path.join(self.mwm_path, leaf_id + ".mwm"), "rb") as f:
mwm_file = mwm.MWM(f)
@ -27,63 +52,80 @@ class PromoCities(object):
osm_id = ft2osm.get(feature["id"], None)
types = feature["header"]["types"]
if "sponsored-promo_catalog" not in types or osm_id not in self.cities:
continue
if "sponsored-promo_catalog" in types and osm_id in self.cities:
city = self._get_city(osm_id, types)
result["cities"].append(city)
city = {
"id": osm_id,
"count_of_guides": self.cities[osm_id],
"types": []
}
for t in types:
if t.startswith("place"):
city["types"].append(t)
if not city["types"]:
logging.error(f"Incorrect types for sponsored-promo_catalog "
f"feature osm_id {osm_id}")
sys.exit(3)
result.append(city)
if "place-country" in types and osm_id in self.countries:
result["countries"].append(osm_id)
return result
@staticmethod
def choose_best_city(proposed_cities):
def _get_nodes(root):
def __get_nodes(node, mwm_nodes):
if "g" in node:
for item in node["g"]:
__get_nodes(item, mwm_nodes)
else:
mwm_nodes.append(node)
mwm_nodes = []
__get_nodes(root, mwm_nodes)
return mwm_nodes
def _get_city(self, osm_id, types):
city = {
"id": osm_id,
"count_of_guides": self.cities[osm_id],
"types": []
}
for t in types:
if t.startswith("place"):
city["types"].append(t)
if not city["types"]:
logging.error(f"Incorrect types for sponsored-promo_catalog "
f"feature osm_id {osm_id}")
sys.exit(3)
return city
def _choose_best_city(self, proposed_cities):
def key_compare(city):
return city["count_of_guides"], score_types(city["types"])
return city["count_of_guides"], self._score_city_types(
city["types"])
return max(proposed_cities, key=key_compare)
def _score_city_types(self, types):
return max([self._city_type_to_int(t) for t in types])
def place_type_to_int(t):
if t == "place-town":
return 1
if t == "place-city":
return 2
@staticmethod
def _city_type_to_int(t):
if t == "place-town":
return 1
if t == "place-city":
return 2
m = re.match(r"^place-city-capital?(-(?P<admin_level>\d+)|)$", t)
if m:
admin_level = int(m.groupdict("1")["admin_level"])
if 1 <= admin_level <= 12:
return 14 - admin_level
return 0
m = re.match(r"^place-city-capital?(-(?P<admin_level>\d+)|)$", t)
if m:
admin_level = int(m.groupdict("1")["admin_level"])
if 1 <= admin_level <= 12:
return 14 - admin_level
return 0
def score_types(types):
return max([place_type_to_int(t) for t in types])
def load_cities(path):
def load_promo_ids(path):
    """Load a promo catalog JSON file into an ``{osmid: count}`` mapping.

    The file must hold an object whose ``"data"`` entry is a list of items,
    each with ``"osmid"`` and ``"paid_bundles_count"`` keys. If the same
    ``osmid`` occurs more than once, the last item wins.

    :param path: path to the JSON file.
    :return: dict mapping every ``osmid`` to its ``paid_bundles_count``.
    """
    # JSON is UTF-8; do not depend on the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        root = json.load(f)

    return {item["osmid"]: item["paid_bundles_count"]
            for item in root["data"]}
def load_osm2ft(osm2ft_path, mwm_id):
@ -95,36 +137,10 @@ def load_osm2ft(osm2ft_path, mwm_id):
return mwm.read_osm2ft(f, ft2osm=True, tuples=False)
def get_nodes(node):
def _get_nodes(node, mwm_nodes):
if "g" in node:
for item in node["g"]:
_get_nodes(item, mwm_nodes)
else:
mwm_nodes.append(node)
mwm_nodes = []
_get_nodes(node, mwm_nodes)
return mwm_nodes
def inject_into_leafs(node, cities):
nodes = get_nodes(node)
with Pool() as pool:
proposed_cities_list = pool.map(cities.find, (n["id"] for n in nodes),
chunksize=1)
for idx, proposed_cities in enumerate(proposed_cities_list):
if not proposed_cities:
continue
node = nodes[idx]
best = cities.choose_best_city(proposed_cities)
node["top_city_geo_id"] = best["id"]
if best["id"] < 0:
node["top_city_geo_id"] += (1 << 64)
def inject_promo_cities(countries_json, promo_cities_path, mwm_path, types_path,
osm2ft_path):
cities = PromoCities(load_cities(promo_cities_path), mwm_path, types_path,
osm2ft_path)
inject_into_leafs(countries_json, cities)
def inject_promo_ids(countries_json, promo_cities_path, promo_countries_path,
                     mwm_path, types_path, osm2ft_path):
    """Inject promo country and city ids into a countries tree, in place.

    Both promo files are read with ``load_promo_ids`` and every top-level
    entry of ``countries_json["g"]`` is then processed with
    ``PromoIds.inject_into_country``.

    :param countries_json: parsed countries tree; modified in place.
    :param promo_cities_path: path to the promo cities file.
    :param promo_countries_path: path to the promo countries file.
    :param mwm_path: passed through to ``PromoIds``.
    :param types_path: passed through to ``PromoIds``.
    :param osm2ft_path: passed through to ``PromoIds``.
    """
    promo_countries = load_promo_ids(promo_countries_path)
    promo_cities = load_promo_ids(promo_cities_path)
    injector = PromoIds(promo_countries, promo_cities, mwm_path, types_path,
                        osm2ft_path)

    for top_level_entry in countries_json["g"]:
        injector.inject_into_country(top_level_entry)