[python] Refactoring: Changed formatting.

This commit is contained in:
Maksim Andrianov 2020-06-05 17:23:26 +03:00 committed by Tatiana Yan
parent 986b4ac0d2
commit 8c0dc3139d
10 changed files with 388 additions and 190 deletions

View file

@ -14,20 +14,39 @@ from booking.download_test_data import download_test_data
def process_options():
    """Parse command-line options for the Booking.com hotel downloader.

    Returns:
        argparse.Namespace with the parsed options; --password, --user and
        --output are required.
    """
    parser = argparse.ArgumentParser(description="Download and process booking hotels.")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument(
        "--logfile", default="", help="Name and destination for log file"
    )
    parser.add_argument(
        "--password",
        required=True,
        dest="password",
        help="Booking.com account password",
    )
    parser.add_argument(
        "--user", required=True, dest="user", help="Booking.com account user name"
    )
    parser.add_argument(
        "--threads_count",
        default=1,
        type=int,
        help="The number of threads for processing countries.",
    )
    parser.add_argument(
        "--output",
        required=True,
        dest="output",
        help="Name and destination for output file",
    )
    parser.add_argument(
        "--country_code",
        default=None,
        action="append",
        help="Download hotels of this country.",
    )
    # NOTE(review): default=False without type=/action= means any value passed
    # on the command line arrives as a truthy string; presumably intended as a
    # boolean flag -- confirm with callers.
    parser.add_argument(
        "--download_test_dataset", default=False, help="Download dataset for tests."
    )
    options = parser.parse_args()
    return options
@ -43,16 +62,34 @@ def main():
logfile = os.path.join(os.path.dirname(os.path.realpath(__file__)), name)
print(f"Logs saved to {logfile}.", file=sys.stdout)
if options.threads_count > 1:
print(f"Limit requests per minute is {LIMIT_REQUESTS_PER_MINUTE}.", file=sys.stdout)
logging.basicConfig(level=logging.DEBUG, filename=logfile,
format="%(thread)d [%(asctime)s] %(levelname)s: %(message)s")
print(
f"Limit requests per minute is {LIMIT_REQUESTS_PER_MINUTE}.",
file=sys.stdout,
)
logging.basicConfig(
level=logging.DEBUG,
filename=logfile,
format="%(thread)d [%(asctime)s] %(levelname)s: %(message)s",
)
with tqdm(disable=not options.verbose) as progress_bar:
if options.download_test_dataset:
download_test_data(options.country_code, options.user, options.password,
options.output, options.threads_count, progress_bar)
download_test_data(
options.country_code,
options.user,
options.password,
options.output,
options.threads_count,
progress_bar,
)
else:
download(options.country_code, options.user, options.password,
options.output, options.threads_count, progress_bar)
download(
options.country_code,
options.user,
options.password,
options.output,
options.threads_count,
progress_bar,
)
main()

View file

@ -16,11 +16,7 @@ MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS = (30, 120)
class BookingApi:
ENDPOINTS = {
"countries": "list",
"hotels": "list",
"districts": "list"
}
ENDPOINTS = {"countries": "list", "hotels": "list", "districts": "list"}
def __init__(self, login, password, version):
major_minor = version.split(".")
@ -46,9 +42,12 @@ class BookingApi:
attempts -= 1
response = None
try:
response = requests.get(f"{self._base_url}/{endpoint}",
auth=(self._login, self._password),
params=params, timeout=self._timeout)
response = requests.get(
f"{self._base_url}/{endpoint}",
auth=(self._login, self._password),
params=params,
timeout=self._timeout,
)
except requests.exceptions.ReadTimeout:
logging.exception("Timeout error.")
continue
@ -60,8 +59,9 @@ class BookingApi:
try:
data = response.json()
except json.decoder.JSONDecodeError:
logging.exception(f"JSON decode error. "
f"Content: {response.content}")
logging.exception(
f"JSON decode error. " f"Content: {response.content}"
)
continue
if code == 200:
@ -78,13 +78,14 @@ class BookingApi:
if code == 429:
self._event.clear()
wait_seconds = randint(*MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS)
logging.warning(f"Http error {code}: {data}. "
f"It waits {wait_seconds} seconds and tries again.")
logging.warning(
f"Http error {code}: {data}. "
f"It waits {wait_seconds} seconds and tries again."
)
sleep(wait_seconds)
self._event.set()
else:
raise HTTPError(
f"Http error with code {code}: {data}.")
raise HTTPError(f"Http error with code {code}: {data}.")
def _set_endpoints(self):
for endpoint in BookingApi.ENDPOINTS:
@ -110,11 +111,10 @@ class BookingListApi:
return result
def _call_endpoint_offset(self, offset, endpoint, **params):
    """Fetch one page of a list endpoint.

    Args:
        offset: Row offset into the full result set.
        endpoint: Endpoint name to call.
        **params: Extra query parameters forwarded to the API call.

    Returns:
        The list of rows returned by the API for this page.

    Raises:
        TypeError: If the API response is not a list.
    """
    r = self.api.call_endpoint(
        endpoint,
        **{"offset": offset, "rows": BookingListApi._ROWS_BY_REQUEST, **params},
    )
    if not isinstance(r, list):
        raise TypeError(f"Result has unexpected type {type(r)}")
    return r

View file

@ -14,9 +14,35 @@ from booking.api.booking_api import BookingApi
from booking.api.booking_api import BookingListApi
from booking.api.exceptions import GettingMinPriceError
# Languages in which hotel name/address translations are downloaded.
SUPPORTED_LANGUAGES = (
    "en",
    "ru",
    "ar",
    "cs",
    "da",
    "nl",
    "fi",
    "fr",
    "de",
    "hu",
    "id",
    "it",
    "ja",
    "ko",
    "pl",
    "pt",
    "ro",
    "es",
    "sv",
    "th",
    "tr",
    "uk",
    "vi",
    "zh",
    "he",
    "sk",
    "el",
)
class BookingGen:
@ -55,11 +81,17 @@ class BookingGen:
return self.api.hotels(country_ids=self.country_code, **params)
def _download_translations(self):
extras = ["hotel_info", ]
extras = [
"hotel_info",
]
translations = defaultdict(dict)
with ThreadPoolExecutor(max_workers=len(SUPPORTED_LANGUAGES)) as executor:
m = {executor.submit(self._download_hotels, extras=extras, language=lang): lang
for lang in SUPPORTED_LANGUAGES}
m = {
executor.submit(
self._download_hotels, extras=extras, language=lang
): lang
for lang in SUPPORTED_LANGUAGES
}
for future in as_completed(m):
lang = m[future]
hotels = future.result()
@ -68,7 +100,7 @@ class BookingGen:
hotel_data = hotel["hotel_data"]
translations[hotel_id][lang] = {
"name": BookingGen._format_string(hotel_data["name"]),
"address": BookingGen._format_string(hotel_data["address"])
"address": BookingGen._format_string(hotel_data["address"]),
}
return translations
@ -162,7 +194,7 @@ class BookingGen:
hotel_data["review_score"],
hotel_data["url"],
hotel_data["hotel_type_id"],
self._get_translations(hotel)
self._get_translations(hotel),
)
return sep.join(BookingGen._format_string(str(x)) for x in row)
@ -174,8 +206,9 @@ def download_hotels_by_country(api, country):
return rows
def download(country_code, user, password, path, threads_count,
progress_bar=tqdm(disable=True)):
def download(
country_code, user, password, path, threads_count, progress_bar=tqdm(disable=True)
):
api = BookingApi(user, password, "2.4")
list_api = BookingListApi(api)
countries = list_api.countries(languages="en")
@ -186,8 +219,9 @@ def download(country_code, user, password, path, threads_count,
progress_bar.total = len(countries)
with open(path, "w") as f:
with ThreadPool(threads_count) as pool:
for lines in pool.imap_unordered(partial(download_hotels_by_country, list_api),
countries):
for lines in pool.imap_unordered(
partial(download_hotels_by_country, list_api), countries
):
f.writelines([f"{x}\n" for x in lines])
progress_bar.update()
logging.info(f"Hotels were saved to {path}.")

View file

@ -49,6 +49,7 @@ class BookingGen:
)
return sep.join(BookingGen._format_string(str(x)) for x in row)
def create_tsv_header(sep="\t"):
row = (
"Hotel ID",
@ -68,17 +69,18 @@ def download_hotels_by_country(api, district_names, country):
return rows
def download_test_data(country_code, user, password, path, threads_count,
progress_bar=tqdm(disable=True)):
def download_test_data(
country_code, user, password, path, threads_count, progress_bar=tqdm(disable=True)
):
logging.info(f"Starting test dataset download.")
api = BookingApi(user, password, "2.4")
list_api = BookingListApi(api)
districts = list_api.districts(languages="en")
district_names = {}
for district in districts:
for translation in district['translations']:
if translation['language'] == 'en':
district_names[district['district_id']] = translation['name']
for translation in district["translations"]:
if translation["language"] == "en":
district_names[district["district_id"]] = translation["name"]
countries = list_api.countries(languages="en")
if country_code is not None:
countries = list(filter(lambda x: x["country"] in country_code, countries))
@ -88,8 +90,9 @@ def download_test_data(country_code, user, password, path, threads_count,
with open(path, "w") as f:
f.write(create_tsv_header() + "\n")
with ThreadPool(threads_count) as pool:
for lines in pool.imap_unordered(partial(download_hotels_by_country, list_api, district_names),
countries):
for lines in pool.imap_unordered(
partial(download_hotels_by_country, list_api, district_names), countries
):
f.writelines([f"{x}\n" for x in lines])
progress_bar.update()
logging.info(f"Hotels test dataset saved to {path}.")

View file

@ -13,20 +13,36 @@ from descriptions.descriptions_downloader import log
def parse_args():
    """Parse command-line options for the wiki page downloader.

    Returns:
        argparse.Namespace; --wikipedia is required.
    """
    parser = argparse.ArgumentParser(description="Download wiki pages.")
    parser.add_argument(
        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
    )
    parser.add_argument(
        "--popularity",
        metavar="PATH",
        type=str,
        help="File with popular object ids for which we "
        "download wikipedia data. If not given, download "
        "for all objects.",
    )
    parser.add_argument(
        "--wikipedia",
        metavar="PATH",
        type=str,
        required=True,
        help="Input file with wikipedia url.",
    )
    parser.add_argument(
        "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
    )
    # action="append" with nargs="+" yields a list of lists: one inner list
    # per --langs occurrence on the command line.
    parser.add_argument(
        "--langs",
        metavar="LANGS",
        type=str,
        nargs="+",
        action="append",
        help="Languages for pages. If left blank, pages in all "
        "available languages will be loaded.",
    )
    return parser.parse_args()

View file

@ -28,13 +28,37 @@ CHUNK_SIZE = 16
REQUEST_ATTEMPTS = 32
ATTEMPTS_PAUSE_MS = 4000

# HTML heading tag names h1..h6, used to locate section boundaries.
HEADERS = {f"h{x}" for x in range(1, 7)}
# Per-language section titles that are stripped from downloaded wiki pages
# (references, external links, bibliographies, etc.).
BAD_SECTIONS = {
    "en": [
        "External links",
        "Sources",
        "See also",
        "Bibliography",
        "Further reading",
        "References",
    ],
    "ru": ["Литература", "Ссылки", "См. также", "Библиография", "Примечания"],
    "de": [
        "Einzelnachweise",
        "Weblinks",
        "Literatur",
        "Siehe auch",
        "Anmerkungen",
        "Anmerkungen und Einzelnachweise",
        "Filme",
        "Einzelbelege",
    ],
    "es": ["Vínculos de interés", "Véase también", "Enlaces externos", "Referencias"],
    "fr": [
        "Bibliographie",
        "Lien externe",
        "Voir aussi",
        "Liens externes",
        "Références",
        "Notes et références",
        "Articles connexes",
    ],
}
@ -45,9 +69,11 @@ def try_get(obj, prop, *args, **kwargs):
attr = getattr(obj, prop)
is_method = isinstance(attr, types.MethodType)
return attr(*args, **kwargs) if is_method else attr
except (requests.exceptions.ConnectionError,
requests.exceptions.ReadTimeout,
json.decoder.JSONDecodeError):
except (
requests.exceptions.ConnectionError,
requests.exceptions.ReadTimeout,
json.decoder.JSONDecodeError,
):
time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
attempts -= 1
except urllib.error.HTTPError as e:
@ -58,8 +84,9 @@ def try_get(obj, prop, *args, **kwargs):
except urllib.error.URLError:
raise GettingError(f"URLError: {obj}, {prop}, {args}, {kwargs}")
raise GettingError(f"Getting {prop} field failed. "
f"All {REQUEST_ATTEMPTS} attempts are spent")
raise GettingError(
f"Getting {prop} field failed. " f"All {REQUEST_ATTEMPTS} attempts are spent"
)
def read_popularity(path):
@ -81,6 +108,7 @@ def should_download_page(popularity_set):
@functools.wraps(popularity_set)
def wrapped(ident):
return popularity_set is None or ident in popularity_set
return wrapped
@ -135,8 +163,9 @@ def get_page_info(url):
def get_wiki_page(lang, page_name):
    """Return a wikipediaapi page object for page_name in language lang.

    The extract format is HTML so that downstream section filtering can
    operate on heading tags.
    """
    wiki = wikipediaapi.Wikipedia(
        language=lang, extract_format=wikipediaapi.ExtractFormat.HTML
    )
    return wiki.page(page_name)
@ -171,11 +200,15 @@ def download(directory, url):
def get_wiki_langs(url):
    """Return [(lang, url), ...] for all language versions of the page at url.

    The result always includes the language/url pair of the input itself; if
    fetching the interlanguage links fails, only that pair is returned.
    """
    lang, page_name = get_page_info(url)
    page = get_wiki_page(lang, page_name)
    curr_lang = [
        (lang, url),
    ]
    try:
        langlinks = try_get(page, "langlinks")
        return (
            list(zip(langlinks.keys(), [link.fullurl for link in langlinks.values()]))
            + curr_lang
        )
    except GettingError as e:
        log.warning(f"Error: no languages for {url} ({e}).")
        return curr_lang
@ -209,6 +242,7 @@ def wikipedia_worker(output_dir, checker, langs):
parsed = urllib.parse.urlparse(url)
path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
download_all_from_wikipedia(path, url, langs)
return wrapped
@ -228,7 +262,8 @@ def get_wikidata_urls(entity, langs):
log.exception(f"Sitelinks not found for {entity.id}.")
return None
return [
entity.data["sitelinks"][k]["url"] for k in keys
entity.data["sitelinks"][k]["url"]
for k in keys
if any([k.startswith(lang) for lang in langs])
]
@ -259,6 +294,7 @@ def wikidata_worker(output_dir, checker, langs):
path = os.path.join(output_dir, wikidata_id)
for url in urls:
download(path, url)
return wrapped
@ -267,8 +303,9 @@ def download_from_wikidata_tags(input_file, output_dir, langs, checker):
os.makedirs(wikidata_output_dir, exist_ok=True)
with open(input_file) as file:
with ThreadPool(processes=WORKERS) as pool:
pool.map(wikidata_worker(wikidata_output_dir, checker, langs),
file, CHUNK_SIZE)
pool.map(
wikidata_worker(wikidata_output_dir, checker, langs), file, CHUNK_SIZE
)
def check_and_get_checker(popularity_file):

View file

@ -3,7 +3,9 @@ import json
import os
import sys
from post_generation.hierarchy_to_countries import hierarchy_to_countries as hierarchy_to_countries_
from post_generation.hierarchy_to_countries import (
hierarchy_to_countries as hierarchy_to_countries_,
)
from post_generation.inject_promo_ids import inject_promo_ids
from post_generation.localads_mwm_to_csv import create_csv
@ -17,7 +19,8 @@ The post_generation commands are:
localads_mwm_to_csv Prepares CSV files for uploading to localads database from mwm files.
hierarchy_to_countries Produces countries.txt from hierarchy.txt.
inject_promo_ids Injects promo osm ids into countries.txt
""")
""",
)
parser.add_argument("command", help="Subcommand to run")
args = parser.parse_args(sys.argv[1:2])
if not hasattr(self, args.command):
@ -30,57 +33,71 @@ The post_generation commands are:
def localads_mwm_to_csv():
    """Parse CLI options and produce localads CSV files from mwm files.

    Reads options from sys.argv[2:] (the subcommand's arguments) and
    delegates the actual work to create_csv.
    """
    parser = argparse.ArgumentParser(
        description="Prepares CSV files for uploading to localads database "
        "from mwm files."
    )
    parser.add_argument("mwm", help="path to mwm files")
    parser.add_argument(
        "--osm2ft", help="path to osm2ft files (default is the same as mwm)"
    )
    parser.add_argument(
        "--output", default=".", help="path to generated files ('.' by default)"
    )
    types_default = os.path.join(
        os.path.dirname(__file__), "..", "..", "..", "data", "types.txt"
    )
    parser.add_argument(
        "--types", default=types_default, help="path to omim/data/types.txt"
    )
    parser.add_argument(
        "--threads", type=int, default=1, help="number of threads to process files"
    )
    parser.add_argument(
        "--mwm_version", type=int, required=True, help="Mwm version"
    )
    args = parser.parse_args(sys.argv[2:])
    if not args.osm2ft:
        args.osm2ft = args.mwm
    # NOTE(review): args.types is parsed above but not passed to create_csv;
    # create_csv's visible signature is (output, mwm_path, osm2ft_path,
    # version, threads), so --types appears to be dead -- confirm and drop it,
    # or restore the argument if create_csv still consumes it.
    create_csv(
        args.output,
        args.mwm,
        args.osm2ft,
        args.mwm_version,
        args.threads,
    )
@staticmethod
def hierarchy_to_countries():
parser = argparse.ArgumentParser(
description="Produces countries.txt from hierarchy.txt.")
parser.add_argument("--target", required=True,
help="Path to mwm files")
parser.add_argument("--hierarchy", required=True,
default="hierarchy.txt",
help="Hierarchy file")
parser.add_argument("--old", required=True,
help="old_vs_new.csv file")
parser.add_argument("--osm", required=True,
help="borders_vs_osm.csv file")
parser.add_argument("--countries_synonyms", required=True,
help="countries_synonyms.csv file")
parser.add_argument("--mwm_version", type=int, required=True,
help="Mwm version")
parser.add_argument("-o", "--output", required=True,
help="Output countries.txt file (default is stdout)")
description="Produces countries.txt from hierarchy.txt."
)
parser.add_argument("--target", required=True, help="Path to mwm files")
parser.add_argument(
"--hierarchy", required=True, default="hierarchy.txt", help="Hierarchy file"
)
parser.add_argument("--old", required=True, help="old_vs_new.csv file")
parser.add_argument("--osm", required=True, help="borders_vs_osm.csv file")
parser.add_argument(
"--countries_synonyms", required=True, help="countries_synonyms.csv file"
)
parser.add_argument(
"--mwm_version", type=int, required=True, help="Mwm version"
)
parser.add_argument(
"-o",
"--output",
required=True,
help="Output countries.txt file (default is stdout)",
)
args = parser.parse_args(sys.argv[2:])
countries_json = hierarchy_to_countries_(args.old, args.osm,
args.countries_synonyms,
args.hierarchy,
args.target,
args.mwm_version)
countries_json = hierarchy_to_countries_(
args.old,
args.osm,
args.countries_synonyms,
args.hierarchy,
args.target,
args.mwm_version,
)
if args.output:
with open(args.output, "w") as f:
f.write(countries_json)
@ -90,20 +107,29 @@ The post_generation commands are:
@staticmethod
def inject_promo_ids():
parser = argparse.ArgumentParser(
description="Injects promo cities osm ids into countries.txt")
description="Injects promo cities osm ids into countries.txt"
)
parser.add_argument("--mwm", required=True, help="path to mwm files")
parser.add_argument("--types", required=True,
help="path to omim/data/types.txt")
parser.add_argument("--promo_cities", required=True,
help="Path to promo cities file")
parser.add_argument("--promo_countries", required=True,
help="Path to promo countries file")
parser.add_argument("--osm2ft",
help="path to osm2ft files (default is the same as mwm)")
parser.add_argument("--countries",
help="path to countries.txt file (default is countries.txt file into mwm directory)")
parser.add_argument("--output",
help="Output countries.txt file (default is countries.txt file into mwm directory)")
parser.add_argument(
"--types", required=True, help="path to omim/data/types.txt"
)
parser.add_argument(
"--promo_cities", required=True, help="Path to promo cities file"
)
parser.add_argument(
"--promo_countries", required=True, help="Path to promo countries file"
)
parser.add_argument(
"--osm2ft", help="path to osm2ft files (default is the same as mwm)"
)
parser.add_argument(
"--countries",
help="path to countries.txt file (default is countries.txt file into mwm directory)",
)
parser.add_argument(
"--output",
help="Output countries.txt file (default is countries.txt file into mwm directory)",
)
args = parser.parse_args(sys.argv[2:])
if not args.osm2ft:
@ -116,8 +142,14 @@ The post_generation commands are:
with open(args.countries) as f:
countries = json.load(f)
inject_promo_ids(countries, args.promo_cities, args.promo_countries,
args.mwm, args.types, args.osm2ft)
inject_promo_ids(
countries,
args.promo_cities,
args.promo_countries,
args.mwm,
args.types,
args.osm2ft,
)
with open(args.output, "w") as f:
json.dump(countries, f, indent=1)

View file

@ -109,6 +109,7 @@ def parse_borders_vs_osm(borders_vs_osm_csv_path):
vsosm[m.group(1)] = [m.group(3)]
return vsosm
def parse_countries_synonyms(countries_synonyms_csv_path):
countries_synonyms = {}
if not countries_synonyms_csv_path:
@ -124,10 +125,15 @@ def parse_countries_synonyms(countries_synonyms_csv_path):
countries_synonyms[m.group(1)] = [m.group(2)]
return countries_synonyms
def hierarchy_to_countries(old_vs_new_csv_path, borders_vs_osm_csv_path,
countries_synonyms_csv_path, hierarchy_path,
target_path, version):
def hierarchy_to_countries(
old_vs_new_csv_path,
borders_vs_osm_csv_path,
countries_synonyms_csv_path,
hierarchy_path,
target_path,
version,
):
def fill_last(last, stack):
name = last["id"]
if not os.path.exists(os.path.join(target_path, f"{name}.mwm")):

View file

@ -20,11 +20,11 @@ class PromoIds(object):
def inject_into_country(self, country):
nodes = self._get_nodes(country)
with Pool() as pool:
proposed_ids = pool.map(self._find, (n["id"] for n in nodes),
chunksize=1)
proposed_ids = pool.map(self._find, (n["id"] for n in nodes), chunksize=1)
countries_ids = [ids for node_ids in proposed_ids for ids in
node_ids["countries"]]
countries_ids = [
ids for node_ids in proposed_ids for ids in node_ids["countries"]
]
if countries_ids:
country["top_countries_geo_ids"] = countries_ids
@ -35,13 +35,10 @@ class PromoIds(object):
best = self._choose_best_city(node_ids["cities"])
node["top_city_geo_id"] = best["id"]
if best["id"] < 0:
node["top_city_geo_id"] += (1 << 64)
node["top_city_geo_id"] += 1 << 64
def _find(self, leaf_id):
result = {
"countries": [],
"cities": []
}
result = {"countries": [], "cities": []}
ft2osm = load_osm2ft(self.osm2ft_path, leaf_id)
for feature in Mwm(os.path.join(self.mwm_path, leaf_id + ".mwm")):
@ -71,27 +68,24 @@ class PromoIds(object):
return mwm_nodes
def _get_city(self, osm_id, types):
city = {
"id": osm_id,
"count_of_guides": self.cities[osm_id],
"types": []
}
city = {"id": osm_id, "count_of_guides": self.cities[osm_id], "types": []}
for t in types:
if t.startswith("place"):
city["types"].append(t)
if not city["types"]:
logging.error(f"Incorrect types for sponsored-promo_catalog "
f"feature osm_id {osm_id}")
logging.error(
f"Incorrect types for sponsored-promo_catalog "
f"feature osm_id {osm_id}"
)
sys.exit(3)
return city
def _choose_best_city(self, proposed_cities):
    """Pick the best candidate city.

    Candidates are ranked first by their number of guides, then by the
    score of their place types.
    """

    def key_compare(city):
        return city["count_of_guides"], self._score_city_types(city["types"])

    return max(proposed_cities, key=key_compare)
@ -133,10 +127,20 @@ def load_osm2ft(osm2ft_path, mwm_id):
return read_osm2ft(f, ft2osm=True, tuples=False)
def inject_promo_ids(countries_json, promo_cities_path, promo_countries_path,
mwm_path, types_path, osm2ft_path):
promo_ids = PromoIds(load_promo_ids(promo_countries_path),
load_promo_ids(promo_cities_path), mwm_path,
types_path, osm2ft_path)
def inject_promo_ids(
countries_json,
promo_cities_path,
promo_countries_path,
mwm_path,
types_path,
osm2ft_path,
):
promo_ids = PromoIds(
load_promo_ids(promo_countries_path),
load_promo_ids(promo_cities_path),
mwm_path,
types_path,
osm2ft_path,
)
for country in countries_json["g"]:
promo_ids.inject_into_country(country)

View file

@ -18,12 +18,29 @@ HEADERS = {
"mwm": "mwm_id name mwm_version".split(),
}
QUEUES = {name: Queue() for name in HEADERS}
# Feature type prefixes that are exported to the localads mapping CSV.
GOOD_TYPES = (
    "amenity",
    "shop",
    "tourism",
    "leisure",
    "sport",
    "craft",
    "man_made",
    "office",
    "historic",
    "aeroway",
    "natural-beach",
    "natural-peak",
    "natural-volcano",
    "natural-spring",
    "natural-cave_entrance",
    "waterway-waterfall",
    "place-island",
    "railway-station",
    "railway-halt",
    "aerialway-station",
    "building-train_station",
)

# Numeric ids written to the CSV "source type" column.
SOURCE_TYPES = {"osm": 0, "booking": 1}
@ -49,20 +66,28 @@ def parse_mwm(mwm_name, osm2ft_name, override_version):
if metadata is not None and MetadataField.sponsored_id in metadata:
for t in readable_types:
if t.startswith("sponsored-"):
QUEUES["sponsored"].put((metadata[MetadataField.sponsored_id],
feature.index(),
mwm_id,
version,
SOURCE_TYPES[t[t.find("-") + 1:]]))
QUEUES["sponsored"].put(
(
metadata[MetadataField.sponsored_id],
feature.index(),
mwm_id,
version,
SOURCE_TYPES[t[t.find("-") + 1 :]],
)
)
break
else:
for t in readable_types:
if t.startswith(GOOD_TYPES):
QUEUES["mapping"].put((ctypes.c_long(osm_id).value,
feature.index(),
mwm_id,
version,
SOURCE_TYPES["osm"]))
QUEUES["mapping"].put(
(
ctypes.c_long(osm_id).value,
feature.index(),
mwm_id,
version,
SOURCE_TYPES["osm"],
)
)
break
@ -87,7 +112,11 @@ def create_csv(output, mwm_path, osm2ft_path, version, threads):
pool = Pool(processes=threads)
for mwm_name in os.listdir(mwm_path):
if "World" in mwm_name or "minsk_pass" in mwm_name or not mwm_name.endswith(".mwm"):
if (
"World" in mwm_name
or "minsk_pass" in mwm_name
or not mwm_name.endswith(".mwm")
):
continue
osm2ft_name = os.path.join(osm2ft_path, os.path.basename(mwm_name) + ".osm2ft")
if not os.path.exists(osm2ft_name):