From 8c0dc3139db0db089cba5f8e5c1ca985a2b51c52 Mon Sep 17 00:00:00 2001
From: Maksim Andrianov
Date: Fri, 5 Jun 2020 17:23:26 +0300
Subject: [PATCH] [python] Refactoring: Changed formatting.

---
 tools/python/booking/__main__.py              |  79 +++++++---
 tools/python/booking/api/booking_api.py       |  38 ++---
 tools/python/booking/download_hotels.py       |  58 +++++--
 tools/python/booking/download_test_data.py    |  17 +-
 tools/python/descriptions/__main__.py         |  44 ++++--
 .../descriptions/descriptions_downloader.py   |  71 +++++++--
 tools/python/post_generation/__main__.py      | 146 +++++++++++-------
 .../post_generation/hierarchy_to_countries.py |  12 +-
 .../post_generation/inject_promo_ids.py       |  50 +++---
 .../post_generation/localads_mwm_to_csv.py    |  63 ++++++--
 10 files changed, 388 insertions(+), 190 deletions(-)

diff --git a/tools/python/booking/__main__.py b/tools/python/booking/__main__.py
index f79ab49e81..cedc7eb805 100644
--- a/tools/python/booking/__main__.py
+++ b/tools/python/booking/__main__.py
@@ -14,20 +14,39 @@ from booking.download_test_data import download_test_data
 def process_options():
     parser = argparse.ArgumentParser(description="Download and process booking hotels.")
     parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("--logfile", default="",
-                        help="Name and destination for log file")
-    parser.add_argument("--password", required=True, dest="password",
-                        help="Booking.com account password")
-    parser.add_argument("--user", required=True, dest="user",
-                        help="Booking.com account user name")
-    parser.add_argument("--threads_count", default=1, type=int,
-                        help="The number of threads for processing countries.")
-    parser.add_argument("--output", required=True, dest="output",
-                        help="Name and destination for output file")
-    parser.add_argument("--country_code", default=None, action="append",
-                        help="Download hotels of this country.")
-    parser.add_argument("--download_test_dataset", default=False,
-                        help="Download dataset for tests.")
+    parser.add_argument(
+        "--logfile", default="", help="Name and destination for log file"
+    )
+    parser.add_argument(
+        "--password",
+        required=True,
+        dest="password",
+        help="Booking.com account password",
+    )
+    parser.add_argument(
+        "--user", required=True, dest="user", help="Booking.com account user name"
+    )
+    parser.add_argument(
+        "--threads_count",
+        default=1,
+        type=int,
+        help="The number of threads for processing countries.",
+    )
+    parser.add_argument(
+        "--output",
+        required=True,
+        dest="output",
+        help="Name and destination for output file",
+    )
+    parser.add_argument(
+        "--country_code",
+        default=None,
+        action="append",
+        help="Download hotels of this country.",
+    )
+    parser.add_argument(
+        "--download_test_dataset", default=False, help="Download dataset for tests."
+    )
     options = parser.parse_args()
     return options
 
@@ -43,16 +62,34 @@ def main():
     logfile = os.path.join(os.path.dirname(os.path.realpath(__file__)), name)
     print(f"Logs saved to {logfile}.", file=sys.stdout)
     if options.threads_count > 1:
-        print(f"Limit requests per minute is {LIMIT_REQUESTS_PER_MINUTE}.", file=sys.stdout)
-    logging.basicConfig(level=logging.DEBUG, filename=logfile,
-                        format="%(thread)d [%(asctime)s] %(levelname)s: %(message)s")
+        print(
+            f"Limit requests per minute is {LIMIT_REQUESTS_PER_MINUTE}.",
+            file=sys.stdout,
+        )
+    logging.basicConfig(
+        level=logging.DEBUG,
+        filename=logfile,
+        format="%(thread)d [%(asctime)s] %(levelname)s: %(message)s",
+    )
     with tqdm(disable=not options.verbose) as progress_bar:
         if options.download_test_dataset:
-            download_test_data(options.country_code, options.user, options.password,
-                               options.output, options.threads_count, progress_bar)
+            download_test_data(
+                options.country_code,
+                options.user,
+                options.password,
+                options.output,
+                options.threads_count,
+                progress_bar,
+            )
         else:
-            download(options.country_code, options.user, options.password,
-                     options.output, options.threads_count, progress_bar)
+            download(
+                options.country_code,
+                options.user,
+                options.password,
+                options.output,
+                options.threads_count,
+                progress_bar,
+            )
 
 
 main()
diff --git a/tools/python/booking/api/booking_api.py b/tools/python/booking/api/booking_api.py
index 6b75f7b7d7..18691c76a7 100644
--- a/tools/python/booking/api/booking_api.py
+++ b/tools/python/booking/api/booking_api.py
@@ -16,11 +16,7 @@ MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS = (30, 120)
 
 
 class BookingApi:
-    ENDPOINTS = {
-        "countries": "list",
-        "hotels": "list",
-        "districts": "list"
-    }
+    ENDPOINTS = {"countries": "list", "hotels": "list", "districts": "list"}
 
     def __init__(self, login, password, version):
         major_minor = version.split(".")
@@ -46,9 +42,12 @@
             attempts -= 1
             response = None
             try:
-                response = requests.get(f"{self._base_url}/{endpoint}",
-                                        auth=(self._login, self._password),
-                                        params=params, timeout=self._timeout)
+                response = requests.get(
+                    f"{self._base_url}/{endpoint}",
+                    auth=(self._login, self._password),
+                    params=params,
+                    timeout=self._timeout,
+                )
             except requests.exceptions.ReadTimeout:
                 logging.exception("Timeout error.")
                 continue
@@ -60,8 +59,9 @@
             try:
                 data = response.json()
             except json.decoder.JSONDecodeError:
-                logging.exception(f"JSON decode error. "
-                                  f"Content: {response.content}")
+                logging.exception(
+                    f"JSON decode error. " f"Content: {response.content}"
+                )
                 continue
 
             if code == 200:
@@ -78,13 +78,14 @@
             if code == 429:
                 self._event.clear()
                 wait_seconds = randint(*MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS)
-                logging.warning(f"Http error {code}: {data}. "
-                                f"It waits {wait_seconds} seconds and tries again.")
+                logging.warning(
+                    f"Http error {code}: {data}. "
+                    f"Waiting {wait_seconds} seconds before trying again."
+                )
                 sleep(wait_seconds)
                 self._event.set()
             else:
-                raise HTTPError(
-                    f"Http error with code {code}: {data}.")
+                raise HTTPError(f"Http error with code {code}: {data}.")
 
     def _set_endpoints(self):
         for endpoint in BookingApi.ENDPOINTS:
@@ -110,11 +111,10 @@
         return result
 
     def _call_endpoint_offset(self, offset, endpoint, **params):
-        r = self.api.call_endpoint(endpoint, **{
-            "offset": offset,
-            "rows": BookingListApi._ROWS_BY_REQUEST,
-            **params
-        })
+        r = self.api.call_endpoint(
+            endpoint,
+            **{"offset": offset, "rows": BookingListApi._ROWS_BY_REQUEST, **params},
+        )
         if not isinstance(r, list):
             raise TypeError(f"Result has unexpected type {type(r)}")
         return r
diff --git a/tools/python/booking/download_hotels.py b/tools/python/booking/download_hotels.py
index 1148d987b7..1ebd0b257c 100755
--- a/tools/python/booking/download_hotels.py
+++ b/tools/python/booking/download_hotels.py
@@ -14,9 +14,35 @@ from booking.api.booking_api import BookingApi
 from booking.api.booking_api import BookingListApi
 from booking.api.exceptions import GettingMinPriceError
 
-SUPPORTED_LANGUAGES = ("en", "ru", "ar", "cs", "da", "nl", "fi", "fr", "de",
-                       "hu", "id", "it", "ja", "ko", "pl", "pt", "ro", "es",
-                       "sv", "th", "tr", "uk", "vi", "zh", "he", "sk", "el")
+SUPPORTED_LANGUAGES = (
+    "en",
+    "ru",
+    "ar",
+    "cs",
+    "da",
+    "nl",
+    "fi",
+    "fr",
+    "de",
+    "hu",
+    "id",
+    "it",
+    "ja",
+    "ko",
+    "pl",
+    "pt",
+    "ro",
+    "es",
+    "sv",
+    "th",
+    "tr",
+    "uk",
+    "vi",
+    "zh",
+    "he",
+    "sk",
+    "el",
+)
 
 
 class BookingGen:
@@ -55,11 +81,17 @@ class BookingGen:
         return self.api.hotels(country_ids=self.country_code, **params)
 
     def _download_translations(self):
-        extras = ["hotel_info", ]
+        extras = [
+            "hotel_info",
+        ]
         translations = defaultdict(dict)
         with ThreadPoolExecutor(max_workers=len(SUPPORTED_LANGUAGES)) as executor:
-            m = {executor.submit(self._download_hotels, extras=extras, language=lang): lang
-                 for lang in SUPPORTED_LANGUAGES}
+            m = {
+                executor.submit(
+                    self._download_hotels, extras=extras, language=lang
+                ): lang
+                for lang in SUPPORTED_LANGUAGES
+            }
             for future in as_completed(m):
                 lang = m[future]
                 hotels = future.result()
@@ -68,7 +100,7 @@
                     hotel_data = hotel["hotel_data"]
                     translations[hotel_id][lang] = {
                         "name": BookingGen._format_string(hotel_data["name"]),
-                        "address": BookingGen._format_string(hotel_data["address"])
+                        "address": BookingGen._format_string(hotel_data["address"]),
                     }
         return translations
 
@@ -162,7 +194,7 @@
             hotel_data["review_score"],
             hotel_data["url"],
             hotel_data["hotel_type_id"],
-            self._get_translations(hotel)
+            self._get_translations(hotel),
         )
         return sep.join(BookingGen._format_string(str(x)) for x in row)
 
@@ -174,8 +206,9 @@ def download_hotels_by_country(api, country):
     return rows
 
 
-def download(country_code, user, password, path, threads_count,
-             progress_bar=tqdm(disable=True)):
+def download(
+    country_code, user, password, path, threads_count, progress_bar=tqdm(disable=True)
+):
     api = BookingApi(user, password, "2.4")
     list_api = BookingListApi(api)
     countries = list_api.countries(languages="en")
@@ -186,8 +219,9 @@
     progress_bar.total = len(countries)
     with open(path, "w") as f:
         with ThreadPool(threads_count) as pool:
-            for lines in pool.imap_unordered(partial(download_hotels_by_country, list_api),
-                                             countries):
+            for lines in pool.imap_unordered(
+                partial(download_hotels_by_country, list_api), countries
+            ):
                 f.writelines([f"{x}\n" for x in lines])
                 progress_bar.update()
     logging.info(f"Hotels were saved to {path}.")
diff --git a/tools/python/booking/download_test_data.py b/tools/python/booking/download_test_data.py
index d8fc7a022c..6acf1017b1 100755
--- a/tools/python/booking/download_test_data.py
+++ b/tools/python/booking/download_test_data.py
@@ -49,6 +49,7 @@ class BookingGen:
         )
         return sep.join(BookingGen._format_string(str(x)) for x in row)
 
+
 def create_tsv_header(sep="\t"):
     row = (
         "Hotel ID",
@@ -68,17 +69,18 @@ def download_hotels_by_country(api, district_names, country):
     return rows
 
 
-def download_test_data(country_code, user, password, path, threads_count,
-                       progress_bar=tqdm(disable=True)):
+def download_test_data(
+    country_code, user, password, path, threads_count, progress_bar=tqdm(disable=True)
+):
     logging.info(f"Starting test dataset download.")
     api = BookingApi(user, password, "2.4")
     list_api = BookingListApi(api)
     districts = list_api.districts(languages="en")
     district_names = {}
     for district in districts:
-        for translation in district['translations']:
-            if translation['language'] == 'en':
-                district_names[district['district_id']] = translation['name']
+        for translation in district["translations"]:
+            if translation["language"] == "en":
+                district_names[district["district_id"]] = translation["name"]
     countries = list_api.countries(languages="en")
     if country_code is not None:
         countries = list(filter(lambda x: x["country"] in country_code, countries))
@@ -88,8 +90,9 @@
     with open(path, "w") as f:
         f.write(create_tsv_header() + "\n")
         with ThreadPool(threads_count) as pool:
-            for lines in pool.imap_unordered(partial(download_hotels_by_country, list_api, district_names),
-                                             countries):
+            for lines in pool.imap_unordered(
+                partial(download_hotels_by_country, list_api, district_names), countries
+            ):
                 f.writelines([f"{x}\n" for x in lines])
                 progress_bar.update()
     logging.info(f"Hotels test dataset saved to {path}.")
diff --git a/tools/python/descriptions/__main__.py b/tools/python/descriptions/__main__.py
index 17e8973659..f74a1fac72 100644
--- a/tools/python/descriptions/__main__.py
+++ b/tools/python/descriptions/__main__.py
@@ -13,20 +13,36 @@
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument("--output_dir", metavar="PATH", type=str,
-                        help="Output dir for saving pages")
-    parser.add_argument("--popularity", metavar="PATH", type=str,
-                        help="File with popular object ids for which we "
-                             "download wikipedia data. If not given, download "
-                             "for all objects.")
-    parser.add_argument('--wikipedia', metavar="PATH", type=str, required=True,
-                        help="Input file with wikipedia url.")
-    parser.add_argument('--wikidata', metavar="PATH", type=str,
-                        help="Input file with wikidata ids.")
-    parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
-                        action='append',
-                        help="Languages for pages. If left blank, pages in all "
-                             "available languages will be loaded.")
+    parser.add_argument(
+        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
+    )
+    parser.add_argument(
+        "--popularity",
+        metavar="PATH",
+        type=str,
+        help="File with popular object ids for which we "
+        "download wikipedia data. If not given, download "
+        "for all objects.",
+    )
+    parser.add_argument(
+        "--wikipedia",
+        metavar="PATH",
+        type=str,
+        required=True,
+        help="Input file with wikipedia url.",
+    )
+    parser.add_argument(
+        "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
+    )
+    parser.add_argument(
+        "--langs",
+        metavar="LANGS",
+        type=str,
+        nargs="+",
+        action="append",
+        help="Languages for pages. If left blank, pages in all "
+        "available languages will be loaded.",
+    )
 
     return parser.parse_args()
 
diff --git a/tools/python/descriptions/descriptions_downloader.py b/tools/python/descriptions/descriptions_downloader.py
index ec76367c8f..404bb0b9f3 100644
--- a/tools/python/descriptions/descriptions_downloader.py
+++ b/tools/python/descriptions/descriptions_downloader.py
@@ -28,13 +28,37 @@ CHUNK_SIZE = 16
 REQUEST_ATTEMPTS = 32
 ATTEMPTS_PAUSE_MS = 4000
 
-HEADERS = {f"h{x}" for x in range(1,7)}
+HEADERS = {f"h{x}" for x in range(1, 7)}
 BAD_SECTIONS = {
-    "en": ["External links", "Sources", "See also", "Bibliography", "Further reading", "References"],
+    "en": [
+        "External links",
+        "Sources",
+        "See also",
+        "Bibliography",
+        "Further reading",
+        "References",
+    ],
     "ru": ["Литература", "Ссылки", "См. также", "Библиография", "Примечания"],
-    "de": ["Einzelnachweise", "Weblinks", "Literatur", "Siehe auch", "Anmerkungen", "Anmerkungen und Einzelnachweise", "Filme", "Einzelbelege"],
+    "de": [
+        "Einzelnachweise",
+        "Weblinks",
+        "Literatur",
+        "Siehe auch",
+        "Anmerkungen",
+        "Anmerkungen und Einzelnachweise",
+        "Filme",
+        "Einzelbelege",
+    ],
     "es": ["Vínculos de interés", "Véase también", "Enlaces externos", "Referencias"],
-    "fr": ["Bibliographie", "Lien externe", "Voir aussi", "Liens externes", "Références", "Notes et références", "Articles connexes"]
+    "fr": [
+        "Bibliographie",
+        "Lien externe",
+        "Voir aussi",
+        "Liens externes",
+        "Références",
+        "Notes et références",
+        "Articles connexes",
+    ],
 }
 
 
@@ -45,9 +69,11 @@ def try_get(obj, prop, *args, **kwargs):
             attr = getattr(obj, prop)
             is_method = isinstance(attr, types.MethodType)
             return attr(*args, **kwargs) if is_method else attr
-        except (requests.exceptions.ConnectionError,
-                requests.exceptions.ReadTimeout,
-                json.decoder.JSONDecodeError):
+        except (
+            requests.exceptions.ConnectionError,
+            requests.exceptions.ReadTimeout,
+            json.decoder.JSONDecodeError,
+        ):
             time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
             attempts -= 1
         except urllib.error.HTTPError as e:
@@ -58,8 +84,9 @@
         except urllib.error.URLError:
             raise GettingError(f"URLError: {obj}, {prop}, {args}, {kwargs}")
 
-    raise GettingError(f"Getting {prop} field failed. "
-                       f"All {REQUEST_ATTEMPTS} attempts are spent")
+    raise GettingError(
+        f"Getting {prop} field failed. " f"All {REQUEST_ATTEMPTS} attempts are spent"
+    )
 
 
 def read_popularity(path):
@@ -81,6 +108,7 @@ def should_download_page(popularity_set):
     @functools.wraps(popularity_set)
     def wrapped(ident):
         return popularity_set is None or ident in popularity_set
+
     return wrapped
 
 
@@ -135,8 +163,9 @@
 
 
 def get_wiki_page(lang, page_name):
-    wiki = wikipediaapi.Wikipedia(language=lang,
-                                  extract_format=wikipediaapi.ExtractFormat.HTML)
+    wiki = wikipediaapi.Wikipedia(
+        language=lang, extract_format=wikipediaapi.ExtractFormat.HTML
+    )
     return wiki.page(page_name)
 
 
@@ -171,11 +200,15 @@ def download(directory, url):
 def get_wiki_langs(url):
     lang, page_name = get_page_info(url)
     page = get_wiki_page(lang, page_name)
-    curr_lang = [(lang, url), ]
+    curr_lang = [
+        (lang, url),
+    ]
     try:
         langlinks = try_get(page, "langlinks")
-        return list(zip(langlinks.keys(),
-                        [link.fullurl for link in langlinks.values()])) + curr_lang
+        return (
+            list(zip(langlinks.keys(), [link.fullurl for link in langlinks.values()]))
+            + curr_lang
+        )
     except GettingError as e:
         log.warning(f"Error: no languages for {url} ({e}).")
         return curr_lang
@@ -209,6 +242,7 @@ def wikipedia_worker(output_dir, checker, langs):
         parsed = urllib.parse.urlparse(url)
         path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
         download_all_from_wikipedia(path, url, langs)
+
     return wrapped
 
 
@@ -228,7 +262,8 @@
         log.exception(f"Sitelinks not found for {entity.id}.")
         return None
     return [
-        entity.data["sitelinks"][k]["url"] for k in keys
+        entity.data["sitelinks"][k]["url"]
+        for k in keys
         if any([k.startswith(lang) for lang in langs])
     ]
 
@@ -259,6 +294,7 @@ def wikidata_worker(output_dir, checker, langs):
         path = os.path.join(output_dir, wikidata_id)
         for url in urls:
             download(path, url)
+
     return wrapped
 
 
@@ -267,8 +303,9 @@
     os.makedirs(wikidata_output_dir, exist_ok=True)
     with open(input_file) as file:
         with ThreadPool(processes=WORKERS) as pool:
-            pool.map(wikidata_worker(wikidata_output_dir, checker, langs),
-                     file, CHUNK_SIZE)
+            pool.map(
+                wikidata_worker(wikidata_output_dir, checker, langs), file, CHUNK_SIZE
+            )
 
 
 def check_and_get_checker(popularity_file):
diff --git a/tools/python/post_generation/__main__.py b/tools/python/post_generation/__main__.py
index d3048a5936..0b97f9647a 100644
--- a/tools/python/post_generation/__main__.py
+++ b/tools/python/post_generation/__main__.py
@@ -3,7 +3,9 @@ import json
 import os
 import sys
 
-from post_generation.hierarchy_to_countries import hierarchy_to_countries as hierarchy_to_countries_
+from post_generation.hierarchy_to_countries import (
+    hierarchy_to_countries as hierarchy_to_countries_,
+)
 from post_generation.inject_promo_ids import inject_promo_ids
 from post_generation.localads_mwm_to_csv import create_csv
 
@@ -17,7 +19,8 @@ The post_generation commands are:
     localads_mwm_to_csv     Prepares CSV files for uploading to localads database from mwm files.
     hierarchy_to_countries  Produces countries.txt from hierarchy.txt.
     inject_promo_ids        Injects promo osm ids into countries.txt
-    """)
+    """,
+        )
     parser.add_argument("command", help="Subcommand to run")
     args = parser.parse_args(sys.argv[1:2])
     if not hasattr(self, args.command):
@@ -30,57 +33,71 @@
     def localads_mwm_to_csv():
         parser = argparse.ArgumentParser(
             description="Prepares CSV files for uploading to localads database "
-                        "from mwm files.")
+            "from mwm files."
+        )
         parser.add_argument("mwm", help="path to mwm files")
         parser.add_argument(
-            "--osm2ft",
-            help="path to osm2ft files (default is the same as mwm)")
-        parser.add_argument("--output",
-                            default=".",
-                            help="path to generated files ('.' by default)")
-        types_default = os.path.join(os.path.dirname(__file__), "..", "..",
-                                     "..", "data", "types.txt")
-        parser.add_argument("--types",
-                            default=types_default,
-                            help="path to omim/data/types.txt")
-        parser.add_argument("--threads",
-                            type=int,
-                            default=1,
-                            help="number of threads to process files")
-        parser.add_argument("--mwm_version", type=int, required=True,
-                            help="Mwm version")
+            "--osm2ft", help="path to osm2ft files (default is the same as mwm)"
+        )
+        parser.add_argument(
+            "--output", default=".", help="path to generated files ('.' by default)"
+        )
+        types_default = os.path.join(
+            os.path.dirname(__file__), "..", "..", "..", "data", "types.txt"
+        )
+        parser.add_argument(
+            "--types", default=types_default, help="path to omim/data/types.txt"
+        )
+        parser.add_argument(
+            "--threads", type=int, default=1, help="number of threads to process files"
+        )
+        parser.add_argument(
+            "--mwm_version", type=int, required=True, help="Mwm version"
+        )
         args = parser.parse_args(sys.argv[2:])
 
         if not args.osm2ft:
             args.osm2ft = args.mwm
-        create_csv(args.output, args.mwm, args.osm2ft, args.types,
-                   args.mwm_version, args.threads)
+        create_csv(
+            args.output,
+            args.mwm,
+            args.osm2ft,
+            args.mwm_version,
+            args.threads,
+        )
 
     @staticmethod
     def hierarchy_to_countries():
         parser = argparse.ArgumentParser(
-            description="Produces countries.txt from hierarchy.txt.")
-        parser.add_argument("--target", required=True,
-                            help="Path to mwm files")
-        parser.add_argument("--hierarchy", required=True,
-                            default="hierarchy.txt",
-                            help="Hierarchy file")
-        parser.add_argument("--old", required=True,
-                            help="old_vs_new.csv file")
-        parser.add_argument("--osm", required=True,
-                            help="borders_vs_osm.csv file")
-        parser.add_argument("--countries_synonyms", required=True,
-                            help="countries_synonyms.csv file")
-        parser.add_argument("--mwm_version", type=int, required=True,
-                            help="Mwm version")
-        parser.add_argument("-o", "--output", required=True,
-                            help="Output countries.txt file (default is stdout)")
+            description="Produces countries.txt from hierarchy.txt."
+        )
+        parser.add_argument("--target", required=True, help="Path to mwm files")
+        parser.add_argument(
+            "--hierarchy", required=True, default="hierarchy.txt", help="Hierarchy file"
+        )
+        parser.add_argument("--old", required=True, help="old_vs_new.csv file")
+        parser.add_argument("--osm", required=True, help="borders_vs_osm.csv file")
+        parser.add_argument(
+            "--countries_synonyms", required=True, help="countries_synonyms.csv file"
+        )
+        parser.add_argument(
+            "--mwm_version", type=int, required=True, help="Mwm version"
+        )
+        parser.add_argument(
+            "-o",
+            "--output",
+            required=True,
+            help="Output countries.txt file (default is stdout)",
+        )
         args = parser.parse_args(sys.argv[2:])
-        countries_json = hierarchy_to_countries_(args.old, args.osm,
-                                                 args.countries_synonyms,
-                                                 args.hierarchy,
-                                                 args.target,
-                                                 args.mwm_version)
+        countries_json = hierarchy_to_countries_(
+            args.old,
+            args.osm,
+            args.countries_synonyms,
+            args.hierarchy,
+            args.target,
+            args.mwm_version,
+        )
         if args.output:
             with open(args.output, "w") as f:
                 f.write(countries_json)
@@ -90,20 +107,29 @@ The post_generation commands are:
     @staticmethod
     def inject_promo_ids():
         parser = argparse.ArgumentParser(
-            description="Injects promo cities osm ids into countries.txt")
+            description="Injects promo cities osm ids into countries.txt"
+        )
         parser.add_argument("--mwm", required=True, help="path to mwm files")
-        parser.add_argument("--types", required=True,
-                            help="path to omim/data/types.txt")
-        parser.add_argument("--promo_cities", required=True,
-                            help="Path to promo cities file")
-        parser.add_argument("--promo_countries", required=True,
-                            help="Path to promo countries file")
-        parser.add_argument("--osm2ft",
-                            help="path to osm2ft files (default is the same as mwm)")
-        parser.add_argument("--countries",
-                            help="path to countries.txt file (default is countries.txt file into mwm directory)")
-        parser.add_argument("--output",
-                            help="Output countries.txt file (default is countries.txt file into mwm directory)")
+        parser.add_argument(
+            "--types", required=True, help="path to omim/data/types.txt"
+        )
+        parser.add_argument(
+            "--promo_cities", required=True, help="Path to promo cities file"
+        )
+        parser.add_argument(
+            "--promo_countries", required=True, help="Path to promo countries file"
+        )
+        parser.add_argument(
+            "--osm2ft", help="path to osm2ft files (default is the same as mwm)"
+        )
+        parser.add_argument(
+            "--countries",
+            help="path to countries.txt file (default is the countries.txt file in the mwm directory)",
+        )
+        parser.add_argument(
+            "--output",
+            help="Output countries.txt file (default is the countries.txt file in the mwm directory)",
+        )
 
         args = parser.parse_args(sys.argv[2:])
         if not args.osm2ft:
@@ -116,8 +142,14 @@ The post_generation commands are:
         with open(args.countries) as f:
             countries = json.load(f)
 
-        inject_promo_ids(countries, args.promo_cities, args.promo_countries,
-                         args.mwm, args.types, args.osm2ft)
+        inject_promo_ids(
+            countries,
+            args.promo_cities,
+            args.promo_countries,
+            args.mwm,
+            args.types,
+            args.osm2ft,
+        )
 
         with open(args.output, "w") as f:
             json.dump(countries, f, indent=1)
diff --git a/tools/python/post_generation/hierarchy_to_countries.py b/tools/python/post_generation/hierarchy_to_countries.py
index e7896935d4..777f4f0079 100755
--- a/tools/python/post_generation/hierarchy_to_countries.py
+++ b/tools/python/post_generation/hierarchy_to_countries.py
@@ -109,6 +109,7 @@ def parse_borders_vs_osm(borders_vs_osm_csv_path):
             vsosm[m.group(1)] = [m.group(3)]
     return vsosm
 
+
 def parse_countries_synonyms(countries_synonyms_csv_path):
     countries_synonyms = {}
     if not countries_synonyms_csv_path:
@@ -124,10 +125,15 @@ def parse_countries_synonyms(countries_synonyms_csv_path):
             countries_synonyms[m.group(1)] = [m.group(2)]
     return countries_synonyms
 
 
-def hierarchy_to_countries(old_vs_new_csv_path, borders_vs_osm_csv_path,
-                           countries_synonyms_csv_path, hierarchy_path,
-                           target_path, version):
+def hierarchy_to_countries(
+    old_vs_new_csv_path,
+    borders_vs_osm_csv_path,
+    countries_synonyms_csv_path,
+    hierarchy_path,
+    target_path,
+    version,
+):
     def fill_last(last, stack):
         name = last["id"]
         if not os.path.exists(os.path.join(target_path, f"{name}.mwm")):
diff --git a/tools/python/post_generation/inject_promo_ids.py b/tools/python/post_generation/inject_promo_ids.py
index 056676e18c..efc4b0e958 100644
--- a/tools/python/post_generation/inject_promo_ids.py
+++ b/tools/python/post_generation/inject_promo_ids.py
@@ -20,11 +20,11 @@ class PromoIds(object):
     def inject_into_country(self, country):
         nodes = self._get_nodes(country)
         with Pool() as pool:
-            proposed_ids = pool.map(self._find, (n["id"] for n in nodes),
-                                    chunksize=1)
+            proposed_ids = pool.map(self._find, (n["id"] for n in nodes), chunksize=1)
 
-        countries_ids = [ids for node_ids in proposed_ids for ids in
-                         node_ids["countries"]]
+        countries_ids = [
+            ids for node_ids in proposed_ids for ids in node_ids["countries"]
+        ]
         if countries_ids:
             country["top_countries_geo_ids"] = countries_ids
 
@@ -35,13 +35,10 @@
             best = self._choose_best_city(node_ids["cities"])
             node["top_city_geo_id"] = best["id"]
             if best["id"] < 0:
-                node["top_city_geo_id"] += (1 << 64)
+                node["top_city_geo_id"] += 1 << 64
 
     def _find(self, leaf_id):
-        result = {
-            "countries": [],
-            "cities": []
-        }
+        result = {"countries": [], "cities": []}
         ft2osm = load_osm2ft(self.osm2ft_path, leaf_id)
 
         for feature in Mwm(os.path.join(self.mwm_path, leaf_id + ".mwm")):
@@ -71,27 +68,24 @@
         return mwm_nodes
 
     def _get_city(self, osm_id, types):
-        city = {
-            "id": osm_id,
-            "count_of_guides": self.cities[osm_id],
-            "types": []
-        }
+        city = {"id": osm_id, "count_of_guides": self.cities[osm_id], "types": []}
 
         for t in types:
             if t.startswith("place"):
                 city["types"].append(t)
 
         if not city["types"]:
-            logging.error(f"Incorrect types for sponsored-promo_catalog "
-                          f"feature osm_id {osm_id}")
+            logging.error(
+                f"Incorrect types for sponsored-promo_catalog "
+                f"feature osm_id {osm_id}"
+            )
             sys.exit(3)
 
         return city
 
     def _choose_best_city(self, proposed_cities):
         def key_compare(city):
-            return city["count_of_guides"], self._score_city_types(
-                city["types"])
+            return city["count_of_guides"], self._score_city_types(city["types"])
 
         return max(proposed_cities, key=key_compare)
 
@@ -133,10 +127,20 @@ def load_osm2ft(osm2ft_path, mwm_id):
         return read_osm2ft(f, ft2osm=True, tuples=False)
 
 
-def inject_promo_ids(countries_json, promo_cities_path, promo_countries_path,
-                     mwm_path, types_path, osm2ft_path):
-    promo_ids = PromoIds(load_promo_ids(promo_countries_path),
-                         load_promo_ids(promo_cities_path), mwm_path,
-                         types_path, osm2ft_path)
+def inject_promo_ids(
+    countries_json,
+    promo_cities_path,
+    promo_countries_path,
+    mwm_path,
+    types_path,
+    osm2ft_path,
+):
+    promo_ids = PromoIds(
+        load_promo_ids(promo_countries_path),
+        load_promo_ids(promo_cities_path),
+        mwm_path,
+        types_path,
+        osm2ft_path,
+    )
     for country in countries_json["g"]:
         promo_ids.inject_into_country(country)
diff --git a/tools/python/post_generation/localads_mwm_to_csv.py b/tools/python/post_generation/localads_mwm_to_csv.py
index 6b6b655ecf..02032cd918 100755
--- a/tools/python/post_generation/localads_mwm_to_csv.py
+++ b/tools/python/post_generation/localads_mwm_to_csv.py
@@ -18,12 +18,29 @@ HEADERS = {
     "mwm": "mwm_id name mwm_version".split(),
 }
 QUEUES = {name: Queue() for name in HEADERS}
-GOOD_TYPES = ("amenity", "shop", "tourism", "leisure", "sport",
-              "craft", "man_made", "office", "historic",
-              "aeroway", "natural-beach", "natural-peak", "natural-volcano",
-              "natural-spring", "natural-cave_entrance",
-              "waterway-waterfall", "place-island", "railway-station",
-              "railway-halt", "aerialway-station", "building-train_station")
+GOOD_TYPES = (
+    "amenity",
+    "shop",
+    "tourism",
+    "leisure",
+    "sport",
+    "craft",
+    "man_made",
+    "office",
+    "historic",
+    "aeroway",
+    "natural-beach",
+    "natural-peak",
+    "natural-volcano",
+    "natural-spring",
+    "natural-cave_entrance",
+    "waterway-waterfall",
+    "place-island",
+    "railway-station",
+    "railway-halt",
+    "aerialway-station",
+    "building-train_station",
+)
 
 SOURCE_TYPES = {"osm": 0, "booking": 1}
 
@@ -49,20 +66,28 @@ def parse_mwm(mwm_name, osm2ft_name, override_version):
         if metadata is not None and MetadataField.sponsored_id in metadata:
             for t in readable_types:
                 if t.startswith("sponsored-"):
-                    QUEUES["sponsored"].put((metadata[MetadataField.sponsored_id],
-                                             feature.index(),
-                                             mwm_id,
-                                             version,
-                                             SOURCE_TYPES[t[t.find("-") + 1:]]))
+                    QUEUES["sponsored"].put(
+                        (
+                            metadata[MetadataField.sponsored_id],
+                            feature.index(),
+                            mwm_id,
+                            version,
+                            SOURCE_TYPES[t[t.find("-") + 1 :]],
+                        )
+                    )
                     break
         else:
             for t in readable_types:
                 if t.startswith(GOOD_TYPES):
-                    QUEUES["mapping"].put((ctypes.c_long(osm_id).value,
-                                           feature.index(),
-                                           mwm_id,
-                                           version,
-                                           SOURCE_TYPES["osm"]))
+                    QUEUES["mapping"].put(
+                        (
+                            ctypes.c_long(osm_id).value,
+                            feature.index(),
+                            mwm_id,
+                            version,
+                            SOURCE_TYPES["osm"],
+                        )
+                    )
                     break
 
@@ -87,7 +112,11 @@ def create_csv(output, mwm_path, osm2ft_path, version, threads):
 
     pool = Pool(processes=threads)
    for mwm_name in os.listdir(mwm_path):
-        if "World" in mwm_name or "minsk_pass" in mwm_name or not mwm_name.endswith(".mwm"):
+        if (
+            "World" in mwm_name
+            or "minsk_pass" in mwm_name
+            or not mwm_name.endswith(".mwm")
+        ):
             continue
         osm2ft_name = os.path.join(osm2ft_path, os.path.basename(mwm_name) + ".osm2ft")
         if not os.path.exists(osm2ft_name):
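Note on the convention applied throughout this patch: the new layout matches the
output of an opinionated formatter in the Black style (the commit message does
not name a tool, so that attribution is an assumption). The rule, sketched below
with two flags taken from booking/__main__.py: a call that overflows the line
limit is first exploded so that its arguments share a single indented line; if
that is still too long, every argument gets its own line and ends with a comma,
including the last one, so later diffs touch only the line that actually changes.

    import argparse

    parser = argparse.ArgumentParser(description="Download and process booking hotels.")

    # Fits once the parentheses are exploded: the arguments stay on one
    # indented line and the closing parenthesis moves to its own line.
    parser.add_argument(
        "--logfile", default="", help="Name and destination for log file"
    )

    # Still too long for a single wrapped line: one argument per line, each
    # ending with a comma, including the last (the "magic trailing comma").
    parser.add_argument(
        "--password",
        required=True,
        dest="password",
        help="Booking.com account password",
    )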