forked from organicmaps/organicmaps
[python] Refactoring: Changed formatting.
parent 986b4ac0d2
commit 8c0dc3139d
10 changed files with 388 additions and 190 deletions
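The changes below are pure reformatting: argument lists are exploded one per line with trailing commas, single quotes become double quotes, and long lines wrap at 88 columns. That layout matches the output of the black formatter (an assumption — the commit message does not name a tool). A minimal sketch of reproducing one rewrite from the diff through black's Python API, assuming `pip install black`:

    import black

    # One of the old-style calls from the first hunk below.
    OLD_STYLE = '''parser.add_argument("--logfile", default="",
                        help="Name and destination for log file")
    '''

    # black.Mode() defaults to 88-column lines, matching the wrapping in this diff.
    print(black.format_str(OLD_STYLE, mode=black.Mode()))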
@@ -14,20 +14,39 @@ from booking.download_test_data import download_test_data
 def process_options():
     parser = argparse.ArgumentParser(description="Download and process booking hotels.")
     parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("--logfile", default="",
-                        help="Name and destination for log file")
-    parser.add_argument("--password", required=True, dest="password",
-                        help="Booking.com account password")
-    parser.add_argument("--user", required=True, dest="user",
-                        help="Booking.com account user name")
-    parser.add_argument("--threads_count", default=1, type=int,
-                        help="The number of threads for processing countries.")
-    parser.add_argument("--output", required=True, dest="output",
-                        help="Name and destination for output file")
-    parser.add_argument("--country_code", default=None, action="append",
-                        help="Download hotels of this country.")
-    parser.add_argument("--download_test_dataset", default=False,
-                        help="Download dataset for tests.")
+    parser.add_argument(
+        "--logfile", default="", help="Name and destination for log file"
+    )
+    parser.add_argument(
+        "--password",
+        required=True,
+        dest="password",
+        help="Booking.com account password",
+    )
+    parser.add_argument(
+        "--user", required=True, dest="user", help="Booking.com account user name"
+    )
+    parser.add_argument(
+        "--threads_count",
+        default=1,
+        type=int,
+        help="The number of threads for processing countries.",
+    )
+    parser.add_argument(
+        "--output",
+        required=True,
+        dest="output",
+        help="Name and destination for output file",
+    )
+    parser.add_argument(
+        "--country_code",
+        default=None,
+        action="append",
+        help="Download hotels of this country.",
+    )
+    parser.add_argument(
+        "--download_test_dataset", default=False, help="Download dataset for tests."
+    )
     options = parser.parse_args()
     return options
 
@@ -43,16 +62,34 @@ def main():
         logfile = os.path.join(os.path.dirname(os.path.realpath(__file__)), name)
     print(f"Logs saved to {logfile}.", file=sys.stdout)
     if options.threads_count > 1:
-        print(f"Limit requests per minute is {LIMIT_REQUESTS_PER_MINUTE}.", file=sys.stdout)
-    logging.basicConfig(level=logging.DEBUG, filename=logfile,
-                        format="%(thread)d [%(asctime)s] %(levelname)s: %(message)s")
+        print(
+            f"Limit requests per minute is {LIMIT_REQUESTS_PER_MINUTE}.",
+            file=sys.stdout,
+        )
+    logging.basicConfig(
+        level=logging.DEBUG,
+        filename=logfile,
+        format="%(thread)d [%(asctime)s] %(levelname)s: %(message)s",
+    )
     with tqdm(disable=not options.verbose) as progress_bar:
         if options.download_test_dataset:
-            download_test_data(options.country_code, options.user, options.password,
-                               options.output, options.threads_count, progress_bar)
+            download_test_data(
+                options.country_code,
+                options.user,
+                options.password,
+                options.output,
+                options.threads_count,
+                progress_bar,
+            )
         else:
-            download(options.country_code, options.user, options.password,
-                     options.output, options.threads_count, progress_bar)
+            download(
+                options.country_code,
+                options.user,
+                options.password,
+                options.output,
+                options.threads_count,
+                progress_bar,
+            )
 
 
 main()
@@ -16,11 +16,7 @@ MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS = (30, 120)
 
 
 class BookingApi:
-    ENDPOINTS = {
-        "countries": "list",
-        "hotels": "list",
-        "districts": "list"
-    }
+    ENDPOINTS = {"countries": "list", "hotels": "list", "districts": "list"}
 
     def __init__(self, login, password, version):
         major_minor = version.split(".")
@@ -46,9 +42,12 @@ class BookingApi:
             attempts -= 1
             response = None
             try:
-                response = requests.get(f"{self._base_url}/{endpoint}",
-                                        auth=(self._login, self._password),
-                                        params=params, timeout=self._timeout)
+                response = requests.get(
+                    f"{self._base_url}/{endpoint}",
+                    auth=(self._login, self._password),
+                    params=params,
+                    timeout=self._timeout,
+                )
             except requests.exceptions.ReadTimeout:
                 logging.exception("Timeout error.")
                 continue
@@ -60,8 +59,9 @@ class BookingApi:
             try:
                 data = response.json()
             except json.decoder.JSONDecodeError:
-                logging.exception(f"JSON decode error. "
-                                  f"Content: {response.content}")
+                logging.exception(
+                    f"JSON decode error. " f"Content: {response.content}"
+                )
                 continue
 
             if code == 200:
@@ -78,13 +78,14 @@ class BookingApi:
             if code == 429:
                 self._event.clear()
                 wait_seconds = randint(*MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS)
-                logging.warning(f"Http error {code}: {data}. "
-                                f"It waits {wait_seconds} seconds and tries again.")
+                logging.warning(
+                    f"Http error {code}: {data}. "
+                    f"It waits {wait_seconds} seconds and tries again."
+                )
                 sleep(wait_seconds)
                 self._event.set()
             else:
-                raise HTTPError(
-                    f"Http error with code {code}: {data}.")
+                raise HTTPError(f"Http error with code {code}: {data}.")
 
     def _set_endpoints(self):
         for endpoint in BookingApi.ENDPOINTS:
@@ -110,11 +111,10 @@ class BookingListApi:
         return result
 
     def _call_endpoint_offset(self, offset, endpoint, **params):
-        r = self.api.call_endpoint(endpoint, **{
-            "offset": offset,
-            "rows": BookingListApi._ROWS_BY_REQUEST,
-            **params
-        })
+        r = self.api.call_endpoint(
+            endpoint,
+            **{"offset": offset, "rows": BookingListApi._ROWS_BY_REQUEST, **params},
+        )
         if not isinstance(r, list):
             raise TypeError(f"Result has unexpected type {type(r)}")
         return r
@@ -14,9 +14,35 @@ from booking.api.booking_api import BookingApi
 from booking.api.booking_api import BookingListApi
 from booking.api.exceptions import GettingMinPriceError
 
-SUPPORTED_LANGUAGES = ("en", "ru", "ar", "cs", "da", "nl", "fi", "fr", "de",
-                       "hu", "id", "it", "ja", "ko", "pl", "pt", "ro", "es",
-                       "sv", "th", "tr", "uk", "vi", "zh", "he", "sk", "el")
+SUPPORTED_LANGUAGES = (
+    "en",
+    "ru",
+    "ar",
+    "cs",
+    "da",
+    "nl",
+    "fi",
+    "fr",
+    "de",
+    "hu",
+    "id",
+    "it",
+    "ja",
+    "ko",
+    "pl",
+    "pt",
+    "ro",
+    "es",
+    "sv",
+    "th",
+    "tr",
+    "uk",
+    "vi",
+    "zh",
+    "he",
+    "sk",
+    "el",
+)
 
 
 class BookingGen:
@@ -55,11 +81,17 @@ class BookingGen:
         return self.api.hotels(country_ids=self.country_code, **params)
 
     def _download_translations(self):
-        extras = ["hotel_info", ]
+        extras = [
+            "hotel_info",
+        ]
         translations = defaultdict(dict)
         with ThreadPoolExecutor(max_workers=len(SUPPORTED_LANGUAGES)) as executor:
-            m = {executor.submit(self._download_hotels, extras=extras, language=lang): lang
-                 for lang in SUPPORTED_LANGUAGES}
+            m = {
+                executor.submit(
+                    self._download_hotels, extras=extras, language=lang
+                ): lang
+                for lang in SUPPORTED_LANGUAGES
+            }
             for future in as_completed(m):
                 lang = m[future]
                 hotels = future.result()
@@ -68,7 +100,7 @@ class BookingGen:
                     hotel_data = hotel["hotel_data"]
                     translations[hotel_id][lang] = {
                         "name": BookingGen._format_string(hotel_data["name"]),
-                        "address": BookingGen._format_string(hotel_data["address"])
+                        "address": BookingGen._format_string(hotel_data["address"]),
                     }
         return translations
 
@@ -162,7 +194,7 @@ class BookingGen:
             hotel_data["review_score"],
             hotel_data["url"],
             hotel_data["hotel_type_id"],
-            self._get_translations(hotel)
+            self._get_translations(hotel),
         )
         return sep.join(BookingGen._format_string(str(x)) for x in row)
 
@@ -174,8 +206,9 @@ def download_hotels_by_country(api, country):
     return rows
 
 
-def download(country_code, user, password, path, threads_count,
-             progress_bar=tqdm(disable=True)):
+def download(
+    country_code, user, password, path, threads_count, progress_bar=tqdm(disable=True)
+):
     api = BookingApi(user, password, "2.4")
     list_api = BookingListApi(api)
     countries = list_api.countries(languages="en")
@@ -186,8 +219,9 @@ def download(country_code, user, password, path, threads_count,
     progress_bar.total = len(countries)
     with open(path, "w") as f:
         with ThreadPool(threads_count) as pool:
-            for lines in pool.imap_unordered(partial(download_hotels_by_country, list_api),
-                                             countries):
+            for lines in pool.imap_unordered(
+                partial(download_hotels_by_country, list_api), countries
+            ):
                 f.writelines([f"{x}\n" for x in lines])
                 progress_bar.update()
     logging.info(f"Hotels were saved to {path}.")
@@ -49,6 +49,7 @@ class BookingGen:
         )
         return sep.join(BookingGen._format_string(str(x)) for x in row)
 
+
 def create_tsv_header(sep="\t"):
     row = (
         "Hotel ID",
@@ -68,17 +69,18 @@ def download_hotels_by_country(api, district_names, country):
     return rows
 
 
-def download_test_data(country_code, user, password, path, threads_count,
-                       progress_bar=tqdm(disable=True)):
+def download_test_data(
+    country_code, user, password, path, threads_count, progress_bar=tqdm(disable=True)
+):
     logging.info(f"Starting test dataset download.")
     api = BookingApi(user, password, "2.4")
     list_api = BookingListApi(api)
     districts = list_api.districts(languages="en")
     district_names = {}
     for district in districts:
-        for translation in district['translations']:
-            if translation['language'] == 'en':
-                district_names[district['district_id']] = translation['name']
+        for translation in district["translations"]:
+            if translation["language"] == "en":
+                district_names[district["district_id"]] = translation["name"]
     countries = list_api.countries(languages="en")
     if country_code is not None:
         countries = list(filter(lambda x: x["country"] in country_code, countries))
@@ -88,8 +90,9 @@ def download_test_data(country_code, user, password, path, threads_count,
     with open(path, "w") as f:
         f.write(create_tsv_header() + "\n")
         with ThreadPool(threads_count) as pool:
-            for lines in pool.imap_unordered(partial(download_hotels_by_country, list_api, district_names),
-                                             countries):
+            for lines in pool.imap_unordered(
+                partial(download_hotels_by_country, list_api, district_names), countries
+            ):
                 f.writelines([f"{x}\n" for x in lines])
                 progress_bar.update()
     logging.info(f"Hotels test dataset saved to {path}.")
@@ -13,20 +13,36 @@ from descriptions.descriptions_downloader import log
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument("--output_dir", metavar="PATH", type=str,
-                        help="Output dir for saving pages")
-    parser.add_argument("--popularity", metavar="PATH", type=str,
-                        help="File with popular object ids for which we "
-                             "download wikipedia data. If not given, download "
-                             "for all objects.")
-    parser.add_argument('--wikipedia', metavar="PATH", type=str, required=True,
-                        help="Input file with wikipedia url.")
-    parser.add_argument('--wikidata', metavar="PATH", type=str,
-                        help="Input file with wikidata ids.")
-    parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
-                        action='append',
-                        help="Languages for pages. If left blank, pages in all "
-                             "available languages will be loaded.")
+    parser.add_argument(
+        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
+    )
+    parser.add_argument(
+        "--popularity",
+        metavar="PATH",
+        type=str,
+        help="File with popular object ids for which we "
+        "download wikipedia data. If not given, download "
+        "for all objects.",
+    )
+    parser.add_argument(
+        "--wikipedia",
+        metavar="PATH",
+        type=str,
+        required=True,
+        help="Input file with wikipedia url.",
+    )
+    parser.add_argument(
+        "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
+    )
+    parser.add_argument(
+        "--langs",
+        metavar="LANGS",
+        type=str,
+        nargs="+",
+        action="append",
+        help="Languages for pages. If left blank, pages in all "
+        "available languages will be loaded.",
+    )
     return parser.parse_args()
 
 
@@ -28,13 +28,37 @@ CHUNK_SIZE = 16
 REQUEST_ATTEMPTS = 32
 ATTEMPTS_PAUSE_MS = 4000
 
-HEADERS = {f"h{x}" for x in range(1,7)}
+HEADERS = {f"h{x}" for x in range(1, 7)}
 BAD_SECTIONS = {
-    "en": ["External links", "Sources", "See also", "Bibliography", "Further reading", "References"],
+    "en": [
+        "External links",
+        "Sources",
+        "See also",
+        "Bibliography",
+        "Further reading",
+        "References",
+    ],
     "ru": ["Литература", "Ссылки", "См. также", "Библиография", "Примечания"],
-    "de": ["Einzelnachweise", "Weblinks", "Literatur", "Siehe auch", "Anmerkungen", "Anmerkungen und Einzelnachweise", "Filme", "Einzelbelege"],
+    "de": [
+        "Einzelnachweise",
+        "Weblinks",
+        "Literatur",
+        "Siehe auch",
+        "Anmerkungen",
+        "Anmerkungen und Einzelnachweise",
+        "Filme",
+        "Einzelbelege",
+    ],
     "es": ["Vínculos de interés", "Véase también", "Enlaces externos", "Referencias"],
-    "fr": ["Bibliographie", "Lien externe", "Voir aussi", "Liens externes", "Références", "Notes et références", "Articles connexes"]
+    "fr": [
+        "Bibliographie",
+        "Lien externe",
+        "Voir aussi",
+        "Liens externes",
+        "Références",
+        "Notes et références",
+        "Articles connexes",
+    ],
 }
 
 
@@ -45,9 +69,11 @@ def try_get(obj, prop, *args, **kwargs):
             attr = getattr(obj, prop)
             is_method = isinstance(attr, types.MethodType)
             return attr(*args, **kwargs) if is_method else attr
-        except (requests.exceptions.ConnectionError,
-                requests.exceptions.ReadTimeout,
-                json.decoder.JSONDecodeError):
+        except (
+            requests.exceptions.ConnectionError,
+            requests.exceptions.ReadTimeout,
+            json.decoder.JSONDecodeError,
+        ):
             time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
             attempts -= 1
         except urllib.error.HTTPError as e:
@@ -58,8 +84,9 @@ def try_get(obj, prop, *args, **kwargs):
         except urllib.error.URLError:
             raise GettingError(f"URLError: {obj}, {prop}, {args}, {kwargs}")
 
-    raise GettingError(f"Getting {prop} field failed. "
-                       f"All {REQUEST_ATTEMPTS} attempts are spent")
+    raise GettingError(
+        f"Getting {prop} field failed. " f"All {REQUEST_ATTEMPTS} attempts are spent"
+    )
 
 
 def read_popularity(path):
@@ -81,6 +108,7 @@ def should_download_page(popularity_set):
     @functools.wraps(popularity_set)
     def wrapped(ident):
         return popularity_set is None or ident in popularity_set
+
     return wrapped
 
 
@@ -135,8 +163,9 @@ def get_page_info(url):
 
 
 def get_wiki_page(lang, page_name):
-    wiki = wikipediaapi.Wikipedia(language=lang,
-                                  extract_format=wikipediaapi.ExtractFormat.HTML)
+    wiki = wikipediaapi.Wikipedia(
+        language=lang, extract_format=wikipediaapi.ExtractFormat.HTML
+    )
     return wiki.page(page_name)
 
 
@@ -171,11 +200,15 @@ def download(directory, url):
 def get_wiki_langs(url):
     lang, page_name = get_page_info(url)
     page = get_wiki_page(lang, page_name)
-    curr_lang = [(lang, url), ]
+    curr_lang = [
+        (lang, url),
+    ]
     try:
         langlinks = try_get(page, "langlinks")
-        return list(zip(langlinks.keys(),
-                        [link.fullurl for link in langlinks.values()])) + curr_lang
+        return (
+            list(zip(langlinks.keys(), [link.fullurl for link in langlinks.values()]))
+            + curr_lang
+        )
     except GettingError as e:
         log.warning(f"Error: no languages for {url} ({e}).")
         return curr_lang
@@ -209,6 +242,7 @@ def wikipedia_worker(output_dir, checker, langs):
         parsed = urllib.parse.urlparse(url)
         path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
         download_all_from_wikipedia(path, url, langs)
+
     return wrapped
 
 
@@ -228,7 +262,8 @@ def get_wikidata_urls(entity, langs):
         log.exception(f"Sitelinks not found for {entity.id}.")
         return None
     return [
-        entity.data["sitelinks"][k]["url"] for k in keys
+        entity.data["sitelinks"][k]["url"]
+        for k in keys
         if any([k.startswith(lang) for lang in langs])
     ]
 
@@ -259,6 +294,7 @@ def wikidata_worker(output_dir, checker, langs):
         path = os.path.join(output_dir, wikidata_id)
         for url in urls:
             download(path, url)
+
     return wrapped
 
 
@@ -267,8 +303,9 @@ def download_from_wikidata_tags(input_file, output_dir, langs, checker):
     os.makedirs(wikidata_output_dir, exist_ok=True)
     with open(input_file) as file:
         with ThreadPool(processes=WORKERS) as pool:
-            pool.map(wikidata_worker(wikidata_output_dir, checker, langs),
-                     file, CHUNK_SIZE)
+            pool.map(
+                wikidata_worker(wikidata_output_dir, checker, langs), file, CHUNK_SIZE
+            )
 
 
 def check_and_get_checker(popularity_file):
@@ -3,7 +3,9 @@ import json
 import os
 import sys
 
-from post_generation.hierarchy_to_countries import hierarchy_to_countries as hierarchy_to_countries_
+from post_generation.hierarchy_to_countries import (
+    hierarchy_to_countries as hierarchy_to_countries_,
+)
 from post_generation.inject_promo_ids import inject_promo_ids
 from post_generation.localads_mwm_to_csv import create_csv
 
@@ -17,7 +19,8 @@ The post_generation commands are:
     localads_mwm_to_csv    Prepares CSV files for uploading to localads database from mwm files.
     hierarchy_to_countries Produces countries.txt from hierarchy.txt.
    inject_promo_ids       Injects promo osm ids into countries.txt
-""")
+""",
+        )
         parser.add_argument("command", help="Subcommand to run")
         args = parser.parse_args(sys.argv[1:2])
         if not hasattr(self, args.command):
@@ -30,57 +33,71 @@ The post_generation commands are:
     def localads_mwm_to_csv():
         parser = argparse.ArgumentParser(
             description="Prepares CSV files for uploading to localads database "
-                        "from mwm files.")
+            "from mwm files."
+        )
         parser.add_argument("mwm", help="path to mwm files")
         parser.add_argument(
-            "--osm2ft",
-            help="path to osm2ft files (default is the same as mwm)")
-        parser.add_argument("--output",
-                            default=".",
-                            help="path to generated files ('.' by default)")
-        types_default = os.path.join(os.path.dirname(__file__), "..", "..",
-                                     "..", "data", "types.txt")
-        parser.add_argument("--types",
-                            default=types_default,
-                            help="path to omim/data/types.txt")
-        parser.add_argument("--threads",
-                            type=int,
-                            default=1,
-                            help="number of threads to process files")
-        parser.add_argument("--mwm_version", type=int, required=True,
-                            help="Mwm version")
+            "--osm2ft", help="path to osm2ft files (default is the same as mwm)"
+        )
+        parser.add_argument(
+            "--output", default=".", help="path to generated files ('.' by default)"
+        )
+        types_default = os.path.join(
+            os.path.dirname(__file__), "..", "..", "..", "data", "types.txt"
+        )
+        parser.add_argument(
+            "--types", default=types_default, help="path to omim/data/types.txt"
+        )
+        parser.add_argument(
+            "--threads", type=int, default=1, help="number of threads to process files"
+        )
+        parser.add_argument(
+            "--mwm_version", type=int, required=True, help="Mwm version"
+        )
         args = parser.parse_args(sys.argv[2:])
         if not args.osm2ft:
             args.osm2ft = args.mwm
 
-        create_csv(args.output, args.mwm, args.osm2ft, args.types,
-                   args.mwm_version, args.threads)
+        create_csv(
+            args.output,
+            args.mwm,
+            args.osm2ft,
+            args.mwm_version,
+            args.threads,
+        )
 
     @staticmethod
     def hierarchy_to_countries():
         parser = argparse.ArgumentParser(
-            description="Produces countries.txt from hierarchy.txt.")
-        parser.add_argument("--target", required=True,
-                            help="Path to mwm files")
-        parser.add_argument("--hierarchy", required=True,
-                            default="hierarchy.txt",
-                            help="Hierarchy file")
-        parser.add_argument("--old", required=True,
-                            help="old_vs_new.csv file")
-        parser.add_argument("--osm", required=True,
-                            help="borders_vs_osm.csv file")
-        parser.add_argument("--countries_synonyms", required=True,
-                            help="countries_synonyms.csv file")
-        parser.add_argument("--mwm_version", type=int, required=True,
-                            help="Mwm version")
-        parser.add_argument("-o", "--output", required=True,
-                            help="Output countries.txt file (default is stdout)")
+            description="Produces countries.txt from hierarchy.txt."
+        )
+        parser.add_argument("--target", required=True, help="Path to mwm files")
+        parser.add_argument(
+            "--hierarchy", required=True, default="hierarchy.txt", help="Hierarchy file"
+        )
+        parser.add_argument("--old", required=True, help="old_vs_new.csv file")
+        parser.add_argument("--osm", required=True, help="borders_vs_osm.csv file")
+        parser.add_argument(
+            "--countries_synonyms", required=True, help="countries_synonyms.csv file"
+        )
+        parser.add_argument(
+            "--mwm_version", type=int, required=True, help="Mwm version"
+        )
+        parser.add_argument(
+            "-o",
+            "--output",
+            required=True,
+            help="Output countries.txt file (default is stdout)",
+        )
         args = parser.parse_args(sys.argv[2:])
-        countries_json = hierarchy_to_countries_(args.old, args.osm,
-                                                 args.countries_synonyms,
-                                                 args.hierarchy,
-                                                 args.target,
-                                                 args.mwm_version)
+        countries_json = hierarchy_to_countries_(
+            args.old,
+            args.osm,
+            args.countries_synonyms,
+            args.hierarchy,
+            args.target,
+            args.mwm_version,
+        )
         if args.output:
             with open(args.output, "w") as f:
                 f.write(countries_json)
@@ -90,20 +107,29 @@ The post_generation commands are:
     @staticmethod
     def inject_promo_ids():
         parser = argparse.ArgumentParser(
-            description="Injects promo cities osm ids into countries.txt")
+            description="Injects promo cities osm ids into countries.txt"
+        )
         parser.add_argument("--mwm", required=True, help="path to mwm files")
-        parser.add_argument("--types", required=True,
-                            help="path to omim/data/types.txt")
-        parser.add_argument("--promo_cities", required=True,
-                            help="Path to promo cities file")
-        parser.add_argument("--promo_countries", required=True,
-                            help="Path to promo countries file")
-        parser.add_argument("--osm2ft",
-                            help="path to osm2ft files (default is the same as mwm)")
-        parser.add_argument("--countries",
-                            help="path to countries.txt file (default is countries.txt file into mwm directory)")
-        parser.add_argument("--output",
-                            help="Output countries.txt file (default is countries.txt file into mwm directory)")
+        parser.add_argument(
+            "--types", required=True, help="path to omim/data/types.txt"
+        )
+        parser.add_argument(
+            "--promo_cities", required=True, help="Path to promo cities file"
+        )
+        parser.add_argument(
+            "--promo_countries", required=True, help="Path to promo countries file"
+        )
+        parser.add_argument(
+            "--osm2ft", help="path to osm2ft files (default is the same as mwm)"
+        )
+        parser.add_argument(
+            "--countries",
+            help="path to countries.txt file (default is countries.txt file into mwm directory)",
+        )
+        parser.add_argument(
+            "--output",
+            help="Output countries.txt file (default is countries.txt file into mwm directory)",
+        )
         args = parser.parse_args(sys.argv[2:])
 
         if not args.osm2ft:
@@ -116,8 +142,14 @@ The post_generation commands are:
         with open(args.countries) as f:
             countries = json.load(f)
 
-        inject_promo_ids(countries, args.promo_cities, args.promo_countries,
-                         args.mwm, args.types, args.osm2ft)
+        inject_promo_ids(
+            countries,
+            args.promo_cities,
+            args.promo_countries,
+            args.mwm,
+            args.types,
+            args.osm2ft,
+        )
 
         with open(args.output, "w") as f:
             json.dump(countries, f, indent=1)
@@ -109,6 +109,7 @@ def parse_borders_vs_osm(borders_vs_osm_csv_path):
             vsosm[m.group(1)] = [m.group(3)]
     return vsosm
 
+
 def parse_countries_synonyms(countries_synonyms_csv_path):
     countries_synonyms = {}
     if not countries_synonyms_csv_path:
@@ -124,10 +125,15 @@ def parse_countries_synonyms(countries_synonyms_csv_path):
             countries_synonyms[m.group(1)] = [m.group(2)]
     return countries_synonyms
 
-def hierarchy_to_countries(old_vs_new_csv_path, borders_vs_osm_csv_path,
-                           countries_synonyms_csv_path, hierarchy_path,
-                           target_path, version):
+
+def hierarchy_to_countries(
+    old_vs_new_csv_path,
+    borders_vs_osm_csv_path,
+    countries_synonyms_csv_path,
+    hierarchy_path,
+    target_path,
+    version,
+):
     def fill_last(last, stack):
         name = last["id"]
         if not os.path.exists(os.path.join(target_path, f"{name}.mwm")):
@@ -20,11 +20,11 @@ class PromoIds(object):
     def inject_into_country(self, country):
         nodes = self._get_nodes(country)
         with Pool() as pool:
-            proposed_ids = pool.map(self._find, (n["id"] for n in nodes),
-                                    chunksize=1)
+            proposed_ids = pool.map(self._find, (n["id"] for n in nodes), chunksize=1)
 
-        countries_ids = [ids for node_ids in proposed_ids for ids in
-                         node_ids["countries"]]
+        countries_ids = [
+            ids for node_ids in proposed_ids for ids in node_ids["countries"]
+        ]
         if countries_ids:
             country["top_countries_geo_ids"] = countries_ids
 
@@ -35,13 +35,10 @@ class PromoIds(object):
             best = self._choose_best_city(node_ids["cities"])
             node["top_city_geo_id"] = best["id"]
             if best["id"] < 0:
-                node["top_city_geo_id"] += (1 << 64)
+                node["top_city_geo_id"] += 1 << 64
 
     def _find(self, leaf_id):
-        result = {
-            "countries": [],
-            "cities": []
-        }
+        result = {"countries": [], "cities": []}
         ft2osm = load_osm2ft(self.osm2ft_path, leaf_id)
 
         for feature in Mwm(os.path.join(self.mwm_path, leaf_id + ".mwm")):
@@ -71,27 +68,24 @@ class PromoIds(object):
         return mwm_nodes
 
     def _get_city(self, osm_id, types):
-        city = {
-            "id": osm_id,
-            "count_of_guides": self.cities[osm_id],
-            "types": []
-        }
+        city = {"id": osm_id, "count_of_guides": self.cities[osm_id], "types": []}
 
         for t in types:
             if t.startswith("place"):
                 city["types"].append(t)
 
         if not city["types"]:
-            logging.error(f"Incorrect types for sponsored-promo_catalog "
-                          f"feature osm_id {osm_id}")
+            logging.error(
+                f"Incorrect types for sponsored-promo_catalog "
+                f"feature osm_id {osm_id}"
+            )
             sys.exit(3)
 
         return city
 
     def _choose_best_city(self, proposed_cities):
         def key_compare(city):
-            return city["count_of_guides"], self._score_city_types(
-                city["types"])
+            return city["count_of_guides"], self._score_city_types(city["types"])
 
         return max(proposed_cities, key=key_compare)
 
@@ -133,10 +127,20 @@ def load_osm2ft(osm2ft_path, mwm_id):
         return read_osm2ft(f, ft2osm=True, tuples=False)
 
 
-def inject_promo_ids(countries_json, promo_cities_path, promo_countries_path,
-                     mwm_path, types_path, osm2ft_path):
-    promo_ids = PromoIds(load_promo_ids(promo_countries_path),
-                         load_promo_ids(promo_cities_path), mwm_path,
-                         types_path, osm2ft_path)
+def inject_promo_ids(
+    countries_json,
+    promo_cities_path,
+    promo_countries_path,
+    mwm_path,
+    types_path,
+    osm2ft_path,
+):
+    promo_ids = PromoIds(
+        load_promo_ids(promo_countries_path),
+        load_promo_ids(promo_cities_path),
+        mwm_path,
+        types_path,
+        osm2ft_path,
+    )
     for country in countries_json["g"]:
         promo_ids.inject_into_country(country)
@@ -18,12 +18,29 @@ HEADERS = {
     "mwm": "mwm_id name mwm_version".split(),
 }
 QUEUES = {name: Queue() for name in HEADERS}
-GOOD_TYPES = ("amenity", "shop", "tourism", "leisure", "sport",
-              "craft", "man_made", "office", "historic",
-              "aeroway", "natural-beach", "natural-peak", "natural-volcano",
-              "natural-spring", "natural-cave_entrance",
-              "waterway-waterfall", "place-island", "railway-station",
-              "railway-halt", "aerialway-station", "building-train_station")
+GOOD_TYPES = (
+    "amenity",
+    "shop",
+    "tourism",
+    "leisure",
+    "sport",
+    "craft",
+    "man_made",
+    "office",
+    "historic",
+    "aeroway",
+    "natural-beach",
+    "natural-peak",
+    "natural-volcano",
+    "natural-spring",
+    "natural-cave_entrance",
+    "waterway-waterfall",
+    "place-island",
+    "railway-station",
+    "railway-halt",
+    "aerialway-station",
+    "building-train_station",
+)
 SOURCE_TYPES = {"osm": 0, "booking": 1}
 
 
@@ -49,20 +66,28 @@ def parse_mwm(mwm_name, osm2ft_name, override_version):
         if metadata is not None and MetadataField.sponsored_id in metadata:
             for t in readable_types:
                 if t.startswith("sponsored-"):
-                    QUEUES["sponsored"].put((metadata[MetadataField.sponsored_id],
-                                             feature.index(),
-                                             mwm_id,
-                                             version,
-                                             SOURCE_TYPES[t[t.find("-") + 1:]]))
+                    QUEUES["sponsored"].put(
+                        (
+                            metadata[MetadataField.sponsored_id],
+                            feature.index(),
+                            mwm_id,
+                            version,
+                            SOURCE_TYPES[t[t.find("-") + 1 :]],
+                        )
+                    )
                     break
         else:
             for t in readable_types:
                 if t.startswith(GOOD_TYPES):
-                    QUEUES["mapping"].put((ctypes.c_long(osm_id).value,
-                                           feature.index(),
-                                           mwm_id,
-                                           version,
-                                           SOURCE_TYPES["osm"]))
+                    QUEUES["mapping"].put(
+                        (
+                            ctypes.c_long(osm_id).value,
+                            feature.index(),
+                            mwm_id,
+                            version,
+                            SOURCE_TYPES["osm"],
+                        )
+                    )
                     break
 
 
@@ -87,7 +112,11 @@ def create_csv(output, mwm_path, osm2ft_path, version, threads):
 
     pool = Pool(processes=threads)
    for mwm_name in os.listdir(mwm_path):
-        if "World" in mwm_name or "minsk_pass" in mwm_name or not mwm_name.endswith(".mwm"):
+        if (
+            "World" in mwm_name
+            or "minsk_pass" in mwm_name
+            or not mwm_name.endswith(".mwm")
+        ):
             continue
         osm2ft_name = os.path.join(osm2ft_path, os.path.basename(mwm_name) + ".osm2ft")
         if not os.path.exists(osm2ft_name):
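Because the commit is intended to change formatting only, its safety can be sanity-checked by comparing each file's AST before and after the rewrite; black's default --safe mode performs an equivalent check internally. A minimal sketch, where old_path and new_path are hypothetical paths to a file's pre- and post-commit contents (e.g. extracted with `git show`):

    import ast

    def same_ast(old_path, new_path):
        # A formatting-only change must leave the parsed module identical.
        with open(old_path) as old, open(new_path) as new:
            return ast.dump(ast.parse(old.read())) == ast.dump(ast.parse(new.read()))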