From ce150f2169b1539eda6e845230102c8eca6e4d3a Mon Sep 17 00:00:00 2001 From: Maksim Andrianov Date: Fri, 12 Apr 2019 17:22:57 +0300 Subject: [PATCH] [generator] New structure python projects. Refactored booking. --- tools/python/booking/__init__.py | 0 tools/python/booking/api/__init__.py | 0 tools/python/booking/api/booking_api.py | 119 ++++++++++++++ tools/python/booking/api/exceptions.py | 14 ++ .../download_hotels.py} | 144 ++--------------- tools/python/booking/requirements.txt | 4 + tools/python/eviltransform.py | 149 ------------------ tools/unix/generate_planet.sh | 2 +- 8 files changed, 147 insertions(+), 285 deletions(-) create mode 100644 tools/python/booking/__init__.py create mode 100644 tools/python/booking/api/__init__.py create mode 100644 tools/python/booking/api/booking_api.py create mode 100644 tools/python/booking/api/exceptions.py rename tools/python/{booking_hotels.py => booking/download_hotels.py} (67%) create mode 100644 tools/python/booking/requirements.txt delete mode 100644 tools/python/eviltransform.py diff --git a/tools/python/booking/__init__.py b/tools/python/booking/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tools/python/booking/api/__init__.py b/tools/python/booking/api/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tools/python/booking/api/booking_api.py b/tools/python/booking/api/booking_api.py new file mode 100644 index 0000000000..4a854ac657 --- /dev/null +++ b/tools/python/booking/api/booking_api.py @@ -0,0 +1,119 @@ +import logging +from functools import partial +from random import randint +from threading import Event +from time import sleep + +import requests +from ratelimit import limits, sleep_and_retry + +from .exceptions import AttemptsSpentError, HTTPError + +LIMIT_REQUESTS_PER_MINUTE = 400 +ATTEMPTS_COUNT = 10 +MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS = (30, 120) + + +class BookingApi: + ENDPOINTS = { + "countries": "list", + "hotels": "list" + } + + def __init__(self, login, password, version): + major_minor = version.split(".") + assert len(major_minor) == 2 + assert int(major_minor[0]) >= 2 + assert 0 <= int(major_minor[1]) <= 4 + + self._event = Event() + self._event.set() + self._timeout = 5 * 60 # in seconds + self._login = login + self._password = password + self._base_url = f"https://distribution-xml.booking.com/{version}/json" + self._set_endpoints() + + @sleep_and_retry + @limits(calls=LIMIT_REQUESTS_PER_MINUTE, period=60) + def call_endpoint(self, endpoint, **params): + self._event.wait() + try: + attempts = ATTEMPTS_COUNT + while attempts: + attempts -= 1 + response = None + try: + response = requests.get(f"{self._base_url}/{endpoint}", + auth=(self._login, self._password), + params=params, timeout=self._timeout) + except requests.exceptions.ReadTimeout: + logging.exception("Timeout error.") + continue + if response.status_code == 200: + data = response.json() + return data["result"] + else: + self._handle_errors(response) + raise AttemptsSpentError(f"{ATTEMPTS_COUNT} attempts were spent.") + except Exception as e: + if not self._event.is_set(): + self._event.set() + raise e + + def _handle_errors(self, response): + error_message = "" + data = response.json() + try: + error_message = ",".join(x["message"] for x in data["errors"]) + except KeyError: + error_message = data + + if response.status_code == 429: + self._event.clear() + wait_seconds = randint(*MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS) + logging.warning(f"Http error {response.status_code}: {error_message}. " + f"It waits {wait_seconds} seconds and tries again.") + sleep(wait_seconds) + self._event.set() + else: + raise HTTPError( + f"Http error with code {response.status_code}: {error_message}.") + + def _set_endpoints(self): + for endpoint in BookingApi.ENDPOINTS: + setattr(self, endpoint, partial(self.call_endpoint, endpoint)) + + +class BookingListApi: + _ROWS_BY_REQUEST = 1000 + + def __init__(self, api): + self.api = api + self._set_endpoints() + + def call_endpoint(self, endpoint, **params): + result = [] + offset = 0 + while True: + resp = self._call_endpoint_offset(offset, endpoint, **params) + result.extend(resp) + if len(resp) < BookingListApi._ROWS_BY_REQUEST: + break + offset += BookingListApi._ROWS_BY_REQUEST + return result + + def _call_endpoint_offset(self, offset, endpoint, **params): + r = self.api.call_endpoint(endpoint, **{ + "offset": offset, + "rows": BookingListApi._ROWS_BY_REQUEST, + **params + }) + if not isinstance(r, list): + raise TypeError(f"Result has unexpected type {type(r)}") + return r + + def _set_endpoints(self): + for endpoint in BookingApi.ENDPOINTS: + if BookingApi.ENDPOINTS[endpoint] == "list": + setattr(self, endpoint, partial(self.call_endpoint, endpoint)) \ No newline at end of file diff --git a/tools/python/booking/api/exceptions.py b/tools/python/booking/api/exceptions.py new file mode 100644 index 0000000000..784149ea99 --- /dev/null +++ b/tools/python/booking/api/exceptions.py @@ -0,0 +1,14 @@ +class BookingError(Exception): + pass + + +class HTTPError(BookingError): + pass + + +class AttemptsSpentError(BookingError): + pass + + +class GettingMinPriceError(BookingError): + pass diff --git a/tools/python/booking_hotels.py b/tools/python/booking/download_hotels.py similarity index 67% rename from tools/python/booking_hotels.py rename to tools/python/booking/download_hotels.py index ed036eecca..01d2ab698e 100755 --- a/tools/python/booking_hotels.py +++ b/tools/python/booking/download_hotels.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf8 import argparse import datetime import logging @@ -10,145 +9,19 @@ from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial from multiprocessing.pool import ThreadPool -from random import randint -from threading import Event -from time import sleep -import eviltransform import math -import requests -from ratelimit import limits, sleep_and_retry +from eviltransform import gcj2wgs_exact from tqdm import tqdm -LIMIT_REQUESTS_PER_MINUTE = 400 -ATTEMPTS_COUNT = 10 -MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS = (30, 120) +from api.booking_api import BookingApi, BookingListApi, LIMIT_REQUESTS_PER_MINUTE +from api.exceptions import GettingMinPriceError + SUPPORTED_LANGUAGES = ("en", "ru", "ar", "cs", "da", "nl", "fi", "fr", "de", "hu", "id", "it", "ja", "ko", "pl", "pt", "ro", "es", "sv", "th", "tr", "uk", "vi", "zh", "he", "sk", "el") -class AppError(Exception): - pass - - -class HTTPError(AppError): - pass - - -class AttemptsSpentError(AppError): - pass - - -class GettingMinPriceError(AppError): - pass - - -class BookingApi: - ENDPOINTS = { - "countries": "list", - "hotels": "list" - } - - def __init__(self, login, password, version): - major_minor = version.split(".") - assert len(major_minor) == 2 - assert int(major_minor[0]) >= 2 - assert 0 <= int(major_minor[1]) <= 4 - - self._event = Event() - self._event.set() - self._timeout = 5 * 60 # in seconds - self._login = login - self._password = password - self._base_url = f"https://distribution-xml.booking.com/{version}/json" - self._set_endpoints() - - @sleep_and_retry - @limits(calls=LIMIT_REQUESTS_PER_MINUTE, period=60) - def call_endpoint(self, endpoint, **params): - self._event.wait() - try: - attempts = ATTEMPTS_COUNT - while attempts: - attempts -= 1 - response = None - try: - response = requests.get(f"{self._base_url}/{endpoint}", - auth=(self._login, self._password), - params=params, timeout=self._timeout) - except requests.exceptions.ReadTimeout: - logging.exception("Timeout error.") - continue - if response.status_code == 200: - data = response.json() - return data["result"] - else: - self._handle_errors(response) - raise AttemptsSpentError(f"{ATTEMPTS_COUNT} attempts were spent.") - except Exception as e: - if not self._event.is_set(): - self._event.set() - raise e - - def _handle_errors(self, response): - error_message = "" - data = response.json() - try: - error_message = ",".join(x["message"] for x in data["errors"]) - except KeyError: - error_message = data - - if response.status_code == 429: - self._event.clear() - wait_seconds = randint(*MINMAX_LIMIT_WAIT_AFTER_429_ERROR_SECONDS) - logging.warning(f"Http error {response.status_code}: {error_message}. " - f"It waits {wait_seconds} seconds and tries again.") - sleep(wait_seconds) - self._event.set() - else: - raise HTTPError( - f"Http error with code {response.status_code}: {error_message}.") - - def _set_endpoints(self): - for endpoint in BookingApi.ENDPOINTS: - setattr(self, endpoint, partial(self.call_endpoint, endpoint)) - - -class BookingListApi: - _ROWS_BY_REQUEST = 1000 - - def __init__(self, api): - self.api = api - self._set_endpoints() - - def call_endpoint(self, endpoint, **params): - result = [] - offset = 0 - while True: - resp = self._call_endpoint_offset(offset, endpoint, **params) - result.extend(resp) - if len(resp) < BookingListApi._ROWS_BY_REQUEST: - break - offset += BookingListApi._ROWS_BY_REQUEST - return result - - def _call_endpoint_offset(self, offset, endpoint, **params): - r = self.api.call_endpoint(endpoint, **{ - "offset": offset, - "rows": BookingListApi._ROWS_BY_REQUEST, - **params - }) - if not isinstance(r, list): - raise TypeError(f"Result has unexpected type {type(r)}") - return r - - def _set_endpoints(self): - for endpoint in BookingApi.ENDPOINTS: - if BookingApi.ENDPOINTS[endpoint] == "list": - setattr(self, endpoint, partial(self.call_endpoint, endpoint)) - - class BookingGen: def __init__(self, api, country): self.api = api @@ -210,7 +83,7 @@ class BookingGen: hotel_data = hotel["hotel_data"] location = hotel_data["location"] try: - location["latitude"], location["longitude"] = eviltransform.gcj2wgs_exact( + location["latitude"], location["longitude"] = gcj2wgs_exact( float(location["latitude"]), float(location["longitude"]) ) except ValueError: @@ -304,13 +177,15 @@ def download_hotels_by_country(api, country): return rows -def download(country_code, user, password, path, threads_count, progress_bar): +def download(country_code, user, password, path, threads_count, + progress_bar=tqdm(disable=True)): api = BookingApi(user, password, "2.4") list_api = BookingListApi(api) countries = list_api.countries(languages="en") if country_code is not None: countries = list(filter(lambda x: x["country"] in country_code, countries)) logging.info(f"There is {len(countries)} countries.") + progress_bar.desc = "Countries" progress_bar.total = len(countries) with open(path, "w") as f: with ThreadPool(threads_count) as pool: @@ -323,8 +198,7 @@ def download(country_code, user, password, path, threads_count, progress_bar): def process_options(): parser = argparse.ArgumentParser(description="Download and process booking hotels.") - parser.add_argument("-q", "--quiet", action="store_false", dest="verbose") - parser.add_argument("-v", "--verbose", action="store_true", dest="verbose") + parser.add_argument("-v", "--verbose", action="store_true") parser.add_argument("--logfile", default="", help="Name and destination for log file") parser.add_argument("--password", required=True, dest="password", diff --git a/tools/python/booking/requirements.txt b/tools/python/booking/requirements.txt new file mode 100644 index 0000000000..4e17d841e3 --- /dev/null +++ b/tools/python/booking/requirements.txt @@ -0,0 +1,4 @@ +eviltransform +ratelimit +requests +tqdm diff --git a/tools/python/eviltransform.py b/tools/python/eviltransform.py deleted file mode 100644 index 670ac93fbe..0000000000 --- a/tools/python/eviltransform.py +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Source: https://github.com/googollee/eviltransform -# Published under 2-clause BSD license -# Copyright (c) 2015, Googol Lee , @gutenye, @xingxing, @bewantbe, -# @GhostFlying, @larryli, @gumblex,@lbt05, @chenweiyj - -import math - - -__all__ = ['wgs2gcj', 'gcj2wgs', 'gcj2wgs_exact', - 'distance', 'gcj2bd', 'bd2gcj', 'wgs2bd', 'bd2wgs'] - -earthR = 6378137.0 - -def outOfChina(lat, lng): - return not (72.004 <= lng <= 137.8347 and 0.8293 <= lat <= 55.8271) - - -def transform(x, y): - xy = x * y - absX = math.sqrt(abs(x)) - xPi = x * math.pi - yPi = y * math.pi - d = 20.0*math.sin(6.0*xPi) + 20.0*math.sin(2.0*xPi) - - lat = d - lng = d - - lat += 20.0*math.sin(yPi) + 40.0*math.sin(yPi/3.0) - lng += 20.0*math.sin(xPi) + 40.0*math.sin(xPi/3.0) - - lat += 160.0*math.sin(yPi/12.0) + 320*math.sin(yPi/30.0) - lng += 150.0*math.sin(xPi/12.0) + 300.0*math.sin(xPi/30.0) - - lat *= 2.0 / 3.0 - lng *= 2.0 / 3.0 - - lat += -100.0 + 2.0*x + 3.0*y + 0.2*y*y + 0.1*xy + 0.2*absX - lng += 300.0 + x + 2.0*y + 0.1*x*x + 0.1*xy + 0.1*absX - - return lat, lng - - -def delta(lat, lng): - ee = 0.00669342162296594323 - dLat, dLng = transform(lng-105.0, lat-35.0) - radLat = lat / 180.0 * math.pi - magic = math.sin(radLat) - magic = 1 - ee * magic * magic - sqrtMagic = math.sqrt(magic) - dLat = (dLat * 180.0) / ((earthR * (1 - ee)) / (magic * sqrtMagic) * math.pi) - dLng = (dLng * 180.0) / (earthR / sqrtMagic * math.cos(radLat) * math.pi) - return dLat, dLng - - -def wgs2gcj(wgsLat, wgsLng): - if outOfChina(wgsLat, wgsLng): - return wgsLat, wgsLng - else: - dlat, dlng = delta(wgsLat, wgsLng) - return wgsLat + dlat, wgsLng + dlng - - -def gcj2wgs(gcjLat, gcjLng): - if outOfChina(gcjLat, gcjLng): - return gcjLat, gcjLng - else: - dlat, dlng = delta(gcjLat, gcjLng) - return gcjLat - dlat, gcjLng - dlng - - -def gcj2wgs_exact(gcjLat, gcjLng): - initDelta = 0.01 - threshold = 0.000001 - dLat = dLng = initDelta - mLat = gcjLat - dLat - mLng = gcjLng - dLng - pLat = gcjLat + dLat - pLng = gcjLng + dLng - for i in range(30): - wgsLat = (mLat + pLat) / 2 - wgsLng = (mLng + pLng) / 2 - tmplat, tmplng = wgs2gcj(wgsLat, wgsLng) - dLat = tmplat - gcjLat - dLng = tmplng - gcjLng - if abs(dLat) < threshold and abs(dLng) < threshold: - return wgsLat, wgsLng - if dLat > 0: - pLat = wgsLat - else: - mLat = wgsLat - if dLng > 0: - pLng = wgsLng - else: - mLng = wgsLng - return wgsLat, wgsLng - - -def distance(latA, lngA, latB, lngB): - pi180 = math.pi / 180 - arcLatA = latA * pi180 - arcLatB = latB * pi180 - x = (math.cos(arcLatA) * math.cos(arcLatB) * - math.cos((lngA - lngB) * pi180)) - y = math.sin(arcLatA) * math.sin(arcLatB) - s = x + y - if s > 1: - s = 1 - if s < -1: - s = -1 - alpha = math.acos(s) - distance = alpha * earthR - return distance - - -def gcj2bd(gcjLat, gcjLng): - if outOfChina(gcjLat, gcjLng): - return gcjLat, gcjLng - - x = gcjLng - y = gcjLat - z = math.hypot(x, y) + 0.00002 * math.sin(y * math.pi) - theta = math.atan2(y, x) + 0.000003 * math.cos(x * math.pi) - bdLng = z * math.cos(theta) + 0.0065 - bdLat = z * math.sin(theta) + 0.006 - return bdLat, bdLng - - -def bd2gcj(bdLat, bdLng): - if outOfChina(bdLat, bdLng): - return bdLat, bdLng - - x = bdLng - 0.0065 - y = bdLat - 0.006 - z = math.hypot(x, y) - 0.00002 * math.sin(y * math.pi) - theta = math.atan2(y, x) - 0.000003 * math.cos(x * math.pi) - gcjLng = z * math.cos(theta) - gcjLat = z * math.sin(theta) - return gcjLat, gcjLng - - -def wgs2bd(wgsLat, wgsLng): - return gcj2bd(*wgs2gcj(wgsLat, wgsLng)) - - -def bd2wgs(bdLat, bdLng): - return gcj2wgs(*bd2gcj(bdLat, bdLng)) diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh index d95aeb8167..4b61212543 100755 --- a/tools/unix/generate_planet.sh +++ b/tools/unix/generate_planet.sh @@ -231,7 +231,7 @@ LOCALADS_SCRIPT="$PYTHON_SCRIPTS_PATH/local_ads/mwm_to_csv_4localads.py" UGC_FILE="${UGC_FILE:-$INTDIR/ugc_db.sqlite3}" POPULAR_PLACES_FILE="${POPULAR_PLACES_FILE:-$INTDIR/popular_places.csv}" WIKIDATA_FILE="${WIKIDATA_FILE:-$INTDIR/idToWikidata.csv}" -BOOKING_SCRIPT="$PYTHON_SCRIPTS_PATH/booking_hotels.py" +BOOKING_SCRIPT="$PYTHON_SCRIPTS_PATH/booking/download_hotels.py" BOOKING_FILE="${BOOKING_FILE:-$INTDIR/hotels.csv}" OPENTABLE_SCRIPT="$PYTHON_SCRIPTS_PATH/opentable_restaurants.py" OPENTABLE_FILE="${OPENTABLE_FILE:-$INTDIR/restaurants.csv}"