From 1ac780c71dfd28a14233c03bf7a46c7506b37b64 Mon Sep 17 00:00:00 2001 From: Sergey Yershov Date: Tue, 24 May 2016 12:26:44 +0300 Subject: [PATCH 1/9] [booking] Script for fetch data from booking.com --- tools/python/booking_hotels.py | 196 +++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100755 tools/python/booking_hotels.py diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py new file mode 100755 index 0000000000..24e9adbdc0 --- /dev/null +++ b/tools/python/booking_hotels.py @@ -0,0 +1,196 @@ +#!/usr/bin/python +# coding: utf8 +from __future__ import print_function + +import json +import urllib2 +import base64 +from datetime import datetime +import time +import logging +import pickle +import os +import argparse +from collections import namedtuple, defaultdict + +# init logging +logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s') + +Hotel = namedtuple('Hotel', + ['id', 'lat', 'lon', 'name', 'address', + 'stars', 'priceCategory', 'ratingBooking', + 'ratingUser', 'descUrl']) + + +class BookingApi: + def __init__(self, login, password): + self.login = login + self.password = password + self.baseConfig = { + "headers": { + "Content-Type": "application/json", + "Authorization": "Basic " + base64.encodestring( + "{login}:{password}".format(login=self.login, password=self.password)).replace('\n', '') + }, + "url": 'https://distribution-xml.booking.com/json/bookings'} + self.checkMinute = 0 + self.requestPerMinute = 0 + self.requestLimit = 15 # request per minute + + def call(self, function, params=None): + self.requestPerMinute += 1 + now = datetime.utcnow() + + if self.requestPerMinute >= self.requestLimit: + waittime = 60 - now.second + logging.warning("Limit for request per minute exceeded. 
Wait for: {0} sec.".format(waittime)) + time.sleep(waittime) + now = datetime.utcnow() + + if self.checkMinute != now.minute: + self.requestPerMinute = 0 + self.checkMinute = now.minute + + payload = '' + try: + p = "" if not params else '?' + "&".join( + ["{key}={value}".format(key=k, value=v) for (k, v) in params.iteritems()]) + url = "{base}.{func}{params}".format(base=self.baseConfig["url"], func=function, params=p) + logging.debug("{0} {1} API call:{2}".format(self.checkMinute, self.requestPerMinute, url)) + request = urllib2.Request(url, None, self.baseConfig["headers"]) + stream = urllib2.urlopen(request) + payload = stream.read() + return json.loads(payload) + + except Exception as e: + logging.error('Error: {0} Context: {1}'.format(e, payload)) + return None + + +def make_record(src, rate): + return Hotel( + int(src['hotel_id']), + float(src['location']['latitude']), + float(src['location']['longitude']), + src['name'], + src['address'], + int(src['class']), + rate, + src['ranking'], + src['review_score'], + src['url'] + ) + + +def download(user, password, path): + api = BookingApi(user, password) + + maxrows = 1000 + countries = api.call("getCountries", dict(languagecodes='en')) + for country in countries: + countrycode = country['countrycode'] + logging.info(u'{0} {1}'.format(countrycode, country['name'])) + + counter = 0 + allhotels = [] + while True: + hotels = api.call('getHotels', + dict(new_hotel_type=1, offset=counter, rows=maxrows, countrycodes=countrycode)) + if isinstance(hotels, dict) and 'ruid' in hotels: + logging.error('{0} Code: {1}'.format(hotels['message'], hotels['code'])) + exit(1) + + for hotel in hotels: + allhotels.append(hotel) + + counter += len(hotels) + + if len(hotels) < maxrows: + break + + logging.info('Total hotels: {0}'.format(len(allhotels))) + filename = os.path.join(path, + '{0} - {1}.pkl'.format(country['area'].encode('utf8'), country['name'].encode('utf8'))) + with open(filename, 'wb') as fd: + pickle.dump(allhotels, 
fd, pickle.HIGHEST_PROTOCOL) + + +def translate(source, output): + files = [] + data = [] + + for filename in os.listdir(source): + if filename.endswith(".pkl"): + files.append(filename) + + for filename in files: + logging.info('Processing {0}'.format(filename)) + with open(filename, 'rb') as fd: + data += pickle.load(fd) + + # Dict of dicts city_id -> { currency -> [prices] } + cities = defaultdict(lambda: defaultdict(list)) + + # Collect prices + for hotel in data: + if 'city_id' in hotel and 'currencycode' in hotel and 'minrate' in hotel and hotel['minrate'] is not None: + cities[hotel['city_id']][hotel['currencycode']].append(float(hotel['minrate'])) + + # Find median prices + for city in cities: + for cur in cities[city]: + cities[city][cur] = sorted(cities[city][cur])[len(cities[city][cur]) / 2] + + # Price rate ranges, relative to the median price for a city + rates = (0.7, 1.3) + + with open(output, 'w') as fd: + for hotel in data: + rate = 0 + if 'city_id' in hotel and 'currencycode' in hotel and 'minrate' in hotel and hotel['minrate'] is not None: + avg = cities[hotel['city_id']][hotel['currencycode']] + price = float(hotel['minrate']) + rate = 1 + while rate <= len(rates) and price > avg * rates[rate - 1]: + rate += 1 + cur = make_record(hotel, rate) + l = [(str(e) if e else '') if not isinstance(e, unicode) else e.encode('utf8') for e in cur] + print('\t'.join(l), file=fd) + + +def process_options(): + parser = argparse.ArgumentParser(description='Download and process booking hotels.') + parser.add_argument("-v", "--verbose", action="store_true", dest="verbose") + parser.add_argument("-q", "--quiet", action="store_false", dest="verbose") + + parser.add_argument("--password", dest="password", help="Booking.com account password") + parser.add_argument("--user", dest="user", help="Booking.com account user name") + + parser.add_argument("--path", dest="path", help="path to data files") + parser.add_argument("--output", dest="output", help="Name and 
destination for output file") + + parser.add_argument("--download", action="store_true", dest="download", default=False) + parser.add_argument("--translate", action="store_true", dest="translate", default=False) + + options = parser.parse_args() + + if not options.download and not options.translate: + parser.print_help() + + if options.translate and not options.output: + print("--output isn't set") + exit() + + return options + + +def main(): + options = process_options() + if options.download: + download(options.user, options.password, options.path) + if options.translate: + translate(options.path, options.output) + + +if __name__ == "__main__": + main() From 706e4467f3ac2cc490cf51660bc1c62733c438c0 Mon Sep 17 00:00:00 2001 From: Sergey Yershov Date: Tue, 24 May 2016 12:32:19 +0300 Subject: [PATCH 2/9] [booking] Support data from booking.com --- generator/booking_dataset.cpp | 252 ++++++++++++++++++ generator/booking_dataset.hpp | 70 +++++ generator/generate_info.hpp | 2 + generator/generator.pro | 2 + generator/generator_tool/generator_tool.cpp | 2 + generator/osm_element.cpp | 5 +- generator/osm_source.cpp | 14 + indexer/search_string_utils.hpp | 1 + .../generator.xcodeproj/project.pbxproj | 8 + 9 files changed, 355 insertions(+), 1 deletion(-) create mode 100644 generator/booking_dataset.cpp create mode 100644 generator/booking_dataset.hpp diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp new file mode 100644 index 0000000000..4a488cda86 --- /dev/null +++ b/generator/booking_dataset.cpp @@ -0,0 +1,252 @@ +#include "generator/booking_dataset.hpp" + +#include "base/string_utils.hpp" + +#include "geometry/distance_on_sphere.hpp" + +#include "indexer/search_delimiters.hpp" +#include "indexer/search_string_utils.hpp" + +#include "std/fstream.hpp" +#include "std/iostream.hpp" +#include "std/sstream.hpp" + +BookingDataset::BookingHotel::BookingHotel(string const & src) +{ + stringstream ss(src); + string elem; + vector rec(FieldsCount()); + 
for (size_t i = 0; getline(ss, elem, '\t') && i < rec.size(); ++i) + rec[i] = elem; + + id = static_cast(strtoul(rec[Index(Fields::Id)].c_str(), nullptr, 10)); + + lat = strtod(rec[Index(Fields::Latitude)].c_str(), nullptr); + lon = strtod(rec[Index(Fields::Longtitude)].c_str(), nullptr); + name = rec[Index(Fields::Name)]; + address = rec[Index(Fields::Address)]; + + stars = rec[Index(Fields::Stars)].empty() + ? 0 + : static_cast(strtoul(rec[Index(Fields::Stars)].c_str(), nullptr, 10)); + + priceCategory = + rec[Index(Fields::PriceCategory)].empty() + ? 0 + : static_cast(strtoul(rec[Index(Fields::PriceCategory)].c_str(), nullptr, 10)); + + ratingBooking = rec[Index(Fields::RatingBooking)].empty() + ? 0 + : strtod(rec[Index(Fields::RatingBooking)].c_str(), nullptr); + + ratingUser = rec[Index(Fields::RatingUsers)].empty() + ? 0 + : strtod(rec[Index(Fields::RatingUsers)].c_str(), nullptr); + + descUrl = rec[Index(Fields::DescUrl)]; + + type = rec[Index(Fields::Type)].empty() + ? 0 + : static_cast(strtoul(rec[Index(Fields::Type)].c_str(), nullptr, 10)); +} + +ostream & operator<<(ostream & s, BookingDataset::BookingHotel const & h) +{ + return s << "Name: " << h.name << " lon: " << h.lon << " lat: " << h.lat; +} + +void BookingDataset::LoadBookingHotels(string const & path) +{ + m_hotels.clear(); + + if(path.empty()) + return; + + ifstream src(path); + for (string elem; getline(src, elem);) + m_hotels.emplace_back(elem); +} + +BookingDataset::BookingDataset(string const & dataPath) +{ + LoadBookingHotels(dataPath); + + size_t counter = 0; + for (auto const & hotel : m_hotels) + { + TBox b(TPoint(hotel.lon, hotel.lat), TPoint(hotel.lon, hotel.lat)); + m_rtree.insert(std::make_pair(b, counter++)); + } +} + +bool CheckForValues(string const & value) +{ + for (char const * val : + {"hotel", "apartment", "camp_site", "chalet", "guest_house", "hostel", "motel", "resort"}) + { + if (value == val) + return true; + } + return false; +} + +bool 
BookingDataset::MatchWithBooking(OsmElement const & e) const +{ + string name; + for (auto const & tag : e.Tags()) + { + if (tag.key == "name") + { + name = tag.value; + break; + } + } + + if (name.empty()) + return false; + + // Find 3 nearest values to a point. + vector result; + for_each(boost::geometry::index::qbegin(m_rtree, + boost::geometry::index::nearest(TPoint(e.lon, e.lat), 3)), + boost::geometry::index::qend(m_rtree), [&](TValue const & v) + { + auto const & hotel = m_hotels[v.second]; + double dist = ms::DistanceOnEarth(e.lon, e.lat, hotel.lon, hotel.lat); + if (dist > 150 /* max distance in meters */) + return; + + result.emplace_back(v); + }); + + if (result.empty()) + return false; + + // Match name. + vector osmTokens; + NormalizeAndTokenizeString(name, osmTokens, search::Delimiters()); + + // cout << "\n------------- " << name << endl; + + bool matched = false; + for (auto const & e : result) + { + vector bookingTokens; + NormalizeAndTokenizeString(m_hotels[e.second].name, bookingTokens, search::Delimiters()); + + map>> weightPair; + + for (size_t j = 0; j < osmTokens.size(); ++j) + { + for (size_t i = 0; i < bookingTokens.size(); ++i) + { + size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(), + bookingTokens[i].begin(), bookingTokens[i].end()); + if (distance < 3) + weightPair[distance].emplace_back(i, j); + } + } + + if (!weightPair.empty()) + { + // cout << m_hotels[e.second] << endl; + matched = true; + } + } + return matched; +} + +bool BookingDataset::Filter(OsmElement const & e) const +{ + if (e.type != OsmElement::EntityType::Node) + return false; + + if (e.Tags().empty()) + return false; + + bool matched = false; + for (auto const & tag : e.Tags()) + { + if (tag.key == "tourism" && CheckForValues(tag.value)) + { + matched = MatchWithBooking(e); + break; + } + } + + // TODO: Need to write file with dropped osm features. 
+ + return matched; +} + +void BookingDataset::BuildFeatures(function const & fn) const +{ + for (auto const & hotel : m_hotels) + { + OsmElement e; + e.type = OsmElement::EntityType::Node; + e.id = 1; + + e.lon = hotel.lon; + e.lat = hotel.lat; + + e.AddTag("name", hotel.name); + e.AddTag("ref:sponsored", strings::to_string(hotel.id)); + e.AddTag("website", hotel.descUrl); + e.AddTag("rating:sponsored", strings::to_string(hotel.ratingUser)); + e.AddTag("stars", strings::to_string(hotel.stars)); + e.AddTag("price_rate", strings::to_string(hotel.priceCategory)); + e.AddTag("addr:full", hotel.address); + + switch (hotel.type) + { + case 19: + case 205: e.AddTag("tourism", "motel"); break; + + case 21: + case 206: + case 212: e.AddTag("tourism", "resort"); break; + + case 3: + case 23: + case 24: + case 25: + case 202: + case 207: + case 208: + case 209: + case 210: + case 216: + case 220: + case 223: e.AddTag("tourism", "guest_house"); break; + + case 14: + case 204: + case 213: + case 218: + case 219: + case 226: + case 222: e.AddTag("tourism", "hotel"); break; + + case 211: + case 224: + case 228: e.AddTag("tourism", "chalet"); break; + + case 13: + case 225: + case 203: e.AddTag("tourism", "hostel"); break; + + case 215: + case 221: + case 227: + case 2: + case 201: e.AddTag("tourism", "apartment"); break; + + case 214: e.AddTag("tourism", "camp_site"); break; + + default: e.AddTag("tourism", "hotel"); break; + } + + fn(&e); + } +} diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp new file mode 100644 index 0000000000..55ffdd575a --- /dev/null +++ b/generator/booking_dataset.hpp @@ -0,0 +1,70 @@ +#pragma once + +#include "generator/osm_element.hpp" + +#include "boost/geometry.hpp" +#include "boost/geometry/geometries/point.hpp" +#include "boost/geometry/geometries/box.hpp" +#include "boost/geometry/index/rtree.hpp" + +#include "std/function.hpp" +#include "std/string.hpp" + +class BookingDataset +{ +public: + struct BookingHotel + { + enum 
class Fields : size_t + { + Id = 0, + Latitude = 1, + Longtitude = 2, + Name = 3, + Address = 4, + Stars = 5, + PriceCategory = 6, + RatingBooking = 7, + RatingUsers = 8, + DescUrl = 9, + Type = 10, + + Counter + }; + + uint32_t id = 0; + double lat = 0.0; + double lon = 0.0; + string name; + string address; + uint32_t stars = 0; + uint32_t priceCategory = 0; + double ratingBooking = 0.0; + double ratingUser = 0.0; + string descUrl; + uint32_t type = 0; + + constexpr size_t Index(Fields field) const { return static_cast(field); } + constexpr size_t FieldsCount() const { return static_cast(Fields::Counter); } + + BookingHotel(string const &src); + }; + + BookingDataset(string const & dataPath); + + bool Filter(OsmElement const & e) const; + void BuildFeatures(function const & fn) const; + +protected: + vector m_hotels; + + // create the rtree using default constructor + using TPoint = boost::geometry::model::point; + using TBox = boost::geometry::model::box; + using TValue = pair; + + boost::geometry::index::rtree> m_rtree; + + void LoadBookingHotels(string const & path); + bool MatchWithBooking(OsmElement const & e) const; +}; diff --git a/generator/generate_info.hpp b/generator/generate_info.hpp index 0713d1bf64..6acca8fa80 100644 --- a/generator/generate_info.hpp +++ b/generator/generate_info.hpp @@ -41,6 +41,8 @@ struct GenerateInfo NodeStorageType m_nodeStorageType; OsmSourceType m_osmFileType; string m_osmFileName; + + string m_bookingDatafileName; uint32_t m_versionDate = 0; diff --git a/generator/generator.pro b/generator/generator.pro index 42725c1953..083c7d3bf8 100644 --- a/generator/generator.pro +++ b/generator/generator.pro @@ -14,6 +14,7 @@ INCLUDEPATH *= $$ROOT_DIR/3party/gflags/src \ QT *= core SOURCES += \ + booking_dataset.cpp \ borders_generator.cpp \ borders_loader.cpp \ check_model.cpp \ @@ -37,6 +38,7 @@ SOURCES += \ unpack_mwm.cpp \ HEADERS += \ + booking_dataset.hpp \ borders_generator.hpp \ borders_loader.hpp \ check_model.hpp \ diff --git 
a/generator/generator_tool/generator_tool.cpp b/generator/generator_tool/generator_tool.cpp index cea7fc874d..9c884917a4 100644 --- a/generator/generator_tool/generator_tool.cpp +++ b/generator/generator_tool/generator_tool.cpp @@ -67,6 +67,7 @@ DEFINE_bool(make_cross_section, false, "Make corss section in routing file for c DEFINE_string(osm_file_name, "", "Input osm area file"); DEFINE_string(osm_file_type, "xml", "Input osm area file type [xml, o5m]"); DEFINE_string(user_resource_path, "", "User defined resource path for classificator.txt and etc."); +DEFINE_string(booking_data, "", "Path to booking data in .tsv format"); DEFINE_uint64(planet_version, my::SecondsSinceEpoch(), "Version as seconds since epoch, by default - now"); int main(int argc, char ** argv) @@ -100,6 +101,7 @@ int main(int argc, char ** argv) genInfo.m_osmFileName = FLAGS_osm_file_name; genInfo.m_failOnCoasts = FLAGS_fail_on_coasts; genInfo.m_preloadCache = FLAGS_preload_cache; + genInfo.m_bookingDatafileName = FLAGS_booking_data; genInfo.m_versionDate = static_cast(FLAGS_planet_version); diff --git a/generator/osm_element.cpp b/generator/osm_element.cpp index cf22bf41b2..1c65dd599d 100644 --- a/generator/osm_element.cpp +++ b/generator/osm_element.cpp @@ -1,5 +1,6 @@ #include "generator/osm_element.hpp" +#include "base/string_utils.hpp" #include "coding/parse_xml.hpp" #include "std/cstdio.hpp" @@ -63,7 +64,9 @@ void OsmElement::AddTag(string const & k, string const & v) SKIP_KEY("official_name"); #undef SKIP_KEY - m_tags.emplace_back(k, v); + string value = v; + strings::Trim(value); + m_tags.emplace_back(k, value); } string OsmElement::ToString(string const & shift) const diff --git a/generator/osm_source.cpp b/generator/osm_source.cpp index 8d564d5aae..d454abefab 100644 --- a/generator/osm_source.cpp +++ b/generator/osm_source.cpp @@ -1,3 +1,4 @@ +#include "generator/booking_dataset.hpp" #include "generator/coastlines_generator.hpp" #include "generator/feature_generator.hpp" #include 
"generator/intermediate_data.hpp" @@ -511,12 +512,19 @@ bool GenerateFeaturesImpl(feature::GenerateInfo & info) TagAdmixer tagAdmixer(info.GetIntermediateFileName("ways", ".csv"), info.GetIntermediateFileName("towns", ".csv")); TagReplacer tagReplacer(GetPlatform().ResourcesDir() + REPLACED_TAGS_FILE); + + // If info.m_bookingDatafileName is empty then no data will be loaded. + BookingDataset bookingDataset(info.m_bookingDatafileName); // Here we can add new tags to element!!! auto const fn = [&](OsmElement * e) { tagReplacer(e); tagAdmixer(e); + + if (bookingDataset.Filter(*e)) + return; + parser.EmitElement(e); }; @@ -533,6 +541,12 @@ bool GenerateFeaturesImpl(feature::GenerateInfo & info) LOG(LINFO, ("Processing", info.m_osmFileName, "done.")); + if (!info.m_bookingDatafileName.empty()) + { + bookingDataset.BuildFeatures([&](OsmElement * e) { parser.EmitElement(e); }); + LOG(LINFO, ("Processing booking data from", info.m_bookingDatafileName, "done.")); + } + parser.Finish(); // Stop if coasts are not merged and FLAG_fail_on_coasts is set diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index 6734d33625..87e529e0d0 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -1,4 +1,5 @@ #pragma once +#include "base/stl_add.hpp" #include "base/string_utils.hpp" #include "std/algorithm.hpp" diff --git a/xcode/generator/generator.xcodeproj/project.pbxproj b/xcode/generator/generator.xcodeproj/project.pbxproj index 9c5e8ae064..73a6567f2e 100644 --- a/xcode/generator/generator.xcodeproj/project.pbxproj +++ b/xcode/generator/generator.xcodeproj/project.pbxproj @@ -57,6 +57,8 @@ 677E2A161CAACC5F001DC42A /* tag_admixer.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 677E2A111CAACC5F001DC42A /* tag_admixer.hpp */; }; 677E2A171CAACC5F001DC42A /* towns_dumper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 677E2A121CAACC5F001DC42A /* towns_dumper.cpp */; }; 677E2A181CAACC5F001DC42A /* towns_dumper.hpp in Headers 
*/ = {isa = PBXBuildFile; fileRef = 677E2A131CAACC5F001DC42A /* towns_dumper.hpp */; }; + 67A0FEBE1CEB467F008F2A61 /* booking_dataset.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67A0FEBC1CEB467F008F2A61 /* booking_dataset.cpp */; }; + 67A0FEBF1CEB467F008F2A61 /* booking_dataset.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67A0FEBD1CEB467F008F2A61 /* booking_dataset.hpp */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -113,6 +115,8 @@ 677E2A111CAACC5F001DC42A /* tag_admixer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = tag_admixer.hpp; sourceTree = ""; }; 677E2A121CAACC5F001DC42A /* towns_dumper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = towns_dumper.cpp; sourceTree = ""; }; 677E2A131CAACC5F001DC42A /* towns_dumper.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = towns_dumper.hpp; sourceTree = ""; }; + 67A0FEBC1CEB467F008F2A61 /* booking_dataset.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = booking_dataset.cpp; sourceTree = ""; }; + 67A0FEBD1CEB467F008F2A61 /* booking_dataset.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = booking_dataset.hpp; sourceTree = ""; }; 67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = osm_xml_source.hpp; sourceTree = ""; }; /* End PBXFileReference section */ @@ -199,6 +203,8 @@ 670B84BB1A8CDB0000CE4492 /* osm_source.hpp */, 6764B8921ADD6A3300DD8B15 /* osm_o5m_source.hpp */, 67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */, + 67A0FEBC1CEB467F008F2A61 /* booking_dataset.cpp */, + 67A0FEBD1CEB467F008F2A61 /* booking_dataset.hpp */, ); name = generator; path = ../../generator; @@ -227,6 +233,7 @@ 675340741A3F2A7400A0A8C3 /* generate_info.hpp in Headers */, 
677E2A161CAACC5F001DC42A /* tag_admixer.hpp in Headers */, 675340861A3F2A7400A0A8C3 /* tesselator.hpp in Headers */, + 67A0FEBF1CEB467F008F2A61 /* booking_dataset.hpp in Headers */, 6753405F1A3F2A7400A0A8C3 /* borders_loader.hpp in Headers */, 675340801A3F2A7400A0A8C3 /* polygonizer.hpp in Headers */, 675340941C5231BA002CF0D9 /* search_index_builder.hpp in Headers */, @@ -309,6 +316,7 @@ 675340811A3F2A7400A0A8C3 /* routing_generator.cpp in Sources */, 675340931C5231BA002CF0D9 /* search_index_builder.cpp in Sources */, 6753406E1A3F2A7400A0A8C3 /* feature_merger.cpp in Sources */, + 67A0FEBE1CEB467F008F2A61 /* booking_dataset.cpp in Sources */, 6753408D1A3F2A7400A0A8C3 /* osm_element.cpp in Sources */, 6726C1D51A4AFEF4005EEA39 /* osm2meta.cpp in Sources */, 6753405E1A3F2A7400A0A8C3 /* borders_loader.cpp in Sources */, From d2bcd9e16f6c386bec3175f7a48cc968a48ecdc4 Mon Sep 17 00:00:00 2001 From: Sergey Yershov Date: Wed, 25 May 2016 15:23:47 +0300 Subject: [PATCH 3/9] Review fixes --- base/string_utils.cpp | 14 ++++++ base/string_utils.hpp | 2 + generator/booking_dataset.cpp | 82 +++++++++++++++------------------- generator/booking_dataset.hpp | 29 ++++++------ generator/osm_source.cpp | 2 +- tools/python/booking_hotels.py | 80 +++++++++++++++++---------------- 6 files changed, 111 insertions(+), 98 deletions(-) diff --git a/base/string_utils.cpp b/base/string_utils.cpp index da13ecb282..4456eb25c3 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -55,6 +55,20 @@ bool to_int(char const * s, int & i, int base /*= 10*/) return false; } +bool to_uint(char const * s, unsigned int & i, int base /*= 10*/) +{ + char * stop; + long const x = strtoul(s, &stop, base); + if (*stop == 0) + { + i = static_cast(x); + ASSERT_EQUAL(static_cast(i), x, ()); + return true; + } + return false; +} + + bool to_uint64(char const * s, uint64_t & i) { char * stop; diff --git a/base/string_utils.hpp b/base/string_utils.hpp index f7c5b775ff..ca5449ae2b 100644 --- 
a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -209,6 +209,7 @@ template bool IsInArray(T (&arr) [N], TT const & t /// @name From string to numeric. //@{ bool to_int(char const * s, int & i, int base = 10); +bool to_uint(char const * s, unsigned int & i, int base = 10); bool to_uint64(char const * s, uint64_t & i); bool to_int64(char const * s, int64_t & i); bool to_double(char const * s, double & d); @@ -216,6 +217,7 @@ bool to_double(char const * s, double & d); inline bool is_number(string const & s) { int64_t dummy; return to_int64(s.c_str(), dummy); } inline bool to_int(string const & s, int & i, int base = 10) { return to_int(s.c_str(), i, base); } +inline bool to_uint(string const & s, unsigned int & i, int base = 10) { return to_uint(s.c_str(), i, base); } inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); } inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); } inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); } diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 4a488cda86..1d04c40e7d 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -1,81 +1,69 @@ #include "generator/booking_dataset.hpp" -#include "base/string_utils.hpp" - -#include "geometry/distance_on_sphere.hpp" - #include "indexer/search_delimiters.hpp" #include "indexer/search_string_utils.hpp" +#include "geometry/distance_on_sphere.hpp" + +#include "base/string_utils.hpp" + #include "std/fstream.hpp" #include "std/iostream.hpp" #include "std/sstream.hpp" -BookingDataset::BookingHotel::BookingHotel(string const & src) +namespace generator +{ +BookingDataset::Hotel::Hotel(string const & src) { - stringstream ss(src); - string elem; vector rec(FieldsCount()); - for (size_t i = 0; getline(ss, elem, '\t') && i < rec.size(); ++i) - rec[i] = elem; + strings::SimpleTokenizer token(src, "\t"); + for (size_t i = 0; token && i < 
rec.size(); ++i, ++token) + rec[i] = *token; - id = static_cast(strtoul(rec[Index(Fields::Id)].c_str(), nullptr, 10)); - - lat = strtod(rec[Index(Fields::Latitude)].c_str(), nullptr); - lon = strtod(rec[Index(Fields::Longtitude)].c_str(), nullptr); + strings::to_uint(rec[Index(Fields::Id)], id); + strings::to_double(rec[Index(Fields::Latitude)], lat); + strings::to_double(rec[Index(Fields::Longtitude)], lon); + name = rec[Index(Fields::Name)]; address = rec[Index(Fields::Address)]; - stars = rec[Index(Fields::Stars)].empty() - ? 0 - : static_cast(strtoul(rec[Index(Fields::Stars)].c_str(), nullptr, 10)); - - priceCategory = - rec[Index(Fields::PriceCategory)].empty() - ? 0 - : static_cast(strtoul(rec[Index(Fields::PriceCategory)].c_str(), nullptr, 10)); - - ratingBooking = rec[Index(Fields::RatingBooking)].empty() - ? 0 - : strtod(rec[Index(Fields::RatingBooking)].c_str(), nullptr); - - ratingUser = rec[Index(Fields::RatingUsers)].empty() - ? 0 - : strtod(rec[Index(Fields::RatingUsers)].c_str(), nullptr); + strings::to_uint(rec[Index(Fields::Stars)], stars); + strings::to_uint(rec[Index(Fields::PriceCategory)], priceCategory); + strings::to_double(rec[Index(Fields::RatingBooking)], ratingBooking); + strings::to_double(rec[Index(Fields::RatingUsers)], ratingUser); descUrl = rec[Index(Fields::DescUrl)]; - type = rec[Index(Fields::Type)].empty() - ? 
0 - : static_cast(strtoul(rec[Index(Fields::Type)].c_str(), nullptr, 10)); + strings::to_uint(rec[Index(Fields::Type)], type); } -ostream & operator<<(ostream & s, BookingDataset::BookingHotel const & h) +ostream & operator<<(ostream & s, BookingDataset::Hotel const & h) { - return s << "Name: " << h.name << " lon: " << h.lon << " lat: " << h.lat; + return s << "Name: " << h.name << " lat: " << h.lat << " lon: " << h.lon; } -void BookingDataset::LoadBookingHotels(string const & path) +void BookingDataset::LoadHotels(string const & path) { m_hotels.clear(); - - if(path.empty()) + + if (path.empty()) return; - + ifstream src(path); - for (string elem; getline(src, elem);) - m_hotels.emplace_back(elem); + for (string line; getline(src, line);) + m_hotels.emplace_back(line); } BookingDataset::BookingDataset(string const & dataPath) { - LoadBookingHotels(dataPath); + LoadHotels(dataPath); size_t counter = 0; for (auto const & hotel : m_hotels) { - TBox b(TPoint(hotel.lon, hotel.lat), TPoint(hotel.lon, hotel.lat)); - m_rtree.insert(std::make_pair(b, counter++)); + TBox b(TPoint(hotel.lat, hotel.lon), TPoint(hotel.lat, hotel.lon)); + m_rtree.insert(std::make_pair(b, counter)); + ++counter; } } @@ -108,11 +96,11 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const // Find 3 nearest values to a point. 
vector result; for_each(boost::geometry::index::qbegin(m_rtree, - boost::geometry::index::nearest(TPoint(e.lon, e.lat), 3)), + boost::geometry::index::nearest(TPoint(e.lat, e.lon), 3)), boost::geometry::index::qend(m_rtree), [&](TValue const & v) { auto const & hotel = m_hotels[v.second]; - double dist = ms::DistanceOnEarth(e.lon, e.lat, hotel.lon, hotel.lat); + double dist = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); if (dist > 150 /* max distance in meters */) return; @@ -243,10 +231,12 @@ void BookingDataset::BuildFeatures(function const & fn) cons case 201: e.AddTag("tourism", "apartment"); break; case 214: e.AddTag("tourism", "camp_site"); break; - + default: e.AddTag("tourism", "hotel"); break; } fn(&e); } } + +} // namespace generator diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index 55ffdd575a..cafe7afa72 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -10,12 +10,14 @@ #include "std/function.hpp" #include "std/string.hpp" +namespace generator +{ class BookingDataset { public: - struct BookingHotel + struct Hotel { - enum class Fields : size_t + enum class Fields { Id = 0, Latitude = 1, @@ -28,10 +30,10 @@ public: RatingUsers = 8, DescUrl = 9, Type = 10, - + Counter }; - + uint32_t id = 0; double lat = 0.0; double lon = 0.0; @@ -43,20 +45,19 @@ public: double ratingUser = 0.0; string descUrl; uint32_t type = 0; - + constexpr size_t Index(Fields field) const { return static_cast(field); } constexpr size_t FieldsCount() const { return static_cast(Fields::Counter); } - - BookingHotel(string const &src); + explicit Hotel(string const & src); }; - BookingDataset(string const & dataPath); - + explicit BookingDataset(string const & dataPath); + bool Filter(OsmElement const & e) const; void BuildFeatures(function const & fn) const; - + protected: - vector m_hotels; + vector m_hotels; // create the rtree using default constructor using TPoint = boost::geometry::model::point; @@ -64,7 +65,9 
@@ protected: using TValue = pair; boost::geometry::index::rtree> m_rtree; - - void LoadBookingHotels(string const & path); + + void LoadHotels(string const & path); bool MatchWithBooking(OsmElement const & e) const; }; + +} // namespace generator diff --git a/generator/osm_source.cpp b/generator/osm_source.cpp index d454abefab..0bc4643abd 100644 --- a/generator/osm_source.cpp +++ b/generator/osm_source.cpp @@ -514,7 +514,7 @@ bool GenerateFeaturesImpl(feature::GenerateInfo & info) TagReplacer tagReplacer(GetPlatform().ResourcesDir() + REPLACED_TAGS_FILE); // If info.m_bookingDatafileName is empty then no data will be loaded. - BookingDataset bookingDataset(info.m_bookingDatafileName); + generator::BookingDataset bookingDataset(info.m_bookingDatafileName); // Here we can add new tags to element!!! auto const fn = [&](OsmElement * e) diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py index 24e9adbdc0..5562e4f4dd 100755 --- a/tools/python/booking_hotels.py +++ b/tools/python/booking_hotels.py @@ -2,16 +2,16 @@ # coding: utf8 from __future__ import print_function -import json -import urllib2 -import base64 -from datetime import datetime -import time -import logging -import pickle -import os -import argparse from collections import namedtuple, defaultdict +from datetime import datetime +import argparse +import base64 +import json +import logging +import os +import pickle +import time +import urllib2 # init logging logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s') @@ -60,6 +60,7 @@ class BookingApi: request = urllib2.Request(url, None, self.baseConfig["headers"]) stream = urllib2.urlopen(request) payload = stream.read() + print(payload) return json.loads(payload) except Exception as e: @@ -69,41 +70,40 @@ class BookingApi: def make_record(src, rate): return Hotel( - int(src['hotel_id']), - float(src['location']['latitude']), - float(src['location']['longitude']), - src['name'], - src['address'], - 
int(src['class']), - rate, - src['ranking'], - src['review_score'], - src['url'] + unicode(src['hotel_id']), + unicode(src['location']['latitude']), + unicode(src['location']['longitude']), + unicode(src['name']), + unicode(src['address']), + unicode(src['class']), + unicode(rate), + unicode(src['ranking']), + unicode(src['review_score']), + unicode(src['url']) ) def download(user, password, path): + ''' + Download all hotels from booking.com and store then in them set of .pkl files. + ''' api = BookingApi(user, password) maxrows = 1000 countries = api.call("getCountries", dict(languagecodes='en')) for country in countries: countrycode = country['countrycode'] - logging.info(u'{0} {1}'.format(countrycode, country['name'])) + logging.info(u'Download[{0}]: {1}'.format(countrycode, country['name'])) - counter = 0 allhotels = [] while True: hotels = api.call('getHotels', - dict(new_hotel_type=1, offset=counter, rows=maxrows, countrycodes=countrycode)) + dict(new_hotel_type=1, offset=len(allhotels), rows=maxrows, countrycodes=countrycode)) if isinstance(hotels, dict) and 'ruid' in hotels: - logging.error('{0} Code: {1}'.format(hotels['message'], hotels['code'])) + logging.error('Api call failed with error: {0} Code: {1}'.format(hotels['message'], hotels['code'])) exit(1) - for hotel in hotels: - allhotels.append(hotel) - - counter += len(hotels) + allhotels.append(hotels) if len(hotels) < maxrows: break @@ -116,13 +116,12 @@ def download(user, password, path): def translate(source, output): - files = [] + ''' + Read *.pkl files and produce a single list of hotels as tab separated values. 
+ ''' + files = [filename for filename in os.listdir(source) if filename.endswith('.pkl')] + data = [] - - for filename in os.listdir(source): - if filename.endswith(".pkl"): - files.append(filename) - for filename in files: logging.info('Processing {0}'.format(filename)) with open(filename, 'rb') as fd: @@ -131,12 +130,15 @@ def translate(source, output): # Dict of dicts city_id -> { currency -> [prices] } cities = defaultdict(lambda: defaultdict(list)) + def valid(hotel): + return 'city_id' in hotel and 'currencycode' in hotel and 'minrate' in hotel and hotel['minrate'] is not None + # Collect prices for hotel in data: - if 'city_id' in hotel and 'currencycode' in hotel and 'minrate' in hotel and hotel['minrate'] is not None: + if valid(hotel): cities[hotel['city_id']][hotel['currencycode']].append(float(hotel['minrate'])) - # Find median prices + # Replaces list of prices by a median price. for city in cities: for cur in cities[city]: cities[city][cur] = sorted(cities[city][cur])[len(cities[city][cur]) / 2] @@ -147,14 +149,15 @@ def translate(source, output): with open(output, 'w') as fd: for hotel in data: rate = 0 - if 'city_id' in hotel and 'currencycode' in hotel and 'minrate' in hotel and hotel['minrate'] is not None: + if valid(hotel): avg = cities[hotel['city_id']][hotel['currencycode']] price = float(hotel['minrate']) rate = 1 + # Find a range that contains the price while rate <= len(rates) and price > avg * rates[rate - 1]: rate += 1 cur = make_record(hotel, rate) - l = [(str(e) if e else '') if not isinstance(e, unicode) else e.encode('utf8') for e in cur] + l = [e.encode('utf8') for e in cur] print('\t'.join(l), file=fd) @@ -166,7 +169,7 @@ def process_options(): parser.add_argument("--password", dest="password", help="Booking.com account password") parser.add_argument("--user", dest="user", help="Booking.com account user name") - parser.add_argument("--path", dest="path", help="path to data files") + parser.add_argument("--path", dest="path", 
help="Path to data files") parser.add_argument("--output", dest="output", help="Name and destination for output file") parser.add_argument("--download", action="store_true", dest="download", default=False) @@ -179,6 +182,7 @@ def process_options(): if options.translate and not options.output: print("--output isn't set") + parser.print_help() exit() return options From f267357034d0aa81db61e2b08c466675492a0775 Mon Sep 17 00:00:00 2001 From: Sergey Yershov Date: Wed, 25 May 2016 12:50:03 +0300 Subject: [PATCH 4/9] [booking] Refactor filter methods --- generator/booking_dataset.cpp | 14 ++++++++++++-- generator/booking_dataset.hpp | 4 +++- generator/osm_source.cpp | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 1d04c40e7d..8727ebf188 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -144,7 +144,17 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const return matched; } -bool BookingDataset::Filter(OsmElement const & e) const +bool BookingDataset::BookingFilter(OsmElement const & e) const +{ + return Filter(e, [&](OsmElement const & e){ return MatchWithBooking(e); }); +} + +bool BookingDataset::TourismFilter(OsmElement const & e) const +{ + return Filter(e, [&](OsmElement const & e){ return true; }); +} + +bool BookingDataset::Filter(OsmElement const & e, function const & fn) const { if (e.type != OsmElement::EntityType::Node) return false; @@ -157,7 +167,7 @@ bool BookingDataset::Filter(OsmElement const & e) const { if (tag.key == "tourism" && CheckForValues(tag.value)) { - matched = MatchWithBooking(e); + matched = fn(e); break; } } diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index cafe7afa72..7628e3aa01 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -53,7 +53,8 @@ public: explicit BookingDataset(string const & dataPath); - bool Filter(OsmElement const & e) const; + 
bool BookingFilter(OsmElement const & e) const; + bool TourismFilter(OsmElement const & e) const; void BuildFeatures(function const & fn) const; protected: @@ -67,6 +68,7 @@ protected: boost::geometry::index::rtree> m_rtree; void LoadHotels(string const & path); + bool Filter(OsmElement const & e, function const & fn) const; bool MatchWithBooking(OsmElement const & e) const; }; diff --git a/generator/osm_source.cpp b/generator/osm_source.cpp index 0bc4643abd..8280a7d8b1 100644 --- a/generator/osm_source.cpp +++ b/generator/osm_source.cpp @@ -522,7 +522,7 @@ bool GenerateFeaturesImpl(feature::GenerateInfo & info) tagReplacer(e); tagAdmixer(e); - if (bookingDataset.Filter(*e)) + if (bookingDataset.BookingFilter(*e)) return; parser.EmitElement(e); From 1c92e00abd23f6e7cab2d36c264a7cb0470d2512 Mon Sep 17 00:00:00 2001 From: Sergey Yershov Date: Thu, 26 May 2016 18:16:23 +0300 Subject: [PATCH 5/9] [booking] Extract method GetNearesHotels --- generator/booking_dataset.cpp | 47 ++++++++++++++++++++++------------ generator/booking_dataset.hpp | 5 ++++ tools/python/booking_hotels.py | 1 - 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 8727ebf188..56ba60a0bf 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -39,7 +39,7 @@ BookingDataset::Hotel::Hotel(string const & src) ostream & operator<<(ostream & s, BookingDataset::Hotel const & h) { - return s << "Name: " << h.name << " lat: " << h.lat << " lon: " << h.lon; + return s << "Name: " << h.name << "\t Address: " << h.address << "\t lat: " << h.lat << " lon: " << h.lon; } void BookingDataset::LoadHotels(string const & path) @@ -94,20 +94,8 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const return false; // Find 3 nearest values to a point. 
- vector result; - for_each(boost::geometry::index::qbegin(m_rtree, - boost::geometry::index::nearest(TPoint(e.lat, e.lon), 3)), - boost::geometry::index::qend(m_rtree), [&](TValue const & v) - { - auto const & hotel = m_hotels[v.second]; - double dist = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - if (dist > 150 /* max distance in meters */) - return; - - result.emplace_back(v); - }); - - if (result.empty()) + auto const indexes = GetNearestHotels(e.lat, e.lon, 3, 150 /* max distance in meters */); + if (indexes.empty()) return false; // Match name. @@ -117,10 +105,10 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const // cout << "\n------------- " << name << endl; bool matched = false; - for (auto const & e : result) + for (auto const & index : indexes) { vector bookingTokens; - NormalizeAndTokenizeString(m_hotels[e.second].name, bookingTokens, search::Delimiters()); + NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters()); map>> weightPair; @@ -144,6 +132,31 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const return matched; } +BookingDataset::Hotel const & BookingDataset::GetHotel(size_t index) const +{ + ASSERT_GREATER(m_hotels.size(), index, ()); + return m_hotels[index]; +} + +vector BookingDataset::GetNearestHotels(double lat, double lon, size_t limit, + double maxDistance /* = 0.0 */) const +{ + namespace bgi = boost::geometry::index; + + vector indexes; + for_each(bgi::qbegin(m_rtree, bgi::nearest(TPoint(lat, lon), limit)), bgi::qend(m_rtree), + [&](TValue const & v) + { + auto const & hotel = m_hotels[v.second]; + double const dist = ms::DistanceOnEarth(lat, lon, hotel.lat, hotel.lon); + if (maxDistance != 0.0 && dist > maxDistance /* max distance in meters */) + return; + + indexes.emplace_back(v.second); + }); + return indexes; +} + bool BookingDataset::BookingFilter(OsmElement const & e) const { return Filter(e, [&](OsmElement const & e){ return MatchWithBooking(e); }); 
diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index 7628e3aa01..79f5754265 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -57,6 +57,9 @@ public: bool TourismFilter(OsmElement const & e) const; void BuildFeatures(function const & fn) const; + Hotel const & GetHotel(size_t index) const; + vector GetNearestHotels(double lat, double lon, size_t limit, double maxDistance = 0.0) const; + protected: vector m_hotels; @@ -72,4 +75,6 @@ protected: bool MatchWithBooking(OsmElement const & e) const; }; +ostream & operator<<(ostream & s, BookingDataset::Hotel const & h); + } // namespace generator diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py index 5562e4f4dd..09ba86e439 100755 --- a/tools/python/booking_hotels.py +++ b/tools/python/booking_hotels.py @@ -21,7 +21,6 @@ Hotel = namedtuple('Hotel', 'stars', 'priceCategory', 'ratingBooking', 'ratingUser', 'descUrl']) - class BookingApi: def __init__(self, login, password): self.login = login From 01d9b2bb1ba7a6c6e318e301820e92a6b7791561 Mon Sep 17 00:00:00 2001 From: Sergey Yershov Date: Thu, 26 May 2016 19:11:11 +0300 Subject: [PATCH 6/9] [generator] Review fixes in c++ part --- base/string_utils.cpp | 27 +-- generator/booking_dataset.cpp | 339 ++++++++++++++++++---------------- generator/booking_dataset.hpp | 7 +- 3 files changed, 196 insertions(+), 177 deletions(-) diff --git a/base/string_utils.cpp b/base/string_utils.cpp index 4456eb25c3..d91dc5bb0d 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -42,30 +42,31 @@ UniChar LastUniChar(string const & s) return *iter; } -bool to_int(char const * s, int & i, int base /*= 10*/) +namespace +{ +template +bool IntegerCheck(T x, char const *stop, ET & out) { - char * stop; - long const x = strtol(s, &stop, base); if (*stop == 0) { - i = static_cast(x); - ASSERT_EQUAL(static_cast(i), x, ()); + out = static_cast(x); + ASSERT_EQUAL(static_cast(out), x, ()); return true; } return 
false; } +} // namespace + +bool to_int(char const * s, int & i, int base /*= 10*/) +{ + char * stop; + return IntegerCheck(strtol(s, &stop, base), stop, i); +} bool to_uint(char const * s, unsigned int & i, int base /*= 10*/) { char * stop; - long const x = strtoul(s, &stop, base); - if (*stop == 0) - { - i = static_cast(x); - ASSERT_EQUAL(static_cast(i), x, ()); - return true; - } - return false; + return IntegerCheck(strtoul(s, &stop, base), stop, i); } diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 56ba60a0bf..4f0df671de 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -13,6 +13,21 @@ namespace generator { + +namespace +{ +bool CheckForValues(string const & value) +{ + for (char const * val : + {"hotel", "apartment", "camp_site", "chalet", "guest_house", "hostel", "motel", "resort"}) + { + if (value == val) + return true; + } + return false; +} +} // namespace + BookingDataset::Hotel::Hotel(string const & src) { vector rec(FieldsCount()); @@ -42,22 +57,10 @@ ostream & operator<<(ostream & s, BookingDataset::Hotel const & h) return s << "Name: " << h.name << "\t Address: " << h.address << "\t lat: " << h.lat << " lon: " << h.lon; } -void BookingDataset::LoadHotels(string const & path) -{ - m_hotels.clear(); - - if (path.empty()) - return; - - ifstream src(path); - for (string line; getline(src, line);) - m_hotels.emplace_back(line); -} - BookingDataset::BookingDataset(string const & dataPath) { LoadHotels(dataPath); - + size_t counter = 0; for (auto const & hotel : m_hotels) { @@ -67,15 +70,166 @@ BookingDataset::BookingDataset(string const & dataPath) } } -bool CheckForValues(string const & value) +bool BookingDataset::BookingFilter(OsmElement const & e) const +{ + return Filter(e, [&](OsmElement const & e){ return MatchWithBooking(e); }); +} + +bool BookingDataset::TourismFilter(OsmElement const & e) const +{ + return Filter(e, [&](OsmElement const & e){ return true; }); +} + 
+BookingDataset::Hotel const & BookingDataset::GetHotel(size_t index) const +{ + ASSERT_GREATER(m_hotels.size(), index, ()); + return m_hotels[index]; +} + +vector BookingDataset::GetNearestHotels(double lat, double lon, size_t limit, + double maxDistance /* = 0.0 */) const +{ + namespace bgi = boost::geometry::index; + + vector indexes; + for_each(bgi::qbegin(m_rtree, bgi::nearest(TPoint(lat, lon), limit)), bgi::qend(m_rtree), + [&](TValue const & v) + { + auto const & hotel = m_hotels[v.second]; + double const dist = ms::DistanceOnEarth(lat, lon, hotel.lat, hotel.lon); + if (maxDistance != 0.0 && dist > maxDistance /* max distance in meters */) + return; + + indexes.emplace_back(v.second); + }); + return indexes; +} + +bool BookingDataset::MatchByName(string const & osmName, vector const & bookingIndexes) const { - for (char const * val : - {"hotel", "apartment", "camp_site", "chalet", "guest_house", "hostel", "motel", "resort"}) - { - if (value == val) - return true; - } return false; + + // Match name. 
+// vector osmTokens; +// NormalizeAndTokenizeString(name, osmTokens, search::Delimiters()); +// +// cout << "\n------------- " << name << endl; +// +// bool matched = false; +// for (auto const & index : indexes) +// { +// vector bookingTokens; +// NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters()); +// +// map>> weightPair; +// +// for (size_t j = 0; j < osmTokens.size(); ++j) +// { +// for (size_t i = 0; i < bookingTokens.size(); ++i) +// { +// size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(), +// bookingTokens[i].begin(), bookingTokens[i].end()); +// if (distance < 3) +// weightPair[distance].emplace_back(i, j); +// } +// } +// +// if (!weightPair.empty()) +// { +// cout << m_hotels[e.second] << endl; +// matched = true; +// } +// } +} + +void BookingDataset::BuildFeatures(function const & fn) const +{ + for (auto const & hotel : m_hotels) + { + OsmElement e; + e.type = OsmElement::EntityType::Node; + e.id = 1; + + e.lat = hotel.lat; + e.lon = hotel.lon; + + e.AddTag("name", hotel.name); + e.AddTag("ref:sponsored", strings::to_string(hotel.id)); + e.AddTag("website", hotel.descUrl); + e.AddTag("rating:sponsored", strings::to_string(hotel.ratingUser)); + e.AddTag("stars", strings::to_string(hotel.stars)); + e.AddTag("price_rate", strings::to_string(hotel.priceCategory)); + e.AddTag("addr:full", hotel.address); + + switch (hotel.type) + { + case 19: + case 205: e.AddTag("tourism", "motel"); break; + + case 21: + case 206: + case 212: e.AddTag("tourism", "resort"); break; + + case 3: + case 23: + case 24: + case 25: + case 202: + case 207: + case 208: + case 209: + case 210: + case 216: + case 220: + case 223: e.AddTag("tourism", "guest_house"); break; + + case 14: + case 204: + case 213: + case 218: + case 219: + case 226: + case 222: e.AddTag("tourism", "hotel"); break; + + case 211: + case 224: + case 228: e.AddTag("tourism", "chalet"); break; + + case 13: + case 225: + case 203: 
e.AddTag("tourism", "hostel"); break; + + case 215: + case 221: + case 227: + case 2: + case 201: e.AddTag("tourism", "apartment"); break; + + case 214: e.AddTag("tourism", "camp_site"); break; + + default: e.AddTag("tourism", "hotel"); break; + } + + fn(&e); + } +} + +void BookingDataset::LoadHotels(string const & path) +{ + m_hotels.clear(); + + if (path.empty()) + return; + + ifstream src(path); + if (!src.is_open()) + { + LOG(LERROR, ("Error while opening", path, ":", strerror(errno))); + return; + } + + for (string line; getline(src, line);) + m_hotels.emplace_back(line); } bool BookingDataset::MatchWithBooking(OsmElement const & e) const @@ -89,84 +243,19 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const break; } } - + if (name.empty()) return false; - + // Find 3 nearest values to a point. auto const indexes = GetNearestHotels(e.lat, e.lon, 3, 150 /* max distance in meters */); if (indexes.empty()) return false; - - // Match name. - vector osmTokens; - NormalizeAndTokenizeString(name, osmTokens, search::Delimiters()); - - // cout << "\n------------- " << name << endl; - - bool matched = false; - for (auto const & index : indexes) - { - vector bookingTokens; - NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters()); - - map>> weightPair; - - for (size_t j = 0; j < osmTokens.size(); ++j) - { - for (size_t i = 0; i < bookingTokens.size(); ++i) - { - size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(), - bookingTokens[i].begin(), bookingTokens[i].end()); - if (distance < 3) - weightPair[distance].emplace_back(i, j); - } - } - - if (!weightPair.empty()) - { - // cout << m_hotels[e.second] << endl; - matched = true; - } - } + + bool matched = MatchByName(name, indexes); return matched; } -BookingDataset::Hotel const & BookingDataset::GetHotel(size_t index) const -{ - ASSERT_GREATER(m_hotels.size(), index, ()); - return m_hotels[index]; -} - -vector BookingDataset::GetNearestHotels(double 
lat, double lon, size_t limit, - double maxDistance /* = 0.0 */) const -{ - namespace bgi = boost::geometry::index; - - vector indexes; - for_each(bgi::qbegin(m_rtree, bgi::nearest(TPoint(lat, lon), limit)), bgi::qend(m_rtree), - [&](TValue const & v) - { - auto const & hotel = m_hotels[v.second]; - double const dist = ms::DistanceOnEarth(lat, lon, hotel.lat, hotel.lon); - if (maxDistance != 0.0 && dist > maxDistance /* max distance in meters */) - return; - - indexes.emplace_back(v.second); - }); - return indexes; -} - -bool BookingDataset::BookingFilter(OsmElement const & e) const -{ - return Filter(e, [&](OsmElement const & e){ return MatchWithBooking(e); }); -} - -bool BookingDataset::TourismFilter(OsmElement const & e) const -{ - return Filter(e, [&](OsmElement const & e){ return true; }); -} - bool BookingDataset::Filter(OsmElement const & e, function const & fn) const { if (e.type != OsmElement::EntityType::Node) @@ -190,76 +279,4 @@ bool BookingDataset::Filter(OsmElement const & e, function const & fn) const -{ - for (auto const & hotel : m_hotels) - { - OsmElement e; - e.type = OsmElement::EntityType::Node; - e.id = 1; - - e.lon = hotel.lon; - e.lat = hotel.lat; - - e.AddTag("name", hotel.name); - e.AddTag("ref:sponsored", strings::to_string(hotel.id)); - e.AddTag("website", hotel.descUrl); - e.AddTag("rating:sponsored", strings::to_string(hotel.ratingUser)); - e.AddTag("stars", strings::to_string(hotel.stars)); - e.AddTag("price_rate", strings::to_string(hotel.priceCategory)); - e.AddTag("addr:full", hotel.address); - - switch (hotel.type) - { - case 19: - case 205: e.AddTag("tourism", "motel"); break; - - case 21: - case 206: - case 212: e.AddTag("tourism", "resort"); break; - - case 3: - case 23: - case 24: - case 25: - case 202: - case 207: - case 208: - case 209: - case 210: - case 216: - case 220: - case 223: e.AddTag("tourism", "guest_house"); break; - - case 14: - case 204: - case 213: - case 218: - case 219: - case 226: - case 222: 
e.AddTag("tourism", "hotel"); break; - - case 211: - case 224: - case 228: e.AddTag("tourism", "chalet"); break; - - case 13: - case 225: - case 203: e.AddTag("tourism", "hostel"); break; - - case 215: - case 221: - case 227: - case 2: - case 201: e.AddTag("tourism", "apartment"); break; - - case 214: e.AddTag("tourism", "camp_site"); break; - - default: e.AddTag("tourism", "hotel"); break; - } - - fn(&e); - } -} - } // namespace generator diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index 79f5754265..d0810e0590 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -55,11 +55,12 @@ public: bool BookingFilter(OsmElement const & e) const; bool TourismFilter(OsmElement const & e) const; - void BuildFeatures(function const & fn) const; Hotel const & GetHotel(size_t index) const; vector GetNearestHotels(double lat, double lon, size_t limit, double maxDistance = 0.0) const; - + bool MatchByName(string const & osmName, vector const & bookingIndexes) const; + + void BuildFeatures(function const & fn) const; protected: vector m_hotels; @@ -71,8 +72,8 @@ protected: boost::geometry::index::rtree> m_rtree; void LoadHotels(string const & path); - bool Filter(OsmElement const & e, function const & fn) const; bool MatchWithBooking(OsmElement const & e) const; + bool Filter(OsmElement const & e, function const & fn) const; }; ostream & operator<<(ostream & s, BookingDataset::Hotel const & h); From de7e82024bcf31df4a686e6476d8281be293d62f Mon Sep 17 00:00:00 2001 From: Sergey Yershov Date: Thu, 26 May 2016 19:16:25 +0300 Subject: [PATCH 7/9] [generator] Review fixes in python part --- tools/python/booking_hotels.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py index 09ba86e439..cf3d3cbbdf 100755 --- a/tools/python/booking_hotels.py +++ b/tools/python/booking_hotels.py @@ -42,7 +42,7 @@ class BookingApi: if self.requestPerMinute >= 
self.requestLimit: waittime = 60 - now.second - logging.warning("Limit for request per minute exceeded. Wait for: {0} sec.".format(waittime)) + logging.warning("Limit for request per minute exceeded. Waiting for: {0} sec.".format(waittime)) time.sleep(waittime) now = datetime.utcnow() @@ -84,7 +84,7 @@ def make_record(src, rate): def download(user, password, path): ''' - Download all hotels from booking.com and store then in them set of .pkl files. + Downloads all hotels from booking.com and stores them in a bunch of .pkl files. ''' api = BookingApi(user, password) @@ -107,7 +107,7 @@ def download(user, password, path): if len(hotels) < maxrows: break - logging.info('Total hotels: {0}'.format(len(allhotels))) + logging.info('Num of hotels: {0}'.format(len(allhotels))) filename = os.path.join(path, '{0} - {1}.pkl'.format(country['area'].encode('utf8'), country['name'].encode('utf8'))) with open(filename, 'wb') as fd: @@ -116,7 +116,7 @@ def download(user, password, path): def translate(source, output): ''' - Read *.pkl files and produce a single list of hotels as tab separated values. + Reads *.pkl files and produces a single list of hotels as tab separated values. ''' files = [filename for filename in os.listdir(source) if filename.endswith('.pkl')] From 035fab80c5d7f3d225d2565070210dcf0b1e6620 Mon Sep 17 00:00:00 2001 From: Sergey Yershov Date: Fri, 27 May 2016 12:46:14 +0300 Subject: [PATCH 8/9] [base] Refactor to_int and to_uint. Implement tests. 
--- base/base_tests/string_utils_test.cpp | 39 ++++++++++++++++++++++++++- base/string_utils.cpp | 24 +++++++++-------- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp index f1e2dff2cc..684a33a527 100644 --- a/base/base_tests/string_utils_test.cpp +++ b/base/base_tests/string_utils_test.cpp @@ -176,6 +176,10 @@ UNIT_TEST(to_int) int i; string s; + s = "AF"; + TEST(strings::to_int(s, i, 16), ()); + TEST_EQUAL(175, i, ()); + s = "-2"; TEST(strings::to_int(s, i), ()); TEST_EQUAL(-2, i, ()); @@ -190,10 +194,43 @@ UNIT_TEST(to_int) s = "labuda"; TEST(!strings::to_int(s, i), ()); +} +UNIT_TEST(to_uint) +{ + unsigned int i; + string s; + + s = ""; + TEST(!strings::to_uint(s, i), ()); + + s = "-2"; + TEST(!strings::to_uint(s, i), ()); + + s = "0"; + TEST(strings::to_uint(s, i), ()); + TEST_EQUAL(0, i, ()); + + s = "123456789123456789123456789"; + TEST(!strings::to_uint(s, i), ()); + + s = "labuda"; + TEST(!strings::to_uint(s, i), ()); + s = "AF"; - TEST(strings::to_int(s, i, 16), ()); + TEST(strings::to_uint(s, i, 16), ()); TEST_EQUAL(175, i, ()); + + s = "100"; + TEST(strings::to_uint(s, i), ()); + TEST_EQUAL(100, i, ()); + + s = "4294967295"; + TEST(strings::to_uint(s, i), ()); + TEST_EQUAL(0xFFFFFFFF, i, ()); + + s = "4294967296"; + TEST(!strings::to_uint(s, i), ()); } UNIT_TEST(to_uint64) diff --git a/base/string_utils.cpp b/base/string_utils.cpp index d91dc5bb0d..22dfad3030 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -44,29 +44,31 @@ UniChar LastUniChar(string const & s) namespace { -template -bool IntegerCheck(T x, char const *stop, ET & out) +template +bool IntegerCheck(char const * start, char const * stop, T x, TResult & out) { - if (*stop == 0) + if (errno != EINVAL && *stop == 0) { - out = static_cast(x); - ASSERT_EQUAL(static_cast(out), x, ()); - return true; + out = static_cast(x); + return static_cast(out) == x; } + errno = 0; return false; } } 
// namespace - -bool to_int(char const * s, int & i, int base /*= 10*/) + +bool to_int(char const * start, int & i, int base /*= 10*/) { char * stop; - return IntegerCheck(strtol(s, &stop, base), stop, i); + long const v = strtol(start, &stop, base); + return IntegerCheck(start, stop, v, i); } -bool to_uint(char const * s, unsigned int & i, int base /*= 10*/) +bool to_uint(char const * start, unsigned int & i, int base /*= 10*/) { char * stop; - return IntegerCheck(strtoul(s, &stop, base), stop, i); + unsigned long const v = strtoul(start, &stop, base); + return IntegerCheck(start, stop, v, i); } From ff1fc714c7e4b47c224113158a591f3db2e46656 Mon Sep 17 00:00:00 2001 From: Sergey Yershov Date: Mon, 30 May 2016 12:36:55 +0300 Subject: [PATCH 9/9] Review fix --- tools/python/booking_hotels.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py index cf3d3cbbdf..a0ba6d3f94 100755 --- a/tools/python/booking_hotels.py +++ b/tools/python/booking_hotels.py @@ -59,8 +59,11 @@ class BookingApi: request = urllib2.Request(url, None, self.baseConfig["headers"]) stream = urllib2.urlopen(request) payload = stream.read() - print(payload) - return json.loads(payload) + data = json.loads(payload) + if isinstance(data, dict) and 'ruid' in data: + logging.error('Api call failed with error: {0} Code: {1}'.format(data['message'], data['code'])) + return None + return data except Exception as e: logging.error('Error: {0} Context: {1}'.format(e, payload)) @@ -98,12 +101,14 @@ def download(user, password, path): while True: hotels = api.call('getHotels', dict(new_hotel_type=1, offset=len(allhotels), rows=maxrows, countrycodes=countrycode)) - if isinstance(hotels, dict) and 'ruid' in hotels: - logging.error('Api call failed with error: {0} Code: {1}'.format(hotels['message'], hotels['code'])) + + # Check for error. 
+ if not hotels: exit(1) allhotels.append(hotels) + # If the answer contains fewer hotels than maxrows, we have reached the end of the data. if len(hotels) < maxrows: break