forked from organicmaps/organicmaps
Merge pull request #3302 from syershov/MAPSME-1232
[booking] Process data from booking.com
This commit is contained in:
commit
6022f55ab6
13 changed files with 664 additions and 9 deletions
|
@ -176,6 +176,10 @@ UNIT_TEST(to_int)
|
|||
int i;
|
||||
string s;
|
||||
|
||||
s = "AF";
|
||||
TEST(strings::to_int(s, i, 16), ());
|
||||
TEST_EQUAL(175, i, ());
|
||||
|
||||
s = "-2";
|
||||
TEST(strings::to_int(s, i), ());
|
||||
TEST_EQUAL(-2, i, ());
|
||||
|
@ -190,10 +194,43 @@ UNIT_TEST(to_int)
|
|||
|
||||
s = "labuda";
|
||||
TEST(!strings::to_int(s, i), ());
|
||||
}
|
||||
|
||||
// Exercises strings::to_uint() with empty, negative, non-numeric, hexadecimal
// and out-of-range inputs, including both sides of the 4294967295 boundary.
UNIT_TEST(to_uint)
{
  unsigned int i;
  string s;

  // Nothing to parse.
  s = "";
  TEST(!strings::to_uint(s, i), ());

  // Negative numbers are not valid unsigned values.
  s = "-2";
  TEST(!strings::to_uint(s, i), ());

  s = "0";
  TEST(strings::to_uint(s, i), ());
  TEST_EQUAL(0, i, ());

  // Far too large for unsigned int.
  s = "123456789123456789123456789";
  TEST(!strings::to_uint(s, i), ());

  s = "labuda";
  TEST(!strings::to_uint(s, i), ());

  // Hexadecimal input. Only to_uint() is called here: |i| is unsigned and
  // cannot be passed to to_int(), which takes an int reference.
  s = "AF";
  TEST(strings::to_uint(s, i, 16), ());
  TEST_EQUAL(175, i, ());

  s = "100";
  TEST(strings::to_uint(s, i), ());
  TEST_EQUAL(100, i, ());

  // Largest value the test expects to be accepted.
  s = "4294967295";
  TEST(strings::to_uint(s, i), ());
  TEST_EQUAL(0xFFFFFFFF, i, ());

  // One past that value must be rejected.
  s = "4294967296";
  TEST(!strings::to_uint(s, i), ());
}
|
||||
|
||||
UNIT_TEST(to_uint64)
|
||||
|
|
|
@ -42,19 +42,36 @@ UniChar LastUniChar(string const & s)
|
|||
return *iter;
|
||||
}
|
||||
|
||||
bool to_int(char const * s, int & i, int base /*= 10*/)
|
||||
namespace
|
||||
{
|
||||
char * stop;
|
||||
long const x = strtol(s, &stop, base);
|
||||
if (*stop == 0)
|
||||
template <typename T, typename TResult>
|
||||
bool IntegerCheck(char const * start, char const * stop, T x, TResult & out)
|
||||
{
|
||||
if (errno != EINVAL && *stop == 0)
|
||||
{
|
||||
i = static_cast<int>(x);
|
||||
ASSERT_EQUAL(static_cast<long>(i), x, ());
|
||||
return true;
|
||||
out = static_cast<TResult>(x);
|
||||
return static_cast<T>(out) == x;
|
||||
}
|
||||
errno = 0;
|
||||
return false;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
bool to_int(char const * start, int & i, int base /*= 10*/)
|
||||
{
|
||||
char * stop;
|
||||
long const v = strtol(start, &stop, base);
|
||||
return IntegerCheck(start, stop, v, i);
|
||||
}
|
||||
|
||||
bool to_uint(char const * start, unsigned int & i, int base /*= 10*/)
|
||||
{
|
||||
char * stop;
|
||||
unsigned long const v = strtoul(start, &stop, base);
|
||||
return IntegerCheck(start, stop, v, i);
|
||||
}
|
||||
|
||||
|
||||
bool to_uint64(char const * s, uint64_t & i)
|
||||
{
|
||||
char * stop;
|
||||
|
|
|
@ -209,6 +209,7 @@ template <class T, size_t N, class TT> bool IsInArray(T (&arr) [N], TT const & t
|
|||
/// @name From string to numeric.
|
||||
//@{
|
||||
bool to_int(char const * s, int & i, int base = 10);
|
||||
bool to_uint(char const * s, unsigned int & i, int base = 10);
|
||||
bool to_uint64(char const * s, uint64_t & i);
|
||||
bool to_int64(char const * s, int64_t & i);
|
||||
bool to_double(char const * s, double & d);
|
||||
|
@ -216,6 +217,7 @@ bool to_double(char const * s, double & d);
|
|||
inline bool is_number(string const & s) { int64_t dummy; return to_int64(s.c_str(), dummy); }
|
||||
|
||||
inline bool to_int(string const & s, int & i, int base = 10) { return to_int(s.c_str(), i, base); }
|
||||
inline bool to_uint(string const & s, unsigned int & i, int base = 10) { return to_uint(s.c_str(), i, base); }
|
||||
inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); }
|
||||
inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); }
|
||||
inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); }
|
||||
|
|
282
generator/booking_dataset.cpp
Normal file
282
generator/booking_dataset.cpp
Normal file
|
@ -0,0 +1,282 @@
|
|||
#include "generator/booking_dataset.hpp"
|
||||
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "geometry/distance_on_sphere.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/fstream.hpp"
|
||||
#include "std/iostream.hpp"
|
||||
#include "std/sstream.hpp"
|
||||
|
||||
namespace generator
|
||||
{
|
||||
|
||||
namespace
|
||||
{
|
||||
// Returns true when |value| is exactly one of the accommodation kinds listed below.
bool CheckForValues(string const & value)
{
  char const * const kAccommodationKinds[] = {"hotel",       "apartment", "camp_site", "chalet",
                                              "guest_house", "hostel",    "motel",     "resort"};
  for (char const * kind : kAccommodationKinds)
  {
    if (value == kind)
      return true;
  }
  return false;
}
|
||||
} // namespace
|
||||
|
||||
// Builds a hotel from one tab-separated record. Columns are read in the order
// of the Fields enum; missing trailing columns stay empty. Results of the
// numeric conversions are not checked.
BookingDataset::Hotel::Hotel(string const & src)
{
  vector<string> columns(FieldsCount());
  size_t pos = 0;
  for (strings::SimpleTokenizer it(src, "\t"); it && pos < columns.size(); ++pos, ++it)
    columns[pos] = *it;

  // Shorthand for the text stored in the column of the given field.
  auto const column = [&](Fields field) -> string const & { return columns[Index(field)]; };

  strings::to_uint(column(Fields::Id), id);
  strings::to_double(column(Fields::Latitude), lat);
  strings::to_double(column(Fields::Longtitude), lon);

  name = column(Fields::Name);
  address = column(Fields::Address);

  strings::to_uint(column(Fields::Stars), stars);
  strings::to_uint(column(Fields::PriceCategory), priceCategory);
  strings::to_double(column(Fields::RatingBooking), ratingBooking);
  strings::to_double(column(Fields::RatingUsers), ratingUser);

  descUrl = column(Fields::DescUrl);

  strings::to_uint(column(Fields::Type), type);
}
|
||||
|
||||
// Writes the name, address and coordinates of |h| to |s| on a single line.
ostream & operator<<(ostream & s, BookingDataset::Hotel const & h)
{
  s << "Name: " << h.name;
  s << "\t Address: " << h.address;
  s << "\t lat: " << h.lat;
  s << " lon: " << h.lon;
  return s;
}
|
||||
|
||||
// Loads hotels from |dataPath| and indexes every hotel in the r-tree as a
// degenerate box at its (lat, lon), keyed by its position in m_hotels.
BookingDataset::BookingDataset(string const & dataPath)
{
  LoadHotels(dataPath);

  for (size_t i = 0; i < m_hotels.size(); ++i)
  {
    auto const & hotel = m_hotels[i];
    TPoint const location(hotel.lat, hotel.lon);
    m_rtree.insert(std::make_pair(TBox(location, location), i));
  }
}
|
||||
|
||||
// Applies Filter() with a predicate that matches the element against the booking data.
bool BookingDataset::BookingFilter(OsmElement const & e) const
{
  auto const matchesBooking = [this](OsmElement const & element)
  {
    return MatchWithBooking(element);
  };
  return Filter(e, matchesBooking);
}
|
||||
|
||||
// Applies Filter() with a predicate that accepts every element, so the result
// depends only on the checks Filter() performs itself.
bool BookingDataset::TourismFilter(OsmElement const & e) const
{
  auto const acceptAll = [](OsmElement const &) { return true; };
  return Filter(e, acceptAll);
}
|
||||
|
||||
// Returns the hotel stored at |index|. The index is asserted to be smaller
// than the number of loaded hotels.
BookingDataset::Hotel const & BookingDataset::GetHotel(size_t index) const
{
  ASSERT_GREATER(m_hotels.size(), index, ());
  auto const & hotel = m_hotels[index];
  return hotel;
}
|
||||
|
||||
// Returns indexes (into m_hotels) of up to |limit| hotels the r-tree reports as
// nearest to (lat, lon). When |maxDistance| is non-zero, hotels whose
// ms::DistanceOnEarth distance from the point exceeds it are left out.
vector<size_t> BookingDataset::GetNearestHotels(double lat, double lon, size_t limit,
                                                double maxDistance /* = 0.0 */) const
{
  namespace bgi = boost::geometry::index;

  vector<size_t> result;
  auto const queryEnd = bgi::qend(m_rtree);
  for (auto it = bgi::qbegin(m_rtree, bgi::nearest(TPoint(lat, lon), limit)); it != queryEnd; ++it)
  {
    size_t const hotelIndex = it->second;
    auto const & hotel = m_hotels[hotelIndex];
    double const dist = ms::DistanceOnEarth(lat, lon, hotel.lat, hotel.lon);
    // A zero maxDistance disables the distance cut-off.
    if (maxDistance != 0.0 && dist > maxDistance /* max distance in meters */)
      continue;

    result.emplace_back(hotelIndex);
  }
  return result;
}
|
||||
|
||||
// Name matching is not implemented: the function reports "no match" for every
// input and uses neither argument. The disabled sketch below compares name
// tokens by edit distance; it refers to names (name, indexes, e) that do not
// exist in this function and has to be adapted before it can be enabled.
bool BookingDataset::MatchByName(string const & osmName, vector<size_t> const & bookingIndexes) const
{
  // Match name.
  //  vector<strings::UniString> osmTokens;
  //  NormalizeAndTokenizeString(name, osmTokens, search::Delimiters());
  //
  //  cout << "\n------------- " << name << endl;
  //
  //  bool matched = false;
  //  for (auto const & index : indexes)
  //  {
  //    vector<strings::UniString> bookingTokens;
  //    NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters());
  //
  //    map<size_t, vector<pair<size_t, size_t>>> weightPair;
  //
  //    for (size_t j = 0; j < osmTokens.size(); ++j)
  //    {
  //      for (size_t i = 0; i < bookingTokens.size(); ++i)
  //      {
  //        size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(),
  //                                                bookingTokens[i].begin(), bookingTokens[i].end());
  //        if (distance < 3)
  //          weightPair[distance].emplace_back(i, j);
  //      }
  //    }
  //
  //    if (!weightPair.empty())
  //    {
  //      cout << m_hotels[e.second] << endl;
  //      matched = true;
  //    }
  //  }

  return false;
}
|
||||
|
||||
namespace
{
// Maps a Hotel::type code to the value of the "tourism" tag emitted for it.
// Codes that are not listed fall back to "hotel".
char const * TourismValueForType(uint32_t hotelType)
{
  switch (hotelType)
  {
  case 19:
  case 205: return "motel";

  case 21:
  case 206:
  case 212: return "resort";

  case 3:
  case 23:
  case 24:
  case 25:
  case 202:
  case 207:
  case 208:
  case 209:
  case 210:
  case 216:
  case 220:
  case 223: return "guest_house";

  case 14:
  case 204:
  case 213:
  case 218:
  case 219:
  case 226:
  case 222: return "hotel";

  case 211:
  case 224:
  case 228: return "chalet";

  case 13:
  case 225:
  case 203: return "hostel";

  case 215:
  case 221:
  case 227:
  case 2:
  case 201: return "apartment";

  case 214: return "camp_site";

  default: return "hotel";
  }
}
}  // namespace

// Creates one node element per loaded hotel, fills in its coordinates and tags
// and passes it to |fn|. Every element is created with id 1.
void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) const
{
  for (auto const & hotel : m_hotels)
  {
    OsmElement node;
    node.type = OsmElement::EntityType::Node;
    node.id = 1;

    node.lat = hotel.lat;
    node.lon = hotel.lon;

    node.AddTag("name", hotel.name);
    node.AddTag("ref:sponsored", strings::to_string(hotel.id));
    node.AddTag("website", hotel.descUrl);
    node.AddTag("rating:sponsored", strings::to_string(hotel.ratingUser));
    node.AddTag("stars", strings::to_string(hotel.stars));
    node.AddTag("price_rate", strings::to_string(hotel.priceCategory));
    node.AddTag("addr:full", hotel.address);

    node.AddTag("tourism", TourismValueForType(hotel.type));

    fn(&node);
  }
}
|
||||
|
||||
// Replaces m_hotels with the records read from |path|, one hotel per line.
// An empty path leaves the list empty; a file that cannot be opened is logged
// and also leaves the list empty.
void BookingDataset::LoadHotels(string const & path)
{
  m_hotels.clear();

  if (path.empty())
    return;

  ifstream src(path);
  if (!src.is_open())
  {
    LOG(LERROR, ("Error while opening", path, ":", strerror(errno)));
    return;
  }

  string line;
  while (getline(src, line))
    m_hotels.emplace_back(line);
}
|
||||
|
||||
// Takes the value of the first "name" tag of |e|, looks up at most 3 hotels
// near the element and returns what MatchByName() reports for them. Returns
// false when the name is missing or empty, or when no hotel is close enough.
bool BookingDataset::MatchWithBooking(OsmElement const & e) const
{
  string name;
  for (auto const & tag : e.Tags())
  {
    if (tag.key != "name")
      continue;

    // Only the first "name" tag is considered.
    name = tag.value;
    break;
  }

  if (name.empty())
    return false;

  // Find 3 nearest values to a point.
  auto const candidates = GetNearestHotels(e.lat, e.lon, 3, 150 /* max distance in meters */);
  return !candidates.empty() && MatchByName(name, candidates);
}
|
||||
|
||||
// Returns fn(e) for a node that has a "tourism" tag whose value passes
// CheckForValues(); only the first such tag is considered. Returns false for
// non-node elements, for elements without tags and when no tag qualifies.
bool BookingDataset::Filter(OsmElement const & e, function<bool(OsmElement const &)> const & fn) const
{
  if (e.type != OsmElement::EntityType::Node || e.Tags().empty())
    return false;

  // TODO: Need to write file with dropped osm features.

  for (auto const & tag : e.Tags())
  {
    if (tag.key == "tourism" && CheckForValues(tag.value))
      return fn(e);
  }

  return false;
}
|
||||
|
||||
} // namespace generator
|
81
generator/booking_dataset.hpp
Normal file
81
generator/booking_dataset.hpp
Normal file
|
@ -0,0 +1,81 @@
|
|||
#pragma once
|
||||
|
||||
#include "generator/osm_element.hpp"
|
||||
|
||||
#include "boost/geometry.hpp"
|
||||
#include "boost/geometry/geometries/point.hpp"
|
||||
#include "boost/geometry/geometries/box.hpp"
|
||||
#include "boost/geometry/index/rtree.hpp"
|
||||
|
||||
#include "std/function.hpp"
|
||||
#include "std/string.hpp"
|
||||
|
||||
namespace generator
|
||||
{
|
||||
class BookingDataset
|
||||
{
|
||||
public:
|
||||
struct Hotel
|
||||
{
|
||||
enum class Fields
|
||||
{
|
||||
Id = 0,
|
||||
Latitude = 1,
|
||||
Longtitude = 2,
|
||||
Name = 3,
|
||||
Address = 4,
|
||||
Stars = 5,
|
||||
PriceCategory = 6,
|
||||
RatingBooking = 7,
|
||||
RatingUsers = 8,
|
||||
DescUrl = 9,
|
||||
Type = 10,
|
||||
|
||||
Counter
|
||||
};
|
||||
|
||||
uint32_t id = 0;
|
||||
double lat = 0.0;
|
||||
double lon = 0.0;
|
||||
string name;
|
||||
string address;
|
||||
uint32_t stars = 0;
|
||||
uint32_t priceCategory = 0;
|
||||
double ratingBooking = 0.0;
|
||||
double ratingUser = 0.0;
|
||||
string descUrl;
|
||||
uint32_t type = 0;
|
||||
|
||||
constexpr size_t Index(Fields field) const { return static_cast<size_t>(field); }
|
||||
constexpr size_t FieldsCount() const { return static_cast<size_t>(Fields::Counter); }
|
||||
explicit Hotel(string const & src);
|
||||
};
|
||||
|
||||
explicit BookingDataset(string const & dataPath);
|
||||
|
||||
bool BookingFilter(OsmElement const & e) const;
|
||||
bool TourismFilter(OsmElement const & e) const;
|
||||
|
||||
Hotel const & GetHotel(size_t index) const;
|
||||
vector<size_t> GetNearestHotels(double lat, double lon, size_t limit, double maxDistance = 0.0) const;
|
||||
bool MatchByName(string const & osmName, vector<size_t> const & bookingIndexes) const;
|
||||
|
||||
void BuildFeatures(function<void(OsmElement *)> const & fn) const;
|
||||
protected:
|
||||
vector<Hotel> m_hotels;
|
||||
|
||||
// create the rtree using default constructor
|
||||
using TPoint = boost::geometry::model::point<float, 2, boost::geometry::cs::cartesian>;
|
||||
using TBox = boost::geometry::model::box<TPoint>;
|
||||
using TValue = pair<TBox, size_t>;
|
||||
|
||||
boost::geometry::index::rtree<TValue, boost::geometry::index::quadratic<16>> m_rtree;
|
||||
|
||||
void LoadHotels(string const & path);
|
||||
bool MatchWithBooking(OsmElement const & e) const;
|
||||
bool Filter(OsmElement const & e, function<bool(OsmElement const &)> const & fn) const;
|
||||
};
|
||||
|
||||
ostream & operator<<(ostream & s, BookingDataset::Hotel const & h);
|
||||
|
||||
} // namespace generator
|
|
@ -41,6 +41,8 @@ struct GenerateInfo
|
|||
NodeStorageType m_nodeStorageType;
|
||||
OsmSourceType m_osmFileType;
|
||||
string m_osmFileName;
|
||||
|
||||
string m_bookingDatafileName;
|
||||
|
||||
uint32_t m_versionDate = 0;
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@ INCLUDEPATH *= $$ROOT_DIR/3party/gflags/src \
|
|||
QT *= core
|
||||
|
||||
SOURCES += \
|
||||
booking_dataset.cpp \
|
||||
borders_generator.cpp \
|
||||
borders_loader.cpp \
|
||||
check_model.cpp \
|
||||
|
@ -37,6 +38,7 @@ SOURCES += \
|
|||
unpack_mwm.cpp \
|
||||
|
||||
HEADERS += \
|
||||
booking_dataset.hpp \
|
||||
borders_generator.hpp \
|
||||
borders_loader.hpp \
|
||||
check_model.hpp \
|
||||
|
|
|
@ -67,6 +67,7 @@ DEFINE_bool(make_cross_section, false, "Make corss section in routing file for c
|
|||
DEFINE_string(osm_file_name, "", "Input osm area file");
|
||||
DEFINE_string(osm_file_type, "xml", "Input osm area file type [xml, o5m]");
|
||||
DEFINE_string(user_resource_path, "", "User defined resource path for classificator.txt and etc.");
|
||||
DEFINE_string(booking_data, "", "Path to booking data in .tsv format");
|
||||
DEFINE_uint64(planet_version, my::SecondsSinceEpoch(), "Version as seconds since epoch, by default - now");
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
|
@ -100,6 +101,7 @@ int main(int argc, char ** argv)
|
|||
genInfo.m_osmFileName = FLAGS_osm_file_name;
|
||||
genInfo.m_failOnCoasts = FLAGS_fail_on_coasts;
|
||||
genInfo.m_preloadCache = FLAGS_preload_cache;
|
||||
genInfo.m_bookingDatafileName = FLAGS_booking_data;
|
||||
|
||||
genInfo.m_versionDate = static_cast<uint32_t>(FLAGS_planet_version);
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#include "generator/osm_element.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
#include "coding/parse_xml.hpp"
|
||||
|
||||
#include "std/cstdio.hpp"
|
||||
|
@ -63,7 +64,9 @@ void OsmElement::AddTag(string const & k, string const & v)
|
|||
SKIP_KEY("official_name");
|
||||
#undef SKIP_KEY
|
||||
|
||||
m_tags.emplace_back(k, v);
|
||||
string value = v;
|
||||
strings::Trim(value);
|
||||
m_tags.emplace_back(k, value);
|
||||
}
|
||||
|
||||
string OsmElement::ToString(string const & shift) const
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#include "generator/booking_dataset.hpp"
|
||||
#include "generator/coastlines_generator.hpp"
|
||||
#include "generator/feature_generator.hpp"
|
||||
#include "generator/intermediate_data.hpp"
|
||||
|
@ -511,12 +512,19 @@ bool GenerateFeaturesImpl(feature::GenerateInfo & info)
|
|||
TagAdmixer tagAdmixer(info.GetIntermediateFileName("ways", ".csv"),
|
||||
info.GetIntermediateFileName("towns", ".csv"));
|
||||
TagReplacer tagReplacer(GetPlatform().ResourcesDir() + REPLACED_TAGS_FILE);
|
||||
|
||||
// If info.m_bookingDatafileName is empty then no data will be loaded.
|
||||
generator::BookingDataset bookingDataset(info.m_bookingDatafileName);
|
||||
|
||||
// Here we can add new tags to element!!!
|
||||
auto const fn = [&](OsmElement * e)
|
||||
{
|
||||
tagReplacer(e);
|
||||
tagAdmixer(e);
|
||||
|
||||
if (bookingDataset.BookingFilter(*e))
|
||||
return;
|
||||
|
||||
parser.EmitElement(e);
|
||||
};
|
||||
|
||||
|
@ -533,6 +541,12 @@ bool GenerateFeaturesImpl(feature::GenerateInfo & info)
|
|||
|
||||
LOG(LINFO, ("Processing", info.m_osmFileName, "done."));
|
||||
|
||||
if (!info.m_bookingDatafileName.empty())
|
||||
{
|
||||
bookingDataset.BuildFeatures([&](OsmElement * e) { parser.EmitElement(e); });
|
||||
LOG(LINFO, ("Processing booking data from", info.m_bookingDatafileName, "done."));
|
||||
}
|
||||
|
||||
parser.Finish();
|
||||
|
||||
// Stop if coasts are not merged and FLAG_fail_on_coasts is set
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#pragma once
|
||||
#include "base/stl_add.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
|
|
204
tools/python/booking_hotels.py
Executable file
204
tools/python/booking_hotels.py
Executable file
|
@ -0,0 +1,204 @@
|
|||
#!/usr/bin/python
|
||||
# coding: utf8
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import namedtuple, defaultdict
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
import urllib2
|
||||
|
||||
# init logging
|
||||
logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s')
|
||||
|
||||
Hotel = namedtuple('Hotel',
|
||||
['id', 'lat', 'lon', 'name', 'address',
|
||||
'stars', 'priceCategory', 'ratingBooking',
|
||||
'ratingUser', 'descUrl'])
|
||||
|
||||
class BookingApi:
    '''
    Small client for the booking.com JSON API that uses HTTP basic
    authentication and limits the number of requests made per minute.
    '''

    def __init__(self, login, password):
        self.login = login
        self.password = password
        credentials = "{login}:{password}".format(login=self.login, password=self.password)
        self.baseConfig = {
            "headers": {
                "Content-Type": "application/json",
                "Authorization": "Basic " + base64.b64encode(credentials)
            },
            "url": 'https://distribution-xml.booking.com/json/bookings'}
        # Minute (0-59) the request counter below belongs to.
        self.checkMinute = 0
        self.requestPerMinute = 0
        self.requestLimit = 15  # request per minute

    def call(self, function, params=None):
        '''
        Calls the API method `function` with optional query `params`.
        Returns the decoded JSON answer, or None when the request fails or the
        answer is an error object (a dict containing 'ruid').
        '''
        self.requestPerMinute += 1
        now = datetime.utcnow()

        # Once the limit is reached, wait until the next minute starts.
        if self.requestPerMinute >= self.requestLimit:
            waittime = 60 - now.second
            logging.warning("Limit for request per minute exceeded. Waiting for: {0} sec.".format(waittime))
            time.sleep(waittime)
            now = datetime.utcnow()

        # A new minute starts a new counting window.
        if self.checkMinute != now.minute:
            self.requestPerMinute = 0
            self.checkMinute = now.minute

        payload = ''
        try:
            if params:
                pairs = ["{key}={value}".format(key=k, value=v) for (k, v) in params.iteritems()]
                query = '?' + "&".join(pairs)
            else:
                query = ""
            url = "{base}.{func}{params}".format(base=self.baseConfig["url"], func=function, params=query)
            logging.debug("{0} {1} API call:{2}".format(self.checkMinute, self.requestPerMinute, url))
            request = urllib2.Request(url, None, self.baseConfig["headers"])
            payload = urllib2.urlopen(request).read()
            data = json.loads(payload)
            if isinstance(data, dict) and 'ruid' in data:
                logging.error('Api call failed with error: {0} Code: {1}'.format(data['message'], data['code']))
                return None
            return data

        except Exception as e:
            logging.error('Error: {0} Context: {1}'.format(e, payload))
            return None
|
||||
|
||||
|
||||
def make_record(src, rate):
    '''
    Builds a Hotel tuple from the API hotel dict `src` and the computed price
    `rate`; every field is converted to unicode.
    '''
    return Hotel(
        id=unicode(src['hotel_id']),
        lat=unicode(src['location']['latitude']),
        lon=unicode(src['location']['longitude']),
        name=unicode(src['name']),
        address=unicode(src['address']),
        stars=unicode(src['class']),
        priceCategory=unicode(rate),
        ratingBooking=unicode(src['ranking']),
        ratingUser=unicode(src['review_score']),
        descUrl=unicode(src['url'])
    )
|
||||
|
||||
|
||||
def download(user, password, path):
    '''
    Downloads all hotels from booking.com and stores them in a bunch of .pkl files.

    One file is written to `path` per country and holds a flat list of that
    country's hotel dicts. The process exits with status 1 as soon as an API
    call fails.
    '''
    api = BookingApi(user, password)

    maxrows = 1000
    countries = api.call("getCountries", dict(languagecodes='en'))
    # call() returns None on failure; without the list of countries nothing can be done.
    if countries is None:
        exit(1)

    for country in countries:
        countrycode = country['countrycode']
        logging.info(u'Download[{0}]: {1}'.format(countrycode, country['name']))

        allhotels = []
        while True:
            hotels = api.call('getHotels',
                              dict(new_hotel_type=1, offset=len(allhotels), rows=maxrows, countrycodes=countrycode))

            # Check for error. Only None signals a failure: an empty page is a
            # valid answer (no hotels at all, or a total that is an exact
            # multiple of maxrows) and simply ends the paging below.
            if hotels is None:
                exit(1)

            # Add the hotels of this page one by one, so that len(allhotels) is
            # the number of hotels fetched so far and the next offset is right.
            allhotels.extend(hotels)

            # If the answer holds fewer hotels than maxrows, we reached the end of data.
            if len(hotels) < maxrows:
                break

        logging.info('Num of hotels: {0}'.format(len(allhotels)))
        filename = os.path.join(path,
                                '{0} - {1}.pkl'.format(country['area'].encode('utf8'), country['name'].encode('utf8')))
        with open(filename, 'wb') as fd:
            pickle.dump(allhotels, fd, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
def translate(source, output):
    '''
    Reads *.pkl files and produces a single list of hotels as tab separated values.

    `source` is the directory holding the .pkl files, `output` is the path of
    the file to write. Every hotel also gets a price rate: 0 when it lacks the
    data needed to compute one, otherwise 1..3 depending on how its minimal
    price compares with the median price for the same city and currency.
    '''
    # Sorted, so that the order of the output does not depend on the file system.
    files = sorted(filename for filename in os.listdir(source) if filename.endswith('.pkl'))

    data = []
    for filename in files:
        logging.info('Processing {0}'.format(filename))
        # os.listdir() returns bare names; they have to be resolved against `source`.
        # NOTE: unpickling executes code stored in the file, so `source` must only
        # contain trusted files.
        with open(os.path.join(source, filename), 'rb') as fd:
            data += pickle.load(fd)

    # Dict of dicts city_id -> { currency -> [prices] }
    cities = defaultdict(lambda: defaultdict(list))

    def valid(hotel):
        return 'city_id' in hotel and 'currencycode' in hotel and 'minrate' in hotel and hotel['minrate'] is not None

    # Collect prices
    for hotel in data:
        if valid(hotel):
            cities[hotel['city_id']][hotel['currencycode']].append(float(hotel['minrate']))

    # Replaces list of prices by a median price.
    for city in cities:
        for cur in cities[city]:
            prices = sorted(cities[city][cur])
            # Integer division: the result is used as a list index.
            cities[city][cur] = prices[len(prices) // 2]

    # Price rate ranges, relative to the median price for a city
    rates = (0.7, 1.3)

    with open(output, 'w') as fd:
        for hotel in data:
            rate = 0
            if valid(hotel):
                avg = cities[hotel['city_id']][hotel['currencycode']]
                price = float(hotel['minrate'])
                rate = 1
                # Find a range that contains the price
                while rate <= len(rates) and price > avg * rates[rate - 1]:
                    rate += 1
            cur = make_record(hotel, rate)
            l = [e.encode('utf8') for e in cur]
            print('\t'.join(l), file=fd)
|
||||
|
||||
|
||||
def process_options():
    '''
    Parses the command line and returns the options.

    Prints the help and exits with status 1 when no action is selected or
    when an option required by a selected action is missing.
    '''
    parser = argparse.ArgumentParser(description='Download and process booking hotels.')
    parser.add_argument("-v", "--verbose", action="store_true", dest="verbose")
    parser.add_argument("-q", "--quiet", action="store_false", dest="verbose")

    parser.add_argument("--password", dest="password", help="Booking.com account password")
    parser.add_argument("--user", dest="user", help="Booking.com account user name")

    parser.add_argument("--path", dest="path", help="Path to data files")
    parser.add_argument("--output", dest="output", help="Name and destination for output file")

    parser.add_argument("--download", action="store_true", dest="download", default=False)
    parser.add_argument("--translate", action="store_true", dest="translate", default=False)

    options = parser.parse_args()

    # Nothing to do: show the usage and stop instead of silently continuing.
    if not options.download and not options.translate:
        parser.print_help()
        parser.exit(1)

    # Both actions work on the directory given by --path.
    if not options.path:
        print("--path isn't set")
        parser.print_help()
        parser.exit(1)

    if options.download and not (options.user and options.password):
        print("--user and --password are required for --download")
        parser.print_help()
        parser.exit(1)

    if options.translate and not options.output:
        print("--output isn't set")
        parser.print_help()
        parser.exit(1)

    return options
|
||||
|
||||
|
||||
def main():
    '''
    Runs the actions selected on the command line: the download first, then
    the translation.
    '''
    opts = process_options()

    if opts.download:
        download(opts.user, opts.password, opts.path)

    if opts.translate:
        translate(opts.path, opts.output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -57,6 +57,8 @@
|
|||
677E2A161CAACC5F001DC42A /* tag_admixer.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 677E2A111CAACC5F001DC42A /* tag_admixer.hpp */; };
|
||||
677E2A171CAACC5F001DC42A /* towns_dumper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 677E2A121CAACC5F001DC42A /* towns_dumper.cpp */; };
|
||||
677E2A181CAACC5F001DC42A /* towns_dumper.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 677E2A131CAACC5F001DC42A /* towns_dumper.hpp */; };
|
||||
67A0FEBE1CEB467F008F2A61 /* booking_dataset.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67A0FEBC1CEB467F008F2A61 /* booking_dataset.cpp */; };
|
||||
67A0FEBF1CEB467F008F2A61 /* booking_dataset.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67A0FEBD1CEB467F008F2A61 /* booking_dataset.hpp */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
|
@ -113,6 +115,8 @@
|
|||
677E2A111CAACC5F001DC42A /* tag_admixer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = tag_admixer.hpp; sourceTree = "<group>"; };
|
||||
677E2A121CAACC5F001DC42A /* towns_dumper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = towns_dumper.cpp; sourceTree = "<group>"; };
|
||||
677E2A131CAACC5F001DC42A /* towns_dumper.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = towns_dumper.hpp; sourceTree = "<group>"; };
|
||||
67A0FEBC1CEB467F008F2A61 /* booking_dataset.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = booking_dataset.cpp; sourceTree = "<group>"; };
|
||||
67A0FEBD1CEB467F008F2A61 /* booking_dataset.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = booking_dataset.hpp; sourceTree = "<group>"; };
|
||||
67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = osm_xml_source.hpp; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
|
||||
|
@ -199,6 +203,8 @@
|
|||
670B84BB1A8CDB0000CE4492 /* osm_source.hpp */,
|
||||
6764B8921ADD6A3300DD8B15 /* osm_o5m_source.hpp */,
|
||||
67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */,
|
||||
67A0FEBC1CEB467F008F2A61 /* booking_dataset.cpp */,
|
||||
67A0FEBD1CEB467F008F2A61 /* booking_dataset.hpp */,
|
||||
);
|
||||
name = generator;
|
||||
path = ../../generator;
|
||||
|
@ -227,6 +233,7 @@
|
|||
675340741A3F2A7400A0A8C3 /* generate_info.hpp in Headers */,
|
||||
677E2A161CAACC5F001DC42A /* tag_admixer.hpp in Headers */,
|
||||
675340861A3F2A7400A0A8C3 /* tesselator.hpp in Headers */,
|
||||
67A0FEBF1CEB467F008F2A61 /* booking_dataset.hpp in Headers */,
|
||||
6753405F1A3F2A7400A0A8C3 /* borders_loader.hpp in Headers */,
|
||||
675340801A3F2A7400A0A8C3 /* polygonizer.hpp in Headers */,
|
||||
675340941C5231BA002CF0D9 /* search_index_builder.hpp in Headers */,
|
||||
|
@ -309,6 +316,7 @@
|
|||
675340811A3F2A7400A0A8C3 /* routing_generator.cpp in Sources */,
|
||||
675340931C5231BA002CF0D9 /* search_index_builder.cpp in Sources */,
|
||||
6753406E1A3F2A7400A0A8C3 /* feature_merger.cpp in Sources */,
|
||||
67A0FEBE1CEB467F008F2A61 /* booking_dataset.cpp in Sources */,
|
||||
6753408D1A3F2A7400A0A8C3 /* osm_element.cpp in Sources */,
|
||||
6726C1D51A4AFEF4005EEA39 /* osm2meta.cpp in Sources */,
|
||||
6753405E1A3F2A7400A0A8C3 /* borders_loader.cpp in Sources */,
|
||||
|
|
Loading…
Add table
Reference in a new issue