[booking] Support data from booking.com

This commit is contained in:
Sergey Yershov 2016-05-24 12:32:19 +03:00
parent 1ac780c71d
commit 706e4467f3
9 changed files with 355 additions and 1 deletions

View file

@ -0,0 +1,252 @@
#include "generator/booking_dataset.hpp"
#include "base/string_utils.hpp"
#include "geometry/distance_on_sphere.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "std/fstream.hpp"
#include "std/iostream.hpp"
#include "std/sstream.hpp"
BookingDataset::BookingHotel::BookingHotel(string const & src)
{
stringstream ss(src);
string elem;
vector<string> rec(FieldsCount());
for (size_t i = 0; getline(ss, elem, '\t') && i < rec.size(); ++i)
rec[i] = elem;
id = static_cast<uint32_t>(strtoul(rec[Index(Fields::Id)].c_str(), nullptr, 10));
lat = strtod(rec[Index(Fields::Latitude)].c_str(), nullptr);
lon = strtod(rec[Index(Fields::Longtitude)].c_str(), nullptr);
name = rec[Index(Fields::Name)];
address = rec[Index(Fields::Address)];
stars = rec[Index(Fields::Stars)].empty()
? 0
: static_cast<uint32_t>(strtoul(rec[Index(Fields::Stars)].c_str(), nullptr, 10));
priceCategory =
rec[Index(Fields::PriceCategory)].empty()
? 0
: static_cast<uint32_t>(strtoul(rec[Index(Fields::PriceCategory)].c_str(), nullptr, 10));
ratingBooking = rec[Index(Fields::RatingBooking)].empty()
? 0
: strtod(rec[Index(Fields::RatingBooking)].c_str(), nullptr);
ratingUser = rec[Index(Fields::RatingUsers)].empty()
? 0
: strtod(rec[Index(Fields::RatingUsers)].c_str(), nullptr);
descUrl = rec[Index(Fields::DescUrl)];
type = rec[Index(Fields::Type)].empty()
? 0
: static_cast<uint32_t>(strtoul(rec[Index(Fields::Type)].c_str(), nullptr, 10));
}
ostream & operator<<(ostream & s, BookingDataset::BookingHotel const & h)
{
return s << "Name: " << h.name << " lon: " << h.lon << " lat: " << h.lat;
}
void BookingDataset::LoadBookingHotels(string const & path)
{
m_hotels.clear();
if(path.empty())
return;
ifstream src(path);
for (string elem; getline(src, elem);)
m_hotels.emplace_back(elem);
}
BookingDataset::BookingDataset(string const & dataPath)
{
LoadBookingHotels(dataPath);
size_t counter = 0;
for (auto const & hotel : m_hotels)
{
TBox b(TPoint(hotel.lon, hotel.lat), TPoint(hotel.lon, hotel.lat));
m_rtree.insert(std::make_pair(b, counter++));
}
}
bool CheckForValues(string const & value)
{
for (char const * val :
{"hotel", "apartment", "camp_site", "chalet", "guest_house", "hostel", "motel", "resort"})
{
if (value == val)
return true;
}
return false;
}
bool BookingDataset::MatchWithBooking(OsmElement const & e) const
{
string name;
for (auto const & tag : e.Tags())
{
if (tag.key == "name")
{
name = tag.value;
break;
}
}
if (name.empty())
return false;
// Find 3 nearest values to a point.
vector<TValue> result;
for_each(boost::geometry::index::qbegin(m_rtree,
boost::geometry::index::nearest(TPoint(e.lon, e.lat), 3)),
boost::geometry::index::qend(m_rtree), [&](TValue const & v)
{
auto const & hotel = m_hotels[v.second];
double dist = ms::DistanceOnEarth(e.lon, e.lat, hotel.lon, hotel.lat);
if (dist > 150 /* max distance in meters */)
return;
result.emplace_back(v);
});
if (result.empty())
return false;
// Match name.
vector<strings::UniString> osmTokens;
NormalizeAndTokenizeString(name, osmTokens, search::Delimiters());
// cout << "\n------------- " << name << endl;
bool matched = false;
for (auto const & e : result)
{
vector<strings::UniString> bookingTokens;
NormalizeAndTokenizeString(m_hotels[e.second].name, bookingTokens, search::Delimiters());
map<size_t, vector<pair<size_t, size_t>>> weightPair;
for (size_t j = 0; j < osmTokens.size(); ++j)
{
for (size_t i = 0; i < bookingTokens.size(); ++i)
{
size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(),
bookingTokens[i].begin(), bookingTokens[i].end());
if (distance < 3)
weightPair[distance].emplace_back(i, j);
}
}
if (!weightPair.empty())
{
// cout << m_hotels[e.second] << endl;
matched = true;
}
}
return matched;
}
bool BookingDataset::Filter(OsmElement const & e) const
{
if (e.type != OsmElement::EntityType::Node)
return false;
if (e.Tags().empty())
return false;
bool matched = false;
for (auto const & tag : e.Tags())
{
if (tag.key == "tourism" && CheckForValues(tag.value))
{
matched = MatchWithBooking(e);
break;
}
}
// TODO: Need to write file with dropped osm features.
return matched;
}
void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) const
{
for (auto const & hotel : m_hotels)
{
OsmElement e;
e.type = OsmElement::EntityType::Node;
e.id = 1;
e.lon = hotel.lon;
e.lat = hotel.lat;
e.AddTag("name", hotel.name);
e.AddTag("ref:sponsored", strings::to_string(hotel.id));
e.AddTag("website", hotel.descUrl);
e.AddTag("rating:sponsored", strings::to_string(hotel.ratingUser));
e.AddTag("stars", strings::to_string(hotel.stars));
e.AddTag("price_rate", strings::to_string(hotel.priceCategory));
e.AddTag("addr:full", hotel.address);
switch (hotel.type)
{
case 19:
case 205: e.AddTag("tourism", "motel"); break;
case 21:
case 206:
case 212: e.AddTag("tourism", "resort"); break;
case 3:
case 23:
case 24:
case 25:
case 202:
case 207:
case 208:
case 209:
case 210:
case 216:
case 220:
case 223: e.AddTag("tourism", "guest_house"); break;
case 14:
case 204:
case 213:
case 218:
case 219:
case 226:
case 222: e.AddTag("tourism", "hotel"); break;
case 211:
case 224:
case 228: e.AddTag("tourism", "chalet"); break;
case 13:
case 225:
case 203: e.AddTag("tourism", "hostel"); break;
case 215:
case 221:
case 227:
case 2:
case 201: e.AddTag("tourism", "apartment"); break;
case 214: e.AddTag("tourism", "camp_site"); break;
default: e.AddTag("tourism", "hotel"); break;
}
fn(&e);
}
}

View file

@ -0,0 +1,70 @@
#pragma once
#include "generator/osm_element.hpp"
#include "boost/geometry.hpp"
#include "boost/geometry/geometries/point.hpp"
#include "boost/geometry/geometries/box.hpp"
#include "boost/geometry/index/rtree.hpp"
#include "std/function.hpp"
#include "std/string.hpp"
class BookingDataset
{
public:
struct BookingHotel
{
enum class Fields : size_t
{
Id = 0,
Latitude = 1,
Longtitude = 2,
Name = 3,
Address = 4,
Stars = 5,
PriceCategory = 6,
RatingBooking = 7,
RatingUsers = 8,
DescUrl = 9,
Type = 10,
Counter
};
uint32_t id = 0;
double lat = 0.0;
double lon = 0.0;
string name;
string address;
uint32_t stars = 0;
uint32_t priceCategory = 0;
double ratingBooking = 0.0;
double ratingUser = 0.0;
string descUrl;
uint32_t type = 0;
constexpr size_t Index(Fields field) const { return static_cast<size_t>(field); }
constexpr size_t FieldsCount() const { return static_cast<size_t>(Fields::Counter); }
BookingHotel(string const &src);
};
BookingDataset(string const & dataPath);
bool Filter(OsmElement const & e) const;
void BuildFeatures(function<void(OsmElement *)> const & fn) const;
protected:
vector<BookingHotel> m_hotels;
// create the rtree using default constructor
using TPoint = boost::geometry::model::point<float, 2, boost::geometry::cs::cartesian>;
using TBox = boost::geometry::model::box<TPoint>;
using TValue = pair<TBox, size_t>;
boost::geometry::index::rtree<TValue, boost::geometry::index::quadratic<16>> m_rtree;
void LoadBookingHotels(string const & path);
bool MatchWithBooking(OsmElement const & e) const;
};

View file

@ -41,6 +41,8 @@ struct GenerateInfo
NodeStorageType m_nodeStorageType;
OsmSourceType m_osmFileType;
string m_osmFileName;
string m_bookingDatafileName;
uint32_t m_versionDate = 0;

View file

@ -14,6 +14,7 @@ INCLUDEPATH *= $$ROOT_DIR/3party/gflags/src \
QT *= core
SOURCES += \
booking_dataset.cpp \
borders_generator.cpp \
borders_loader.cpp \
check_model.cpp \
@ -37,6 +38,7 @@ SOURCES += \
unpack_mwm.cpp \
HEADERS += \
booking_dataset.hpp \
borders_generator.hpp \
borders_loader.hpp \
check_model.hpp \

View file

@ -67,6 +67,7 @@ DEFINE_bool(make_cross_section, false, "Make corss section in routing file for c
DEFINE_string(osm_file_name, "", "Input osm area file");
DEFINE_string(osm_file_type, "xml", "Input osm area file type [xml, o5m]");
DEFINE_string(user_resource_path, "", "User defined resource path for classificator.txt and etc.");
DEFINE_string(booking_data, "", "Path to booking data in .tsv format");
DEFINE_uint64(planet_version, my::SecondsSinceEpoch(), "Version as seconds since epoch, by default - now");
int main(int argc, char ** argv)
@ -100,6 +101,7 @@ int main(int argc, char ** argv)
genInfo.m_osmFileName = FLAGS_osm_file_name;
genInfo.m_failOnCoasts = FLAGS_fail_on_coasts;
genInfo.m_preloadCache = FLAGS_preload_cache;
genInfo.m_bookingDatafileName = FLAGS_booking_data;
genInfo.m_versionDate = static_cast<uint32_t>(FLAGS_planet_version);

View file

@ -1,5 +1,6 @@
#include "generator/osm_element.hpp"
#include "base/string_utils.hpp"
#include "coding/parse_xml.hpp"
#include "std/cstdio.hpp"
@ -63,7 +64,9 @@ void OsmElement::AddTag(string const & k, string const & v)
SKIP_KEY("official_name");
#undef SKIP_KEY
m_tags.emplace_back(k, v);
string value = v;
strings::Trim(value);
m_tags.emplace_back(k, value);
}
string OsmElement::ToString(string const & shift) const

View file

@ -1,3 +1,4 @@
#include "generator/booking_dataset.hpp"
#include "generator/coastlines_generator.hpp"
#include "generator/feature_generator.hpp"
#include "generator/intermediate_data.hpp"
@ -511,12 +512,19 @@ bool GenerateFeaturesImpl(feature::GenerateInfo & info)
TagAdmixer tagAdmixer(info.GetIntermediateFileName("ways", ".csv"),
info.GetIntermediateFileName("towns", ".csv"));
TagReplacer tagReplacer(GetPlatform().ResourcesDir() + REPLACED_TAGS_FILE);
// If info.m_bookingDatafileName is empty then no data will be loaded.
BookingDataset bookingDataset(info.m_bookingDatafileName);
// Here we can add new tags to element!!!
auto const fn = [&](OsmElement * e)
{
tagReplacer(e);
tagAdmixer(e);
if (bookingDataset.Filter(*e))
return;
parser.EmitElement(e);
};
@ -533,6 +541,12 @@ bool GenerateFeaturesImpl(feature::GenerateInfo & info)
LOG(LINFO, ("Processing", info.m_osmFileName, "done."));
if (!info.m_bookingDatafileName.empty())
{
bookingDataset.BuildFeatures([&](OsmElement * e) { parser.EmitElement(e); });
LOG(LINFO, ("Processing booking data from", info.m_bookingDatafileName, "done."));
}
parser.Finish();
// Stop if coasts are not merged and FLAG_fail_on_coasts is set

View file

@ -1,4 +1,5 @@
#pragma once
#include "base/stl_add.hpp"
#include "base/string_utils.hpp"
#include "std/algorithm.hpp"

View file

@ -57,6 +57,8 @@
677E2A161CAACC5F001DC42A /* tag_admixer.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 677E2A111CAACC5F001DC42A /* tag_admixer.hpp */; };
677E2A171CAACC5F001DC42A /* towns_dumper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 677E2A121CAACC5F001DC42A /* towns_dumper.cpp */; };
677E2A181CAACC5F001DC42A /* towns_dumper.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 677E2A131CAACC5F001DC42A /* towns_dumper.hpp */; };
67A0FEBE1CEB467F008F2A61 /* booking_dataset.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67A0FEBC1CEB467F008F2A61 /* booking_dataset.cpp */; };
67A0FEBF1CEB467F008F2A61 /* booking_dataset.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67A0FEBD1CEB467F008F2A61 /* booking_dataset.hpp */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
@ -113,6 +115,8 @@
677E2A111CAACC5F001DC42A /* tag_admixer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = tag_admixer.hpp; sourceTree = "<group>"; };
677E2A121CAACC5F001DC42A /* towns_dumper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = towns_dumper.cpp; sourceTree = "<group>"; };
677E2A131CAACC5F001DC42A /* towns_dumper.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = towns_dumper.hpp; sourceTree = "<group>"; };
67A0FEBC1CEB467F008F2A61 /* booking_dataset.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = booking_dataset.cpp; sourceTree = "<group>"; };
67A0FEBD1CEB467F008F2A61 /* booking_dataset.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = booking_dataset.hpp; sourceTree = "<group>"; };
67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = osm_xml_source.hpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
@ -199,6 +203,8 @@
670B84BB1A8CDB0000CE4492 /* osm_source.hpp */,
6764B8921ADD6A3300DD8B15 /* osm_o5m_source.hpp */,
67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */,
67A0FEBC1CEB467F008F2A61 /* booking_dataset.cpp */,
67A0FEBD1CEB467F008F2A61 /* booking_dataset.hpp */,
);
name = generator;
path = ../../generator;
@ -227,6 +233,7 @@
675340741A3F2A7400A0A8C3 /* generate_info.hpp in Headers */,
677E2A161CAACC5F001DC42A /* tag_admixer.hpp in Headers */,
675340861A3F2A7400A0A8C3 /* tesselator.hpp in Headers */,
67A0FEBF1CEB467F008F2A61 /* booking_dataset.hpp in Headers */,
6753405F1A3F2A7400A0A8C3 /* borders_loader.hpp in Headers */,
675340801A3F2A7400A0A8C3 /* polygonizer.hpp in Headers */,
675340941C5231BA002CF0D9 /* search_index_builder.hpp in Headers */,
@ -309,6 +316,7 @@
675340811A3F2A7400A0A8C3 /* routing_generator.cpp in Sources */,
675340931C5231BA002CF0D9 /* search_index_builder.cpp in Sources */,
6753406E1A3F2A7400A0A8C3 /* feature_merger.cpp in Sources */,
67A0FEBE1CEB467F008F2A61 /* booking_dataset.cpp in Sources */,
6753408D1A3F2A7400A0A8C3 /* osm_element.cpp in Sources */,
6726C1D51A4AFEF4005EEA39 /* osm2meta.cpp in Sources */,
6753405E1A3F2A7400A0A8C3 /* borders_loader.cpp in Sources */,