Refactor booking (now sponsored) dataset. Allow new datasets.

This commit is contained in:
Sergey Magidovich 2016-10-04 10:31:40 +03:00
parent d763f66c0d
commit 964eb58b81
13 changed files with 400 additions and 157 deletions

View file

@ -1,7 +1,7 @@
#include "generator/booking_dataset.hpp"
#include "generator/booking_scoring.hpp"
#include "generator/feature_builder.hpp"
#include "generator/sponsored_scoring.hpp"
#include "indexer/classificator.hpp"
#include "indexer/ftypes_matcher.hpp"
@ -72,6 +72,7 @@ bool BookingDataset::NecessaryMatchingConditionHolds(FeatureBuilder1 const & fb)
return ftypes::IsHotelChecker::Instance()(fb.GetTypes());
}
// TODO(mgsergio): Try to eliminate as much code duplication as possible. (See opentable_dataset.cpp)
template <>
void BookingDataset::BuildObject(Object const & hotel,
function<void(FeatureBuilder1 &)> const & fn) const
@ -82,7 +83,6 @@ void BookingDataset::BuildObject(Object const & hotel,
fb.SetCenter(MercatorBounds::FromLatLon(hotel.m_lat, hotel.m_lon));
auto & metadata = params.GetMetadata();
// TODO(mgsergio): Rename FMD_SPONSORED_ID to FMD_BOOKING_ID.
metadata.Set(feature::Metadata::FMD_SPONSORED_ID, strings::to_string(hotel.m_id.Get()));
metadata.Set(feature::Metadata::FMD_WEBSITE, hotel.m_descUrl);
metadata.Set(feature::Metadata::FMD_RATING, strings::to_string(hotel.m_ratingUser));
@ -186,7 +186,7 @@ BookingDataset::ObjectId BookingDataset::FindMatchingObjectIdImpl(FeatureBuilder
for (auto const j : bookingIndexes)
{
if (booking_scoring::Match(GetObjectById(j), fb).IsMatched())
if (sponsored_scoring::Match(GetObjectById(j), fb).IsMatched())
return j;
}

View file

@ -2,23 +2,11 @@
#include "generator/sponsored_dataset.hpp"
#include "indexer/index.hpp"
#include "search/reverse_geocoder.hpp"
#include "base/newtype.hpp"
#include "std/function.hpp"
#include "std/map.hpp"
#include "std/limits.hpp"
#include "std/string.hpp"
#include "boost/geometry.hpp"
#include "boost/geometry/geometries/box.hpp"
#include "boost/geometry/geometries/point.hpp"
#include "boost/geometry/index/rtree.hpp"
class FeatureBuilder1;
namespace generator
{
// TODO(mgsergio): Try to get rid of code duplication. (See OpentableRestaurant)

View file

@ -1,144 +1,47 @@
#include "generator/booking_scoring.hpp"
#include "generator/sponsored_scoring.hpp"
#include "generator/booking_dataset.hpp"
#include "generator/feature_builder.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "geometry/distance_on_sphere.hpp"
#include "base/collection_cast.hpp"
#include "base/stl_iterator.hpp"
#include "std/algorithm.hpp"
#include "std/vector.hpp"
namespace generator
{
namespace booking_scoring
{
namespace
{
// Calculated with tools/python/booking_hotels_quality.py.
double constexpr kOptimalThreshold = 0.304875;
template <typename T, typename U>
struct decay_equiv :
std::is_same<typename std::decay<T>::type, U>::type
{};
using WeightedBagOfWords = vector<pair<strings::UniString, double>>;
vector<strings::UniString> StringToSetOfWords(string const & str)
{
vector<strings::UniString> result;
search::NormalizeAndTokenizeString(str, result, search::Delimiters{});
sort(begin(result), end(result));
return result;
}
WeightedBagOfWords MakeWeightedBagOfWords(vector<strings::UniString> const & words)
{
// TODO(mgsergio): Calculate tf-idsf score for every word.
auto constexpr kTfIdfScorePlaceholder = 1;
WeightedBagOfWords result;
for (auto i = 0; i < words.size(); ++i)
{
result.emplace_back(words[i], kTfIdfScorePlaceholder);
while (i + 1 < words.size() && words[i] == words[i + 1])
{
result.back().second += kTfIdfScorePlaceholder; // TODO(mgsergio): tf-idf score for result[i].frist;
++i;
}
}
return result;
}
double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
double result{};
auto lhsIt = begin(lhs);
auto rhsIt = begin(rhs);
while (lhsIt != end(lhs) && rhsIt != end(rhs))
{
if (lhsIt->first == rhsIt->first)
{
result += lhsIt->second * rhsIt->second;
++lhsIt;
++rhsIt;
}
else if (lhsIt->first < rhsIt->first)
{
++lhsIt;
}
else
{
++rhsIt;
}
}
return result;
}
double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
auto const product = WeightedBagsDotProduct(lhs, rhs);
auto const lhsLength = sqrt(WeightedBagsDotProduct(lhs, lhs));
auto const rhsLength = sqrt(WeightedBagsDotProduct(rhs, rhs));
if (product == 0.0)
return 0.0;
return product / (lhsLength * rhsLength);
}
double GetLinearNormDistanceScore(double distance)
{
distance = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters);
return 1.0 - distance / BookingDataset::kDistanceLimitInMeters;
}
double GetNameSimilarityScore(string const & booking_name, string const & osm_name)
{
auto const aws = MakeWeightedBagOfWords(StringToSetOfWords(booking_name));
auto const bws = MakeWeightedBagOfWords(StringToSetOfWords(osm_name));
if (aws.empty() && bws.empty())
return 1.0;
if (aws.empty() || bws.empty())
return 0.0;
return WeightedBagOfWordsCos(aws, bws);
}
} // namespace
double BookingMatchScore::GetMatchingScore() const
namespace generator
{
namespace sponsored_scoring
{
template <>
double MatchStats<BookingHotel>::GetMatchingScore() const
{
// TODO(mgsergio): Use tuner to get optimal function.
return m_linearNormDistanceScore * m_nameSimilarityScore;
}
bool BookingMatchScore::IsMatched() const
template <>
bool MatchStats<BookingHotel>::IsMatched() const
{
return GetMatchingScore() > kOptimalThreshold;
}
BookingMatchScore Match(BookingDataset::Object const & h, FeatureBuilder1 const & fb)
// TODO(mgsergio): Do I need to specialize this method?
template <>
MatchStats<BookingHotel> Match(BookingHotel const & h, FeatureBuilder1 const & fb)
{
BookingMatchScore score;
MatchStats<BookingHotel> score;
auto const fbCenter = MercatorBounds::ToLatLon(fb.GetKeyPoint());
auto const distance = ms::DistanceOnEarth(fbCenter.lat, fbCenter.lon, h.m_lat, h.m_lon);
score.m_linearNormDistanceScore = GetLinearNormDistanceScore(distance);
score.m_linearNormDistanceScore =
impl::GetLinearNormDistanceScore(distance, BookingDataset::kDistanceLimitInMeters);
// TODO(mgsergio): Check all translations and use the best one.
score.m_nameSimilarityScore =
GetNameSimilarityScore(h.m_name, fb.GetName(StringUtf8Multilang::kDefaultCode));
impl::GetNameSimilarityScore(h.m_name, fb.GetName(StringUtf8Multilang::kDefaultCode));
return score;
}
} // namespace booking_scoring
} // namespace sponsored_scoring
} // namespace generator

View file

@ -1,22 +0,0 @@
#pragma once
#include "generator/booking_dataset.hpp"
class FeatureBuilder1;
namespace generator
{
namespace booking_scoring
{
struct BookingMatchScore
{
double GetMatchingScore() const;
bool IsMatched() const;
double m_linearNormDistanceScore{};
double m_nameSimilarityScore{};
};
BookingMatchScore Match(BookingDataset::Object const & h, FeatureBuilder1 const & fb);
} // namespace booking_scoring
} // namespace generator

View file

@ -42,6 +42,9 @@ struct GenerateInfo
string m_bookingDatafileName;
string m_bookingReferenceDir;
string m_opentableDataFile;
// TODO(mgsergio): Uncomment when I need this.
// string m_opentableReferenceDir;
uint32_t m_versionDate = 0;

View file

@ -29,6 +29,8 @@ SOURCES += \
feature_generator.cpp \
feature_merger.cpp \
feature_sorter.cpp \
opentable_dataset.cpp \
opentable_scoring.cpp \
osm2meta.cpp \
osm2type.cpp \
osm_element.cpp \
@ -37,6 +39,7 @@ SOURCES += \
region_meta.cpp \
routing_generator.cpp \
search_index_builder.cpp \
sponsored_scoring.cpp \
srtm_parser.cpp \
statistics.cpp \
tesselator.cpp \
@ -47,7 +50,6 @@ HEADERS += \
aggregating_sponsored_dataset.hpp \
altitude_generator.hpp \
booking_dataset.hpp \
booking_scoring.hpp \
borders_generator.hpp \
borders_loader.hpp \
centers_table_builder.hpp \
@ -63,6 +65,7 @@ HEADERS += \
generate_info.hpp \
intermediate_data.hpp\
intermediate_elements.hpp\
opentable_dataset.hpp \
osm2meta.hpp \
osm2type.hpp \
osm_element.hpp \
@ -76,6 +79,7 @@ HEADERS += \
search_index_builder.hpp \
sponsored_dataset.hpp \
sponsored_dataset_inl.hpp \
sponsored_scoring.hpp \
srtm_parser.hpp \
statistics.hpp \
tag_admixer.hpp \

View file

@ -0,0 +1,125 @@
#include "generator/opentable_dataset.hpp"
//#include "generator/opentable_scoring.hpp" // or just sponsored scoring
#include "generator/feature_builder.hpp"
#include "indexer/classificator.hpp"
#include "indexer/ftypes_matcher.hpp"
#include "base/string_utils.hpp"
namespace generator
{
namespace
{
// Returns a copy of |str| in which every tab character is replaced with the
// two-character sequence "\t" (a backslash followed by 't'). All other
// characters are copied unchanged.
string EscapeTabs(string const & str)
{
  string escaped;
  escaped.reserve(str.size());
  for (char const symbol : str)
  {
    if (symbol != '\t')
      escaped.push_back(symbol);
    else
      escaped.append("\\t");
  }
  return escaped;
}
} // namespace
// OpentableRestaurant ------------------------------------------------------------------------------
// Parses one tab-separated record |src| into a restaurant.
// The record must contain exactly FieldsCount() columns, otherwise the CHECK
// below fires and reports the offending line with its tabs escaped.
OpentableRestaurant::OpentableRestaurant(string const & src)
{
  vector<string> columns;
  strings::ParseCSVRow(src, '\t', columns);
  CHECK_EQUAL(columns.size(), FieldsCount(), ("Error parsing restaurants.tsv line:", EscapeTabs(src)));

  // Small accessor so the column layout is spelled only through |Fields|.
  auto const column = [&columns](Fields const field) -> string const & {
    return columns[Index(field)];
  };

  // NOTE(review): the results of the numeric conversions are not inspected here,
  // so a malformed number leaves the member at whatever the conversion wrote - confirm
  // this best-effort behaviour is intended.
  strings::to_uint(column(Fields::Id), m_id.Get());
  // TODO(mgsergio): Use ms::LatLon.
  strings::to_double(column(Fields::Latitude), m_lat);
  strings::to_double(column(Fields::Longtitude), m_lon);

  m_name = column(Fields::Name);
  m_address = column(Fields::Address);
  m_descUrl = column(Fields::DescUrl);
}
// Writes a one-line, human-readable description of |h| to |s| and returns |s|.
// Coordinates are printed in fixed notation with 7 digits after the decimal point.
ostream & operator<<(ostream & s, OpentableRestaurant const & h)
{
  // |fixed| and |setprecision| are sticky: without restoring them every later
  // floating point value written to the caller's stream would silently change
  // its formatting. Remember the caller's settings and put them back afterwards.
  auto const savedFlags = s.flags();
  auto const savedPrecision = s.precision();

  s << fixed << setprecision(7);
  s << "Id: " << h.m_id << "\t Name: " << h.m_name << "\t Address: " << h.m_address
    << "\t lat: " << h.m_lat << " lon: " << h.m_lon;

  s.flags(savedFlags);
  s.precision(savedPrecision);
  return s;
}
// OpentableDataset ---------------------------------------------------------------------------------
// Cheap pre-filter applied to a feature before matching: at the moment the
// only requirement is that the feature has a non-empty name in the default language.
template <>
bool OpentableDataset::NecessaryMatchingConditionHolds(FeatureBuilder1 const & fb) const
{
  // TODO(mgsergio): Handle all types of restaurants:
  // bar cafe (fast_food ??) pub restaurant
  // return ftypes::IsRestaurantChecker::Instance()(fb.GetTypes());
  bool const hasDefaultName = !fb.GetName(StringUtf8Multilang::kDefaultCode).empty();
  return hasDefaultName;
}
// TODO(mgsergio): Try to eliminate as much code duplication as possible. (See booking_dataset.cpp)
// Builds a point feature for |restaurant| and hands it to |fn|.
// The feature is centred at the restaurant coordinates, carries its id and
// description url as metadata, its name in the default language and, when
// present, street and house number.
template <>
void OpentableDataset::BuildObject(Object const & restaurant,
function<void(FeatureBuilder1 &)> const & fn) const
{
FeatureBuilder1 fb;
FeatureParams params;
fb.SetCenter(MercatorBounds::FromLatLon(restaurant.m_lat, restaurant.m_lon));
auto & metadata = params.GetMetadata();
// TODO(mgsergio): Rename FMD_SPONSORED_ID to FMD_BOOKING_ID.
metadata.Set(feature::Metadata::FMD_SPONSORED_ID, strings::to_string(restaurant.m_id.Get()));
metadata.Set(feature::Metadata::FMD_WEBSITE, restaurant.m_descUrl);
// params.AddAddress(restaurant.address);
// TODO(mgsergio): addr:full ???
// NOTE(review): street and house number are added to |fb| before fb.SetParams(params)
// is called below - confirm SetParams does not discard them.
if (!restaurant.m_street.empty())
fb.AddStreet(restaurant.m_street);
if (!restaurant.m_houseNumber.empty())
fb.AddHouseNumber(restaurant.m_houseNumber);
params.AddName(StringUtf8Multilang::GetLangByCode(StringUtf8Multilang::kDefaultCode),
restaurant.m_name);
auto const & clf = classif();
// NOTE(review): the type path is {"sponsored", "booking"}, the same literal the
// booking dataset would be expected to use - presumably a dedicated Opentable type
// is wanted here; confirm against the classificator.
params.AddType(clf.GetTypeByPath({"sponsored", "booking"}));
fb.SetParams(params);
// The caller decides what to do with the finished feature.
fn(fb);
}
// Intended to return the id of the dataset object that matches |fb|.
// Features without a default-language name are rejected immediately with the
// invalid id. For named features the implementation is still a stub: the
// nearest candidates are collected and then CHECK(false) fires.
template <>
OpentableDataset::ObjectId OpentableDataset::FindMatchingObjectIdImpl(FeatureBuilder1 const & fb) const
{
  if (fb.GetName(StringUtf8Multilang::kDefaultCode).empty())
    return Object::InvalidObjectId();

  // Find |kMaxSelectedElements| nearest values to a point.
  auto const featureCenter = MercatorBounds::ToLatLon(fb.GetKeyPoint());
  auto const nearestIndexes =
      GetNearestObjects(featureCenter, kMaxSelectedElements, kDistanceLimitInMeters);

  CHECK(false, ("Not implemented yet"));

  // Sketch of the matching loop that is still to be enabled:
  // for (auto const j : nearestIndexes)
  // {
  //   if (booking_scoring::Match(GetObjectById(j), fb).IsMatched())
  //     return j;
  // }

  return Object::InvalidObjectId();
}
} // namespace generator

View file

@ -0,0 +1,58 @@
#pragma once
#include "generator/sponsored_dataset.hpp"
#include "base/newtype.hpp"
#include "std/limits.hpp"
#include "std/string.hpp"
namespace generator
{
// TODO(mgsergio): Try to get rid of code duplication. (See BookingHotel)
// One restaurant record of the Opentable dataset.
// An instance is built from a single tab-separated line; see the constructor.
struct OpentableRestaurant
{
// Identifier type of a restaurant.
// NOTE(review): NEWTYPE presumably declares a strong typedef over uint32_t - confirm in base/newtype.hpp.
NEWTYPE(uint32_t, ObjectId);
// Column indices of a source record. The order of the enumerators is the
// order of the columns, so it must not be changed independently of the data file.
enum class Fields
{
Id = 0,
Latitude,
Longtitude,
Name,
Address,
DescUrl,
Phone,
// Opentable doesn't have translations.
// Translations,
// Not a column: number of columns in a record (see FieldsCount()).
Counter
};
// Id value used to denote "no object": the maximum value of the underlying type.
static constexpr ObjectId InvalidObjectId()
{
return ObjectId(numeric_limits<typename ObjectId::RepType>::max());
}
// Parses a tab-separated record |src|.
explicit OpentableRestaurant(string const & src);
// Converts a column tag to its zero-based index in a record.
static constexpr size_t Index(Fields field) { return static_cast<size_t>(field); }
// Number of columns a record is expected to have.
static constexpr size_t FieldsCount() { return static_cast<size_t>(Fields::Counter); }
// True when at least one of street / house number is non-empty.
bool IsAddressPartsFilled() const { return !m_street.empty() || !m_houseNumber.empty(); }
ObjectId m_id{InvalidObjectId()};
double m_lat = 0.0;
double m_lon = 0.0;
string m_name;
// NOTE(review): m_street and m_houseNumber are not assigned by the constructor
// in opentable_dataset.cpp - presumably they are filled elsewhere; confirm.
string m_street;
string m_houseNumber;
string m_address;
string m_descUrl;
// string m_translations;
};
ostream & operator<<(ostream & s, OpentableRestaurant const & r);
NEWTYPE_SIMPLE_OUTPUT(OpentableRestaurant::ObjectId);
using OpentableDataset = SponsoredDataset<OpentableRestaurant>;
} // namespace generator

View file

@ -0,0 +1,45 @@
#include "generator/sponsored_scoring.hpp"
#include "generator/opentable_dataset.hpp"
#include "generator/feature_builder.hpp"
namespace
{
// Minimal matching score (exclusive) for a restaurant and a feature to be
// considered the same object; compared against GetMatchingScore() in IsMatched().
// Calculated with tools/python/booking_hotels_quality.py.
// NOTE(review): the script name refers to booking hotels - confirm the value
// was tuned for Opentable data as well.
double constexpr kOptimalThreshold = 0.304875;
} // namespace
namespace generator
{
namespace sponsored_scoring
{
// Folds the partial scores into a single number: the product of the distance
// score and the name similarity score.
template <>
double MatchStats<OpentableRestaurant>::GetMatchingScore() const
{
  // TODO(mgsergio): Use tuner to get optimal function.
  double const combinedScore = m_linearNormDistanceScore * m_nameSimilarityScore;
  return combinedScore;
}
// A pair is a match when its combined score is strictly above kOptimalThreshold.
template <>
bool MatchStats<OpentableRestaurant>::IsMatched() const
{
  double const score = GetMatchingScore();
  return score > kOptimalThreshold;
}
// Scores restaurant |h| against OSM feature |fb|.
// The distance score is derived from the distance between the feature key
// point and the restaurant coordinates; the name score compares the restaurant
// name with the feature name in the default language.
template <>
MatchStats<OpentableRestaurant> Match(OpentableRestaurant const & h, FeatureBuilder1 const & fb)
{
  auto const featureLatLon = MercatorBounds::ToLatLon(fb.GetKeyPoint());
  auto const distanceToFeature =
      ms::DistanceOnEarth(featureLatLon.lat, featureLatLon.lon, h.m_lat, h.m_lon);

  MatchStats<OpentableRestaurant> stats;
  stats.m_linearNormDistanceScore = impl::GetLinearNormDistanceScore(
      distanceToFeature, OpentableDataset::kDistanceLimitInMeters);
  stats.m_nameSimilarityScore =
      impl::GetNameSimilarityScore(h.m_name, fb.GetName(StringUtf8Multilang::kDefaultCode));
  return stats;
}
} // namespace sponsored_scoring
} // namespace generator

View file

@ -11,7 +11,6 @@
#include "base/newtype.hpp"
#include "std/function.hpp"
#include "std/limits.hpp"
#include "std/map.hpp"
#include "std/string.hpp"

View file

@ -7,7 +7,6 @@
#include "std/fstream.hpp"
#include "std/iostream.hpp"
#include "std/limits.hpp"
namespace generator
{
@ -137,7 +136,7 @@ void SponsoredDataset<SponsoredObject>::LoadData(istream & src, string const & a
if (!addressReferencePath.empty())
{
LOG(LINFO, ("Reference addresses for booking objects", addressReferencePath));
LOG(LINFO, ("Reference addresses for sponsored objects", addressReferencePath));
Platform & platform = GetPlatform();
string const backupPath = platform.WritableDir();
// TODO(mgsergio): What is this for?

View file

@ -0,0 +1,107 @@
#include "generator/sponsored_scoring.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "geometry/distance_on_sphere.hpp"
// #include "base/stl_iterator.hpp"
#include "std/algorithm.hpp"
#include "std/vector.hpp"
namespace
{
using WeightedBagOfWords = vector<pair<strings::UniString, double>>;
// Normalizes and tokenizes |str| and returns the tokens in sorted order.
// Despite the name, repeated tokens are kept: the caller relies on equal
// tokens being adjacent to count them.
vector<strings::UniString> StringToSetOfWords(string const & str)
{
  vector<strings::UniString> tokens;
  search::NormalizeAndTokenizeString(str, tokens, search::Delimiters{});
  sort(tokens.begin(), tokens.end());
  return tokens;
}
// Collapses a sorted sequence of |words| into (word, weight) pairs.
// Every distinct word appears once in the result; its weight is the placeholder
// score multiplied by the number of consecutive occurrences in |words|.
// |words| must be sorted so that equal words are adjacent; the result keeps that order.
WeightedBagOfWords MakeWeightedBagOfWords(vector<strings::UniString> const & words)
{
  // TODO(mgsergio): Calculate tf-idf score for every word.
  // The weight is stored as double, so keep the placeholder a double as well.
  double constexpr kTfIdfScorePlaceholder = 1.0;

  WeightedBagOfWords result;
  // size_t index: avoids comparing a signed int against words.size().
  for (size_t i = 0; i < words.size(); ++i)
  {
    result.emplace_back(words[i], kTfIdfScorePlaceholder);
    // Fold the run of words equal to words[i] into the entry just added.
    while (i + 1 < words.size() && words[i] == words[i + 1])
    {
      result.back().second += kTfIdfScorePlaceholder;  // TODO(mgsergio): tf-idf score for result[i].first;
      ++i;
    }
  }
  return result;
}
// Dot product of two weighted bags: the sum, over words present in both bags,
// of the product of their weights. Both bags are walked in parallel, advancing
// the side with the smaller word, so both are expected to be ordered by word.
double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
  double sum = 0.0;

  auto left = lhs.begin();
  auto right = rhs.begin();
  auto const leftEnd = lhs.end();
  auto const rightEnd = rhs.end();

  while (left != leftEnd && right != rightEnd)
  {
    if (left->first == right->first)
    {
      // Common word: contributes to the product, move both cursors.
      sum += left->second * right->second;
      ++left;
      ++right;
      continue;
    }

    if (left->first < right->first)
      ++left;
    else
      ++right;
  }

  return sum;
}
// Cosine of the angle between two weighted bags treated as vectors:
// dot(lhs, rhs) / (|lhs| * |rhs|). Returns 0.0 when the bags have no common
// words, in which case the vector lengths are not needed.
double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
  auto const dot = WeightedBagsDotProduct(lhs, rhs);
  if (dot == 0.0)
    return 0.0;

  auto const lhsNorm = sqrt(WeightedBagsDotProduct(lhs, lhs));
  auto const rhsNorm = sqrt(WeightedBagsDotProduct(rhs, rhs));
  return dot / (lhsNorm * rhsNorm);
}
} // namespace
namespace generator
{
namespace impl
{
// Maps |distance| linearly onto a score: 1.0 at distance 0, falling to 0.0 at
// |maxDistance|. The distance is clamped to [0, maxDistance] first.
// NOTE(review): |maxDistance| is used as a divisor - presumably callers always
// pass a positive limit; confirm.
double GetLinearNormDistanceScore(double distance, double const maxDistance)
{
  double const clamped = my::clamp(distance, 0, maxDistance);
  return 1.0 - clamped / maxDistance;
}
// Similarity of two names as the cosine between their weighted bags of words.
// Two names without any tokens are considered identical (1.0); if only one of
// them has no tokens the names are considered unrelated (0.0).
double GetNameSimilarityScore(string const & booking_name, string const & osm_name)
{
  auto const sponsoredBag = MakeWeightedBagOfWords(StringToSetOfWords(booking_name));
  auto const osmBag = MakeWeightedBagOfWords(StringToSetOfWords(osm_name));

  bool const sponsoredEmpty = sponsoredBag.empty();
  bool const osmEmpty = osmBag.empty();
  if (sponsoredEmpty || osmEmpty)
    return (sponsoredEmpty && osmEmpty) ? 1.0 : 0.0;

  return WeightedBagOfWordsCos(sponsoredBag, osmBag);
}
} // namespace impl
} // namespace generator

View file

@ -0,0 +1,34 @@
#pragma once
#include "generator/booking_dataset.hpp"
class FeatureBuilder1;
namespace generator
{
// Building blocks shared by the per-dataset scoring specializations.
namespace impl
{
// Returns 1.0 - d / |maxDistance|, where d is |distance| clamped to [0, maxDistance].
double GetLinearNormDistanceScore(double distance, double maxDistance);
// Returns the cosine similarity of the two names' bags of words;
// 1.0 when both names yield no words, 0.0 when only one of them does.
double GetNameSimilarityScore(string const & booking_name, string const & osm_name);
} // namespace impl
namespace sponsored_scoring
{
/// Represents match scoring statistics of a sponsored object against an OSM object.
/// The member functions are only declared here; each dataset provides explicit
/// specializations for its own object type.
template <typename SponsoredObject>
struct MatchStats
{
/// Returns some score based on the given fields and classificator tuning.
double GetMatchingScore() const;
/// Returns true if GetMatchingScore is greater than some threshold.
bool IsMatched() const;
/// Distance component of the score, filled in by Match().
double m_linearNormDistanceScore{};
/// Name similarity component of the score, filled in by Match().
double m_nameSimilarityScore{};
};
/// Matches a given sponsored object against a given OSM object.
template <typename SponsoredObject>
MatchStats<SponsoredObject> Match(SponsoredObject const & o, FeatureBuilder1 const & fb);
} // namespace sponsored_scoring
} // namespace generator