forked from organicmaps/organicmaps-tmp
Refactor booking (now sponsored) dataset. Allow new datasets.
This commit is contained in:
parent
d763f66c0d
commit
964eb58b81
13 changed files with 400 additions and 157 deletions
|
@ -1,7 +1,7 @@
|
|||
#include "generator/booking_dataset.hpp"
|
||||
|
||||
#include "generator/booking_scoring.hpp"
|
||||
#include "generator/feature_builder.hpp"
|
||||
#include "generator/sponsored_scoring.hpp"
|
||||
|
||||
#include "indexer/classificator.hpp"
|
||||
#include "indexer/ftypes_matcher.hpp"
|
||||
|
@ -72,6 +72,7 @@ bool BookingDataset::NecessaryMatchingConditionHolds(FeatureBuilder1 const & fb)
|
|||
return ftypes::IsHotelChecker::Instance()(fb.GetTypes());
|
||||
}
|
||||
|
||||
// TODO(mgsergio): Try to eliminate as much code duplication as possible. (See opentable_dataset.cpp)
|
||||
template <>
|
||||
void BookingDataset::BuildObject(Object const & hotel,
|
||||
function<void(FeatureBuilder1 &)> const & fn) const
|
||||
|
@ -82,7 +83,6 @@ void BookingDataset::BuildObject(Object const & hotel,
|
|||
fb.SetCenter(MercatorBounds::FromLatLon(hotel.m_lat, hotel.m_lon));
|
||||
|
||||
auto & metadata = params.GetMetadata();
|
||||
// TODO(mgsergio): Rename FMD_SPONSORED_ID to FMD_BOOKING_ID.
|
||||
metadata.Set(feature::Metadata::FMD_SPONSORED_ID, strings::to_string(hotel.m_id.Get()));
|
||||
metadata.Set(feature::Metadata::FMD_WEBSITE, hotel.m_descUrl);
|
||||
metadata.Set(feature::Metadata::FMD_RATING, strings::to_string(hotel.m_ratingUser));
|
||||
|
@ -186,7 +186,7 @@ BookingDataset::ObjectId BookingDataset::FindMatchingObjectIdImpl(FeatureBuilder
|
|||
|
||||
for (auto const j : bookingIndexes)
|
||||
{
|
||||
if (booking_scoring::Match(GetObjectById(j), fb).IsMatched())
|
||||
if (sponsored_scoring::Match(GetObjectById(j), fb).IsMatched())
|
||||
return j;
|
||||
}
|
||||
|
||||
|
|
|
@ -2,23 +2,11 @@
|
|||
|
||||
#include "generator/sponsored_dataset.hpp"
|
||||
|
||||
#include "indexer/index.hpp"
|
||||
|
||||
#include "search/reverse_geocoder.hpp"
|
||||
|
||||
#include "base/newtype.hpp"
|
||||
|
||||
#include "std/function.hpp"
|
||||
#include "std/map.hpp"
|
||||
#include "std/limits.hpp"
|
||||
#include "std/string.hpp"
|
||||
|
||||
#include "boost/geometry.hpp"
|
||||
#include "boost/geometry/geometries/box.hpp"
|
||||
#include "boost/geometry/geometries/point.hpp"
|
||||
#include "boost/geometry/index/rtree.hpp"
|
||||
|
||||
class FeatureBuilder1;
|
||||
|
||||
namespace generator
|
||||
{
|
||||
// TODO(mgsergio): Try to get rid of code duplication. (See OpenTableRestaurant)
|
||||
|
|
|
@ -1,144 +1,47 @@
|
|||
#include "generator/booking_scoring.hpp"
|
||||
#include "generator/sponsored_scoring.hpp"
|
||||
|
||||
#include "generator/booking_dataset.hpp"
|
||||
#include "generator/feature_builder.hpp"
|
||||
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "geometry/distance_on_sphere.hpp"
|
||||
|
||||
#include "base/collection_cast.hpp"
|
||||
#include "base/stl_iterator.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
namespace generator
|
||||
{
|
||||
namespace booking_scoring
|
||||
{
|
||||
namespace
|
||||
{
|
||||
// Calculated with tools/python/booking_hotels_quality.py.
|
||||
double constexpr kOptimalThreshold = 0.304875;
|
||||
|
||||
// Compile-time predicate: true when T, after std::decay (references and
// cv-qualifiers stripped, arrays/functions decayed to pointers), is exactly U.
template <typename T, typename U>
struct decay_equiv
  : std::integral_constant<bool, std::is_same<typename std::decay<T>::type, U>::value>
{
};
|
||||
|
||||
using WeightedBagOfWords = vector<pair<strings::UniString, double>>;
|
||||
|
||||
// Normalizes and tokenizes |str| with the default search delimiters and
// returns the tokens in sorted order. Duplicate tokens are kept.
vector<strings::UniString> StringToSetOfWords(string const & str)
{
  vector<strings::UniString> tokens;
  search::NormalizeAndTokenizeString(str, tokens, search::Delimiters{});
  sort(tokens.begin(), tokens.end());
  return tokens;
}
|
||||
|
||||
// Collapses every run of equal adjacent words in |words| into a single
// (word, weight) entry. The weight is the placeholder score multiplied by
// the length of the run, so for sorted input it is the word count.
WeightedBagOfWords MakeWeightedBagOfWords(vector<strings::UniString> const & words)
{
  // TODO(mgsergio): Calculate tf-idf score for every word.
  auto constexpr kTfIdfScorePlaceholder = 1;

  WeightedBagOfWords result;
  // Range-for avoids comparing a signed index against the unsigned size().
  for (auto const & word : words)
  {
    if (!result.empty() && result.back().first == word)
    {
      // Same word as the previous one: extend the current run.
      // TODO(mgsergio): tf-idf score for result.back().first;
      result.back().second += kTfIdfScorePlaceholder;
    }
    else
    {
      result.emplace_back(word, kTfIdfScorePlaceholder);
    }
  }
  return result;
}
|
||||
|
||||
// Dot product of two weighted bags: the sum of weight products over words
// present in both bags. Walks both bags in lockstep, so both are expected to
// be ordered by word.
double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
  double dot = 0.0;

  auto l = lhs.begin();
  auto r = rhs.begin();
  auto const lEnd = lhs.end();
  auto const rEnd = rhs.end();

  while (l != lEnd && r != rEnd)
  {
    if (l->first == r->first)
    {
      dot += l->second * r->second;
      ++l;
      ++r;
      continue;
    }

    // Advance the side that holds the smaller word.
    if (l->first < r->first)
      ++l;
    else
      ++r;
  }

  return dot;
}
|
||||
|
||||
// Cosine of the angle between two weighted bags of words.
// Returns 0.0 when the bags share no words (zero dot product).
double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
  auto const dot = WeightedBagsDotProduct(lhs, rhs);
  if (dot == 0.0)
    return 0.0;

  auto const lhsNorm = sqrt(WeightedBagsDotProduct(lhs, lhs));
  auto const rhsNorm = sqrt(WeightedBagsDotProduct(rhs, rhs));
  return dot / (lhsNorm * rhsNorm);
}
|
||||
|
||||
double GetLinearNormDistanceScore(double distance)
|
||||
{
|
||||
distance = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters);
|
||||
return 1.0 - distance / BookingDataset::kDistanceLimitInMeters;
|
||||
}
|
||||
|
||||
// Similarity of two names as the cosine between their weighted bags of words.
// Two names without any words score 1.0; exactly one such name scores 0.0.
double GetNameSimilarityScore(string const & booking_name, string const & osm_name)
{
  auto const bookingBag = MakeWeightedBagOfWords(StringToSetOfWords(booking_name));
  auto const osmBag = MakeWeightedBagOfWords(StringToSetOfWords(osm_name));

  if (bookingBag.empty() || osmBag.empty())
    return (bookingBag.empty() && osmBag.empty()) ? 1.0 : 0.0;

  return WeightedBagOfWordsCos(bookingBag, osmBag);
}
|
||||
} // namespace
|
||||
|
||||
double BookingMatchScore::GetMatchingScore() const
|
||||
namespace generator
|
||||
{
|
||||
namespace sponsored_scoring
|
||||
{
|
||||
template <>
|
||||
double MatchStats<BookingHotel>::GetMatchingScore() const
|
||||
{
|
||||
// TODO(mgsergio): Use tuner to get optimal function.
|
||||
return m_linearNormDistanceScore * m_nameSimilarityScore;
|
||||
}
|
||||
|
||||
bool BookingMatchScore::IsMatched() const
|
||||
template <>
|
||||
bool MatchStats<BookingHotel>::IsMatched() const
|
||||
{
|
||||
return GetMatchingScore() > kOptimalThreshold;
|
||||
}
|
||||
|
||||
BookingMatchScore Match(BookingDataset::Object const & h, FeatureBuilder1 const & fb)
|
||||
// TODO(mgsergio): Do I need to spesialize this method?
|
||||
template <>
|
||||
MatchStats<BookingHotel> Match(BookingHotel const & h, FeatureBuilder1 const & fb)
|
||||
{
|
||||
BookingMatchScore score;
|
||||
MatchStats<BookingHotel> score;
|
||||
|
||||
auto const fbCenter = MercatorBounds::ToLatLon(fb.GetKeyPoint());
|
||||
auto const distance = ms::DistanceOnEarth(fbCenter.lat, fbCenter.lon, h.m_lat, h.m_lon);
|
||||
score.m_linearNormDistanceScore = GetLinearNormDistanceScore(distance);
|
||||
score.m_linearNormDistanceScore =
|
||||
impl::GetLinearNormDistanceScore(distance, BookingDataset::kDistanceLimitInMeters);
|
||||
|
||||
// TODO(mgsergio): Check all translations and use the best one.
|
||||
score.m_nameSimilarityScore =
|
||||
GetNameSimilarityScore(h.m_name, fb.GetName(StringUtf8Multilang::kDefaultCode));
|
||||
impl::GetNameSimilarityScore(h.m_name, fb.GetName(StringUtf8Multilang::kDefaultCode));
|
||||
|
||||
return score;
|
||||
}
|
||||
} // namespace booking_scoring
|
||||
} // namespace sponsored_scoring
|
||||
} // namespace generator
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "generator/booking_dataset.hpp"
|
||||
|
||||
class FeatureBuilder1;
|
||||
|
||||
namespace generator
|
||||
{
|
||||
namespace booking_scoring
|
||||
{
|
||||
// Partial scores of matching one booking object against one OSM feature.
struct BookingMatchScore
{
  // Combined score derived from the partial scores below.
  double GetMatchingScore() const;
  // Whether the combined score is considered a match.
  bool IsMatched() const;

  // Both partial scores start at zero.
  double m_linearNormDistanceScore = 0.0;
  double m_nameSimilarityScore = 0.0;
};
|
||||
|
||||
BookingMatchScore Match(BookingDataset::Object const & h, FeatureBuilder1 const & fb);
|
||||
} // namespace booking_scoring
|
||||
} // namespace generator
|
|
@ -42,6 +42,9 @@ struct GenerateInfo
|
|||
|
||||
string m_bookingDatafileName;
|
||||
string m_bookingReferenceDir;
|
||||
string m_opentableDataFile;
|
||||
// TODO(mgsergio): Uncomment when I need this.
|
||||
// string m_opentableReferenceDir;
|
||||
|
||||
uint32_t m_versionDate = 0;
|
||||
|
||||
|
|
|
@ -29,6 +29,8 @@ SOURCES += \
|
|||
feature_generator.cpp \
|
||||
feature_merger.cpp \
|
||||
feature_sorter.cpp \
|
||||
opentable_dataset.cpp \
|
||||
opentable_scoring.cpp \
|
||||
osm2meta.cpp \
|
||||
osm2type.cpp \
|
||||
osm_element.cpp \
|
||||
|
@ -37,6 +39,7 @@ SOURCES += \
|
|||
region_meta.cpp \
|
||||
routing_generator.cpp \
|
||||
search_index_builder.cpp \
|
||||
sponsored_scoring.cpp \
|
||||
srtm_parser.cpp \
|
||||
statistics.cpp \
|
||||
tesselator.cpp \
|
||||
|
@ -47,7 +50,6 @@ HEADERS += \
|
|||
aggregating_sponsored_dataset.hpp \
|
||||
altitude_generator.hpp \
|
||||
booking_dataset.hpp \
|
||||
booking_scoring.hpp \
|
||||
borders_generator.hpp \
|
||||
borders_loader.hpp \
|
||||
centers_table_builder.hpp \
|
||||
|
@ -63,6 +65,7 @@ HEADERS += \
|
|||
generate_info.hpp \
|
||||
intermediate_data.hpp\
|
||||
intermediate_elements.hpp\
|
||||
opentable_dataset.hpp \
|
||||
osm2meta.hpp \
|
||||
osm2type.hpp \
|
||||
osm_element.hpp \
|
||||
|
@ -76,6 +79,7 @@ HEADERS += \
|
|||
search_index_builder.hpp \
|
||||
sponsored_dataset.hpp \
|
||||
sponsored_dataset_inl.hpp \
|
||||
sponsored_scoring.hpp \
|
||||
srtm_parser.hpp \
|
||||
statistics.hpp \
|
||||
tag_admixer.hpp \
|
||||
|
|
125
generator/opentable_dataset.cpp
Normal file
125
generator/opentable_dataset.cpp
Normal file
|
@ -0,0 +1,125 @@
|
|||
#include "generator/opentable_dataset.hpp"
|
||||
|
||||
//#include "generator/opentable_scoring.hpp" // or just sponsored scoring
|
||||
#include "generator/feature_builder.hpp"
|
||||
|
||||
#include "indexer/classificator.hpp"
|
||||
#include "indexer/ftypes_matcher.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
namespace generator
|
||||
{
|
||||
namespace
|
||||
{
|
||||
// Returns a copy of |str| in which every tab character is replaced by the
// two-character sequence backslash + 't'. Used to keep log lines readable.
string EscapeTabs(string const & str)
{
  string escaped;
  escaped.reserve(str.size());
  for (char const c : str)
  {
    if (c == '\t')
      escaped += "\\t";
    else
      escaped += c;
  }
  return escaped;
}
|
||||
} // namespace
|
||||
|
||||
// OpentableRestaurant ------------------------------------------------------------------------------
|
||||
|
||||
// Builds a restaurant from one tab-separated line of restaurants.tsv.
// The line must contain exactly FieldsCount() fields, and the id, latitude
// and longitude fields must parse as numbers; otherwise a CHECK fails.
// Only id, coordinates, name, address and description url are stored:
// m_street and m_houseNumber are not assigned here, and the Phone column is
// not read.
OpentableRestaurant::OpentableRestaurant(string const & src)
{
  vector<string> rec;
  strings::ParseCSVRow(src, '\t', rec);
  CHECK_EQUAL(rec.size(), FieldsCount(), ("Error parsing restaurants.tsv line:", EscapeTabs(src)));

  // A failed conversion used to be ignored, silently leaving the default
  // id or a zero coordinate in place.
  CHECK(strings::to_uint(rec[Index(Fields::Id)], m_id.Get()),
        ("Error parsing restaurant id in line:", EscapeTabs(src)));
  // TODO(mgsergio): Use ms::LatLon.
  CHECK(strings::to_double(rec[Index(Fields::Latitude)], m_lat),
        ("Error parsing latitude in line:", EscapeTabs(src)));
  CHECK(strings::to_double(rec[Index(Fields::Longtitude)], m_lon),
        ("Error parsing longitude in line:", EscapeTabs(src)));

  m_name = rec[Index(Fields::Name)];
  m_address = rec[Index(Fields::Address)];

  m_descUrl = rec[Index(Fields::DescUrl)];
}
|
||||
|
||||
// Writes a one-line, tab-separated debug description of |h| to |s|.
// Switches the stream to fixed notation with 7 digits after the point.
ostream & operator<<(ostream & s, OpentableRestaurant const & h)
{
  s << fixed << setprecision(7);
  s << "Id: " << h.m_id << "\t Name: " << h.m_name;
  s << "\t Address: " << h.m_address;
  s << "\t lat: " << h.m_lat << " lon: " << h.m_lon;
  return s;
}
|
||||
|
||||
// OpentableDataset ---------------------------------------------------------------------------------
|
||||
|
||||
template <>
|
||||
bool OpentableDataset::NecessaryMatchingConditionHolds(FeatureBuilder1 const & fb) const
|
||||
{
|
||||
if (fb.GetName(StringUtf8Multilang::kDefaultCode).empty())
|
||||
return false;
|
||||
|
||||
// TODO(mgsergio): Handle all types of restaurants:
|
||||
// bar cafe (fast_food ??) pub restaurant
|
||||
// return ftypes::IsRestaurantChecker::Instance()(fb.GetTypes());
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO(mgsergio): Try to eliminate as much code duplication as possible. (See booking_dataset.cpp)
|
||||
template <>
|
||||
void OpentableDataset::BuildObject(Object const & restaurant,
|
||||
function<void(FeatureBuilder1 &)> const & fn) const
|
||||
{
|
||||
FeatureBuilder1 fb;
|
||||
FeatureParams params;
|
||||
|
||||
fb.SetCenter(MercatorBounds::FromLatLon(restaurant.m_lat, restaurant.m_lon));
|
||||
|
||||
auto & metadata = params.GetMetadata();
|
||||
// TODO(mgsergio): Rename FMD_SPONSORED_ID to FMD_BOOKING_ID.
|
||||
metadata.Set(feature::Metadata::FMD_SPONSORED_ID, strings::to_string(restaurant.m_id.Get()));
|
||||
metadata.Set(feature::Metadata::FMD_WEBSITE, restaurant.m_descUrl);
|
||||
|
||||
// params.AddAddress(restaurant.address);
|
||||
// TODO(mgsergio): addr:full ???
|
||||
|
||||
if (!restaurant.m_street.empty())
|
||||
fb.AddStreet(restaurant.m_street);
|
||||
|
||||
if (!restaurant.m_houseNumber.empty())
|
||||
fb.AddHouseNumber(restaurant.m_houseNumber);
|
||||
|
||||
params.AddName(StringUtf8Multilang::GetLangByCode(StringUtf8Multilang::kDefaultCode),
|
||||
restaurant.m_name);
|
||||
|
||||
auto const & clf = classif();
|
||||
params.AddType(clf.GetTypeByPath({"sponsored", "booking"}));
|
||||
|
||||
fb.SetParams(params);
|
||||
|
||||
fn(fb);
|
||||
}
|
||||
|
||||
template <>
|
||||
OpentableDataset::ObjectId OpentableDataset::FindMatchingObjectIdImpl(FeatureBuilder1 const & fb) const
|
||||
{
|
||||
auto const name = fb.GetName(StringUtf8Multilang::kDefaultCode);
|
||||
|
||||
if (name.empty())
|
||||
return Object::InvalidObjectId();
|
||||
|
||||
// Find |kMaxSelectedElements| nearest values to a point.
|
||||
auto const bookingIndexes = GetNearestObjects(MercatorBounds::ToLatLon(fb.GetKeyPoint()),
|
||||
kMaxSelectedElements, kDistanceLimitInMeters);
|
||||
|
||||
CHECK(false, ("Not implemented yet"));
|
||||
// for (auto const j : bookingIndexes)
|
||||
// {
|
||||
// if (booking_scoring::Match(GetObjectById(j), fb).IsMatched())
|
||||
// return j;
|
||||
// }
|
||||
|
||||
return Object::InvalidObjectId();
|
||||
}
|
||||
} // namespace generator
|
58
generator/opentable_dataset.hpp
Normal file
58
generator/opentable_dataset.hpp
Normal file
|
@ -0,0 +1,58 @@
|
|||
#pragma once
|
||||
|
||||
#include "generator/sponsored_dataset.hpp"
|
||||
|
||||
#include "base/newtype.hpp"
|
||||
|
||||
#include "std/limits.hpp"
|
||||
#include "std/string.hpp"
|
||||
|
||||
namespace generator
|
||||
{
|
||||
// TODO(mgsergio): Try to get rid of code duplication. (See BookingHotel)
|
||||
struct OpentableRestaurant
|
||||
{
|
||||
NEWTYPE(uint32_t, ObjectId);
|
||||
|
||||
enum class Fields
|
||||
{
|
||||
Id = 0,
|
||||
Latitude,
|
||||
Longtitude,
|
||||
Name,
|
||||
Address,
|
||||
DescUrl,
|
||||
Phone,
|
||||
// Opentable doesn't have translations.
|
||||
// Translations,
|
||||
Counter
|
||||
};
|
||||
|
||||
static constexpr ObjectId InvalidObjectId()
|
||||
{
|
||||
return ObjectId(numeric_limits<typename ObjectId::RepType>::max());
|
||||
}
|
||||
|
||||
explicit OpentableRestaurant(string const & src);
|
||||
|
||||
static constexpr size_t Index(Fields field) { return static_cast<size_t>(field); }
|
||||
static constexpr size_t FieldsCount() { return static_cast<size_t>(Fields::Counter); }
|
||||
bool IsAddressPartsFilled() const { return !m_street.empty() || !m_houseNumber.empty(); }
|
||||
|
||||
ObjectId m_id{InvalidObjectId()};
|
||||
double m_lat = 0.0;
|
||||
double m_lon = 0.0;
|
||||
string m_name;
|
||||
string m_street;
|
||||
string m_houseNumber;
|
||||
|
||||
string m_address;
|
||||
string m_descUrl;
|
||||
// string m_translations;
|
||||
};
|
||||
|
||||
ostream & operator<<(ostream & s, OpentableRestaurant const & r);
|
||||
|
||||
NEWTYPE_SIMPLE_OUTPUT(OpentableRestaurant::ObjectId);
|
||||
using OpentableDataset = SponsoredDataset<OpentableRestaurant>;
|
||||
} // namespace generator
|
45
generator/opentable_scoring.cpp
Normal file
45
generator/opentable_scoring.cpp
Normal file
|
@ -0,0 +1,45 @@
|
|||
#include "generator/sponsored_scoring.hpp"
|
||||
|
||||
#include "generator/opentable_dataset.hpp"
|
||||
#include "generator/feature_builder.hpp"
|
||||
|
||||
namespace
|
||||
{
|
||||
// Calculated with tools/python/booking_hotels_quality.py.
|
||||
double constexpr kOptimalThreshold = 0.304875;
|
||||
} // namespace
|
||||
|
||||
namespace generator
|
||||
{
|
||||
namespace sponsored_scoring
|
||||
{
|
||||
template <>
|
||||
double MatchStats<OpentableRestaurant>::GetMatchingScore() const
|
||||
{
|
||||
// TODO(mgsergio): Use tuner to get optimal function.
|
||||
return m_linearNormDistanceScore * m_nameSimilarityScore;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool MatchStats<OpentableRestaurant>::IsMatched() const
|
||||
{
|
||||
return GetMatchingScore() > kOptimalThreshold;
|
||||
}
|
||||
|
||||
template <>
|
||||
MatchStats<OpentableRestaurant> Match(OpentableRestaurant const & h, FeatureBuilder1 const & fb)
|
||||
{
|
||||
MatchStats<OpentableRestaurant> score;
|
||||
|
||||
auto const fbCenter = MercatorBounds::ToLatLon(fb.GetKeyPoint());
|
||||
auto const distance = ms::DistanceOnEarth(fbCenter.lat, fbCenter.lon, h.m_lat, h.m_lon);
|
||||
score.m_linearNormDistanceScore =
|
||||
impl::GetLinearNormDistanceScore(distance, OpentableDataset::kDistanceLimitInMeters);
|
||||
|
||||
score.m_nameSimilarityScore =
|
||||
impl::GetNameSimilarityScore(h.m_name, fb.GetName(StringUtf8Multilang::kDefaultCode));
|
||||
|
||||
return score;
|
||||
}
|
||||
} // namespace sponsored_scoring
|
||||
} // namespace generator
|
|
@ -11,7 +11,6 @@
|
|||
#include "base/newtype.hpp"
|
||||
|
||||
#include "std/function.hpp"
|
||||
#include "std/limits.hpp"
|
||||
#include "std/map.hpp"
|
||||
#include "std/string.hpp"
|
||||
|
||||
|
|
|
@ -7,7 +7,6 @@
|
|||
|
||||
#include "std/fstream.hpp"
|
||||
#include "std/iostream.hpp"
|
||||
#include "std/limits.hpp"
|
||||
|
||||
namespace generator
|
||||
{
|
||||
|
@ -137,7 +136,7 @@ void SponsoredDataset<SponsoredObject>::LoadData(istream & src, string const & a
|
|||
|
||||
if (!addressReferencePath.empty())
|
||||
{
|
||||
LOG(LINFO, ("Reference addresses for booking objects", addressReferencePath));
|
||||
LOG(LINFO, ("Reference addresses for sponsored objects", addressReferencePath));
|
||||
Platform & platform = GetPlatform();
|
||||
string const backupPath = platform.WritableDir();
|
||||
// TODO(mgsergio): What is this for?
|
||||
|
|
107
generator/sponsored_scoring.cpp
Normal file
107
generator/sponsored_scoring.cpp
Normal file
|
@ -0,0 +1,107 @@
|
|||
#include "generator/sponsored_scoring.hpp"
|
||||
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "geometry/distance_on_sphere.hpp"
|
||||
|
||||
// #include "base/stl_iterator.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
namespace
|
||||
{
|
||||
using WeightedBagOfWords = vector<pair<strings::UniString, double>>;
|
||||
|
||||
// Normalizes and tokenizes |str| with the default search delimiters and
// returns the tokens in sorted order. Duplicate tokens are kept.
vector<strings::UniString> StringToSetOfWords(string const & str)
{
  vector<strings::UniString> tokens;
  search::NormalizeAndTokenizeString(str, tokens, search::Delimiters{});
  sort(tokens.begin(), tokens.end());
  return tokens;
}
|
||||
|
||||
// Collapses every run of equal adjacent words in |words| into a single
// (word, weight) entry. The weight is the placeholder score multiplied by
// the length of the run, so for sorted input it is the word count.
WeightedBagOfWords MakeWeightedBagOfWords(vector<strings::UniString> const & words)
{
  // TODO(mgsergio): Calculate tf-idf score for every word.
  auto constexpr kTfIdfScorePlaceholder = 1;

  WeightedBagOfWords result;
  // Range-for avoids comparing a signed index against the unsigned size().
  for (auto const & word : words)
  {
    if (!result.empty() && result.back().first == word)
    {
      // Same word as the previous one: extend the current run.
      // TODO(mgsergio): tf-idf score for result.back().first;
      result.back().second += kTfIdfScorePlaceholder;
    }
    else
    {
      result.emplace_back(word, kTfIdfScorePlaceholder);
    }
  }
  return result;
}
|
||||
|
||||
// Dot product of two weighted bags: the sum of weight products over words
// present in both bags. Walks both bags in lockstep, so both are expected to
// be ordered by word.
double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
  double dot = 0.0;

  auto l = lhs.begin();
  auto r = rhs.begin();
  auto const lEnd = lhs.end();
  auto const rEnd = rhs.end();

  while (l != lEnd && r != rEnd)
  {
    if (l->first == r->first)
    {
      dot += l->second * r->second;
      ++l;
      ++r;
      continue;
    }

    // Advance the side that holds the smaller word.
    if (l->first < r->first)
      ++l;
    else
      ++r;
  }

  return dot;
}
|
||||
|
||||
// Cosine of the angle between two weighted bags of words.
// Returns 0.0 when the bags share no words (zero dot product).
double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
  auto const dot = WeightedBagsDotProduct(lhs, rhs);
  if (dot == 0.0)
    return 0.0;

  auto const lhsNorm = sqrt(WeightedBagsDotProduct(lhs, lhs));
  auto const rhsNorm = sqrt(WeightedBagsDotProduct(rhs, rhs));
  return dot / (lhsNorm * rhsNorm);
}
|
||||
} // namespace
|
||||
|
||||
namespace generator
|
||||
{
|
||||
namespace impl
|
||||
{
|
||||
double GetLinearNormDistanceScore(double distance, double const maxDistance)
|
||||
{
|
||||
distance = my::clamp(distance, 0, maxDistance);
|
||||
return 1.0 - distance / maxDistance;
|
||||
}
|
||||
|
||||
// Similarity of two names as the cosine between their weighted bags of words.
// Two names without any words score 1.0; exactly one such name scores 0.0.
double GetNameSimilarityScore(string const & booking_name, string const & osm_name)
{
  auto const sponsoredBag = MakeWeightedBagOfWords(StringToSetOfWords(booking_name));
  auto const osmBag = MakeWeightedBagOfWords(StringToSetOfWords(osm_name));

  if (sponsoredBag.empty() || osmBag.empty())
    return (sponsoredBag.empty() && osmBag.empty()) ? 1.0 : 0.0;

  return WeightedBagOfWordsCos(sponsoredBag, osmBag);
}
|
||||
} // namespace impl
|
||||
} // namespace generator
|
34
generator/sponsored_scoring.hpp
Normal file
34
generator/sponsored_scoring.hpp
Normal file
|
@ -0,0 +1,34 @@
|
|||
#pragma once
|
||||
|
||||
#include "generator/booking_dataset.hpp"
|
||||
|
||||
class FeatureBuilder1;
|
||||
|
||||
namespace generator
|
||||
{
|
||||
namespace impl
|
||||
{
|
||||
double GetLinearNormDistanceScore(double distance, double maxDistance);
|
||||
double GetNameSimilarityScore(string const & booking_name, string const & osm_name);
|
||||
} // namespace impl
|
||||
|
||||
namespace sponsored_scoring
|
||||
{
|
||||
/// Match-scoring statistics of a sponsored object against an OSM object.
/// The member functions are specialized per sponsored object type.
template <typename SponsoredObject>
struct MatchStats
{
  /// Returns a combined score based on the fields below and classifier tuning.
  double GetMatchingScore() const;
  /// Returns true if GetMatchingScore is greater than some threshold.
  bool IsMatched() const;

  /// Both partial scores start at zero.
  double m_linearNormDistanceScore = 0.0;
  double m_nameSimilarityScore = 0.0;
};
|
||||
|
||||
/// Matches a given sponsored object against a given OSM object.
|
||||
template <typename SponsoredObject>
|
||||
MatchStats<SponsoredObject> Match(SponsoredObject const & o, FeatureBuilder1 const & fb);
|
||||
} // namespace sponsored_scoring
|
||||
} // namespace generator
|
Loading…
Add table
Reference in a new issue