From ad0bca72b5f0adad982fffee1b5bf7e75d87f439 Mon Sep 17 00:00:00 2001
From: tatiana-yan
Date: Tue, 23 Apr 2019 16:23:46 +0300
Subject: [PATCH] [search] Heuristics for rating.

---
 search/intermediate_result.hpp         |  1 +
 search/pre_ranker.cpp                  |  7 +++++++
 search/pre_ranking_info.cpp            |  9 +++++----
 search/pre_ranking_info.hpp            | 11 +++++++++++
 search/ranker.cpp                      |  1 +
 search/ranking_info.cpp                | 25 +++++++++++++++++++++++++
 search/ranking_info.hpp                |  4 ++++
 search/search_quality/scoring_model.py |  2 +-
 8 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/search/intermediate_result.hpp b/search/intermediate_result.hpp
index b8ff62c324..6a865354f1 100644
--- a/search/intermediate_result.hpp
+++ b/search/intermediate_result.hpp
@@ -51,6 +51,7 @@ public:
   double GetDistance() const { return m_info.m_distanceToPivot; }
   uint8_t GetRank() const { return m_info.m_rank; }
   uint8_t GetPopularity() const { return m_info.m_popularity; }
+  std::pair<uint8_t, float> GetRating() const { return m_info.m_rating; }
   PreRankingInfo & GetInfo() { return m_info; }
   PreRankingInfo const & GetInfo() const { return m_info; }
   std::vector<ResultTracer::Branch> const & GetProvenance() const { return m_provenance; }
diff --git a/search/pre_ranker.cpp b/search/pre_ranker.cpp
index 711526d17c..5d6c16b309 100644
--- a/search/pre_ranker.cpp
+++ b/search/pre_ranker.cpp
@@ -5,6 +5,8 @@
 #include "search/pre_ranking_info.hpp"
 #include "search/tracer.hpp"
 
+#include "ugc/types.hpp"
+
 #include "indexer/data_source.hpp"
 #include "indexer/mwm_set.hpp"
 #include "indexer/rank_table.hpp"
@@ -77,6 +79,7 @@ void PreRanker::FillMissingFieldsInPreResults()
   MwmSet::MwmHandle mwmHandle;
   unique_ptr<RankTable> ranks = make_unique<DummyRankTable>();
   unique_ptr<RankTable> popularityRanks = make_unique<DummyRankTable>();
+  unique_ptr<RankTable> ratings = make_unique<DummyRankTable>();
   unique_ptr<LazyCentersTable> centers;
   bool pivotFeaturesInitialized = false;
 
@@ -94,16 +97,20 @@ void PreRanker::FillMissingFieldsInPreResults()
         ranks = RankTable::Load(mwmHandle.GetValue()->m_cont, SEARCH_RANKS_FILE_TAG);
         popularityRanks =
             RankTable::Load(mwmHandle.GetValue()->m_cont, POPULARITY_RANKS_FILE_TAG);
+        ratings = RankTable::Load(mwmHandle.GetValue()->m_cont, RATINGS_FILE_TAG);
         centers = make_unique<LazyCentersTable>(*mwmHandle.GetValue());
       }
       if (!ranks)
         ranks = make_unique<DummyRankTable>();
       if (!popularityRanks)
         popularityRanks = make_unique<DummyRankTable>();
+      if (!ratings)
+        ratings = make_unique<DummyRankTable>();
     }
 
     info.m_rank = ranks->Get(id.m_index);
     info.m_popularity = popularityRanks->Get(id.m_index);
+    info.m_rating = ugc::UGC::UnpackRating(ratings->Get(id.m_index));
 
     m2::PointD center;
     if (centers && centers->Get(id.m_index, center))
diff --git a/search/pre_ranking_info.cpp b/search/pre_ranking_info.cpp
index 2b05f68d5c..c5a4afa668 100644
--- a/search/pre_ranking_info.cpp
+++ b/search/pre_ranking_info.cpp
@@ -8,17 +8,18 @@ std::string DebugPrint(PreRankingInfo const & info)
 {
   std::ostringstream os;
   os << "PreRankingInfo [";
-  os << "m_distanceToPivot:" << info.m_distanceToPivot << ",";
+  os << "m_distanceToPivot:" << info.m_distanceToPivot << ", ";
   for (size_t i = 0; i < static_cast<size_t>(Model::TYPE_COUNT); ++i)
   {
     if (info.m_tokenRange[i].Empty())
       continue;
     auto const type = static_cast<Model::Type>(i);
-    os << "m_tokenRange[" << DebugPrint(type) << "]:" << DebugPrint(info.m_tokenRange[i]) << ",";
+    os << "m_tokenRange[" << DebugPrint(type) << "]:" << DebugPrint(info.m_tokenRange[i]) << ", ";
   }
 
-  os << "m_rank:" << static_cast<int>(info.m_rank) << ",";
-  os << "m_popularity:" << static_cast<int>(info.m_popularity) << ",";
+  os << "m_rank:" << static_cast<int>(info.m_rank) << ", ";
+  os << "m_popularity:" << static_cast<int>(info.m_popularity) << ", ";
+  os << "m_rating: [" << static_cast<int>(info.m_rating.first) << ", "<< info.m_rating.second << "], ";
   os << "m_type:" << info.m_type;
   os << "]";
   return os.str();
diff --git a/search/pre_ranking_info.hpp b/search/pre_ranking_info.hpp
index b19a76dd1d..8150bf0e36 100644
--- a/search/pre_ranking_info.hpp
+++ b/search/pre_ranking_info.hpp
@@ -12,6 +12,7 @@
 
 #include <array>
 #include <cstdint>
+#include <utility>
 
 namespace search
 {
@@ -59,6 +60,16 @@ struct PreRankingInfo
   // Popularity rank of the feature.
   uint8_t m_popularity = 0;
 
+  // Confidence and UGC rating.
+  // Confidence: 0 - no information
+  //             1 - based on few reviews
+  //             2 - based on average reviews number
+  //             3 - based on large number of reviews.
+  // Rating [4.0 ... 10.0]:
+  //   4.0 and lower represented as 4.0
+  //   higher ratings saved as is from UGC.
+  std::pair<uint8_t, float> m_rating = {0, 0.0f};
+
   // Search type for the feature.
   Model::Type m_type = Model::TYPE_COUNT;
 };
diff --git a/search/ranker.cpp b/search/ranker.cpp
index 8a1c3fada5..2e5bb0fd73 100644
--- a/search/ranker.cpp
+++ b/search/ranker.cpp
@@ -298,6 +298,7 @@ class RankerResultMaker
     info.m_distanceToPivot = MercatorBounds::DistanceOnEarth(center, pivot);
     info.m_rank = preInfo.m_rank;
     info.m_popularity = preInfo.m_popularity;
+    info.m_rating = preInfo.m_rating;
     info.m_type = preInfo.m_type;
     info.m_allTokensUsed = preInfo.m_allTokensUsed;
     info.m_categorialRequest = m_params.IsCategorialRequest();
diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp
index fdb4f675aa..a80cd70cb2 100644
--- a/search/ranking_info.cpp
+++ b/search/ranking_info.cpp
@@ -1,5 +1,7 @@
 #include "search/ranking_info.hpp"
 
+#include "ugc/types.hpp"
+
 #include <algorithm>
 #include <limits>
 #include <sstream>
@@ -16,10 +18,13 @@ double constexpr kDistanceToPivot = -1.0000000;
 double constexpr kRank = 1.0000000;
 // todo: (@t.yan) Adjust.
 double constexpr kPopularity = 0.0500000;
+// todo: (@t.yan) Adjust.
+double constexpr kRating = 0.0500000;
 double constexpr kFalseCats = -0.3691859;
 double constexpr kErrorsMade = -0.0579812;
 double constexpr kAllTokensUsed = 0.0000000;
 double constexpr kHasName = 0.5;
+
 double constexpr kNameScore[NameScore::NAME_SCORE_COUNT] = {
     -0.7245815 /* Zero */,
     0.1853727 /* Substring */,
@@ -48,6 +53,20 @@ double TransformDistance(double distance)
 {
   return min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters;
 }
+
+double TransformRating(pair<uint8_t, float> const & rating)
+{
+  double r = 0.0;
+  // From statistics.
+  double constexpr kAverageRating = 7.6;
+  if (rating.first != 0)
+  {
+    r = (static_cast<double>(rating.second) - kAverageRating) /
+        (ugc::UGC::kMaxRating - ugc::UGC::kRatingDetalizationThreshold);
+    r *= static_cast<double>(rating.first) / 3.0 /* maximal confidence */;
+  }
+  return r;
+}
 }  // namespace
 
 // static
@@ -59,6 +78,7 @@ void RankingInfo::PrintCSVHeader(ostream & os)
   os << "DistanceToPivot"
      << ",Rank"
      << ",Popularity"
+     << ",Rating"
      << ",NameScore"
      << ",ErrorsMade"
      << ",SearchType"
@@ -75,6 +95,8 @@ string DebugPrint(RankingInfo const & info)
   os << "m_distanceToPivot:" << info.m_distanceToPivot;
   os << ", m_rank:" << static_cast<int>(info.m_rank);
   os << ", m_popularity:" << static_cast<int>(info.m_popularity);
+  os << ", m_rating:[" << static_cast<int>(info.m_rating.first) << ", " << info.m_rating.second
+     << "]";
   os << ", m_nameScore:" << DebugPrint(info.m_nameScore);
   os << ", m_errorsMade:" << DebugPrint(info.m_errorsMade);
   os << ", m_type:" << DebugPrint(info.m_type);
@@ -93,6 +115,7 @@ void RankingInfo::ToCSV(ostream & os) const
   os << m_distanceToPivot << ",";
   os << static_cast<int>(m_rank) << ",";
   os << static_cast<int>(m_popularity) << ",";
+  os << TransformRating(m_rating) << ",";
   os << DebugPrint(m_nameScore) << ",";
   os << GetErrorsMade() << ",";
   os << DebugPrint(m_type) << ",";
@@ -112,6 +135,7 @@ double RankingInfo::GetLinearModelRank() const
   double const distanceToPivot = TransformDistance(m_distanceToPivot);
   double const rank = static_cast<double>(m_rank) / numeric_limits<uint8_t>::max();
   double const popularity = static_cast<double>(m_popularity) / numeric_limits<uint8_t>::max();
+  double const rating = TransformRating(m_rating);
 
   auto nameScore = m_nameScore;
   if (m_pureCats || m_falseCats)
@@ -129,6 +153,7 @@ double RankingInfo::GetLinearModelRank() const
   result += kDistanceToPivot * distanceToPivot;
   result += kRank * rank;
   result += kPopularity * popularity;
+  result += kRating * rating;
   result += m_falseCats * kFalseCats;
   if (!m_categorialRequest)
   {
diff --git a/search/ranking_info.hpp b/search/ranking_info.hpp
index 2b47b23632..2668ea0f45 100644
--- a/search/ranking_info.hpp
+++ b/search/ranking_info.hpp
@@ -8,6 +8,7 @@
 #include <cstdint>
 #include <iosfwd>
 #include <string>
+#include <utility>
 
 class FeatureType;
 
@@ -36,6 +37,9 @@ struct RankingInfo
   // Popularity rank of the feature.
   uint8_t m_popularity = 0;
 
+  // Confidence and UGC rating.
+  std::pair<uint8_t, float> m_rating = {0, 0.0f};
+
   // Score for the feature's name.
   NameScore m_nameScore = NAME_SCORE_ZERO;
 
diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py
index ab010dfd64..20b77314d6 100755
--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@@ -20,7 +20,7 @@ MAX_POPULARITY = 255
 RELEVANCES = {'Harmful': -3, 'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
 NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match']
 SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
-FEATURES = ['DistanceToPivot', 'Rank', 'Popularity', 'FalseCats', 'ErrorsMade', 'AllTokensUsed',
+FEATURES = ['DistanceToPivot', 'Rank', 'Popularity', 'Rating', 'FalseCats', 'ErrorsMade', 'AllTokensUsed',
             'CategorialRequest', 'HasName'] + NAME_SCORES + SEARCH_TYPES
 
 BOOTSTRAP_ITERATIONS = 10000
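
Note, not part of the patch: TransformRating() centers the UGC rating on an empirical average of 7.6, scales it by the stored rating span, and damps it by the review-count confidence. The standalone sketch below reproduces that arithmetic so the resulting feature range is easy to see. It assumes ugc::UGC::kMaxRating == 10.0 and ugc::UGC::kRatingDetalizationThreshold == 4.0, matching the "[4.0 ... 10.0]" range documented in pre_ranking_info.hpp; the real constants live in ugc/types.hpp and are not shown in this patch.

// Standalone sketch, illustration only. Mirrors TransformRating() from
// search/ranking_info.cpp with the ugc constants replaced by assumed values
// (kMaxRating = 10.0, kRatingDetalizationThreshold = 4.0) taken from the
// comment in pre_ranking_info.hpp.
#include <cstdint>
#include <iostream>
#include <utility>

double TransformRatingSketch(std::pair<uint8_t, float> const & rating)
{
  double constexpr kAssumedMaxRating = 10.0;  // assumed value of ugc::UGC::kMaxRating
  double constexpr kAssumedThreshold = 4.0;   // assumed value of ugc::UGC::kRatingDetalizationThreshold
  double constexpr kAverageRating = 7.6;      // "From statistics." in the patch
  double constexpr kMaxConfidence = 3.0;      // confidence 3 = large number of reviews

  double r = 0.0;
  if (rating.first != 0)
  {
    // Center on the average rating, scale by the stored span (10.0 - 4.0 = 6.0),
    // then damp by confidence so ratings backed by few reviews move the score less.
    r = (static_cast<double>(rating.second) - kAverageRating) /
        (kAssumedMaxRating - kAssumedThreshold);
    r *= static_cast<double>(rating.first) / kMaxConfidence;
  }
  return r;
}

int main()
{
  std::cout << TransformRatingSketch({0, 0.0f}) << "\n";   // no rating information: 0
  std::cout << TransformRatingSketch({3, 10.0f}) << "\n";  // (10.0 - 7.6) / 6.0 = 0.4
  std::cout << TransformRatingSketch({3, 4.0f}) << "\n";   // (4.0 - 7.6) / 6.0 = -0.6
  std::cout << TransformRatingSketch({1, 4.0f}) << "\n";   // few reviews: -0.6 / 3 = -0.2
  return 0;
}

Under these assumptions the transformed value stays within roughly [-0.6, 0.4], so with kRating = 0.0500000 the new term moves GetLinearModelRank() by at most about 0.03; the "todo: (@t.yan) Adjust." marker indicates the coefficient is a placeholder until scoring_model.py is retrained with the new Rating column.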
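A second sketch, also illustration only: the pre-ranker reads the rating from a RankTable section (RATINGS_FILE_TAG), which stores one uint8_t per feature, and decodes it with ugc::UGC::UnpackRating. The actual packing is defined in ugc/types.hpp and is not part of this patch; the layout below (2 bits of confidence, 6 bits of 0.1 rating steps above 4.0) is hypothetical and only shows why a single byte per feature is enough to carry the (confidence, rating) pair.

// Hypothetical byte layout, illustration only; the real ugc::UGC packing
// in ugc/types.hpp may differ.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>

// Low 2 bits: confidence (0..3). High 6 bits: rating in 0.1 steps above 4.0,
// which covers the documented [4.0 ... 10.0] range with 61 values.
uint8_t PackRatingSketch(uint8_t confidence, float rating)
{
  float const clamped = std::max(4.0f, std::min(10.0f, rating));
  auto const steps = static_cast<uint8_t>((clamped - 4.0f) * 10.0f + 0.5f);  // 0..60
  return static_cast<uint8_t>((confidence & 0x3) | (steps << 2));
}

std::pair<uint8_t, float> UnpackRatingSketch(uint8_t packed)
{
  auto const confidence = static_cast<uint8_t>(packed & 0x3);
  float const rating = 4.0f + static_cast<float>(packed >> 2) / 10.0f;
  return {confidence, confidence == 0 ? 0.0f : rating};
}

int main()
{
  auto const unpacked = UnpackRatingSketch(PackRatingSketch(3 /* confidence */, 8.7f));
  std::cout << static_cast<int>(unpacked.first) << " " << unpacked.second << "\n";  // 3 8.7
  return 0;
}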