From 93c21b9cc23bd51053299995e1eb7dc966486d7e Mon Sep 17 00:00:00 2001 From: tatiana-yan Date: Wed, 19 Jun 2019 14:02:20 +0300 Subject: [PATCH] [search] Add MatchedFraction parameter to RankingInfo. --- search/ranker.cpp | 37 ++++++++++++++++++++------ search/ranking_info.cpp | 10 ++++++- search/ranking_info.hpp | 3 +++ search/search_quality/scoring_model.py | 2 +- 4 files changed, 42 insertions(+), 10 deletions(-) diff --git a/search/ranker.cpp b/search/ranker.cpp index ce3ca18015..9a9d2c502d 100644 --- a/search/ranker.cpp +++ b/search/ranker.cpp @@ -37,6 +37,7 @@ struct NameScores { NameScore m_nameScore = NAME_SCORE_ZERO; ErrorsMade m_errorsMade; + size_t m_matchedLength = 0; }; template @@ -59,8 +60,14 @@ NameScores GetNameScores(FeatureType & ft, Geocoder::Params const & params, { NameScores bestScores; - TokenSlice slice(params, range); - TokenSliceNoCategories sliceNoCategories(params, range); + TokenSlice const slice(params, range); + TokenSliceNoCategories const sliceNoCategories(params, range); + + if (type != Model::Type::TYPE_COUNT) + { + for (size_t i = 0; i < slice.Size(); ++i) + bestScores.m_matchedLength += slice.Get(i).GetOriginal().size(); + } for (auto const lang : params.GetLangs()) { @@ -100,12 +107,14 @@ NameScores GetNameScores(FeatureType & ft, Geocoder::Params const & params, return bestScores; } -ErrorsMade GetErrorsMade(FeatureType & ft, Geocoder::Params const & params, - TokenRange const & range, Model::Type type) +pair MatchTokenRange(FeatureType & ft, Geocoder::Params const & params, + TokenRange const & range, Model::Type type) { - auto errorsMade = GetNameScores(ft, params, range, type).m_errorsMade; + auto const nameScores = GetNameScores(ft, params, range, type); + auto errorsMade = nameScores.m_errorsMade; + auto matchedLength = nameScores.m_matchedLength; if (errorsMade.IsValid()) - return errorsMade; + return make_pair(errorsMade, matchedLength); for (auto const token : range) { @@ -114,9 +123,10 @@ ErrorsMade GetErrorsMade(FeatureType & ft, Geocoder::Params const & params, tokenErrors = ErrorsMade::Max(tokenErrors, ErrorsMade{GetMaxErrorsForToken(s)}); }); errorsMade += tokenErrors; + matchedLength += params.GetToken(token).GetOriginal().size(); } - return errorsMade; + return make_pair(errorsMade, matchedLength); } void RemoveDuplicatingLinear(vector & results) @@ -313,6 +323,7 @@ class RankerResultMaker auto nameScore = nameScores.m_nameScore; auto errorsMade = nameScores.m_errorsMade; + auto matchedLength = nameScores.m_matchedLength; if (info.m_type != Model::TYPE_STREET && preInfo.m_geoParts.m_street != IntersectionResult::kInvalidId) @@ -327,6 +338,7 @@ class RankerResultMaker nameScore = min(nameScore, nameScores.m_nameScore); errorsMade += nameScores.m_errorsMade; + matchedLength += nameScores.m_matchedLength; } } @@ -337,12 +349,21 @@ class RankerResultMaker { auto const type = Model::TYPE_CITY; auto const & range = preInfo.m_tokenRange[type]; - errorsMade += GetErrorsMade(*city, m_params, range, type); + auto const matchingResult = MatchTokenRange(*city, m_params, range, type); + errorsMade += matchingResult.first; + matchedLength += matchingResult.second; } } + size_t totalLength = 0; + for (size_t i = 0; i < m_params.GetNumTokens(); ++i) + totalLength += m_params.GetToken(i).GetOriginal().size(); + info.m_nameScore = nameScore; info.m_errorsMade = errorsMade; + info.m_matchedFraction = + totalLength == 0 ? 1.0 + : static_cast(matchedLength) / static_cast(totalLength); } CategoriesInfo const categoriesInfo(feature::TypesHolder(ft), diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp index a80cd70cb2..40980ff245 100644 --- a/search/ranking_info.cpp +++ b/search/ranking_info.cpp @@ -22,6 +22,8 @@ double constexpr kPopularity = 0.0500000; double constexpr kRating = 0.0500000; double constexpr kFalseCats = -0.3691859; double constexpr kErrorsMade = -0.0579812; +// todo: (@t.yan) Adjust. +double constexpr kMatchedFraction = 0.3; double constexpr kAllTokensUsed = 0.0000000; double constexpr kHasName = 0.5; @@ -81,10 +83,13 @@ void RankingInfo::PrintCSVHeader(ostream & os) << ",Rating" << ",NameScore" << ",ErrorsMade" + << ",MatchedFraction" << ",SearchType" << ",PureCats" << ",FalseCats" - << ",AllTokensUsed"; + << ",AllTokensUsed" + << ",IsCategorialRequest" + << ",HasName"; } string DebugPrint(RankingInfo const & info) @@ -99,6 +104,7 @@ string DebugPrint(RankingInfo const & info) << "]"; os << ", m_nameScore:" << DebugPrint(info.m_nameScore); os << ", m_errorsMade:" << DebugPrint(info.m_errorsMade); + os << ", m_matchedFraction:" << info.m_matchedFraction; os << ", m_type:" << DebugPrint(info.m_type); os << ", m_pureCats:" << info.m_pureCats; os << ", m_falseCats:" << info.m_falseCats; @@ -118,6 +124,7 @@ void RankingInfo::ToCSV(ostream & os) const os << TransformRating(m_rating) << ","; os << DebugPrint(m_nameScore) << ","; os << GetErrorsMade() << ","; + os << m_matchedFraction << ","; os << DebugPrint(m_type) << ","; os << m_pureCats << ","; os << m_falseCats << ","; @@ -160,6 +167,7 @@ double RankingInfo::GetLinearModelRank() const result += kType[m_type]; result += kNameScore[nameScore]; result += kErrorsMade * GetErrorsMade(); + result += kMatchedFraction * m_matchedFraction; result += (m_allTokensUsed ? 1 : 0) * kAllTokensUsed; } else diff --git a/search/ranking_info.hpp b/search/ranking_info.hpp index 8250d6a12b..112cce8c34 100644 --- a/search/ranking_info.hpp +++ b/search/ranking_info.hpp @@ -46,6 +46,9 @@ struct RankingInfo // Number of typos. ErrorsMade m_errorsMade; + // Fraction of characters from original query matched to feature. + double m_matchedFraction = 0.0; + // True iff all tokens that are not stop-words // were used when retrieving the feature. bool m_allTokensUsed = true; diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index e5d31379a3..93e7d03826 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -20,7 +20,7 @@ MAX_POPULARITY = 255 RELEVANCES = {'Harmful': -3, 'Irrelevant': 0, 'Relevant': 1, 'Vital': 3} NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match'] SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country'] -FEATURES = ['DistanceToPivot', 'Rank', 'Popularity', 'Rating', 'FalseCats', 'ErrorsMade', +FEATURES = ['DistanceToPivot', 'Rank', 'Popularity', 'Rating', 'FalseCats', 'ErrorsMade', 'MatchedFraction', 'AllTokensUsed'] + NAME_SCORES + SEARCH_TYPES BOOTSTRAP_ITERATIONS = 10000