From 83dd94fd2bcb1b829d18287467b19e40c23b75ea Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Thu, 18 May 2017 18:39:50 +0300 Subject: [PATCH] [search] Changed the name scoring scheme. --- search/intermediate_result.cpp | 14 +++++---- search/locality_scorer.cpp | 2 +- search/ranking_info.cpp | 25 ++++++++++------ search/ranking_utils.cpp | 3 +- search/ranking_utils.hpp | 40 +++++++++++++------------- search/search_quality/scoring_model.py | 9 +----- search/search_tests/ranking_tests.cpp | 12 ++++---- 7 files changed, 54 insertions(+), 51 deletions(-) diff --git a/search/intermediate_result.cpp b/search/intermediate_result.cpp index f0c37fef80..81f53e5bb6 100644 --- a/search/intermediate_result.cpp +++ b/search/intermediate_result.cpp @@ -64,7 +64,8 @@ void ProcessMetadata(FeatureType const & ft, Result::Metadata & meta) meta.m_hotelRating = rating; int pricing; - strings::to_int(src.Get(feature::Metadata::FMD_PRICE_RATE), pricing); + if (!strings::to_int(src.Get(feature::Metadata::FMD_PRICE_RATE), pricing)) + pricing = 0; string pricingStr; CHECK_GREATER_OR_EQUAL(pricing, 0, ("Pricing must be positive!")); for (auto i = 0; i < pricing; i++) @@ -292,11 +293,12 @@ bool PreResult2::IsStreet() const string PreResult2::DebugPrint() const { stringstream ss; - ss << "{ IntermediateResult: " << - "Name: " << m_str << - "; Type: " << GetBestType() << - "; Rank: " << static_cast(m_info.m_rank) << - "; Distance: " << m_distance << " }"; + ss << "IntermediateResult [ " + << "Name: " << m_str + << "; Type: " << GetBestType() + << "; Ranking info: " << search::DebugPrint(m_info) + << "; Linear model rank: " << m_info.GetLinearModelRank() + << " ]"; return ss.str(); } diff --git a/search/locality_scorer.cpp b/search/locality_scorer.cpp index 3fecd7ec4b..06583618e9 100644 --- a/search/locality_scorer.cpp +++ b/search/locality_scorer.cpp @@ -19,7 +19,7 @@ namespace { bool IsAlmostFullMatch(NameScore score) { - return score == NAME_SCORE_FULL_MATCH_PREFIX || score == NAME_SCORE_FULL_MATCH; + return score == NAME_SCORE_PREFIX || score == NAME_SCORE_FULL_MATCH; } } // namespace diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp index 1d299d6a8c..05f1d3659a 100644 --- a/search/ranking_info.cpp +++ b/search/ranking_info.cpp @@ -11,19 +11,26 @@ namespace { // See search/search_quality/scoring_model.py for details. In short, // these coeffs correspond to coeffs in a linear model. -double const kDistanceToPivot = -1.0000000; -double const kRank = 0.7165246; -double const kFalseCats = -0.3833900; +double const kDistanceToPivot = -0.37897824370302247; +double const kRank = 1.0; +double const kFalseCats = -0.05775625793967508; + double const kNameScore[NameScore::NAME_SCORE_COUNT] = { - -0.1069757 /* Zero */, -0.0250079 /* Substring Prefix */, 0.0447104 /* Substring */, - 0.0872732 /* Full Match Prefix */, 0.0872732 /* Full Match */ + -0.11436302557264734 /* Zero */ + , 0.014295634567960331 /* Substring */ + , 0.046219090910780115 /* Prefix */ + , 0.05384830009390816 /* Full Match */ }; double const kSearchType[SearchModel::SEARCH_TYPE_COUNT] = { - -0.3884116 /* POI */, -0.3884116 /* Building */, - -0.3214653 /* Street */, -0.3357469 /* Unclassified */, - -0.4341714 /* Village */, 0.2721947 /* City */, - 0.4708555 /* State */, 0.7367450 /* Country */ + -0.09164609318265761 /* POI */ + , -0.09164609318265761 /* Building */ + , -0.0805969548653964 /* Street */ + , -0.030493728520630793 /* Unclassified */ + , -0.19242203325862917 /* Village */ + , -0.10945592241057521 /* City */ + , 0.19250143015921584 /* State */ + , 0.31211330207867427 /* Country */ }; double TransformDistance(double distance) diff --git a/search/ranking_utils.cpp b/search/ranking_utils.cpp index 16d6f4bcbc..97d0c5792f 100644 --- a/search/ranking_utils.cpp +++ b/search/ranking_utils.cpp @@ -65,9 +65,8 @@ string DebugPrint(NameScore score) switch (score) { case NAME_SCORE_ZERO: return "Zero"; - case NAME_SCORE_SUBSTRING_PREFIX: return "Substring Prefix"; case NAME_SCORE_SUBSTRING: return "Substring"; - case NAME_SCORE_FULL_MATCH_PREFIX: return "Full Match Prefix"; + case NAME_SCORE_PREFIX: return "Prefix"; case NAME_SCORE_FULL_MATCH: return "Full Match"; case NAME_SCORE_COUNT: return "Count"; } diff --git a/search/ranking_utils.hpp b/search/ranking_utils.hpp index bf359e42b2..29a6ae0102 100644 --- a/search/ranking_utils.hpp +++ b/search/ranking_utils.hpp @@ -30,10 +30,9 @@ bool PrefixMatch(QueryParams::Token const & token, strings::UniString const & te enum NameScore { NAME_SCORE_ZERO = 0, - NAME_SCORE_SUBSTRING_PREFIX = 1, - NAME_SCORE_SUBSTRING = 2, - NAME_SCORE_FULL_MATCH_PREFIX = 3, - NAME_SCORE_FULL_MATCH = 4, + NAME_SCORE_SUBSTRING = 1, + NAME_SCORE_PREFIX = 2, + NAME_SCORE_FULL_MATCH = 3, NAME_SCORE_COUNT }; @@ -44,8 +43,8 @@ bool IsStopWord(strings::UniString const & s); // Normalizes, simplifies and splits string, removes stop-words. void PrepareStringForMatching(std::string const & name, std::vector & tokens); -template -NameScore GetNameScore(std::string const & name, TSlice const & slice) +template +NameScore GetNameScore(std::string const & name, Slice const & slice) { if (slice.Empty()) return NAME_SCORE_ZERO; @@ -55,8 +54,8 @@ NameScore GetNameScore(std::string const & name, TSlice const & slice) return GetNameScore(tokens, slice); } -template -NameScore GetNameScore(std::vector const & tokens, TSlice const & slice) +template +NameScore GetNameScore(std::vector const & tokens, Slice const & slice) { if (slice.Empty()) return NAME_SCORE_ZERO; @@ -75,18 +74,19 @@ NameScore GetNameScore(std::vector const & tokens, TSlice co if (!match) continue; - if (impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1])) - { - if (m == n) - return NAME_SCORE_FULL_MATCH; - score = max(score, NAME_SCORE_SUBSTRING); - } - if (lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1])) - { - if (m == n) - return NAME_SCORE_FULL_MATCH_PREFIX; - score = max(score, NAME_SCORE_SUBSTRING_PREFIX); - } + bool const fullMatch = impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]); + bool const prefixMatch = + lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]); + if (!fullMatch && !prefixMatch) + continue; + + if (m == n && fullMatch) + return NAME_SCORE_FULL_MATCH; + + if (offset == 0) + score = max(score, NAME_SCORE_PREFIX); + + score = max(score, NAME_SCORE_SUBSTRING); } return score; } diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index 12cfa8d9de..3a6b259e56 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -16,7 +16,7 @@ import sys MAX_DISTANCE_METERS = 2e6 MAX_RANK = 255 RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3} -NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match'] +NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match'] SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country'] FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES @@ -25,8 +25,6 @@ FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES def transform_name_score(value, categories_match): if categories_match == 1: return 'Zero' - elif value == 'Full Match Prefix': - return 'Full Match' else: return value @@ -40,10 +38,6 @@ def normalize_data(data): cats = data['PureCats'].combine(data['FalseCats'], max) - # Full prefix match is unified with a full match as these features - # are collinear. But we need both of them as they're also used in - # locality sorting. - # # TODO (@y, @m): do forward/backward/subset selection of features # instead of this merging. It would be great to conduct PCA on # the features too. @@ -277,7 +271,6 @@ def main(args): # Following code restores coeffs for merged features. ws[FEATURES.index('Building')] = ws[FEATURES.index('POI')] - ws[FEATURES.index('Full Match Prefix')] = ws[FEATURES.index('Full Match')] ndcgs = compute_ndcgs_for_ws(data, ws) diff --git a/search/search_tests/ranking_tests.cpp b/search/search_tests/ranking_tests.cpp index e9dbc4af09..2b10b73d51 100644 --- a/search/search_tests/ranking_tests.cpp +++ b/search/search_tests/ranking_tests.cpp @@ -45,12 +45,14 @@ UNIT_TEST(NameTest_Smoke) TEST_EQUAL(GetScore("New York", "Central Park, New York, US", TokenRange(2, 4)), NAME_SCORE_FULL_MATCH, ()); TEST_EQUAL(GetScore("New York", "York", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ()); - TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH_PREFIX, - ()); + TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_PREFIX, ()); TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ()); - TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING_PREFIX, ()); + TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ()); TEST_EQUAL(GetScore("San Francisco", "Fran ", TokenRange(0, 1)), NAME_SCORE_ZERO, ()); - TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH_PREFIX, - ()); + TEST_EQUAL(GetScore("San Francisco", "Sa", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); + TEST_EQUAL(GetScore("San Francisco", "San ", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); + TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); + TEST_EQUAL(GetScore("фото на документы", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); + TEST_EQUAL(GetScore("фотоателье", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); } } // namespace