From 8ac74cf266315b7010a66eca620545664dfba1b5 Mon Sep 17 00:00:00 2001 From: tatiana-yan Date: Mon, 8 Jul 2019 18:34:52 +0300 Subject: [PATCH] [search] Allow misprints while determining NameScore for ranking. --- search/ranker.cpp | 28 +++-- search/ranking_info.cpp | 45 ++++---- search/ranking_info.hpp | 6 +- search/ranking_utils.cpp | 51 ++++++--- search/ranking_utils.hpp | 104 ++++++++---------- .../processor_test.cpp | 9 +- search/search_tests/ranking_tests.cpp | 8 +- 7 files changed, 140 insertions(+), 111 deletions(-) diff --git a/search/ranker.cpp b/search/ranker.cpp index 1149987bc6..914dff4620 100644 --- a/search/ranker.cpp +++ b/search/ranker.cpp @@ -33,26 +33,33 @@ namespace search { namespace { -struct NameScores +size_t GetMaxNumberOfErros(Geocoder::Params const & params) +{ + size_t result = 0; + for (size_t i = 0; i < params.GetNumTokens(); ++i) + result += GetMaxErrorsForToken(params.GetToken(i).GetOriginal()); + + return result; +} + +struct NameScoresEx : public NameScores { - NameScore m_nameScore = NAME_SCORE_ZERO; - ErrorsMade m_errorsMade; size_t m_matchedLength = 0; }; template void UpdateNameScores(string const & name, Slice const & slice, NameScores & bestScores) { - bestScores.m_nameScore = max(bestScores.m_nameScore, GetNameScore(name, slice)); - bestScores.m_errorsMade = ErrorsMade::Min(bestScores.m_errorsMade, GetErrorsMade(name, slice)); + auto const newScores = GetNameScores(name, slice); + bestScores = NameScores::BestScores(newScores, bestScores); } template void UpdateNameScores(vector const & tokens, Slice const & slice, NameScores & bestScores) { - bestScores.m_nameScore = max(bestScores.m_nameScore, GetNameScore(tokens, slice)); - bestScores.m_errorsMade = ErrorsMade::Min(bestScores.m_errorsMade, GetErrorsMade(tokens, slice)); + auto const newScores = GetNameScores(tokens, slice); + bestScores = NameScores::BestScores(newScores, bestScores); } // This function supports only street names like "abcdstrasse"/"abcd strasse". 
@@ -94,10 +101,10 @@ vector> ModifyStrasse(vector cons return result; } -NameScores GetNameScores(FeatureType & ft, Geocoder::Params const & params, - TokenRange const & range, Model::Type type) +NameScoresEx GetNameScores(FeatureType & ft, Geocoder::Params const & params, + TokenRange const & range, Model::Type type) { - NameScores bestScores; + NameScoresEx bestScores; TokenSlice const slice(params, range); TokenSliceNoCategories const sliceNoCategories(params, range); @@ -420,6 +427,7 @@ class RankerResultMaker info.m_nameScore = nameScore; info.m_errorsMade = errorsMade; + info.m_maxErrorsMade = GetMaxNumberOfErros(m_params); info.m_matchedFraction = totalLength == 0 ? 1.0 : static_cast(matchedLength) / static_cast(totalLength); diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp index 9870576b06..c4cf9838ef 100644 --- a/search/ranking_info.cpp +++ b/search/ranking_info.cpp @@ -14,32 +14,32 @@ namespace { // See search/search_quality/scoring_model.py for details. In short, // these coeffs correspond to coeffs in a linear model. -double constexpr kDistanceToPivot = -0.4639722; +double constexpr kDistanceToPivot = -0.8175524; double constexpr kRank = 1.0000000; // todo: (@t.yan) Adjust. double constexpr kPopularity = 0.0500000; // todo: (@t.yan) Adjust. 
double constexpr kRating = 0.0500000; -double constexpr kFalseCats = -1.0000000; -double constexpr kErrorsMade = -0.0221024; -double constexpr kMatchedFraction = 0.3817912; -double constexpr kAllTokensUsed = 0.6343994; +double constexpr kFalseCats = -0.3745520; +double constexpr kErrorsMade = -0.1090870; +double constexpr kMatchedFraction = 0.7859737; +double constexpr kAllTokensUsed = 1.0000000; double constexpr kHasName = 0.5; double constexpr kNameScore[NameScore::NAME_SCORE_COUNT] = { - -0.2330337 /* Zero */, - 0.0413221 /* Substring */, - 0.0578796 /* Prefix */, - 0.1338319 /* Full Match */ + -0.1752510 /* Zero */, + 0.0309111 /* Substring */, + 0.0127291 /* Prefix */, + 0.1316108 /* Full Match */ }; double constexpr kType[Model::TYPE_COUNT] = { - -0.1252380 /* POI */, - -0.1252380 /* Building */, - -0.1197951 /* Street */, - -0.1371600 /* Unclassified */, - -0.0394436 /* Village */, - 0.1370968 /* City */, - -0.0810345 /* State */, - 0.3655743 /* Country */ + -0.1554708 /* POI */, + -0.1554708 /* Building */, + -0.1052415 /* Street */, + -0.1650949 /* Unclassified */, + -0.1556262 /* Village */, + 0.1771632 /* City */, + 0.0604687 /* State */, + 0.3438015 /* Country */ }; // Coeffs sanity checks. @@ -102,6 +102,7 @@ string DebugPrint(RankingInfo const & info) << "]"; os << ", m_nameScore:" << DebugPrint(info.m_nameScore); os << ", m_errorsMade:" << DebugPrint(info.m_errorsMade); + os << ", m_maxErrorsMade:" << info.m_maxErrorsMade; os << ", m_matchedFraction:" << info.m_matchedFraction; os << ", m_type:" << DebugPrint(info.m_type); os << ", m_pureCats:" << info.m_pureCats; @@ -175,8 +176,14 @@ double RankingInfo::GetLinearModelRank() const return result; } -size_t RankingInfo::GetErrorsMade() const +double RankingInfo::GetErrorsMade() const { - return m_errorsMade.IsValid() ? 
m_errorsMade.m_errorsMade : 0; + if (!m_errorsMade.IsValid()) + return 1.0; + + if (m_maxErrorsMade == 0) + return 0.0; + + return static_cast(m_errorsMade.m_errorsMade) / static_cast(m_maxErrorsMade); } } // namespace search diff --git a/search/ranking_info.hpp b/search/ranking_info.hpp index 112cce8c34..6d0faeb8af 100644 --- a/search/ranking_info.hpp +++ b/search/ranking_info.hpp @@ -26,7 +26,7 @@ struct RankingInfo // correspond to important features. double GetLinearModelRank() const; - size_t GetErrorsMade() const; + double GetErrorsMade() const; // Distance from the feature to the pivot point. double m_distanceToPivot = kMaxDistMeters; @@ -43,8 +43,10 @@ struct RankingInfo // Score for the feature's name. NameScore m_nameScore = NAME_SCORE_ZERO; - // Number of typos. + // Number of misprints. ErrorsMade m_errorsMade; + // Maximal number of allowed misprints for query. + size_t m_maxErrorsMade; // Fraction of characters from original query matched to feature. double m_matchedFraction = 0.0; diff --git a/search/ranking_utils.cpp b/search/ranking_utils.cpp index ae94cf9ff2..90f77c493d 100644 --- a/search/ranking_utils.cpp +++ b/search/ranking_utils.cpp @@ -1,4 +1,5 @@ #include "search/ranking_utils.hpp" + #include "search/token_slice.hpp" #include "search/utils.hpp" @@ -26,6 +27,18 @@ struct TokenInfo }; } // namespace +// static +NameScores NameScores::BestScores(NameScores const & lhs, NameScores const & rhs) +{ + if (lhs.m_nameScore != rhs.m_nameScore) + return lhs.m_nameScore > rhs.m_nameScore ? 
lhs : rhs; + + NameScores result = lhs; + result.m_errorsMade = ErrorsMade::Min(lhs.m_errorsMade, rhs.m_errorsMade); + + return result; +} + // CategoriesInfo ---------------------------------------------------------------------------------- CategoriesInfo::CategoriesInfo(feature::TypesHolder const & holder, TokenSlice const & tokens, Locales const & locales, CategoriesHolder const & categories) @@ -68,30 +81,32 @@ string DebugPrint(ErrorsMade const & errorsMade) namespace impl { -bool FullMatch(QueryParams::Token const & token, UniString const & text) -{ - return token.AnyOfSynonyms([&text](UniString const & s) { return s == text; }); -} - -bool PrefixMatch(QueryParams::Token const & token, UniString const & text) -{ - return token.AnyOfSynonyms([&text](UniString const & s) { return StartsWith(text, s); }); -} - -ErrorsMade GetMinErrorsMade(vector const & tokens, - strings::UniString const & text) +ErrorsMade GetErrorsMade(QueryParams::Token const & token, strings::UniString const & text) { + ErrorsMade errorsMade; auto const dfa = BuildLevenshteinDFA(text); - ErrorsMade errorsMade; - - for (auto const & token : tokens) - { + token.ForEachSynonym([&](strings::UniString const & s) { auto it = dfa.Begin(); - strings::DFAMove(it, token.begin(), token.end()); + strings::DFAMove(it, s.begin(), s.end()); if (it.Accepts()) errorsMade = ErrorsMade::Min(errorsMade, ErrorsMade(it.ErrorsMade())); - } + }); + + return errorsMade; +} + +ErrorsMade GetPrefixErrorsMade(QueryParams::Token const & token, strings::UniString const & text) +{ + ErrorsMade errorsMade; + auto const dfa = PrefixDFAModifier(BuildLevenshteinDFA(text)); + + token.ForEachSynonym([&](strings::UniString const & s) { + auto it = dfa.Begin(); + strings::DFAMove(it, s.begin(), s.end()); + if (!it.Rejects()) + errorsMade = ErrorsMade::Min(errorsMade, ErrorsMade(it.ErrorsMade())); + }); return errorsMade; } diff --git a/search/ranking_utils.hpp b/search/ranking_utils.hpp index 19ff21c6fe..2df47eb731 100644 --- 
a/search/ranking_utils.hpp +++ b/search/ranking_utils.hpp @@ -93,19 +93,15 @@ struct ErrorsMade size_t m_errorsMade = kInfiniteErrors; }; -string DebugPrint(ErrorsMade const & errorsMade); +std::string DebugPrint(ErrorsMade const & errorsMade); namespace impl { -bool FullMatch(QueryParams::Token const & token, strings::UniString const & text); - -bool PrefixMatch(QueryParams::Token const & token, strings::UniString const & text); - // Returns the minimum number of errors needed to match |text| with // any of the |tokens|. If it's not possible in accordance with // GetMaxErrorsForToken(|text|), returns kInfiniteErrors. -ErrorsMade GetMinErrorsMade(std::vector const & tokens, - strings::UniString const & text); +ErrorsMade GetErrorsMade(QueryParams::Token const & token, strings::UniString const & text); +ErrorsMade GetPrefixErrorsMade(QueryParams::Token const & token, strings::UniString const & text); } // namespace impl // The order and numeric values are important here. Please, check all @@ -120,6 +116,14 @@ enum NameScore NAME_SCORE_COUNT }; +struct NameScores +{ + static NameScores BestScores(NameScores const & lhs, NameScores const & rhs); + + NameScore m_nameScore = NAME_SCORE_ZERO; + ErrorsMade m_errorsMade; +}; + // Returns true when |s| is a stop-word and may be removed from a query. 
bool IsStopWord(strings::UniString const & s); @@ -127,78 +131,66 @@ bool IsStopWord(strings::UniString const & s); void PrepareStringForMatching(std::string const & name, std::vector & tokens); template -NameScore GetNameScore(std::string const & name, Slice const & slice) +NameScores GetNameScores(std::vector const & tokens, Slice const & slice) { if (slice.Empty()) - return NAME_SCORE_ZERO; - - std::vector tokens; - SplitUniString(NormalizeAndSimplifyString(name), base::MakeBackInsertFunctor(tokens), Delimiters()); - return GetNameScore(tokens, slice); -} - -template -NameScore GetNameScore(std::vector const & tokens, Slice const & slice) -{ - if (slice.Empty()) - return NAME_SCORE_ZERO; + return {}; size_t const n = tokens.size(); size_t const m = slice.Size(); bool const lastTokenIsPrefix = slice.IsPrefix(m - 1); - NameScore score = NAME_SCORE_ZERO; + NameScores scores; for (size_t offset = 0; offset + m <= n; ++offset) { + ErrorsMade totalErrorsMade; bool match = true; for (size_t i = 0; i < m - 1 && match; ++i) - match = match && impl::FullMatch(slice.Get(i), tokens[offset + i]); + { + auto errorsMade = impl::GetErrorsMade(slice.Get(i), tokens[offset + i]); + match = match && errorsMade.IsValid(); + totalErrorsMade += errorsMade; + } + if (!match) continue; - bool const fullMatch = impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]); - bool const prefixMatch = - lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]); - if (!fullMatch && !prefixMatch) + auto const prefixErrorsMade = + impl::GetPrefixErrorsMade(slice.Get(m - 1), tokens[offset + m - 1]); + auto const fullErrorsMade = impl::GetErrorsMade(slice.Get(m - 1), tokens[offset + m - 1]); + if (!fullErrorsMade.IsValid() && !(prefixErrorsMade.IsValid() && lastTokenIsPrefix)) continue; - if (m == n && fullMatch) - return NAME_SCORE_FULL_MATCH; + if (m == n && fullErrorsMade.IsValid()) + { + scores.m_nameScore = NAME_SCORE_FULL_MATCH; + scores.m_errorsMade = totalErrorsMade + 
fullErrorsMade; + return scores; + } if (offset == 0) - score = std::max(score, NAME_SCORE_PREFIX); - - score = std::max(score, NAME_SCORE_SUBSTRING); + { + scores.m_nameScore = std::max(scores.m_nameScore, NAME_SCORE_PREFIX); + scores.m_errorsMade = totalErrorsMade + prefixErrorsMade; + } + else + { + scores.m_nameScore = std::max(scores.m_nameScore, NAME_SCORE_SUBSTRING); + scores.m_errorsMade = totalErrorsMade + prefixErrorsMade; + } } - return score; -} - -string DebugPrint(NameScore score); - -// Returns total number of errors that were made during matching -// feature |tokens| by a query - query tokens are in |slice|. -template -ErrorsMade GetErrorsMade(std::vector const & tokens, Slice const & slice) -{ - ErrorsMade totalErrorsMade; - - for (size_t i = 0; i < slice.Size(); ++i) - { - ErrorsMade errorsMade; - slice.Get(i).ForEachSynonym([&](strings::UniString const & s) { - errorsMade = ErrorsMade::Min(errorsMade, impl::GetMinErrorsMade(tokens, s)); - }); - - totalErrorsMade += errorsMade; - } - - return totalErrorsMade; + return scores; } template -ErrorsMade GetErrorsMade(std::string const & s, Slice const & slice) +NameScores GetNameScores(std::string const & name, Slice const & slice) { - return GetErrorsMade({strings::MakeUniString(s)}, slice); + std::vector tokens; + SplitUniString(NormalizeAndSimplifyString(name), base::MakeBackInsertFunctor(tokens), + Delimiters()); + return GetNameScores(tokens, slice); } + +std::string DebugPrint(NameScore score); } // namespace search diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index 8b830efbe8..0f780347cc 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -561,8 +561,9 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade) checkErrors("кафе", ErrorsMade()); checkErrors("Cafe Yesenina", ErrorsMade(0)); - checkErrors("Cafe Esenina", ErrorsMade(1)); checkErrors("Cafe 
Jesenina", ErrorsMade(1)); + // We allow only Y->{E, J, I, U} misprints for the first letter. + checkErrors("Cafe Esenina", ErrorsMade(2)); checkErrors("Островского кафе", ErrorsMade(0)); checkErrors("Астровского кафе", ErrorsMade(1)); @@ -1897,9 +1898,9 @@ UNIT_CLASS_TEST(ProcessorTest, ExactMatchTest) TEST(ResultsMatch(results, rules), ()); TEST_EQUAL(2, results.size(), ("Unexpected number of retrieved cafes.")); - TEST(ResultsMatch({results[0]}, {ExactMatch(wonderlandId, cafe)}), ()); - TEST(results[0].GetRankingInfo().m_exactMatch, ()); - TEST(!results[1].GetRankingInfo().m_exactMatch, ()); + TEST(ResultsMatch({results[0]}, {ExactMatch(wonderlandId, lermontov)}), ()); + TEST(!results[0].GetRankingInfo().m_exactMatch, ()); + TEST(results[1].GetRankingInfo().m_exactMatch, ()); } { diff --git a/search/search_tests/ranking_tests.cpp b/search/search_tests/ranking_tests.cpp index caa82ba84e..7ee3b6c7b7 100644 --- a/search/search_tests/ranking_tests.cpp +++ b/search/search_tests/ranking_tests.cpp @@ -39,7 +39,7 @@ NameScore GetScore(string const & name, string const & query, TokenRange const & params.InitNoPrefix(tokens.begin(), tokens.end()); } - return GetNameScore(name, TokenSlice(params, tokenRange)); + return GetNameScores(name, TokenSlice(params, tokenRange)).m_nameScore; } UNIT_TEST(NameTest_Smoke) @@ -49,11 +49,15 @@ UNIT_TEST(NameTest_Smoke) TEST_EQUAL(GetScore("New York", "York", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ()); TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_PREFIX, ()); TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ()); + TEST_EQUAL(GetScore("Moscow", "Red Square Moscw", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ()); TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ()); TEST_EQUAL(GetScore("San Francisco", "Fran ", TokenRange(0, 1)), NAME_SCORE_ZERO, ()); TEST_EQUAL(GetScore("San Francisco", "Sa", TokenRange(0, 1)), 
NAME_SCORE_PREFIX, ()); TEST_EQUAL(GetScore("San Francisco", "San ", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); - TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); + TEST_EQUAL(GetScore("Лермонтовъ", "Лермон", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); + TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH, ()); + TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтово", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH, ()); + TEST_EQUAL(GetScore("Лермонтовъ", "Лермнтовъ", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH, ()); TEST_EQUAL(GetScore("фото на документы", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); TEST_EQUAL(GetScore("фотоателье", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ()); }