diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp index 4a6b41bc03..e962159b5a 100644 --- a/indexer/search_string_utils.cpp +++ b/indexer/search_string_utils.cpp @@ -57,16 +57,21 @@ void RemoveNumeroSigns(UniString & s) } } // namespace +size_t GetMaxErrorsForTokenLength(size_t length) +{ + if (length < 4) + return 0; + if (length < 8) + return 1; + return 2; +} + size_t GetMaxErrorsForToken(strings::UniString const & token) { bool const digitsOnly = all_of(token.begin(), token.end(), ::isdigit); if (digitsOnly) return 0; - if (token.size() < 4) - return 0; - if (token.size() < 8) - return 1; - return 2; + return GetMaxErrorsForTokenLength(token.size()); } strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s) diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index 20415be63d..7ac9e27d53 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -13,6 +13,7 @@ namespace search { +size_t GetMaxErrorsForTokenLength(size_t length); size_t GetMaxErrorsForToken(strings::UniString const & token); strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s); diff --git a/search/ranker.cpp b/search/ranker.cpp index fd6edfa0a8..d565c535a8 100644 --- a/search/ranker.cpp +++ b/search/ranker.cpp @@ -33,15 +33,6 @@ namespace search { namespace { -size_t GetMaxNumberOfErrors(Geocoder::Params const & params) -{ - size_t result = 0; - for (size_t i = 0; i < params.GetNumTokens(); ++i) - result += GetMaxErrorsForToken(params.GetToken(i).GetOriginal()); - - return result; -} - template void UpdateNameScores(string const & name, Slice const & slice, NameScores & bestScores) { @@ -420,7 +411,7 @@ class RankerResultMaker info.m_nameScore = nameScore; info.m_errorsMade = errorsMade; - info.m_maxErrorsMade = GetMaxNumberOfErrors(m_params); + info.m_numTokens = m_params.GetNumTokens(); info.m_matchedFraction = totalLength == 0 ? 
1.0 : static_cast(matchedLength) / static_cast(totalLength); diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp index 17c347342d..71acc6ec8f 100644 --- a/search/ranking_info.cpp +++ b/search/ranking_info.cpp @@ -2,6 +2,8 @@ #include "ugc/types.hpp" +#include "indexer/search_string_utils.hpp" + #include #include #include @@ -102,7 +104,7 @@ string DebugPrint(RankingInfo const & info) << "]"; os << ", m_nameScore:" << DebugPrint(info.m_nameScore); os << ", m_errorsMade:" << DebugPrint(info.m_errorsMade); - os << ", m_maxErrorsMade:" << info.m_maxErrorsMade; + os << ", m_numTokens:" << info.m_numTokens; os << ", m_matchedFraction:" << info.m_matchedFraction; os << ", m_type:" << DebugPrint(info.m_type); os << ", m_pureCats:" << info.m_pureCats; @@ -122,7 +124,7 @@ void RankingInfo::ToCSV(ostream & os) const os << static_cast(m_popularity) << ","; os << TransformRating(m_rating) << ","; os << DebugPrint(m_nameScore) << ","; - os << GetErrorsMade() << ","; + os << GetErrorsMadePerToken() << ","; os << m_matchedFraction << ","; os << DebugPrint(m_type) << ","; os << m_pureCats << ","; @@ -165,7 +167,7 @@ double RankingInfo::GetLinearModelRank() const { result += kType[m_type]; result += kNameScore[nameScore]; - result += kErrorsMade * GetErrorsMade(); + result += kErrorsMade * GetErrorsMadePerToken(); result += kMatchedFraction * m_matchedFraction; result += (m_allTokensUsed ? 1 : 0) * kAllTokensUsed; } @@ -176,15 +178,19 @@ double RankingInfo::GetLinearModelRank() const return result; } -double RankingInfo::GetErrorsMade() const +// We build a LevenshteinDFA based on feature tokens to match the query. +// Feature tokens can be longer than query tokens, which is why every query token can be +// matched to a feature token with the maximal supported number of errors. +// As the maximal number of errors depends only on the number of tokens (not on token lengths), +// errorsMade per token is supposed to be a good metric. 
+double RankingInfo::GetErrorsMadePerToken() const { + size_t static const kMaxErrorsPerToken = + GetMaxErrorsForTokenLength(numeric_limits::max()); if (!m_errorsMade.IsValid()) - return 1.0; + return static_cast(kMaxErrorsPerToken); - if (m_maxErrorsMade == 0) - return 0.0; - - CHECK_GREATER_OR_EQUAL(m_maxErrorsMade, m_errorsMade.m_errorsMade, ()); - return static_cast(m_errorsMade.m_errorsMade) / static_cast(m_maxErrorsMade); + CHECK_GREATER(m_numTokens, 0, ()); + return static_cast(m_errorsMade.m_errorsMade) / static_cast(m_numTokens); } } // namespace search diff --git a/search/ranking_info.hpp b/search/ranking_info.hpp index 6d0faeb8af..e871a4f1c5 100644 --- a/search/ranking_info.hpp +++ b/search/ranking_info.hpp @@ -26,7 +26,7 @@ struct RankingInfo // correspond to important features. double GetLinearModelRank() const; - double GetErrorsMade() const; + double GetErrorsMadePerToken() const; // Distance from the feature to the pivot point. double m_distanceToPivot = kMaxDistMeters; @@ -45,8 +45,8 @@ struct RankingInfo // Number of misprints. ErrorsMade m_errorsMade; - // Maximal number of allowed misprints for query. - size_t m_maxErrorsMade; + // Number of query tokens. NOTE(review): no default initializer is visible here, unlike m_distanceToPivot and m_matchedFraction - confirm it is always assigned before use. + size_t m_numTokens; // Fraction of characters from original query matched to feature. double m_matchedFraction = 0.0;