[search] Allow misprints while determining NameScore for ranking.

This commit is contained in:
tatiana-yan 2019-07-08 18:34:52 +03:00 committed by mpimenov
parent 1bdc912179
commit 8ac74cf266
7 changed files with 140 additions and 111 deletions

View file

@ -33,26 +33,33 @@ namespace search
{
namespace
{
struct NameScores
// Accumulates GetMaxErrorsForToken() over the original text of every token in
// |params| and returns the total.
size_t GetMaxNumberOfErros(Geocoder::Params const & params)
{
  size_t maxErrors = 0;
  size_t tokenIdx = 0;
  while (tokenIdx < params.GetNumTokens())
  {
    maxErrors += GetMaxErrorsForToken(params.GetToken(tokenIdx).GetOriginal());
    ++tokenIdx;
  }
  return maxErrors;
}
// Extends NameScores with m_matchedLength.
// NOTE(review): this text is a rendered diff without +/- markers; the m_nameScore and
// m_errorsMade lines below presumably belong to the removed pre-change struct, since
// NameScores declares members with the same names — confirm against the real file.
struct NameScoresEx : public NameScores
{
NameScore m_nameScore = NAME_SCORE_ZERO;
ErrorsMade m_errorsMade;
// Defaults to 0.
size_t m_matchedLength = 0;
};
// Folds the scores obtained for |name| against |slice| into |bestScores|.
// NOTE(review): rendered diff without +/- markers — the two direct assignments to
// m_nameScore/m_errorsMade appear to be the removed lines, and the
// GetNameScores()/NameScores::BestScores() pair their replacement; confirm.
template <typename Slice>
void UpdateNameScores(string const & name, Slice const & slice, NameScores & bestScores)
{
bestScores.m_nameScore = max(bestScores.m_nameScore, GetNameScore(name, slice));
bestScores.m_errorsMade = ErrorsMade::Min(bestScores.m_errorsMade, GetErrorsMade(name, slice));
auto const newScores = GetNameScores(name, slice);
bestScores = NameScores::BestScores(newScores, bestScores);
}
// Folds the scores obtained for already split |tokens| against |slice| into |bestScores|.
// NOTE(review): rendered diff without +/- markers — the two direct assignments to
// m_nameScore/m_errorsMade appear to be the removed lines, and the
// GetNameScores()/NameScores::BestScores() pair their replacement; confirm.
template <typename Slice>
void UpdateNameScores(vector<strings::UniString> const & tokens, Slice const & slice,
NameScores & bestScores)
{
bestScores.m_nameScore = max(bestScores.m_nameScore, GetNameScore(tokens, slice));
bestScores.m_errorsMade = ErrorsMade::Min(bestScores.m_errorsMade, GetErrorsMade(tokens, slice));
auto const newScores = GetNameScores(tokens, slice);
bestScores = NameScores::BestScores(newScores, bestScores);
}
// This function supports only street names like "abcdstrasse"/"abcd strasse".
@ -94,10 +101,10 @@ vector<vector<strings::UniString>> ModifyStrasse(vector<strings::UniString> cons
return result;
}
NameScores GetNameScores(FeatureType & ft, Geocoder::Params const & params,
TokenRange const & range, Model::Type type)
NameScoresEx GetNameScores(FeatureType & ft, Geocoder::Params const & params,
TokenRange const & range, Model::Type type)
{
NameScores bestScores;
NameScoresEx bestScores;
TokenSlice const slice(params, range);
TokenSliceNoCategories const sliceNoCategories(params, range);
@ -420,6 +427,7 @@ class RankerResultMaker
info.m_nameScore = nameScore;
info.m_errorsMade = errorsMade;
info.m_maxErrorsMade = GetMaxNumberOfErros(m_params);
info.m_matchedFraction =
totalLength == 0 ? 1.0
: static_cast<double>(matchedLength) / static_cast<double>(totalLength);

View file

@ -14,32 +14,32 @@ namespace
{
// See search/search_quality/scoring_model.py for details. In short,
// these coeffs correspond to coeffs in a linear model.
double constexpr kDistanceToPivot = -0.4639722;
double constexpr kDistanceToPivot = -0.8175524;
double constexpr kRank = 1.0000000;
// todo: (@t.yan) Adjust.
double constexpr kPopularity = 0.0500000;
// todo: (@t.yan) Adjust.
double constexpr kRating = 0.0500000;
double constexpr kFalseCats = -1.0000000;
double constexpr kErrorsMade = -0.0221024;
double constexpr kMatchedFraction = 0.3817912;
double constexpr kAllTokensUsed = 0.6343994;
double constexpr kFalseCats = -0.3745520;
double constexpr kErrorsMade = -0.1090870;
double constexpr kMatchedFraction = 0.7859737;
double constexpr kAllTokensUsed = 1.0000000;
double constexpr kHasName = 0.5;
double constexpr kNameScore[NameScore::NAME_SCORE_COUNT] = {
-0.2330337 /* Zero */,
0.0413221 /* Substring */,
0.0578796 /* Prefix */,
0.1338319 /* Full Match */
-0.1752510 /* Zero */,
0.0309111 /* Substring */,
0.0127291 /* Prefix */,
0.1316108 /* Full Match */
};
double constexpr kType[Model::TYPE_COUNT] = {
-0.1252380 /* POI */,
-0.1252380 /* Building */,
-0.1197951 /* Street */,
-0.1371600 /* Unclassified */,
-0.0394436 /* Village */,
0.1370968 /* City */,
-0.0810345 /* State */,
0.3655743 /* Country */
-0.1554708 /* POI */,
-0.1554708 /* Building */,
-0.1052415 /* Street */,
-0.1650949 /* Unclassified */,
-0.1556262 /* Village */,
0.1771632 /* City */,
0.0604687 /* State */,
0.3438015 /* Country */
};
// Coeffs sanity checks.
@ -102,6 +102,7 @@ string DebugPrint(RankingInfo const & info)
<< "]";
os << ", m_nameScore:" << DebugPrint(info.m_nameScore);
os << ", m_errorsMade:" << DebugPrint(info.m_errorsMade);
os << ", m_maxErrorsMade:" << info.m_maxErrorsMade;
os << ", m_matchedFraction:" << info.m_matchedFraction;
os << ", m_type:" << DebugPrint(info.m_type);
os << ", m_pureCats:" << info.m_pureCats;
@ -175,8 +176,14 @@ double RankingInfo::GetLinearModelRank() const
return result;
}
// NOTE(review): rendered diff without +/- markers — the size_t signature and the
// single-line ternary return appear to be the removed version; the double-returning
// lines are the new body. Confirm against the real file.
// New body as written: 1.0 when m_errorsMade is not valid, 0.0 when m_maxErrorsMade
// is 0, otherwise m_errorsMade.m_errorsMade divided by m_maxErrorsMade as doubles.
size_t RankingInfo::GetErrorsMade() const
double RankingInfo::GetErrorsMade() const
{
return m_errorsMade.IsValid() ? m_errorsMade.m_errorsMade : 0;
if (!m_errorsMade.IsValid())
return 1.0;
// Guards the division below.
if (m_maxErrorsMade == 0)
return 0.0;
return static_cast<double>(m_errorsMade.m_errorsMade) / static_cast<double>(m_maxErrorsMade);
}
} // namespace search

View file

@ -26,7 +26,7 @@ struct RankingInfo
// correspond to important features.
double GetLinearModelRank() const;
size_t GetErrorsMade() const;
double GetErrorsMade() const;
// Distance from the feature to the pivot point.
double m_distanceToPivot = kMaxDistMeters;
@ -43,8 +43,10 @@ struct RankingInfo
// Score for the feature's name.
NameScore m_nameScore = NAME_SCORE_ZERO;
// Number of typos.
// Number of misprints.
ErrorsMade m_errorsMade;
// Maximal number of allowed misprints for query.
// Zero-initialized like the neighbouring members so that a default-constructed
// RankingInfo never carries an indeterminate value; GetErrorsMade() has an
// explicit branch for 0 and DebugPrint() streams this field.
size_t m_maxErrorsMade = 0;
// Fraction of characters from original query matched to feature.
double m_matchedFraction = 0.0;

View file

@ -1,4 +1,5 @@
#include "search/ranking_utils.hpp"
#include "search/token_slice.hpp"
#include "search/utils.hpp"
@ -26,6 +27,18 @@ struct TokenInfo
};
} // namespace
// static
// Chooses between two score sets. Different name scores: the set with the greater
// m_nameScore is returned unchanged. Equal name scores: a copy of |lhs| whose
// m_errorsMade is ErrorsMade::Min of both inputs.
NameScores NameScores::BestScores(NameScores const & lhs, NameScores const & rhs)
{
  if (lhs.m_nameScore == rhs.m_nameScore)
  {
    NameScores merged = lhs;
    merged.m_errorsMade = ErrorsMade::Min(lhs.m_errorsMade, rhs.m_errorsMade);
    return merged;
  }

  if (lhs.m_nameScore > rhs.m_nameScore)
    return lhs;
  return rhs;
}
// CategoriesInfo ----------------------------------------------------------------------------------
CategoriesInfo::CategoriesInfo(feature::TypesHolder const & holder, TokenSlice const & tokens,
Locales const & locales, CategoriesHolder const & categories)
@ -68,30 +81,32 @@ string DebugPrint(ErrorsMade const & errorsMade)
namespace impl
{
// True when at least one synonym of |token| is equal to |text|.
bool FullMatch(QueryParams::Token const & token, UniString const & text)
{
  return token.AnyOfSynonyms([&text](UniString const & synonym) {
    bool const sameText = (synonym == text);
    return sameText;
  });
}
// True when StartsWith(|text|, synonym) holds for at least one synonym of |token|.
bool PrefixMatch(QueryParams::Token const & token, UniString const & text)
{
  return token.AnyOfSynonyms([&text](UniString const & synonym) {
    bool const textStartsWithSynonym = StartsWith(text, synonym);
    return textStartsWithSynonym;
  });
}
// NOTE(review): rendered diff without +/- markers — the GetMinErrorsMade() signature,
// the range-for over |tokens| and the DFAMove over token.begin()/token.end() appear to
// be the removed version; GetErrorsMade() with ForEachSynonym() is the replacement.
// New version as written: builds a Levenshtein DFA from |text|, feeds each synonym of
// |token| through it and keeps ErrorsMade::Min over the synonyms the DFA accepts.
ErrorsMade GetMinErrorsMade(vector<strings::UniString> const & tokens,
strings::UniString const & text)
ErrorsMade GetErrorsMade(QueryParams::Token const & token, strings::UniString const & text)
{
ErrorsMade errorsMade;
auto const dfa = BuildLevenshteinDFA(text);
ErrorsMade errorsMade;
for (auto const & token : tokens)
{
token.ForEachSynonym([&](strings::UniString const & s) {
auto it = dfa.Begin();
strings::DFAMove(it, token.begin(), token.end());
strings::DFAMove(it, s.begin(), s.end());
if (it.Accepts())
errorsMade = ErrorsMade::Min(errorsMade, ErrorsMade(it.ErrorsMade()));
}
});
return errorsMade;
}
// Feeds every synonym of |token| through a PrefixDFAModifier-wrapped Levenshtein DFA
// built from |text| and returns ErrorsMade::Min over all synonyms the DFA does not
// reject. The result stays default-constructed when every synonym is rejected.
ErrorsMade GetPrefixErrorsMade(QueryParams::Token const & token, strings::UniString const & text)
{
  ErrorsMade best;
  auto const prefixDfa = PrefixDFAModifier<LevenshteinDFA>(BuildLevenshteinDFA(text));

  token.ForEachSynonym([&](strings::UniString const & synonym) {
    auto state = prefixDfa.Begin();
    strings::DFAMove(state, synonym.begin(), synonym.end());
    if (state.Rejects())
      return;
    best = ErrorsMade::Min(best, ErrorsMade(state.ErrorsMade()));
  });

  return best;
}

View file

@ -93,19 +93,15 @@ struct ErrorsMade
size_t m_errorsMade = kInfiniteErrors;
};
string DebugPrint(ErrorsMade const & errorsMade);
std::string DebugPrint(ErrorsMade const & errorsMade);
namespace impl
{
bool FullMatch(QueryParams::Token const & token, strings::UniString const & text);
bool PrefixMatch(QueryParams::Token const & token, strings::UniString const & text);
// Returns the minimum number of errors needed to match |text| with
// any of the |tokens|. If it's not possible in accordance with
// GetMaxErrorsForToken(|text|), returns kInfiniteErrors.
ErrorsMade GetMinErrorsMade(std::vector<strings::UniString> const & tokens,
strings::UniString const & text);
ErrorsMade GetErrorsMade(QueryParams::Token const & token, strings::UniString const & text);
ErrorsMade GetPrefixErrorsMade(QueryParams::Token const & token, strings::UniString const & text);
} // namespace impl
// The order and numeric values are important here. Please, check all
@ -120,6 +116,14 @@ enum NameScore
NAME_SCORE_COUNT
};
// Pair of values produced when a name is scored against query tokens:
// the NameScore itself and the ErrorsMade recorded for it.
struct NameScores
{
// Combines two NameScores into one; declared here, defined out of line.
static NameScores BestScores(NameScores const & lhs, NameScores const & rhs);
// Defaults to NAME_SCORE_ZERO.
NameScore m_nameScore = NAME_SCORE_ZERO;
// Default-constructed until assigned.
ErrorsMade m_errorsMade;
};
// Returns true when |s| is a stop-word and may be removed from a query.
bool IsStopWord(strings::UniString const & s);
@ -127,78 +131,66 @@ bool IsStopWord(strings::UniString const & s);
void PrepareStringForMatching(std::string const & name, std::vector<strings::UniString> & tokens);
// NOTE(review): this span is a rendered diff without +/- markers, so lines of what
// appear to be the removed GetNameScore()/GetErrorsMade() templates (FullMatch /
// PrefixMatch calls, `score`, `return NAME_SCORE_*`) are interleaved with the new
// NameScores-returning GetNameScores() template. The notes below describe the new code.
//
// New GetNameScores(tokens, slice) as written: slides |slice| (m query tokens) over
// |tokens| (n name tokens). For each offset the first m - 1 query tokens must each
// yield a valid impl::GetErrorsMade(); the last one must yield either a valid
// impl::GetErrorsMade() or, when it is a prefix token, a valid
// impl::GetPrefixErrorsMade(). Returns immediately with NAME_SCORE_FULL_MATCH when
// m == n and the last token matches fully; otherwise records NAME_SCORE_PREFIX for
// offset 0 and NAME_SCORE_SUBSTRING for later offsets. An empty slice yields a
// default-constructed NameScores.
template <typename Slice>
NameScore GetNameScore(std::string const & name, Slice const & slice)
NameScores GetNameScores(std::vector<strings::UniString> const & tokens, Slice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;
std::vector<strings::UniString> tokens;
SplitUniString(NormalizeAndSimplifyString(name), base::MakeBackInsertFunctor(tokens), Delimiters());
return GetNameScore(tokens, slice);
}
template <typename Slice>
NameScore GetNameScore(std::vector<strings::UniString> const & tokens, Slice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;
return {};
size_t const n = tokens.size();
size_t const m = slice.Size();
bool const lastTokenIsPrefix = slice.IsPrefix(m - 1);
NameScore score = NAME_SCORE_ZERO;
NameScores scores;
for (size_t offset = 0; offset + m <= n; ++offset)
{
ErrorsMade totalErrorsMade;
bool match = true;
for (size_t i = 0; i < m - 1 && match; ++i)
match = match && impl::FullMatch(slice.Get(i), tokens[offset + i]);
{
auto errorsMade = impl::GetErrorsMade(slice.Get(i), tokens[offset + i]);
match = match && errorsMade.IsValid();
totalErrorsMade += errorsMade;
}
if (!match)
continue;
bool const fullMatch = impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]);
bool const prefixMatch =
lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]);
if (!fullMatch && !prefixMatch)
auto const prefixErrorsMade =
impl::GetPrefixErrorsMade(slice.Get(m - 1), tokens[offset + m - 1]);
auto const fullErrorsMade = impl::GetErrorsMade(slice.Get(m - 1), tokens[offset + m - 1]);
if (!fullErrorsMade.IsValid() && !(prefixErrorsMade.IsValid() && lastTokenIsPrefix))
continue;
if (m == n && fullMatch)
return NAME_SCORE_FULL_MATCH;
if (m == n && fullErrorsMade.IsValid())
{
scores.m_nameScore = NAME_SCORE_FULL_MATCH;
scores.m_errorsMade = totalErrorsMade + fullErrorsMade;
return scores;
}
// NOTE(review): m_nameScore is merged with std::max, but m_errorsMade is assigned
// unconditionally in both branches below, so after the loop it reflects the LAST
// matching offset rather than the offset that produced the best m_nameScore (e.g. a
// later substring match overwrites the errors of an earlier prefix match) — confirm
// whether this is intended.
// NOTE(review): both branches add prefixErrorsMade even when lastTokenIsPrefix is
// false and the offset was admitted only because fullErrorsMade is valid — verify
// that fullErrorsMade should not be used in that case.
if (offset == 0)
score = std::max(score, NAME_SCORE_PREFIX);
score = std::max(score, NAME_SCORE_SUBSTRING);
{
scores.m_nameScore = std::max(scores.m_nameScore, NAME_SCORE_PREFIX);
scores.m_errorsMade = totalErrorsMade + prefixErrorsMade;
}
else
{
scores.m_nameScore = std::max(scores.m_nameScore, NAME_SCORE_SUBSTRING);
scores.m_errorsMade = totalErrorsMade + prefixErrorsMade;
}
}
return score;
}
string DebugPrint(NameScore score);
// Returns total number of errors that were made during matching
// feature |tokens| by a query - query tokens are in |slice|.
template <typename Slice>
ErrorsMade GetErrorsMade(std::vector<strings::UniString> const & tokens, Slice const & slice)
{
ErrorsMade totalErrorsMade;
for (size_t i = 0; i < slice.Size(); ++i)
{
ErrorsMade errorsMade;
slice.Get(i).ForEachSynonym([&](strings::UniString const & s) {
errorsMade = ErrorsMade::Min(errorsMade, impl::GetMinErrorsMade(tokens, s));
});
totalErrorsMade += errorsMade;
}
return totalErrorsMade;
return scores;
}
// NOTE(review): rendered diff without +/- markers — the GetErrorsMade(std::string, ...)
// signature and its single return line appear to be the removed version; confirm.
// New GetNameScores(name, slice) as written: normalizes and simplifies |name|, splits
// it into tokens using Delimiters(), and forwards to the token-vector overload.
template <typename Slice>
ErrorsMade GetErrorsMade(std::string const & s, Slice const & slice)
NameScores GetNameScores(std::string const & name, Slice const & slice)
{
return GetErrorsMade({strings::MakeUniString(s)}, slice);
std::vector<strings::UniString> tokens;
SplitUniString(NormalizeAndSimplifyString(name), base::MakeBackInsertFunctor(tokens),
Delimiters());
return GetNameScores(tokens, slice);
}
std::string DebugPrint(NameScore score);
} // namespace search

View file

@ -561,8 +561,9 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade)
checkErrors("кафе", ErrorsMade());
checkErrors("Cafe Yesenina", ErrorsMade(0));
checkErrors("Cafe Esenina", ErrorsMade(1));
checkErrors("Cafe Jesenina", ErrorsMade(1));
// We allow only Y->{E, J, I, U} misprints for the first letter.
checkErrors("Cafe Esenina", ErrorsMade(2));
checkErrors("Островского кафе", ErrorsMade(0));
checkErrors("Астровского кафе", ErrorsMade(1));
@ -1897,9 +1898,9 @@ UNIT_CLASS_TEST(ProcessorTest, ExactMatchTest)
TEST(ResultsMatch(results, rules), ());
TEST_EQUAL(2, results.size(), ("Unexpected number of retrieved cafes."));
TEST(ResultsMatch({results[0]}, {ExactMatch(wonderlandId, cafe)}), ());
TEST(results[0].GetRankingInfo().m_exactMatch, ());
TEST(!results[1].GetRankingInfo().m_exactMatch, ());
TEST(ResultsMatch({results[0]}, {ExactMatch(wonderlandId, lermontov)}), ());
TEST(!results[0].GetRankingInfo().m_exactMatch, ());
TEST(results[1].GetRankingInfo().m_exactMatch, ());
}
{

View file

@ -39,7 +39,7 @@ NameScore GetScore(string const & name, string const & query, TokenRange const &
params.InitNoPrefix(tokens.begin(), tokens.end());
}
return GetNameScore(name, TokenSlice(params, tokenRange));
return GetNameScores(name, TokenSlice(params, tokenRange)).m_nameScore;
}
UNIT_TEST(NameTest_Smoke)
@ -49,11 +49,15 @@ UNIT_TEST(NameTest_Smoke)
TEST_EQUAL(GetScore("New York", "York", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ());
TEST_EQUAL(GetScore("Moscow", "Red Square Moscw", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ());
TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
TEST_EQUAL(GetScore("San Francisco", "Fran ", TokenRange(0, 1)), NAME_SCORE_ZERO, ());
TEST_EQUAL(GetScore("San Francisco", "Sa", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("San Francisco", "San ", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("Лермонтовъ", "Лермон", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH, ());
TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтово", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH, ());
TEST_EQUAL(GetScore("Лермонтовъ", "Лермнтовъ", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH, ());
TEST_EQUAL(GetScore("фото на документы", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("фотоателье", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
}