forked from organicmaps/organicmaps
[search] Allow misprints while determining NameScore for ranking.
parent 1bdc912179
commit 8ac74cf266
7 changed files with 140 additions and 111 deletions

@@ -33,26 +33,33 @@ namespace search
{
namespace
{
struct NameScores
size_t GetMaxNumberOfErros(Geocoder::Params const & params)
{
  size_t result = 0;
  for (size_t i = 0; i < params.GetNumTokens(); ++i)
    result += GetMaxErrorsForToken(params.GetToken(i).GetOriginal());

  return result;
}

struct NameScoresEx : public NameScores
{
  NameScore m_nameScore = NAME_SCORE_ZERO;
  ErrorsMade m_errorsMade;
  size_t m_matchedLength = 0;
};

template <typename Slice>
void UpdateNameScores(string const & name, Slice const & slice, NameScores & bestScores)
{
  bestScores.m_nameScore = max(bestScores.m_nameScore, GetNameScore(name, slice));
  bestScores.m_errorsMade = ErrorsMade::Min(bestScores.m_errorsMade, GetErrorsMade(name, slice));
  auto const newScores = GetNameScores(name, slice);
  bestScores = NameScores::BestScores(newScores, bestScores);
}

template <typename Slice>
void UpdateNameScores(vector<strings::UniString> const & tokens, Slice const & slice,
                      NameScores & bestScores)
{
  bestScores.m_nameScore = max(bestScores.m_nameScore, GetNameScore(tokens, slice));
  bestScores.m_errorsMade = ErrorsMade::Min(bestScores.m_errorsMade, GetErrorsMade(tokens, slice));
  auto const newScores = GetNameScores(tokens, slice);
  bestScores = NameScores::BestScores(newScores, bestScores);
}

// This function supports only street names like "abcdstrasse"/"abcd strasse".
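Note: GetMaxNumberOfErros above sums a per-token misprint budget over the whole query; the per-token rule (GetMaxErrorsForToken) is not shown in this diff. Below is a minimal self-contained sketch of that accumulation, with an assumed per-token rule that is illustrative only.

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Illustrative per-token rule, assumed for this sketch only: short tokens get no misprint
// budget, longer ones get one. The real GetMaxErrorsForToken may differ.
size_t MaxErrorsForTokenSketch(std::string const & token)
{
  return token.size() <= 4 ? 0 : 1;
}

// Mirrors GetMaxNumberOfErros: the query-wide budget is the sum of per-token budgets.
size_t MaxNumberOfErrorsSketch(std::vector<std::string> const & queryTokens)
{
  size_t result = 0;
  for (auto const & token : queryTokens)
    result += MaxErrorsForTokenSketch(token);
  return result;
}

int main()
{
  assert(MaxNumberOfErrorsSketch({"cafe", "yesenina"}) == 1);  // only the long token gets a budget
  return 0;
}
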
@@ -94,10 +101,10 @@ vector<vector<strings::UniString>> ModifyStrasse(vector<strings::UniString> cons
  return result;
}

NameScores GetNameScores(FeatureType & ft, Geocoder::Params const & params,
                         TokenRange const & range, Model::Type type)
NameScoresEx GetNameScores(FeatureType & ft, Geocoder::Params const & params,
                           TokenRange const & range, Model::Type type)
{
  NameScores bestScores;
  NameScoresEx bestScores;

  TokenSlice const slice(params, range);
  TokenSliceNoCategories const sliceNoCategories(params, range);
@@ -420,6 +427,7 @@ class RankerResultMaker

    info.m_nameScore = nameScore;
    info.m_errorsMade = errorsMade;
    info.m_maxErrorsMade = GetMaxNumberOfErros(m_params);
    info.m_matchedFraction =
        totalLength == 0 ? 1.0
                         : static_cast<double>(matchedLength) / static_cast<double>(totalLength);

@@ -14,32 +14,32 @@ namespace
{
// See search/search_quality/scoring_model.py for details. In short,
// these coeffs correspond to coeffs in a linear model.
double constexpr kDistanceToPivot = -0.4639722;
double constexpr kDistanceToPivot = -0.8175524;
double constexpr kRank = 1.0000000;
// todo: (@t.yan) Adjust.
double constexpr kPopularity = 0.0500000;
// todo: (@t.yan) Adjust.
double constexpr kRating = 0.0500000;
double constexpr kFalseCats = -1.0000000;
double constexpr kErrorsMade = -0.0221024;
double constexpr kMatchedFraction = 0.3817912;
double constexpr kAllTokensUsed = 0.6343994;
double constexpr kFalseCats = -0.3745520;
double constexpr kErrorsMade = -0.1090870;
double constexpr kMatchedFraction = 0.7859737;
double constexpr kAllTokensUsed = 1.0000000;
double constexpr kHasName = 0.5;
double constexpr kNameScore[NameScore::NAME_SCORE_COUNT] = {
    -0.2330337 /* Zero */,
    0.0413221 /* Substring */,
    0.0578796 /* Prefix */,
    0.1338319 /* Full Match */
    -0.1752510 /* Zero */,
    0.0309111 /* Substring */,
    0.0127291 /* Prefix */,
    0.1316108 /* Full Match */
};
double constexpr kType[Model::TYPE_COUNT] = {
    -0.1252380 /* POI */,
    -0.1252380 /* Building */,
    -0.1197951 /* Street */,
    -0.1371600 /* Unclassified */,
    -0.0394436 /* Village */,
    0.1370968 /* City */,
    -0.0810345 /* State */,
    0.3655743 /* Country */
    -0.1554708 /* POI */,
    -0.1554708 /* Building */,
    -0.1052415 /* Street */,
    -0.1650949 /* Unclassified */,
    -0.1556262 /* Village */,
    0.1771632 /* City */,
    0.0604687 /* State */,
    0.3438015 /* Country */
};

// Coeffs sanity checks.
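Note: these coefficients feed RankingInfo::GetLinearModelRank(), whose body is not part of this diff. The sketch below is a hypothetical, self-contained illustration of how such a linear model combines the normalized features; the coefficient values are copied from the new constants above, and only a subset of the terms is shown. Everything else (names, normalization details) is assumed.

#include <array>
#include <cstddef>

// Hypothetical sketch only: not the real GetLinearModelRank(), which this diff does not show.
namespace sketch
{
double constexpr kDistanceToPivot = -0.8175524;
double constexpr kRank = 1.0000000;
double constexpr kFalseCats = -0.3745520;
double constexpr kErrorsMade = -0.1090870;
double constexpr kMatchedFraction = 0.7859737;
double constexpr kAllTokensUsed = 1.0000000;
std::array<double, 4> constexpr kNameScore = {-0.1752510, 0.0309111, 0.0127291, 0.1316108};

double LinearRank(double distanceToPivot, double rank, bool falseCats, double errorsMadeFraction,
                  double matchedFraction, bool allTokensUsed, size_t nameScore)
{
  double result = 0.0;
  result += kDistanceToPivot * distanceToPivot;            // normalized distance to the pivot
  result += kRank * rank;                                  // precomputed feature rank
  result += kNameScore[nameScore];                         // Zero/Substring/Prefix/Full Match bucket
  result += kFalseCats * (falseCats ? 1.0 : 0.0);
  result += kErrorsMade * errorsMadeFraction;              // misprints made / misprints allowed
  result += kMatchedFraction * matchedFraction;            // matched length / query length
  result += kAllTokensUsed * (allTokensUsed ? 1.0 : 0.0);
  return result;
}
}  // namespace sketch
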
@@ -102,6 +102,7 @@ string DebugPrint(RankingInfo const & info)
     << "]";
  os << ", m_nameScore:" << DebugPrint(info.m_nameScore);
  os << ", m_errorsMade:" << DebugPrint(info.m_errorsMade);
  os << ", m_maxErrorsMade:" << info.m_maxErrorsMade;
  os << ", m_matchedFraction:" << info.m_matchedFraction;
  os << ", m_type:" << DebugPrint(info.m_type);
  os << ", m_pureCats:" << info.m_pureCats;
@@ -175,8 +176,14 @@ double RankingInfo::GetLinearModelRank() const
  return result;
}

size_t RankingInfo::GetErrorsMade() const
double RankingInfo::GetErrorsMade() const
{
  return m_errorsMade.IsValid() ? m_errorsMade.m_errorsMade : 0;
  if (!m_errorsMade.IsValid())
    return 1.0;

  if (m_maxErrorsMade == 0)
    return 0.0;

  return static_cast<double>(m_errorsMade.m_errorsMade) / static_cast<double>(m_maxErrorsMade);
}
}  // namespace search

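Note: GetErrorsMade() now returns the used share of the allowed misprint budget instead of a raw count. Below is a standalone restatement of that normalization rule (illustrative names, not the real RankingInfo).

#include <cassert>
#include <cstddef>

// No valid match -> worst value 1.0; no misprints allowed -> 0.0;
// otherwise the share of the allowed error budget that was used.
double NormalizedErrors(bool errorsValid, size_t errorsMade, size_t maxErrorsAllowed)
{
  if (!errorsValid)
    return 1.0;
  if (maxErrorsAllowed == 0)
    return 0.0;
  return static_cast<double>(errorsMade) / static_cast<double>(maxErrorsAllowed);
}

int main()
{
  assert(NormalizedErrors(false, 0, 3) == 1.0);  // query did not match at all
  assert(NormalizedErrors(true, 0, 0) == 0.0);   // misprints were not allowed
  assert(NormalizedErrors(true, 1, 2) == 0.5);   // used half of the misprint budget
  return 0;
}
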
@@ -26,7 +26,7 @@ struct RankingInfo
  // correspond to important features.
  double GetLinearModelRank() const;

  size_t GetErrorsMade() const;
  double GetErrorsMade() const;

  // Distance from the feature to the pivot point.
  double m_distanceToPivot = kMaxDistMeters;
@@ -43,8 +43,10 @@ struct RankingInfo
  // Score for the feature's name.
  NameScore m_nameScore = NAME_SCORE_ZERO;

  // Number of typos.
  // Number of misprints.
  ErrorsMade m_errorsMade;
  // Maximal number of allowed misprints for query.
  size_t m_maxErrorsMade;

  // Fraction of characters from original query matched to feature.
  double m_matchedFraction = 0.0;

@@ -1,4 +1,5 @@
#include "search/ranking_utils.hpp"

#include "search/token_slice.hpp"
#include "search/utils.hpp"

@@ -26,6 +27,18 @@ struct TokenInfo
};
}  // namespace

// static
NameScores NameScores::BestScores(NameScores const & lhs, NameScores const & rhs)
{
  if (lhs.m_nameScore != rhs.m_nameScore)
    return lhs.m_nameScore > rhs.m_nameScore ? lhs : rhs;

  NameScores result = lhs;
  result.m_errorsMade = ErrorsMade::Min(lhs.m_errorsMade, rhs.m_errorsMade);

  return result;
}

// CategoriesInfo ----------------------------------------------------------------------------------
CategoriesInfo::CategoriesInfo(feature::TypesHolder const & holder, TokenSlice const & tokens,
                               Locales const & locales, CategoriesHolder const & categories)
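Note: NameScores::BestScores prefers the higher name score and, on a tie, the smaller number of misprints. Below is a small self-contained illustration of that merge rule, using simplified stand-ins for the project's NameScores/ErrorsMade types.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <limits>

// Simplified stand-ins for ErrorsMade/NameScores, to illustrate the merge rule only.
struct ErrorsMadeLite
{
  static size_t constexpr kInfinite = std::numeric_limits<size_t>::max();
  size_t m_errors = kInfinite;  // kInfinite means "no match"
};

struct NameScoresLite
{
  int m_nameScore = 0;  // 0 = Zero, 1 = Substring, 2 = Prefix, 3 = Full Match
  ErrorsMadeLite m_errorsMade;
};

// Mirrors NameScores::BestScores: better name score wins; on a tie, fewer misprints win.
NameScoresLite Best(NameScoresLite const & lhs, NameScoresLite const & rhs)
{
  if (lhs.m_nameScore != rhs.m_nameScore)
    return lhs.m_nameScore > rhs.m_nameScore ? lhs : rhs;
  NameScoresLite result = lhs;
  result.m_errorsMade.m_errors = std::min(lhs.m_errorsMade.m_errors, rhs.m_errorsMade.m_errors);
  return result;
}

int main()
{
  NameScoresLite fullWithTypo{3, {1}};
  NameScoresLite prefixExact{2, {0}};
  assert(Best(fullWithTypo, prefixExact).m_nameScore == 3);          // name score dominates
  NameScoresLite fullExact{3, {0}};
  assert(Best(fullWithTypo, fullExact).m_errorsMade.m_errors == 0);  // tie -> fewer misprints
  return 0;
}
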
@@ -68,30 +81,32 @@ string DebugPrint(ErrorsMade const & errorsMade)

namespace impl
{
bool FullMatch(QueryParams::Token const & token, UniString const & text)
{
  return token.AnyOfSynonyms([&text](UniString const & s) { return s == text; });
}

bool PrefixMatch(QueryParams::Token const & token, UniString const & text)
{
  return token.AnyOfSynonyms([&text](UniString const & s) { return StartsWith(text, s); });
}

ErrorsMade GetMinErrorsMade(vector<strings::UniString> const & tokens,
                            strings::UniString const & text)
ErrorsMade GetErrorsMade(QueryParams::Token const & token, strings::UniString const & text)
{
  ErrorsMade errorsMade;
  auto const dfa = BuildLevenshteinDFA(text);

  ErrorsMade errorsMade;

  for (auto const & token : tokens)
  {
    token.ForEachSynonym([&](strings::UniString const & s) {
      auto it = dfa.Begin();
      strings::DFAMove(it, token.begin(), token.end());
      strings::DFAMove(it, s.begin(), s.end());
      if (it.Accepts())
        errorsMade = ErrorsMade::Min(errorsMade, ErrorsMade(it.ErrorsMade()));
    }
  });

  return errorsMade;
}

ErrorsMade GetPrefixErrorsMade(QueryParams::Token const & token, strings::UniString const & text)
{
  ErrorsMade errorsMade;
  auto const dfa = PrefixDFAModifier<LevenshteinDFA>(BuildLevenshteinDFA(text));

  token.ForEachSynonym([&](strings::UniString const & s) {
    auto it = dfa.Begin();
    strings::DFAMove(it, s.begin(), s.end());
    if (!it.Rejects())
      errorsMade = ErrorsMade::Min(errorsMade, ErrorsMade(it.ErrorsMade()));
  });

  return errorsMade;
}

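Note: both helpers above rely on the project's Levenshtein DFA machinery (BuildLevenshteinDFA, DFAMove, PrefixDFAModifier), which is not reproduced in this diff. As a rough, self-contained illustration of what "errors made" means, the sketch below computes a plain dynamic-programming edit distance capped by a per-token budget; the real code gets an equivalent answer from the DFA without building the full table.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <optional>
#include <string>
#include <vector>

// Conceptual stand-in for the DFA-based matching: returns the number of edits needed to turn
// |query| into |text|, or std::nullopt when it exceeds |maxErrors| (the "rejects" case).
std::optional<size_t> ErrorsMadeSketch(std::string const & query, std::string const & text,
                                       size_t maxErrors)
{
  size_t const n = query.size(), m = text.size();
  std::vector<std::vector<size_t>> d(n + 1, std::vector<size_t>(m + 1, 0));
  for (size_t i = 0; i <= n; ++i)
    d[i][0] = i;
  for (size_t j = 0; j <= m; ++j)
    d[0][j] = j;
  for (size_t i = 1; i <= n; ++i)
  {
    for (size_t j = 1; j <= m; ++j)
    {
      size_t const cost = query[i - 1] == text[j - 1] ? 0 : 1;
      d[i][j] = std::min({d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost});
    }
  }
  if (d[n][m] > maxErrors)
    return std::nullopt;
  return d[n][m];
}

int main()
{
  assert(ErrorsMadeSketch("moscow", "moscow", 1) == 0);  // exact match, no misprints
  assert(ErrorsMadeSketch("moscw", "moscow", 1) == 1);   // one misprint allowed and used
  assert(!ErrorsMadeSketch("msk", "moscow", 1));         // too many edits: no match
  return 0;
}
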
@@ -93,19 +93,15 @@ struct ErrorsMade
  size_t m_errorsMade = kInfiniteErrors;
};

string DebugPrint(ErrorsMade const & errorsMade);
std::string DebugPrint(ErrorsMade const & errorsMade);

namespace impl
{
bool FullMatch(QueryParams::Token const & token, strings::UniString const & text);

bool PrefixMatch(QueryParams::Token const & token, strings::UniString const & text);

// Returns the minimum number of errors needed to match |text| with
// any of the |tokens|. If it's not possible in accordance with
// GetMaxErrorsForToken(|text|), returns kInfiniteErrors.
ErrorsMade GetMinErrorsMade(std::vector<strings::UniString> const & tokens,
                            strings::UniString const & text);
ErrorsMade GetErrorsMade(QueryParams::Token const & token, strings::UniString const & text);
ErrorsMade GetPrefixErrorsMade(QueryParams::Token const & token, strings::UniString const & text);
}  // namespace impl

// The order and numeric values are important here. Please, check all
@@ -120,6 +116,14 @@ enum NameScore
  NAME_SCORE_COUNT
};

struct NameScores
{
  static NameScores BestScores(NameScores const & lhs, NameScores const & rhs);

  NameScore m_nameScore = NAME_SCORE_ZERO;
  ErrorsMade m_errorsMade;
};

// Returns true when |s| is a stop-word and may be removed from a query.
bool IsStopWord(strings::UniString const & s);

@@ -127,78 +131,66 @@ bool IsStopWord(strings::UniString const & s);
void PrepareStringForMatching(std::string const & name, std::vector<strings::UniString> & tokens);

template <typename Slice>
NameScore GetNameScore(std::string const & name, Slice const & slice)
NameScores GetNameScores(std::vector<strings::UniString> const & tokens, Slice const & slice)
{
  if (slice.Empty())
    return NAME_SCORE_ZERO;

  std::vector<strings::UniString> tokens;
  SplitUniString(NormalizeAndSimplifyString(name), base::MakeBackInsertFunctor(tokens), Delimiters());
  return GetNameScore(tokens, slice);
}

template <typename Slice>
NameScore GetNameScore(std::vector<strings::UniString> const & tokens, Slice const & slice)
{
  if (slice.Empty())
    return NAME_SCORE_ZERO;
    return {};

  size_t const n = tokens.size();
  size_t const m = slice.Size();

  bool const lastTokenIsPrefix = slice.IsPrefix(m - 1);

  NameScore score = NAME_SCORE_ZERO;
  NameScores scores;
  for (size_t offset = 0; offset + m <= n; ++offset)
  {
    ErrorsMade totalErrorsMade;
    bool match = true;
    for (size_t i = 0; i < m - 1 && match; ++i)
      match = match && impl::FullMatch(slice.Get(i), tokens[offset + i]);
    {
      auto errorsMade = impl::GetErrorsMade(slice.Get(i), tokens[offset + i]);
      match = match && errorsMade.IsValid();
      totalErrorsMade += errorsMade;
    }

    if (!match)
      continue;

    bool const fullMatch = impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]);
    bool const prefixMatch =
        lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]);
    if (!fullMatch && !prefixMatch)
    auto const prefixErrorsMade =
        impl::GetPrefixErrorsMade(slice.Get(m - 1), tokens[offset + m - 1]);
    auto const fullErrorsMade = impl::GetErrorsMade(slice.Get(m - 1), tokens[offset + m - 1]);
    if (!fullErrorsMade.IsValid() && !(prefixErrorsMade.IsValid() && lastTokenIsPrefix))
      continue;

    if (m == n && fullMatch)
      return NAME_SCORE_FULL_MATCH;
    if (m == n && fullErrorsMade.IsValid())
    {
      scores.m_nameScore = NAME_SCORE_FULL_MATCH;
      scores.m_errorsMade = totalErrorsMade + fullErrorsMade;
      return scores;
    }

    if (offset == 0)
      score = std::max(score, NAME_SCORE_PREFIX);

    score = std::max(score, NAME_SCORE_SUBSTRING);
    {
      scores.m_nameScore = std::max(scores.m_nameScore, NAME_SCORE_PREFIX);
      scores.m_errorsMade = totalErrorsMade + prefixErrorsMade;
    }
    else
    {
      scores.m_nameScore = std::max(scores.m_nameScore, NAME_SCORE_SUBSTRING);
      scores.m_errorsMade = totalErrorsMade + prefixErrorsMade;
    }
  }
  return score;
}

string DebugPrint(NameScore score);

// Returns total number of errors that were made during matching
// feature |tokens| by a query - query tokens are in |slice|.
template <typename Slice>
ErrorsMade GetErrorsMade(std::vector<strings::UniString> const & tokens, Slice const & slice)
{
  ErrorsMade totalErrorsMade;

  for (size_t i = 0; i < slice.Size(); ++i)
  {
    ErrorsMade errorsMade;
    slice.Get(i).ForEachSynonym([&](strings::UniString const & s) {
      errorsMade = ErrorsMade::Min(errorsMade, impl::GetMinErrorsMade(tokens, s));
    });

    totalErrorsMade += errorsMade;
  }

  return totalErrorsMade;
  return scores;
}

template <typename Slice>
ErrorsMade GetErrorsMade(std::string const & s, Slice const & slice)
NameScores GetNameScores(std::string const & name, Slice const & slice)
{
  return GetErrorsMade({strings::MakeUniString(s)}, slice);
  std::vector<strings::UniString> tokens;
  SplitUniString(NormalizeAndSimplifyString(name), base::MakeBackInsertFunctor(tokens),
                 Delimiters());
  return GetNameScores(tokens, slice);
}

std::string DebugPrint(NameScore score);
}  // namespace search

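Note: the offset loop in GetNameScores above tries every alignment of the query slice inside the feature's token list, and only the last query token may match as a prefix. Below is a simplified, exact-match-only restatement of that windowing idea (plain std::string tokens; the misprints and synonyms handled by the real code are omitted).

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

enum Score { Zero, Substring, Prefix, FullMatch };

// Simplified windowing: slide the query tokens over the feature-name tokens; the last query
// token may match as a prefix of the aligned name token.
Score NameScoreSketch(std::vector<std::string> const & name, std::vector<std::string> const & query,
                      bool lastTokenIsPrefix)
{
  size_t const n = name.size(), m = query.size();
  if (m == 0)
    return Zero;

  Score best = Zero;
  for (size_t offset = 0; offset + m <= n; ++offset)
  {
    bool match = true;
    for (size_t i = 0; i + 1 < m && match; ++i)
      match = name[offset + i] == query[i];
    if (!match)
      continue;

    std::string const & last = name[offset + m - 1];
    bool const full = last == query[m - 1];
    bool const prefix =
        lastTokenIsPrefix && last.compare(0, query[m - 1].size(), query[m - 1]) == 0;
    if (!full && !prefix)
      continue;

    if (m == n && full)
      return FullMatch;
    best = std::max(best, offset == 0 ? Prefix : Substring);
  }
  return best;
}

int main()
{
  assert(NameScoreSketch({"new", "york"}, {"york"}, false) == Substring);
  assert(NameScoreSketch({"san", "francisco"}, {"san"}, true) == Prefix);
  assert(NameScoreSketch({"moscow"}, {"moscow"}, false) == FullMatch);
  return 0;
}
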
@@ -561,8 +561,9 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade)
  checkErrors("кафе", ErrorsMade());

  checkErrors("Cafe Yesenina", ErrorsMade(0));
  checkErrors("Cafe Esenina", ErrorsMade(1));
  checkErrors("Cafe Jesenina", ErrorsMade(1));
  // We allow only Y->{E, J, I, U} misprints for the first letter.
  checkErrors("Cafe Esenina", ErrorsMade(2));

  checkErrors("Островского кафе", ErrorsMade(0));
  checkErrors("Астровского кафе", ErrorsMade(1));
@@ -1897,9 +1898,9 @@ UNIT_CLASS_TEST(ProcessorTest, ExactMatchTest)
    TEST(ResultsMatch(results, rules), ());

    TEST_EQUAL(2, results.size(), ("Unexpected number of retrieved cafes."));
    TEST(ResultsMatch({results[0]}, {ExactMatch(wonderlandId, cafe)}), ());
    TEST(results[0].GetRankingInfo().m_exactMatch, ());
    TEST(!results[1].GetRankingInfo().m_exactMatch, ());
    TEST(ResultsMatch({results[0]}, {ExactMatch(wonderlandId, lermontov)}), ());
    TEST(!results[0].GetRankingInfo().m_exactMatch, ());
    TEST(results[1].GetRankingInfo().m_exactMatch, ());
  }

  {

@@ -39,7 +39,7 @@ NameScore GetScore(string const & name, string const & query, TokenRange const &
    params.InitNoPrefix(tokens.begin(), tokens.end());
  }

  return GetNameScore(name, TokenSlice(params, tokenRange));
  return GetNameScores(name, TokenSlice(params, tokenRange)).m_nameScore;
}

UNIT_TEST(NameTest_Smoke)
@@ -49,11 +49,15 @@ UNIT_TEST(NameTest_Smoke)
  TEST_EQUAL(GetScore("New York", "York", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
  TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_PREFIX, ());
  TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ());
  TEST_EQUAL(GetScore("Moscow", "Red Square Moscw", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ());
  TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
  TEST_EQUAL(GetScore("San Francisco", "Fran ", TokenRange(0, 1)), NAME_SCORE_ZERO, ());
  TEST_EQUAL(GetScore("San Francisco", "Sa", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
  TEST_EQUAL(GetScore("San Francisco", "San ", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
  TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
  TEST_EQUAL(GetScore("Лермонтовъ", "Лермон", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
  TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH, ());
  TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтово", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH, ());
  TEST_EQUAL(GetScore("Лермонтовъ", "Лермнтовъ", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH, ());
  TEST_EQUAL(GetScore("фото на документы", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
  TEST_EQUAL(GetScore("фотоателье", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
}