[search] Improved locality sorting and scoring.

This commit is contained in:
Yuri Gorshenin 2016-02-24 13:18:45 +03:00 committed by Sergey Yershov
parent 03c15cf7d5
commit 395ea36459
9 changed files with 376 additions and 89 deletions

View file

@ -53,6 +53,7 @@ HEADERS += \
v2/locality_scorer.hpp \
v2/mwm_context.hpp \
v2/rank_table_cache.hpp \
v2/ranking_utils.hpp \
v2/search_model.hpp \
v2/search_query_v2.hpp \
v2/stats_cache.hpp \
@ -92,6 +93,7 @@ SOURCES += \
v2/locality_scorer.cpp \
v2/mwm_context.cpp \
v2/rank_table_cache.cpp \
v2/ranking_utils.cpp \
v2/search_model.cpp \
v2/search_query_v2.cpp \
v2/street_vicinity_loader.cpp \

View file

@ -1,6 +1,5 @@
#include "testing/testing.hpp"
#include "search/dummy_rank_table.hpp"
#include "search/v2/locality_scorer.hpp"
#include "indexer/search_delimiters.hpp"
@ -13,6 +12,7 @@
#include "std/algorithm.hpp"
#include "std/set.hpp"
#include "std/unordered_map.hpp"
#include "std/vector.hpp"
using namespace search::v2;
@ -79,10 +79,10 @@ void AddLocality(string const & name, uint32_t featureId, SearchQueryParams & pa
}
}
class LocalityScorerTest
class LocalityScorerTest : public LocalityScorer::Delegate
{
public:
LocalityScorerTest() : m_scorer(m_table, m_params) {}
LocalityScorerTest() : m_scorer(m_params, static_cast<LocalityScorer::Delegate &>(*this)) {}
void InitParams(string const & query, bool lastTokenIsPrefix)
{
@ -92,18 +92,29 @@ public:
void AddLocality(string const & name, uint32_t featureId)
{
::AddLocality(name, featureId, m_params, m_localities);
m_names[featureId].push_back(name);
}
void LeaveTopLocalities(size_t limit)
void GetTopLocalities(size_t limit)
{
m_scorer.LeaveTopLocalities(limit, m_localities);
m_scorer.GetTopLocalities(limit, m_localities);
sort(m_localities.begin(), m_localities.end(), my::CompareBy(&Geocoder::Locality::m_featureId));
}
// LocalityScorer::Delegate overrides:
void GetNames(uint32_t featureId, vector<string> & names) const override
{
auto it = m_names.find(featureId);
if (it != m_names.end())
names.insert(names.end(), it->second.begin(), it->second.end());
}
uint8_t GetRank(uint32_t featureId) const override { return 0; }
protected:
DummyRankTable m_table;
SearchQueryParams m_params;
vector<Geocoder::Locality> m_localities;
unordered_map<uint32_t, vector<string>> m_names;
LocalityScorer m_scorer;
};
} // namespace
@ -123,14 +134,14 @@ UNIT_CLASS_TEST(LocalityScorerTest, Smoke)
AddLocality("York", ID_YORK);
AddLocality("New York", ID_NEW_YORK);
LeaveTopLocalities(100 /* limit */);
GetTopLocalities(100 /* limit */);
TEST_EQUAL(3, m_localities.size(), ());
TEST_EQUAL(m_localities[0].m_featureId, ID_NEW_ORLEANS, ());
TEST_EQUAL(m_localities[1].m_featureId, ID_YORK, ());
TEST_EQUAL(m_localities[2].m_featureId, ID_NEW_YORK, ());
// New York is the best matching locality
LeaveTopLocalities(1 /* limit */);
GetTopLocalities(1 /* limit */);
TEST_EQUAL(1, m_localities.size(), ());
TEST_EQUAL(m_localities[0].m_featureId, ID_NEW_YORK, ());
}
@ -152,7 +163,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, NumbersMatch)
AddLocality("поселок 1 мая", ID_MAY);
AddLocality("тверь", ID_TVER);
LeaveTopLocalities(100 /* limit */);
GetTopLocalities(100 /* limit */);
TEST_EQUAL(4, m_localities.size(), ());
TEST_EQUAL(m_localities[0].m_featureId, ID_MARCH, ());
TEST_EQUAL(m_localities[1].m_featureId, ID_APRIL, ());
@ -161,7 +172,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, NumbersMatch)
// Tver is the best matching locality, as other localities were
// matched by number.
LeaveTopLocalities(1 /* limit */);
GetTopLocalities(1 /* limit */);
TEST_EQUAL(1, m_localities.size(), ());
TEST_EQUAL(m_localities[0].m_featureId, ID_TVER, ());
}
@ -182,7 +193,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, NumbersComplexMatch)
// "May 1" contains a numeric token, but as it was matched by at
// least two tokens, there is no penalty for numeric token. And, as
// it has smaller featureId, it should be left.
LeaveTopLocalities(1 /* limit */);
GetTopLocalities(1 /* limit */);
TEST_EQUAL(1, m_localities.size(), ());
TEST_EQUAL(m_localities[0].m_featureId, ID_MAY, ());
}
@ -198,7 +209,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
};
// SearchQueryParams params;
InitParams("New York San Anto", true /*lastTokenIsPrefix */);
InitParams("New York San Anto", true /* lastTokenIsPrefix */);
// vector<Geocoder::Locality> localities;
AddLocality("San Antonio", ID_SAN_ANTONIO);
@ -207,7 +218,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
AddLocality("Moscow", ID_MOSCOW);
// All localities except Moscow match to the search query.
LeaveTopLocalities(100 /* limit */);
GetTopLocalities(100 /* limit */);
TEST_EQUAL(3, m_localities.size(), ());
TEST_EQUAL(m_localities[0].m_featureId, ID_SAN_ANTONIO, ());
TEST_EQUAL(m_localities[1].m_featureId, ID_NEW_YORK, ());
@ -216,7 +227,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
// New York and San Antonio are better than York, because they match
// by two tokens (second token is prefix for San Antonio), whereas
// York matches by only one token.
LeaveTopLocalities(2 /* limit */);
GetTopLocalities(2 /* limit */);
TEST_EQUAL(2, m_localities.size(), ());
TEST_EQUAL(m_localities[0].m_featureId, ID_SAN_ANTONIO, ());
TEST_EQUAL(m_localities[1].m_featureId, ID_NEW_YORK, ());
@ -224,7 +235,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
// New York is a better than San Antonio because it matches by two
// full tokens whereas San Antonio matches by one full token and by
// one prefix token.
LeaveTopLocalities(1 /* limit */);
GetTopLocalities(1 /* limit */);
TEST_EQUAL(1, m_localities.size(), ());
TEST_EQUAL(m_localities[0].m_featureId, ID_NEW_YORK, ());
}

View file

@ -0,0 +1,47 @@
#include "testing/testing.hpp"
#include "search/search_query_params.hpp"
#include "search/v2/ranking_utils.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "base/string_utils.hpp"
#include "std/cstdint.hpp"
#include "std/string.hpp"
using namespace search;
using namespace search::v2;
using namespace strings;
namespace
{
// Test helper: tokenizes |query| the way the search engine does and returns
// the score of |name| against the query tokens in [startToken, endToken).
//
// When |query| does not end with a delimiter, its last token is treated as
// an unfinished (prefix) token rather than as a complete one.
NameScore GetScore(string const & name, string const & query, size_t startToken, size_t endToken)
{
  SearchQueryParams queryParams;
  search::Delimiters isDelimiter;

  SplitUniString(NormalizeAndSimplifyString(query),
                 [&queryParams](UniString const & token)
                 {
                   queryParams.m_tokens.push_back({token});
                 },
                 isDelimiter);

  // The last character is inspected only when at least one token was
  // produced, exactly as the short-circuit evaluation requires.
  bool const hasTokens = !queryParams.m_tokens.empty();
  if (hasTokens && !isDelimiter(strings::LastUniChar(query)))
  {
    // Move the trailing token from the list of full tokens to the prefix slot.
    queryParams.m_prefixTokens.swap(queryParams.m_tokens.back());
    queryParams.m_tokens.pop_back();
  }

  return GetNameScore(name, queryParams, startToken, endToken);
}
// Smoke test for GetNameScore(): covers every NameScore value except for a
// range that is empty. The last two arguments of GetScore() select the
// half-open range of query tokens that is matched against the name.
UNIT_TEST(NameTest_Smoke)
{
// Query tokens [2, 4) are "new york": same number of tokens as the name, all equal.
TEST_EQUAL(GetScore("New York", "Central Park, New York, US", 2, 4), NAME_SCORE_FULL_MATCH, ());
// A single query token equal to one of the two name tokens: only a part of the name is covered.
TEST_EQUAL(GetScore("New York", "York", 0, 1), NAME_SCORE_SUBSTRING, ());
// The query does not end with a delimiter, so "Mosc" is a prefix token that "moscow" starts with.
TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", 2, 3), NAME_SCORE_FULL_MATCH_PREFIX, ());
// The trailing token is still a prefix token, but it equals the whole name token.
TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", 2, 3), NAME_SCORE_FULL_MATCH, ());
// Prefix token "Fran" matches the beginning of the second name token only.
TEST_EQUAL(GetScore("San Francisco", "Fran", 0, 1), NAME_SCORE_SUBSTRING_PREFIX, ());
// The trailing space turns "Fran" into a complete token, which equals no name token.
TEST_EQUAL(GetScore("San Francisco", "Fran ", 0, 1), NAME_SCORE_ZERO, ());
}
} // namespace

View file

@ -26,6 +26,7 @@ SOURCES += \
locality_finder_test.cpp \
locality_scorer_test.cpp \
query_saver_tests.cpp \
ranking_tests.cpp \
string_intersection_test.cpp \
string_match_test.cpp \

View file

@ -141,6 +141,38 @@ class LazyRankTable : public RankTable
mutable unique_ptr<search::RankTable> m_table;
};
// Adapter that lets LocalityScorer read feature names and ranks through an
// MwmContext.
//
// Only a reference to |context| is stored, so the context must outlive the
// delegate.
class LocalityScorerDelegate : public LocalityScorer::Delegate
{
public:
  // |explicit| prevents an MwmContext from being silently converted to a
  // delegate (and a temporary delegate from being created by accident).
  explicit LocalityScorerDelegate(MwmContext const & context)
    : m_context(context), m_ranks(m_context.m_value)
  {
  }

  // LocalityScorer::Delegate overrides:

  // Appends to |names| the names of feature |featureId| for the "en",
  // "int_name" and "default" languages, in that order, skipping languages
  // for which FeatureType::GetName() reports no name. |names| is not
  // cleared, and equal names coming from different languages are not
  // deduplicated.
  void GetNames(uint32_t featureId, vector<string> & names) const override
  {
    static vector<int8_t> const kLangs = {StringUtf8Multilang::GetLangIndex("en"),
                                          StringUtf8Multilang::GetLangIndex("int_name"),
                                          StringUtf8Multilang::GetLangIndex("default")};

    FeatureType ft;
    m_context.GetFeature(featureId, ft);
    for (auto const & lang : kLangs)
    {
      string name;
      if (ft.GetName(lang, name))
        names.push_back(name);
    }
  }

  // Returns the rank stored for |featureId| in the rank table.
  uint8_t GetRank(uint32_t featureId) const override { return m_ranks.Get(featureId); }

private:
  MwmContext const & m_context;

  // Declared after |m_context|: its initializer reads m_context.m_value.
  LazyRankTable m_ranks;
};
class StreetCategories
{
public:
@ -661,9 +693,9 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
}
}
LazyRankTable rankTable(m_context->m_value);
LocalityScorer scorer(rankTable, m_params);
scorer.LeaveTopLocalities(maxNumLocalities, preLocalities);
LocalityScorerDelegate delegate(*m_context);
LocalityScorer scorer(m_params, delegate);
scorer.GetTopLocalities(maxNumLocalities, preLocalities);
}
void Geocoder::FillLocalitiesTable()
@ -1381,8 +1413,8 @@ size_t Geocoder::SkipUsedTokens(size_t curToken) const
string DebugPrint(Geocoder::Locality const & locality)
{
ostringstream os;
os << "Locality [" << DebugPrint(locality.m_countryId) << ", " << locality.m_featureId << ", "
<< locality.m_startToken << ", " << locality.m_endToken << "]";
os << "Locality [" << DebugPrint(locality.m_countryId) << ", featureId=" << locality.m_featureId
<< ", startToken=" << locality.m_startToken << ", endToken=" << locality.m_endToken << "]";
return os.str();
}
} // namespace v2

View file

@ -1,84 +1,136 @@
#include "search/v2/locality_scorer.hpp"
#include "search/dummy_rank_table.hpp"
#include "search/search_query_params.hpp"
#include "search/v2/mwm_context.hpp"
#include "indexer/feature_impl.hpp"
#include "indexer/index.hpp"
#include "indexer/rank_table.hpp"
#include "std/algorithm.hpp"
#include "std/unique_ptr.hpp"
namespace search
{
namespace v2
{
LocalityScorer::LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params)
: m_rankTable(rankTable), m_params(params)
namespace
{
const size_t kDefaultReadLimit = 50;
// Returns true when |score| says that the whole name was covered by the
// query tokens, either exactly or with the last token matched as a prefix.
bool IsAlmostFullMatch(NameScore score)
{
  switch (score)
  {
  case NAME_SCORE_FULL_MATCH_PREFIX:
  case NAME_SCORE_FULL_MATCH:
    return true;
  default:
    return false;
  }
}
} // namespace
// LocalityScorer::ExLocality ----------------------------------------------------------------------
// Creates an entry with a default-constructed locality, no matched tokens,
// zero rank and zero name score.
LocalityScorer::ExLocality::ExLocality()
  : m_numTokens(0)
  , m_rank(0)
  , m_nameScore(NAME_SCORE_ZERO)
{
}
void LocalityScorer::LeaveTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const
LocalityScorer::ExLocality::ExLocality(Geocoder::Locality const & locality)
: m_locality(locality)
, m_numTokens(locality.m_endToken - locality.m_startToken)
, m_rank(0)
, m_nameScore(NAME_SCORE_ZERO)
{
// Unique localities by featureId but leave the longest range if equal.
sort(localities.begin(), localities.end(), [&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
{
if (lhs.m_featureId != rhs.m_featureId)
return lhs.m_featureId < rhs.m_featureId;
return GetTokensScore(lhs) > GetTokensScore(rhs);
});
localities.erase(unique(localities.begin(), localities.end(),
[](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
{
return lhs.m_featureId == rhs.m_featureId;
}),
localities.end());
// Leave the most popular localities.
/// @todo Calculate match costs according to the exact locality name
/// (for 'york' query "york city" is better than "new york").
sort(localities.begin(), localities.end(),
[&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
{
auto const ls = GetTokensScore(lhs);
auto const rs = GetTokensScore(rhs);
if (ls != rs)
return ls > rs;
return m_rankTable.Get(lhs.m_featureId) > m_rankTable.Get(rhs.m_featureId);
});
if (localities.size() > limit)
localities.resize(limit);
}
size_t LocalityScorer::GetTokensScore(Geocoder::Locality const & locality) const
// LocalityScorer ----------------------------------------------------------------------------------
LocalityScorer::LocalityScorer(SearchQueryParams const & params, Delegate const & delegate)
: m_params(params), m_delegate(delegate)
{
// *NOTE*
// * full token match costs 2
// * prefix match costs 1
//
// If locality is matched only by a single integral token or by an
// integral token + a prefix, overall score is reduced by one.
//
// TODO (@y, @m, @vng): consider to loop over all non-prefix
// tokens and decrement overall score by one for each integral
// token.
size_t const numTokens = locality.m_endToken - locality.m_startToken;
bool const prefixMatch = locality.m_endToken == m_params.m_tokens.size() + 1;
}
size_t score = 2 * numTokens;
if (prefixMatch)
--score;
void LocalityScorer::GetTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const
{
vector<ExLocality> ls;
ls.reserve(localities.size());
for (auto const & locality : localities)
ls.emplace_back(locality);
if ((numTokens == 2 && prefixMatch) || (numTokens == 1 && !prefixMatch))
RemoveDuplicates(ls);
LeaveTopByRank(std::max(limit, kDefaultReadLimit), ls);
SortByName(ls);
if (ls.size() > limit)
ls.resize(limit);
localities.clear();
localities.reserve(ls.size());
for (auto const & l : ls)
localities.push_back(l.m_locality);
}
// Leaves a single entry per feature id in |ls|. Among entries that share an
// id, one with the largest number of matched tokens is kept. On return |ls|
// is ordered by ascending feature id.
void LocalityScorer::RemoveDuplicates(vector<ExLocality> & ls) const
{
  // Groups equal ids together and puts the longest token range first in
  // each group, so that unique() below keeps exactly that one.
  auto const byIdThenLongestRange = [](ExLocality const & lhs, ExLocality const & rhs)
  {
    if (lhs.GetId() == rhs.GetId())
      return lhs.m_numTokens > rhs.m_numTokens;
    return lhs.GetId() < rhs.GetId();
  };

  auto const sameFeature = [](ExLocality const & lhs, ExLocality const & rhs)
  {
    return lhs.GetId() == rhs.GetId();
  };

  sort(ls.begin(), ls.end(), byIdThenLongestRange);

  auto const newEnd = unique(ls.begin(), ls.end(), sameFeature);
  ls.erase(newEnd, ls.end());
}
// Shrinks |ls| to at most |limit| entries, preferring higher ranks and, for
// equal ranks, larger numbers of matched tokens.
//
// When |ls| already fits into |limit|, nothing is done at all: ranks are not
// requested from the delegate, so every m_rank keeps the zero assigned by
// the ExLocality constructors, and the order of |ls| is left untouched.
void LocalityScorer::LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const
{
  if (ls.size() <= limit)
    return;

  for (auto & candidate : ls)
    candidate.m_rank = m_delegate.GetRank(candidate.GetId());

  auto const higherRankFirst = [](ExLocality const & lhs, ExLocality const & rhs)
  {
    if (lhs.m_rank == rhs.m_rank)
      return lhs.m_numTokens > rhs.m_numTokens;
    return lhs.m_rank > rhs.m_rank;
  };
  sort(ls.begin(), ls.end(), higherRankFirst);

  ls.resize(limit);
}
void LocalityScorer::SortByName(vector<ExLocality> & ls) const
{
vector<string> names;
for (auto & l : ls)
{
auto const & token = m_params.GetTokens(locality.m_startToken).front();
if (feature::IsNumber(token))
--score;
names.clear();
m_delegate.GetNames(l.GetId(), names);
auto score = NAME_SCORE_ZERO;
for (auto const & name : names)
{
score = max(score,
GetNameScore(name, m_params, l.m_locality.m_startToken, l.m_locality.m_endToken));
}
l.m_nameScore = score;
}
return score;
sort(ls.begin(), ls.end(), [](ExLocality const & lhs, ExLocality const & rhs)
{
if (IsAlmostFullMatch(lhs.m_nameScore) && IsAlmostFullMatch(rhs.m_nameScore))
{
// When both localities match well, e.g. full or full prefix
// match, the one with larger number of tokens is selected. In
// case of tie, the one with better score is selected.
if (lhs.m_numTokens != rhs.m_numTokens)
return lhs.m_numTokens > rhs.m_numTokens;
if (lhs.m_nameScore != rhs.m_nameScore)
return lhs.m_nameScore > rhs.m_nameScore;
}
else
{
// When name scores differ, the one with better name score is
// selected. In case of tie, the one with larger number of
// matched tokens is selected.
if (lhs.m_nameScore != rhs.m_nameScore)
return lhs.m_nameScore > rhs.m_nameScore;
if (lhs.m_numTokens != rhs.m_numTokens)
return lhs.m_numTokens > rhs.m_numTokens;
}
// Okay, in case of tie we select the one with better rank. This
// is a quite arbitrary decision and definitely may be improved.
return lhs.m_rank > rhs.m_rank;
});
}
} // namespace v2
} // namespace search

View file

@ -1,12 +1,13 @@
#pragma once
#include "search/v2/geocoder.hpp"
#include "search/v2/ranking_utils.hpp"
#include "std/string.hpp"
#include "std/vector.hpp"
namespace search
{
class RankTable;
struct SearchQueryParams;
namespace v2
@ -14,18 +15,41 @@ namespace v2
class LocalityScorer
{
public:
LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params);
class Delegate
{
public:
virtual ~Delegate() = default;
// After the call there will be no more than |limit| unique elements
// in |localities|, in descending order by number of matched tokens
// and ranks.
void LeaveTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const;
virtual void GetNames(uint32_t featureId, vector<string> & names) const = 0;
virtual uint8_t GetRank(uint32_t featureId) const = 0;
};
LocalityScorer(SearchQueryParams const & params, Delegate const & delegate);
// Leaves at most |limit| elements of |localities|, ordered by some
// combination of ranks and number of matched tokens.
void GetTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const;
private:
size_t GetTokensScore(Geocoder::Locality const & locality) const;
struct ExLocality
{
ExLocality();
explicit ExLocality(Geocoder::Locality const & locality);
inline uint32_t GetId() const { return m_locality.m_featureId; }
Geocoder::Locality m_locality;
size_t m_numTokens;
uint8_t m_rank;
NameScore m_nameScore;
};
void RemoveDuplicates(vector<ExLocality> & ls) const;
void LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const;
void SortByName(vector<ExLocality> & ls) const;
RankTable const & m_rankTable;
SearchQueryParams const & m_params;
Delegate const & m_delegate;
};
} // namespace v2
} // namespace search

View file

@ -0,0 +1,90 @@
#include "search/v2/ranking_utils.hpp"
#include "search/search_query_params.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "base/stl_add.hpp"
#include "base/string_utils.hpp"
#include "std/algorithm.hpp"
#include "std/vector.hpp"
using namespace strings;
namespace search
{
namespace v2
{
namespace
{
// Returns true when |token| is equal to at least one element of |tokens|.
bool Match(vector<UniString> const & tokens, UniString const & token)
{
  for (auto const & candidate : tokens)
  {
    if (candidate == token)
      return true;
  }
  return false;
}
// Returns true when |token| starts with at least one element of |prefixes|.
bool PrefixMatch(vector<UniString> const & prefixes, UniString const & token)
{
  return std::any_of(prefixes.begin(), prefixes.end(), [&token](UniString const & prefix)
                     {
                       return StartsWith(token, prefix);
                     });
}
} // namespace
// Scores how well the query tokens in the half-open range
// [startToken, endToken) of |params| match |name|.
//
// |name| is normalized and split into tokens, then the query range is slid
// over every run of consecutive name tokens of the same length. All tokens
// of the range but the last must be equal to the corresponding name tokens
// (any synonym returned by params.GetTokens() counts). The last token of the
// range may additionally match as a prefix when it is the trailing
// (unfinished) token of the query, i.e. when |endToken| is
// params.m_tokens.size() + 1.
//
// Returns:
// * NAME_SCORE_ZERO when the range is empty or nothing matches;
// * NAME_SCORE_FULL_MATCH / NAME_SCORE_FULL_MATCH_PREFIX when the range has
//   as many tokens as the name and matches exactly / with a prefix at the end;
// * NAME_SCORE_SUBSTRING / NAME_SCORE_SUBSTRING_PREFIX when only a part of
//   the name tokens is covered, exactly / with a prefix at the end.
NameScore GetNameScore(string const & name, SearchQueryParams const & params, size_t startToken,
                       size_t endToken)
{
  if (startToken >= endToken)
    return NAME_SCORE_ZERO;

  vector<UniString> tokens;
  SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());

  size_t const n = tokens.size();
  size_t const m = endToken - startToken;

  bool const lastTokenIsPrefix = (endToken == params.m_tokens.size() + 1);

  NameScore score = NAME_SCORE_ZERO;

  // Indices are size_t, like the sizes they are compared with, to avoid
  // mixing signed and unsigned arithmetic. When m > n the loop body is never
  // entered and the zero score is returned.
  for (size_t offset = 0; offset + m <= n; ++offset)
  {
    // All tokens of the range except the last one must match exactly.
    bool match = true;
    for (size_t i = 0; i + 1 < m && match; ++i)
      match = Match(params.GetTokens(startToken + i), tokens[offset + i]);
    if (!match)
      continue;

    auto const & lastQueryTokens = params.GetTokens(endToken - 1);
    auto const & lastNameToken = tokens[offset + m - 1];

    if (Match(lastQueryTokens, lastNameToken))
    {
      // m == n is possible for offset == 0 only, so nothing better can be found.
      if (m == n)
        return NAME_SCORE_FULL_MATCH;
      score = max(score, NAME_SCORE_SUBSTRING);
    }
    if (lastTokenIsPrefix && PrefixMatch(lastQueryTokens, lastNameToken))
    {
      if (m == n)
        return NAME_SCORE_FULL_MATCH_PREFIX;
      score = max(score, NAME_SCORE_SUBSTRING_PREFIX);
    }
  }
  return score;
}
// Returns a human-readable label for |score|; values that are not
// enumerators of NameScore are printed as "Unknown".
string DebugPrint(NameScore score)
{
  char const * label = "Unknown";
  switch (score)
  {
  case NAME_SCORE_ZERO:
    label = "Zero";
    break;
  case NAME_SCORE_SUBSTRING_PREFIX:
    label = "Substring Prefix";
    break;
  case NAME_SCORE_SUBSTRING:
    label = "Substring";
    break;
  case NAME_SCORE_FULL_MATCH_PREFIX:
    label = "Full Match Prefix";
    break;
  case NAME_SCORE_FULL_MATCH:
    label = "Full Match";
    break;
  }
  return label;
}
} // namespace v2
} // namespace search

View file

@ -0,0 +1,28 @@
#pragma once
#include "std/cstdint.hpp"
#include "std/string.hpp"
namespace search
{
struct SearchQueryParams;
namespace v2
{
// Quality of a match between a range of query tokens and a feature name,
// as computed by GetNameScore(). A larger value means a better match.
//
// The order and numeric values are important here. Please, check all
// use-cases before changing this enum.
enum NameScore
{
// The token range is empty or does not match the name at all.
NAME_SCORE_ZERO = 0,
// The range covers only a part of the name tokens and its last token
// matches as a prefix.
NAME_SCORE_SUBSTRING_PREFIX = 1,
// The range covers only a part of the name tokens, all tokens are equal.
NAME_SCORE_SUBSTRING = 2,
// The range has as many tokens as the name and its last token matches as
// a prefix.
NAME_SCORE_FULL_MATCH_PREFIX = 3,
// The range has as many tokens as the name, all tokens are equal.
NAME_SCORE_FULL_MATCH = 4
};
// Returns the score of the match between |name| and the tokens of |params|
// with indices in the half-open range [startToken, endToken). An empty
// range (startToken >= endToken) yields NAME_SCORE_ZERO.
NameScore GetNameScore(string const & name, SearchQueryParams const & params, size_t startToken,
size_t endToken);
string DebugPrint(NameScore score);
} // namespace v2
} // namespace search