forked from organicmaps/organicmaps
[search] Improved locality sorting and scoring.
This commit is contained in:
parent
03c15cf7d5
commit
395ea36459
9 changed files with 376 additions and 89 deletions
|
@ -53,6 +53,7 @@ HEADERS += \
|
|||
v2/locality_scorer.hpp \
|
||||
v2/mwm_context.hpp \
|
||||
v2/rank_table_cache.hpp \
|
||||
v2/ranking_utils.hpp \
|
||||
v2/search_model.hpp \
|
||||
v2/search_query_v2.hpp \
|
||||
v2/stats_cache.hpp \
|
||||
|
@ -92,6 +93,7 @@ SOURCES += \
|
|||
v2/locality_scorer.cpp \
|
||||
v2/mwm_context.cpp \
|
||||
v2/rank_table_cache.cpp \
|
||||
v2/ranking_utils.cpp \
|
||||
v2/search_model.cpp \
|
||||
v2/search_query_v2.cpp \
|
||||
v2/street_vicinity_loader.cpp \
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#include "testing/testing.hpp"
|
||||
|
||||
#include "search/dummy_rank_table.hpp"
|
||||
#include "search/v2/locality_scorer.hpp"
|
||||
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
|
@ -13,6 +12,7 @@
|
|||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/set.hpp"
|
||||
#include "std/unordered_map.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
using namespace search::v2;
|
||||
|
@ -79,10 +79,10 @@ void AddLocality(string const & name, uint32_t featureId, SearchQueryParams & pa
|
|||
}
|
||||
}
|
||||
|
||||
class LocalityScorerTest
|
||||
class LocalityScorerTest : public LocalityScorer::Delegate
|
||||
{
|
||||
public:
|
||||
LocalityScorerTest() : m_scorer(m_table, m_params) {}
|
||||
LocalityScorerTest() : m_scorer(m_params, static_cast<LocalityScorer::Delegate &>(*this)) {}
|
||||
|
||||
void InitParams(string const & query, bool lastTokenIsPrefix)
|
||||
{
|
||||
|
@ -92,18 +92,29 @@ public:
|
|||
void AddLocality(string const & name, uint32_t featureId)
|
||||
{
|
||||
::AddLocality(name, featureId, m_params, m_localities);
|
||||
m_names[featureId].push_back(name);
|
||||
}
|
||||
|
||||
void LeaveTopLocalities(size_t limit)
|
||||
void GetTopLocalities(size_t limit)
|
||||
{
|
||||
m_scorer.LeaveTopLocalities(limit, m_localities);
|
||||
m_scorer.GetTopLocalities(limit, m_localities);
|
||||
sort(m_localities.begin(), m_localities.end(), my::CompareBy(&Geocoder::Locality::m_featureId));
|
||||
}
|
||||
|
||||
// LocalityScorer::Delegate overrides:
|
||||
void GetNames(uint32_t featureId, vector<string> & names) const override
|
||||
{
|
||||
auto it = m_names.find(featureId);
|
||||
if (it != m_names.end())
|
||||
names.insert(names.end(), it->second.begin(), it->second.end());
|
||||
}
|
||||
|
||||
uint8_t GetRank(uint32_t featureId) const override { return 0; }
|
||||
|
||||
protected:
|
||||
DummyRankTable m_table;
|
||||
SearchQueryParams m_params;
|
||||
vector<Geocoder::Locality> m_localities;
|
||||
unordered_map<uint32_t, vector<string>> m_names;
|
||||
LocalityScorer m_scorer;
|
||||
};
|
||||
} // namespace
|
||||
|
@ -123,14 +134,14 @@ UNIT_CLASS_TEST(LocalityScorerTest, Smoke)
|
|||
AddLocality("York", ID_YORK);
|
||||
AddLocality("New York", ID_NEW_YORK);
|
||||
|
||||
LeaveTopLocalities(100 /* limit */);
|
||||
GetTopLocalities(100 /* limit */);
|
||||
TEST_EQUAL(3, m_localities.size(), ());
|
||||
TEST_EQUAL(m_localities[0].m_featureId, ID_NEW_ORLEANS, ());
|
||||
TEST_EQUAL(m_localities[1].m_featureId, ID_YORK, ());
|
||||
TEST_EQUAL(m_localities[2].m_featureId, ID_NEW_YORK, ());
|
||||
|
||||
// New York is the best matching locality
|
||||
LeaveTopLocalities(1 /* limit */);
|
||||
GetTopLocalities(1 /* limit */);
|
||||
TEST_EQUAL(1, m_localities.size(), ());
|
||||
TEST_EQUAL(m_localities[0].m_featureId, ID_NEW_YORK, ());
|
||||
}
|
||||
|
@ -152,7 +163,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, NumbersMatch)
|
|||
AddLocality("поселок 1 мая", ID_MAY);
|
||||
AddLocality("тверь", ID_TVER);
|
||||
|
||||
LeaveTopLocalities(100 /* limit */);
|
||||
GetTopLocalities(100 /* limit */);
|
||||
TEST_EQUAL(4, m_localities.size(), ());
|
||||
TEST_EQUAL(m_localities[0].m_featureId, ID_MARCH, ());
|
||||
TEST_EQUAL(m_localities[1].m_featureId, ID_APRIL, ());
|
||||
|
@ -161,7 +172,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, NumbersMatch)
|
|||
|
||||
// Tver is the best matching locality, as other localities were
|
||||
// matched by number.
|
||||
LeaveTopLocalities(1 /* limit */);
|
||||
GetTopLocalities(1 /* limit */);
|
||||
TEST_EQUAL(1, m_localities.size(), ());
|
||||
TEST_EQUAL(m_localities[0].m_featureId, ID_TVER, ());
|
||||
}
|
||||
|
@ -182,7 +193,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, NumbersComplexMatch)
|
|||
// "May 1" contains a numeric token, but as it was matched by at
|
||||
// least two tokens, there is no penalty for numeric token. And, as
|
||||
// it has smaller featureId, it should be left.
|
||||
LeaveTopLocalities(1 /* limit */);
|
||||
GetTopLocalities(1 /* limit */);
|
||||
TEST_EQUAL(1, m_localities.size(), ());
|
||||
TEST_EQUAL(m_localities[0].m_featureId, ID_MAY, ());
|
||||
}
|
||||
|
@ -198,7 +209,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
|
|||
};
|
||||
|
||||
// SearchQueryParams params;
|
||||
InitParams("New York San Anto", true /*lastTokenIsPrefix */);
|
||||
InitParams("New York San Anto", true /* lastTokenIsPrefix */);
|
||||
|
||||
// vector<Geocoder::Locality> localities;
|
||||
AddLocality("San Antonio", ID_SAN_ANTONIO);
|
||||
|
@ -207,7 +218,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
|
|||
AddLocality("Moscow", ID_MOSCOW);
|
||||
|
||||
// All localities except Moscow match to the search query.
|
||||
LeaveTopLocalities(100 /* limit */);
|
||||
GetTopLocalities(100 /* limit */);
|
||||
TEST_EQUAL(3, m_localities.size(), ());
|
||||
TEST_EQUAL(m_localities[0].m_featureId, ID_SAN_ANTONIO, ());
|
||||
TEST_EQUAL(m_localities[1].m_featureId, ID_NEW_YORK, ());
|
||||
|
@ -216,7 +227,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
|
|||
// New York and San Antonio are better than York, because they match
|
||||
// by two tokens (second token is prefix for San Antonio), whereas
|
||||
// York matches by only one token.
|
||||
LeaveTopLocalities(2 /* limit */);
|
||||
GetTopLocalities(2 /* limit */);
|
||||
TEST_EQUAL(2, m_localities.size(), ());
|
||||
TEST_EQUAL(m_localities[0].m_featureId, ID_SAN_ANTONIO, ());
|
||||
TEST_EQUAL(m_localities[1].m_featureId, ID_NEW_YORK, ());
|
||||
|
@ -224,7 +235,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
|
|||
// New York is a better than San Antonio because it matches by two
|
||||
// full tokens whereas San Antonio matches by one full token and by
|
||||
// one prefix token.
|
||||
LeaveTopLocalities(1 /* limit */);
|
||||
GetTopLocalities(1 /* limit */);
|
||||
TEST_EQUAL(1, m_localities.size(), ());
|
||||
TEST_EQUAL(m_localities[0].m_featureId, ID_NEW_YORK, ());
|
||||
}
|
||||
|
|
47
search/search_tests/ranking_tests.cpp
Normal file
47
search/search_tests/ranking_tests.cpp
Normal file
|
@ -0,0 +1,47 @@
|
|||
#include "testing/testing.hpp"
|
||||
|
||||
#include "search/search_query_params.hpp"
|
||||
#include "search/v2/ranking_utils.hpp"
|
||||
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/cstdint.hpp"
|
||||
#include "std/string.hpp"
|
||||
|
||||
using namespace search;
|
||||
using namespace search::v2;
|
||||
using namespace strings;
|
||||
|
||||
namespace
|
||||
{
|
||||
NameScore GetScore(string const & name, string const & query, size_t startToken, size_t endToken)
|
||||
{
|
||||
search::Delimiters delims;
|
||||
SearchQueryParams params;
|
||||
auto addToken = [¶ms](UniString const & token)
|
||||
{
|
||||
params.m_tokens.push_back({token});
|
||||
};
|
||||
|
||||
SplitUniString(NormalizeAndSimplifyString(query), addToken, delims);
|
||||
if (!params.m_tokens.empty() && !delims(strings::LastUniChar(query)))
|
||||
{
|
||||
params.m_prefixTokens.swap(params.m_tokens.back());
|
||||
params.m_tokens.pop_back();
|
||||
}
|
||||
return GetNameScore(name, params, startToken, endToken);
|
||||
}
|
||||
|
||||
UNIT_TEST(NameTest_Smoke)
|
||||
{
|
||||
TEST_EQUAL(GetScore("New York", "Central Park, New York, US", 2, 4), NAME_SCORE_FULL_MATCH, ());
|
||||
TEST_EQUAL(GetScore("New York", "York", 0, 1), NAME_SCORE_SUBSTRING, ());
|
||||
TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", 2, 3), NAME_SCORE_FULL_MATCH_PREFIX, ());
|
||||
TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", 2, 3), NAME_SCORE_FULL_MATCH, ());
|
||||
TEST_EQUAL(GetScore("San Francisco", "Fran", 0, 1), NAME_SCORE_SUBSTRING_PREFIX, ());
|
||||
TEST_EQUAL(GetScore("San Francisco", "Fran ", 0, 1), NAME_SCORE_ZERO, ());
|
||||
}
|
||||
} // namespace
|
|
@ -26,6 +26,7 @@ SOURCES += \
|
|||
locality_finder_test.cpp \
|
||||
locality_scorer_test.cpp \
|
||||
query_saver_tests.cpp \
|
||||
ranking_tests.cpp \
|
||||
string_intersection_test.cpp \
|
||||
string_match_test.cpp \
|
||||
|
||||
|
|
|
@ -141,6 +141,38 @@ class LazyRankTable : public RankTable
|
|||
mutable unique_ptr<search::RankTable> m_table;
|
||||
};
|
||||
|
||||
class LocalityScorerDelegate : public LocalityScorer::Delegate
|
||||
{
|
||||
public:
|
||||
LocalityScorerDelegate(MwmContext const & context)
|
||||
: m_context(context), m_ranks(m_context.m_value)
|
||||
{
|
||||
}
|
||||
|
||||
// LocalityScorer::Delegate overrides:
|
||||
void GetNames(uint32_t featureId, vector<string> & names) const override
|
||||
{
|
||||
static vector<int8_t> const kLangs = {StringUtf8Multilang::GetLangIndex("en"),
|
||||
StringUtf8Multilang::GetLangIndex("int_name"),
|
||||
StringUtf8Multilang::GetLangIndex("default")};
|
||||
|
||||
FeatureType ft;
|
||||
m_context.GetFeature(featureId, ft);
|
||||
for (auto const & lang : kLangs)
|
||||
{
|
||||
string name;
|
||||
if (ft.GetName(lang, name))
|
||||
names.push_back(name);
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t GetRank(uint32_t featureId) const override { return m_ranks.Get(featureId); }
|
||||
|
||||
private:
|
||||
MwmContext const & m_context;
|
||||
LazyRankTable m_ranks;
|
||||
};
|
||||
|
||||
class StreetCategories
|
||||
{
|
||||
public:
|
||||
|
@ -661,9 +693,9 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
|
|||
}
|
||||
}
|
||||
|
||||
LazyRankTable rankTable(m_context->m_value);
|
||||
LocalityScorer scorer(rankTable, m_params);
|
||||
scorer.LeaveTopLocalities(maxNumLocalities, preLocalities);
|
||||
LocalityScorerDelegate delegate(*m_context);
|
||||
LocalityScorer scorer(m_params, delegate);
|
||||
scorer.GetTopLocalities(maxNumLocalities, preLocalities);
|
||||
}
|
||||
|
||||
void Geocoder::FillLocalitiesTable()
|
||||
|
@ -1381,8 +1413,8 @@ size_t Geocoder::SkipUsedTokens(size_t curToken) const
|
|||
// Returns a string describing |locality|: its country id followed by
// the labelled feature id and the matched token range.
string DebugPrint(Geocoder::Locality const & locality)
{
  ostringstream os;
  os << "Locality [" << DebugPrint(locality.m_countryId) << ", featureId=" << locality.m_featureId
     << ", startToken=" << locality.m_startToken << ", endToken=" << locality.m_endToken << "]";
  return os.str();
}
|
||||
} // namespace v2
|
||||
|
|
|
@ -1,84 +1,136 @@
|
|||
#include "search/v2/locality_scorer.hpp"
|
||||
|
||||
#include "search/dummy_rank_table.hpp"
|
||||
#include "search/search_query_params.hpp"
|
||||
#include "search/v2/mwm_context.hpp"
|
||||
|
||||
#include "indexer/feature_impl.hpp"
|
||||
#include "indexer/index.hpp"
|
||||
#include "indexer/rank_table.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/unique_ptr.hpp"
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace v2
|
||||
{
|
||||
LocalityScorer::LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params)
|
||||
: m_rankTable(rankTable), m_params(params)
|
||||
namespace
|
||||
{
|
||||
const size_t kDefaultReadLimit = 50;
|
||||
|
||||
bool IsAlmostFullMatch(NameScore score)
|
||||
{
|
||||
return score == NAME_SCORE_FULL_MATCH_PREFIX || score == NAME_SCORE_FULL_MATCH;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// LocalityScorer::ExLocality ----------------------------------------------------------------------
|
||||
LocalityScorer::ExLocality::ExLocality() : m_numTokens(0), m_rank(0), m_nameScore(NAME_SCORE_ZERO)
|
||||
{
|
||||
}
|
||||
|
||||
void LocalityScorer::LeaveTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const
|
||||
LocalityScorer::ExLocality::ExLocality(Geocoder::Locality const & locality)
|
||||
: m_locality(locality)
|
||||
, m_numTokens(locality.m_endToken - locality.m_startToken)
|
||||
, m_rank(0)
|
||||
, m_nameScore(NAME_SCORE_ZERO)
|
||||
{
|
||||
// Unique localities by featureId but leave the longest range if equal.
|
||||
sort(localities.begin(), localities.end(), [&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
|
||||
{
|
||||
if (lhs.m_featureId != rhs.m_featureId)
|
||||
return lhs.m_featureId < rhs.m_featureId;
|
||||
return GetTokensScore(lhs) > GetTokensScore(rhs);
|
||||
});
|
||||
localities.erase(unique(localities.begin(), localities.end(),
|
||||
[](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
|
||||
{
|
||||
return lhs.m_featureId == rhs.m_featureId;
|
||||
}),
|
||||
localities.end());
|
||||
|
||||
// Leave the most popular localities.
|
||||
/// @todo Calculate match costs according to the exact locality name
|
||||
/// (for 'york' query "york city" is better than "new york").
|
||||
sort(localities.begin(), localities.end(),
|
||||
[&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
|
||||
{
|
||||
auto const ls = GetTokensScore(lhs);
|
||||
auto const rs = GetTokensScore(rhs);
|
||||
if (ls != rs)
|
||||
return ls > rs;
|
||||
return m_rankTable.Get(lhs.m_featureId) > m_rankTable.Get(rhs.m_featureId);
|
||||
});
|
||||
if (localities.size() > limit)
|
||||
localities.resize(limit);
|
||||
}
|
||||
|
||||
size_t LocalityScorer::GetTokensScore(Geocoder::Locality const & locality) const
|
||||
// LocalityScorer ----------------------------------------------------------------------------------
|
||||
LocalityScorer::LocalityScorer(SearchQueryParams const & params, Delegate const & delegate)
|
||||
: m_params(params), m_delegate(delegate)
|
||||
{
|
||||
// *NOTE*
|
||||
// * full token match costs 2
|
||||
// * prefix match costs 1
|
||||
//
|
||||
// If locality is matched only by a single integral token or by an
|
||||
// integral token + a prefix, overall score is reduced by one.
|
||||
//
|
||||
// TODO (@y, @m, @vng): consider to loop over all non-prefix
|
||||
// tokens and decrement overall score by one for each integral
|
||||
// token.
|
||||
size_t const numTokens = locality.m_endToken - locality.m_startToken;
|
||||
bool const prefixMatch = locality.m_endToken == m_params.m_tokens.size() + 1;
|
||||
}
|
||||
|
||||
size_t score = 2 * numTokens;
|
||||
if (prefixMatch)
|
||||
--score;
|
||||
void LocalityScorer::GetTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const
|
||||
{
|
||||
vector<ExLocality> ls;
|
||||
ls.reserve(localities.size());
|
||||
for (auto const & locality : localities)
|
||||
ls.emplace_back(locality);
|
||||
|
||||
if ((numTokens == 2 && prefixMatch) || (numTokens == 1 && !prefixMatch))
|
||||
RemoveDuplicates(ls);
|
||||
LeaveTopByRank(std::max(limit, kDefaultReadLimit), ls);
|
||||
SortByName(ls);
|
||||
if (ls.size() > limit)
|
||||
ls.resize(limit);
|
||||
|
||||
localities.clear();
|
||||
localities.reserve(ls.size());
|
||||
for (auto const & l : ls)
|
||||
localities.push_back(l.m_locality);
|
||||
}
|
||||
|
||||
// Leaves a single entry per feature id in |ls|. As a side effect,
// |ls| becomes sorted by feature id.
void LocalityScorer::RemoveDuplicates(vector<ExLocality> & ls) const
{
  // Groups entries by feature id; inside a group the entry with more
  // matched tokens goes first, so it is the one unique() keeps.
  auto const byIdThenLongest = [](ExLocality const & a, ExLocality const & b)
  {
    if (a.GetId() == b.GetId())
      return a.m_numTokens > b.m_numTokens;
    return a.GetId() < b.GetId();
  };
  auto const sameFeature = [](ExLocality const & a, ExLocality const & b)
  {
    return a.GetId() == b.GetId();
  };

  sort(ls.begin(), ls.end(), byIdThenLongest);
  auto const newEnd = unique(ls.begin(), ls.end(), sameFeature);
  ls.erase(newEnd, ls.end());
}
|
||||
|
||||
void LocalityScorer::LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const
|
||||
{
|
||||
if (ls.size() <= limit)
|
||||
return;
|
||||
|
||||
for (auto & l : ls)
|
||||
l.m_rank = m_delegate.GetRank(l.GetId());
|
||||
|
||||
sort(ls.begin(), ls.end(), [](ExLocality const & lhs, ExLocality const & rhs)
|
||||
{
|
||||
if (lhs.m_rank != rhs.m_rank)
|
||||
return lhs.m_rank > rhs.m_rank;
|
||||
return lhs.m_numTokens > rhs.m_numTokens;
|
||||
});
|
||||
ls.resize(limit);
|
||||
}
|
||||
|
||||
// Computes |m_nameScore| for every entry of |ls| as the best score
// among all names returned by the delegate, then sorts |ls| so that
// better matching entries go first.
void LocalityScorer::SortByName(vector<ExLocality> & ls) const
{
  // Reused between iterations to avoid reallocations.
  vector<string> names;
  for (auto & l : ls)
  {
    names.clear();
    m_delegate.GetNames(l.GetId(), names);

    auto score = NAME_SCORE_ZERO;
    for (auto const & name : names)
    {
      score = max(score,
                  GetNameScore(name, m_params, l.m_locality.m_startToken, l.m_locality.m_endToken));
    }
    l.m_nameScore = score;
  }

  sort(ls.begin(), ls.end(), [](ExLocality const & lhs, ExLocality const & rhs)
  {
    if (IsAlmostFullMatch(lhs.m_nameScore) && IsAlmostFullMatch(rhs.m_nameScore))
    {
      // When both localities match well, e.g. full or full prefix
      // match, the one with larger number of tokens is selected. In
      // case of tie, the one with better score is selected.
      if (lhs.m_numTokens != rhs.m_numTokens)
        return lhs.m_numTokens > rhs.m_numTokens;
      if (lhs.m_nameScore != rhs.m_nameScore)
        return lhs.m_nameScore > rhs.m_nameScore;
    }
    else
    {
      // When name scores differ, the one with better name score is
      // selected. In case of tie, the one with larger number of
      // matched tokens is selected.
      if (lhs.m_nameScore != rhs.m_nameScore)
        return lhs.m_nameScore > rhs.m_nameScore;
      if (lhs.m_numTokens != rhs.m_numTokens)
        return lhs.m_numTokens > rhs.m_numTokens;
    }

    // Okay, in case of tie we select the one with better rank. This
    // is a quite arbitrary decision and definitely may be improved.
    return lhs.m_rank > rhs.m_rank;
  });
}
|
||||
} // namespace v2
|
||||
} // namespace search
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
#pragma once
|
||||
|
||||
#include "search/v2/geocoder.hpp"
|
||||
#include "search/v2/ranking_utils.hpp"
|
||||
|
||||
#include "std/string.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
namespace search
|
||||
{
|
||||
class RankTable;
|
||||
struct SearchQueryParams;
|
||||
|
||||
namespace v2
|
||||
|
@ -14,18 +15,41 @@ namespace v2
|
|||
class LocalityScorer
|
||||
{
|
||||
public:
|
||||
LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params);
|
||||
class Delegate
|
||||
{
|
||||
public:
|
||||
virtual ~Delegate() = default;
|
||||
|
||||
// After the call there will be no more than |limit| unique elements
|
||||
// in |localities|, in descending order by number of matched tokens
|
||||
// and ranks.
|
||||
void LeaveTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const;
|
||||
virtual void GetNames(uint32_t featureId, vector<string> & names) const = 0;
|
||||
virtual uint8_t GetRank(uint32_t featureId) const = 0;
|
||||
};
|
||||
|
||||
LocalityScorer(SearchQueryParams const & params, Delegate const & delegate);
|
||||
|
||||
// Leaves at most |limit| elements of |localities|, ordered by some
|
||||
// combination of ranks and number of matched tokens.
|
||||
void GetTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const;
|
||||
|
||||
private:
|
||||
size_t GetTokensScore(Geocoder::Locality const & locality) const;
|
||||
struct ExLocality
|
||||
{
|
||||
ExLocality();
|
||||
explicit ExLocality(Geocoder::Locality const & locality);
|
||||
|
||||
inline uint32_t GetId() const { return m_locality.m_featureId; }
|
||||
|
||||
Geocoder::Locality m_locality;
|
||||
size_t m_numTokens;
|
||||
uint8_t m_rank;
|
||||
NameScore m_nameScore;
|
||||
};
|
||||
|
||||
void RemoveDuplicates(vector<ExLocality> & ls) const;
|
||||
void LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const;
|
||||
void SortByName(vector<ExLocality> & ls) const;
|
||||
|
||||
RankTable const & m_rankTable;
|
||||
SearchQueryParams const & m_params;
|
||||
Delegate const & m_delegate;
|
||||
};
|
||||
} // namespace v2
|
||||
} // namespace search
|
||||
|
|
90
search/v2/ranking_utils.cpp
Normal file
90
search/v2/ranking_utils.cpp
Normal file
|
@ -0,0 +1,90 @@
|
|||
#include "search/v2/ranking_utils.hpp"
|
||||
|
||||
#include "search/search_query_params.hpp"
|
||||
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "base/stl_add.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
using namespace strings;
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace v2
|
||||
{
|
||||
namespace
|
||||
{
|
||||
bool Match(vector<UniString> const & tokens, UniString const & token)
|
||||
{
|
||||
return find(tokens.begin(), tokens.end(), token) != tokens.end();
|
||||
}
|
||||
|
||||
bool PrefixMatch(vector<UniString> const & prefixes, UniString const & token)
|
||||
{
|
||||
for (auto const & prefix : prefixes)
|
||||
{
|
||||
if (StartsWith(token, prefix))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Returns the score of matching the query tokens from the range
// [startToken, endToken) of |params| against the tokens of |name|.
//
// The query range is tried at every offset inside the tokenized name:
// all tokens of the range but the last must match name tokens
// exactly, while the last one may match either exactly or, when it is
// the prefix token of the query, as a prefix. A match that covers all
// name tokens is a full match, otherwise it is a substring match.
// Returns NAME_SCORE_ZERO for an empty range or when nothing matches.
NameScore GetNameScore(string const & name, SearchQueryParams const & params, size_t startToken,
                       size_t endToken)
{
  if (startToken >= endToken)
    return NAME_SCORE_ZERO;

  vector<UniString> tokens;
  SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());

  size_t const n = tokens.size();
  size_t const m = endToken - startToken;

  // The prefix token, if any, has the index equal to the number of
  // complete tokens.
  bool const lastTokenIsPrefix = (endToken == params.m_tokens.size() + 1);

  auto const & lastQueryTokens = params.GetTokens(endToken - 1);

  NameScore score = NAME_SCORE_ZERO;
  for (size_t offset = 0; offset + m <= n; ++offset)
  {
    bool match = true;
    for (size_t i = 0; i + 1 < m && match; ++i)
      match = Match(params.GetTokens(startToken + i), tokens[offset + i]);
    if (!match)
      continue;

    auto const & lastNameToken = tokens[offset + m - 1];
    if (Match(lastQueryTokens, lastNameToken))
    {
      if (m == n)
        return NAME_SCORE_FULL_MATCH;
      score = max(score, NAME_SCORE_SUBSTRING);
    }
    if (lastTokenIsPrefix && PrefixMatch(lastQueryTokens, lastNameToken))
    {
      if (m == n)
        return NAME_SCORE_FULL_MATCH_PREFIX;
      score = max(score, NAME_SCORE_SUBSTRING_PREFIX);
    }
  }
  return score;
}
|
||||
|
||||
// Returns a human-readable name of |score|, or "Unknown" for a value
// that is not one of the NameScore enumerators.
string DebugPrint(NameScore score)
{
  char const * text = "Unknown";
  switch (score)
  {
  case NAME_SCORE_ZERO:
    text = "Zero";
    break;
  case NAME_SCORE_SUBSTRING_PREFIX:
    text = "Substring Prefix";
    break;
  case NAME_SCORE_SUBSTRING:
    text = "Substring";
    break;
  case NAME_SCORE_FULL_MATCH_PREFIX:
    text = "Full Match Prefix";
    break;
  case NAME_SCORE_FULL_MATCH:
    text = "Full Match";
    break;
  }
  return text;
}
|
||||
} // namespace v2
|
||||
} // namespace search
|
28
search/v2/ranking_utils.hpp
Normal file
28
search/v2/ranking_utils.hpp
Normal file
|
@ -0,0 +1,28 @@
|
|||
#pragma once
|
||||
|
||||
#include "std/cstdint.hpp"
|
||||
#include "std/string.hpp"
|
||||
|
||||
namespace search
|
||||
{
|
||||
struct SearchQueryParams;
|
||||
|
||||
namespace v2
|
||||
{
|
||||
// Quality of a match between a range of query tokens and a name,
// from the worst to the best.
//
// Both the order and the numeric values of the enumerators matter:
// scores are compared with each other. Please, check all use-cases
// before changing this enum.
enum NameScore
{
  // Nothing matched.
  NAME_SCORE_ZERO = 0,
  // A part of the name matched, the last token matched as a prefix.
  NAME_SCORE_SUBSTRING_PREFIX = 1,
  // A part of the name matched, all tokens matched exactly.
  NAME_SCORE_SUBSTRING = 2,
  // The whole name matched, the last token matched as a prefix.
  NAME_SCORE_FULL_MATCH_PREFIX = 3,
  // The whole name matched, all tokens matched exactly.
  NAME_SCORE_FULL_MATCH = 4
};
|
||||
|
||||
NameScore GetNameScore(string const & name, SearchQueryParams const & params, size_t startToken,
|
||||
size_t endToken);
|
||||
|
||||
string DebugPrint(NameScore score);
|
||||
} // namespace v2
|
||||
} // namespace search
|
Loading…
Add table
Reference in a new issue