diff --git a/search/search.pro b/search/search.pro
index 8d45f03d1a..83fbce598c 100644
--- a/search/search.pro
+++ b/search/search.pro
@@ -53,6 +53,7 @@ HEADERS += \
   v2/locality_scorer.hpp \
   v2/mwm_context.hpp \
   v2/rank_table_cache.hpp \
+  v2/ranking_utils.hpp \
   v2/search_model.hpp \
   v2/search_query_v2.hpp \
   v2/stats_cache.hpp \
@@ -92,6 +93,7 @@ SOURCES += \
   v2/locality_scorer.cpp \
   v2/mwm_context.cpp \
   v2/rank_table_cache.cpp \
+  v2/ranking_utils.cpp \
   v2/search_model.cpp \
   v2/search_query_v2.cpp \
   v2/street_vicinity_loader.cpp \
diff --git a/search/search_tests/locality_scorer_test.cpp b/search/search_tests/locality_scorer_test.cpp
index 537f3df82b..1372c8de90 100644
--- a/search/search_tests/locality_scorer_test.cpp
+++ b/search/search_tests/locality_scorer_test.cpp
@@ -1,6 +1,5 @@
 #include "testing/testing.hpp"
 
-#include "search/dummy_rank_table.hpp"
 #include "search/v2/locality_scorer.hpp"
 
 #include "indexer/search_delimiters.hpp"
@@ -13,6 +12,7 @@
 
 #include "std/algorithm.hpp"
 #include "std/set.hpp"
+#include "std/unordered_map.hpp"
 #include "std/vector.hpp"
 
 using namespace search::v2;
@@ -79,10 +79,12 @@ void AddLocality(string const & name, uint32_t featureId, SearchQueryParams & pa
   }
 }
 
-class LocalityScorerTest
+// Test fixture that also acts as the scorer's delegate: names come
+// from the in-memory |m_names| table and every feature has rank 0.
+class LocalityScorerTest : public LocalityScorer::Delegate
 {
 public:
-  LocalityScorerTest() : m_scorer(m_table, m_params) {}
+  LocalityScorerTest() : m_scorer(m_params, static_cast<LocalityScorer::Delegate &>(*this)) {}
 
   void InitParams(string const & query, bool lastTokenIsPrefix)
   {
@@ -92,18 +94,31 @@ public:
   void AddLocality(string const & name, uint32_t featureId)
   {
     ::AddLocality(name, featureId, m_params, m_localities);
+    m_names[featureId].push_back(name);
   }
 
-  void LeaveTopLocalities(size_t limit)
+  void GetTopLocalities(size_t limit)
   {
-    m_scorer.LeaveTopLocalities(limit, m_localities);
+    m_scorer.GetTopLocalities(limit, m_localities);
     sort(m_localities.begin(), m_localities.end(), my::CompareBy(&Geocoder::Locality::m_featureId));
   }
 
+  // LocalityScorer::Delegate overrides:
+  // Appends the names registered for |featureId| by AddLocality(), if any.
+  void GetNames(uint32_t featureId, vector<string> & names) const override
+  {
+    auto it = m_names.find(featureId);
+    if (it != m_names.end())
+      names.insert(names.end(), it->second.begin(), it->second.end());
+  }
+
+  // All test localities share the same rank, so ranks never decide the order.
+  uint8_t GetRank(uint32_t /* featureId */) const override { return 0; }
+
 protected:
-  DummyRankTable m_table;
   SearchQueryParams m_params;
   vector<Geocoder::Locality> m_localities;
+  unordered_map<uint32_t, vector<string>> m_names;
   LocalityScorer m_scorer;
 };
 }  // namespace
@@ -123,14 +138,14 @@ UNIT_CLASS_TEST(LocalityScorerTest, Smoke)
   AddLocality("York", ID_YORK);
   AddLocality("New York", ID_NEW_YORK);
 
-  LeaveTopLocalities(100 /* limit */);
+  GetTopLocalities(100 /* limit */);
   TEST_EQUAL(3, m_localities.size(), ());
   TEST_EQUAL(m_localities[0].m_featureId, ID_NEW_ORLEANS, ());
   TEST_EQUAL(m_localities[1].m_featureId, ID_YORK, ());
   TEST_EQUAL(m_localities[2].m_featureId, ID_NEW_YORK, ());
 
   // New York is the best matching locality
-  LeaveTopLocalities(1 /* limit */);
+  GetTopLocalities(1 /* limit */);
   TEST_EQUAL(1, m_localities.size(), ());
   TEST_EQUAL(m_localities[0].m_featureId, ID_NEW_YORK, ());
 }
@@ -152,7 +167,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, NumbersMatch)
   AddLocality("поселок 1 мая", ID_MAY);
   AddLocality("тверь", ID_TVER);
 
-  LeaveTopLocalities(100 /* limit */);
+  GetTopLocalities(100 /* limit */);
   TEST_EQUAL(4, m_localities.size(), ());
   TEST_EQUAL(m_localities[0].m_featureId, ID_MARCH, ());
   TEST_EQUAL(m_localities[1].m_featureId, ID_APRIL, ());
@@ -161,7 +176,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, NumbersMatch)
 
   // Tver is the best matching locality, as other localities were
   // matched by number.
-  LeaveTopLocalities(1 /* limit */);
+  GetTopLocalities(1 /* limit */);
   TEST_EQUAL(1, m_localities.size(), ());
   TEST_EQUAL(m_localities[0].m_featureId, ID_TVER, ());
 }
@@ -182,7 +197,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, NumbersComplexMatch)
   // "May 1" contains a numeric token, but as it was matched by at
   // least two tokens, there is no penalty for numeric token.  And, as
   // it has smaller featureId, it should be left.
-  LeaveTopLocalities(1 /* limit */);
+  GetTopLocalities(1 /* limit */);
   TEST_EQUAL(1, m_localities.size(), ());
   TEST_EQUAL(m_localities[0].m_featureId, ID_MAY, ());
 }
@@ -198,7 +213,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
   };
 
   // SearchQueryParams params;
-  InitParams("New York San Anto", true /*lastTokenIsPrefix */);
+  InitParams("New York San Anto", true /* lastTokenIsPrefix */);
 
   // vector<Geocoder::Locality> localities;
   AddLocality("San Antonio", ID_SAN_ANTONIO);
@@ -207,7 +222,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
   AddLocality("Moscow", ID_MOSCOW);
 
   // All localities except Moscow match to the search query.
-  LeaveTopLocalities(100 /* limit */);
+  GetTopLocalities(100 /* limit */);
   TEST_EQUAL(3, m_localities.size(), ());
   TEST_EQUAL(m_localities[0].m_featureId, ID_SAN_ANTONIO, ());
   TEST_EQUAL(m_localities[1].m_featureId, ID_NEW_YORK, ());
@@ -216,7 +231,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
   // New York and San Antonio are better than York, because they match
   // by two tokens (second token is prefix for San Antonio), whereas
   // York matches by only one token.
-  LeaveTopLocalities(2 /* limit */);
+  GetTopLocalities(2 /* limit */);
   TEST_EQUAL(2, m_localities.size(), ());
   TEST_EQUAL(m_localities[0].m_featureId, ID_SAN_ANTONIO, ());
   TEST_EQUAL(m_localities[1].m_featureId, ID_NEW_YORK, ());
@@ -224,7 +239,7 @@ UNIT_CLASS_TEST(LocalityScorerTest, PrefixMatch)
   // New York is a better than San Antonio because it matches by two
   // full tokens whereas San Antonio matches by one full token and by
   // one prefix token.
-  LeaveTopLocalities(1 /* limit */);
+  GetTopLocalities(1 /* limit */);
   TEST_EQUAL(1, m_localities.size(), ());
   TEST_EQUAL(m_localities[0].m_featureId, ID_NEW_YORK, ());
 }
diff --git a/search/search_tests/ranking_tests.cpp b/search/search_tests/ranking_tests.cpp
new file mode 100644
index 0000000000..605900dfbb
--- /dev/null
+++ b/search/search_tests/ranking_tests.cpp
@@ -0,0 +1,54 @@
+#include "testing/testing.hpp"
+
+#include "search/search_query_params.hpp"
+#include "search/v2/ranking_utils.hpp"
+
+#include "indexer/search_delimiters.hpp"
+#include "indexer/search_string_utils.hpp"
+
+#include "base/string_utils.hpp"
+
+#include "std/cstdint.hpp"
+#include "std/string.hpp"
+
+using namespace search;
+using namespace search::v2;
+using namespace strings;
+
+namespace
+{
+// Splits |query| into normalized tokens and scores |name| against
+// the tokens in [startToken, endToken).  If |query| does not end
+// with a delimiter, its last token becomes the prefix token.
+NameScore GetScore(string const & name, string const & query, size_t startToken, size_t endToken)
+{
+  search::Delimiters delims;
+  SearchQueryParams params;
+  auto addToken = [&params](UniString const & token)
+  {
+    params.m_tokens.push_back({token});
+  };
+
+  SplitUniString(NormalizeAndSimplifyString(query), addToken, delims);
+  if (!params.m_tokens.empty() && !delims(strings::LastUniChar(query)))
+  {
+    params.m_prefixTokens.swap(params.m_tokens.back());
+    params.m_tokens.pop_back();
+  }
+  return GetNameScore(name, params, startToken, endToken);
+}
+
+UNIT_TEST(NameTest_Smoke)
+{
+  TEST_EQUAL(GetScore("New York", "Central Park, New York, US", 2, 4), NAME_SCORE_FULL_MATCH, ());
+  TEST_EQUAL(GetScore("New York", "York", 0, 1), NAME_SCORE_SUBSTRING, ());
+  TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", 2, 3), NAME_SCORE_FULL_MATCH_PREFIX, ());
+  TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", 2, 3), NAME_SCORE_FULL_MATCH, ());
+  TEST_EQUAL(GetScore("San Francisco", "Fran", 0, 1), NAME_SCORE_SUBSTRING_PREFIX, ());
+  TEST_EQUAL(GetScore("San Francisco", "Fran ", 0, 1), NAME_SCORE_ZERO, ());
+  // An empty token range never matches.
+  TEST_EQUAL(GetScore("Moscow", "Moscow ", 0, 0), NAME_SCORE_ZERO, ());
+  // A token range longer than the name cannot match either.
+  TEST_EQUAL(GetScore("York", "New York ", 0, 2), NAME_SCORE_ZERO, ());
+}
+}  // namespace
diff --git a/search/search_tests/search_tests.pro b/search/search_tests/search_tests.pro
index 0ed91bb1ad..ef8f542b4a 100644
--- a/search/search_tests/search_tests.pro
+++ b/search/search_tests/search_tests.pro
@@ -26,6 +26,7 @@ SOURCES += \
   locality_finder_test.cpp \
   locality_scorer_test.cpp \
   query_saver_tests.cpp \
+  ranking_tests.cpp \
   string_intersection_test.cpp \
   string_match_test.cpp \
 
diff --git a/search/v2/geocoder.cpp b/search/v2/geocoder.cpp
index 4249f9e181..452e368368 100644
--- a/search/v2/geocoder.cpp
+++ b/search/v2/geocoder.cpp
@@ -141,6 +141,42 @@ class LazyRankTable : public RankTable
   mutable unique_ptr<search::RankTable> m_table;
 };
 
+// Feeds LocalityScorer with feature names read through |context| and
+// with ranks read from a LazyRankTable built on |context.m_value|.
+class LocalityScorerDelegate : public LocalityScorer::Delegate
+{
+public:
+  explicit LocalityScorerDelegate(MwmContext const & context)
+    : m_context(context), m_ranks(m_context.m_value)
+  {
+  }
+
+  // LocalityScorer::Delegate overrides:
+  // Appends the "en", "int_name" and "default" names of the feature
+  // to |names|, skipping languages the feature has no name in.
+  void GetNames(uint32_t featureId, vector<string> & names) const override
+  {
+    static vector<int8_t> const kLangs = {StringUtf8Multilang::GetLangIndex("en"),
+                                          StringUtf8Multilang::GetLangIndex("int_name"),
+                                          StringUtf8Multilang::GetLangIndex("default")};
+
+    FeatureType ft;
+    m_context.GetFeature(featureId, ft);
+    for (auto const & lang : kLangs)
+    {
+      string name;
+      if (ft.GetName(lang, name))
+        names.push_back(std::move(name));
+    }
+  }
+
+  uint8_t GetRank(uint32_t featureId) const override { return m_ranks.Get(featureId); }
+
+private:
+  MwmContext const & m_context;
+  LazyRankTable m_ranks;
+};
+
 class StreetCategories
 {
 public:
@@ -661,9 +697,9 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
     }
   }
 
-  LazyRankTable rankTable(m_context->m_value);
-  LocalityScorer scorer(rankTable, m_params);
-  scorer.LeaveTopLocalities(maxNumLocalities, preLocalities);
+  LocalityScorerDelegate delegate(*m_context);
+  LocalityScorer scorer(m_params, delegate);
+  scorer.GetTopLocalities(maxNumLocalities, preLocalities);
 }
 
 void Geocoder::FillLocalitiesTable()
@@ -1381,8 +1417,8 @@ size_t Geocoder::SkipUsedTokens(size_t curToken) const
 string DebugPrint(Geocoder::Locality const & locality)
 {
   ostringstream os;
-  os << "Locality [" << DebugPrint(locality.m_countryId) << ", " << locality.m_featureId << ", "
-     << locality.m_startToken << ", " << locality.m_endToken << "]";
+  os << "Locality [" << DebugPrint(locality.m_countryId) << ", featureId=" << locality.m_featureId
+     << ", startToken=" << locality.m_startToken << ", endToken=" << locality.m_endToken << "]";
   return os.str();
 }
 }  // namespace v2
diff --git a/search/v2/locality_scorer.cpp b/search/v2/locality_scorer.cpp
index 6d1b71c42b..49c02ac564 100644
--- a/search/v2/locality_scorer.cpp
+++ b/search/v2/locality_scorer.cpp
@@ -1,84 +1,151 @@
 #include "search/v2/locality_scorer.hpp"
 
-#include "search/dummy_rank_table.hpp"
-#include "search/search_query_params.hpp"
-#include "search/v2/mwm_context.hpp"
-
-#include "indexer/feature_impl.hpp"
-#include "indexer/index.hpp"
-#include "indexer/rank_table.hpp"
-
 #include "std/algorithm.hpp"
-#include "std/unique_ptr.hpp"
 
 namespace search
 {
 namespace v2
 {
-LocalityScorer::LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params)
-  : m_rankTable(rankTable), m_params(params)
+namespace
+{
+// Number of top-ranked candidates that are always kept for name
+// scoring, even when the caller asks for fewer localities.
+size_t const kDefaultReadLimit = 50;
+
+// Returns true when the whole name was matched, possibly with the
+// last token matched as a prefix.
+bool IsAlmostFullMatch(NameScore score)
+{
+  return score == NAME_SCORE_FULL_MATCH_PREFIX || score == NAME_SCORE_FULL_MATCH;
+}
+}  // namespace
+
+// LocalityScorer::ExLocality ----------------------------------------------------------------------
+LocalityScorer::ExLocality::ExLocality() : m_numTokens(0), m_rank(0), m_nameScore(NAME_SCORE_ZERO)
 {
 }
 
-void LocalityScorer::LeaveTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const
+LocalityScorer::ExLocality::ExLocality(Geocoder::Locality const & locality)
+  : m_locality(locality)
+  , m_numTokens(locality.m_endToken - locality.m_startToken)
+  , m_rank(0)
+  , m_nameScore(NAME_SCORE_ZERO)
 {
-  // Unique localities by featureId but leave the longest range if equal.
-  sort(localities.begin(), localities.end(), [&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
-       {
-         if (lhs.m_featureId != rhs.m_featureId)
-           return lhs.m_featureId < rhs.m_featureId;
-         return GetTokensScore(lhs) > GetTokensScore(rhs);
-       });
-  localities.erase(unique(localities.begin(), localities.end(),
-                          [](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
-                          {
-                            return lhs.m_featureId == rhs.m_featureId;
-                          }),
-                   localities.end());
-
-  // Leave the most popular localities.
-  /// @todo Calculate match costs according to the exact locality name
-  /// (for 'york' query "york city" is better than "new york").
-  sort(localities.begin(), localities.end(),
-       [&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
-       {
-         auto const ls = GetTokensScore(lhs);
-         auto const rs = GetTokensScore(rhs);
-         if (ls != rs)
-           return ls > rs;
-         return m_rankTable.Get(lhs.m_featureId) > m_rankTable.Get(rhs.m_featureId);
-       });
-  if (localities.size() > limit)
-    localities.resize(limit);
 }
 
-size_t LocalityScorer::GetTokensScore(Geocoder::Locality const & locality) const
+// LocalityScorer ----------------------------------------------------------------------------------
+LocalityScorer::LocalityScorer(SearchQueryParams const & params, Delegate const & delegate)
+  : m_params(params), m_delegate(delegate)
 {
-  // *NOTE*
-  // * full token match costs 2
-  // * prefix match costs 1
-  //
-  // If locality is matched only by a single integral token or by an
-  // integral token + a prefix, overall score is reduced by one.
-  //
-  // TODO (@y, @m, @vng): consider to loop over all non-prefix
-  // tokens and decrement overall score by one for each integral
-  // token.
-  size_t const numTokens = locality.m_endToken - locality.m_startToken;
-  bool const prefixMatch = locality.m_endToken == m_params.m_tokens.size() + 1;
+}
 
-  size_t score = 2 * numTokens;
-  if (prefixMatch)
-    --score;
+void LocalityScorer::GetTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const
+{
+  vector<ExLocality> ls;
+  ls.reserve(localities.size());
+  for (auto const & locality : localities)
+    ls.emplace_back(locality);
 
-  if ((numTokens == 2 && prefixMatch) || (numTokens == 1 && !prefixMatch))
+  // Drop duplicates, keep the best-ranked candidates, order them by
+  // name match quality and finally cut the result down to |limit|.
+  RemoveDuplicates(ls);
+  LeaveTopByRank(std::max(limit, kDefaultReadLimit), ls);
+  SortByName(ls);
+  if (ls.size() > limit)
+    ls.resize(limit);
+
+  localities.clear();
+  localities.reserve(ls.size());
+  for (auto const & l : ls)
+    localities.push_back(l.m_locality);
+}
+
+void LocalityScorer::RemoveDuplicates(vector<ExLocality> & ls) const
+{
+  sort(ls.begin(), ls.end(), [](ExLocality const & lhs, ExLocality const & rhs)
+       {
+         if (lhs.GetId() != rhs.GetId())
+           return lhs.GetId() < rhs.GetId();
+         return lhs.m_numTokens > rhs.m_numTokens;
+       });
+  ls.erase(unique(ls.begin(), ls.end(),
+                  [](ExLocality const & lhs, ExLocality const & rhs)
+                  {
+                    return lhs.GetId() == rhs.GetId();
+                  }),
+           ls.end());
+}
+
+void LocalityScorer::LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const
+{
+  // Ranks are loaded even when nothing is dropped here, because
+  // SortByName() falls back to |m_rank| to break ties.
+  for (auto & l : ls)
+    l.m_rank = m_delegate.GetRank(l.GetId());
+
+  if (ls.size() <= limit)
+    return;
+
+  sort(ls.begin(), ls.end(), [](ExLocality const & lhs, ExLocality const & rhs)
+       {
+         if (lhs.m_rank != rhs.m_rank)
+           return lhs.m_rank > rhs.m_rank;
+         return lhs.m_numTokens > rhs.m_numTokens;
+       });
+  ls.resize(limit);
+}
+
+// Scores every locality by its best-matching name and sorts |ls|
+// from the best match to the worst one.
+void LocalityScorer::SortByName(vector<ExLocality> & ls) const
+{
+  vector<string> names;
+  for (auto & l : ls)
   {
-    auto const & token = m_params.GetTokens(locality.m_startToken).front();
-    if (feature::IsNumber(token))
-      --score;
+    names.clear();
+    m_delegate.GetNames(l.GetId(), names);
+
+    auto score = NAME_SCORE_ZERO;
+    for (auto const & name : names)
+    {
+      score = max(score,
+                  GetNameScore(name, m_params, l.m_locality.m_startToken, l.m_locality.m_endToken));
+    }
+    l.m_nameScore = score;
   }
 
-  return score;
+  sort(ls.begin(), ls.end(), [](ExLocality const & lhs, ExLocality const & rhs)
+       {
+         if (IsAlmostFullMatch(lhs.m_nameScore) && IsAlmostFullMatch(rhs.m_nameScore))
+         {
+           // When both localities match well, e.g. full or full prefix
+           // match, the one with larger number of tokens is selected.  In
+           // case of tie, the one with better score is selected.
+           if (lhs.m_numTokens != rhs.m_numTokens)
+             return lhs.m_numTokens > rhs.m_numTokens;
+           if (lhs.m_nameScore != rhs.m_nameScore)
+             return lhs.m_nameScore > rhs.m_nameScore;
+         }
+         else
+         {
+           // When name scores differ, the one with better name score is
+           // selected.  In case of tie, the one with larger number of
+           // matched tokens is selected.
+           if (lhs.m_nameScore != rhs.m_nameScore)
+             return lhs.m_nameScore > rhs.m_nameScore;
+           if (lhs.m_numTokens != rhs.m_numTokens)
+             return lhs.m_numTokens > rhs.m_numTokens;
+         }
+
+         // Okay, in case of tie we select the one with better rank.  This
+         // is a quite arbitrary decision and definitely may be improved.
+         if (lhs.m_rank != rhs.m_rank)
+           return lhs.m_rank > rhs.m_rank;
+
+         // Feature ids are unique after RemoveDuplicates(), so this makes
+         // the final order independent of the sort implementation.
+         return lhs.GetId() < rhs.GetId();
+       });
 }
 }  // namespace v2
 }  // namespace search
diff --git a/search/v2/locality_scorer.hpp b/search/v2/locality_scorer.hpp
index a41550bb76..8994849b59 100644
--- a/search/v2/locality_scorer.hpp
+++ b/search/v2/locality_scorer.hpp
@@ -1,12 +1,13 @@
 #pragma once
 
 #include "search/v2/geocoder.hpp"
+#include "search/v2/ranking_utils.hpp"
 
+#include "std/string.hpp"
 #include "std/vector.hpp"
 
 namespace search
 {
-class RankTable;
 struct SearchQueryParams;
 
 namespace v2
@@ -14,18 +15,41 @@ namespace v2
 class LocalityScorer
 {
 public:
-  LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params);
+  class Delegate
+  {
+  public:
+    virtual ~Delegate() = default;
 
-  // After the call there will be no more than |limit| unique elements
-  // in |localities|, in descending order by number of matched tokens
-  // and ranks.
-  void LeaveTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const;
+    virtual void GetNames(uint32_t featureId, vector<string> & names) const = 0;
+    virtual uint8_t GetRank(uint32_t featureId) const = 0;
+  };
+
+  LocalityScorer(SearchQueryParams const & params, Delegate const & delegate);
+
+  // Leaves at most |limit| elements of |localities|, ordered by some
+  // combination of ranks and number of matched tokens.
+  void GetTopLocalities(size_t limit, vector<Geocoder::Locality> & localities) const;
 
 private:
-  size_t GetTokensScore(Geocoder::Locality const & locality) const;
+  struct ExLocality
+  {
+    ExLocality();
+    explicit ExLocality(Geocoder::Locality const & locality);
+
+    inline uint32_t GetId() const { return m_locality.m_featureId; }
+
+    Geocoder::Locality m_locality;
+    size_t m_numTokens;
+    uint8_t m_rank;
+    NameScore m_nameScore;
+  };
+
+  void RemoveDuplicates(vector<ExLocality> & ls) const;
+  void LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const;
+  void SortByName(vector<ExLocality> & ls) const;
 
-  RankTable const & m_rankTable;
   SearchQueryParams const & m_params;
+  Delegate const & m_delegate;
 };
 }  // namespace v2
 }  // namespace search
diff --git a/search/v2/ranking_utils.cpp b/search/v2/ranking_utils.cpp
new file mode 100644
index 0000000000..cb0e741785
--- /dev/null
+++ b/search/v2/ranking_utils.cpp
@@ -0,0 +1,96 @@
+#include "search/v2/ranking_utils.hpp"
+
+#include "search/search_query_params.hpp"
+
+#include "indexer/search_delimiters.hpp"
+#include "indexer/search_string_utils.hpp"
+
+#include "base/stl_add.hpp"
+#include "base/string_utils.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/vector.hpp"
+
+using namespace strings;
+
+namespace search
+{
+namespace v2
+{
+namespace
+{
+// Returns true when |token| is equal to one of |tokens|.
+bool Match(vector<UniString> const & tokens, UniString const & token)
+{
+  return find(tokens.begin(), tokens.end(), token) != tokens.end();
+}
+
+// Returns true when |token| starts with one of |prefixes|.
+bool PrefixMatch(vector<UniString> const & prefixes, UniString const & token)
+{
+  for (auto const & prefix : prefixes)
+  {
+    if (StartsWith(token, prefix))
+      return true;
+  }
+  return false;
+}
+}  // namespace
+
+NameScore GetNameScore(string const & name, SearchQueryParams const & params, size_t startToken,
+                       size_t endToken)
+{
+  if (startToken >= endToken)
+    return NAME_SCORE_ZERO;
+
+  vector<UniString> tokens;
+  SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
+
+  size_t const n = tokens.size();
+  size_t const m = endToken - startToken;
+
+  // A range that ends one past |m_tokens| ends with the query's prefix token.
+  bool const lastTokenIsPrefix = (endToken == params.m_tokens.size() + 1);
+
+  NameScore score = NAME_SCORE_ZERO;
+  // Slide a window of |m| consecutive name tokens over the name.
+  for (size_t offset = 0; offset + m <= n; ++offset)
+  {
+    // All query tokens but the last one must match the name exactly.
+    bool match = true;
+    for (size_t i = 0; i + 1 < m && match; ++i)
+      match = Match(params.GetTokens(startToken + i), tokens[offset + i]);
+    if (!match)
+      continue;
+
+    // The last query token matches exactly or, if it is the prefix token, as a prefix.
+    if (Match(params.GetTokens(endToken - 1), tokens[offset + m - 1]))
+    {
+      if (m == n)
+        return NAME_SCORE_FULL_MATCH;
+      score = max(score, NAME_SCORE_SUBSTRING);
+    }
+    if (lastTokenIsPrefix && PrefixMatch(params.GetTokens(endToken - 1), tokens[offset + m - 1]))
+    {
+      if (m == n)
+        return NAME_SCORE_FULL_MATCH_PREFIX;
+      score = max(score, NAME_SCORE_SUBSTRING_PREFIX);
+    }
+  }
+  return score;
+}
+
+string DebugPrint(NameScore score)
+{
+  switch (score)
+  {
+  case NAME_SCORE_ZERO: return "Zero";
+  case NAME_SCORE_SUBSTRING_PREFIX: return "Substring Prefix";
+  case NAME_SCORE_SUBSTRING: return "Substring";
+  case NAME_SCORE_FULL_MATCH_PREFIX: return "Full Match Prefix";
+  case NAME_SCORE_FULL_MATCH: return "Full Match";
+  }
+  return "Unknown";
+}
+}  // namespace v2
+}  // namespace search
diff --git a/search/v2/ranking_utils.hpp b/search/v2/ranking_utils.hpp
new file mode 100644
index 0000000000..028f120566
--- /dev/null
+++ b/search/v2/ranking_utils.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "std/cstdint.hpp"
+#include "std/string.hpp"
+
+namespace search
+{
+struct SearchQueryParams;
+
+namespace v2
+{
+// The order and numeric values are important here.  Please, check all
+// use-cases before changing this enum.
+enum NameScore
+{
+  NAME_SCORE_ZERO = 0,               // no match
+  NAME_SCORE_SUBSTRING_PREFIX = 1,   // part of the name matched, last token as a prefix
+  NAME_SCORE_SUBSTRING = 2,          // part of the name matched
+  NAME_SCORE_FULL_MATCH_PREFIX = 3,  // whole name matched, last token as a prefix
+  NAME_SCORE_FULL_MATCH = 4          // whole name matched
+};
+
+// Returns how well |name| matches the query tokens of |params| in
+// the range [startToken, endToken).  An empty range always scores
+// NAME_SCORE_ZERO.
+NameScore GetNameScore(string const & name, SearchQueryParams const & params, size_t startToken,
+                       size_t endToken);
+
+// Returns a human-readable name of |score|.
+string DebugPrint(NameScore score);
+}  // namespace v2
+}  // namespace search