From bfa4d51762868e36845be8dcb2b791f20014081e Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Fri, 29 Jan 2016 18:59:38 +0300 Subject: [PATCH] [search] Fixed locality scoring. Score is decreased for localities matched by a number. --- search/search.pro | 2 + search/search_tests/locality_scorer_test.cpp | 129 +++++++++++++++++++ search/search_tests/search_tests.pro | 1 + search/v2/geocoder.cpp | 107 +++++++-------- search/v2/geocoder.hpp | 49 ++++--- search/v2/locality_scorer.cpp | 87 +++++++++++++ search/v2/locality_scorer.hpp | 28 ++++ 7 files changed, 322 insertions(+), 81 deletions(-) create mode 100644 search/search_tests/locality_scorer_test.cpp create mode 100644 search/v2/locality_scorer.cpp create mode 100644 search/v2/locality_scorer.hpp diff --git a/search/search.pro b/search/search.pro index 5ddfd576f7..3b01f26030 100644 --- a/search/search.pro +++ b/search/search.pro @@ -53,6 +53,7 @@ HEADERS += \ v2/house_numbers_matcher.hpp \ v2/house_to_street_table.hpp \ v2/intersection_result.hpp \ + v2/locality_scorer.hpp \ v2/mwm_context.hpp \ v2/rank_table_cache.hpp \ v2/search_model.hpp \ @@ -94,6 +95,7 @@ SOURCES += \ v2/house_numbers_matcher.cpp \ v2/house_to_street_table.cpp \ v2/intersection_result.cpp \ + v2/locality_scorer.cpp \ v2/mwm_context.cpp \ v2/rank_table_cache.cpp \ v2/search_model.cpp \ diff --git a/search/search_tests/locality_scorer_test.cpp b/search/search_tests/locality_scorer_test.cpp new file mode 100644 index 0000000000..b13acc38a2 --- /dev/null +++ b/search/search_tests/locality_scorer_test.cpp @@ -0,0 +1,129 @@ +#include "testing/testing.hpp" + +#include "search/dummy_rank_table.hpp" +#include "search/search_delimiters.hpp" +#include "search/search_string_utils.hpp" +#include "search/v2/locality_scorer.hpp" + +#include "base/string_utils.hpp" + +#include "std/set.hpp" +#include "std/vector.hpp" + +using namespace strings; +using namespace search; +using namespace search::v2; + +namespace +{ +void InitParams(string const & query, SearchQueryParams & params) +{ + params.m_tokens.clear(); + params.m_prefixTokens.clear(); + + vector tokens; + + Delimiters delims; + SplitUniString(NormalizeAndSimplifyString(query), MakeBackInsertFunctor(tokens), delims); + for (auto const & token : tokens) + params.m_tokens.push_back({token}); +} + +void AddLocality(string const & name, uint32_t featureId, SearchQueryParams & params, + vector & localities) +{ + set tokens; + + Delimiters delims; + SplitUniString(NormalizeAndSimplifyString(name), [&tokens](UniString const & token) + { + tokens.insert(token); + }, + delims); + + for (size_t startToken = 0; startToken < params.m_tokens.size(); ++startToken) + { + for (size_t endToken = startToken + 1; endToken <= params.m_tokens.size(); ++endToken) + { + bool matches = true; + for (size_t i = startToken; i != endToken && matches; ++i) + { + if (tokens.count(params.m_tokens[i].front()) == 0) + matches = false; + } + if (matches) + localities.emplace_back(featureId, startToken, endToken); + } + } +} +} // namespace + +UNIT_TEST(LocalityScorer_Smoke) +{ + enum + { + ID_NEW_ORLEANS, + ID_YORK, + ID_NEW_YORK, + }; + + SearchQueryParams params; + InitParams("New York Time Square", params); + + vector localities; + + AddLocality("New Orleans", ID_NEW_ORLEANS, params, localities); + AddLocality("York", ID_YORK, params, localities); + AddLocality("New York", ID_NEW_YORK, params, localities); + + LocalityScorer scorer(DummyRankTable(), params); + scorer.LeaveTopLocalities(100 /* limit */, localities); + + TEST_EQUAL(3, localities.size(), ()); + + TEST_EQUAL(localities[0].m_featureId, ID_NEW_ORLEANS, ()); + TEST_EQUAL(localities[1].m_featureId, ID_YORK, ()); + TEST_EQUAL(localities[2].m_featureId, ID_NEW_YORK, ()); + + // New York is the best matching locality + scorer.LeaveTopLocalities(1 /* limit */, localities); + TEST_EQUAL(1, localities.size(), ()); + TEST_EQUAL(localities[0].m_featureId, ID_NEW_YORK, ()); +} + +UNIT_TEST(LocalityScorer_NumbersMatching) +{ + enum + { + ID_MARCH, + ID_APRIL, + ID_MAY, + ID_TVER + }; + + SearchQueryParams params; + InitParams("тверь советская 1", params); + + vector localities; + + AddLocality("поселок 1 марта", ID_MARCH, params, localities); + AddLocality("поселок 1 апреля", ID_APRIL, params, localities); + AddLocality("поселок 1 мая", ID_MAY, params, localities); + AddLocality("тверь", ID_TVER, params, localities); + + LocalityScorer scorer(DummyRankTable(), params); + scorer.LeaveTopLocalities(100 /* limit */, localities); + + TEST_EQUAL(4, localities.size(), ()); + + TEST_EQUAL(localities[0].m_featureId, ID_MARCH, ()); + TEST_EQUAL(localities[1].m_featureId, ID_APRIL, ()); + TEST_EQUAL(localities[2].m_featureId, ID_MAY, ()); + TEST_EQUAL(localities[3].m_featureId, ID_TVER, ()); + + // Tver is the best matching locality, as other localities were + // matched by number. + scorer.LeaveTopLocalities(1 /* limit */, localities); + TEST_EQUAL(1, localities.size(), ()); + TEST_EQUAL(localities[0].m_featureId, ID_TVER, ()); +} diff --git a/search/search_tests/search_tests.pro b/search/search_tests/search_tests.pro index 8dfe9a7727..69c898770a 100644 --- a/search/search_tests/search_tests.pro +++ b/search/search_tests/search_tests.pro @@ -25,6 +25,7 @@ SOURCES += \ keyword_matcher_test.cpp \ latlon_match_test.cpp \ locality_finder_test.cpp \ + locality_scorer_test.cpp \ query_saver_tests.cpp \ search_string_utils_test.cpp \ string_intersection_test.cpp \ diff --git a/search/v2/geocoder.cpp b/search/v2/geocoder.cpp index 42f1d94b06..4e1164035c 100644 --- a/search/v2/geocoder.cpp +++ b/search/v2/geocoder.cpp @@ -6,6 +6,7 @@ #include "search/search_string_utils.hpp" #include "search/v2/cbv_ptr.hpp" #include "search/v2/features_layer_matcher.hpp" +#include "search/v2/locality_scorer.hpp" #include "indexer/classificator.hpp" #include "indexer/feature_decl.hpp" @@ -33,6 +34,7 @@ #include "std/algorithm.hpp" #include "std/bind.hpp" #include "std/iterator.hpp" +#include "std/sstream.hpp" #include "std/target_os.hpp" #include "std/transform_iterator.hpp" @@ -95,23 +97,47 @@ struct ScopedMarkTokens size_t const m_to; }; -struct LazyRankTable +class LazyRankTable : public RankTable { + public: LazyRankTable(MwmValue const & value) : m_value(value) {} - uint8_t Get(uint64_t i) + uint8_t Get(uint64_t i) const override { - if (!m_table) - { - m_table = search::RankTable::Load(m_value.m_cont); - if (!m_table) - m_table = make_unique(); - } + EnsureTableLoaded(); return m_table->Get(i); } + uint64_t Size() const override + { + EnsureTableLoaded(); + return m_table->Size(); + } + + RankTable::Version GetVersion() const override + { + EnsureTableLoaded(); + return m_table->GetVersion(); + } + + void Serialize(Writer & writer, bool preserveHostEndiannes) override + { + EnsureTableLoaded(); + m_table->Serialize(writer, preserveHostEndiannes); + } + + private: + void EnsureTableLoaded() const + { + if (m_table) + return; + m_table = search::RankTable::Load(m_value.m_cont); + if (!m_table) + m_table = make_unique(); + } + MwmValue const & m_value; - unique_ptr m_table; + mutable unique_ptr m_table; }; class StreetCategories @@ -624,7 +650,6 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter l.m_endToken = endToken; preLocalities.push_back(l); }); - if (endToken < m_numTokens) { intersection.Intersect(m_addressFeatures[endToken].get()); @@ -634,51 +659,9 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter } } - auto const tokensCountFn = [&](Locality const & l) - { - // Important! Prefix match costs 1 while token match costs 2 for locality comparison. - size_t d = 2 * (l.m_endToken - l.m_startToken); - ASSERT_GREATER(d, 0, ()); - if (l.m_endToken == m_numTokens && !m_params.m_prefixTokens.empty()) - d -= 1; - return d; - }; - - // Unique preLocalities with featureId but leave the longest range if equal. - sort(preLocalities.begin(), preLocalities.end(), - [&](Locality const & l1, Locality const & l2) - { - if (l1.m_featureId != l2.m_featureId) - return l1.m_featureId < l2.m_featureId; - return tokensCountFn(l1) > tokensCountFn(l2); - }); - - preLocalities.erase(unique(preLocalities.begin(), preLocalities.end(), - [](Locality const & l1, Locality const & l2) - { - return l1.m_featureId == l2.m_featureId; - }), - preLocalities.end()); - LazyRankTable rankTable(m_context->m_value); - - // Leave the most popular localities. - if (preLocalities.size() > maxNumLocalities) - { - /// @todo Calculate match costs according to the exact locality name - /// (for 'york' query "york city" is better than "new york"). - - sort(preLocalities.begin(), preLocalities.end(), - [&](Locality const & l1, Locality const & l2) - { - auto const d1 = tokensCountFn(l1); - auto const d2 = tokensCountFn(l2); - if (d1 != d2) - return d1 > d2; - return rankTable.Get(l1.m_featureId) > rankTable.Get(l2.m_featureId); - }); - preLocalities.resize(maxNumLocalities); - } + LocalityScorer scorer(rankTable, m_params); + scorer.LeaveTopLocalities(maxNumLocalities, preLocalities); } void Geocoder::FillLocalitiesTable() @@ -999,11 +982,8 @@ void Geocoder::GreedilyMatchStreets() if (IsStreetSynonym(token)) continue; - if (feature::IsHouseNumber(token) && - !coding::CompressedBitVector::IsEmpty(allFeatures)) - { + if (feature::IsHouseNumber(token)) CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures); - } unique_ptr buffer; if (startToken == curToken || coding::CompressedBitVector::IsEmpty(allFeatures)) @@ -1017,9 +997,6 @@ void Geocoder::GreedilyMatchStreets() allFeatures.swap(buffer); } - if (coding::CompressedBitVector::IsEmpty(allFeatures)) - continue; - CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures); } } @@ -1328,5 +1305,13 @@ bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const { return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, Id()); } + +string DebugPrint(Geocoder::Locality const & locality) +{ + ostringstream os; + os << "Locality [" << DebugPrint(locality.m_countryId) << ", " << locality.m_featureId << ", " + << locality.m_startToken << ", " << locality.m_endToken << "]"; + return os.str(); +} } // namespace v2 } // namespace search diff --git a/search/v2/geocoder.hpp b/search/v2/geocoder.hpp index c3bb6c3646..f8a73358ee 100644 --- a/search/v2/geocoder.hpp +++ b/search/v2/geocoder.hpp @@ -75,21 +75,6 @@ public: size_t m_maxNumResults; }; - Geocoder(Index & index, storage::CountryInfoGetter const & infoGetter); - - ~Geocoder() override; - - // Sets search query params. - void SetParams(Params const & params); - - // Starts geocoding, retrieved features will be appended to - // |results|. - void GoEverywhere(vector & results); - void GoInViewport(vector & results); - - void ClearCaches(); - -private: enum RegionType { REGION_TYPE_STATE, @@ -97,14 +82,19 @@ private: REGION_TYPE_COUNT }; - void GoImpl(vector> & infos, bool inViewport); - struct Locality { + Locality() : m_featureId(0), m_startToken(0), m_endToken(0) {} + + Locality(uint32_t featureId, size_t startToken, size_t endToken) + : m_featureId(featureId), m_startToken(startToken), m_endToken(endToken) + { + } + MwmSet::MwmId m_countryId; - uint32_t m_featureId = 0; - size_t m_startToken = 0; - size_t m_endToken = 0; + uint32_t m_featureId; + size_t m_startToken; + size_t m_endToken; }; // This struct represents a country or US- or Canadian- state. It @@ -131,6 +121,23 @@ private: m2::RectD m_rect; }; + Geocoder(Index & index, storage::CountryInfoGetter const & infoGetter); + + ~Geocoder() override; + + // Sets search query params. + void SetParams(Params const & params); + + // Starts geocoding, retrieved features will be appended to + // |results|. + void GoEverywhere(vector & results); + void GoInViewport(vector & results); + + void ClearCaches(); + +private: + void GoImpl(vector> & infos, bool inViewport); + template using TLocalitiesCache = map, vector>; @@ -295,5 +302,7 @@ private: // Non-owning pointer to a vector of results. vector * m_results; }; + +string DebugPrint(Geocoder::Locality const & locality); } // namespace v2 } // namespace search diff --git a/search/v2/locality_scorer.cpp b/search/v2/locality_scorer.cpp new file mode 100644 index 0000000000..308871a6e3 --- /dev/null +++ b/search/v2/locality_scorer.cpp @@ -0,0 +1,87 @@ +#include "search/v2/locality_scorer.hpp" + +#include "search/dummy_rank_table.hpp" +#include "search/search_query_params.hpp" +#include "search/v2/mwm_context.hpp" + +#include "indexer/feature_impl.hpp" +#include "indexer/index.hpp" +#include "indexer/rank_table.hpp" + +#include "std/algorithm.hpp" +#include "std/unique_ptr.hpp" + +namespace search +{ +namespace v2 +{ +LocalityScorer::LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params) + : m_rankTable(rankTable), m_params(params) +{ +} + +void LocalityScorer::LeaveTopLocalities(size_t const limit, + vector & localities) const +{ + // Unique localities by featureId but leave the longest range if equal. + sort(localities.begin(), localities.end(), [&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs) + { + if (lhs.m_featureId != rhs.m_featureId) + return lhs.m_featureId < rhs.m_featureId; + return GetTokensScore(lhs) > GetTokensScore(rhs); + }); + localities.erase(unique(localities.begin(), localities.end(), + [](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs) + { + return lhs.m_featureId == rhs.m_featureId; + }), + localities.end()); + + // Leave the most popular localities. + if (localities.size() > limit) + { + /// @todo Calculate match costs according to the exact locality name + /// (for 'york' query "york city" is better than "new york"). + sort(localities.begin(), localities.end(), + [&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs) + { + auto const ls = GetTokensScore(lhs); + auto const rs = GetTokensScore(rhs); + if (ls != rs) + return ls > rs; + return m_rankTable.Get(lhs.m_featureId) > m_rankTable.Get(rhs.m_featureId); + }); + localities.resize(limit); + } +} + +size_t LocalityScorer::GetTokensScore(Geocoder::Locality const & locality) const +{ + // *NOTE* + // * full token match costs 2 + // * prefix match costs 1 + // + // If locality is matched only by a single integral token or by an + // integral token + a prefix, overall score is reduced by one. + // + // TODO (@y, @m, @vng): consider to loop over all non-prefix + // tokens and decrement overall score by one for each integral + // token. + size_t const numTokens = locality.m_endToken - locality.m_startToken; + bool const prefixMatch = locality.m_endToken == m_params.m_tokens.size() + 1; + + size_t score = 2 * numTokens; + if (prefixMatch) + --score; + + if ((numTokens == 2 && prefixMatch) || (numTokens == 1 && !prefixMatch)) + { + auto const & token = m_params.GetTokens(locality.m_startToken).front(); + if (feature::IsNumber(token)) + --score; + } + + return score; +} +} // namespace v2 +} // namespace search diff --git a/search/v2/locality_scorer.hpp b/search/v2/locality_scorer.hpp new file mode 100644 index 0000000000..4a0490d1a3 --- /dev/null +++ b/search/v2/locality_scorer.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "search/v2/geocoder.hpp" + +#include "std/vector.hpp" + +namespace search +{ +class RankTable; +struct SearchQueryParams; + +namespace v2 +{ +class LocalityScorer +{ +public: + LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params); + + void LeaveTopLocalities(size_t const limit, vector & localities) const; + +private: + size_t GetTokensScore(Geocoder::Locality const & locality) const; + + RankTable const & m_rankTable; + SearchQueryParams const & m_params; +}; +} // namespace v2 +} // namespace search