From e09b14f438b37dbabbf530255537f2e1890a2cb5 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Mon, 6 Feb 2017 16:11:28 +0300 Subject: [PATCH] [search] Fixed discrepancy in tokens between geocoder and ranker. --- search/geocoder.cpp | 52 ------------------- search/processor.cpp | 47 +++++++++++++++++ search/ranker.cpp | 2 +- search/ranking_utils.cpp | 33 +++++++++++- search/ranking_utils.hpp | 20 ++++--- .../processor_test.cpp | 28 ++++++++++ 6 files changed, 120 insertions(+), 62 deletions(-) diff --git a/search/geocoder.cpp b/search/geocoder.cpp index 82a10dff1a..4585d44258 100644 --- a/search/geocoder.cpp +++ b/search/geocoder.cpp @@ -218,21 +218,6 @@ MwmSet::MwmHandle FindWorld(Index const & index, vector> con return handle; } -UniString AsciiToUniString(char const * s) { return UniString(s, s + strlen(s)); } - -bool IsStopWord(UniString const & s) -{ - /// @todo Get all common used stop words and factor out this array into - /// search_string_utils.cpp module for example. - static char const * arr[] = {"a", "de", "da", "la"}; - - static set const kStopWords( - make_transform_iterator(arr, &AsciiToUniString), - make_transform_iterator(arr + ARRAY_SIZE(arr), &AsciiToUniString)); - - return kStopWords.count(s) > 0; -} - double Area(m2::RectD const & rect) { return rect.IsValid() ? rect.SizeX() * rect.SizeY() : 0; } // Computes an average similaty between |rect| and |pivot|. By @@ -375,43 +360,6 @@ void Geocoder::SetParams(Params const & params) { m_params = params; - // Filter stop words. - if (m_params.GetNumTokens() > 1) - { - for (size_t i = 0; i < m_params.GetNumTokens();) - { - if (m_params.IsPrefixToken(i)) - { - ++i; - continue; - } - - auto & token = m_params.GetToken(i); - if (IsStopWord(token.m_original)) - { - m_params.RemoveToken(i); - } - else - { - my::EraseIf(token.m_synonyms, &IsStopWord); - ++i; - } - } - - // If all tokens are stop words - give up. 
- if (m_params.GetNumTokens() == 0) - m_params = params; - } - - // Remove all category synonyms for streets, as they're extracted - // individually. - for (size_t i = 0; i < m_params.GetNumTokens(); ++i) - { - auto & token = m_params.GetToken(i); - if (IsStreetSynonym(token.m_original)) - m_params.GetTypeIndices(i).clear(); - } - m_tokenRequests.clear(); m_prefixTokenRequest.Clear(); for (size_t i = 0; i < m_params.GetNumTokens(); ++i) diff --git a/search/processor.cpp b/search/processor.cpp index 63d325e7e8..ab6caeb387 100644 --- a/search/processor.cpp +++ b/search/processor.cpp @@ -129,6 +129,42 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res alohalytics::LogEvent("searchEmitResultsAndCoords", stats); GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {}); } + +// Removes all full-token stop words from |params|, unless |params| +// consists of all such tokens. +void RemoveStopWordsIfNeeded(QueryParams & params) +{ + size_t numStopWords = 0; + for (size_t i = 0; i < params.GetNumTokens(); ++i) + { + auto & token = params.GetToken(i); + if (!params.IsPrefixToken(i) && IsStopWord(token.m_original)) + ++numStopWords; + } + + if (numStopWords == params.GetNumTokens()) + return; + + for (size_t i = 0; i < params.GetNumTokens();) + { + if (params.IsPrefixToken(i)) + { + ++i; + continue; + } + + auto & token = params.GetToken(i); + if (IsStopWord(token.m_original)) + { + params.RemoveToken(i); + } + else + { + my::EraseIf(token.m_synonyms, &IsStopWord); + ++i; + } + } +} } // namespace // static @@ -642,6 +678,17 @@ void Processor::InitParams(QueryParams & params) auto & langs = params.GetLangs(); for (int i = 0; i < LANG_COUNT; ++i) langs.insert(GetLanguage(i)); + + RemoveStopWordsIfNeeded(params); + + // Remove all type indices for streets, as they're considered + // individually. 
+ for (size_t i = 0; i < params.GetNumTokens(); ++i) + { + auto & token = params.GetToken(i); + if (IsStreetSynonym(token.m_original)) + params.GetTypeIndices(i).clear(); + } } void Processor::InitGeocoder(Geocoder::Params & params) diff --git a/search/ranker.cpp b/search/ranker.cpp index 93bdecc88f..84b4048f2c 100644 --- a/search/ranker.cpp +++ b/search/ranker.cpp @@ -188,7 +188,7 @@ class PreResult2Maker if (!ft.GetName(lang, name)) continue; vector tokens; - SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters()); + PrepareStringForMatching(name, tokens); UpdateNameScore(tokens, slice, info.m_nameScore); UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore); diff --git a/search/ranking_utils.cpp b/search/ranking_utils.cpp index 9d8a97b978..16d6f4bcbc 100644 --- a/search/ranking_utils.cpp +++ b/search/ranking_utils.cpp @@ -1,11 +1,18 @@ #include "search/ranking_utils.hpp" -#include "std/algorithm.hpp" +#include "std/transform_iterator.hpp" + +#include using namespace strings; namespace search { +namespace +{ +UniString AsciiToUniString(char const * s) { return UniString(s, s + strlen(s)); } +} // namespace + namespace impl { bool FullMatch(QueryParams::Token const & token, UniString const & text) @@ -30,6 +37,29 @@ bool PrefixMatch(QueryParams::Token const & token, UniString const & text) } } // namespace impl +bool IsStopWord(UniString const & s) +{ + /// @todo Get all common used stop words and factor out this array into + /// search_string_utils.cpp module for example. 
+ static char const * arr[] = {"a", "de", "da", "la"}; + + static std::set const kStopWords( + make_transform_iterator(arr, &AsciiToUniString), + make_transform_iterator(arr + ARRAY_SIZE(arr), &AsciiToUniString)); + + return kStopWords.count(s) > 0; +} + +void PrepareStringForMatching(std::string const & name, std::vector & tokens) +{ + auto filter = [&tokens](strings::UniString const & token) + { + if (!IsStopWord(token)) + tokens.push_back(token); + }; + SplitUniString(NormalizeAndSimplifyString(name), filter, Delimiters()); +} + string DebugPrint(NameScore score) { switch (score) @@ -43,5 +73,4 @@ string DebugPrint(NameScore score) } return "Unknown"; } - } // namespace search diff --git a/search/ranking_utils.hpp b/search/ranking_utils.hpp index 0d6a58046b..bf359e42b2 100644 --- a/search/ranking_utils.hpp +++ b/search/ranking_utils.hpp @@ -9,10 +9,10 @@ #include "base/stl_add.hpp" #include "base/string_utils.hpp" -#include "std/cstdint.hpp" -#include "std/limits.hpp" -#include "std/string.hpp" -#include "std/vector.hpp" +#include +#include +#include +#include namespace search { @@ -38,19 +38,25 @@ enum NameScore NAME_SCORE_COUNT }; +// Returns true when |s| is a stop-word and may be removed from a query. +bool IsStopWord(strings::UniString const & s); + +// Normalizes, simplifies and splits string, removes stop-words. 
+void PrepareStringForMatching(std::string const & name, std::vector & tokens); + template -NameScore GetNameScore(string const & name, TSlice const & slice) +NameScore GetNameScore(std::string const & name, TSlice const & slice) { if (slice.Empty()) return NAME_SCORE_ZERO; - vector tokens; + std::vector tokens; SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters()); return GetNameScore(tokens, slice); } template -NameScore GetNameScore(vector const & tokens, TSlice const & slice) +NameScore GetNameScore(std::vector const & tokens, TSlice const & slice) { if (slice.Empty()) return NAME_SCORE_ZERO; diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index 114205fbd5..ef8fdc3bc4 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -805,5 +805,33 @@ UNIT_CLASS_TEST(ProcessorTest, SpacesInCategories) TEST(ResultsMatch("Москва ночной клуб", "ru", rules), ()); } } + +UNIT_CLASS_TEST(ProcessorTest, StopWords) +{ + TestCountry country(m2::PointD(0, 0), "France", "en"); + TestCity city(m2::PointD(0, 0), "Paris", "en", 100 /* rank */); + TestStreet street( + vector{m2::PointD(-0.001, -0.001), m2::PointD(0, 0), m2::PointD(0.001, 0.001)}, + "Rue de la Paix", "en"); + + BuildWorld([&](TestMwmBuilder & builder) { + builder.Add(country); + builder.Add(city); + }); + + auto id = BuildCountry(country.GetName(), [&](TestMwmBuilder & builder) { builder.Add(street); }); + + { + auto request = MakeRequest("la France à Paris Rue de la Paix"); + + TRules rules = {ExactMatch(id, street)}; + + auto const & results = request->Results(); + TEST(MatchResults(rules, results), ()); + + auto const & info = results[0].GetRankingInfo(); + TEST_EQUAL(info.m_nameScore, NAME_SCORE_FULL_MATCH, ()); + } +} } // namespace } // namespace search