From 8808823db18c6dcd4842681f0b5765361e337a93 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Wed, 1 Feb 2017 19:14:01 +0300 Subject: [PATCH 1/4] [search] Fuzzy search by category. --- base/mem_trie.hpp | 66 +++++++++--- indexer/categories_holder.hpp | 9 ++ search/CMakeLists.txt | 1 + search/common.hpp | 12 --- search/feature_offset_match.hpp | 2 +- search/geocoder.cpp | 12 --- search/keyword_matcher.cpp | 4 +- search/processor.cpp | 24 +++-- search/processor.hpp | 3 + search/search.pro | 1 + .../processor_test.cpp | 15 +++ search/utils.hpp | 100 +++++++++++++++++- 12 files changed, 197 insertions(+), 52 deletions(-) diff --git a/base/mem_trie.hpp b/base/mem_trie.hpp index fdd35af365..d5394f5bf3 100644 --- a/base/mem_trie.hpp +++ b/base/mem_trie.hpp @@ -15,7 +15,12 @@ namespace my template class MemTrie { +private: + struct Node; + public: + using Char = typename String::value_type; + MemTrie() = default; MemTrie(MemTrie && rhs) { *this = std::move(rhs); } @@ -27,6 +32,31 @@ public: return *this; } + // A read-only iterator. Any modification to the + // underlying trie is assumed to invalidate the iterator. + class Iterator + { + public: + Iterator(MemTrie::Node const & node) : m_node(node) {} + + template + void ForEachMove(ToDo && todo) const + { + for (auto const & move : m_node.m_moves) + todo(move.first, Iterator(*move.second)); + } + + template + void ForEachInNode(ToDo && todo) const + { + for (auto const & value : m_node.m_values) + todo(value); + } + + private: + MemTrie::Node const & m_node; + }; + // Adds a key-value pair to the trie. void Add(String const & key, Value const & value) { @@ -41,40 +71,42 @@ public: cur->AddValue(value); } - // Traverses all key-value pairs in the trie and calls |toDo| on each of them. + // Traverses all key-value pairs in the trie and calls |todo| on each of them. template - void ForEachInTrie(ToDo && toDo) const + void ForEachInTrie(ToDo && todo) const { String prefix; - ForEachInSubtree(m_root, prefix, std::forward(toDo)); + ForEachInSubtree(m_root, prefix, std::forward(todo)); } - // Calls |toDo| for each key-value pair in the node that is reachable + // Calls |todo| for each key-value pair in the node that is reachable // by |prefix| from the trie root. Does nothing if such node does // not exist. template - void ForEachInNode(String const & prefix, ToDo && toDo) const + void ForEachInNode(String const & prefix, ToDo && todo) const { if (auto const * root = MoveTo(prefix)) - ForEachInNode(*root, prefix, std::forward(toDo)); + ForEachInNode(*root, prefix, std::forward(todo)); } - // Calls |toDo| for each key-value pair in a subtree that is + // Calls |todo| for each key-value pair in a subtree that is // reachable by |prefix| from the trie root. Does nothing if such // subtree does not exist. template - void ForEachInSubtree(String prefix, ToDo && toDo) const + void ForEachInSubtree(String prefix, ToDo && todo) const { if (auto const * root = MoveTo(prefix)) - ForEachInSubtree(*root, prefix, std::forward(toDo)); + ForEachInSubtree(*root, prefix, std::forward(todo)); } size_t GetNumNodes() const { return m_numNodes; } + Iterator GetRootIterator() const { return Iterator(m_root); } + Node const & GetRoot() const { return m_root; } private: struct Node { - using Char = typename String::value_type; + friend class MemTrie::Iterator; Node() = default; Node(Node && /* rhs */) = default; @@ -117,27 +149,27 @@ private: return cur; } - // Calls |toDo| for each key-value pair in a |node| that is + // Calls |todo| for each key-value pair in a |node| that is // reachable by |prefix| from the trie root. template - void ForEachInNode(Node const & node, String const & prefix, ToDo && toDo) const + void ForEachInNode(Node const & node, String const & prefix, ToDo && todo) const { for (auto const & value : node.m_values) - toDo(prefix, value); + todo(prefix, value); } - // Calls |toDo| for each key-value pair in subtree where |node| is a + // Calls |todo| for each key-value pair in subtree where |node| is a // root of the subtree. |prefix| is a path from the trie root to the // |node|. template - void ForEachInSubtree(Node const & node, String & prefix, ToDo && toDo) const + void ForEachInSubtree(Node const & node, String & prefix, ToDo && todo) const { - ForEachInNode(node, prefix, toDo); + ForEachInNode(node, prefix, todo); for (auto const & move : node.m_moves) { prefix.push_back(move.first); - ForEachInSubtree(*move.second, prefix, toDo); + ForEachInSubtree(*move.second, prefix, todo); prefix.pop_back(); } } diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp index ba40e43460..b40982e001 100644 --- a/indexer/categories_holder.hpp +++ b/indexer/categories_holder.hpp @@ -125,6 +125,15 @@ public: /// @returns raw classificator type if it's not localized in categories.txt. string GetReadableFeatureType(uint32_t type, int8_t locale) const; + // Exposes the tries that map category tokens to types. + Trie const * GetNameToTypesTrie(int8_t locale) const + { + auto const it = m_name2type.find(locale); + if (it == m_name2type.end()) + return nullptr; + return it->second.get(); + } + bool IsTypeExist(uint32_t type) const; inline void Swap(CategoriesHolder & r) diff --git a/search/CMakeLists.txt b/search/CMakeLists.txt index 5b2c22322f..870853153e 100644 --- a/search/CMakeLists.txt +++ b/search/CMakeLists.txt @@ -123,6 +123,7 @@ set( token_slice.hpp types_skipper.cpp types_skipper.hpp + utils.cpp utils.hpp viewport_search_callback.cpp viewport_search_callback.hpp diff --git a/search/common.hpp b/search/common.hpp index 2bdc9300a6..abd8a88c71 100644 --- a/search/common.hpp +++ b/search/common.hpp @@ -5,16 +5,4 @@ namespace search /// Upper bound for max count of tokens for indexing and scoring. int constexpr MAX_TOKENS = 32; int constexpr MAX_SUGGESTS_COUNT = 5; - -template -bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix) -{ - while (beg != end && begPrefix != endPrefix && *beg == *begPrefix) - { - ++beg; - ++begPrefix; - } - return begPrefix == endPrefix; -} - } // namespace search diff --git a/search/feature_offset_match.hpp b/search/feature_offset_match.hpp index 1c57683304..f05cf5eff9 100644 --- a/search/feature_offset_match.hpp +++ b/search/feature_offset_match.hpp @@ -221,7 +221,7 @@ struct SearchTrieRequest QueryParams::Langs m_langs; }; -// Calls |toDo| for each feature accepted but at least one DFA. +// Calls |toDo| for each feature accepted by at least one DFA. // // *NOTE* |toDo| may be called several times for the same feature. template diff --git a/search/geocoder.cpp b/search/geocoder.cpp index 6a279ed36e..41a65e01ef 100644 --- a/search/geocoder.cpp +++ b/search/geocoder.cpp @@ -315,18 +315,6 @@ size_t OrderCountries(m2::RectD const & pivot, vector> & inf auto const sep = stable_partition(infos.begin(), infos.end(), intersects); return distance(infos.begin(), sep); } - -size_t GetMaxErrorsForToken(UniString const & token) -{ - bool const digitsOnly = all_of(token.begin(), token.end(), isdigit); - if (digitsOnly) - return 0; - if (token.size() < 4) - return 0; - if (token.size() < 8) - return 1; - return 2; -} } // namespace // Geocoder::Params -------------------------------------------------------------------------------- diff --git a/search/keyword_matcher.cpp b/search/keyword_matcher.cpp index 3b7ddcee60..42672f5119 100644 --- a/search/keyword_matcher.cpp +++ b/search/keyword_matcher.cpp @@ -1,4 +1,6 @@ -#include "keyword_matcher.hpp" +#include "search/keyword_matcher.hpp" + +#include "search/utils.hpp" #include "indexer/search_delimiters.hpp" #include "indexer/search_string_utils.hpp" diff --git a/search/processor.cpp b/search/processor.cpp index ea1042eb60..b5e7c4e577 100644 --- a/search/processor.cpp +++ b/search/processor.cpp @@ -128,8 +128,8 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {}); } -// Removes all full-token stop words from |params|, unless |params| -// consists of all such tokens. +// Removes all full-token stop words from |params|. +// Does nothing if all tokens in |params| are stop words. void RemoveStopWordsIfNeeded(QueryParams & params) { size_t numStopWords = 0; @@ -331,6 +331,7 @@ int8_t Processor::GetLanguage(int id) const { return m_ranker.GetLanguage(GetLangIndex(id)); } + m2::PointD Processor::GetPivotPoint() const { bool const viewportSearch = m_mode == Mode::Viewport; @@ -413,6 +414,13 @@ void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) ::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward(todo)); } +template +void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const +{ + ::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories, + forward(todo)); +} + void Processor::Search(SearchParams const & params, m2::RectD const & viewport) { SetMode(params.m_mode); @@ -671,11 +679,9 @@ void Processor::InitParams(QueryParams & params) } } }; - ForEachCategoryType(QuerySliceOnRawStrings(m_tokens, m_prefix), addSyms); - auto & langs = params.GetLangs(); - for (int i = 0; i < LANG_COUNT; ++i) - langs.Insert(GetLanguage(i)); + // todo(@m, @y). Shall we match prefix tokens for categories? + ForEachCategoryTypeFuzzy(QuerySliceOnRawStrings(m_tokens, m_prefix), addSyms); RemoveStopWordsIfNeeded(params); @@ -687,6 +693,12 @@ void Processor::InitParams(QueryParams & params) if (IsStreetSynonym(token.m_original)) params.GetTypeIndices(i).clear(); } + + for (size_t i = 0; i < params.GetNumTokens(); ++i) + my::SortUnique(params.GetTypeIndices(i)); + + for (int i = 0; i < LANG_COUNT; ++i) + params.GetLangs().Insert(GetLanguage(i)); } void Processor::InitGeocoder(Geocoder::Params & params) diff --git a/search/processor.hpp b/search/processor.hpp index 5e784954fd..c32d03f16d 100644 --- a/search/processor.hpp +++ b/search/processor.hpp @@ -141,6 +141,9 @@ protected: template void ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) const; + template + void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const; + m2::PointD GetPivotPoint() const; m2::RectD GetPivotRect() const; diff --git a/search/search.pro b/search/search.pro index abb050f3c0..2c11aa354f 100644 --- a/search/search.pro +++ b/search/search.pro @@ -135,4 +135,5 @@ SOURCES += \ streets_matcher.cpp \ token_slice.cpp \ types_skipper.cpp \ + utils.cpp \ viewport_search_callback.cpp diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index ef8fdc3bc4..63aa3b5342 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -754,6 +754,9 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch) TestPOI bar(m2::PointD(0, 0), "Черчилль", "ru"); bar.SetTypes({{"amenity", "pub"}}); + TestPOI metro(m2::PointD(5.0, 5.0), "Liceu", "es"); + metro.SetTypes({{"railway", "subway_entrance"}}); + BuildWorld([&](TestMwmBuilder & builder) { builder.Add(country); builder.Add(city); @@ -762,6 +765,7 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch) auto id = BuildCountry(countryName, [&](TestMwmBuilder & builder) { builder.Add(street); builder.Add(bar); + builder.Add(metro); }); SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0))); @@ -778,6 +782,17 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch) TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ()); TEST(ResultsMatch("моксва ленинргадский черчиль", "ru", rules), ()); + + TEST(ResultsMatch("food", "ru", rules), ()); + TEST(ResultsMatch("foood", "ru", rules), ()); + TEST(ResultsMatch("fod", "ru", TRules{}), ()); + + TRules rulesMetro = {ExactMatch(id, metro)}; + TEST(ResultsMatch("transporte", "es", rulesMetro), ()); + TEST(ResultsMatch("transport", "es", rulesMetro), ()); + TEST(ResultsMatch("transpurt", "en", rulesMetro), ()); + TEST(ResultsMatch("transpurrt", "es", rulesMetro), ()); + TEST(ResultsMatch("transportation", "en", TRules{}), ()); } } diff --git a/search/utils.hpp b/search/utils.hpp index c36f74736a..919a18e907 100644 --- a/search/utils.hpp +++ b/search/utils.hpp @@ -3,14 +3,74 @@ #include "search/token_slice.hpp" #include "indexer/categories_holder.hpp" +#include "indexer/search_delimiters.hpp" +#include "indexer/search_string_utils.hpp" #include "base/buffer_vector.hpp" +#include "base/levenshtein_dfa.hpp" +#include "base/stl_helpers.hpp" #include "base/string_utils.hpp" +#include +#include +#include +#include + namespace search { +namespace +{ +// my::MemTrie +// todo(@m, @y). Unite with the similar function in search/feature_offset_match.hpp. +template +bool MatchInTrie(Trie const & trie, DFA const & dfa, ToDo && toDo) +{ + using Char = typename Trie::Char; + using TrieIt = typename Trie::Iterator; + using DFAIt = typename DFA::Iterator; + using State = pair; + + std::queue q; + + { + auto it = dfa.Begin(); + if (it.Rejects()) + return false; + q.emplace(trie.GetRootIterator(), it); + } + + bool found = false; + + while (!q.empty()) + { + auto const p = q.front(); + q.pop(); + + auto const & trieIt = p.first; + auto const & dfaIt = p.second; + + if (dfaIt.Accepts()) + { + trieIt.ForEachInNode(toDo); + found = true; + } + + trieIt.ForEachMove([&](Char const & c, TrieIt const & nextTrieIt) { + auto nextDfaIt = dfaIt; + nextDfaIt.Move(c); + if (!nextDfaIt.Rejects()) + q.emplace(nextTrieIt, nextDfaIt); + }); + } + + return found; +} +} // namespace + using TLocales = buffer_vector; +size_t GetMaxErrorsForToken(strings::UniString const & token); + template void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales, CategoriesHolder const & categories, ToDo && todo) @@ -18,16 +78,50 @@ void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales for (size_t i = 0; i < slice.Size(); ++i) { auto const & token = slice.Get(i); - for (size_t j = 0; j < locales.size(); ++j) - categories.ForEachTypeByName(locales[j], token, bind(todo, i, _1)); + for (int8_t const locale : locales) + categories.ForEachTypeByName(locale, token, std::bind(todo, i, std::placeholders::_1)); // Special case processing of 2 codepoints emoji (e.g. black guy on a bike). // Only emoji synonyms can have one codepoint. if (token.size() > 1) { categories.ForEachTypeByName(CategoriesHolder::kEnglishCode, strings::UniString(1, token[0]), - bind(todo, i, _1)); + std::bind(todo, i, std::placeholders::_1)); } } } + +// Unlike ForEachCategoryType which extracts types by a token +// from |slice| by exactly matching it to a token in the name +// of a category, in the worst case this function has to loop through the tokens +// in all category synonyms in all |locales| in order to find a token +// whose edit distance is close enough to the required token from |slice|. +template +void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & locales, + CategoriesHolder const & categories, ToDo && todo) +{ + for (size_t i = 0; i < slice.Size(); ++i) + { + auto const & token = slice.Get(i); + auto const & dfa = + strings::LevenshteinDFA(token, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(token)); + for (int8_t const locale : locales) + { + auto const * trie = categories.GetNameToTypesTrie(locale); + if (trie != nullptr) + MatchInTrie(*trie, dfa, std::bind(todo, i, std::placeholders::_1)); + } + } +} + +template +bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix) +{ + while (beg != end && begPrefix != endPrefix && *beg == *begPrefix) + { + ++beg; + ++begPrefix; + } + return begPrefix == endPrefix; +} } // namespace search From 5ddc8db2baa8f158f5ed84c09af550f52e50a1ce Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Mon, 13 Feb 2017 17:40:58 +0300 Subject: [PATCH 2/4] Review fixes. --- base/mem_trie.hpp | 48 +++++++++++++++++++++----------------- base/string_utils.cpp | 11 +++------ base/string_utils.hpp | 11 +++++++++ search/keyword_matcher.cpp | 5 ++-- search/processor.cpp | 10 ++++---- search/processor.hpp | 4 ++-- search/ranker.cpp | 3 ++- search/utils.hpp | 14 ----------- 8 files changed, 51 insertions(+), 55 deletions(-) diff --git a/base/mem_trie.hpp b/base/mem_trie.hpp index d5394f5bf3..6bd161eed9 100644 --- a/base/mem_trie.hpp +++ b/base/mem_trie.hpp @@ -3,6 +3,7 @@ #include "base/macros.hpp" #include "base/stl_add.hpp" +#include #include #include #include @@ -32,25 +33,28 @@ public: return *this; } - // A read-only iterator. Any modification to the + // A read-only iterator wrapping a Node. Any modification to the // underlying trie is assumed to invalidate the iterator. class Iterator { public: Iterator(MemTrie::Node const & node) : m_node(node) {} + // Iterates over all possible moves from this Iterator's node + // and calls |toDo| with two arguments: + // (Char of the move, Iterator wrapping the node of the move). template - void ForEachMove(ToDo && todo) const + void ForEachMove(ToDo && toDo) const { for (auto const & move : m_node.m_moves) - todo(move.first, Iterator(*move.second)); + toDo(move.first, Iterator(*move.second)); } + // Calls |toDo| for every value in this Iterator's node. template - void ForEachInNode(ToDo && todo) const + void ForEachInNode(ToDo && toDo) const { - for (auto const & value : m_node.m_values) - todo(value); + std::for_each(m_node.m_values.begin(), m_node.m_values.end(), std::forward(toDo)); } private: @@ -71,32 +75,32 @@ public: cur->AddValue(value); } - // Traverses all key-value pairs in the trie and calls |todo| on each of them. + // Traverses all key-value pairs in the trie and calls |toDo| on each of them. template - void ForEachInTrie(ToDo && todo) const + void ForEachInTrie(ToDo && toDo) const { String prefix; - ForEachInSubtree(m_root, prefix, std::forward(todo)); + ForEachInSubtree(m_root, prefix, std::forward(toDo)); } - // Calls |todo| for each key-value pair in the node that is reachable + // Calls |toDo| for each key-value pair in the node that is reachable // by |prefix| from the trie root. Does nothing if such node does // not exist. template - void ForEachInNode(String const & prefix, ToDo && todo) const + void ForEachInNode(String const & prefix, ToDo && toDo) const { if (auto const * root = MoveTo(prefix)) - ForEachInNode(*root, prefix, std::forward(todo)); + ForEachInNode(*root, prefix, std::forward(toDo)); } - // Calls |todo| for each key-value pair in a subtree that is + // Calls |toDo| for each key-value pair in a subtree that is // reachable by |prefix| from the trie root. Does nothing if such // subtree does not exist. template - void ForEachInSubtree(String prefix, ToDo && todo) const + void ForEachInSubtree(String prefix, ToDo && toDo) const { if (auto const * root = MoveTo(prefix)) - ForEachInSubtree(*root, prefix, std::forward(todo)); + ForEachInSubtree(*root, prefix, std::forward(toDo)); } size_t GetNumNodes() const { return m_numNodes; } @@ -149,27 +153,27 @@ private: return cur; } - // Calls |todo| for each key-value pair in a |node| that is + // Calls |toDo| for each key-value pair in a |node| that is // reachable by |prefix| from the trie root. template - void ForEachInNode(Node const & node, String const & prefix, ToDo && todo) const + void ForEachInNode(Node const & node, String const & prefix, ToDo && toDo) const { for (auto const & value : node.m_values) - todo(prefix, value); + toDo(prefix, value); } - // Calls |todo| for each key-value pair in subtree where |node| is a + // Calls |toDo| for each key-value pair in subtree where |node| is a // root of the subtree. |prefix| is a path from the trie root to the // |node|. template - void ForEachInSubtree(Node const & node, String & prefix, ToDo && todo) const + void ForEachInSubtree(Node const & node, String & prefix, ToDo && toDo) const { - ForEachInNode(node, prefix, todo); + ForEachInNode(node, prefix, toDo); for (auto const & move : node.m_moves) { prefix.push_back(move.first); - ForEachInSubtree(*move.second, prefix, todo); + ForEachInSubtree(*move.second, prefix, toDo); prefix.pop_back(); } } diff --git a/base/string_utils.cpp b/base/string_utils.cpp index d1ac84b24a..f2546161fb 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -237,19 +237,14 @@ bool IsASCIIString(std::string const & str) bool IsASCIIDigit(UniChar c) { return c >= '0' && c <= '9'; } bool IsASCIILatin(UniChar c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } + bool StartsWith(UniString const & s, UniString const & p) { - if (p.size() > s.size()) - return false; - for (size_t i = 0; i < p.size(); ++i) - { - if (s[i] != p[i]) - return false; - } - return true; + return StartsWith(s.begin(), s.end(), p.begin(), p.end()); } bool StartsWith(std::string const & s1, char const * s2) { return (s1.compare(0, strlen(s2), s2) == 0); } + bool EndsWith(std::string const & s1, char const * s2) { size_t const n = s1.size(); diff --git a/base/string_utils.hpp b/base/string_utils.hpp index af8670c5a4..8fc2228b40 100644 --- a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -439,6 +439,17 @@ std::string to_string_dac(double d, int dac); inline std::string to_string_with_digits_after_comma(double d, int dac) { return to_string_dac(d, dac); } //@} +template +bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix) +{ + while (beg != end && begPrefix != endPrefix && *beg == *begPrefix) + { + ++beg; + ++begPrefix; + } + return begPrefix == endPrefix; +} + bool StartsWith(UniString const & s, UniString const & p); bool StartsWith(std::string const & s1, char const * s2); diff --git a/search/keyword_matcher.cpp b/search/keyword_matcher.cpp index 42672f5119..073a9f80b8 100644 --- a/search/keyword_matcher.cpp +++ b/search/keyword_matcher.cpp @@ -1,11 +1,10 @@ #include "search/keyword_matcher.hpp" -#include "search/utils.hpp" - #include "indexer/search_delimiters.hpp" #include "indexer/search_string_utils.hpp" #include "base/stl_add.hpp" +#include "base/string_utils.hpp" #include "std/algorithm.hpp" #include "std/sstream.hpp" @@ -69,7 +68,7 @@ KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t coun bPrefixMatched = false; for (int j = 0; j < count && !bPrefixMatched; ++j) if (!isNameTokenMatched[j] && - StartsWith(tokens[j].begin(), tokens[j].end(), m_prefix.begin(), m_prefix.end())) + strings::StartsWith(tokens[j].begin(), tokens[j].end(), m_prefix.begin(), m_prefix.end())) { isNameTokenMatched[j] = bPrefixMatched = true; int8_t const tokenMatchDistance = int(m_keywords.size()) - j; diff --git a/search/processor.cpp b/search/processor.cpp index b5e7c4e577..1382b9db5c 100644 --- a/search/processor.cpp +++ b/search/processor.cpp @@ -129,7 +129,7 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res } // Removes all full-token stop words from |params|. -// Does nothing if all tokens in |params| are stop words. +// Does nothing if all tokens in |params| are non-prefix stop words. void RemoveStopWordsIfNeeded(QueryParams & params) { size_t numStopWords = 0; @@ -409,16 +409,16 @@ TLocales Processor::GetCategoryLocales() const } template -void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) const +void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && toDo) const { - ::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward(todo)); + ::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward(toDo)); } template -void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const +void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && toDo) const { ::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories, - forward(todo)); + forward(toDo)); } void Processor::Search(SearchParams const & params, m2::RectD const & viewport) diff --git a/search/processor.hpp b/search/processor.hpp index c32d03f16d..3ad34f7ad1 100644 --- a/search/processor.hpp +++ b/search/processor.hpp @@ -139,10 +139,10 @@ protected: TLocales GetCategoryLocales() const; template - void ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) const; + void ForEachCategoryType(StringSliceBase const & slice, ToDo && toDo) const; template - void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const; + void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && toDo) const; m2::PointD GetPivotPoint() const; m2::RectD GetPivotRect() const; diff --git a/search/ranker.cpp b/search/ranker.cpp index 84b4048f2c..b2e708a162 100644 --- a/search/ranker.cpp +++ b/search/ranker.cpp @@ -8,6 +8,7 @@ #include "indexer/feature_algo.hpp" #include "base/logging.hpp" +#include "base/string_utils.hpp" #include "std/algorithm.hpp" #include "std/iterator.hpp" @@ -433,7 +434,7 @@ void Ranker::MatchForSuggestions(strings::UniString const & token, int8_t locale if ((suggest.m_prefixLength <= token.size()) && (token != s) && // do not push suggestion if it already equals to token (suggest.m_locale == locale) && // push suggestions only for needed language - StartsWith(s.begin(), s.end(), token.begin(), token.end())) + strings::StartsWith(s.begin(), s.end(), token.begin(), token.end())) { string const utf8Str = strings::ToUtf8(s); Result r(utf8Str, prologue + utf8Str + " "); diff --git a/search/utils.hpp b/search/utils.hpp index 919a18e907..717f100eb9 100644 --- a/search/utils.hpp +++ b/search/utils.hpp @@ -18,8 +18,6 @@ namespace search { -namespace -{ // my::MemTrie // todo(@m, @y). Unite with the similar function in search/feature_offset_match.hpp. template @@ -65,7 +63,6 @@ bool MatchInTrie(Trie const & trie, DFA const & dfa, ToDo && toDo) return found; } -} // namespace using TLocales = buffer_vector; @@ -113,15 +110,4 @@ void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & lo } } } - -template -bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix) -{ - while (beg != end && begPrefix != endPrefix && *beg == *begPrefix) - { - ++beg; - ++begPrefix; - } - return begPrefix == endPrefix; -} } // namespace search From a1f4ac5953e858dc4199a54e0eef81fde41ffaa2 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Mon, 13 Feb 2017 18:59:55 +0300 Subject: [PATCH 3/4] [indexer] United all tries-by-locale into one trie. --- indexer/categories_holder.cpp | 9 ++++----- indexer/categories_holder.hpp | 19 ++++++------------- search/utils.hpp | 27 ++++++++++++++++++--------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/indexer/categories_holder.cpp b/indexer/categories_holder.cpp index 382d21f7f9..f4ef385381 100644 --- a/indexer/categories_holder.cpp +++ b/indexer/categories_holder.cpp @@ -203,6 +203,8 @@ void CategoriesHolder::AddCategory(Category & cat, vector & types) auto const locale = synonym.m_locale; ASSERT_NOT_EQUAL(locale, kUnsupportedLocaleCode, ()); + auto const localePrefix = String(1, static_cast(locale)); + auto const uniName = search::NormalizeAndSimplifyString(synonym.m_name); vector tokens; @@ -213,10 +215,7 @@ void CategoriesHolder::AddCategory(Category & cat, vector & types) if (!ValidKeyToken(token)) continue; for (uint32_t const t : types) - { - auto it = m_name2type.emplace(locale, make_unique()).first; - it->second->Add(token, t); - } + m_name2type->Add(localePrefix + token, t); } } } @@ -243,7 +242,7 @@ bool CategoriesHolder::ValidKeyToken(String const & s) void CategoriesHolder::LoadFromStream(istream & s) { m_type2cat.clear(); - m_name2type.clear(); + m_name2type = make_unique(); m_groupTranslations.clear(); State state = EParseTypes; diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp index b40982e001..95050ca09e 100644 --- a/indexer/categories_holder.hpp +++ b/indexer/categories_holder.hpp @@ -56,7 +56,8 @@ private: Type2CategoryCont m_type2cat; // Maps locale and category token to the list of corresponding types. - map> m_name2type; + // Locale is treated as a special symbol prepended to the token. + unique_ptr m_name2type; GroupTranslations m_groupTranslations; @@ -109,10 +110,9 @@ public: template void ForEachTypeByName(int8_t locale, String const & name, ToDo && toDo) const { - auto const it = m_name2type.find(locale); - if (it == m_name2type.end()) - return; - it->second->ForEachInNode(name, my::MakeIgnoreFirstArgument(forward(toDo))); + auto const localePrefix = String(1, static_cast(locale)); + m_name2type->ForEachInNode(localePrefix + name, + my::MakeIgnoreFirstArgument(forward(toDo))); } inline GroupTranslations const & GetGroupTranslations() const { return m_groupTranslations; } @@ -126,14 +126,7 @@ public: string GetReadableFeatureType(uint32_t type, int8_t locale) const; // Exposes the tries that map category tokens to types. - Trie const * GetNameToTypesTrie(int8_t locale) const - { - auto const it = m_name2type.find(locale); - if (it == m_name2type.end()) - return nullptr; - return it->second.get(); - } - + Trie const & GetNameToTypesTrie() const { return *m_name2type; } bool IsTypeExist(uint32_t type) const; inline void Swap(CategoriesHolder & r) diff --git a/search/utils.hpp b/search/utils.hpp index 717f100eb9..6f2b6ac5e9 100644 --- a/search/utils.hpp +++ b/search/utils.hpp @@ -15,13 +15,14 @@ #include #include #include +#include namespace search { -// my::MemTrie // todo(@m, @y). Unite with the similar function in search/feature_offset_match.hpp. template -bool MatchInTrie(Trie const & trie, DFA const & dfa, ToDo && toDo) +bool MatchInTrie(Trie const & /* trie */, typename Trie::Iterator const & trieStartIt, + DFA const & dfa, ToDo && toDo) { using Char = typename Trie::Char; using TrieIt = typename Trie::Iterator; @@ -34,7 +35,7 @@ bool MatchInTrie(Trie const & trie, DFA const & dfa, ToDo && toDo) auto it = dfa.Begin(); if (it.Rejects()) return false; - q.emplace(trie.GetRootIterator(), it); + q.emplace(trieStartIt, it); } bool found = false; @@ -97,17 +98,25 @@ template void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & locales, CategoriesHolder const & categories, ToDo && todo) { + using Trie = my::MemTrie; + + auto const & trie = categories.GetNameToTypesTrie(); + auto const & trieRootIt = trie.GetRootIterator(); + std::set localeSet(locales.begin(), locales.end()); + for (size_t i = 0; i < slice.Size(); ++i) { auto const & token = slice.Get(i); auto const & dfa = strings::LevenshteinDFA(token, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(token)); - for (int8_t const locale : locales) - { - auto const * trie = categories.GetNameToTypesTrie(locale); - if (trie != nullptr) - MatchInTrie(*trie, dfa, std::bind(todo, i, std::placeholders::_1)); - } + + trieRootIt.ForEachMove([&](Trie::Char const & c, Trie::Iterator const & moveIt) { + if (localeSet.count(static_cast(c)) != 0) + { + MatchInTrie(trie /* passed to infer the iterator's type */, moveIt, dfa, + std::bind(todo, i, std::placeholders::_1)); + } + }); } } } // namespace search From 0ddbe71640f6b4b65221761204306711dd0158e1 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Tue, 14 Feb 2017 17:38:13 +0300 Subject: [PATCH 4/4] Review fixes. --- base/mem_trie.hpp | 14 +++++++++++++- indexer/categories_holder.cpp | 6 +++--- indexer/categories_holder.hpp | 9 +++++---- search/utils.hpp | 13 ++++++++----- 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/base/mem_trie.hpp b/base/mem_trie.hpp index 6bd161eed9..efc5a90cc0 100644 --- a/base/mem_trie.hpp +++ b/base/mem_trie.hpp @@ -29,7 +29,7 @@ public: { m_root = std::move(rhs.m_root); m_numNodes = rhs.m_numNodes; - rhs.m_numNodes = 1; + rhs.Clear(); return *this; } @@ -103,6 +103,12 @@ public: ForEachInSubtree(*root, prefix, std::forward(toDo)); } + void Clear() + { + m_root.Clear(); + m_numNodes = 1; + } + size_t GetNumNodes() const { return m_numNodes; } Iterator GetRootIterator() const { return Iterator(m_root); } Node const & GetRoot() const { return m_root; } @@ -134,6 +140,12 @@ private: void AddValue(Value const & value) { m_values.push_back(value); } + void Clear() + { + m_moves.clear(); + m_values.clear(); + } + std::map> m_moves; std::vector m_values; diff --git a/indexer/categories_holder.cpp b/indexer/categories_holder.cpp index f4ef385381..30771f4659 100644 --- a/indexer/categories_holder.cpp +++ b/indexer/categories_holder.cpp @@ -69,7 +69,7 @@ bool ParseEmoji(CategoriesHolder::Category::Name & name) return false; } - name.m_name = ToUtf8(UniString(1, static_cast(c))); + name.m_name = ToUtf8(UniString(1 /* numChars */, static_cast(c))); if (IsASCIIString(ToUtf8(search::NormalizeAndSimplifyString(name.m_name)))) { @@ -215,7 +215,7 @@ void CategoriesHolder::AddCategory(Category & cat, vector & types) if (!ValidKeyToken(token)) continue; for (uint32_t const t : types) - m_name2type->Add(localePrefix + token, t); + m_name2type.Add(localePrefix + token, t); } } } @@ -242,7 +242,7 @@ bool CategoriesHolder::ValidKeyToken(String const & s) void CategoriesHolder::LoadFromStream(istream & s) { m_type2cat.clear(); - m_name2type = make_unique(); + m_name2type.Clear(); m_groupTranslations.clear(); State state = EParseTypes; diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp index 95050ca09e..03c773af09 100644 --- a/indexer/categories_holder.hpp +++ b/indexer/categories_holder.hpp @@ -4,6 +4,7 @@ #include "base/stl_helpers.hpp" #include "base/string_utils.hpp" +#include "std/algorithm.hpp" #include "std/deque.hpp" #include "std/iostream.hpp" #include "std/map.hpp" @@ -57,7 +58,7 @@ private: // Maps locale and category token to the list of corresponding types. // Locale is treated as a special symbol prepended to the token. - unique_ptr m_name2type; + Trie m_name2type; GroupTranslations m_groupTranslations; @@ -111,7 +112,7 @@ public: void ForEachTypeByName(int8_t locale, String const & name, ToDo && toDo) const { auto const localePrefix = String(1, static_cast(locale)); - m_name2type->ForEachInNode(localePrefix + name, + m_name2type.ForEachInNode(localePrefix + name, my::MakeIgnoreFirstArgument(forward(toDo))); } @@ -126,13 +127,13 @@ public: string GetReadableFeatureType(uint32_t type, int8_t locale) const; // Exposes the tries that map category tokens to types. - Trie const & GetNameToTypesTrie() const { return *m_name2type; } + Trie const & GetNameToTypesTrie() const { return m_name2type; } bool IsTypeExist(uint32_t type) const; inline void Swap(CategoriesHolder & r) { m_type2cat.swap(r.m_type2cat); - m_name2type.swap(r.m_name2type); + std::swap(m_name2type, r.m_name2type); } // Converts any language |locale| from UI to the corresponding diff --git a/search/utils.hpp b/search/utils.hpp index 6f2b6ac5e9..e64fc20a5c 100644 --- a/search/utils.hpp +++ b/search/utils.hpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include namespace search { @@ -102,16 +102,19 @@ void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & lo auto const & trie = categories.GetNameToTypesTrie(); auto const & trieRootIt = trie.GetRootIterator(); - std::set localeSet(locales.begin(), locales.end()); + vector sortedLocales(locales.begin(), locales.end()); + my::SortUnique(sortedLocales); for (size_t i = 0; i < slice.Size(); ++i) { auto const & token = slice.Get(i); - auto const & dfa = - strings::LevenshteinDFA(token, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(token)); + // todo(@m, @y). We build dfa twice for each token: here and in geocoder.cpp. + // A possible optimization is to build each dfa once and save it. Note that + // dfas for the prefix tokens differ, i.e. we ignore slice.IsPrefix(i) here. + strings::LevenshteinDFA const dfa(token, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(token)); trieRootIt.ForEachMove([&](Trie::Char const & c, Trie::Iterator const & moveIt) { - if (localeSet.count(static_cast(c)) != 0) + if (std::binary_search(sortedLocales.begin(), sortedLocales.end(), static_cast(c))) { MatchInTrie(trie /* passed to infer the iterator's type */, moveIt, dfa, std::bind(todo, i, std::placeholders::_1));