From 28bad4ed8b1f421088a9287ca86081e76ce6a59a Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Mon, 27 Feb 2017 12:40:20 +0300 Subject: [PATCH 1/2] [search] Use MemTrie for street synonyms. --- base/base_tests/mem_trie_test.cpp | 2 +- base/mem_trie.hpp | 125 +++++++++++++------ indexer/categories_holder.hpp | 2 +- indexer/categories_index.cpp | 8 +- indexer/categories_index.hpp | 2 +- indexer/search_string_utils.cpp | 84 +++++++++++-- search/search_tests/locality_scorer_test.cpp | 2 +- search/utils.hpp | 17 +-- 8 files changed, 179 insertions(+), 63 deletions(-) diff --git a/base/base_tests/mem_trie_test.cpp b/base/base_tests/mem_trie_test.cpp index 42e4c627f6..1c4bb3ac9b 100644 --- a/base/base_tests/mem_trie_test.cpp +++ b/base/base_tests/mem_trie_test.cpp @@ -9,7 +9,7 @@ namespace { -using Trie = my::MemTrie; +using Trie = my::MemTrie>; using Data = std::vector>; void GetTrieContents(Trie const & trie, Data & data) diff --git a/base/mem_trie.hpp b/base/mem_trie.hpp index efc5a90cc0..b4ceabf244 100644 --- a/base/mem_trie.hpp +++ b/base/mem_trie.hpp @@ -5,15 +5,76 @@ #include #include +#include #include #include #include namespace my { +template +class MapMoves +{ +public: + template + void ForEach(ToDo && toDo) const + { + for (auto const & subtree : m_subtrees) + toDo(subtree.first, *subtree.second); + } + + SubTree * GetSubTree(Char const & c) const + { + auto const it = m_subtrees.find(c); + if (it == m_subtrees.end()) + return nullptr; + return it->second.get(); + } + + SubTree & GetOrCreateSubTree(Char const & c, bool & created) + { + auto & node = m_subtrees[c]; + if (!node) + { + node = my::make_unique(); + created = true; + } + else + { + created = false; + } + return *node; + } + + void Clear() { m_subtrees.clear(); } + +private: + std::map> m_subtrees; +}; + +template +struct VectorValues +{ + template + void Add(V && v) + { + m_values.push_back(std::forward(v)); + } + + template + void ForEach(ToDo && toDo) const + { + std::for_each(m_values.begin(), m_values.end(), std::forward(toDo)); + } + + void Clear() { m_values.clear(); } + + std::vector m_values; +}; + // This class is a simple in-memory trie which allows to add // key-value pairs and then traverse them in a sorted order. -template +template class Moves = MapMoves> class MemTrie { private: @@ -38,6 +99,8 @@ public: class Iterator { public: + using Char = MemTrie::Char; + Iterator(MemTrie::Node const & node) : m_node(node) {} // Iterates over all possible moves from this Iterator's node @@ -46,15 +109,14 @@ public: template void ForEachMove(ToDo && toDo) const { - for (auto const & move : m_node.m_moves) - toDo(move.first, Iterator(*move.second)); + m_node.m_moves.ForEach([&](Char c, Node const & node) { toDo(c, Iterator(node)); }); } // Calls |toDo| for every value in this Iterator's node. template void ForEachInNode(ToDo && toDo) const { - std::for_each(m_node.m_values.begin(), m_node.m_values.end(), std::forward(toDo)); + m_node.m_values.ForEach(std::forward(toDo)); } private: @@ -62,7 +124,8 @@ public: }; // Adds a key-value pair to the trie. - void Add(String const & key, Value const & value) + template + void Add(String const & key, Value && value) { auto * cur = &m_root; for (auto const & c : key) @@ -72,7 +135,7 @@ public: if (created) ++m_numNodes; } - cur->AddValue(value); + cur->Add(std::forward(value)); } // Traverses all key-value pairs in the trie and calls |toDo| on each of them. @@ -116,8 +179,6 @@ public: private: struct Node { - friend class MemTrie::Iterator; - Node() = default; Node(Node && /* rhs */) = default; @@ -125,29 +186,23 @@ private: Node & GetMove(Char const & c, bool & created) { - auto & node = m_moves[c]; - if (!node) - { - node = my::make_unique(); - created = true; - } - else - { - created = false; - } - return *node; + return m_moves.GetOrCreateSubTree(c, created); } - void AddValue(Value const & value) { m_values.push_back(value); } + template + void Add(Value && value) + { + m_values.Add(std::forward(value)); + } void Clear() { - m_moves.clear(); - m_values.clear(); + m_moves.Clear(); + m_values.Clear(); } - std::map> m_moves; - std::vector m_values; + Moves m_moves; + Values m_values; DISALLOW_COPY(Node); }; @@ -157,10 +212,9 @@ private: auto const * cur = &m_root; for (auto const & c : key) { - auto const it = cur->m_moves.find(c); - if (it == cur->m_moves.end()) - return nullptr; - cur = it->second.get(); + cur = cur->m_moves.GetSubTree(c); + if (!cur) + break; } return cur; } @@ -170,8 +224,7 @@ private: template void ForEachInNode(Node const & node, String const & prefix, ToDo && toDo) const { - for (auto const & value : node.m_values) - toDo(prefix, value); + node.m_values.ForEach(std::bind(std::forward(toDo), prefix, std::placeholders::_1)); } // Calls |toDo| for each key-value pair in subtree where |node| is a @@ -182,12 +235,12 @@ private: { ForEachInNode(node, prefix, toDo); - for (auto const & move : node.m_moves) - { - prefix.push_back(move.first); - ForEachInSubtree(*move.second, prefix, toDo); - prefix.pop_back(); - } + node.m_moves.ForEach([&](Char c, Node const & node) + { + prefix.push_back(c); + ForEachInSubtree(node, prefix, toDo); + prefix.pop_back(); + }); } Node m_root; diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp index 7475cdf235..d46fd47615 100644 --- a/indexer/categories_holder.hpp +++ b/indexer/categories_holder.hpp @@ -52,7 +52,7 @@ public: private: using String = strings::UniString; using Type2CategoryCont = multimap>; - using Trie = my::MemTrie; + using Trie = my::MemTrie>; Type2CategoryCont m_type2cat; diff --git a/indexer/categories_index.cpp b/indexer/categories_index.cpp index b57559a723..29e41e74e8 100644 --- a/indexer/categories_index.cpp +++ b/indexer/categories_index.cpp @@ -12,8 +12,8 @@ namespace { -void AddAllNonemptySubstrings(my::MemTrie & trie, string const & s, - uint32_t value) +void AddAllNonemptySubstrings(my::MemTrie> & trie, + string const & s, uint32_t value) { ASSERT(!s.empty(), ()); for (size_t i = 0; i < s.length(); ++i) @@ -37,8 +37,8 @@ void ForEachToken(string const & s, TF && fn) fn(strings::ToUtf8(token)); } -void TokenizeAndAddAllSubstrings(my::MemTrie & trie, string const & s, - uint32_t value) +void TokenizeAndAddAllSubstrings(my::MemTrie> & trie, + string const & s, uint32_t value) { auto fn = [&](string const & token) { diff --git a/indexer/categories_index.hpp b/indexer/categories_index.hpp index a95f1dc2c3..498fbfb489 100644 --- a/indexer/categories_index.hpp +++ b/indexer/categories_index.hpp @@ -69,6 +69,6 @@ private: // here because this class may be used from Objectvie-C // so a default constructor is needed. CategoriesHolder const * m_catHolder = nullptr; - my::MemTrie m_trie; + my::MemTrie> m_trie; }; } // namespace indexer diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp index 216b5b59b8..0503d2a591 100644 --- a/indexer/search_string_utils.cpp +++ b/indexer/search_string_utils.cpp @@ -1,7 +1,9 @@ #include "indexer/search_string_utils.hpp" #include "indexer/string_set.hpp" +#include "base/assert.hpp" #include "base/macros.hpp" +#include "base/mem_trie.hpp" #include "std/algorithm.hpp" @@ -94,11 +96,67 @@ char const * kStreetTokensSeparator = "\t -,."; /// @todo Move prefixes, suffixes into separate file (autogenerated). /// It's better to distinguish synonyms comparison according to language/region. - class StreetsSynonymsHolder { public: - using TStrings = search::StringSet; + struct BooleanSum + { + void Add(bool value) { m_value = m_value || value; } + + template + void ForEach(ToDo && toDo) const + { + if (m_value) + toDo(m_value); + } + + void Clear() { m_value = false; } + + bool m_value = false; + }; + + template + class Moves + { + public: + template + void ForEach(ToDo && toDo) const + { + for (auto const & subtree : m_subtrees) + toDo(subtree.first, *subtree.second); + } + + SubTree * GetSubTree(Char const & c) const + { + for (auto const & subtree : m_subtrees) + { + if (subtree.first == c) + return subtree.second.get(); + } + return nullptr; + } + + SubTree & GetOrCreateSubTree(Char const & c, bool & created) + { + for (size_t i = 0; i < m_subtrees.size(); ++i) + { + if (m_subtrees[i].first == c) + { + created = false; + return *m_subtrees[i].second; + } + } + + created = true; + m_subtrees.emplace_back(c, make_unique()); + return *m_subtrees.back().second; + } + + void Clear() { m_subtrees.clear(); } + + private: + buffer_vector>, 8> m_subtrees; + }; StreetsSynonymsHolder() { @@ -168,24 +226,34 @@ public: for (auto const * s : affics) { UniString const us = NormalizeAndSimplifyString(s); - m_strings.Add(us.begin(), us.end()); + m_strings.Add(us, true); } } bool MatchPrefix(UniString const & s) const { - auto const status = m_strings.Has(s.begin(), s.end()); - return status != TStrings::Status::Absent; + bool found = false; + m_strings.ForEachInNode(s, [&](UniString const & prefix, bool value) { + ASSERT_EQUAL(s, prefix, ()); + ASSERT(value, ()); + found = true; + }); + return found; } bool FullMatch(UniString const & s) const { - auto const status = m_strings.Has(s.begin(), s.end()); - return status == TStrings::Status::Full; + bool found = false; + m_strings.ForEachInNode(s, [&](UniString const & prefix, bool value) { + ASSERT_EQUAL(s, prefix, ()); + ASSERT(value, ()); + found = value; + }); + return found; } private: - TStrings m_strings; + my::MemTrie m_strings; }; StreetsSynonymsHolder g_streets; diff --git a/search/search_tests/locality_scorer_test.cpp b/search/search_tests/locality_scorer_test.cpp index 8580ca7e0c..2b96945193 100644 --- a/search/search_tests/locality_scorer_test.cpp +++ b/search/search_tests/locality_scorer_test.cpp @@ -115,7 +115,7 @@ protected: unordered_map> m_names; LocalityScorer m_scorer; - my::MemTrie m_searchIndex; + my::MemTrie> m_searchIndex; }; } // namespace diff --git a/search/utils.hpp b/search/utils.hpp index aaee404d91..4fb6b17a36 100644 --- a/search/utils.hpp +++ b/search/utils.hpp @@ -19,12 +19,10 @@ namespace search { // todo(@m, @y). Unite with the similar function in search/feature_offset_match.hpp. -template -bool MatchInTrie(Trie const & /* trie */, typename Trie::Iterator const & trieStartIt, - DFA const & dfa, ToDo && toDo) +template +bool MatchInTrie(TrieIt const & trieStartIt, DFA const & dfa, ToDo && toDo) { - using Char = typename Trie::Char; - using TrieIt = typename Trie::Iterator; + using Char = typename TrieIt::Char; using DFAIt = typename DFA::Iterator; using State = pair; @@ -97,7 +95,7 @@ template void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & locales, CategoriesHolder const & categories, ToDo && todo) { - using Trie = my::MemTrie; + using Trie = my::MemTrie>; auto const & trie = categories.GetNameToTypesTrie(); auto const & trieRootIt = trie.GetRootIterator(); @@ -112,12 +110,9 @@ void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & lo // dfas for the prefix tokens differ, i.e. we ignore slice.IsPrefix(i) here. strings::LevenshteinDFA const dfa(token, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(token)); - trieRootIt.ForEachMove([&](Trie::Char const & c, Trie::Iterator const & moveIt) { + trieRootIt.ForEachMove([&](Trie::Char const & c, Trie::Iterator const & trieStartIt) { if (std::binary_search(sortedLocales.begin(), sortedLocales.end(), static_cast(c))) - { - MatchInTrie(trie /* passed to infer the iterator's type */, moveIt, dfa, - std::bind(todo, i, std::placeholders::_1)); - } + MatchInTrie(trieStartIt, dfa, std::bind(todo, i, std::placeholders::_1)); }); } } From 79076088879f631c8c6f5b3f738499fa478a4122 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Mon, 27 Feb 2017 14:07:59 +0300 Subject: [PATCH 2/2] Review fixes. --- base/mem_trie.hpp | 19 ++++++++++--------- indexer/search_string_utils.cpp | 19 ++++++++----------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/base/mem_trie.hpp b/base/mem_trie.hpp index b4ceabf244..e19a7a71dc 100644 --- a/base/mem_trie.hpp +++ b/base/mem_trie.hpp @@ -12,7 +12,7 @@ namespace my { -template +template class MapMoves { public: @@ -23,7 +23,7 @@ public: toDo(subtree.first, *subtree.second); } - SubTree * GetSubTree(Char const & c) const + Subtree * GetSubtree(Char const & c) const { auto const it = m_subtrees.find(c); if (it == m_subtrees.end()) @@ -31,12 +31,12 @@ public: return it->second.get(); } - SubTree & GetOrCreateSubTree(Char const & c, bool & created) + Subtree & GetOrCreateSubtree(Char const & c, bool & created) { auto & node = m_subtrees[c]; if (!node) { - node = my::make_unique(); + node = my::make_unique(); created = true; } else @@ -49,7 +49,7 @@ public: void Clear() { m_subtrees.clear(); } private: - std::map> m_subtrees; + std::map> m_subtrees; }; template @@ -58,7 +58,7 @@ struct VectorValues template void Add(V && v) { - m_values.push_back(std::forward(v)); + m_values.emplace_back(std::forward(v)); } template @@ -186,7 +186,7 @@ private: Node & GetMove(Char const & c, bool & created) { - return m_moves.GetOrCreateSubTree(c, created); + return m_moves.GetOrCreateSubtree(c, created); } template @@ -212,7 +212,7 @@ private: auto const * cur = &m_root; for (auto const & c : key) { - cur = cur->m_moves.GetSubTree(c); + cur = cur->m_moves.GetSubtree(c); if (!cur) break; } @@ -224,7 +224,8 @@ private: template void ForEachInNode(Node const & node, String const & prefix, ToDo && toDo) const { - node.m_values.ForEach(std::bind(std::forward(toDo), prefix, std::placeholders::_1)); + node.m_values.ForEach( + std::bind(std::forward(toDo), std::ref(prefix), std::placeholders::_1)); } // Calls |toDo| for each key-value pair in subtree where |node| is a diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp index 0503d2a591..eb73e314a0 100644 --- a/indexer/search_string_utils.cpp +++ b/indexer/search_string_utils.cpp @@ -106,8 +106,7 @@ public: template void ForEach(ToDo && toDo) const { - if (m_value) - toDo(m_value); + toDo(m_value); } void Clear() { m_value = false; } @@ -115,7 +114,7 @@ public: bool m_value = false; }; - template + template class Moves { public: @@ -126,7 +125,7 @@ public: toDo(subtree.first, *subtree.second); } - SubTree * GetSubTree(Char const & c) const + Subtree * GetSubtree(Char const & c) const { for (auto const & subtree : m_subtrees) { @@ -136,7 +135,7 @@ public: return nullptr; } - SubTree & GetOrCreateSubTree(Char const & c, bool & created) + Subtree & GetOrCreateSubtree(Char const & c, bool & created) { for (size_t i = 0; i < m_subtrees.size(); ++i) { @@ -148,14 +147,14 @@ public: } created = true; - m_subtrees.emplace_back(c, make_unique()); + m_subtrees.emplace_back(c, make_unique()); return *m_subtrees.back().second; } void Clear() { m_subtrees.clear(); } private: - buffer_vector>, 8> m_subtrees; + buffer_vector>, 8> m_subtrees; }; StreetsSynonymsHolder() @@ -226,16 +225,15 @@ public: for (auto const * s : affics) { UniString const us = NormalizeAndSimplifyString(s); - m_strings.Add(us, true); + m_strings.Add(us, true /* end of string */); } } bool MatchPrefix(UniString const & s) const { bool found = false; - m_strings.ForEachInNode(s, [&](UniString const & prefix, bool value) { + m_strings.ForEachInNode(s, [&](UniString const & prefix, bool /* value */) { ASSERT_EQUAL(s, prefix, ()); - ASSERT(value, ()); found = true; }); return found; @@ -246,7 +244,6 @@ public: bool found = false; m_strings.ForEachInNode(s, [&](UniString const & prefix, bool value) { ASSERT_EQUAL(s, prefix, ()); - ASSERT(value, ()); found = value; }); return found;