diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index 89acc66ef7..e71d7f4a6b 100644 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -20,10 +20,13 @@ set( const_helper.hpp deferred_task.cpp deferred_task.hpp + dfa_helpers.hpp exception.cpp exception.hpp gmtime.cpp gmtime.hpp + levenshtein_dfa.cpp + levenshtein_dfa.hpp limited_priority_queue.hpp logging.cpp logging.hpp @@ -71,12 +74,15 @@ set( threaded_container.hpp threaded_list.hpp threaded_priority_queue.hpp + internal/message.cpp + internal/message.hpp timegm.cpp timegm.hpp timer.cpp timer.hpp + uni_string_dfa.cpp + uni_string_dfa.hpp worker_thread.hpp - internal/message.cpp ) add_library(${PROJECT_NAME} ${SRC}) diff --git a/base/base.pro b/base/base.pro index 9f3fe820b7..31f3f5bd88 100644 --- a/base/base.pro +++ b/base/base.pro @@ -32,6 +32,7 @@ SOURCES += \ time_samples.cpp \ timegm.cpp \ timer.cpp \ + uni_string_dfa.cpp \ HEADERS += \ SRC_FIRST.hpp \ @@ -46,9 +47,10 @@ HEADERS += \ condition.hpp \ const_helper.hpp \ deferred_task.hpp \ + dfa_helpers.hpp \ exception.hpp \ gmtime.hpp \ - internal/messagex.hpp \ + internal/message.hpp \ levenshtein_dfa.hpp \ limited_priority_queue.hpp \ logging.hpp \ @@ -87,4 +89,5 @@ HEADERS += \ time_samples.hpp \ timegm.hpp \ timer.hpp \ + uni_string_dfa.hpp \ worker_thread.hpp \ diff --git a/base/base_tests/CMakeLists.txt b/base/base_tests/CMakeLists.txt index 3120b52052..0fa0ec6b3e 100644 --- a/base/base_tests/CMakeLists.txt +++ b/base/base_tests/CMakeLists.txt @@ -12,6 +12,7 @@ set( condition_test.cpp const_helper.cpp containers_test.cpp + levenshtein_dfa_test.cpp logging_test.cpp math_test.cpp matrix_test.cpp @@ -33,6 +34,7 @@ set( threads_test.cpp timegm_test.cpp timer_test.cpp + uni_string_dfa_test.cpp worker_thread_test.cpp ) diff --git a/base/base_tests/base_tests.pro b/base/base_tests/base_tests.pro index 1a4f2bb9ec..c8708b84d4 100644 --- a/base/base_tests/base_tests.pro +++ b/base/base_tests/base_tests.pro @@ -44,6 +44,7 @@ SOURCES += \ threads_test.cpp 
\ timegm_test.cpp \ timer_test.cpp \ + uni_string_dfa_test.cpp \ worker_thread_test.cpp \ HEADERS += diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp index c671b1aa1e..f3cd896fa2 100644 --- a/base/base_tests/levenshtein_dfa_test.cpp +++ b/base/base_tests/levenshtein_dfa_test.cpp @@ -1,5 +1,6 @@ #include "testing/testing.hpp" +#include "base/dfa_helpers.hpp" #include "base/levenshtein_dfa.hpp" #include "std/string.hpp" @@ -18,7 +19,7 @@ enum class Status Status GetStatus(LevenshteinDFA const & dfa, string const & s) { auto it = dfa.Begin(); - it.Move(s); + DFAMove(it, s); if (it.Accepts()) return Status::Accepts; if (it.Rejects()) @@ -86,5 +87,10 @@ UNIT_TEST(LevenshteinDFA_Smoke) TEST(Accepts(dfa, "ленигнрадский"), ()); TEST(Rejects(dfa, "ленинский"), ()); } + + { + LevenshteinDFA dfa("atm", 1 /* maxErrors */); + TEST(Rejects(dfa, "san"), ()); + } } } // namespace diff --git a/base/base_tests/uni_string_dfa_test.cpp b/base/base_tests/uni_string_dfa_test.cpp new file mode 100644 index 0000000000..33191e18dd --- /dev/null +++ b/base/base_tests/uni_string_dfa_test.cpp @@ -0,0 +1,90 @@ +#include "testing/testing.hpp" + +#include "base/dfa_helpers.hpp" +#include "base/string_utils.hpp" +#include "base/uni_string_dfa.hpp" + +using namespace strings; + +namespace +{ +UNIT_TEST(UniStringDFA_Smoke) +{ + { + UniStringDFA dfa(""); + + auto it = dfa.Begin(); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "a"); + TEST(!it.Accepts(), ()); + TEST(it.Rejects(), ()); + } + + { + UniStringDFA dfa("абв"); + + auto it = dfa.Begin(); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "а"); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "б"); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "в"); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "г"); + TEST(!it.Accepts(), ()); + TEST(it.Rejects(), ()); + } + + { + UniStringDFA 
dfa("абв"); + + auto it = dfa.Begin(); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "а"); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "г"); + TEST(!it.Accepts(), ()); + TEST(it.Rejects(), ()); + } +} + +UNIT_TEST(UniStringDFA_Prefix) +{ + { + PrefixDFAModifier dfa(UniStringDFA("abc")); + + auto it = dfa.Begin(); + DFAMove(it, "ab"); + + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "c"); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "d"); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "efghijk"); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + } +} +} // namespace diff --git a/base/dfa_helpers.hpp b/base/dfa_helpers.hpp new file mode 100644 index 0000000000..5815f380f9 --- /dev/null +++ b/base/dfa_helpers.hpp @@ -0,0 +1,66 @@ +#pragma once + +#include "base/string_utils.hpp" + +#include "std/string.hpp" + +namespace strings +{ +template +class PrefixDFAModifier +{ +public: + class Iterator + { + public: + Iterator & Move(strings::UniChar c) + { + if (Accepts() || Rejects()) + return *this; + + m_it.Move(c); + if (m_it.Accepts()) + m_accepts = true; + + return *this; + } + + bool Accepts() const { return m_accepts; } + bool Rejects() const { return !Accepts() && m_it.Rejects(); } + + private: + friend class PrefixDFAModifier; + + Iterator(typename DFA::Iterator it) : m_it(it), m_accepts(m_it.Accepts()) {} + + typename DFA::Iterator m_it; + bool m_accepts; + }; + + explicit PrefixDFAModifier(DFA const & dfa) : m_dfa(dfa) {} + + Iterator Begin() const { return Iterator(m_dfa.Begin()); } + +private: + DFA const m_dfa; +}; + +template +void DFAMove(DFAIt & it, It begin, It end) +{ + for (; begin != end; ++begin) + it.Move(*begin); +} + +template +void DFAMove(DFAIt & it, UniString const & s) +{ + DFAMove(it, s.begin(), s.end()); +} + +template +void DFAMove(DFAIt & it, string const & s) +{ + DFAMove(it, MakeUniString(s)); +} +} // namespace 
strings diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp index cef354c458..bdf5ca5b30 100644 --- a/base/levenshtein_dfa.hpp +++ b/base/levenshtein_dfa.hpp @@ -71,29 +71,13 @@ public: return *this; } - inline Iterator & Move(UniString const & s) { return Move(s.begin(), s.end()); } - - inline Iterator & Move(string const & s) - { - UniString us = MakeUniString(s); - return Move(us); - } - - template - Iterator & Move(It begin, It end) - { - for (; begin != end; ++begin) - Move(*begin); - return *this; - } - bool Accepts() const { return m_dfa.IsAccepting(m_s); } bool Rejects() const { return m_dfa.IsRejecting(m_s); } private: friend class LevenshteinDFA; - Iterator(LevenshteinDFA const & dfa) : m_s(kStartingState), m_dfa(dfa) {} + explicit Iterator(LevenshteinDFA const & dfa) : m_s(kStartingState), m_dfa(dfa) {} size_t m_s; LevenshteinDFA const & m_dfa; diff --git a/base/string_utils.hpp b/base/string_utils.hpp index 114ce0f5d8..7875c55890 100644 --- a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -484,40 +484,45 @@ void ForEachMatched(string const & s, regex const & regex, TFn && fn) fn(*cur); } -// Computes the minimum number of insertions, deletions and alterations -// of characters needed to transform one string into another. -// The function works in O(length1 * length2) time and memory -// where length1 and length2 are the lengths of the argument strings. -// See https://en.wikipedia.org/wiki/Levenshtein_distance. -// One string is [b1, e1) and the other is [b2, e2). The iterator -// form is chosen to fit both std::string and strings::UniString. -// This function does not normalize either of the strings. +// Computes the minimum number of insertions, deletions and +// alterations of characters needed to transform one string into +// another. The function works in O(length1 * length2) time and +// O(min(length1, length2)) memory where length1 and length2 are the +// lengths of the argument strings. 
See +// https://en.wikipedia.org/wiki/Levenshtein_distance. One string is +// [b1, e1) and the other is [b2, e2). The iterator form is chosen to +// fit both std::string and strings::UniString. This function does +// not normalize either of the strings. template size_t EditDistance(TIter const & b1, TIter const & e1, TIter const & b2, TIter const & e2) { size_t const n = distance(b1, e1); size_t const m = distance(b2, e2); - // |cur| and |prev| are current and previous rows of the + + if (m > n) + return EditDistance(b2, e2, b1, e1); + + // |curr| and |prev| are current and previous rows of the // dynamic programming table. vector prev(m + 1); - vector cur(m + 1); + vector curr(m + 1); for (size_t j = 0; j <= m; j++) prev[j] = j; auto it1 = b1; // 1-based to avoid corner cases. for (size_t i = 1; i <= n; ++i, ++it1) { - cur[0] = i; + curr[0] = i; auto const & c1 = *it1; auto it2 = b2; for (size_t j = 1; j <= m; ++j, ++it2) { auto const & c2 = *it2; - cur[j] = min(cur[j - 1], prev[j]) + 1; - cur[j] = min(cur[j], prev[j - 1] + (c1 == c2 ? 0 : 1)); + curr[j] = min(curr[j - 1], prev[j]) + 1; + curr[j] = min(curr[j], prev[j - 1] + (c1 == c2 ? 
0 : 1)); } - prev.swap(cur); + prev.swap(curr); } return prev[m]; } diff --git a/base/uni_string_dfa.cpp b/base/uni_string_dfa.cpp new file mode 100644 index 0000000000..4106bb8717 --- /dev/null +++ b/base/uni_string_dfa.cpp @@ -0,0 +1,41 @@ +#include "base/uni_string_dfa.hpp" + +#include "base/assert.hpp" + +namespace strings +{ +// UniStringDFA::Iterator -------------------------------------------------------------------------- +UniStringDFA::Iterator::Iterator(UniString const & s) : m_s(s), m_pos(0), m_rejected(false) {} + +UniStringDFA::Iterator & UniStringDFA::Iterator::Move(UniChar c) +{ + if (Accepts()) + { + m_rejected = true; + return *this; + } + + if (Rejects()) + return *this; + + ASSERT_LESS(m_pos, m_s.size(), ()); + if (m_s[m_pos] != c) + { + m_rejected = true; + return *this; + } + + ++m_pos; + return *this; +} + +// UniStringDFA::UniStringDFA ---------------------------------------------------------------------- +UniStringDFA::UniStringDFA(UniString const & s) : m_s(s) {} + +UniStringDFA::UniStringDFA(string const & s): UniStringDFA(MakeUniString(s)) {} + +UniStringDFA::Iterator UniStringDFA::Begin() const +{ + return Iterator(m_s); +} +} // namespace strings diff --git a/base/uni_string_dfa.hpp b/base/uni_string_dfa.hpp new file mode 100644 index 0000000000..f7e75085a1 --- /dev/null +++ b/base/uni_string_dfa.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "base/logging.hpp" +#include "base/string_utils.hpp" + +#include "std/string.hpp" + +namespace strings +{ +class UniStringDFA +{ +public: + class Iterator + { + public: + Iterator & Move(UniChar c); + + inline bool Accepts() const { return !Rejects() && m_pos == m_s.size(); } + inline bool Rejects() const { return m_rejected; } + + private: + friend class UniStringDFA; + + Iterator(UniString const & s); + + UniString const & m_s; + size_t m_pos; + bool m_rejected; + }; + + explicit UniStringDFA(UniString const & s); + explicit UniStringDFA(string const & s); + + Iterator Begin() const; + +private: 
+ UniString const m_s; +}; +} // namespace strings diff --git a/search/feature_offset_match.hpp b/search/feature_offset_match.hpp index 26d411517e..8edd9292e5 100644 --- a/search/feature_offset_match.hpp +++ b/search/feature_offset_match.hpp @@ -7,12 +7,16 @@ #include "indexer/trie.hpp" +#include "base/dfa_helpers.hpp" +#include "base/levenshtein_dfa.hpp" #include "base/mutex.hpp" #include "base/scope_guard.hpp" #include "base/stl_add.hpp" #include "base/string_utils.hpp" +#include "base/uni_string_dfa.hpp" #include "std/algorithm.hpp" +#include "std/queue.hpp" #include "std/target_os.hpp" #include "std/unique_ptr.hpp" #include "std/unordered_set.hpp" @@ -23,90 +27,10 @@ namespace search { namespace impl { -template -size_t CalcEqualLength(TSrcIter b, TSrcIter e, TCompIter bC, TCompIter eC) -{ - size_t count = 0; - while ((b != e) && (bC != eC) && (*b++ == *bC++)) - ++count; - return count; -} - -template -inline shared_ptr>> MoveTrieIteratorToString( - trie::Iterator> const & trieRoot, strings::UniString const & queryS, - size_t & symbolsMatched, bool & bFullEdgeMatched) -{ - symbolsMatched = 0; - bFullEdgeMatched = false; - - auto it = trieRoot.Clone(); - - size_t const szQuery = queryS.size(); - - while (symbolsMatched < szQuery) - { - bool bMatched = false; - - ASSERT_LESS(it->m_edge.size(), std::numeric_limits::max(), ()); - uint32_t const edgeCount = static_cast(it->m_edge.size()); - - for (uint32_t i = 0; i < edgeCount; ++i) - { - size_t const szEdge = it->m_edge[i].m_label.size(); - - size_t const count = - CalcEqualLength(it->m_edge[i].m_label.begin(), it->m_edge[i].m_label.end(), - queryS.begin() + symbolsMatched, queryS.end()); - - if ((count > 0) && (count == szEdge || szQuery == count + symbolsMatched)) - { - it = it->GoToEdge(i); - - bFullEdgeMatched = (count == szEdge); - symbolsMatched += count; - bMatched = true; - break; - } - } - - if (!bMatched) - return NULL; - } - return it->Clone(); -} - namespace { -bool CheckMatchString(strings::UniChar 
const * rootPrefix, size_t rootPrefixSize, - strings::UniString & s, bool prefix) -{ - if (rootPrefixSize == 0) - return true; - - if (prefix && s.size() < rootPrefixSize && - StartsWith(rootPrefix, rootPrefix + rootPrefixSize, s.begin(), s.end())) - { - // In the case of prefix match query may be a prefix of the root - // label string. In this case we continue processing as if the - // string is equal to root label. - s.clear(); - return true; - } - if (s.size() >= rootPrefixSize && - StartsWith(s.begin(), s.end(), rootPrefix, rootPrefix + rootPrefixSize)) - { - // In both (prefix and not-prefix) cases when string has root label - // as a prefix, we continue processing. - s = strings::UniString(s.begin() + rootPrefixSize, s.end()); - return true; - } - - return false; -} - -template -bool FindLangIndex(trie::Iterator> const & trieRoot, uint8_t lang, uint32_t & langIx) +template +bool FindLangIndex(trie::Iterator> const & trieRoot, uint8_t lang, uint32_t & langIx) { ASSERT_LESS(trieRoot.m_edge.size(), numeric_limits::max(), ()); @@ -125,95 +49,82 @@ bool FindLangIndex(trie::Iterator> const & trieRoot, uint8_t l } } // namespace -template -void FullMatchInTrie(trie::Iterator> const & trieRoot, - strings::UniChar const * rootPrefix, size_t rootPrefixSize, - strings::UniString s, TF & f) +template +bool MatchInTrie(trie::Iterator> const & trieRoot, + strings::UniChar const * rootPrefix, size_t rootPrefixSize, DFAIt dfa, + ToDo && toDo) { - if (!CheckMatchString(rootPrefix, rootPrefixSize, s, false /* prefix */)) - return; + using TrieDFA = shared_ptr>>; + using State = pair; - size_t symbolsMatched = 0; - bool bFullEdgeMatched; - auto const it = MoveTrieIteratorToString(trieRoot, s, symbolsMatched, bFullEdgeMatched); + for (size_t i = 0; i < rootPrefixSize; ++i) + dfa.Move(rootPrefix[i]); - if (!it || (!s.empty() && !bFullEdgeMatched) || symbolsMatched != s.size()) - return; + if (dfa.Rejects()) + return false; -#if defined(OMIM_OS_IPHONE) && !defined(__clang__) - // 
Here is the dummy mutex to avoid mysterious iOS GCC-LLVM bug here. - static threads::Mutex dummyM; - threads::MutexGuard dummyG(dummyM); -#endif + queue q; + q.emplace(trieRoot.Clone(), dfa); - ASSERT_EQUAL(symbolsMatched, s.size(), ()); + bool found = false; - it->m_valueList.ForEach(f); -} - -template -void PrefixMatchInTrie(trie::Iterator> const & trieRoot, - strings::UniChar const * rootPrefix, size_t rootPrefixSize, - strings::UniString s, TF & f) -{ - if (!CheckMatchString(rootPrefix, rootPrefixSize, s, true /* prefix */)) - return; - - using TIterator = trie::Iterator>; - - using TQueue = vector>; - TQueue trieQueue; + while (!q.empty()) { - size_t symbolsMatched = 0; - bool bFullEdgeMatched; - auto const it = MoveTrieIteratorToString(trieRoot, s, symbolsMatched, bFullEdgeMatched); + auto const p = q.front(); + q.pop(); - UNUSED_VALUE(symbolsMatched); - UNUSED_VALUE(bFullEdgeMatched); + auto const & trie = p.first; + auto const & dfa = p.second; - if (!it) - return; + if (dfa.Accepts()) + { + trie->m_valueList.ForEach(toDo); + found = true; + } - trieQueue.push_back(it); + size_t const numEdges = trie->m_edge.size(); + for (size_t i = 0; i < numEdges; ++i) + { + auto const & edge = trie->m_edge[i]; + + auto cur = dfa; + for (auto c : edge.m_label) + cur.Move(c); + + if (!cur.Rejects()) + q.emplace(trie->GoToEdge(i), cur); + } } - while (!trieQueue.empty()) - { - auto const it = trieQueue.back(); - trieQueue.pop_back(); - - it->m_valueList.ForEach(f); - - for (size_t i = 0; i < it->m_edge.size(); ++i) - trieQueue.push_back(it->GoToEdge(i)); - } + return found; } -template +template class OffsetIntersecter { - struct HashFn + struct Hash { - size_t operator()(TValue const & v) const { return v.m_featureId; } + size_t operator()(Value const & v) const { return v.m_featureId; } }; - struct EqualFn + + struct Equal { - bool operator()(TValue const & v1, TValue const & v2) const + bool operator()(Value const & v1, Value const & v2) const { - return (v1.m_featureId 
== v2.m_featureId); + return v1.m_featureId == v2.m_featureId; } }; - using TSet = unordered_set; + using Set = unordered_set; - TFilter const & m_filter; - unique_ptr m_prevSet; - unique_ptr m_set; + Filter const & m_filter; + unique_ptr m_prevSet; + unique_ptr m_set; public: - explicit OffsetIntersecter(TFilter const & filter) : m_filter(filter), m_set(new TSet) {} + explicit OffsetIntersecter(Filter const & filter) : m_filter(filter), m_set(make_unique()) {} - void operator()(TValue const & v) + void operator()(Value const & v) { if (m_prevSet && !m_prevSet->count(v)) return; @@ -227,7 +138,7 @@ public: void NextStep() { if (!m_prevSet) - m_prevSet.reset(new TSet); + m_prevSet = make_unique(); m_prevSet.swap(m_set); m_set->clear(); @@ -244,15 +155,16 @@ public: }; } // namespace impl -template +template struct TrieRootPrefix { - using TIterator = trie::Iterator>; - TIterator const & m_root; + using Iterator = trie::Iterator>; + + Iterator const & m_root; strings::UniChar const * m_prefix; size_t m_prefixSize; - TrieRootPrefix(TIterator const & root, typename TIterator::Edge::TEdgeLabel const & edge) + TrieRootPrefix(Iterator const & root, typename Iterator::Edge::TEdgeLabel const & edge) : m_root(root) { if (edge.size() == 1) @@ -268,102 +180,55 @@ struct TrieRootPrefix } }; -template +template class TrieValuesHolder { public: - TrieValuesHolder(TFilter const & filter) : m_filter(filter) {} + TrieValuesHolder(Filter const & filter) : m_filter(filter) {} - void Resize(size_t count) { m_holder.resize(count); } - - void SwitchTo(size_t index) - { - ASSERT_LESS(index, m_holder.size(), ()); - m_index = index; - } - - void operator()(TValue const & v) + void operator()(Value const & v) { if (m_filter(v.m_featureId)) - m_holder[m_index].push_back(v); + m_values.push_back(v); } template - void ForEachValue(size_t index, ToDo && toDo) const + void ForEachValue(ToDo && toDo) const { - ASSERT_LESS(index, m_holder.size(), ()); - for (auto const & value : m_holder[index]) + 
for (auto const & value : m_values) toDo(value); } private: - vector> m_holder; - size_t m_index; - TFilter const & m_filter; + vector m_values; + Filter const & m_filter; }; -// Calls toDo for each feature corresponding to at least one synonym. -// *NOTE* toDo may be called several times for the same feature. -template -void MatchTokenInTrie(QueryParams::TSynonymsVector const & syns, - TrieRootPrefix const & trieRoot, ToDo && toDo) +template +struct SearchTrieRequest { - for (auto const & syn : syns) - { - ASSERT(!syn.empty(), ()); - impl::FullMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, syn, toDo); - } + inline bool IsLangExist(int8_t lang) const { return m_langs.count(lang) != 0; } + + vector m_dfas; + unordered_set m_langs; +}; + +// Calls |toDo| for each feature accepted by at least one DFA. +// +// *NOTE* |toDo| may be called several times for the same feature. +template +void MatchInTrie(vector const & dfas, TrieRootPrefix const & trieRoot, ToDo && toDo) +{ + for (auto const & dfa : dfas) + impl::MatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, dfa.Begin(), toDo); } -// Calls toDo for each feature whose tokens contains at least one -// synonym as a prefix. -// *NOTE* toDo may be called serveral times for the same feature. -template -void MatchTokenPrefixInTrie(QueryParams::TSynonymsVector const & syns, - TrieRootPrefix const & trieRoot, ToDo && toDo) -{ - for (auto const & syn : syns) - { - ASSERT(!syn.empty(), ()); - impl::PrefixMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, syn, toDo); - } -} - -// Fills holder with features whose names correspond to tokens list up to synonyms. -// *NOTE* the same feature may be put in the same holder's slot several times. 
-template -void MatchTokensInTrie(vector const & tokens, - TrieRootPrefix const & trieRoot, THolder && holder) -{ - holder.Resize(tokens.size()); - for (size_t i = 0; i < tokens.size(); ++i) - { - holder.SwitchTo(i); - MatchTokenInTrie(tokens[i], trieRoot, holder); - } -} - -// Fills holder with features whose names correspond to tokens list up to synonyms, -// also, last holder's slot will be filled with features corresponding to prefixTokens. -// *NOTE* the same feature may be put in the same holder's slot several times. -template -void MatchTokensAndPrefixInTrie(vector const & tokens, - QueryParams::TSynonymsVector const & prefixTokens, - TrieRootPrefix const & trieRoot, THolder && holder) -{ - MatchTokensInTrie(tokens, trieRoot, holder); - - holder.Resize(tokens.size() + 1); - holder.SwitchTo(tokens.size()); - MatchTokenPrefixInTrie(prefixTokens, trieRoot, holder); -} - -// Fills holder with categories whose description matches to at least one -// token from a search query. -// *NOTE* query prefix will be treated as a complete token in the function. -template -bool MatchCategoriesInTrie(QueryParams const & params, - trie::Iterator> const & trieRoot, THolder && holder) +// Calls |toDo| for each feature in the categories branch matching |request|. +// +// *NOTE* |toDo| may be called several times for the same feature. 
+template +bool MatchCategoriesInTrie(SearchTrieRequest const & request, + trie::Iterator> const & trieRoot, ToDo && toDo) { uint32_t langIx = 0; if (!impl::FindLangIndex(trieRoot, search::kCategoriesLang, langIx)) @@ -373,82 +238,65 @@ bool MatchCategoriesInTrie(QueryParams const & params, ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ()); auto const catRoot = trieRoot.GoToEdge(langIx); - MatchTokensInTrie(params.m_tokens, TrieRootPrefix(*catRoot, edge), holder); + MatchInTrie(request.m_dfas, TrieRootPrefix(*catRoot, edge), toDo); - // Last token's prefix is used as a complete token here, to limit - // the number of features in the last bucket of a holder. Probably, - // this is a false optimization. - holder.Resize(params.m_tokens.size() + 1); - holder.SwitchTo(params.m_tokens.size()); - MatchTokenInTrie(params.m_prefixTokens, TrieRootPrefix(*catRoot, edge), holder); return true; } -// Calls toDo with trie root prefix and language code on each language -// allowed by params. -template -void ForEachLangPrefix(QueryParams const & params, - trie::Iterator> const & trieRoot, ToDo && toDo) +// Calls |toDo| with trie root prefix and language code on each +// language allowed by |request|. 
+template +void ForEachLangPrefix(SearchTrieRequest const & request, + trie::Iterator> const & trieRoot, ToDo && toDo) { ASSERT_LESS(trieRoot.m_edge.size(), numeric_limits::max(), ()); + uint32_t const numLangs = static_cast(trieRoot.m_edge.size()); for (uint32_t langIx = 0; langIx < numLangs; ++langIx) { auto const & edge = trieRoot.m_edge[langIx].m_label; ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ()); int8_t const lang = static_cast(edge[0]); - if (edge[0] < search::kCategoriesLang && params.IsLangExist(lang)) + if (edge[0] < search::kCategoriesLang && request.IsLangExist(lang)) { auto const langRoot = trieRoot.GoToEdge(langIx); - TrieRootPrefix langPrefix(*langRoot, edge); + TrieRootPrefix langPrefix(*langRoot, edge); toDo(langPrefix, lang); } } } -// Calls toDo for each feature whose description contains *ALL* tokens from a search query. -// Each feature will be passed to toDo only once. -template -void MatchFeaturesInTrie(QueryParams const & params, - trie::Iterator> const & trieRoot, TFilter const & filter, +// Calls |toDo| for each feature whose description matches +// |request|. Each feature will be passed to |toDo| only once. 
+template +void MatchFeaturesInTrie(SearchTrieRequest const & request, + trie::Iterator> const & trieRoot, Filter const & filter, ToDo && toDo) { - using TIterator = trie::Iterator>; + using Iterator = trie::Iterator>; - TrieValuesHolder categoriesHolder(filter); - bool const categoriesMatched = MatchCategoriesInTrie(params, trieRoot, categoriesHolder); + TrieValuesHolder categoriesHolder(filter); + bool const categoriesMatched = MatchCategoriesInTrie(request, trieRoot, categoriesHolder); - impl::OffsetIntersecter intersecter(filter); - for (size_t i = 0; i < params.m_tokens.size(); ++i) - { - ForEachLangPrefix(params, trieRoot, [&](TrieRootPrefix & langRoot, int8_t lang) - { - MatchTokenInTrie(params.m_tokens[i], langRoot, intersecter); - }); - if (categoriesMatched) - categoriesHolder.ForEachValue(i, intersecter); - intersecter.NextStep(); - } - - if (!params.m_prefixTokens.empty()) - { - ForEachLangPrefix(params, trieRoot, [&](TrieRootPrefix & langRoot, int8_t /* lang */) - { - MatchTokenPrefixInTrie(params.m_prefixTokens, langRoot, intersecter); - }); - if (categoriesMatched) - categoriesHolder.ForEachValue(params.m_tokens.size(), intersecter); - intersecter.NextStep(); - } + impl::OffsetIntersecter intersecter(filter); + ForEachLangPrefix(request, trieRoot, [&](TrieRootPrefix & langRoot, int8_t lang) + { + MatchInTrie(request.m_dfas, langRoot, intersecter); + }); + if (categoriesMatched) + categoriesHolder.ForEachValue(intersecter); + intersecter.NextStep(); intersecter.ForEachResult(forward(toDo)); } -template +template void MatchPostcodesInTrie(TokenSlice const & slice, - trie::Iterator> const & trieRoot, - TFilter const & filter, ToDo && toDo) + trie::Iterator> const & trieRoot, + Filter const & filter, ToDo && toDo) { + using namespace strings; + uint32_t langIx = 0; if (!impl::FindLangIndex(trieRoot, search::kPostcodesLang, langIx)) return; @@ -456,13 +304,25 @@ void MatchPostcodesInTrie(TokenSlice const & slice, auto const & edge = 
trieRoot.m_edge[langIx].m_label; auto const postcodesRoot = trieRoot.GoToEdge(langIx); - impl::OffsetIntersecter intersecter(filter); + impl::OffsetIntersecter intersecter(filter); for (size_t i = 0; i < slice.Size(); ++i) { if (slice.IsPrefix(i)) - MatchTokenPrefixInTrie(slice.Get(i), TrieRootPrefix(*postcodesRoot, edge), intersecter); + { + vector> dfas; + for (auto const & s : slice.Get(i)) + dfas.emplace_back(UniStringDFA(s)); + + MatchInTrie(dfas, TrieRootPrefix(*postcodesRoot, edge), intersecter); + } else - MatchTokenInTrie(slice.Get(i), TrieRootPrefix(*postcodesRoot, edge), intersecter); + { + vector dfas; + for (auto const & s : slice.Get(i)) + dfas.emplace_back(s); + MatchInTrie(dfas, TrieRootPrefix(*postcodesRoot, edge), intersecter); + } + intersecter.NextStep(); } diff --git a/search/retrieval.cpp b/search/retrieval.cpp index 2c25219706..77cee7c994 100644 --- a/search/retrieval.cpp +++ b/search/retrieval.cpp @@ -24,8 +24,12 @@ #include "coding/compressed_bit_vector.hpp" #include "coding/reader_wrapper.hpp" +#include "base/dfa_helpers.hpp" +#include "base/uni_string_dfa.hpp" + #include "std/algorithm.hpp" +using namespace strings; using osm::Editor; namespace search @@ -182,7 +186,7 @@ bool MatchFeatureByNameAndType(FeatureType const & ft, QueryParams const & param bool MatchFeatureByPostcode(FeatureType const & ft, TokenSlice const & slice) { string const postcode = ft.GetMetadata().Get(feature::Metadata::FMD_POSTCODE); - vector tokens; + vector tokens; NormalizeAndTokenizeString(postcode, tokens, Delimiters()); if (slice.Size() > tokens.size()) return false; @@ -226,12 +230,46 @@ unique_ptr RetrieveAddressFeaturesImpl( vector features; FeaturesCollector collector(cancellable, features); - WithSearchTrieRoot(context.m_value, [&](TrieRoot const & root) { - MatchFeaturesInTrie( - params, root, - [&holder](uint32_t featureIndex) { return !holder.ModifiedOrDeleted(featureIndex); }, - collector); - }); + // TODO (@y): this code highly depends on the fact 
that the function + // is called on a single-token query params. In any case, this code + // must be fixed ASAP, as this is the wrong place for DFA creation. + ASSERT_EQUAL(1, params.GetNumTokens(), ()); + + for (size_t i = 0; i < params.GetNumTokens(); ++i) + { + if (params.IsPrefixToken(i)) + { + using DFA = PrefixDFAModifier; + + SearchTrieRequest request; + for (auto const & sym : params.GetTokens(i)) + request.m_dfas.emplace_back(UniStringDFA(sym)); + request.m_langs = params.m_langs; + + WithSearchTrieRoot(context.m_value, [&](TrieRoot const & root) { + MatchFeaturesInTrie( + request, root, + [&holder](uint32_t featureIndex) { return !holder.ModifiedOrDeleted(featureIndex); }, + collector); + }); + } + else + { + using DFA = UniStringDFA; + + SearchTrieRequest request; + for (auto const & sym : params.GetTokens(i)) + request.m_dfas.emplace_back(sym); + request.m_langs = params.m_langs; + + WithSearchTrieRoot(context.m_value, [&](TrieRoot const & root) { + MatchFeaturesInTrie( + request, root, + [&holder](uint32_t featureIndex) { return !holder.ModifiedOrDeleted(featureIndex); }, + collector); + }); + } + } holder.ForEachModifiedOrCreated([&](FeatureType & ft, uint64_t index) { if (MatchFeatureByNameAndType(ft, params)) features.push_back(index);