diff --git a/indexer/indexer.pro b/indexer/indexer.pro
index bd25339f6e..ee3e3624ef 100644
--- a/indexer/indexer.pro
+++ b/indexer/indexer.pro
@@ -56,6 +56,7 @@ SOURCES += \
     scales.cpp \
     search_delimiters.cpp \ # it's in indexer because of CategoriesHolder dependency.
     search_string_utils.cpp \ # it's in indexer because of CategoriesHolder dependency.
+    string_slice.cpp \
     types_mapping.cpp \
 
 HEADERS += \
@@ -114,6 +115,7 @@ HEADERS += \
     scales.hpp \
     search_delimiters.hpp \ # it's in indexer because of CategoriesHolder dependency.
     search_string_utils.hpp \ # it's in indexer because of CategoriesHolder dependency.
+    string_set.hpp \
     string_slice.hpp \
     succinct_trie_builder.hpp \
     succinct_trie_reader.hpp \
diff --git a/indexer/indexer_tests/indexer_tests.pro b/indexer/indexer_tests/indexer_tests.pro
index f80aa0a440..0528baa426 100644
--- a/indexer/indexer_tests/indexer_tests.pro
+++ b/indexer/indexer_tests/indexer_tests.pro
@@ -39,6 +39,7 @@ SOURCES += \
     scales_test.cpp \
     search_string_utils_test.cpp \
     sort_and_merge_intervals_test.cpp \
+    string_slice_tests.cpp \
     succinct_trie_test.cpp \
     test_polylines.cpp \
     test_type.cpp \
diff --git a/indexer/indexer_tests/string_slice_tests.cpp b/indexer/indexer_tests/string_slice_tests.cpp
new file mode 100644
index 0000000000..7ddb1c7818
--- /dev/null
+++ b/indexer/indexer_tests/string_slice_tests.cpp
@@ -0,0 +1,77 @@
+#include "../../testing/testing.hpp"
+
+#include "indexer/string_slice.hpp"
+
+#include "base/string_utils.hpp"
+
+using namespace search;
+using namespace strings;
+
+namespace
+{
+// Joins |v| through JoinIterator and collects the produced characters.
+UniString ToString(vector const & v)
+{
+  NoPrefixStringSlice slice(v);
+  UniString r;
+  copy(JoinIterator::Begin(slice), JoinIterator::End(slice), back_inserter(r));
+  return r;
+}
+
+UNIT_TEST(JoinIterator_Smoke)
+{
+  {
+    vector v;
+
+    NoPrefixStringSlice slice1(v);
+    auto begin1 = JoinIterator::Begin(slice1);
+    auto end1 = JoinIterator::End(slice1);
+
+    NoPrefixStringSlice slice2(v);
+    auto begin2 = JoinIterator::Begin(slice2);
+    auto end2 = JoinIterator::End(slice2);
+
+    TEST(begin1 == end1, ());
+
+    TEST(begin2 == end2, ());
+
+    // Iterators compare the address of their slice, so iterators over
+    // |slice1| and |slice2| must differ even though both are empty.
+    TEST(begin1 != begin2, ());
+    TEST(end1 != end2, ());
+
+    TEST(begin1 != end2, ());
+    TEST(begin2 != end1, ());
+  }
+
+  {
+    vector const v;
+    TEST_EQUAL(MakeUniString(""), ToString(v), ());
+  }
+
+  {
+    vector const v = {MakeUniString("")};
+    TEST_EQUAL(MakeUniString(""), ToString(v), ());
+  }
+
+  {
+    vector const v = {MakeUniString(""), MakeUniString("")};
+    TEST_EQUAL(MakeUniString(" "), ToString(v), ());
+  }
+
+  {
+    vector const v = {MakeUniString(""), MakeUniString("b"), MakeUniString("")};
+    TEST_EQUAL(MakeUniString(" b "), ToString(v), ());
+  }
+
+  {
+    vector const v = {MakeUniString("Hello")};
+    TEST_EQUAL(MakeUniString("Hello"), ToString(v), ());
+  }
+
+  {
+    vector const v = {MakeUniString("Hello"), MakeUniString("World!")};
+    TEST_EQUAL(MakeUniString("Hello World!"), ToString(v), ());
+  }
+}
+}  // namespace
diff --git a/indexer/postcodes_matcher.cpp b/indexer/postcodes_matcher.cpp
index 143b462216..941d272e30 100644
--- a/indexer/postcodes_matcher.cpp
+++ b/indexer/postcodes_matcher.cpp
@@ -1,6 +1,7 @@
 #include "indexer/postcodes_matcher.hpp"
 #include "indexer/search_delimiters.hpp"
 #include "indexer/search_string_utils.hpp"
+#include "indexer/string_set.hpp"
 
 #include "base/logging.hpp"
 #include "base/macros.hpp"
@@ -37,93 +38,49 @@ UniChar SimplifyChar(UniChar const & c)
   return c;
 }
 
-struct Node
-{
-  Node() : m_isLeaf(false) {}
-
-  Node const * Move(UniChar c) const
-  {
-    for (auto const & p : m_moves)
-    {
-      if (p.first == c)
-        return p.second.get();
-    }
-    return nullptr;
-  }
-
-  template
-  Node const * Move(TIt begin, TIt end) const
-  {
-    Node const * cur = this;
-    for (; begin != end && cur; ++begin)
-      cur = cur->Move(*begin);
-    return cur;
-  }
-
-  Node & MakeMove(UniChar c)
-  {
-    for (auto const & p : m_moves)
-    {
-      if (p.first == c)
-        return *p.second;
-    }
-    m_moves.emplace_back(c, make_unique());
-    return *m_moves.back().second;
-  }
-
-  template
-  Node & MakeMove(TIt begin, TIt end)
-  {
-    Node * cur = this;
-    for (; begin != end; ++begin)
-      cur = &cur->MakeMove(*begin);
-    return *cur;
-  }
-
-  buffer_vector>, 2> m_moves;
-  bool m_isLeaf;
-
-  DISALLOW_COPY(Node);
-};
-
 // This class puts all strings from g_patterns to a trie with a low
 // branching factor and matches queries against these patterns.
 class PostcodesMatcher
 {
 public:
-  PostcodesMatcher() : m_root(), m_maxNumTokensInPostcode(0)
+  using TStringSet = StringSet;
+
+  PostcodesMatcher() : m_maxNumTokensInPostcode(0)
   {
     search::Delimiters delimiters;
     for (auto const * pattern : g_patterns)
       AddString(MakeUniString(pattern), delimiters);
   }
 
   // Checks that given tokens match to at least one of postcodes
   // patterns.
   //
   // Complexity: O(total length of tokens in |slice|).
+  //
+  // The tokens of |slice| are joined with single spaces and every
+  // character is passed through SimplifyChar() before the lookup. A
+  // full match is always accepted; a string that only leads to a
+  // non-final trie node is accepted only when |isPrefix| is true.
+  //
+  // NOTE(review): for an empty |slice| the lookup stops at the trie
+  // root, so a non-leaf root now yields |isPrefix|, whereas the
+  // previous code returned m_root.m_isLeaf regardless of |isPrefix|.
+  // Confirm that this change is intended.
   bool HasString(StringSliceBase const & slice, bool isPrefix) const
   {
-    if (slice.Size() == 0)
-      return m_root.m_isLeaf;
-
-    Node const * cur = &m_root;
-    for (size_t i = 0; i < slice.Size() && cur; ++i)
+    auto status = m_strings.Has(make_transform_iterator(JoinIterator::Begin(slice), &SimplifyChar),
+                                make_transform_iterator(JoinIterator::End(slice), &SimplifyChar));
+    switch (status)
     {
-      auto const & s = slice.Get(i);
-      cur = cur->Move(make_transform_iterator(s.begin(), &SimplifyChar),
-                      make_transform_iterator(s.end(), &SimplifyChar));
-      if (cur && i + 1 < slice.Size())
-        cur = cur->Move(' ');
+    case TStringSet::Status::Absent: return false;
+    case TStringSet::Status::Prefix: return isPrefix;
+    case TStringSet::Status::Full: return true;
     }
-
-    if (!cur)
-      return false;
-
-    if (isPrefix)
-      return true;
-
-    return cur->m_isLeaf;
+
+    // Every enumerator returns above. This statement keeps the function
+    // from flowing off its end if |status| ever holds an unexpected
+    // value, and silences "control reaches end of non-void function".
+    return false;
   }
 
   inline size_t GetMaxNumTokensInPostcode() const { return m_maxNumTokensInPostcode; }
@@ -133,20 +90,13 @@ private:
   {
     vector tokens;
     SplitUniString(s, MakeBackInsertFunctor(tokens), delimiters);
-    m_maxNumTokensInPostcode = max(m_maxNumTokensInPostcode, tokens.size());
+    NoPrefixStringSlice slice(tokens);
 
-    Node * cur = &m_root;
-    for (size_t i = 0; i < tokens.size(); ++i)
-    {
-      cur = &cur->MakeMove(tokens[i].begin(), tokens[i].end());
-      if (i + 1 != tokens.size())
-        cur = &cur->MakeMove(' ');
-    }
-    cur->m_isLeaf = true;
+    m_maxNumTokensInPostcode = max(m_maxNumTokensInPostcode, tokens.size());
+    m_strings.Add(JoinIterator::Begin(slice), JoinIterator::End(slice));
   }
 
-  Node m_root;
-
+  TStringSet m_strings;
   size_t m_maxNumTokensInPostcode;
 
   DISALLOW_COPY(PostcodesMatcher);
diff --git a/indexer/string_set.hpp b/indexer/string_set.hpp
new file mode 100644
index 0000000000..dde65e5daf
--- /dev/null
+++ b/indexer/string_set.hpp
@@ -0,0 +1,104 @@
+#pragma once
+
+#include "base/buffer_vector.hpp"
+#include "base/macros.hpp"
+
+#include "std/cstdint.hpp"
+#include "std/unique_ptr.hpp"
+
+namespace search
+{
+// A set of sequences of |TChar| stored as a trie. Every node keeps its
+// outgoing edges in a buffer_vector parameterized by |OutDegree| and
+// looks an edge up with a linear scan.
+template
+class StringSet
+{
+public:
+  enum class Status
+  {
+    Absent,  // The trie has no path for the queried sequence.
+    Prefix,  // The queried sequence ends at a node that is not marked by Add().
+    Full,    // The queried sequence ends at a node marked by Add().
+  };
+
+  StringSet() = default;
+
+  // Adds the sequence [begin, end) to the set.
+  template
+  void Add(TIt begin, TIt end)
+  {
+    auto & cur = m_root.MakeMove(begin, end);
+    cur.m_isLeaf = true;
+  }
+
+  // Looks the sequence [begin, end) up; see Status for the outcomes.
+  template
+  Status Has(TIt begin, TIt end) const
+  {
+    auto const * cur = m_root.Move(begin, end);
+    if (!cur)
+      return Status::Absent;
+
+    return cur->m_isLeaf ? Status::Full : Status::Prefix;
+  }
+
+private:
+  struct Node
+  {
+    Node() : m_isLeaf(false) {}
+
+    // Returns the child reached by |c|, or nullptr when there is none.
+    Node const * Move(TChar c) const
+    {
+      for (auto const & p : m_moves)
+      {
+        if (p.first == c)
+          return p.second.get();
+      }
+      return nullptr;
+    }
+
+    // Follows [begin, end) from this node; yields nullptr once an edge is missing.
+    template
+    Node const * Move(TIt begin, TIt end) const
+    {
+      Node const * cur = this;
+      for (; begin != end && cur; ++begin)
+        cur = cur->Move(*begin);
+      return cur;
+    }
+
+    // Returns the child reached by |c|, creating it when it does not exist yet.
+    Node & MakeMove(TChar c)
+    {
+      for (auto const & p : m_moves)
+      {
+        if (p.first == c)
+          return *p.second;
+      }
+      m_moves.emplace_back(c, make_unique());
+      return *m_moves.back().second;
+    }
+
+    // Follows [begin, end) from this node, creating missing nodes, and returns the node reached.
+    template
+    Node & MakeMove(TIt begin, TIt end)
+    {
+      Node * cur = this;
+      for (; begin != end; ++begin)
+        cur = &cur->MakeMove(*begin);
+      return *cur;
+    }
+
+    buffer_vector>, OutDegree> m_moves;
+    bool m_isLeaf;  // True when a sequence passed to Add() ends at this node.
+
+    DISALLOW_COPY(Node);
+  };
+
+  Node m_root;
+
+  DISALLOW_COPY(StringSet);
+};
+}  // namespace search
diff --git a/indexer/string_slice.cpp b/indexer/string_slice.cpp
new file mode 100644
index 0000000000..ed4a881e8e
--- /dev/null
+++ b/indexer/string_slice.cpp
@@ -0,0 +1,78 @@
+#include "indexer/string_slice.hpp"
+
+namespace search
+{
+// The joined sequence is viewed as GetMaxSize() parts: part 2 * i is the
+// i-th string of the slice and every odd part is a one-character
+// separator (a space). |m_string| is the index of the current part and
+// |m_offset| is the position inside it.
+JoinIterator::JoinIterator(StringSliceBase const & slice, Position position) : m_slice(slice)
+{
+  if (position == Position::Begin)
+  {
+    m_string = 0;
+    m_offset = 0;
+    Normalize();
+  }
+  else
+  {
+    m_string = GetMaxSize();
+    m_offset = 0;
+  }
+}
+
+// static
+JoinIterator JoinIterator::Begin(StringSliceBase const & slice)
+{
+  return JoinIterator(slice, Position::Begin);
+}
+
+// static
+JoinIterator JoinIterator::End(StringSliceBase const & slice)
+{
+  return JoinIterator(slice, Position::End);
+}
+
+JoinIterator & JoinIterator::operator++()
+{
+  ++m_offset;
+  Normalize();
+  return *this;
+}
+
+// Skips exhausted parts, empty strings included, until the iterator
+// points at a character or reaches the end position.
+void JoinIterator::Normalize()
+{
+  while (m_string != GetMaxSize() && m_offset >= GetSize(m_string))
+  {
+    ++m_string;
+    m_offset = 0;
+  }
+}
+
+// Returns the number of characters in part |string|; 0 when |string| is out of range.
+size_t JoinIterator::GetSize(size_t string) const
+{
+  if (string >= GetMaxSize())
+    return 0;
+  if (string & 1)
+    return 1;
+  return m_slice.Get(string >> 1).size();
+}
+
+// Returns the character at |offset| of part |string|; 0 when |string| is out of range.
+strings::UniChar JoinIterator::GetChar(size_t string, size_t offset) const
+{
+  if (string >= GetMaxSize())
+    return 0;
+  if (string & 1)
+  {
+    ASSERT_EQUAL(offset, 0, ());
+    return ' ';
+  }
+  auto const & s = m_slice.Get(string >> 1);
+  ASSERT_LESS(offset, s.size(), ());
+  return s[offset];
+}
+}  // namespace search
diff --git a/indexer/string_slice.hpp b/indexer/string_slice.hpp
index 510cdbe8cb..38d2a1230c 100644
--- a/indexer/string_slice.hpp
+++ b/indexer/string_slice.hpp
@@ -22,10 +22,7 @@ public:
 class NoPrefixStringSlice : public StringSliceBase
 {
 public:
-  NoPrefixStringSlice(vector const & strings)
-    : m_strings(strings)
-  {
-  }
+  NoPrefixStringSlice(vector const & strings) : m_strings(strings) {}
 
   virtual TString const & Get(size_t i) const override { return m_strings[i]; }
   virtual size_t Size() const override { return m_strings.size(); }
@@ -33,4 +30,50 @@ public:
 private:
   vector const & m_strings;
 };
+
+// Iterates over the characters of the strings in a StringSliceBase, joined by single spaces.
+class JoinIterator
+{
+public:
+  using difference_type = ptrdiff_t;
+  using value_type = strings::UniChar;
+  using pointer = strings::UniChar *;
+  using reference = strings::UniChar &;  // NOTE(review): operator*() below returns by value, not a reference.
+  using iterator_category = std::forward_iterator_tag;
+
+  static JoinIterator Begin(StringSliceBase const & slice);  // Iterator at the start of |slice|.
+  static JoinIterator End(StringSliceBase const & slice);  // Past-the-end iterator for |slice|.
+
+  inline bool operator==(JoinIterator const & rhs) const  // Same slice object and same position.
+  {
+    return &m_slice == &rhs.m_slice && m_string == rhs.m_string && m_offset == rhs.m_offset;
+  }
+
+  inline bool operator!=(JoinIterator const & rhs) const { return !(*this == rhs); }
+
+  inline strings::UniChar operator*() const { return GetChar(m_string, m_offset); }  // Current character, by value.
+
+  JoinIterator & operator++();  // Pre-increment; NOTE(review): no post-increment is declared.
+
+private:
+  enum class Position  // Selects which end of the sequence the constructor positions at.
+  {
+    Begin,
+    End
+  };
+
+  JoinIterator(StringSliceBase const & slice, Position position);  // Private; Begin() and End() are the public factories.
+
+  void Normalize();
+
+  size_t GetSize(size_t string) const;
+
+  inline size_t GetMaxSize() const { return m_slice.Size() == 0 ? 0 : m_slice.Size() * 2 - 1; }  // Strings plus the separators between them; 0 for an empty slice.
+
+  strings::UniChar GetChar(size_t string, size_t offset) const;
+
+  StringSliceBase const & m_slice;  // Held by reference, not copied.
+  size_t m_string;  // Index of the current part, in [0, GetMaxSize()].
+  size_t m_offset;  // Position inside the current part.
+};
 }  // namespace search