diff --git a/search/base/text_index/mem.hpp b/search/base/text_index/mem.hpp index 367becbe14..c3887e215e 100644 --- a/search/base/text_index/mem.hpp +++ b/search/base/text_index/mem.hpp @@ -8,7 +8,6 @@ #include "coding/reader.hpp" #include "coding/varint.hpp" -#include "coding/write_to_sink.hpp" #include "base/assert.hpp" #include "base/string_utils.hpp" @@ -88,30 +87,31 @@ private: class MemPostingsFetcher : public PostingsFetcher { public: - MemPostingsFetcher(std::map> const & postingsByToken) + explicit MemPostingsFetcher(std::map> const & postingsByToken) + : m_postingsByToken(postingsByToken), m_it(m_postingsByToken.begin()) { - // todo(@m) An unnecessary copy? - m_postings.reserve(postingsByToken.size()); - for (auto const & entry : postingsByToken) - m_postings.emplace_back(entry.second); } // PostingsFetcher overrides: - bool GetPostingsForNextToken(std::vector & postings) + bool IsValid() const override { return m_it != m_postingsByToken.end(); } + + void Advance() override { - CHECK_LESS_OR_EQUAL(m_tokenId, m_postings.size(), ()); - if (m_tokenId == m_postings.size()) - return false; - postings.swap(m_postings[m_tokenId++]); - return true; + if (m_it != m_postingsByToken.end()) + ++m_it; + } + + void ForEachPosting(Fn const & fn) const override + { + CHECK(IsValid(), ()); + for (uint32_t p : m_it->second) + fn(p); } private: - std::vector> m_postings; - // Index of the next token to be processed. The - // copy of the postings list in |m_postings| is not guaranteed - // to be valid after it's been processed. - size_t m_tokenId = 0; + std::map> const & m_postingsByToken; + // Iterator to the current token that will be processed when ForEachPosting is called. + std::map>::const_iterator m_it; }; void SortPostings(); diff --git a/search/base/text_index/merger.cpp b/search/base/text_index/merger.cpp index 7ea8425475..50f8af1d9a 100644 --- a/search/base/text_index/merger.cpp +++ b/search/base/text_index/merger.cpp @@ -14,6 +14,8 @@ #include "base/stl_helpers.hpp" #include +#include +#include #include #include @@ -30,45 +32,67 @@ public: TextIndexReader const & index2) : m_dict(dict), m_index1(index1), m_index2(index2) { + ReadPostings(); } // PostingsFetcher overrides: - bool GetPostingsForNextToken(std::vector & postings) + bool IsValid() const override { - postings.clear(); + auto const & tokens = m_dict.GetTokens(); + CHECK_LESS_OR_EQUAL(m_tokenId, tokens.size(), ()); + return m_tokenId < tokens.size(); + } + void Advance() override + { auto const & tokens = m_dict.GetTokens(); CHECK_LESS_OR_EQUAL(m_tokenId, tokens.size(), ()); if (m_tokenId == tokens.size()) - return false; + return; - m_index1.ForEachPosting(tokens[m_tokenId], MakeBackInsertFunctor(postings)); - m_index2.ForEachPosting(tokens[m_tokenId], MakeBackInsertFunctor(postings)); - my::SortUnique(postings); ++m_tokenId; - return true; + ReadPostings(); + } + + void ForEachPosting(Fn const & fn) const override + { + CHECK(IsValid(), ()); + for (uint32_t p : m_postings) + fn(p); } private: + // Reads postings for the current token. + void ReadPostings() + { + m_postings.clear(); + if (!IsValid()) + return; + + auto const & tokens = m_dict.GetTokens(); + m_index1.ForEachPosting(tokens[m_tokenId], MakeBackInsertFunctor(m_postings)); + m_index2.ForEachPosting(tokens[m_tokenId], MakeBackInsertFunctor(m_postings)); + my::SortUnique(m_postings); + } + TextIndexDictionary const & m_dict; TextIndexReader const & m_index1; TextIndexReader const & m_index2; // Index of the next token from |m_dict| to be processed. size_t m_tokenId = 0; + vector m_postings; }; TextIndexDictionary MergeDictionaries(TextIndexDictionary const & dict1, TextIndexDictionary const & dict2) { - vector commonTokens = dict1.GetTokens(); - for (auto const & token : dict2.GetTokens()) - { - size_t dummy; - if (!dict1.GetTokenId(token, dummy)) - commonTokens.emplace_back(token); - } + vector commonTokens; + auto const & ts1 = dict1.GetTokens(); + auto const & ts2 = dict2.GetTokens(); + merge(ts1.begin(), ts1.end(), ts2.begin(), ts2.end(), back_inserter(commonTokens)); + ASSERT(is_sorted(commonTokens.begin(), commonTokens.end()), ()); + commonTokens.erase(unique(commonTokens.begin(), commonTokens.end()), commonTokens.end()); - sort(commonTokens.begin(), commonTokens.end()); TextIndexDictionary dict; dict.SetTokens(move(commonTokens)); return dict; diff --git a/search/base/text_index/postings.hpp b/search/base/text_index/postings.hpp index ccfc32ab25..81a790beda 100644 --- a/search/base/text_index/postings.hpp +++ b/search/base/text_index/postings.hpp @@ -5,7 +5,10 @@ #include "search/base/text_index/utils.hpp" #include "coding/varint.hpp" +#include "coding/write_to_sink.hpp" +#include +#include #include namespace search @@ -20,11 +23,21 @@ struct TextIndexHeader; class PostingsFetcher { public: - // Returns true and fills |postings| with the postings list of the next token - // when there is one. - // Returns false if the underlying source is exhausted, i.e. there are - // no more tokens left. - virtual bool GetPostingsForNextToken(std::vector & postings) = 0; + using Fn = std::function; + + virtual ~PostingsFetcher() = default; + + // Returns true when there are tokens left in the fetcher and false otherwise. + virtual bool IsValid() const = 0; + + // Advances fetcher to the next token. + virtual void Advance() = 0; + + // Calls |fn| for every posting for the current token. Initially, + // current token is the first token and then calls to Advance + // may be used to process the next token until the underlying + // source of the tokens is exhausted and the fetcher is no longer valid. + virtual void ForEachPosting(Fn const & fn) const = 0; }; // Fetches the postings list one by one from |fetcher| and writes them @@ -44,20 +57,21 @@ void WritePostings(Sink & sink, uint64_t startPos, TextIndexHeader & header, std::vector postingsStarts; postingsStarts.reserve(header.m_numTokens); - - // todo(@m) s/uint32_t/Posting/ ? - std::vector postings; - while (fetcher.GetPostingsForNextToken(postings)) { - postingsStarts.emplace_back(RelativePos(sink, startPos)); - - uint32_t last = 0; - for (auto const p : postings) - { + uint32_t last; + // todo(@m) s/uint32_t/Posting/ ? + auto writePostings = [&](uint32_t p) { CHECK(last == 0 || last < p, (last, p)); uint32_t const delta = p - last; WriteVarUint(sink, delta); last = p; + }; + while (fetcher.IsValid()) + { + postingsStarts.emplace_back(RelativePos(sink, startPos)); + last = 0; + fetcher.ForEachPosting(writePostings); + fetcher.Advance(); } } // One more for convenience. diff --git a/search/base/text_index/reader.hpp b/search/base/text_index/reader.hpp index 772aacd77e..5d9be4aaf7 100644 --- a/search/base/text_index/reader.hpp +++ b/search/base/text_index/reader.hpp @@ -1,5 +1,6 @@ #pragma once +#include "search/base/text_index/dictionary.hpp" #include "search/base/text_index/text_index.hpp" #include "coding/file_reader.hpp" diff --git a/search/base/text_index/utils.hpp b/search/base/text_index/utils.hpp index fe896c6c37..0f6067fba5 100644 --- a/search/base/text_index/utils.hpp +++ b/search/base/text_index/utils.hpp @@ -9,7 +9,7 @@ namespace search namespace base { template -static uint32_t RelativePos(Sink & sink, uint64_t startPos) +uint32_t RelativePos(Sink & sink, uint64_t startPos) { return ::base::checked_cast(sink.Pos() - startPos); } diff --git a/xcode/search/search.xcodeproj/project.pbxproj b/xcode/search/search.xcodeproj/project.pbxproj index 5761092b24..ce60485e3c 100644 --- a/xcode/search/search.xcodeproj/project.pbxproj +++ b/xcode/search/search.xcodeproj/project.pbxproj @@ -100,6 +100,10 @@ 3974BB921FB4723000F265E5 /* testingmain.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 671C62241AE9229A00076BD0 /* testingmain.cpp */; }; 397AFE061D6C9AC700F583E7 /* downloader_search_callback.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 397AFE041D6C9AC700F583E7 /* downloader_search_callback.cpp */; }; 397AFE071D6C9AC700F583E7 /* downloader_search_callback.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 397AFE051D6C9AC700F583E7 /* downloader_search_callback.hpp */; }; + 39831A3820F3C5C6005FF294 /* merger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 39831A3420F3C5C6005FF294 /* merger.cpp */; }; + 39831A3920F3C5C6005FF294 /* utils.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39831A3520F3C5C6005FF294 /* utils.hpp */; }; + 39831A3A20F3C5C6005FF294 /* merger.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39831A3620F3C5C6005FF294 /* merger.hpp */; }; + 39831A3B20F3C5C6005FF294 /* postings.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39831A3720F3C5C6005FF294 /* postings.hpp */; }; 39AEF8361FB4597300943FC9 /* tracer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 39AEF8351FB4597300943FC9 /* tracer.cpp */; }; 39AEF8381FB4597F00943FC9 /* point_rect_matcher.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39AEF8371FB4597E00943FC9 /* point_rect_matcher.hpp */; }; 39AEF83A1FB4598900943FC9 /* tracer.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39AEF8391FB4598900943FC9 /* tracer.hpp */; }; @@ -379,6 +383,10 @@ 3936A60C20EA2F5F00A68C09 /* mem.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = mem.hpp; path = ../text_index/mem.hpp; sourceTree = ""; }; 397AFE041D6C9AC700F583E7 /* downloader_search_callback.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = downloader_search_callback.cpp; sourceTree = ""; }; 397AFE051D6C9AC700F583E7 /* downloader_search_callback.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = downloader_search_callback.hpp; sourceTree = ""; }; + 39831A3420F3C5C6005FF294 /* merger.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = merger.cpp; sourceTree = ""; }; + 39831A3520F3C5C6005FF294 /* utils.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = utils.hpp; sourceTree = ""; }; + 39831A3620F3C5C6005FF294 /* merger.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = merger.hpp; sourceTree = ""; }; + 39831A3720F3C5C6005FF294 /* postings.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = postings.hpp; sourceTree = ""; }; 39AEF8351FB4597300943FC9 /* tracer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tracer.cpp; sourceTree = ""; }; 39AEF8371FB4597E00943FC9 /* point_rect_matcher.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = point_rect_matcher.hpp; sourceTree = ""; }; 39AEF8391FB4598900943FC9 /* tracer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = tracer.hpp; sourceTree = ""; }; @@ -673,6 +681,10 @@ 3936A60420EA2F3E00A68C09 /* text_index */ = { isa = PBXGroup; children = ( + 39831A3420F3C5C6005FF294 /* merger.cpp */, + 39831A3620F3C5C6005FF294 /* merger.hpp */, + 39831A3720F3C5C6005FF294 /* postings.hpp */, + 39831A3520F3C5C6005FF294 /* utils.hpp */, 3936A60920EA2F5E00A68C09 /* dictionary.hpp */, 3936A60520EA2F5E00A68C09 /* header.cpp */, 3936A60620EA2F5E00A68C09 /* header.hpp */, @@ -999,11 +1011,14 @@ 3461C9A21D79949600E6E6F5 /* categories_set.hpp in Headers */, 397AFE071D6C9AC700F583E7 /* downloader_search_callback.hpp in Headers */, 3936A60F20EA2F5F00A68C09 /* text_index.hpp in Headers */, + 39831A3A20F3C5C6005FF294 /* merger.hpp in Headers */, 675346EE1A40560D00A0A8C3 /* locality_finder.hpp in Headers */, 675346EC1A40560D00A0A8C3 /* latlon_match.hpp in Headers */, A1347D521B8758C3009050FF /* query_saver.hpp in Headers */, 3459A7A81E4C4D0200ED235F /* geocoder_locality.hpp in Headers */, 675346EA1A40560D00A0A8C3 /* keyword_matcher.hpp in Headers */, + 39831A3B20F3C5C6005FF294 /* postings.hpp in Headers */, + 39831A3920F3C5C6005FF294 /* utils.hpp in Headers */, 347F33211C4540A8009758CC /* projection_on_street.hpp in Headers */, F652D8EF1CFDE21900FC29A0 /* features_layer.hpp in Headers */, 0831F255200E56110034C365 /* mem_search_index.hpp in Headers */, @@ -1225,6 +1240,7 @@ 39B2B9491FB4620200AB85A1 /* pre_ranker_test.cpp in Sources */, 3461C9A31D79949600E6E6F5 /* editor_delegate.cpp in Sources */, 39BBC13B1F9FD65C009D1687 /* highlighting.cpp in Sources */, + 39831A3820F3C5C6005FF294 /* merger.cpp in Sources */, F652D8BF1CFDE1E800FC29A0 /* engine.cpp in Sources */, 675346DD1A40560D00A0A8C3 /* approximate_string_match.cpp in Sources */, 34586B8B1DCB1E8300CF7FC9 /* interval_set_test.cpp in Sources */,