From 1039487357fbd54469735f8bb6255c46600f1131 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Fri, 29 Jun 2018 16:13:17 +0300 Subject: [PATCH] [search] The reader part of the text index. --- search/base/text_index.hpp | 262 ++++++++++++++++------- search/search_tests/CMakeLists.txt | 1 + search/search_tests/text_index_tests.cpp | 59 +++-- 3 files changed, 226 insertions(+), 96 deletions(-) diff --git a/search/base/text_index.hpp b/search/base/text_index.hpp index 6fcbc6b4fb..a500b147e2 100644 --- a/search/base/text_index.hpp +++ b/search/base/text_index.hpp @@ -1,5 +1,6 @@ #pragma once +#include "coding/file_reader.hpp" #include "coding/reader.hpp" #include "coding/varint.hpp" #include "coding/write_to_sink.hpp" @@ -94,6 +95,101 @@ struct TextIndexHeader uint32_t m_postingsListsOffset = 0; }; +// The dictionary contains all tokens that are present +// in the text index. +template +class TextIndexDictionary +{ +public: + bool GetTokenId(Token const & token, uint32_t & id) const + { + auto const it = std::lower_bound(m_tokens.cbegin(), m_tokens.cend(), token); + if (it == m_tokens.cend() || *it != token) + return false; + id = static_cast(std::distance(m_tokens.cbegin(), it)); + return true; + } + + void SetTokens(std::vector && tokens) { m_tokens = std::move(tokens); } + std::vector const & GetTokens() const { return m_tokens; } + + template + void Serialize(Sink & sink, TextIndexHeader & header, uint64_t startPos) const + { + header.m_numTokens = ::base::checked_cast(m_tokens.size()); + + header.m_dictPositionsOffset = RelativePos(sink, startPos); + // An uint32_t for each 32-bit offset and an uint32_t for the dummy entry at the end. + WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1)); + header.m_dictWordsOffset = RelativePos(sink, startPos); + + std::vector offsets; + offsets.reserve(header.m_numTokens + 1); + for (auto const & token : m_tokens) + { + offsets.emplace_back(RelativePos(sink, startPos)); + SerializeToken(sink, token); + } + offsets.emplace_back(RelativePos(sink, startPos)); + + { + uint64_t const savedPos = sink.Pos(); + sink.Seek(startPos + header.m_dictPositionsOffset); + + for (uint32_t const o : offsets) + WriteToSink(sink, o); + + CHECK_EQUAL(sink.Pos(), startPos + header.m_dictWordsOffset, ()); + sink.Seek(savedPos); + } + } + + template + void Deserialize(Source & source, TextIndexHeader header) + { + auto const startPos = source.Pos(); + + std::vector tokenOffsets(header.m_numTokens + 1); + for (uint32_t & offset : tokenOffsets) + offset = ReadPrimitiveFromSource(source); + + uint64_t const expectedSize = header.m_dictWordsOffset - header.m_dictPositionsOffset; + CHECK_EQUAL(source.Pos(), startPos + expectedSize, ()); + m_tokens.resize(header.m_numTokens); + for (size_t i = 0; i < m_tokens.size(); ++i) + { + size_t const size = ::base::checked_cast(tokenOffsets[i + 1] - tokenOffsets[i]); + DeserializeToken(source, m_tokens[i], size); + } + } + +private: + template + static void SerializeToken(Sink & sink, Token const & token) + { + CHECK(!token.empty(), ()); + // todo(@m) Endianness. + sink.Write(token.data(), token.size() * sizeof(typename Token::value_type)); + } + + template + static void DeserializeToken(Source & source, Token & token, size_t size) + { + CHECK_GREATER(size, 0, ()); + ASSERT_EQUAL(size % sizeof(typename Token::value_type), 0, ()); + token.resize(size / sizeof(typename Token::value_type)); + source.Read(&token[0], size); + } + + template + static uint32_t RelativePos(Sink & sink, uint64_t startPos) + { + return ::base::checked_cast(sink.Pos() - startPos); + } + + std::vector m_tokens; +}; + template class MemTextIndex { @@ -105,22 +201,23 @@ public: m_postingsByToken[token].emplace_back(posting); } - // Executes |f| on every posting associated with |token|. + // Executes |fn| on every posting associated with |token|. // The order of postings is not specified. - template - void ForEachPosting(Token const & token, F && f) const + template + void ForEachPosting(Token const & token, Fn && fn) const { auto const it = m_postingsByToken.find(token); if (it == m_postingsByToken.end()) return; for (auto const p : it->second) - f(p); + fn(p); } template void Serialize(Sink & sink) { SortPostings(); + BuildDictionary(); TextIndexHeader header; @@ -128,7 +225,6 @@ public: // Will be filled in later. header.Serialize(sink); - header.m_numTokens = ::base::checked_cast(m_postingsByToken.size()); SerializeDictionary(sink, header, startPos); SerializePostingsLists(sink, header, startPos); @@ -146,75 +242,43 @@ public: TextIndexHeader header; header.Deserialize(source); - std::vector tokens; - DeserializeDictionary(source, header, startPos, tokens); - DeserializePostingsLists(source, header, startPos, tokens); + DeserializeDictionary(source, header, startPos); + DeserializePostingsLists(source, header, startPos); } private: + void SortPostings() + { + for (auto & entry : m_postingsByToken) + { + // A posting may occur several times in a document, + // so we remove duplicates for the docid index. + // If the count is needed for ranking it may be stored + // separately. + my::SortUnique(entry.second); + } + } + + void BuildDictionary() + { + std::vector tokens; + tokens.reserve(m_postingsByToken.size()); + for (auto const & entry : m_postingsByToken) + tokens.emplace_back(entry.first); + m_dictionary.SetTokens(std::move(tokens)); + } + template void SerializeDictionary(Sink & sink, TextIndexHeader & header, uint64_t startPos) const { - header.m_dictPositionsOffset = RelativePos(sink, startPos); - // An uint32_t for each 32-bit offset and an uint32_t for the dummy entry at the end. - WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1)); - header.m_dictWordsOffset = RelativePos(sink, startPos); - - std::vector offsets; - offsets.reserve(header.m_numTokens + 1); - - for (auto const & entry : m_postingsByToken) - { - offsets.emplace_back(RelativePos(sink, startPos)); - SerializeToken(sink, entry.first); - } - offsets.emplace_back(RelativePos(sink, startPos)); - - { - uint64_t const savedPos = sink.Pos(); - sink.Seek(startPos + header.m_dictPositionsOffset); - - for (uint32_t const o : offsets) - WriteToSink(sink, o); - - CHECK_EQUAL(sink.Pos(), startPos + header.m_dictWordsOffset, ()); - sink.Seek(savedPos); - } + m_dictionary.Serialize(sink, header, startPos); } template - static void DeserializeDictionary(Source & source, TextIndexHeader const & header, - uint64_t startPos, std::vector & tokens) + void DeserializeDictionary(Source & source, TextIndexHeader const & header, uint64_t startPos) { CHECK_EQUAL(source.Pos(), startPos + header.m_dictPositionsOffset, ()); - std::vector tokenOffsets(header.m_numTokens + 1); - for (uint32_t & offset : tokenOffsets) - offset = ReadPrimitiveFromSource(source); - - CHECK_EQUAL(source.Pos(), startPos + header.m_dictWordsOffset, ()); - tokens.resize(header.m_numTokens); - for (size_t i = 0; i < tokens.size(); ++i) - { - size_t const size = ::base::checked_cast(tokenOffsets[i + 1] - tokenOffsets[i]); - DeserializeToken(source, tokens[i], size); - } - } - - template - static void SerializeToken(Sink & sink, Token const & token) - { - CHECK(!token.empty(), ()); - // todo(@m) Endianness. - sink.Write(token.data(), token.size() * sizeof(typename Token::value_type)); - } - - template - static void DeserializeToken(Source & source, Token & token, size_t size) - { - CHECK_GREATER(size, 0, ()); - ASSERT_EQUAL(size % sizeof(typename Token::value_type), 0, ()); - token.resize(size / sizeof(typename Token::value_type)); - source.Read(&token[0], size); + m_dictionary.Deserialize(source, header); } template @@ -258,14 +322,14 @@ private: } template - void DeserializePostingsLists(Source & source, TextIndexHeader const & header, uint64_t startPos, - std::vector const & tokens) + void DeserializePostingsLists(Source & source, TextIndexHeader const & header, uint64_t startPos) { CHECK_EQUAL(source.Pos(), startPos + header.m_postingsStartsOffset, ()); std::vector postingsStarts(header.m_numTokens + 1); for (uint32_t & start : postingsStarts) start = ReadPrimitiveFromSource(source); + auto const & tokens = m_dictionary.GetTokens(); CHECK_EQUAL(source.Pos(), startPos + header.m_postingsListsOffset, ()); m_postingsByToken.clear(); for (size_t i = 0; i < header.m_numTokens; ++i) @@ -283,18 +347,6 @@ private: } } - void SortPostings() - { - for (auto & entry : m_postingsByToken) - { - // A posting may occur several times in a document, - // so we remove duplicates for the docid index. - // If the count is needed for ranking it may be stored - // separately. - my::SortUnique(entry.second); - } - } - template static uint32_t RelativePos(Sink & sink, uint64_t startPos) { @@ -302,6 +354,62 @@ private: } std::map> m_postingsByToken; + TextIndexDictionary m_dictionary; +}; + +// A reader class for on-demand reading of postings lists from disk. +template +class TextIndexReader +{ +public: + TextIndexReader(FileReader const & fileReader) : m_fileReader(fileReader) + { + ReaderSource headerSource(m_fileReader); + TextIndexHeader header; + header.Deserialize(headerSource); + + uint64_t const dictStart = header.m_dictPositionsOffset; + uint64_t const dictEnd = header.m_postingsStartsOffset; + ReaderSource dictSource(m_fileReader.SubReader(dictStart, dictEnd - dictStart)); + m_dictionary.Deserialize(dictSource, header); + + uint64_t const postStart = header.m_postingsStartsOffset; + uint64_t const postEnd = header.m_postingsListsOffset; + ReaderSource postingsSource(m_fileReader.SubReader(postStart, postEnd - postStart)); + m_postingsStarts.resize(header.m_numTokens + 1); + for (uint32_t & start : m_postingsStarts) + start = ReadPrimitiveFromSource(postingsSource); + } + + // Executes |fn| on every posting associated with |token|. + // The order of postings is not specified. + template + void ForEachPosting(Token const & token, Fn && fn) const + { + uint32_t tokenId = 0; + if (!m_dictionary.GetTokenId(token, tokenId)) + return; + CHECK_LESS(tokenId + 1, m_postingsStarts.size(), ()); + + uint64_t const allPostingsStart = m_header.m_postingsListsOffset; + uint64_t const tokenPostingsStart = allPostingsStart + m_postingsStarts[tokenId]; + uint64_t const tokenPostingsEnd = allPostingsStart + m_postingsStarts[tokenId + 1]; + ReaderSource source( + m_fileReader.SubReader(tokenPostingsStart, tokenPostingsEnd - tokenPostingsStart)); + + uint32_t last = 0; + while (source.Size() > 0) + { + last += ReadVarUint(source); + fn(last); + } + } + +private: + FileReader m_fileReader; + TextIndexHeader m_header; + TextIndexDictionary m_dictionary; + std::vector m_postingsStarts; }; std::string DebugPrint(TextIndexVersion const & version); diff --git a/search/search_tests/CMakeLists.txt b/search/search_tests/CMakeLists.txt index dbc99ad2ac..bba643e1bd 100644 --- a/search/search_tests/CMakeLists.txt +++ b/search/search_tests/CMakeLists.txt @@ -33,6 +33,7 @@ omim_link_libraries( ${PROJECT_NAME} search_tests_support generator_tests_support + platform_tests_support search editor indexer diff --git a/search/search_tests/text_index_tests.cpp b/search/search_tests/text_index_tests.cpp index 5003ea1880..4007d1cb9d 100644 --- a/search/search_tests/text_index_tests.cpp +++ b/search/search_tests/text_index_tests.cpp @@ -4,6 +4,8 @@ #include "indexer/search_string_utils.hpp" +#include "platform/platform_tests_support/scoped_file.hpp" + #include "coding/reader.hpp" #include "coding/write_to_sink.hpp" #include "coding/writer.hpp" @@ -13,24 +15,29 @@ #include "std/transform_iterator.hpp" +#include #include +#include #include #include +using namespace platform::tests_support; using namespace search::base; using namespace search; using namespace std; namespace { +// Prepend several bytes to serialized indexes in order to check the relative offsets. +size_t const kSkip = 10; + template -void Serdes(MemTextIndex & memIndex, MemTextIndex & deserializedMemIndex) +void Serdes(MemTextIndex & memIndex, MemTextIndex & deserializedMemIndex, + vector & buf) { - // Prepend several bytes to check the relative offsets. - size_t const kSkip = 10; - vector buf; + buf.clear(); { - MemWriter writer(buf); + MemWriter> writer(buf); WriteZeroesToSink(writer, kSkip); memIndex.Serialize(writer); } @@ -42,9 +49,8 @@ void Serdes(MemTextIndex & memIndex, MemTextIndex & deserializedMe } } -template -void TestForEach(MemTextIndex const & index, Token const & token, - vector const & expected) +template +void TestForEach(Index const & index, Token const & token, vector const & expected) { vector actual; index.ForEachPosting(token, MakeBackInsertFunctor(actual)); @@ -54,7 +60,7 @@ void TestForEach(MemTextIndex const & index, Token const & token, namespace search { -UNIT_TEST(MemTextIndex_Smoke) +UNIT_TEST(TextIndex_Smoke) { using Token = string; @@ -75,18 +81,32 @@ UNIT_TEST(MemTextIndex_Smoke) } } + vector indexData; MemTextIndex deserializedMemIndex; - Serdes(memIndex, deserializedMemIndex); + Serdes(memIndex, deserializedMemIndex, indexData); for (auto const & index : {memIndex, deserializedMemIndex}) { - TestForEach(index, "a", {0, 1}); - TestForEach(index, "b", {0}); - TestForEach(index, "c", {0, 1}); + TestForEach(index, "a", {0, 1}); + TestForEach(index, "b", {0}); + TestForEach(index, "c", {0, 1}); + TestForEach(index, "d", {}); + } + + { + string contents; + copy_n(indexData.begin() + kSkip, indexData.size() - kSkip, back_inserter(contents)); + ScopedFile file("text_index_tmp", contents); + FileReader fileReader(file.GetFullPath()); + TextIndexReader textIndexReader(fileReader); + TestForEach(textIndexReader, "a", {0, 1}); + TestForEach(textIndexReader, "b", {0}); + TestForEach(textIndexReader, "c", {0, 1}); + TestForEach(textIndexReader, "d", {}); } } -UNIT_TEST(MemTextIndex_UniString) +UNIT_TEST(TextIndex_UniString) { using Token = strings::UniString; @@ -109,15 +129,16 @@ UNIT_TEST(MemTextIndex_UniString) SplitUniString(docsCollection[docId], addToIndex, delims); } + vector indexData; MemTextIndex deserializedMemIndex; - Serdes(memIndex, deserializedMemIndex); + Serdes(memIndex, deserializedMemIndex, indexData); for (auto const & index : {memIndex, deserializedMemIndex}) { - TestForEach(index, strings::MakeUniString("a"), {}); - TestForEach(index, strings::MakeUniString("â"), {0, 1}); - TestForEach(index, strings::MakeUniString("b"), {0}); - TestForEach(index, strings::MakeUniString("ç"), {0, 1}); + TestForEach(index, strings::MakeUniString("a"), {}); + TestForEach(index, strings::MakeUniString("â"), {0, 1}); + TestForEach(index, strings::MakeUniString("b"), {0}); + TestForEach(index, strings::MakeUniString("ç"), {0, 1}); } } } // namespace search