diff --git a/coding/write_to_sink.hpp b/coding/write_to_sink.hpp
index 2943f03255..f5921e4290 100644
--- a/coding/write_to_sink.hpp
+++ b/coding/write_to_sink.hpp
@@ -2,17 +2,19 @@
 
 #include "coding/endianness.hpp"
 
+#include <cstdint>
 #include <type_traits>
 
-template <typename TSink, typename T>
+template <typename Sink, typename T>
 std::enable_if_t<std::is_integral<T>::value || std::is_enum<T>::value, void> WriteToSink(
-    TSink & sink, T const & v)
+    Sink & sink, T const & v)
 {
   T const t = SwapIfBigEndian(v);
   sink.Write(&t, sizeof(T));
 }
 
-template <typename TSink> void WriteZeroesToSink(TSink & sink, uint64_t size)
+template <typename Sink>
+void WriteZeroesToSink(Sink & sink, uint64_t size)
 {
   uint8_t const zeroes[256] = { 0 };
   for (uint64_t i = 0; i < (size >> 8); ++i)
@@ -20,14 +22,17 @@ template <typename TSink> void WriteZeroesToSink(TSink & sink, uint64_t size)
   sink.Write(zeroes, size & 255);
 }
 
-template <typename SinkT> class WriterFunctor
+template <typename Sink>
+class WriterFunctor
 {
-  SinkT & m_Sink;
-
 public:
-  explicit WriterFunctor(SinkT & sink) : m_Sink(sink) {}
+  explicit WriterFunctor(Sink & sink) : m_sink(sink) {}
+
   template <typename T> void operator() (T const & t) const
   {
-    m_Sink.Write(&t, sizeof(T));
+    m_sink.Write(&t, sizeof(T));
   }
+
+private:
+  Sink & m_sink;
 };
diff --git a/search/CMakeLists.txt b/search/CMakeLists.txt
index 2d274a3b49..c5963745c2 100644
--- a/search/CMakeLists.txt
+++ b/search/CMakeLists.txt
@@ -7,6 +7,8 @@ set(
   approximate_string_match.hpp
   base/inverted_list.hpp
   base/mem_search_index.hpp
+  base/text_index.cpp
+  base/text_index.hpp
   bookmarks/data.cpp
   bookmarks/data.hpp
   bookmarks/processor.cpp
diff --git a/search/base/text_index.cpp b/search/base/text_index.cpp
new file mode 100644
index 0000000000..9b66ea5b30
--- /dev/null
+++ b/search/base/text_index.cpp
@@ -0,0 +1,27 @@
+#include "search/base/text_index.hpp"
+
+#include "base/assert.hpp"
+#include "base/string_utils.hpp"
+
+using namespace std;
+
+namespace search
+{
+namespace base
+{
+// static
+string const TextIndexHeader::kHeaderMagic = "mapsmetextidx";
+
+string DebugPrint(TextIndexVersion const & version)
+{
+  switch (version)
+  {
+  case TextIndexVersion::V0: return "V0";
+  }
+  string ret =
+      "Unknown TextIndexHeader version: " + strings::to_string(static_cast<uint32_t>(version));
+  ASSERT(false, (ret));
+  return ret;
+}
+}  // namespace base
+}  // namespace search
diff --git a/search/base/text_index.hpp b/search/base/text_index.hpp
new file mode 100644
index 0000000000..833c403b08
--- /dev/null
+++ b/search/base/text_index.hpp
@@ -0,0 +1,259 @@
+#pragma once
+
+#include "coding/reader.hpp"
+#include "coding/varint.hpp"
+#include "coding/write_to_sink.hpp"
+
+#include "base/assert.hpp"
+#include "base/checked_cast.hpp"
+#include "base/stl_helpers.hpp"
+#include "base/string_utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+// This file contains the structures needed to store an
+// updatable text index on disk.
+//
+// The index maps tokens of string type (typically std::string or
+// strings::UniString) to postings lists, i.e. to lists of entities
+// called postings that encode the locations of the strings in the collection
+// of the text documents that is being indexed. An example of a posting
+// is a document id (docid). Another example is a pair of a document id and
+// a position within the corresponding document.
+//
+// The updates are performed by rebuilding the index, either as a result
+// of merging several indexes together, or as a result of clearing outdated
+// entries from an old index.
+//
+// For version 0, the postings lists are docid arrays, i.e. arrays of unsigned
+// 32-bit integers stored in increasing order.
+// The structure of the index is:
+//   [header: version and offsets]
+//   [array containing the starting positions of tokens]
+//   [tokens, written without separators in the lexicographical order]
+//   [array containing the offsets for the postings lists]
+//   [postings lists, stored as delta-encoded varints]
+//
+// All offsets are measured relative to the start of the index.
+namespace search
+{
+namespace base
+{
+using Posting = uint32_t;
+
+enum class TextIndexVersion : uint8_t
+{
+  V0 = 0,
+  Latest = V0
+};
+
+struct TextIndexHeader
+{
+  template <typename Sink>
+  void Serialize(Sink & sink) const
+  {
+    CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
+
+    sink.Write(kHeaderMagic.data(), kHeaderMagic.size());
+    WriteToSink(sink, static_cast<uint8_t>(m_version));
+    WriteToSink(sink, m_numTokens);
+    WriteToSink(sink, m_dictPositionsOffset);
+    WriteToSink(sink, m_dictWordsOffset);
+    WriteToSink(sink, m_postingsStartsOffset);
+    WriteToSink(sink, m_postingsListsOffset);
+  }
+
+  template <typename Source>
+  void Deserialize(Source & source)
+  {
+    CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
+
+    std::string headerMagic(kHeaderMagic.size(), ' ');
+    source.Read(&headerMagic[0], headerMagic.size());
+    CHECK_EQUAL(headerMagic, kHeaderMagic, ());
+    m_version = static_cast<TextIndexVersion>(ReadPrimitiveFromSource<uint8_t>(source));
+    m_numTokens = ReadPrimitiveFromSource<uint32_t>(source);
+    m_dictPositionsOffset = ReadPrimitiveFromSource<uint32_t>(source);
+    m_dictWordsOffset = ReadPrimitiveFromSource<uint32_t>(source);
+    m_postingsStartsOffset = ReadPrimitiveFromSource<uint32_t>(source);
+    m_postingsListsOffset = ReadPrimitiveFromSource<uint32_t>(source);
+  }
+
+  static std::string const kHeaderMagic;
+  TextIndexVersion m_version = TextIndexVersion::Latest;
+  uint32_t m_numTokens = 0;
+  uint32_t m_dictPositionsOffset = 0;
+  uint32_t m_dictWordsOffset = 0;
+  uint32_t m_postingsStartsOffset = 0;
+  uint32_t m_postingsListsOffset = 0;
+};
+
+template <typename Token>
+class MemTextIndex
+{
+public:
+  MemTextIndex() = default;
+
+  void AddPosting(Token const & token, Posting const & posting)
+  {
+    m_postingsByToken[token].emplace_back(posting);
+  }
+
+  // Executes |f| on every posting associated with |token|.
+  // The order of postings is not specified.
+  template <typename F>
+  void ForEachPosting(Token const & token, F && f) const
+  {
+    auto it = m_postingsByToken.find(token);
+    if (it == m_postingsByToken.end())
+      return;
+    for (auto const & p : it->second)
+      f(p);
+  }
+
+  template <typename Sink>
+  void Serialize(Sink & sink)
+  {
+    SortPostings();
+
+    TextIndexHeader header;
+
+    uint64_t const startPos = sink.Pos();
+    header.Serialize(sink);
+
+    header.m_numTokens = ::base::checked_cast<uint32_t>(m_postingsByToken.size());
+
+    header.m_dictPositionsOffset = RelativePos(sink, startPos);
+    uint32_t offset = header.m_dictPositionsOffset;
+    for (auto const & entry : m_postingsByToken)
+    {
+      auto const & token = entry.first;
+      WriteToSink(sink, offset);
+      offset += static_cast<uint32_t>(token.size());
+    }
+    // One more for convenience.
+    WriteToSink(sink, offset);
+
+    header.m_dictWordsOffset = RelativePos(sink, startPos);
+    for (auto const & entry : m_postingsByToken)
+    {
+      auto const & token = entry.first;
+      sink.Write(token.data(), token.size());
+    }
+
+    header.m_postingsStartsOffset = RelativePos(sink, startPos);
+    // 4 bytes for each 32-bit position and 4 bytes for the dummy entry at the end.
+    WriteZeroesToSink(sink, 4 * (header.m_numTokens + 1));
+
+    header.m_postingsListsOffset = RelativePos(sink, startPos);
+
+    std::vector<uint32_t> postingsStarts;
+    postingsStarts.reserve(header.m_numTokens);
+    for (auto const & entry : m_postingsByToken)
+    {
+      auto const & postings = entry.second;
+
+      postingsStarts.emplace_back(RelativePos(sink, startPos));
+
+      uint64_t last = 0;
+      for (auto const & p : postings)
+      {
+        CHECK(last == 0 || last < p, (last, p));
+        uint64_t const delta = ::base::checked_cast<uint64_t>(p) - last;
+        WriteVarUint(sink, delta);
+        last = p;
+      }
+    }
+    // One more for convenience.
+    postingsStarts.emplace_back(RelativePos(sink, startPos));
+
+    {
+      uint64_t const savedPos = sink.Pos();
+      sink.Seek(startPos + header.m_postingsStartsOffset);
+      for (uint32_t const s : postingsStarts)
+        WriteToSink(sink, s);
+
+      CHECK_EQUAL(sink.Pos(), startPos + header.m_postingsListsOffset, ());
+      sink.Seek(savedPos);
+    }
+
+    uint64_t const finishPos = sink.Pos();
+    sink.Seek(startPos);
+    header.Serialize(sink);
+    sink.Seek(finishPos);
+  }
+
+  template <typename Source>
+  void Deserialize(Source & source)
+  {
+    uint64_t startPos = source.Pos();
+
+    TextIndexHeader header;
+    header.Deserialize(source);
+
+    CHECK_EQUAL(source.Pos(), startPos + header.m_dictPositionsOffset, ());
+    std::vector<uint32_t> tokenOffsets(header.m_numTokens + 1);
+    for (size_t i = 0; i < tokenOffsets.size(); ++i)
+      tokenOffsets[i] = ReadPrimitiveFromSource<uint32_t>(source);
+
+    CHECK_EQUAL(source.Pos(), startPos + header.m_dictWordsOffset, ());
+    std::vector<Token> tokens(header.m_numTokens);
+    for (size_t i = 0; i < tokens.size(); ++i)
+    {
+      size_t const size = ::base::checked_cast<size_t>(tokenOffsets[i + 1] - tokenOffsets[i]);
+      tokens[i].resize(size);
+      source.Read(&tokens[i][0], size);
+    }
+
+    CHECK_EQUAL(source.Pos(), startPos + header.m_postingsStartsOffset, ());
+    std::vector<uint32_t> postingsStarts(header.m_numTokens + 1);
+    for (size_t i = 0; i < postingsStarts.size(); ++i)
+      postingsStarts[i] = ReadPrimitiveFromSource<uint32_t>(source);
+
+    CHECK_EQUAL(source.Pos(), startPos + header.m_postingsListsOffset, ());
+    m_postingsByToken.clear();
+    for (size_t i = 0; i < header.m_numTokens; ++i)
+    {
+      std::vector<uint32_t> postings;
+      uint32_t last = 0;
+      while (source.Pos() < startPos + postingsStarts[i + 1])
+      {
+        last += ReadVarUint<uint32_t>(source);
+        postings.emplace_back(last);
+      }
+      CHECK_EQUAL(source.Pos(), startPos + postingsStarts[i + 1], ());
+
+      m_postingsByToken.emplace(tokens[i], postings);
+    }
+  }
+
+private:
+  void SortPostings()
+  {
+    for (auto & entry : m_postingsByToken)
+    {
+      // A posting may occur several times in a document,
+      // so we remove duplicates for the docid index.
+      // If the count is needed for ranking it may be stored
+      // separately.
+      my::SortUnique(entry.second);
+    }
+  }
+
+  template <typename Sink>
+  uint32_t RelativePos(Sink & sink, uint64_t startPos)
+  {
+    return ::base::checked_cast<uint32_t>(sink.Pos() - startPos);
+  }
+
+  std::map<Token, std::vector<Posting>> m_postingsByToken;
+};
+
+std::string DebugPrint(TextIndexVersion const & version);
+}  // namespace base
+}  // namespace search
diff --git a/search/search_tests/CMakeLists.txt b/search/search_tests/CMakeLists.txt
index f03f075bee..29bf341186 100644
--- a/search/search_tests/CMakeLists.txt
+++ b/search/search_tests/CMakeLists.txt
@@ -24,6 +24,7 @@ set(
   region_info_getter_tests.cpp
   segment_tree_tests.cpp
   string_match_test.cpp
+  text_index_tests.cpp
 )
 
 omim_add_test(${PROJECT_NAME} ${SRC})
diff --git a/search/search_tests/text_index_tests.cpp b/search/search_tests/text_index_tests.cpp
new file mode 100644
index 0000000000..342960205e
--- /dev/null
+++ b/search/search_tests/text_index_tests.cpp
@@ -0,0 +1,63 @@
+#include "testing/testing.hpp"
+
+#include "search/base/text_index.hpp"
+
+#include "coding/reader.hpp"
+#include "coding/write_to_sink.hpp"
+#include "coding/writer.hpp"
+
+#include "base/stl_add.hpp"
+#include "base/string_utils.hpp"
+
+using namespace search::base;
+using namespace search;
+using namespace std;
+
+UNIT_TEST(MemTextIndex_Smoke)
+{
+  vector<string> const docsCollection = {
+      "a b c",
+      "a c",
+  };
+
+  MemTextIndex<string> memIndex;
+
+  for (size_t docId = 0; docId < docsCollection.size(); ++docId)
+  {
+    strings::SimpleTokenizer tok(docsCollection[docId], " ");
+    while (tok)
+    {
+      memIndex.AddPosting(*tok, static_cast<uint32_t>(docId));
+      ++tok;
+    }
+  }
+
+  // Prepend several bytes to check the relative offsets.
+  size_t const kSkip = 10;
+  vector<uint8_t> buf;
+  {
+    MemWriter<vector<uint8_t>> writer(buf);
+    WriteZeroesToSink(writer, kSkip);
+    memIndex.Serialize(writer);
+  }
+
+  MemTextIndex<string> deserializedMemIndex;
+  {
+    MemReaderWithExceptions reader(buf.data() + kSkip, buf.size() - kSkip);
+    ReaderSource<MemReaderWithExceptions> source(reader);
+    deserializedMemIndex.Deserialize(source);
+  }
+
+  auto testForEach = [&](string const & token, vector<uint32_t> const & expected) {
+    for (auto const & idx : {memIndex, deserializedMemIndex})
+    {
+      vector<uint32_t> actual;
+      idx.ForEachPosting(token, MakeBackInsertFunctor(actual));
+      TEST_EQUAL(actual, expected, ());
+    }
+  };
+
+  testForEach("a", {0, 1});
+  testForEach("b", {0});
+  testForEach("c", {0, 1});
+}
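For reference, the postings lists in this commit are stored as delta-encoded varints: each list is sorted and deduplicated by SortPostings, and Serialize writes the difference between consecutive docids with WriteVarUint from coding/varint.hpp. Below is a minimal standalone sketch of that scheme, not part of the commit; the helper names EncodeVarUint, EncodePostings and DecodePostings are hypothetical, and the usual LEB128-style varint layout is assumed.

#include <cassert>
#include <cstdint>
#include <vector>

// LEB128-style varint: 7 payload bits per byte, high bit set on every
// byte except the last.
void EncodeVarUint(uint64_t v, std::vector<uint8_t> & out)
{
  while (v >= 0x80)
  {
    out.push_back(static_cast<uint8_t>(v) | 0x80);
    v >>= 7;
  }
  out.push_back(static_cast<uint8_t>(v));
}

// Postings are sorted and unique, so every delta after the first is at
// least 1 and the encoding is reversible.
std::vector<uint8_t> EncodePostings(std::vector<uint32_t> const & postings)
{
  std::vector<uint8_t> buf;
  uint64_t last = 0;
  for (uint32_t p : postings)
  {
    EncodeVarUint(p - last, buf);
    last = p;
  }
  return buf;
}

std::vector<uint32_t> DecodePostings(std::vector<uint8_t> const & buf)
{
  std::vector<uint32_t> postings;
  uint32_t last = 0;
  size_t i = 0;
  while (i < buf.size())
  {
    // Reassemble one varint, then undo the delta encoding.
    uint32_t delta = 0;
    int shift = 0;
    while (buf[i] & 0x80)
    {
      delta |= static_cast<uint32_t>(buf[i++] & 0x7f) << shift;
      shift += 7;
    }
    delta |= static_cast<uint32_t>(buf[i++]) << shift;
    last += delta;
    postings.push_back(last);
  }
  return postings;
}

int main()
{
  // Docids {3, 200, 210} become deltas {3, 197, 10}; 197 takes two
  // varint bytes (0xc5 0x01), the other deltas one byte each.
  std::vector<uint32_t> const postings = {3, 200, 210};
  auto const buf = EncodePostings(postings);
  assert(buf.size() == 4);
  assert(DecodePostings(buf) == postings);
  return 0;
}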