diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro index f782173851..3241f9eea5 100644 --- a/coding/coding_tests/coding_tests.pro +++ b/coding/coding_tests/coding_tests.pro @@ -6,7 +6,7 @@ TEMPLATE = app ROOT_DIR = ../.. -DEPENDENCIES = coding base minizip tomcrypt succinct +DEPENDENCIES = coding base indexer minizip tomcrypt succinct include($$ROOT_DIR/common.pri) diff --git a/coding/coding_tests/trie_test.cpp b/coding/coding_tests/trie_test.cpp index 8576cad436..8935d42563 100644 --- a/coding/coding_tests/trie_test.cpp +++ b/coding/coding_tests/trie_test.cpp @@ -1,20 +1,23 @@ #include "testing/testing.hpp" +#include "coding/byte_stream.hpp" +#include "coding/reader.hpp" #include "coding/trie.hpp" #include "coding/trie_builder.hpp" #include "coding/trie_reader.hpp" -#include "coding/byte_stream.hpp" #include "coding/write_to_sink.hpp" +#include "indexer/coding_params.hpp" +#include "indexer/string_file_values.hpp" + #include "base/logging.hpp" #include "std/algorithm.hpp" +#include "std/cstring.hpp" #include "std/string.hpp" #include "std/vector.hpp" -#include "std/cstring.hpp" #include - namespace { @@ -83,14 +86,13 @@ string DebugPrint(KeyValuePair const & p) struct KeyValuePairBackInserter { - vector m_v; template - void operator()(TString const & s, trie::FixedSizeValueReader<4>::ValueType const & rawValue) + void operator()(TString const & s, uint32_t const & value) { - uint32_t value; - memcpy(&value, &rawValue, 4); m_v.push_back(KeyValuePair(s, value)); } + + vector m_v; }; struct MaxValueCalc @@ -110,14 +112,18 @@ struct MaxValueCalc class CharValueList { public: + using TValue = char; + + void Init(vector const &) {} + CharValueList(const string & s) : m_string(s) {} - size_t size() const { return m_string.size(); } + size_t Size() const { return m_string.size(); } - bool empty() const { return m_string.empty(); } + bool IsEmpty() const { return m_string.empty(); } template - void Dump(TSink & sink) const + void 
Serialize(TSink & sink) const { sink.Write(m_string.data(), m_string.size()); } @@ -126,31 +132,61 @@ private: string m_string; }; -class Uint32ValueList +} // namespace + +template <> +class ValueList { public: - using TBuffer = vector; + using TValue = uint32_t; - void Append(uint32_t value) - { - m_values.push_back(value); - } + ValueList() = default; + ValueList(serial::CodingParams const & codingParams) : m_codingParams(codingParams) {} - uint32_t size() const { return m_values.size(); } + void Init(vector const & values) { m_values = values; } - bool empty() const { return m_values.empty(); } + size_t Size() const { return m_values.size(); } + + bool IsEmpty() const { return m_values.empty(); } template - void Dump(TSink & sink) const + void Serialize(TSink & sink) const { - sink.Write(m_values.data(), m_values.size() * sizeof(TBuffer::value_type)); + for (auto const & value : m_values) + WriteToSink(sink, value); } -private: - TBuffer m_values; -}; + template + void Deserialize(TSource & src, uint32_t valueCount) + { + m_values.resize(valueCount); + for (size_t i = 0; i < valueCount; ++i) + m_values[i] = ReadPrimitiveFromSource(src); + } -} // unnamed namespace + template + void Deserialize(TSource & src) + { + while (src.Size() > 0) + { + m_values.push_back(TValue()); + m_values.back() = ReadPrimitiveFromSource(src); + } + } + + template + void ForEach(TF && f) const + { + for (auto const & value : m_values) + f(value); + } + + void SetCodingParams(serial::CodingParams const & codingParams) { m_codingParams = codingParams; } + +private: + vector m_values; + serial::CodingParams m_codingParams; +}; #define ZENC bits::ZigZagEncode #define MKSC(x) static_cast(x) @@ -158,8 +194,8 @@ private: UNIT_TEST(TrieBuilder_WriteNode_Smoke) { - vector serial; - PushBackByteSink > sink(serial); + vector buf; + PushBackByteSink> sink(buf); ChildNodeInfo children[] = { ChildNodeInfo(true, 1, "1A"), ChildNodeInfo(false, 2, "B"), ChildNodeInfo(false, 3, "zz"), 
ChildNodeInfo(true, 4, @@ -194,7 +230,7 @@ UNIT_TEST(TrieBuilder_WriteNode_Smoke) MKUC(BOOST_BINARY(11000000) | ZENC(0)), // Child 5: header: [+leaf] [+supershort] }; - TEST_EQUAL(serial, vector(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ()); + TEST_EQUAL(buf, vector(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ()); } UNIT_TEST(TrieBuilder_Build) @@ -230,15 +266,15 @@ UNIT_TEST(TrieBuilder_Build) for (size_t i = 0; i < v.size(); ++i) vs.push_back(string(v[i].m_key.begin(), v[i].m_key.end())); - vector serial; - PushBackByteSink > sink(serial); + vector buf; + PushBackByteSink> sink(buf); trie::Build>, typename vector::iterator, - Uint32ValueList>(sink, v.begin(), v.end()); - reverse(serial.begin(), serial.end()); + ValueList>(sink, v.begin(), v.end()); + reverse(buf.begin(), buf.end()); - MemReader memReader = MemReader(&serial[0], serial.size()); - using TIterator = trie::Iterator::ValueType>; - auto const root = trie::ReadTrie(memReader, trie::FixedSizeValueReader<4>()); + MemReader memReader = MemReader(&buf[0], buf.size()); + auto const root = + trie::ReadTrie>(memReader, serial::CodingParams()); vector res; KeyValuePairBackInserter f; trie::ForEachRef(*root, f, vector()); diff --git a/coding/compressed_bit_vector.hpp b/coding/compressed_bit_vector.hpp index d048c86767..95d2d86ac4 100644 --- a/coding/compressed_bit_vector.hpp +++ b/coding/compressed_bit_vector.hpp @@ -156,6 +156,14 @@ public: static unique_ptr Deserialize(TReader & reader) { ReaderSource src(reader); + return DeserializeFromSource(src); + } + + // Reads a bit vector from source which must contain a valid + // bit vector representation (see CompressedBitVector::Serialize for the format). 
+ template + static unique_ptr DeserializeFromSource(TSource & src) + { uint8_t header = ReadPrimitiveFromSource(src); CompressedBitVector::StorageStrategy strat = static_cast(header); @@ -174,7 +182,7 @@ public: return make_unique(move(setBits)); } } - return nullptr; + return unique_ptr(); } }; diff --git a/coding/trie.hpp b/coding/trie.hpp index 589541f450..ef80a14e11 100644 --- a/coding/trie.hpp +++ b/coding/trie.hpp @@ -4,6 +4,8 @@ #include "base/base.hpp" #include "base/buffer_vector.hpp" +#include "indexer/string_file_values.hpp" + #include "std/unique_ptr.hpp" namespace trie @@ -16,7 +18,7 @@ typedef uint32_t TrieChar; // However 0 is used because the first byte is actually language id. static uint32_t const DEFAULT_CHAR = 0; -template +template class Iterator { //dbg::ObjectTracker m_tracker; @@ -29,12 +31,12 @@ public: }; buffer_vector m_edge; - buffer_vector m_value; + TValueList m_valueList; virtual ~Iterator() {} - virtual unique_ptr> Clone() const = 0; - virtual unique_ptr> GoToEdge(size_t i) const = 0; + virtual unique_ptr> Clone() const = 0; + virtual unique_ptr> GoToEdge(size_t i) const = 0; }; struct EmptyValueReader @@ -65,11 +67,13 @@ struct FixedSizeValueReader } }; -template -void ForEachRef(Iterator const & iter, F & f, TString const & s) +template +void ForEachRef(Iterator const & iter, TF && f, TString const & s) { - for (size_t i = 0; i < iter.m_value.size(); ++i) - f(s, iter.m_value[i]); + iter.m_valueList.ForEach([&f, &s](typename TValueList::TValue value) + { + f(s, value); + }); for (size_t i = 0; i < iter.m_edge.size(); ++i) { TString s1(s); @@ -79,4 +83,4 @@ void ForEachRef(Iterator const & iter, F & f, TString const & s) } } -} // namespace Trie +} // namespace trie diff --git a/coding/trie_builder.hpp b/coding/trie_builder.hpp index 03ab9f3614..d42ebaf119 100644 --- a/coding/trie_builder.hpp +++ b/coding/trie_builder.hpp @@ -6,6 +6,7 @@ #include "base/buffer_vector.hpp" #include "std/algorithm.hpp" +#include "std/vector.hpp" // 
Trie format: // [1: header] @@ -46,18 +47,18 @@ void WriteNode(TSink & sink, TrieChar baseChar, TValueList const & valueList, if (begChild == endChild && !isRoot) { // Leaf node. - valueList.Dump(sink); + valueList.Serialize(sink); return; } uint32_t const childCount = endChild - begChild; - uint32_t const valueCount = valueList.size(); + uint32_t const valueCount = valueList.Size(); uint8_t const header = static_cast((min(valueCount, 3U) << 6) + min(childCount, 63U)); sink.Write(&header, 1); if (valueCount >= 3) WriteVarUint(sink, valueCount); if (childCount >= 63) WriteVarUint(sink, childCount); - valueList.Dump(sink); + valueList.Serialize(sink); for (TChildIter it = begChild; it != endChild; /*++it*/) { uint8_t header = (it->IsLeaf() ? 128 : 0); @@ -103,7 +104,7 @@ struct ChildInfo { bool m_isLeaf; uint32_t m_size; - buffer_vector m_edge; + vector m_edge; ChildInfo(bool isLeaf, uint32_t size, TrieChar c) : m_isLeaf(isLeaf), m_size(size), m_edge(1, c) { @@ -121,19 +122,44 @@ struct NodeInfo uint64_t m_begPos; TrieChar m_char; vector m_children; + + // This is ugly but will do until we rename ValueList. + // Here is the rationale. ValueList<> is the entity that + // we store in the nodes of the search trie. It can be read + // or written via its methods but not directly as was assumed + // in a previous version of this code. That version provided + // serialization methods for ValueList but the deserialization + // was ad hoc. + // Because of the possibility of serialized ValueLists to represent + // something completely different from an array of FeatureIds + // (a compressed bit vector, for instance) and because of the + // need to update a node's ValueList until the node is finalized + // this vector is needed here. It is better to leave it here + // than to expose it in ValueList. 
+ vector m_temporaryValueList; TValueList m_valueList; + bool m_mayAppend; NodeInfo() : m_begPos(0), m_char(0) {} - NodeInfo(uint64_t pos, TrieChar trieChar) : m_begPos(pos), m_char(trieChar) {} + NodeInfo(uint64_t pos, TrieChar trieChar) : m_begPos(pos), m_char(trieChar), m_mayAppend(true) {} + + // It is finalized in the sense that no more appends are possible + // so it is a fine moment to initialize the underlying ValueList. + void FinalizeValueList() + { + m_valueList.Init(m_temporaryValueList); + m_mayAppend = false; + } }; template -void WriteNodeReverse(TSink & sink, TrieChar baseChar, NodeInfo const & node, +void WriteNodeReverse(TSink & sink, TrieChar baseChar, NodeInfo & node, bool isRoot = false) { using TOutStorage = buffer_vector; TOutStorage out; PushBackByteSink outSink(out); + node.FinalizeValueList(); WriteNode(outSink, baseChar, node.m_valueList, node.m_children.rbegin(), node.m_children.rend(), isRoot); reverse(out.begin(), out.end()); @@ -150,33 +176,48 @@ void PopNodes(TSink & sink, TNodes & nodes, int nodesToPop) TNodeInfo & node = nodes.back(); TNodeInfo & prevNode = nodes[nodes.size() - 2]; - if (node.m_valueList.empty() && node.m_children.size() <= 1) + if (node.m_temporaryValueList.empty() && node.m_children.size() <= 1) { ASSERT_EQUAL(node.m_children.size(), 1, ()); ChildInfo & child = node.m_children[0]; - prevNode.m_children.push_back(ChildInfo(child.m_isLeaf, child.m_size, node.m_char)); - prevNode.m_children.back().m_edge.append(child.m_edge.begin(), child.m_edge.end()); + prevNode.m_children.emplace_back(child.m_isLeaf, child.m_size, node.m_char); + auto & prevChild = prevNode.m_children.back(); + prevChild.m_edge.insert(prevChild.m_edge.end(), child.m_edge.begin(), child.m_edge.end()); } else { WriteNodeReverse(sink, node.m_char, node); - prevNode.m_children.push_back(ChildInfo(node.m_children.empty(), - static_cast(sink.Pos() - node.m_begPos), - node.m_char)); + prevNode.m_children.emplace_back( + node.m_children.empty(), 
static_cast(sink.Pos() - node.m_begPos), node.m_char); } nodes.pop_back(); } } +template +void AppendValue(TNodeInfo & node, TValue const & value) +{ + // External-memory trie adds pairs in a sorted + // order so the values are supposed to be accumulated in the + // sorted order and we can avoid sorting them before doing + // further operations such as ValueList construction. + using namespace std::rel_ops; + ASSERT(node.m_temporaryValueList.empty() || node.m_temporaryValueList.back() <= value, ()); + if (!node.m_temporaryValueList.empty() && node.m_temporaryValueList.back() == value) + return; + ASSERT(node.m_mayAppend, ()); + node.m_temporaryValueList.push_back(value); +} + template void Build(TSink & sink, TIter const beg, TIter const end) { using TTrieString = buffer_vector; using TNodeInfo = NodeInfo; - buffer_vector nodes; - nodes.push_back(TNodeInfo(sink.Pos(), DEFAULT_CHAR)); + vector nodes; + nodes.emplace_back(sink.Pos(), DEFAULT_CHAR); TTrieString prevKey; @@ -200,8 +241,8 @@ void Build(TSink & sink, TIter const beg, TIter const end) uint64_t const pos = sink.Pos(); for (size_t i = nCommon; i < key.size(); ++i) - nodes.push_back(TNodeInfo(pos, key[i])); - nodes.back().m_valueList.Append(e.GetValue()); + nodes.emplace_back(pos, key[i]); + AppendValue(nodes.back(), e.GetValue()); prevKey.swap(key); prevE.Swap(e); diff --git a/coding/trie_reader.hpp b/coding/trie_reader.hpp index 30c4596fce..f3978f460a 100644 --- a/coding/trie_reader.hpp +++ b/coding/trie_reader.hpp @@ -3,42 +3,38 @@ #include "coding/reader.hpp" #include "coding/varint.hpp" +#include "indexer/coding_params.hpp" +#include "indexer/string_file_values.hpp" + #include "base/assert.hpp" #include "base/bits.hpp" #include "base/macros.hpp" namespace trie { -template -class LeafIterator0 : public Iterator +template +class LeafIterator0 : public Iterator { public: - using ValueType = typename TValueReader::ValueType; + using Iterator::m_valueList; template - LeafIterator0(TReader const & reader, 
TValueReader const & valueReader) + LeafIterator0(TReader const & reader, serial::CodingParams const & codingParams) { - uint32_t const size = static_cast(reader.Size()); ReaderSource src(reader); - while (src.Pos() < size) - { - this->m_value.push_back(ValueType()); -#ifdef DEBUG - uint64_t const pos = src.Pos(); -#endif - valueReader(src, this->m_value.back()); - ASSERT_NOT_EQUAL(pos, src.Pos(), ()); - } - ASSERT_EQUAL(size, src.Pos(), ()); + m_valueList.SetCodingParams(codingParams); + m_valueList.Deserialize(src); + // todo(@mpimenov) There used to be an assert here + // that src is completely exhausted by this time. } // trie::Iterator overrides: - unique_ptr> Clone() const override + unique_ptr> Clone() const override { - return make_unique>(*this); + return make_unique>(*this); } - unique_ptr> GoToEdge(size_t i) const override + unique_ptr> GoToEdge(size_t i) const override { ASSERT(false, (i)); UNUSED_VALUE(i); @@ -46,70 +42,38 @@ public: } }; -template -class IteratorImplBase : public Iterator -{ -protected: - enum { IS_READER_IN_MEMORY = 0 }; -}; - -template -class IteratorImplBase - : public Iterator -{ -protected: - enum { IS_READER_IN_MEMORY = 1 }; -}; - -template -class Iterator0 : public IteratorImplBase +template +class Iterator0 : public Iterator { public: - typedef typename TValueReader::ValueType ValueType; + using Iterator::m_valueList; + using Iterator::m_edge; - Iterator0(TReader const & reader, TValueReader const & valueReader, TrieChar baseChar) - : m_reader(reader), m_valueReader(valueReader) + Iterator0(TReader const & reader, TrieChar baseChar, serial::CodingParams const & codingParams) + : m_reader(reader), m_codingParams(codingParams) { + m_valueList.SetCodingParams(m_codingParams); ParseNode(baseChar); } // trie::Iterator overrides: - unique_ptr> Clone() const override + unique_ptr> Clone() const override { - return make_unique>(*this); + return make_unique>(*this); } - unique_ptr> GoToEdge(size_t i) const override + unique_ptr> 
GoToEdge(size_t i) const override { ASSERT_LESS(i, this->m_edge.size(), ()); uint32_t const offset = m_edgeInfo[i].m_offset; uint32_t const size = m_edgeInfo[i+1].m_offset - offset; - // TODO: Profile to check that MemReader optimization helps? - /* - if (!IteratorImplBase::IS_READER_IN_MEMORY && - size < 1024) - { - SharedMemReader memReader(size); - m_reader.Read(offset, memReader.Data(), size); - if (m_edgeInfo[i].m_isLeaf) - return make_unique>( - memReader, m_valueReader); - else - return make_unique>( - memReader, m_valueReader, - this->m_edge[i].m_str.back()); - } - else - */ - { - if (m_edgeInfo[i].m_isLeaf) - return make_unique>(m_reader.SubReader(offset, size), - m_valueReader); - else - return make_unique>( - m_reader.SubReader(offset, size), m_valueReader, this->m_edge[i].m_str.back()); - } + if (m_edgeInfo[i].m_isLeaf) + return make_unique>(m_reader.SubReader(offset, size), + m_codingParams); + + return make_unique>( + m_reader.SubReader(offset, size), this->m_edge[i].m_str.back(), m_codingParams); } private: @@ -131,9 +95,7 @@ private: childCount = ReadVarUint(src); // [value] ... [value] - this->m_value.resize(valueCount); - for (uint32_t i = 0; i < valueCount; ++i) - m_valueReader(src, this->m_value[i]); + m_valueList.Deserialize(src, valueCount); // [childInfo] ... [childInfo] this->m_edge.resize(childCount); @@ -141,7 +103,7 @@ private: m_edgeInfo[0].m_offset = 0; for (uint32_t i = 0; i < childCount; ++i) { - typename Iterator::Edge & e = this->m_edge[i]; + typename Iterator::Edge & e = this->m_edge[i]; // [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)] uint8_t const header = ReadPrimitiveFromSource(src); @@ -185,15 +147,15 @@ private: buffer_vector m_edgeInfo; TReader m_reader; - TValueReader m_valueReader; + serial::CodingParams m_codingParams; }; // Returns iterator to the root of the trie. 
-template -unique_ptr> ReadTrie( - TReader const & reader, TValueReader valueReader = TValueReader()) +template +unique_ptr> ReadTrie(TReader const & reader, + serial::CodingParams const & codingParams) { - return make_unique>(reader, valueReader, DEFAULT_CHAR); + return make_unique>(reader, DEFAULT_CHAR, codingParams); } } // namespace trie diff --git a/generator/dumper.cpp b/generator/dumper.cpp index daed6b2d1f..38a5dde4c8 100644 --- a/generator/dumper.cpp +++ b/generator/dumper.cpp @@ -164,7 +164,7 @@ namespace feature SearchTokensCollector() : m_currentS(), m_currentCount(0) {} - void operator()(strings::UniString const & s, trie::ValueReader::ValueType const &) + void operator()(strings::UniString const & s, FeatureWithRankAndCenter const &) { if (m_currentS == s) { @@ -198,10 +198,10 @@ namespace feature { FilesContainerR container(new FileReader(fPath)); feature::DataHeader header(container); - serial::CodingParams cp(trie::GetCodingParams(header.GetDefCodingParams())); + serial::CodingParams codingParams(trie::GetCodingParams(header.GetDefCodingParams())); - auto const pTrieRoot = - trie::ReadTrie(container.GetReader(SEARCH_INDEX_FILE_TAG), trie::ValueReader(cp)); + auto const pTrieRoot = trie::ReadTrie>( + container.GetReader(SEARCH_INDEX_FILE_TAG), codingParams); SearchTokensCollector f; trie::ForEachRef(*pTrieRoot, f, strings::UniString()); diff --git a/indexer/search_index_builder.cpp b/indexer/search_index_builder.cpp index 0176fdac25..62220efc6d 100644 --- a/indexer/search_index_builder.cpp +++ b/indexer/search_index_builder.cpp @@ -13,12 +13,13 @@ #include "indexer/string_file_values.hpp" #include "indexer/types_skipper.hpp" -#include "search/search_common.hpp" // for MAX_TOKENS constant +#include "search/search_common.hpp" // for MAX_TOKENS constant #include "defines.hpp" #include "platform/platform.hpp" +#include "coding/file_name_utils.hpp" #include "coding/reader_writer_ops.hpp" #include "coding/trie_builder.hpp" #include "coding/writer.hpp" @@ 
-42,7 +43,6 @@ namespace { - class SynonymsHolder { unordered_multimap m_map; @@ -155,27 +155,23 @@ template struct ValueBuilder; template <> -struct ValueBuilder +struct ValueBuilder { - using TSaver = trie::ValueReader; - TSaver m_valueSaver; - - ValueBuilder(serial::CodingParams const & cp) : m_valueSaver(cp) {} + ValueBuilder(serial::CodingParams const & cp) : m_cp(cp) {} void MakeValue(FeatureType const & ft, feature::TypesHolder const & types, uint32_t index, - SerializedFeatureInfoValue & value) const + FeatureWithRankAndCenter & v) const { - TSaver::ValueType v; + v.SetCodingParams(m_cp); + v.m_featureId = index; // get BEST geometry rect of feature v.m_pt = feature::GetCenter(ft); v.m_rank = feature::GetSearchRank(types, v.m_pt, ft.GetPopulation()); - - // write to buffer - PushBackByteSink sink(value.m_value); - m_valueSaver.Save(sink, v); } + + serial::CodingParams m_cp; }; template <> @@ -197,7 +193,6 @@ class FeatureInserter CategoriesHolder const & m_categories; using ValueT = typename TStringsFile::ValueT; - using TSaver = trie::ValueReader; pair m_scales; @@ -265,169 +260,91 @@ public: void AddFeatureNameIndexPairs(FilesContainerR const & container, CategoriesHolder & categoriesHolder, - StringsFile & stringsFile) + StringsFile & stringsFile) { FeaturesVectorTest features(container); feature::DataHeader const & header = features.GetHeader(); - ValueBuilder valueBuilder; + serial::CodingParams codingParams(trie::GetCodingParams(header.GetDefCodingParams())); + + ValueBuilder valueBuilder(codingParams); unique_ptr synonyms; if (header.GetType() == feature::DataHeader::world) synonyms.reset(new SynonymsHolder(GetPlatform().WritablePathForFile(SYNONYMS_FILE))); - features.GetVector().ForEach(FeatureInserter>( + features.GetVector().ForEach(FeatureInserter>( synonyms.get(), stringsFile, categoriesHolder, header.GetScaleRange(), valueBuilder)); } - -void BuildSearchIndex(FilesContainerR const & cont, CategoriesHolder const & catHolder, - Writer & writer, 
string const & tmpFilePath) -{ - { - FeaturesVectorTest features(cont); - feature::DataHeader const & header = features.GetHeader(); - - serial::CodingParams cp(trie::GetCodingParams(header.GetDefCodingParams())); - ValueBuilder valueBuilder(cp); - - unique_ptr synonyms; - if (header.GetType() == feature::DataHeader::world) - synonyms.reset(new SynonymsHolder(GetPlatform().WritablePathForFile(SYNONYMS_FILE))); - - StringsFile names(tmpFilePath); - - features.GetVector().ForEach(FeatureInserter>( - synonyms.get(), names, catHolder, header.GetScaleRange(), valueBuilder)); - - names.EndAdding(); - names.OpenForRead(); - - trie::Build::IteratorT, - ValueList>(writer, names.Begin(), names.End()); - - // at this point all readers of StringsFile should be dead - } - - FileWriter::DeleteFileX(tmpFilePath); -} } // namespace -namespace indexer { -bool BuildSearchIndexFromDatFile(string const & datFile, bool forceRebuild) +namespace indexer { - LOG(LINFO, ("Start building search index. Bits = ", search::kPointCodingBits)); - - try - { - Platform & pl = GetPlatform(); - string const tmpFile1 = datFile + ".search_index_1.tmp"; - string const tmpFile2 = datFile + ".search_index_2.tmp"; - - { - FilesContainerR readCont(datFile); - - if (!forceRebuild && readCont.IsExist(SEARCH_INDEX_FILE_TAG)) - return true; - - FileWriter writer(tmpFile2); - - CategoriesHolder catHolder(pl.GetReader(SEARCH_CATEGORIES_FILE_NAME)); - - BuildSearchIndex(readCont, catHolder, writer, tmpFile1); - - LOG(LINFO, ("Search index size = ", writer.Size())); - } - - { - // Write to container in reversed order. 
- FilesContainerW writeCont(datFile, FileWriter::OP_WRITE_EXISTING); - FileWriter writer = writeCont.GetWriter(SEARCH_INDEX_FILE_TAG); - rw_ops::Reverse(FileReader(tmpFile2), writer); - } - - FileWriter::DeleteFileX(tmpFile2); - } - catch (Reader::Exception const & e) - { - LOG(LERROR, ("Error while reading file: ", e.Msg())); - return false; - } - catch (Writer::Exception const & e) - { - LOG(LERROR, ("Error writing index file: ", e.Msg())); - return false; - } - - LOG(LINFO, ("End building search index.")); - return true; -} - -bool AddCompresedSearchIndexSection(string const & fName, bool forceRebuild) +bool BuildSearchIndexFromDatFile(string const & datFile, bool forceRebuild) { Platform & platform = GetPlatform(); - FilesContainerR readContainer(platform.GetReader(fName)); - if (readContainer.IsExist(COMPRESSED_SEARCH_INDEX_FILE_TAG) && !forceRebuild) + FilesContainerR readContainer(platform.GetReader(datFile)); + if (readContainer.IsExist(SEARCH_INDEX_FILE_TAG) && !forceRebuild) return true; - string const indexFile = platform.WritablePathForFile("compressed-search-index.tmp"); - MY_SCOPE_GUARD(indexFileGuard, bind(&FileWriter::DeleteFileX, indexFile)); + string mwmName = datFile; + my::GetNameFromFullPath(mwmName); + my::GetNameWithoutExt(mwmName); + string const indexFilePath = platform.WritablePathForFile(mwmName + ".sdx.tmp"); + string const stringsFilePath = platform.WritablePathForFile(mwmName + ".sdx.strings.tmp"); + MY_SCOPE_GUARD(indexFileGuard, bind(&FileWriter::DeleteFileX, indexFilePath)); + MY_SCOPE_GUARD(stringsFileGuard, bind(&FileWriter::DeleteFileX, stringsFilePath)); try { { - FileWriter indexWriter(indexFile); - BuildCompressedSearchIndex(readContainer, indexWriter); + FileWriter indexWriter(indexFilePath); + BuildSearchIndex(readContainer, indexWriter, stringsFilePath); + LOG(LINFO, ("Search index size = ", indexWriter.Size())); } { FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING); - FileWriter 
writer = writeContainer.GetWriter(COMPRESSED_SEARCH_INDEX_FILE_TAG); - rw_ops::Reverse(FileReader(indexFile), writer); + FileWriter writer = writeContainer.GetWriter(SEARCH_INDEX_FILE_TAG); + rw_ops::Reverse(FileReader(indexFilePath), writer); } } catch (Reader::Exception const & e) { - LOG(LERROR, ("Error while reading file: ", e.Msg())); + LOG(LERROR, ("Error while reading file:", e.Msg())); return false; } catch (Writer::Exception const & e) { - LOG(LERROR, ("Error writing index file: ", e.Msg())); + LOG(LERROR, ("Error writing index file:", e.Msg())); return false; } return true; } -void BuildCompressedSearchIndex(FilesContainerR & container, Writer & indexWriter) +void BuildSearchIndex(FilesContainerR & container, Writer & indexWriter, + string const & stringsFilePath) { Platform & platform = GetPlatform(); - LOG(LINFO, ("Start building compressed search index for", container.GetFileName())); + LOG(LINFO, ("Start building search index for", container.GetFileName())); my::Timer timer; - string stringsFilePath = platform.WritablePathForFile("strings.tmp"); - StringsFile stringsFile(stringsFilePath); - MY_SCOPE_GUARD(stringsFileGuard, bind(&FileWriter::DeleteFileX, stringsFilePath)); + StringsFile stringsFile(stringsFilePath); CategoriesHolder categoriesHolder(platform.GetReader(SEARCH_CATEGORIES_FILE_NAME)); AddFeatureNameIndexPairs(container, categoriesHolder, stringsFile); stringsFile.EndAdding(); - LOG(LINFO, ("End sorting strings:", timer.ElapsedSeconds())); stringsFile.OpenForRead(); - trie::Build::IteratorT, - ValueList>(indexWriter, stringsFile.Begin(), stringsFile.End()); + trie::Build::IteratorT, + ValueList>(indexWriter, stringsFile.Begin(), + stringsFile.End()); - LOG(LINFO, ("End building compressed search index, elapsed seconds:", timer.ElapsedSeconds())); -} - -void BuildCompressedSearchIndex(string const & fName, Writer & indexWriter) -{ - FilesContainerR container(GetPlatform().GetReader(fName)); - BuildCompressedSearchIndex(container, 
indexWriter); + LOG(LINFO, ("End building search index, elapsed seconds:", timer.ElapsedSeconds())); } } // namespace indexer diff --git a/indexer/search_index_builder.hpp b/indexer/search_index_builder.hpp index e22a9e35af..9181e08f17 100644 --- a/indexer/search_index_builder.hpp +++ b/indexer/search_index_builder.hpp @@ -7,11 +7,8 @@ class Writer; namespace indexer { -bool BuildSearchIndexFromDatFile(string const & fName, bool forceRebuild = false); +bool BuildSearchIndexFromDatFile(string const & filename, bool forceRebuild = false); -bool AddCompresedSearchIndexSection(string const & fName, bool forceRebuild); - -void BuildCompressedSearchIndex(FilesContainerR & container, Writer & indexWriter); - -void BuildCompressedSearchIndex(string const & fName, Writer & indexWriter); +void BuildSearchIndex(FilesContainerR & container, Writer & indexWriter, + string const & stringsFilePath); } // namespace indexer diff --git a/indexer/search_trie.hpp b/indexer/search_trie.hpp index 8a13fa73e1..baece68132 100644 --- a/indexer/search_trie.hpp +++ b/indexer/search_trie.hpp @@ -1,6 +1,7 @@ #pragma once #include "indexer/geometry_serialization.hpp" +#include "indexer/string_file_values.hpp" #include "coding/reader.hpp" #include "coding/trie.hpp" @@ -15,40 +16,7 @@ static const uint8_t kPointCodingBits = 20; namespace trie { - -// Value: feature offset and search rank are stored. 
-class ValueReader -{ - serial::CodingParams const & m_cp; - -public: - explicit ValueReader(serial::CodingParams const & cp) : m_cp(cp) {} - - struct ValueType - { - m2::PointD m_pt; // Center point of feature; - uint32_t m_featureId; // Offset of the feature; - uint8_t m_rank; // Rank of feature; - }; - - template - void operator()(TSource & src, ValueType & v) const - { - v.m_pt = serial::LoadPoint(src, m_cp); - v.m_featureId = ReadPrimitiveFromSource(src); - v.m_rank = ReadPrimitiveFromSource(src); - } - - template - void Save(TSink & sink, ValueType const & v) const - { - serial::SavePoint(sink, v.m_pt, m_cp); - WriteToSink(sink, v.m_featureId); - WriteToSink(sink, v.m_rank); - } -}; - -using DefaultIterator = trie::Iterator; +using DefaultIterator = trie::Iterator>; inline serial::CodingParams GetCodingParams(serial::CodingParams const & orig) { diff --git a/indexer/string_file.hpp b/indexer/string_file.hpp index d2c888917c..105f7cf94b 100644 --- a/indexer/string_file.hpp +++ b/indexer/string_file.hpp @@ -74,7 +74,7 @@ public: void Read(TReader & src) { rw::Read(src, m_name); - m_val.Read(src); + m_val.DeserializeFromSource(src); } inline void const * value_data() const { return m_val.data(); } @@ -84,7 +84,7 @@ public: void Swap(TString & r) { m_name.swap(r.m_name); - m_val.swap(r.m_val); + m_val.Swap(r.m_val); } }; @@ -125,7 +125,7 @@ public: trie.ForEach([&memWriter](const strings::UniString & s, const ValueT & v) { rw::Write(memWriter, s); - v.Write(memWriter); + v.Serialize(memWriter); }); } diff --git a/indexer/string_file_values.hpp b/indexer/string_file_values.hpp index d75e449371..45279c7012 100644 --- a/indexer/string_file_values.hpp +++ b/indexer/string_file_values.hpp @@ -1,12 +1,17 @@ #pragma once -#include "coding/old_compressed_bit_vector.hpp" +#include "coding/compressed_bit_vector.hpp" #include "coding/read_write_utils.hpp" #include "coding/write_to_sink.hpp" +#include "indexer/coding_params.hpp" +#include "indexer/geometry_serialization.hpp" 
+ #include "base/assert.hpp" #include "std/algorithm.hpp" +#include "std/unique_ptr.hpp" +#include "std/utility.hpp" /// Following classes are supposed to be used with StringsFile. They /// allow to write/read them, compare or serialize to an in-memory @@ -29,7 +34,7 @@ struct FeatureIndexValue template void Read(TReader & reader) { - m_value = ReadPrimitiveFromSource(reader); + m_value = ReadPrimitiveFromSource(reader); } inline void const * data() const { return &m_value; } @@ -42,114 +47,190 @@ struct FeatureIndexValue void swap(FeatureIndexValue & value) { ::swap(m_value, value.m_value); } - uint32_t m_value; + uint64_t m_value; }; -/// A wrapper around serialized SaverT::ValueType. -struct SerializedFeatureInfoValue +struct FeatureWithRankAndCenter { - using ValueT = buffer_vector; + FeatureWithRankAndCenter() = default; + + FeatureWithRankAndCenter(m2::PointD pt, uint32_t featureId, uint8_t rank, + serial::CodingParams codingParams) + : m_pt(pt), m_featureId(featureId), m_rank(rank), m_codingParams(codingParams) + { + } template - void Write(TWriter & writer) const + void Serialize(TWriter & writer) const { - rw::WriteVectorOfPOD(writer, m_value); + serial::SavePoint(writer, m_pt, m_codingParams); + WriteToSink(writer, m_featureId); + WriteToSink(writer, m_rank); } template - void Read(TReader & reader) + void Deserialize(TReader & reader) { - rw::ReadVectorOfPOD(reader, m_value); + ReaderSource src(reader); + DeserializeFromSource(src); } - inline void const * data() const { return m_value.data(); } - - inline size_t size() const { return m_value.size() * sizeof(ValueT::value_type); } - - bool operator<(SerializedFeatureInfoValue const & value) const { return m_value < value.m_value; } - - bool operator==(SerializedFeatureInfoValue const & value) const + template + void DeserializeFromSource(TSource & src) { - return m_value == value.m_value; + m_pt = serial::LoadPoint(src, m_codingParams); + m_featureId = ReadPrimitiveFromSource(src); + m_rank = 
ReadPrimitiveFromSource(src); } - void swap(SerializedFeatureInfoValue & value) { m_value.swap(value.m_value); } + bool operator<(FeatureWithRankAndCenter const & o) const { return m_featureId < o.m_featureId; } - ValueT m_value; + bool operator==(FeatureWithRankAndCenter const & o) const { return m_featureId == o.m_featureId; } + + void Swap(FeatureWithRankAndCenter & o) + { + swap(m_pt, o.m_pt); + swap(m_featureId, o.m_featureId); + swap(m_rank, o.m_rank); + } + + void SetCodingParams(serial::CodingParams const & codingParams) { m_codingParams = codingParams; } + + m2::PointD m_pt; // Center point of the feature. + uint32_t m_featureId; // Offset of the feature. + uint8_t m_rank; // Rank of the feature. + serial::CodingParams m_codingParams; }; -/// This template is used to accumulate and serialize a group of -/// values of the same type. -template +// This template is used to accumulate, serialize and deserialize +// a group of values of the same type. +template class ValueList; -/// ValueList serializes a group of features -/// indices as a compressed bit vector, thus, allowing us to save a -/// disk space. +// ValueList serializes a group of feature +// indices as a compressed bit vector. template <> class ValueList { public: - void Append(FeatureIndexValue const & value) + using TValue = FeatureIndexValue; + + ValueList() : m_cbv(unique_ptr()) {} + + void Init(vector const & values) { - // External-memory trie adds pairs in a sorted - // order, thus, values are supposed to be accumulated in a - // sorted order, and we can avoid sorting them before construction - // of a CompressedBitVector. 
- ASSERT(m_offsets.empty() || m_offsets.back() <= value.m_value, ()); - if (!m_offsets.empty() && m_offsets.back() == value.m_value) - return; - m_offsets.push_back(value.m_value); + vector offsets(values.size()); + for (size_t i = 0; i < offsets.size(); ++i) + offsets[i] = values[i].m_value; + m_cbv = coding::CompressedBitVectorBuilder::FromBitPositions(offsets); } - /// This method returns number of values in the current instance of - /// ValueList, but as these values are actually - /// features indices and can be dumped as a single serialized - /// compressed bit vector, this method returns 1 when there're at - /// least one feature's index in the list - so, compressed bit - /// vector will be built and serialized - and 0 otherwise. - size_t size() const { return m_offsets.empty() ? 0 : 1; } + // This method returns number of values in the current instance of + // ValueList, but as these values are actually + // features indices and can be dumped as a single serialized + // compressed bit vector, this method returns 1 when there're at + // least one feature's index in the list - so, compressed bit + // vector will be built and serialized - and 0 otherwise. + size_t Size() const { return (m_cbv && m_cbv->PopCount() != 0) ? 1 : 0; } - bool empty() const { return m_offsets.empty(); } + bool IsEmpty() const { return Size() == 0; } - template - void Dump(SinkT & sink) const + template + void Serialize(TSink & sink) const { - vector buffer; - MemWriter> writer(buffer); - BuildCompressedBitVector(writer, m_offsets); - sink.Write(buffer.data(), buffer.size()); + vector buf; + MemWriter> writer(buf); + m_cbv->Serialize(writer); + sink.Write(buf.data(), buf.size()); } + // Note the default parameter. It is here for compatibility with + // an old data format that was serializing FeatureWithRankAndCenter`s. + // They were put in a vector, this vector's size was encoded somehow + // and then the vector was written with a method similar to Serialize above.
+ // The deserialization code read the valueCount separately and then + // read each FeatureWithRankAndCenter one by one. + // A newer approach is to make Serialize/Deserialize responsible for + // every part of serialization and as such it does not need valueCount. + template + void Deserialize(TSource & src, uint32_t valueCount = 0) + { + m_cbv = coding::CompressedBitVectorBuilder::DeserializeFromSource(src); + } + + template + void ForEach(TF && f) const + { + coding::CompressedBitVectorEnumerator::ForEach(*m_cbv, forward(f)); + } + + void SetCodingParams(serial::CodingParams const & codingParams) { m_codingParams = codingParams; } + private: - vector m_offsets; + unique_ptr m_cbv; + serial::CodingParams m_codingParams; }; -/// ValueList sequentially serializes +/// ValueList sequentially serializes /// encoded features infos. template <> -class ValueList +class ValueList { public: - ValueList() : m_size(0) {} + using TValue = FeatureWithRankAndCenter; - void Append(SerializedFeatureInfoValue const & value) + ValueList() = default; + ValueList(serial::CodingParams const & codingParams) : m_codingParams(codingParams) {} + + void Init(vector const & values) { m_values = values; } + + size_t Size() const { return m_values.size(); } + + bool IsEmpty() const { return m_values.empty(); } + + template + void Serialize(TSink & sink) const { - m_value.insert(m_value.end(), value.m_value.begin(), value.m_value.end()); - ++m_size; + for (auto const & value : m_values) + value.Serialize(sink); } - size_t size() const { return m_size; } - - bool empty() const { return !m_size; } - - template - void Dump(SinkT & sink) const + template + void Deserialize(TSource & src, uint32_t valueCount) { - sink.Write(m_value.data(), m_value.size()); + m_values.resize(valueCount); + for (size_t i = 0; i < valueCount; ++i) + m_values[i].DeserializeFromSource(src); } + // When valueCount is not known, Deserialize reads + // until the source is exhausted. 
+ template + void Deserialize(TSource & src) + { + uint32_t const size = static_cast(src.Size()); + while (src.Pos() < size) + { +#ifdef DEBUG + uint64_t const pos = src.Pos(); +#endif + m_values.push_back(TValue()); + m_values.back().DeserializeFromSource(src); + ASSERT_NOT_EQUAL(pos, src.Pos(), ()); + } + ASSERT_EQUAL(size, src.Pos(), ()); + } + + template + void ForEach(TF && f) const + { + for (auto const & value : m_values) + f(value); + } + + void SetCodingParams(serial::CodingParams const & codingParams) { m_codingParams = codingParams; } + private: - buffer_vector m_value; - uint32_t m_size; + vector m_values; + serial::CodingParams m_codingParams; }; diff --git a/search/feature_offset_match.hpp b/search/feature_offset_match.hpp index 1c69bc42f7..c3bcb4c638 100644 --- a/search/feature_offset_match.hpp +++ b/search/feature_offset_match.hpp @@ -112,8 +112,9 @@ void FullMatchInTrie(trie::DefaultIterator const & trieRoot, strings::UniChar co #endif ASSERT_EQUAL ( symbolsMatched, s.size(), () ); - for (size_t i = 0; i < it->m_value.size(); ++i) - f(it->m_value[i]); + + LOG(LINFO, ("foreach`ing", it->m_valueList.Size())); + it->m_valueList.ForEach(f); } template @@ -141,13 +142,10 @@ void PrefixMatchInTrie(trie::DefaultIterator const & trieRoot, strings::UniChar while (!trieQueue.empty()) { - // Next 2 lines don't throw any exceptions while moving - // ownership from container to smart pointer. 
auto const it = trieQueue.back(); trieQueue.pop_back(); - for (size_t i = 0; i < it->m_value.size(); ++i) - f(it->m_value[i]); + it->m_valueList.ForEach(f); for (size_t i = 0; i < it->m_edge.size(); ++i) trieQueue.push_back(it->GoToEdge(i)); @@ -157,24 +155,21 @@ void PrefixMatchInTrie(trie::DefaultIterator const & trieRoot, strings::UniChar template class OffsetIntersecter { - using ValueT = trie::ValueReader::ValueType; + using TValue = FeatureWithRankAndCenter; struct HashFn { - size_t operator() (ValueT const & v) const - { - return v.m_featureId; - } + size_t operator()(TValue const & v) const { return v.m_featureId; } }; struct EqualFn { - bool operator() (ValueT const & v1, ValueT const & v2) const + bool operator()(TValue const & v1, TValue const & v2) const { return (v1.m_featureId == v2.m_featureId); } }; - using TSet = unordered_set; + using TSet = unordered_set; TFilter const & m_filter; unique_ptr m_prevSet; @@ -183,7 +178,7 @@ class OffsetIntersecter public: explicit OffsetIntersecter(TFilter const & filter) : m_filter(filter), m_set(new TSet) {} - void operator() (ValueT const & v) + void operator()(TValue const & v) { if (m_prevSet && !m_prevSet->count(v)) return; diff --git a/search/retrieval.cpp b/search/retrieval.cpp index 48c0582536..c715469060 100644 --- a/search/retrieval.cpp +++ b/search/retrieval.cpp @@ -67,8 +67,9 @@ unique_ptr RetrieveAddressFeatures(MwmSet::MwmHandl ASSERT(value, ()); serial::CodingParams codingParams(trie::GetCodingParams(value->GetHeader().GetDefCodingParams())); ModelReaderPtr searchReader = value->m_cont.GetReader(SEARCH_INDEX_FILE_TAG); - auto const trieRoot = trie::ReadTrie(SubReaderWrapper(searchReader.GetPtr()), - trie::ValueReader(codingParams)); + auto const trieRoot = + trie::ReadTrie, ValueList>( + SubReaderWrapper(searchReader.GetPtr()), codingParams); auto emptyFilter = [](uint32_t /* featureId */) { diff --git a/search/search_query.cpp b/search/search_query.cpp index 7ecbb920ec..120cc4bf22 100644 --- 
a/search/search_query.cpp +++ b/search/search_query.cpp @@ -1612,7 +1612,8 @@ void Query::SearchLocality(MwmValue const * pMwm, Locality & res1, Region & res2 ModelReaderPtr searchReader = pMwm->m_cont.GetReader(SEARCH_INDEX_FILE_TAG); auto const trieRoot = - trie::ReadTrie(SubReaderWrapper(searchReader.GetPtr()), trie::ValueReader(cp)); + trie::ReadTrie, ValueList>( + SubReaderWrapper(searchReader.GetPtr()), cp); ForEachLangPrefix(params, *trieRoot, [&](TrieRootPrefix & langRoot, int8_t lang) { diff --git a/search/search_query.hpp b/search/search_query.hpp index 0671f169bf..db6ba4c74a 100644 --- a/search/search_query.hpp +++ b/search/search_query.hpp @@ -110,7 +110,7 @@ public: /// @name This stuff is public for implementation classes in search_query.cpp /// Do not use it in client code. //@{ - using TTrieValue = trie::ValueReader::ValueType; + using TTrieValue = FeatureWithRankAndCenter; void InitParams(bool localitySearch, SearchQueryParams & params);