diff --git a/coding/coding_tests/trie_test.cpp b/coding/coding_tests/trie_test.cpp index 4bed389d6e..f57ba2dfe5 100644 --- a/coding/coding_tests/trie_test.cpp +++ b/coding/coding_tests/trie_test.cpp @@ -54,11 +54,12 @@ struct KeyValuePair uint32_t GetKeySize() const { return m_key.size(); } trie::TrieChar const * GetKeyData() const { return m_key.data(); } + uint32_t GetValue() const { return m_value; } template void SerializeValue(TCont & cont) const { - cont.resize(4); - memcpy(cont.data(), &m_value, 4); + cont.resize(sizeof(m_value)); + memcpy(cont.data(), &m_value, sizeof(m_value)); } bool operator == (KeyValuePair const & p) const @@ -113,6 +114,49 @@ struct MaxValueCalc } }; +class CharValueList +{ +public: + CharValueList(const string & s) : m_string(s) {} + + size_t size() const { return m_string.size(); } + + template + void Dump(SinkT & sink) const + { + sink.Write(m_string.data(), m_string.size()); + } + +private: + string m_string; +}; + +class Uint32ValueList +{ +public: + Uint32ValueList() : m_size(0) {} + + void Append(uint32_t value) + { + size_t const originalSize = m_value.size(); + m_value.resize(originalSize + sizeof(value)); + memcpy(m_value.data() + originalSize, &value, sizeof(value)); + ++m_size; + } + + uint32_t size() const { return m_size; } + + template + void Dump(SinkT & sink) const + { + sink.Write(&m_value[0], m_value.size()); + } + +private: + vector m_value; + uint32_t m_size; +}; + } // unnamed namespace #define ZENC bits::ZigZagEncode @@ -132,8 +176,9 @@ UNIT_TEST(TrieBuilder_WriteNode_Smoke) "abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij", "i4"), ChildNodeInfo(true, 5, "a", "5z") }; - trie::builder::WriteNode(sink, 0, 3, "123", 3, - &children[0], &children[0] + ARRAY_SIZE(children)); + + CharValueList valueList("123"); + trie::builder::WriteNode(sink, 0, valueList, &children[0], &children[0] + ARRAY_SIZE(children)); uint8_t const expected [] = { BOOST_BINARY(11000101), // Header: [0b11] [0b000101] @@ -202,7 +247,9 @@ UNIT_TEST(TrieBuilder_Build) vector serial; PushBackByteSink > sink(serial); - trie::Build(sink, v.begin(), v.end(), trie::builder::MaxValueEdgeBuilder()); + trie::Build >, typename vector::iterator, + trie::builder::MaxValueEdgeBuilder, Uint32ValueList>( + sink, v.begin(), v.end(), trie::builder::MaxValueEdgeBuilder()); reverse(serial.begin(), serial.end()); // LOG(LINFO, (serial.size(), vs)); diff --git a/coding/trie_builder.hpp b/coding/trie_builder.hpp index f14461284c..7f222de854 100644 --- a/coding/trie_builder.hpp +++ b/coding/trie_builder.hpp @@ -1,7 +1,10 @@ #pragma once -#include "../coding/byte_stream.hpp" -#include "../coding/varint.hpp" + +#include "byte_stream.hpp" +#include "varint.hpp" + #include "../base/buffer_vector.hpp" + #include "../std/algorithm.hpp" // Trie format: @@ -39,27 +42,25 @@ namespace trie { namespace builder { - -template -void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount, - void const * const valuesDataSize, uint32_t const valuesSize, - ChildIterT const begChild, ChildIterT const endChild, - bool isRoot = false) +template +void WriteNode(SinkT & sink, TrieChar baseChar, ValueListT & valueList, + ChildIterT const begChild, ChildIterT const endChild, bool isRoot = false) { if (begChild == endChild && !isRoot) { // Leaf node. 
- sink.Write(valuesDataSize, valuesSize); + valueList.Dump(sink); return; } uint32_t const childCount = endChild - begChild; + uint32_t const valueCount = valueList.size(); uint8_t const header = static_cast((min(valueCount, 3U) << 6) + min(childCount, 63U)); sink.Write(&header, 1); if (valueCount >= 3) WriteVarUint(sink, valueCount); if (childCount >= 63) WriteVarUint(sink, childCount); - sink.Write(valuesDataSize, valuesSize); + valueList.Dump(sink); for (ChildIterT it = begChild; it != endChild; /*++it*/) { uint8_t header = (it->IsLeaf() ? 128 : 0); @@ -70,9 +71,9 @@ void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount, uint32_t const diff0 = bits::ZigZagEncode(int32_t(edge[0] - baseChar)); if (edgeSize == 1 && (diff0 & ~63U) == 0) { - header |= 64; - header |= diff0; - WriteToSink(sink, header); + header |= 64; + header |= diff0; + WriteToSink(sink, header); } else { @@ -102,20 +103,6 @@ void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount, } } -template -void WriteNodeReverse(SinkT & sink, TrieChar baseChar, uint32_t const valueCount, - void const * const valuesDataSize, uint32_t const valuesSize, - ChildIterT const begChild, ChildIterT const endChild, - bool isRoot = false) -{ - typedef buffer_vector OutStorageType; - OutStorageType out; - PushBackByteSink outSink(out); - WriteNode(outSink, baseChar, valueCount, valuesDataSize, valuesSize, begChild, endChild, isRoot); - reverse(out.begin(), out.end()); - sink.Write(out.data(), out.size()); -} - struct ChildInfo { bool m_isLeaf; @@ -136,21 +123,35 @@ struct ChildInfo uint32_t GetEdgeValueSize() const { return m_edgeValue.size(); } }; -template +template struct NodeInfo { uint64_t m_begPos; TrieChar m_char; vector m_children; - buffer_vector m_values; - uint32_t m_valueCount; + ValueListT m_valueList; EdgeBuilderT m_edgeBuilder; - NodeInfo() : m_valueCount(0) {} + NodeInfo() : m_begPos(0), m_char(0) {} NodeInfo(uint64_t pos, TrieChar trieChar, EdgeBuilderT const & edgeBuilder) - : m_begPos(pos), m_char(trieChar), m_valueCount(0), m_edgeBuilder(edgeBuilder) {} + : m_begPos(pos), m_char(trieChar), m_edgeBuilder(edgeBuilder) + { + } }; +template +void WriteNodeReverse(SinkT & sink, TrieChar baseChar, NodeInfo & node, + bool isRoot = false) +{ + typedef buffer_vector OutStorageType; + OutStorageType out; + PushBackByteSink outSink(out); + WriteNode(outSink, baseChar, node.m_valueList, node.m_children.rbegin(), node.m_children.rend(), + isRoot); + reverse(out.begin(), out.end()); + sink.Write(out.data(), out.size()); +} + template void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop) { @@ -161,9 +162,8 @@ void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop) NodeInfoType & node = nodes.back(); NodeInfoType & prevNode = nodes[nodes.size() - 2]; - if (node.m_valueCount == 0 && node.m_children.size() <= 1) + if (node.m_valueList.size() == 0 && node.m_children.size() <= 1) { - ASSERT(node.m_values.empty(), ()); ASSERT_EQUAL(node.m_children.size(), 1, ()); ChildInfo & child = node.m_children[0]; prevNode.m_children.push_back(ChildInfo(child.m_isLeaf, child.m_size, node.m_char)); @@ -171,9 +171,7 @@ void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop) } else { - WriteNodeReverse(sink, node.m_char, node.m_valueCount, - node.m_values.data(), node.m_values.size(), - node.m_children.rbegin(), node.m_children.rend()); + WriteNodeReverse(sink, node.m_char, node); prevNode.m_children.push_back(ChildInfo(node.m_children.empty(), static_cast(sink.Pos() - node.m_begPos), node.m_char)); @@ -231,12 
+229,12 @@ struct MaxValueEdgeBuilder } // namespace builder -template +template void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const & edgeBuilder) { typedef buffer_vector TrieString; typedef buffer_vector TrieValue; - typedef builder::NodeInfo NodeInfoT; + typedef builder::NodeInfo NodeInfoT; buffer_vector nodes; nodes.push_back(NodeInfoT(sink.Pos(), DEFAULT_CHAR, edgeBuilder)); @@ -264,12 +262,10 @@ void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const & uint64_t const pos = sink.Pos(); for (size_t i = nCommon; i < key.size(); ++i) - nodes.push_back(builder::NodeInfo(pos, key[i], edgeBuilder)); + nodes.push_back(NodeInfoT(pos, key[i], edgeBuilder)); + nodes.back().m_valueList.Append(e.GetValue()); e.SerializeValue(value); - nodes.back().m_values.insert(nodes.back().m_values.end(), - value.begin(), value.end()); - nodes.back().m_valueCount += 1; nodes.back().m_edgeBuilder.AddValue(value.data(), value.size()); prevKey.swap(key); @@ -280,10 +276,7 @@ void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const & builder::PopNodes(sink, nodes, nodes.size() - 1); // Write the root. - WriteNodeReverse(sink, DEFAULT_CHAR, nodes.back().m_valueCount, - nodes.back().m_values.data(), nodes.back().m_values.size(), - nodes.back().m_children.rbegin(), nodes.back().m_children.rend(), - true); + WriteNodeReverse(sink, DEFAULT_CHAR /* baseChar */, nodes.back(), true /* isRoot */); } } // namespace trie diff --git a/defines.hpp b/defines.hpp index 0e949c8e3c..8fec4e0fdb 100644 --- a/defines.hpp +++ b/defines.hpp @@ -15,6 +15,7 @@ #define METADATA_FILE_TAG "meta" #define METADATA_INDEX_FILE_TAG "metaidx" #define FEATURES_OFFSETS_TABLE_FILE_TAG "offsets" +#define COMPRESSED_SEARCH_INDEX_FILE_TAG "csdx" #define ROUTING_MATRIX_FILE_TAG "mercedes" #define ROUTING_EDGEDATA_FILE_TAG "daewoo" diff --git a/indexer/indexer.pro b/indexer/indexer.pro index c45197d6b5..b227e74d31 100644 --- a/indexer/indexer.pro +++ b/indexer/indexer.pro @@ -44,7 +44,6 @@ SOURCES += \ search_delimiters.cpp \ search_index_builder.cpp \ search_string_utils.cpp \ - string_file.cpp \ types_mapping.cpp \ HEADERS += \ @@ -94,6 +93,7 @@ HEADERS += \ search_string_utils.hpp \ search_trie.hpp \ string_file.hpp \ + string_file_values.hpp \ tesselator_decl.hpp \ tree_structure.hpp \ types_mapping.hpp \ diff --git a/indexer/search_index_builder.cpp b/indexer/search_index_builder.cpp index 12ca7b4e68..fb1c5cdcaf 100644 --- a/indexer/search_index_builder.cpp +++ b/indexer/search_index_builder.cpp @@ -5,6 +5,7 @@ #include "search_trie.hpp" #include "search_string_utils.hpp" #include "string_file.hpp" +#include "string_file_values.hpp" #include "classificator.hpp" #include "feature_visibility.hpp" #include "categories_holder.hpp" @@ -20,15 +21,19 @@ #include "../coding/writer.hpp" #include "../coding/reader_writer_ops.hpp" +#include "../base/assert.hpp" +#include "../base/timer.hpp" +#include "../base/scope_guard.hpp" #include "../base/string_utils.hpp" #include "../base/logging.hpp" #include "../base/stl_add.hpp" #include "../std/algorithm.hpp" -#include "../std/vector.hpp" -#include "../std/unordered_map.hpp" #include "../std/fstream.hpp" #include "../std/initializer_list.hpp" +#include "../std/limits.hpp" +#include "../std/unordered_map.hpp" +#include "../std/vector.hpp" #define SYNONYMS_FILE "synonyms.txt" @@ -84,20 +89,21 @@ public: } }; +template struct FeatureNameInserter { SynonymsHolder * m_synonyms; - StringsFile & m_names; - StringsFile::ValueT m_val; + StringsFileT & 
m_names; + typename StringsFileT::ValueT m_val; - FeatureNameInserter(SynonymsHolder * synonyms, StringsFile & names) + FeatureNameInserter(SynonymsHolder * synonyms, StringsFileT & names) : m_synonyms(synonyms), m_names(names) { } void AddToken(signed char lang, strings::UniString const & s) const { - m_names.AddString(StringsFile::StringT(s, lang, m_val)); + m_names.AddString(typename StringsFileT::StringT(s, lang, m_val)); } private: @@ -141,22 +147,19 @@ public: } }; -class FeatureInserter +template +struct ValueBuilder; + +template <> +struct ValueBuilder { - SynonymsHolder * m_synonyms; - StringsFile & m_names; - - CategoriesHolder const & m_categories; - - typedef StringsFile::ValueT ValueT; typedef search::trie::ValueReader SaverT; SaverT m_valueSaver; - pair m_scales; + ValueBuilder(serial::CodingParams const & cp) : m_valueSaver(cp) {} - - void MakeValue(FeatureType const & f, feature::TypesHolder const & types, - uint64_t pos, ValueT & value) const + void MakeValue(FeatureType const & f, feature::TypesHolder const & types, uint64_t pos, + SerializedFeatureInfoValue & value) const { SaverT::ValueType v; v.m_featureId = static_cast(pos); @@ -166,9 +169,36 @@ class FeatureInserter v.m_rank = feature::GetSearchRank(types, v.m_pt, f.GetPopulation()); // write to buffer - PushBackByteSink sink(value); + PushBackByteSink sink(value.m_value); m_valueSaver.Save(sink, v); } +}; + +template <> +struct ValueBuilder +{ + void MakeValue(FeatureType const & /* f */, feature::TypesHolder const & /* types */, + uint64_t pos, FeatureIndexValue & value) const + { + ASSERT_LESS(pos, numeric_limits::max(), ()); + value.m_value = static_cast(pos); + } +}; + +template +class FeatureInserter +{ + SynonymsHolder * m_synonyms; + StringsFileT & m_names; + + CategoriesHolder const & m_categories; + + typedef typename StringsFileT::ValueT ValueT; + typedef search::trie::ValueReader SaverT; + + pair m_scales; + + ValueBuilder const & m_valueBuilder; /// There are 3 different ways of search index skipping: /// - skip features in any case (m_skipFeatures) @@ -276,17 +306,16 @@ class FeatureInserter }; public: - FeatureInserter(SynonymsHolder * synonyms, StringsFile & names, - CategoriesHolder const & catHolder, - serial::CodingParams const & cp, - pair const & scales) + FeatureInserter(SynonymsHolder * synonyms, StringsFileT & names, + CategoriesHolder const & catHolder, pair const & scales, + ValueBuilder const & valueBuilder) : m_synonyms(synonyms), m_names(names), - m_categories(catHolder), m_valueSaver(cp), m_scales(scales) + m_categories(catHolder), m_scales(scales), m_valueBuilder(valueBuilder) { } void operator() (FeatureType const & f, uint64_t pos) const - { + { feature::TypesHolder types(f); static SkipIndexing skipIndex; @@ -297,8 +326,9 @@ public: // Init inserter with serialized value. // Insert synonyms only for countries and states (maybe will add cities in future). - FeatureNameInserter inserter(skipIndex.IsCountryOrState(types) ? m_synonyms : 0, m_names); - MakeValue(f, types, pos, inserter.m_val); + FeatureNameInserter inserter(skipIndex.IsCountryOrState(types) ? m_synonyms : 0, + m_names); + m_valueBuilder.MakeValue(f, types, pos, inserter.m_val); // Skip types for features without names. 
if (!f.ForEachNameRef(inserter)) @@ -332,6 +362,41 @@ public: } }; +template +struct FeatureInserterAdapter +{ + FeatureInserterAdapter(FeatureInserterT & inserter) : m_inserter(inserter), m_index(0) {} + + void operator()(FeatureType const & f, uint64_t pos) + { + m_inserter(f, m_index++); + } + + FeatureInserterT & m_inserter; + uint64_t m_index; +}; + +void AddFeatureNameIndexPairs(FilesContainerR const & container, + CategoriesHolder & categoriesHolder, + StringsFile & stringsFile) +{ + feature::DataHeader header; + header.Load(container.GetReader(HEADER_FILE_TAG)); + FeaturesVector features(container, header); + + ValueBuilder valueBuilder; + + unique_ptr synonyms; + if (header.GetType() == feature::DataHeader::world) + synonyms.reset(new SynonymsHolder(GetPlatform().WritablePathForFile(SYNONYMS_FILE))); + + FeatureInserter> inserter( + synonyms.get(), stringsFile, categoriesHolder, header.GetScaleRange(), valueBuilder); + + FeatureInserterAdapter>> adapter(inserter); + features.ForEachOffset(adapter); +} + void BuildSearchIndex(FilesContainerR const & cont, CategoriesHolder const & catHolder, Writer & writer, string const & tmpFilePath) { @@ -341,30 +406,33 @@ void BuildSearchIndex(FilesContainerR const & cont, CategoriesHolder const & cat FeaturesVector featuresV(cont, header); serial::CodingParams cp(search::GetCPForTrie(header.GetDefCodingParams())); + ValueBuilder valueBuilder(cp); unique_ptr synonyms; if (header.GetType() == feature::DataHeader::world) synonyms.reset(new SynonymsHolder(GetPlatform().WritablePathForFile(SYNONYMS_FILE))); - StringsFile names(tmpFilePath); + StringsFile names(tmpFilePath); - featuresV.ForEachOffset(FeatureInserter(synonyms.get(), names, - catHolder, cp, header.GetScaleRange())); + featuresV.ForEachOffset(FeatureInserter>( + synonyms.get(), names, catHolder, header.GetScaleRange(), valueBuilder)); names.EndAdding(); names.OpenForRead(); - trie::Build(writer, names.Begin(), names.End(), trie::builder::EmptyEdgeBuilder()); + trie::Build::IteratorT, + trie::builder::EmptyEdgeBuilder, ValueList>( + writer, names.Begin(), names.End(), trie::builder::EmptyEdgeBuilder()); // at this point all readers of StringsFile should be dead } FileWriter::DeleteFileX(tmpFilePath); } +} // namespace -} - -bool indexer::BuildSearchIndexFromDatFile(string const & datFile, bool forceRebuild) +namespace indexer { +bool BuildSearchIndexFromDatFile(string const & datFile, bool forceRebuild) { LOG(LINFO, ("Start building search index. 
Bits = ", search::POINT_CODING_BITS)); @@ -412,3 +480,74 @@ bool indexer::BuildSearchIndexFromDatFile(string const & datFile, bool forceRebu LOG(LINFO, ("End building search index.")); return true; } + +bool AddCompresedSearchIndexSection(string const & fName, bool forceRebuild) +{ + Platform & platform = GetPlatform(); + + FilesContainerR readContainer(platform.GetReader(fName)); + if (readContainer.IsExist(COMPRESSED_SEARCH_INDEX_FILE_TAG) && !forceRebuild) + return true; + + string const indexFile = platform.WritablePathForFile("compressed-search-index.tmp"); + MY_SCOPE_GUARD(indexFileGuard, bind(&FileWriter::DeleteFileX, indexFile)); + + try + { + { + FileWriter indexWriter(indexFile); + BuildCompressedSearchIndex(readContainer, indexWriter); + } + { + FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING); + FileWriter writer = writeContainer.GetWriter(COMPRESSED_SEARCH_INDEX_FILE_TAG); + rw_ops::Reverse(FileReader(indexFile), writer); + } + } + catch (Reader::Exception const & e) + { + LOG(LERROR, ("Error while reading file: ", e.Msg())); + return false; + } + catch (Writer::Exception const & e) + { + LOG(LERROR, ("Error writing index file: ", e.Msg())); + return false; + } + + return true; +} + +void BuildCompressedSearchIndex(FilesContainerR & container, Writer & indexWriter) +{ + Platform & platform = GetPlatform(); + + LOG(LINFO, ("Start building compressed search index for", container.GetFileName())); + my::Timer timer; + + string stringsFilePath = platform.WritablePathForFile("strings.tmp"); + StringsFile stringsFile(stringsFilePath); + MY_SCOPE_GUARD(stringsFileGuard, bind(&FileWriter::DeleteFileX, stringsFilePath)); + + CategoriesHolder categoriesHolder(platform.GetReader(SEARCH_CATEGORIES_FILE_NAME)); + + AddFeatureNameIndexPairs(container, categoriesHolder, stringsFile); + + stringsFile.EndAdding(); + + LOG(LINFO, ("End sorting strings:", timer.ElapsedSeconds())); + + stringsFile.OpenForRead(); + trie::Build::IteratorT, + trie::builder::EmptyEdgeBuilder, ValueList>( + indexWriter, stringsFile.Begin(), stringsFile.End(), trie::builder::EmptyEdgeBuilder()); + + LOG(LINFO, ("End building compressed search index, elapsed seconds:", timer.ElapsedSeconds())); +} + +void BuildCompressedSearchIndex(string const & fName, Writer & indexWriter) +{ + FilesContainerR container(GetPlatform().GetReader(fName)); + BuildCompressedSearchIndex(container, indexWriter); +} +} // namespace indexer diff --git a/indexer/search_index_builder.hpp b/indexer/search_index_builder.hpp index 41ea88b8cb..c661a6ebb4 100644 --- a/indexer/search_index_builder.hpp +++ b/indexer/search_index_builder.hpp @@ -1,8 +1,17 @@ #pragma once + #include "../std/string.hpp" +class FilesContainerR; +class Writer; namespace indexer { - bool BuildSearchIndexFromDatFile(string const & fName, bool forceRebuild = false); -} +bool BuildSearchIndexFromDatFile(string const & fName, bool forceRebuild = false); + +bool AddCompresedSearchIndexSection(string const & fName, bool forceRebuild); + +void BuildCompressedSearchIndex(FilesContainerR & container, Writer & indexWriter); + +void BuildCompressedSearchIndex(string const & fName, Writer & indexWriter); +} // namespace indexer diff --git a/indexer/string_file.cpp b/indexer/string_file.cpp deleted file mode 100644 index 6bfb1ee8c7..0000000000 --- a/indexer/string_file.cpp +++ /dev/null @@ -1,128 +0,0 @@ -#include "string_file.hpp" - -#include "../std/algorithm.hpp" -#include "../std/bind.hpp" - - -template -StringsFile::IdT 
StringsFile::StringT::Write(TWriter & writer) const -{ - IdT const pos = static_cast(writer.Pos()); - CHECK_EQUAL(static_cast(pos), writer.Pos(), ()); - - rw::Write(writer, m_name); - rw::WriteVectorOfPOD(writer, m_val); - - return pos; -} - -template -void StringsFile::StringT::Read(TReader & src) -{ - rw::Read(src, m_name); - rw::ReadVectorOfPOD(src, m_val); -} - -bool StringsFile::StringT::operator < (StringT const & name) const -{ - if (m_name != name.m_name) - return (m_name < name.m_name); - - return (m_val < name.m_val); -} - -bool StringsFile::StringT::operator == (StringT const & name) const -{ - return (m_name == name.m_name && m_val == name.m_val); -} - -void StringsFile::AddString(StringT const & s) -{ -#ifdef OMIM_OS_DESKTOP - size_t const maxSize = 1000000; -#else - size_t const maxSize = 30000; -#endif - - if (m_strings.size() >= maxSize) - Flush(); - - m_strings.push_back(s); -} - -bool StringsFile::IteratorT::IsEnd() const -{ - return m_file.m_queue.empty(); -} - -StringsFile::StringT StringsFile::IteratorT::dereference() const -{ - ASSERT ( IsValid(), () ); - return m_file.m_queue.top().m_string; -} - -void StringsFile::IteratorT::increment() -{ - ASSERT ( IsValid(), () ); - int const index = m_file.m_queue.top().m_index; - - m_file.m_queue.pop(); - - if (!m_file.PushNextValue(index)) - m_end = IsEnd(); -} - -StringsFile::StringsFile(string const & fPath) - : m_workerThread(1 /* maxTasks */) -{ - m_writer.reset(new FileWriter(fPath)); -} - -void StringsFile::Flush() -{ - shared_ptr task( - new SortAndDumpStringsTask(*m_writer, m_offsets, m_strings)); - m_workerThread.Push(task); -} - -bool StringsFile::PushNextValue(size_t i) -{ - // reach the end of the portion file - if (m_offsets[i].first >= m_offsets[i].second) - return false; - - // init source to needed offset - ReaderSource src(*m_reader); - src.Skip(m_offsets[i].first); - - // read string - StringT s; - s.Read(src); - - // update offset - m_offsets[i].first = src.Pos(); - - // push value to queue - m_queue.push(QValue(s, i)); - return true; -} - -void StringsFile::EndAdding() -{ - Flush(); - - m_workerThread.RunUntilIdleAndStop(); - - m_writer->Flush(); -} - -void StringsFile::OpenForRead() -{ - string const fPath = m_writer->GetName(); - m_writer.reset(); - - m_reader.reset(new FileReader(fPath)); - - for (size_t i = 0; i < m_offsets.size(); ++i) - PushNextValue(i); -} diff --git a/indexer/string_file.hpp b/indexer/string_file.hpp index d6f4b2ad7a..2651394cd5 100644 --- a/indexer/string_file.hpp +++ b/indexer/string_file.hpp @@ -14,12 +14,12 @@ #include "../std/functional.hpp" #include "../std/unique_ptr.hpp" - +template class StringsFile { public: - typedef buffer_vector ValueT; - typedef uint32_t IdT; + using ValueT = TValue; + using IdT = uint32_t; class StringT { @@ -28,14 +28,17 @@ public: public: StringT() {} - StringT(strings::UniString const & name, signed char lang, ValueT const & val) - : m_val(val) + StringT(strings::UniString const & name, signed char lang, ValueT const & val) : m_val(val) { m_name.reserve(name.size() + 1); m_name.push_back(static_cast(lang)); m_name.append(name.begin(), name.end()); } + StringT(strings::UniString const & langName, ValueT const & val) : m_name(langName), m_val(val) + { + } + uint32_t GetKeySize() const { return m_name.size(); } uint32_t const * GetKeyData() const { return m_name.data(); } @@ -43,16 +46,42 @@ public: ValueT const & GetValue() const { return m_val; } - template void SerializeValue(TCont & cont) const + bool operator<(StringT const & name) const { - 
cont.assign(m_val.begin(), m_val.end()); + if (m_name != name.m_name) + return m_name < name.m_name; + return m_val < name.m_val; } - bool operator < (StringT const & name) const; - bool operator == (StringT const & name) const; + bool operator==(StringT const & name) const + { + return m_name == name.m_name && m_val == name.m_val; + } - template IdT Write(TWriter & writer) const; - template void Read(TReader & src); + template + IdT Write(TWriter & writer) const + { + IdT const pos = static_cast(writer.Pos()); + CHECK_EQUAL(static_cast(pos), writer.Pos(), ()); + + rw::Write(writer, m_name); + m_val.Write(writer); + + return pos; + } + + template + void Read(TReader & src) + { + rw::Read(src, m_name); + m_val.Read(src); + } + + template + void SerializeValue(TCont & cont) const + { + m_val.Serialize(cont); + } void Swap(StringT & r) { @@ -98,7 +127,7 @@ public: trie.ForEach([&memWriter](const strings::UniString & s, const ValueT & v) { rw::Write(memWriter, s); - rw::WriteVectorOfPOD(memWriter, v); + v.Write(memWriter); }); } @@ -126,8 +155,7 @@ public: inline bool IsValid() const { return (!m_end && !IsEnd()); } public: - IteratorT(StringsFile & file, bool isEnd) - : m_file(file), m_end(isEnd) + IteratorT(StringsFile & file, bool isEnd) : m_file(file), m_end(isEnd) { // Additional check in case for empty sequence. if (!m_end) @@ -173,8 +201,104 @@ private: QValue(StringT const & s, size_t i) : m_string(s), m_index(i) {} - inline bool operator > (QValue const & rhs) const { return !(m_string < rhs.m_string); } + inline bool operator>(QValue const & rhs) const { return !(m_string < rhs.m_string); } }; - priority_queue, greater > m_queue; + priority_queue, greater> m_queue; }; + +template +void StringsFile::AddString(StringT const & s) +{ + size_t const maxSize = 1000000; + + if (m_strings.size() >= maxSize) + Flush(); + + m_strings.push_back(s); +} + +template +bool StringsFile::IteratorT::IsEnd() const +{ + return m_file.m_queue.empty(); +} + +template +typename StringsFile::StringT StringsFile::IteratorT::dereference() const +{ + ASSERT(IsValid(), ()); + return m_file.m_queue.top().m_string; +} + +template +void StringsFile::IteratorT::increment() +{ + ASSERT(IsValid(), ()); + int const index = m_file.m_queue.top().m_index; + + m_file.m_queue.pop(); + + if (!m_file.PushNextValue(index)) + m_end = IsEnd(); +} + +template +StringsFile::StringsFile(string const & fPath) + : m_workerThread(1 /* maxTasks */) +{ + m_writer.reset(new FileWriter(fPath)); +} + +template +void StringsFile::Flush() +{ + shared_ptr task( + new SortAndDumpStringsTask(*m_writer, m_offsets, m_strings)); + m_workerThread.Push(task); +} + +template +bool StringsFile::PushNextValue(size_t i) +{ + // reach the end of the portion file + if (m_offsets[i].first >= m_offsets[i].second) + return false; + + // init source to needed offset + ReaderSource src(*m_reader); + src.Skip(m_offsets[i].first); + + // read string + StringT s; + s.Read(src); + + // update offset + m_offsets[i].first = src.Pos(); + + // push value to queue + m_queue.push(QValue(s, i)); + return true; +} + +template +void StringsFile::EndAdding() +{ + Flush(); + + m_workerThread.RunUntilIdleAndStop(); + + m_writer->Flush(); +} + +template +void StringsFile::OpenForRead() +{ + string const fPath = m_writer->GetName(); + m_writer.reset(); + + m_reader.reset(new FileReader(fPath)); + + for (size_t i = 0; i < m_offsets.size(); ++i) + PushNextValue(i); +} diff --git a/indexer/string_file_values.hpp b/indexer/string_file_values.hpp new file mode 100644 index 
0000000000..a0da9deecc
--- /dev/null
+++ b/indexer/string_file_values.hpp
@@ -0,0 +1,150 @@
+#pragma once
+
+#include "../coding/compressed_bit_vector.hpp"
+#include "../coding/read_write_utils.hpp"
+#include "../coding/write_to_sink.hpp"
+
+#include "../base/assert.hpp"
+
+#include "../std/algorithm.hpp"
+
+/// Following classes are supposed to be used with StringsFile. They
+/// allow to write/read them, compare or serialize to an in-memory
+/// buffer. The reason to use these classes instead of
+/// buffer_vector is that in some specific cases, like
+/// compressed search index construction, they allow to avoid
+/// redundant serialization-deserialization or sorting.
+
+/// A wrapper around feature index.
+struct FeatureIndexValue
+{
+  FeatureIndexValue() : m_value(0) {}
+
+  template <typename TWriter>
+  void Write(TWriter & writer) const
+  {
+    WriteToSink(writer, m_value);
+  }
+
+  template <typename TReader>
+  void Read(TReader & reader)
+  {
+    m_value = ReadPrimitiveFromSource<uint32_t>(reader);
+  }
+
+  template <typename TCont>
+  void Serialize(TCont & cont) const
+  {
+    cont.resize(sizeof(m_value));
+    memcpy(cont.data(), &m_value, sizeof(m_value));
+  }
+
+  bool operator<(FeatureIndexValue const & value) const { return m_value < value.m_value; }
+
+  bool operator==(FeatureIndexValue const & value) const { return m_value == value.m_value; }
+
+  void swap(FeatureIndexValue & value) { ::swap(m_value, value.m_value); }
+
+  uint32_t m_value;
+};
+
+/// A wrapper around serialized SaverT::ValueType.
+struct SerializedFeatureInfoValue
+{
+  using ValueT = buffer_vector<uint8_t, 32>;
+
+  template <typename TWriter>
+  void Write(TWriter & writer) const
+  {
+    rw::WriteVectorOfPOD(writer, m_value);
+  }
+
+  template <typename TReader>
+  void Read(TReader & reader)
+  {
+    rw::ReadVectorOfPOD(reader, m_value);
+  }
+
+  template <typename TCont>
+  void Serialize(TCont & cont) const
+  {
+    cont.assign(m_value.begin(), m_value.end());
+  }
+
+  bool operator<(SerializedFeatureInfoValue const & value) const { return m_value < value.m_value; }
+
+  bool operator==(SerializedFeatureInfoValue const & value) const
+  {
+    return m_value == value.m_value;
+  }
+
+  void swap(SerializedFeatureInfoValue & value) { m_value.swap(value.m_value); }
+
+  ValueT m_value;
+};
+
+/// This template is used to accumulate and serialize a group of
+/// values of the same type.
+template <typename ValueT>
+class ValueList;
+
+/// ValueList<FeatureIndexValue> serializes a group of features
+/// indices as a compressed bit vector, thus, allowing us to save a
+/// disk space.
+template <>
+class ValueList<FeatureIndexValue>
+{
+public:
+  void Append(FeatureIndexValue const & value)
+  {
+    // External-memory trie adds pairs in a sorted
+    // order, thus, values are supposed to be accumulated in a
+    // sorted order, and we can avoid sorting them before construction
+    // of a CompressedBitVector.
+    ASSERT(m_offsets.empty() || m_offsets.back() <= value.m_value, ());
+    if (!m_offsets.empty() && m_offsets.back() == value.m_value)
+      return;
+    m_offsets.push_back(value.m_value);
+  }
+
+  size_t size() const { return m_offsets.empty() ? 0 : 1; }
+
+  template <typename SinkT>
+  void Dump(SinkT & sink) const
+  {
+    vector<uint8_t> buffer;
+    MemWriter<vector<uint8_t>> writer(buffer);
+    BuildCompressedBitVector(writer, m_offsets);
+    sink.Write(buffer.data(), buffer.size());
+  }
+
+private:
+  vector<uint32_t> m_offsets;
+};
+
+/// ValueList<SerializedFeatureInfoValue> sequentially serializes
+/// encoded features infos.
+template <>
+class ValueList<SerializedFeatureInfoValue>
+{
+public:
+  ValueList() : m_size(0) {}
+
+  void Append(SerializedFeatureInfoValue const & value)
+  {
+    m_value.insert(m_value.end(), value.m_value.begin(), value.m_value.end());
+    ++m_size;
+  }
+
+  size_t size() const { return m_size; }
+
+  template <typename SinkT>
+  void Dump(SinkT & sink) const
+  {
+    sink.Write(m_value.data(), m_value.size());
+  }
+
+private:
+  buffer_vector<uint8_t, 32> m_value;
+  uint32_t m_size;
+};
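For reference, the refactored trie::builder::WriteNode and trie::Build no longer take a raw (valuesData, valuesSize) pair; they are parameterized on a value-list type and rely only on three members: Append, size and Dump. Below is a minimal sketch of that implicit interface (not part of the change; ExampleValueList is a made-up name, while CharValueList/Uint32ValueList in trie_test.cpp and the ValueList<> specializations above are the real implementations).

#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical stand-in for the value-list concept used by the trie builder.
class ExampleValueList
{
public:
  // Called by trie::Build for every value attached to the current key.
  void Append(uint32_t value)
  {
    size_t const oldSize = m_data.size();
    m_data.resize(oldSize + sizeof(value));
    std::memcpy(m_data.data() + oldSize, &value, sizeof(value));
    ++m_count;
  }

  // Logical number of values; WriteNode packs it into the node header.
  uint32_t size() const { return m_count; }

  // Streams the serialized values into the node body.
  template <typename TSink>
  void Dump(TSink & sink) const
  {
    sink.Write(m_data.data(), m_data.size());
  }

private:
  std::vector<uint8_t> m_data;
  uint32_t m_count = 0;
};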
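The node header byte emitted by WriteNode packs min(valueCount, 3) into the two high bits and min(childCount, 63) into the six low bits; a saturated field means the exact count follows as a varint. A small self-contained sketch of just this rule (the helper names are invented for illustration):

#include <cstdint>

// The 2 high bits hold min(valueCount, 3); the 6 low bits hold min(childCount, 63).
// When a field is saturated (3 or 63), WriteNode emits the exact count as a varint
// right after the header byte.
inline uint8_t EncodeNodeHeader(uint32_t valueCount, uint32_t childCount)
{
  uint32_t const v = valueCount < 3 ? valueCount : 3;
  uint32_t const c = childCount < 63 ? childCount : 63;
  return static_cast<uint8_t>((v << 6) | c);
}

// True when the count stored inline in the header is exact and no varint follows.
inline bool ValueCountIsInline(uint8_t header) { return (header >> 6) != 3; }
inline bool ChildCountIsInline(uint8_t header) { return (header & 63) != 63; }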
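ValueList<FeatureIndexValue> relies on its inputs arriving in non-decreasing order (the external-memory trie feeds pairs sorted), drops duplicates on the fly, and flushes the whole group as a single CompressedBitVector blob, which is why size() reports at most 1. A hedged usage sketch follows; the include paths and the sample indices are assumptions, while MemWriter and BuildCompressedBitVector are the same calls Dump itself uses.

// Assumed include paths; adjust to the project layout.
#include "../indexer/string_file_values.hpp"
#include "../coding/writer.hpp"  // MemWriter

#include "../std/vector.hpp"

void ExampleDumpFeatureIndices()
{
  ValueList<FeatureIndexValue> list;
  uint32_t const indices[] = {3, 3, 7, 15};  // sorted, with one duplicate
  for (size_t i = 0; i < sizeof(indices) / sizeof(indices[0]); ++i)
  {
    FeatureIndexValue value;
    value.m_value = indices[i];
    list.Append(value);  // the duplicate 3 is silently dropped
  }

  // size() is 1: the whole group is serialized as one value.
  vector<uint8_t> blob;
  MemWriter<vector<uint8_t>> writer(blob);
  list.Dump(writer);  // blob now holds the compressed bit vector for {3, 7, 15}
}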
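On the StringsFile side, AddString accumulates strings in memory, Flush sorts and dumps each portion to the temporary file on a worker thread, and IteratorT restores the global order by merging the sorted portions through a min-priority-queue. A simplified, self-contained sketch of that k-way merge pattern (in-memory portions with cursors stand in for the real file offsets):

#include <functional>
#include <queue>
#include <utility>
#include <vector>

// Illustrative only: the same k-way merge that StringsFile::IteratorT performs
// over its sorted on-disk portions, here applied to sorted in-memory vectors.
template <typename T>
std::vector<T> MergeSortedPortions(std::vector<std::vector<T>> const & portions)
{
  typedef std::pair<T, size_t> Item;  // (value, index of its source portion)
  std::priority_queue<Item, std::vector<Item>, std::greater<Item>> queue;

  std::vector<size_t> cursors(portions.size(), 0);
  for (size_t i = 0; i < portions.size(); ++i)
    if (!portions[i].empty())
      queue.push(Item(portions[i][0], i));

  std::vector<T> result;
  while (!queue.empty())
  {
    Item const top = queue.top();
    queue.pop();
    result.push_back(top.first);

    // Advance the portion the smallest element came from and refill the queue.
    size_t const i = top.second;
    if (++cursors[i] < portions[i].size())
      queue.push(Item(portions[i][cursors[i]], i));
  }
  return result;
}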