Implemented compressed search-index building.

This commit is contained in:
Yuri Gorshenin 2015-03-02 16:50:57 +03:00 committed by Alex Zolotarev
parent dc10c7fffa
commit bbbd269cd3
9 changed files with 567 additions and 232 deletions

View file

@ -54,11 +54,12 @@ struct KeyValuePair
uint32_t GetKeySize() const { return m_key.size(); }
trie::TrieChar const * GetKeyData() const { return m_key.data(); }
uint32_t GetValue() const { return m_value; }
template <class TCont> void SerializeValue(TCont & cont) const
{
cont.resize(4);
memcpy(cont.data(), &m_value, 4);
cont.resize(sizeof(m_value));
memcpy(cont.data(), &m_value, sizeof(m_value));
}
bool operator == (KeyValuePair const & p) const
@ -113,6 +114,49 @@ struct MaxValueCalc
}
};
// Test helper: holds a fixed run of value bytes and serializes it
// verbatim to any sink.
class CharValueList
{
public:
  CharValueList(const string & s) : m_data(s) {}

  // Number of stored value bytes.
  size_t size() const { return m_data.size(); }

  // Writes the raw bytes to |sink| without any framing.
  template <typename SinkT>
  void Dump(SinkT & sink) const
  {
    size_t const n = m_data.size();
    sink.Write(m_data.data(), n);
  }

private:
  string m_data;
};
// Test helper: accumulates 32-bit values into a flat byte buffer and
// serializes them in insertion order.
class Uint32ValueList
{
public:
  Uint32ValueList() : m_size(0) {}

  // Appends the raw bytes of |value| to the buffer.
  void Append(uint32_t value)
  {
    size_t const originalSize = m_value.size();
    m_value.resize(originalSize + sizeof(value));
    memcpy(m_value.data() + originalSize, &value, sizeof(value));
    ++m_size;
  }

  // Number of appended values (not bytes).
  uint32_t size() const { return m_size; }

  // Writes the accumulated bytes to |sink|. Uses data() instead of
  // &m_value[0]: operator[] on an empty vector is undefined behavior,
  // while data() is valid (and the Write of 0 bytes is a no-op).
  template <typename SinkT>
  void Dump(SinkT & sink) const
  {
    sink.Write(m_value.data(), m_value.size());
  }

private:
  vector<uint8_t> m_value;
  uint32_t m_size;
};
} // unnamed namespace
#define ZENC bits::ZigZagEncode
@ -132,8 +176,9 @@ UNIT_TEST(TrieBuilder_WriteNode_Smoke)
"abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij", "i4"),
ChildNodeInfo(true, 5, "a", "5z")
};
trie::builder::WriteNode(sink, 0, 3, "123", 3,
&children[0], &children[0] + ARRAY_SIZE(children));
CharValueList valueList("123");
trie::builder::WriteNode(sink, 0, valueList, &children[0], &children[0] + ARRAY_SIZE(children));
uint8_t const expected [] =
{
BOOST_BINARY(11000101), // Header: [0b11] [0b000101]
@ -202,7 +247,9 @@ UNIT_TEST(TrieBuilder_Build)
vector<uint8_t> serial;
PushBackByteSink<vector<uint8_t> > sink(serial);
trie::Build(sink, v.begin(), v.end(), trie::builder::MaxValueEdgeBuilder<MaxValueCalc>());
trie::Build<PushBackByteSink<vector<uint8_t> >, typename vector<KeyValuePair>::iterator,
trie::builder::MaxValueEdgeBuilder<MaxValueCalc>, Uint32ValueList>(
sink, v.begin(), v.end(), trie::builder::MaxValueEdgeBuilder<MaxValueCalc>());
reverse(serial.begin(), serial.end());
// LOG(LINFO, (serial.size(), vs));

View file

@ -1,7 +1,10 @@
#pragma once
#include "../coding/byte_stream.hpp"
#include "../coding/varint.hpp"
#include "byte_stream.hpp"
#include "varint.hpp"
#include "../base/buffer_vector.hpp"
#include "../std/algorithm.hpp"
// Trie format:
@ -39,27 +42,25 @@ namespace trie
{
namespace builder
{
template <typename SinkT, typename ChildIterT>
void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
void const * const valuesDataSize, uint32_t const valuesSize,
ChildIterT const begChild, ChildIterT const endChild,
bool isRoot = false)
template <typename SinkT, typename ChildIterT, typename ValueListT>
void WriteNode(SinkT & sink, TrieChar baseChar, ValueListT & valueList,
ChildIterT const begChild, ChildIterT const endChild, bool isRoot = false)
{
if (begChild == endChild && !isRoot)
{
// Leaf node.
sink.Write(valuesDataSize, valuesSize);
valueList.Dump(sink);
return;
}
uint32_t const childCount = endChild - begChild;
uint32_t const valueCount = valueList.size();
uint8_t const header = static_cast<uint32_t>((min(valueCount, 3U) << 6) + min(childCount, 63U));
sink.Write(&header, 1);
if (valueCount >= 3)
WriteVarUint(sink, valueCount);
if (childCount >= 63)
WriteVarUint(sink, childCount);
sink.Write(valuesDataSize, valuesSize);
valueList.Dump(sink);
for (ChildIterT it = begChild; it != endChild; /*++it*/)
{
uint8_t header = (it->IsLeaf() ? 128 : 0);
@ -70,9 +71,9 @@ void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
uint32_t const diff0 = bits::ZigZagEncode(int32_t(edge[0] - baseChar));
if (edgeSize == 1 && (diff0 & ~63U) == 0)
{
header |= 64;
header |= diff0;
WriteToSink(sink, header);
header |= 64;
header |= diff0;
WriteToSink(sink, header);
}
else
{
@ -102,20 +103,6 @@ void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
}
}
template <typename SinkT, typename ChildIterT>
void WriteNodeReverse(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
void const * const valuesDataSize, uint32_t const valuesSize,
ChildIterT const begChild, ChildIterT const endChild,
bool isRoot = false)
{
typedef buffer_vector<uint8_t, 64> OutStorageType;
OutStorageType out;
PushBackByteSink<OutStorageType> outSink(out);
WriteNode(outSink, baseChar, valueCount, valuesDataSize, valuesSize, begChild, endChild, isRoot);
reverse(out.begin(), out.end());
sink.Write(out.data(), out.size());
}
struct ChildInfo
{
bool m_isLeaf;
@ -136,21 +123,35 @@ struct ChildInfo
uint32_t GetEdgeValueSize() const { return m_edgeValue.size(); }
};
template <class EdgeBuilderT>
template <class EdgeBuilderT, class ValueListT>
struct NodeInfo
{
uint64_t m_begPos;
TrieChar m_char;
vector<ChildInfo> m_children;
buffer_vector<uint8_t, 32> m_values;
uint32_t m_valueCount;
ValueListT m_valueList;
EdgeBuilderT m_edgeBuilder;
NodeInfo() : m_valueCount(0) {}
NodeInfo() : m_begPos(0), m_char(0) {}
NodeInfo(uint64_t pos, TrieChar trieChar, EdgeBuilderT const & edgeBuilder)
: m_begPos(pos), m_char(trieChar), m_valueCount(0), m_edgeBuilder(edgeBuilder) {}
: m_begPos(pos), m_char(trieChar), m_edgeBuilder(edgeBuilder)
{
}
};
// Writes |node| to |sink| in reversed byte order: the node is first
// serialized with WriteNode into a temporary in-memory buffer (with
// its children iterated in reverse via rbegin/rend), then the buffer
// is byte-reversed and flushed to |sink|. This matches the trie's
// reversed on-disk layout.
template <typename SinkT, typename EdgeBuilderT, typename ValueListT>
void WriteNodeReverse(SinkT & sink, TrieChar baseChar, NodeInfo<EdgeBuilderT, ValueListT> & node,
bool isRoot = false)
{
typedef buffer_vector<uint8_t, 64> OutStorageType;
OutStorageType out;
PushBackByteSink<OutStorageType> outSink(out);
WriteNode(outSink, baseChar, node.m_valueList, node.m_children.rbegin(), node.m_children.rend(),
isRoot);
reverse(out.begin(), out.end());
sink.Write(out.data(), out.size());
}
template <typename SinkT, class NodesT>
void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop)
{
@ -161,9 +162,8 @@ void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop)
NodeInfoType & node = nodes.back();
NodeInfoType & prevNode = nodes[nodes.size() - 2];
if (node.m_valueCount == 0 && node.m_children.size() <= 1)
if (node.m_valueList.size() == 0 && node.m_children.size() <= 1)
{
ASSERT(node.m_values.empty(), ());
ASSERT_EQUAL(node.m_children.size(), 1, ());
ChildInfo & child = node.m_children[0];
prevNode.m_children.push_back(ChildInfo(child.m_isLeaf, child.m_size, node.m_char));
@ -171,9 +171,7 @@ void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop)
}
else
{
WriteNodeReverse(sink, node.m_char, node.m_valueCount,
node.m_values.data(), node.m_values.size(),
node.m_children.rbegin(), node.m_children.rend());
WriteNodeReverse(sink, node.m_char, node);
prevNode.m_children.push_back(ChildInfo(node.m_children.empty(),
static_cast<uint32_t>(sink.Pos() - node.m_begPos),
node.m_char));
@ -231,12 +229,12 @@ struct MaxValueEdgeBuilder
} // namespace builder
template <typename SinkT, typename IterT, class EdgeBuilderT>
template <typename SinkT, typename IterT, typename EdgeBuilderT, typename ValueListT>
void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const & edgeBuilder)
{
typedef buffer_vector<TrieChar, 32> TrieString;
typedef buffer_vector<uint8_t, 32> TrieValue;
typedef builder::NodeInfo<EdgeBuilderT> NodeInfoT;
typedef builder::NodeInfo<EdgeBuilderT, ValueListT> NodeInfoT;
buffer_vector<NodeInfoT, 32> nodes;
nodes.push_back(NodeInfoT(sink.Pos(), DEFAULT_CHAR, edgeBuilder));
@ -264,12 +262,10 @@ void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const &
uint64_t const pos = sink.Pos();
for (size_t i = nCommon; i < key.size(); ++i)
nodes.push_back(builder::NodeInfo<EdgeBuilderT>(pos, key[i], edgeBuilder));
nodes.push_back(NodeInfoT(pos, key[i], edgeBuilder));
nodes.back().m_valueList.Append(e.GetValue());
e.SerializeValue(value);
nodes.back().m_values.insert(nodes.back().m_values.end(),
value.begin(), value.end());
nodes.back().m_valueCount += 1;
nodes.back().m_edgeBuilder.AddValue(value.data(), value.size());
prevKey.swap(key);
@ -280,10 +276,7 @@ void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const &
builder::PopNodes(sink, nodes, nodes.size() - 1);
// Write the root.
WriteNodeReverse(sink, DEFAULT_CHAR, nodes.back().m_valueCount,
nodes.back().m_values.data(), nodes.back().m_values.size(),
nodes.back().m_children.rbegin(), nodes.back().m_children.rend(),
true);
WriteNodeReverse(sink, DEFAULT_CHAR /* baseChar */, nodes.back(), true /* isRoot */);
}
} // namespace trie

View file

@ -15,6 +15,7 @@
#define METADATA_FILE_TAG "meta"
#define METADATA_INDEX_FILE_TAG "metaidx"
#define FEATURES_OFFSETS_TABLE_FILE_TAG "offsets"
#define COMPRESSED_SEARCH_INDEX_FILE_TAG "csdx"
#define ROUTING_MATRIX_FILE_TAG "mercedes"
#define ROUTING_EDGEDATA_FILE_TAG "daewoo"

View file

@ -44,7 +44,6 @@ SOURCES += \
search_delimiters.cpp \
search_index_builder.cpp \
search_string_utils.cpp \
string_file.cpp \
types_mapping.cpp \
HEADERS += \
@ -94,6 +93,7 @@ HEADERS += \
search_string_utils.hpp \
search_trie.hpp \
string_file.hpp \
string_file_values.hpp \
tesselator_decl.hpp \
tree_structure.hpp \
types_mapping.hpp \

View file

@ -5,6 +5,7 @@
#include "search_trie.hpp"
#include "search_string_utils.hpp"
#include "string_file.hpp"
#include "string_file_values.hpp"
#include "classificator.hpp"
#include "feature_visibility.hpp"
#include "categories_holder.hpp"
@ -20,15 +21,19 @@
#include "../coding/writer.hpp"
#include "../coding/reader_writer_ops.hpp"
#include "../base/assert.hpp"
#include "../base/timer.hpp"
#include "../base/scope_guard.hpp"
#include "../base/string_utils.hpp"
#include "../base/logging.hpp"
#include "../base/stl_add.hpp"
#include "../std/algorithm.hpp"
#include "../std/vector.hpp"
#include "../std/unordered_map.hpp"
#include "../std/fstream.hpp"
#include "../std/initializer_list.hpp"
#include "../std/limits.hpp"
#include "../std/unordered_map.hpp"
#include "../std/vector.hpp"
#define SYNONYMS_FILE "synonyms.txt"
@ -84,20 +89,21 @@ public:
}
};
template<typename StringsFileT>
struct FeatureNameInserter
{
SynonymsHolder * m_synonyms;
StringsFile & m_names;
StringsFile::ValueT m_val;
StringsFileT & m_names;
typename StringsFileT::ValueT m_val;
FeatureNameInserter(SynonymsHolder * synonyms, StringsFile & names)
FeatureNameInserter(SynonymsHolder * synonyms, StringsFileT & names)
: m_synonyms(synonyms), m_names(names)
{
}
void AddToken(signed char lang, strings::UniString const & s) const
{
m_names.AddString(StringsFile::StringT(s, lang, m_val));
m_names.AddString(typename StringsFileT::StringT(s, lang, m_val));
}
private:
@ -141,22 +147,19 @@ public:
}
};
class FeatureInserter
template <typename ValueT>
struct ValueBuilder;
template <>
struct ValueBuilder<SerializedFeatureInfoValue>
{
SynonymsHolder * m_synonyms;
StringsFile & m_names;
CategoriesHolder const & m_categories;
typedef StringsFile::ValueT ValueT;
typedef search::trie::ValueReader SaverT;
SaverT m_valueSaver;
pair<int, int> m_scales;
ValueBuilder(serial::CodingParams const & cp) : m_valueSaver(cp) {}
void MakeValue(FeatureType const & f, feature::TypesHolder const & types,
uint64_t pos, ValueT & value) const
void MakeValue(FeatureType const & f, feature::TypesHolder const & types, uint64_t pos,
SerializedFeatureInfoValue & value) const
{
SaverT::ValueType v;
v.m_featureId = static_cast<uint32_t>(pos);
@ -166,9 +169,36 @@ class FeatureInserter
v.m_rank = feature::GetSearchRank(types, v.m_pt, f.GetPopulation());
// write to buffer
PushBackByteSink<ValueT> sink(value);
PushBackByteSink<SerializedFeatureInfoValue::ValueT> sink(value.m_value);
m_valueSaver.Save(sink, v);
}
};
template <>
struct ValueBuilder<FeatureIndexValue>
{
void MakeValue(FeatureType const & /* f */, feature::TypesHolder const & /* types */,
uint64_t pos, FeatureIndexValue & value) const
{
ASSERT_LESS(pos, numeric_limits<uint32_t>::max(), ());
value.m_value = static_cast<uint32_t>(pos);
}
};
template <typename StringsFileT>
class FeatureInserter
{
SynonymsHolder * m_synonyms;
StringsFileT & m_names;
CategoriesHolder const & m_categories;
typedef typename StringsFileT::ValueT ValueT;
typedef search::trie::ValueReader SaverT;
pair<int, int> m_scales;
ValueBuilder<ValueT> const & m_valueBuilder;
/// There are 3 different ways of search index skipping:
/// - skip features in any case (m_skipFeatures)
@ -276,17 +306,16 @@ class FeatureInserter
};
public:
FeatureInserter(SynonymsHolder * synonyms, StringsFile & names,
CategoriesHolder const & catHolder,
serial::CodingParams const & cp,
pair<int, int> const & scales)
FeatureInserter(SynonymsHolder * synonyms, StringsFileT & names,
CategoriesHolder const & catHolder, pair<int, int> const & scales,
ValueBuilder<ValueT> const & valueBuilder)
: m_synonyms(synonyms), m_names(names),
m_categories(catHolder), m_valueSaver(cp), m_scales(scales)
m_categories(catHolder), m_scales(scales), m_valueBuilder(valueBuilder)
{
}
void operator() (FeatureType const & f, uint64_t pos) const
{
{
feature::TypesHolder types(f);
static SkipIndexing skipIndex;
@ -297,8 +326,9 @@ public:
// Init inserter with serialized value.
// Insert synonyms only for countries and states (maybe will add cities in future).
FeatureNameInserter inserter(skipIndex.IsCountryOrState(types) ? m_synonyms : 0, m_names);
MakeValue(f, types, pos, inserter.m_val);
FeatureNameInserter<StringsFileT> inserter(skipIndex.IsCountryOrState(types) ? m_synonyms : 0,
m_names);
m_valueBuilder.MakeValue(f, types, pos, inserter.m_val);
// Skip types for features without names.
if (!f.ForEachNameRef(inserter))
@ -332,6 +362,41 @@ public:
}
};
template <typename FeatureInserterT>
struct FeatureInserterAdapter
{
FeatureInserterAdapter(FeatureInserterT & inserter) : m_inserter(inserter), m_index(0) {}
void operator()(FeatureType const & f, uint64_t pos)
{
m_inserter(f, m_index++);
}
FeatureInserterT & m_inserter;
uint64_t m_index;
};
void AddFeatureNameIndexPairs(FilesContainerR const & container,
CategoriesHolder & categoriesHolder,
StringsFile<FeatureIndexValue> & stringsFile)
{
feature::DataHeader header;
header.Load(container.GetReader(HEADER_FILE_TAG));
FeaturesVector features(container, header);
ValueBuilder<FeatureIndexValue> valueBuilder;
unique_ptr<SynonymsHolder> synonyms;
if (header.GetType() == feature::DataHeader::world)
synonyms.reset(new SynonymsHolder(GetPlatform().WritablePathForFile(SYNONYMS_FILE)));
FeatureInserter<StringsFile<FeatureIndexValue>> inserter(
synonyms.get(), stringsFile, categoriesHolder, header.GetScaleRange(), valueBuilder);
FeatureInserterAdapter<FeatureInserter<StringsFile<FeatureIndexValue>>> adapter(inserter);
features.ForEachOffset(adapter);
}
void BuildSearchIndex(FilesContainerR const & cont, CategoriesHolder const & catHolder,
Writer & writer, string const & tmpFilePath)
{
@ -341,30 +406,33 @@ void BuildSearchIndex(FilesContainerR const & cont, CategoriesHolder const & cat
FeaturesVector featuresV(cont, header);
serial::CodingParams cp(search::GetCPForTrie(header.GetDefCodingParams()));
ValueBuilder<SerializedFeatureInfoValue> valueBuilder(cp);
unique_ptr<SynonymsHolder> synonyms;
if (header.GetType() == feature::DataHeader::world)
synonyms.reset(new SynonymsHolder(GetPlatform().WritablePathForFile(SYNONYMS_FILE)));
StringsFile names(tmpFilePath);
StringsFile<SerializedFeatureInfoValue> names(tmpFilePath);
featuresV.ForEachOffset(FeatureInserter(synonyms.get(), names,
catHolder, cp, header.GetScaleRange()));
featuresV.ForEachOffset(FeatureInserter<StringsFile<SerializedFeatureInfoValue>>(
synonyms.get(), names, catHolder, header.GetScaleRange(), valueBuilder));
names.EndAdding();
names.OpenForRead();
trie::Build(writer, names.Begin(), names.End(), trie::builder::EmptyEdgeBuilder());
trie::Build<Writer, typename StringsFile<SerializedFeatureInfoValue>::IteratorT,
trie::builder::EmptyEdgeBuilder, ValueList<SerializedFeatureInfoValue>>(
writer, names.Begin(), names.End(), trie::builder::EmptyEdgeBuilder());
// at this point all readers of StringsFile should be dead
}
FileWriter::DeleteFileX(tmpFilePath);
}
} // namespace
}
bool indexer::BuildSearchIndexFromDatFile(string const & datFile, bool forceRebuild)
namespace indexer {
bool BuildSearchIndexFromDatFile(string const & datFile, bool forceRebuild)
{
LOG(LINFO, ("Start building search index. Bits = ", search::POINT_CODING_BITS));
@ -412,3 +480,74 @@ bool indexer::BuildSearchIndexFromDatFile(string const & datFile, bool forceRebu
LOG(LINFO, ("End building search index."));
return true;
}
bool AddCompresedSearchIndexSection(string const & fName, bool forceRebuild)
{
Platform & platform = GetPlatform();
FilesContainerR readContainer(platform.GetReader(fName));
if (readContainer.IsExist(COMPRESSED_SEARCH_INDEX_FILE_TAG) && !forceRebuild)
return true;
string const indexFile = platform.WritablePathForFile("compressed-search-index.tmp");
MY_SCOPE_GUARD(indexFileGuard, bind(&FileWriter::DeleteFileX, indexFile));
try
{
{
FileWriter indexWriter(indexFile);
BuildCompressedSearchIndex(readContainer, indexWriter);
}
{
FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING);
FileWriter writer = writeContainer.GetWriter(COMPRESSED_SEARCH_INDEX_FILE_TAG);
rw_ops::Reverse(FileReader(indexFile), writer);
}
}
catch (Reader::Exception const & e)
{
LOG(LERROR, ("Error while reading file: ", e.Msg()));
return false;
}
catch (Writer::Exception const & e)
{
LOG(LERROR, ("Error writing index file: ", e.Msg()));
return false;
}
return true;
}
void BuildCompressedSearchIndex(FilesContainerR & container, Writer & indexWriter)
{
Platform & platform = GetPlatform();
LOG(LINFO, ("Start building compressed search index for", container.GetFileName()));
my::Timer timer;
string stringsFilePath = platform.WritablePathForFile("strings.tmp");
StringsFile<FeatureIndexValue> stringsFile(stringsFilePath);
MY_SCOPE_GUARD(stringsFileGuard, bind(&FileWriter::DeleteFileX, stringsFilePath));
CategoriesHolder categoriesHolder(platform.GetReader(SEARCH_CATEGORIES_FILE_NAME));
AddFeatureNameIndexPairs(container, categoriesHolder, stringsFile);
stringsFile.EndAdding();
LOG(LINFO, ("End sorting strings:", timer.ElapsedSeconds()));
stringsFile.OpenForRead();
trie::Build<Writer, typename StringsFile<FeatureIndexValue>::IteratorT,
trie::builder::EmptyEdgeBuilder, ValueList<FeatureIndexValue>>(
indexWriter, stringsFile.Begin(), stringsFile.End(), trie::builder::EmptyEdgeBuilder());
LOG(LINFO, ("End building compressed search index, elapsed seconds:", timer.ElapsedSeconds()));
}
void BuildCompressedSearchIndex(string const & fName, Writer & indexWriter)
{
FilesContainerR container(GetPlatform().GetReader(fName));
BuildCompressedSearchIndex(container, indexWriter);
}
} // namespace indexer

View file

@ -1,8 +1,17 @@
#pragma once
#include "../std/string.hpp"
class FilesContainerR;
class Writer;
namespace indexer
{
bool BuildSearchIndexFromDatFile(string const & fName, bool forceRebuild = false);
}
bool BuildSearchIndexFromDatFile(string const & fName, bool forceRebuild = false);
bool AddCompresedSearchIndexSection(string const & fName, bool forceRebuild);
void BuildCompressedSearchIndex(FilesContainerR & container, Writer & indexWriter);
void BuildCompressedSearchIndex(string const & fName, Writer & indexWriter);
} // namespace indexer

View file

@ -1,128 +0,0 @@
#include "string_file.hpp"
#include "../std/algorithm.hpp"
#include "../std/bind.hpp"
template <class TWriter>
StringsFile::IdT StringsFile::StringT::Write(TWriter & writer) const
{
IdT const pos = static_cast<IdT>(writer.Pos());
CHECK_EQUAL(static_cast<uint64_t>(pos), writer.Pos(), ());
rw::Write(writer, m_name);
rw::WriteVectorOfPOD(writer, m_val);
return pos;
}
template <class TReader>
void StringsFile::StringT::Read(TReader & src)
{
rw::Read(src, m_name);
rw::ReadVectorOfPOD(src, m_val);
}
bool StringsFile::StringT::operator < (StringT const & name) const
{
if (m_name != name.m_name)
return (m_name < name.m_name);
return (m_val < name.m_val);
}
bool StringsFile::StringT::operator == (StringT const & name) const
{
return (m_name == name.m_name && m_val == name.m_val);
}
void StringsFile::AddString(StringT const & s)
{
#ifdef OMIM_OS_DESKTOP
size_t const maxSize = 1000000;
#else
size_t const maxSize = 30000;
#endif
if (m_strings.size() >= maxSize)
Flush();
m_strings.push_back(s);
}
bool StringsFile::IteratorT::IsEnd() const
{
return m_file.m_queue.empty();
}
StringsFile::StringT StringsFile::IteratorT::dereference() const
{
ASSERT ( IsValid(), () );
return m_file.m_queue.top().m_string;
}
void StringsFile::IteratorT::increment()
{
ASSERT ( IsValid(), () );
int const index = m_file.m_queue.top().m_index;
m_file.m_queue.pop();
if (!m_file.PushNextValue(index))
m_end = IsEnd();
}
StringsFile::StringsFile(string const & fPath)
: m_workerThread(1 /* maxTasks */)
{
m_writer.reset(new FileWriter(fPath));
}
void StringsFile::Flush()
{
shared_ptr<SortAndDumpStringsTask> task(
new SortAndDumpStringsTask(*m_writer, m_offsets, m_strings));
m_workerThread.Push(task);
}
bool StringsFile::PushNextValue(size_t i)
{
// reach the end of the portion file
if (m_offsets[i].first >= m_offsets[i].second)
return false;
// init source to needed offset
ReaderSource<FileReader> src(*m_reader);
src.Skip(m_offsets[i].first);
// read string
StringT s;
s.Read(src);
// update offset
m_offsets[i].first = src.Pos();
// push value to queue
m_queue.push(QValue(s, i));
return true;
}
void StringsFile::EndAdding()
{
Flush();
m_workerThread.RunUntilIdleAndStop();
m_writer->Flush();
}
void StringsFile::OpenForRead()
{
string const fPath = m_writer->GetName();
m_writer.reset();
m_reader.reset(new FileReader(fPath));
for (size_t i = 0; i < m_offsets.size(); ++i)
PushNextValue(i);
}

View file

@ -14,12 +14,12 @@
#include "../std/functional.hpp"
#include "../std/unique_ptr.hpp"
template <typename TValue>
class StringsFile
{
public:
typedef buffer_vector<uint8_t, 32> ValueT;
typedef uint32_t IdT;
using ValueT = TValue;
using IdT = uint32_t;
class StringT
{
@ -28,14 +28,17 @@ public:
public:
StringT() {}
StringT(strings::UniString const & name, signed char lang, ValueT const & val)
: m_val(val)
StringT(strings::UniString const & name, signed char lang, ValueT const & val) : m_val(val)
{
m_name.reserve(name.size() + 1);
m_name.push_back(static_cast<uint8_t>(lang));
m_name.append(name.begin(), name.end());
}
StringT(strings::UniString const & langName, ValueT const & val) : m_name(langName), m_val(val)
{
}
uint32_t GetKeySize() const { return m_name.size(); }
uint32_t const * GetKeyData() const { return m_name.data(); }
@ -43,16 +46,42 @@ public:
ValueT const & GetValue() const { return m_val; }
template <class TCont> void SerializeValue(TCont & cont) const
bool operator<(StringT const & name) const
{
cont.assign(m_val.begin(), m_val.end());
if (m_name != name.m_name)
return m_name < name.m_name;
return m_val < name.m_val;
}
bool operator < (StringT const & name) const;
bool operator == (StringT const & name) const;
bool operator==(StringT const & name) const
{
return m_name == name.m_name && m_val == name.m_val;
}
template <class TWriter> IdT Write(TWriter & writer) const;
template <class TReader> void Read(TReader & src);
template <class TWriter>
IdT Write(TWriter & writer) const
{
IdT const pos = static_cast<IdT>(writer.Pos());
CHECK_EQUAL(static_cast<uint64_t>(pos), writer.Pos(), ());
rw::Write(writer, m_name);
m_val.Write(writer);
return pos;
}
template <class TReader>
void Read(TReader & src)
{
rw::Read(src, m_name);
m_val.Read(src);
}
template <class TCont>
void SerializeValue(TCont & cont) const
{
m_val.Serialize(cont);
}
void Swap(StringT & r)
{
@ -98,7 +127,7 @@ public:
trie.ForEach([&memWriter](const strings::UniString & s, const ValueT & v)
{
rw::Write(memWriter, s);
rw::WriteVectorOfPOD(memWriter, v);
v.Write(memWriter);
});
}
@ -126,8 +155,7 @@ public:
inline bool IsValid() const { return (!m_end && !IsEnd()); }
public:
IteratorT(StringsFile & file, bool isEnd)
: m_file(file), m_end(isEnd)
IteratorT(StringsFile & file, bool isEnd) : m_file(file), m_end(isEnd)
{
// Additional check in case for empty sequence.
if (!m_end)
@ -173,8 +201,104 @@ private:
QValue(StringT const & s, size_t i) : m_string(s), m_index(i) {}
inline bool operator > (QValue const & rhs) const { return !(m_string < rhs.m_string); }
inline bool operator>(QValue const & rhs) const { return !(m_string < rhs.m_string); }
};
priority_queue<QValue, vector<QValue>, greater<QValue> > m_queue;
priority_queue<QValue, vector<QValue>, greater<QValue>> m_queue;
};
// Buffers one string for writing. When the in-memory buffer reaches
// maxSize entries it is handed to a background task that sorts it and
// appends it as a portion to the temporary file (see Flush).
template <typename ValueT>
void StringsFile<ValueT>::AddString(StringT const & s)
{
// Number of strings kept in memory before spilling a sorted portion
// to disk.
size_t const maxSize = 1000000;
if (m_strings.size() >= maxSize)
Flush();
m_strings.push_back(s);
}
// True when the merge queue over all sorted on-disk portions is
// exhausted, i.e. every string has been consumed.
template <typename ValueT>
bool StringsFile<ValueT>::IteratorT::IsEnd() const
{
return m_file.m_queue.empty();
}
// Returns the current minimum string of the k-way merge (the top of
// the priority queue). Returned by value: the queue owns the storage.
template <typename ValueT>
typename StringsFile<ValueT>::StringT StringsFile<ValueT>::IteratorT::dereference() const
{
ASSERT(IsValid(), ());
return m_file.m_queue.top().m_string;
}
// Advances the k-way merge: pops the current minimum and refills the
// queue with the next string from the same on-disk portion. When that
// portion is exhausted, re-checks whether the whole merge has ended.
template <typename ValueT>
void StringsFile<ValueT>::IteratorT::increment()
{
ASSERT(IsValid(), ());
int const index = m_file.m_queue.top().m_index;
m_file.m_queue.pop();
if (!m_file.PushNextValue(index))
m_end = IsEnd();
}
// Opens |fPath| for writing sorted string portions. A single-task
// worker thread runs the background sort-and-dump jobs one at a time.
template <typename ValueT>
StringsFile<ValueT>::StringsFile(string const & fPath)
: m_workerThread(1 /* maxTasks */)
{
m_writer.reset(new FileWriter(fPath));
}
// Schedules the current in-memory batch of strings to be sorted and
// appended to the file on the worker thread; the task also records the
// portion's offsets in m_offsets for the later merge phase.
template <typename ValueT>
void StringsFile<ValueT>::Flush()
{
shared_ptr<SortAndDumpStringsTask> task(
new SortAndDumpStringsTask(*m_writer, m_offsets, m_strings));
m_workerThread.Push(task);
}
// Reads the next string from portion |i| of the temporary file and
// pushes it onto the merge queue. Returns false when the portion is
// exhausted (its read offset has reached its end offset).
template <typename ValueT>
bool StringsFile<ValueT>::PushNextValue(size_t i)
{
// reach the end of the portion file
if (m_offsets[i].first >= m_offsets[i].second)
return false;
// init source to needed offset
ReaderSource<FileReader> src(*m_reader);
src.Skip(m_offsets[i].first);
// read string
StringT s;
s.Read(src);
// update offset
m_offsets[i].first = src.Pos();
// push value to queue
m_queue.push(QValue(s, i));
return true;
}
// Flushes the last in-memory batch, waits for all background sort
// tasks to finish, then syncs the file. Must be called before
// OpenForRead.
template <typename ValueT>
void StringsFile<ValueT>::EndAdding()
{
Flush();
m_workerThread.RunUntilIdleAndStop();
m_writer->Flush();
}
// Switches from the writing phase to the reading phase: closes the
// writer, reopens the same file for reading and primes the k-way merge
// queue with the first string of every sorted portion.
template <typename ValueT>
void StringsFile<ValueT>::OpenForRead()
{
string const fPath = m_writer->GetName();
m_writer.reset();
m_reader.reset(new FileReader(fPath));
for (size_t i = 0; i < m_offsets.size(); ++i)
PushNextValue(i);
}

View file

@ -0,0 +1,150 @@
#pragma once
#include "../coding/compressed_bit_vector.hpp"
#include "../coding/read_write_utils.hpp"
#include "../coding/write_to_sink.hpp"
#include "../base/assert.hpp"
#include "../std/algorithm.hpp"
/// The following classes are supposed to be used with StringsFile. They
/// support writing/reading, comparison, and serialization to an
/// in-memory buffer. The reason to use these classes instead of
/// buffer_vector<uint8_t, N> is that in some specific cases, like
/// compressed search index construction, they make it possible to avoid
/// redundant serialization-deserialization or sorting.
/// A wrapper around a feature index.
struct FeatureIndexValue
{
  FeatureIndexValue() : m_value(0) {}

  /// Writes the index to |writer| as a fixed-size uint32.
  template <typename TWriter>
  void Write(TWriter & writer) const
  {
    WriteToSink(writer, m_value);
  }

  /// Reads the index back from |reader|.
  template <typename TReader>
  void Read(TReader & reader)
  {
    m_value = ReadPrimitiveFromSource<uint32_t>(reader);
  }

  /// Copies the raw bytes of the index into |cont|.
  template <typename TCont>
  void Serialize(TCont & cont) const
  {
    size_t const numBytes = sizeof(m_value);
    cont.resize(numBytes);
    memcpy(cont.data(), &m_value, numBytes);
  }

  bool operator<(FeatureIndexValue const & rhs) const { return m_value < rhs.m_value; }

  bool operator==(FeatureIndexValue const & rhs) const { return m_value == rhs.m_value; }

  void swap(FeatureIndexValue & rhs) { ::swap(m_value, rhs.m_value); }

  uint32_t m_value;
};
/// A wrapper around serialized SaverT::ValueType.
struct SerializedFeatureInfoValue
{
  using ValueT = buffer_vector<uint8_t, 32>;

  /// Writes the serialized blob to |writer| as a POD vector.
  template <typename TWriter>
  void Write(TWriter & writer) const
  {
    rw::WriteVectorOfPOD(writer, m_value);
  }

  /// Reads the serialized blob back from |reader|.
  template <typename TReader>
  void Read(TReader & reader)
  {
    rw::ReadVectorOfPOD(reader, m_value);
  }

  /// Copies the blob's bytes into |cont|.
  template <typename TCont>
  void Serialize(TCont & cont) const
  {
    cont.assign(m_value.begin(), m_value.end());
  }

  bool operator<(SerializedFeatureInfoValue const & rhs) const { return m_value < rhs.m_value; }

  bool operator==(SerializedFeatureInfoValue const & rhs) const { return m_value == rhs.m_value; }

  void swap(SerializedFeatureInfoValue & rhs) { m_value.swap(rhs.m_value); }

  ValueT m_value;
};
/// This template is used to accumulate and serialize a group of
/// values of the same type.
template <typename Value>
class ValueList;
/// ValueList<FeatureIndexValue> serializes a group of feature
/// indices as a compressed bit vector, thus saving disk space.
template <>
class ValueList<FeatureIndexValue>
{
public:
// Appends one feature index. Indices must arrive in non-decreasing
// order; equal consecutive indices are deduplicated.
void Append(FeatureIndexValue const & value)
{
// External-memory trie adds <string, value> pairs in a sorted
// order, thus, values are supposed to be accumulated in a
// sorted order, and we can avoid sorting them before construction
// of a CompressedBitVector.
ASSERT(m_offsets.empty() || m_offsets.back() <= value.m_value, ());
if (!m_offsets.empty() && m_offsets.back() == value.m_value)
return;
m_offsets.push_back(value.m_value);
}
// Intentionally returns 0 or 1, not the number of appended indices:
// the whole group is dumped as a single compressed bit vector blob.
// NOTE(review): presumably the trie node header's value count must
// match what the reader expects here - confirm against the reader.
size_t size() const { return m_offsets.empty() ? 0 : 1; }
template <typename SinkT>
void Dump(SinkT & sink) const
{
// Serialize all accumulated indices as one compressed bit vector
// built in memory, then write it to |sink| as a single blob.
vector<uint8_t> buffer;
MemWriter<vector<uint8_t>> writer(buffer);
BuildCompressedBitVector(writer, m_offsets);
sink.Write(buffer.data(), buffer.size());
}
private:
vector<uint32_t> m_offsets;
};
/// ValueList<SerializedFeatureInfoValue> sequentially serializes
/// encoded feature infos.
template <>
class ValueList<SerializedFeatureInfoValue>
{
public:
  ValueList() : m_size(0) {}

  /// Appends the serialized bytes of one more feature info.
  void Append(SerializedFeatureInfoValue const & value)
  {
    ++m_size;
    m_value.insert(m_value.end(), value.m_value.begin(), value.m_value.end());
  }

  /// Number of appended values (not bytes).
  size_t size() const { return m_size; }

  /// Writes all accumulated bytes to |sink| as one contiguous run.
  template <typename SinkT>
  void Dump(SinkT & sink) const
  {
    sink.Write(m_value.data(), m_value.size());
  }

private:
  buffer_vector<uint8_t, 32> m_value;
  uint32_t m_size;
};