forked from organicmaps/organicmaps
[coding] Compressed text storage.
This commit is contained in:
parent
50d77ee9b1
commit
de30819018
8 changed files with 443 additions and 1 deletions
|
@ -72,6 +72,7 @@ set(
|
|||
streams_common.hpp
|
||||
streams_sink.hpp
|
||||
succinct_mapper.hpp
|
||||
text_storage.hpp
|
||||
traffic.cpp
|
||||
traffic.hpp
|
||||
transliteration.cpp
|
||||
|
|
|
@ -85,7 +85,8 @@ public:
|
|||
mtf.Transform(b);
|
||||
}
|
||||
|
||||
CHECK_LESS(start, n, ());
|
||||
if (n != 0)
|
||||
CHECK_LESS(start, n, ());
|
||||
|
||||
revBuffer.resize(n);
|
||||
base::RevBWT(n, static_cast<size_t>(start), bwtBuffer.data(), revBuffer.data());
|
||||
|
|
|
@ -82,6 +82,7 @@ HEADERS += \
|
|||
streams_common.hpp \
|
||||
streams_sink.hpp \
|
||||
succinct_mapper.hpp \
|
||||
text_storage.hpp \
|
||||
traffic.hpp \
|
||||
transliteration.hpp \
|
||||
uri.hpp \
|
||||
|
|
|
@ -32,6 +32,7 @@ set(
|
|||
reader_writer_ops_test.cpp
|
||||
simple_dense_coding_test.cpp
|
||||
succinct_mapper_test.cpp
|
||||
text_storage_tests.cpp
|
||||
traffic_test.cpp
|
||||
uri_test.cpp
|
||||
url_encode_test.cpp
|
||||
|
|
|
@ -34,6 +34,26 @@ string EncodeDecode(BWTCoder::Params const & params, string const & s)
|
|||
return result;
|
||||
}
|
||||
|
||||
string EncodeDecodeBlock(string const & s)
|
||||
{
|
||||
vector<uint8_t> data;
|
||||
|
||||
{
|
||||
MemWriter<decltype(data)> sink(data);
|
||||
BWTCoder::EncodeAndWriteBlock(sink, s.size(), reinterpret_cast<uint8_t const *>(s.data()));
|
||||
}
|
||||
|
||||
string result;
|
||||
{
|
||||
MemReader reader(data.data(), data.size());
|
||||
ReaderSource<MemReader> source(reader);
|
||||
|
||||
BWTCoder::ReadAndDecodeBlock(source, back_inserter(result));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
UNIT_TEST(BWTEncoder_Smoke)
|
||||
{
|
||||
for (size_t blockSize = 1; blockSize < 100; ++blockSize)
|
||||
|
@ -42,12 +62,16 @@ UNIT_TEST(BWTEncoder_Smoke)
|
|||
|
||||
params.m_blockSize = blockSize;
|
||||
string const s = "abracadabra";
|
||||
TEST_EQUAL(s, EncodeDecodeBlock(s), ());
|
||||
TEST_EQUAL(s, EncodeDecode(params, s), (blockSize));
|
||||
}
|
||||
|
||||
string const strings[] = {"", "mississippi", "again and again and again"};
|
||||
for (auto const & s : strings)
|
||||
{
|
||||
TEST_EQUAL(s, EncodeDecodeBlock(s), ());
|
||||
TEST_EQUAL(s, EncodeDecode(BWTCoder::Params{}, s), ());
|
||||
}
|
||||
}
|
||||
|
||||
UNIT_TEST(BWT_Large)
|
||||
|
|
|
@ -39,6 +39,7 @@ SOURCES += ../../testing/testingmain.cpp \
|
|||
reader_writer_ops_test.cpp \
|
||||
simple_dense_coding_test.cpp \
|
||||
succinct_mapper_test.cpp \
|
||||
text_storage_tests.cpp \
|
||||
traffic_test.cpp \
|
||||
uri_test.cpp \
|
||||
url_encode_test.cpp \
|
||||
|
|
131
coding/coding_tests/text_storage_tests.cpp
Normal file
131
coding/coding_tests/text_storage_tests.cpp
Normal file
|
@ -0,0 +1,131 @@
|
|||
#include "testing/testing.hpp"
|
||||
|
||||
#include "coding/reader.hpp"
|
||||
#include "coding/text_storage.hpp"
|
||||
#include "coding/writer.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using namespace coding;
|
||||
using namespace std;
|
||||
|
||||
namespace
|
||||
{
|
||||
// Returns a string whose length is drawn uniformly from [0, 400] and
// whose bytes are drawn uniformly from [0, 255]. All randomness comes
// from |engine|: first the length is drawn, then one value per byte.
template <typename Engine>
string GenerateRandomString(Engine & engine)
{
  uniform_int_distribution<int> lengthDistribution(0 /* min */, 400 /* max */);
  uniform_int_distribution<int> byteDistribution(0 /* min */, 255 /* max */);

  string result(lengthDistribution(engine), '\0');
  for (auto it = result.begin(); it != result.end(); ++it)
    *it = static_cast<char>(byteDistribution(engine));
  return result;
}
|
||||
|
||||
void DumpStrings(vector<string> const & strings, uint64_t blockSize, vector<uint8_t> & buffer)
|
||||
{
|
||||
MemWriter<vector<uint8_t>> writer(buffer);
|
||||
BlockedTextStorageWriter<decltype(writer)> ts(writer, blockSize);
|
||||
for (auto const & s : strings)
|
||||
ts.Append(s);
|
||||
}
|
||||
|
||||
// A storage built from no strings must be readable both through the
// index and through the reader, and must report zero strings.
UNIT_TEST(TextStorage_Smoke)
{
  vector<uint8_t> storage;
  DumpStrings({} /* strings */, 10 /* blockSize */, storage);

  // Index only.
  {
    MemReader indexReader(storage.data(), storage.size());
    BlockedTextStorageIndex emptyIndex;
    emptyIndex.Read(indexReader);
    TEST_EQUAL(emptyIndex.GetNumStrings(), 0, ());
    TEST_EQUAL(emptyIndex.GetNumBlockInfos(), 0, ());
  }

  // Full reader.
  {
    MemReader storageReader(storage.data(), storage.size());
    BlockedTextStorageReader<MemReader> textStorage(storageReader);
    TEST_EQUAL(textStorage.GetNumStrings(), 0, ());
  }
}
|
||||
|
||||
// Writes a handful of short strings with a small block size and checks
// the number of blocks in the index and every extracted string.
UNIT_TEST(TextStorage_Simple)
{
  vector<string> const expected = {"", "Hello", "Hello, World!", "Hola mundo", "Smoke test"};

  vector<uint8_t> storage;
  DumpStrings(expected, 10 /* blockSize */, storage);

  // Index only.
  {
    MemReader indexReader(storage.data(), storage.size());
    BlockedTextStorageIndex storageIndex;
    storageIndex.Read(indexReader);
    TEST_EQUAL(storageIndex.GetNumStrings(), expected.size(), ());
    TEST_EQUAL(storageIndex.GetNumBlockInfos(), 3, ());
  }

  // Full reader.
  {
    MemReader storageReader(storage.data(), storage.size());
    BlockedTextStorageReader<MemReader> textStorage(storageReader);
    TEST_EQUAL(textStorage.GetNumStrings(), expected.size(), ());

    size_t stringIx = 0;
    while (stringIx < textStorage.GetNumStrings())
    {
      TEST_EQUAL(textStorage.ExtractString(stringIx), expected[stringIx], ());
      ++stringIx;
    }
  }
}
|
||||
|
||||
// Stores long runs of empty strings separated by one-byte strings and
// checks that every string, including the empty ones, is extracted
// unchanged.
UNIT_TEST(TextStorage_Empty)
{
  int const kNumGroups = 1000;
  int const kNumEmptyPerGroup = 1000;

  vector<string> expected;
  for (int group = 0; group < kNumGroups; ++group)
  {
    // One single-byte string followed by a run of empty strings.
    expected.emplace_back(1 /* size */, static_cast<char>(group % 256));
    expected.resize(expected.size() + kNumEmptyPerGroup);
  }

  vector<uint8_t> storage;
  DumpStrings(expected, 5 /* blockSize */, storage);

  MemReader storageReader(storage.data(), storage.size());
  BlockedTextStorageReader<MemReader> textStorage(storageReader);
  TEST_EQUAL(textStorage.GetNumStrings(), expected.size(), ());

  size_t stringIx = 0;
  while (stringIx < textStorage.GetNumStrings())
  {
    TEST_EQUAL(textStorage.ExtractString(stringIx), expected[stringIx], ());
    ++stringIx;
  }
}
|
||||
|
||||
// Stores pseudo-random binary strings (fixed seed) and checks that all
// of them are extracted unchanged: first in forward order, then, after
// the cache is dropped, in backward order.
UNIT_TEST(TextStorage_Random)
{
  int const kSeed = 42;
  int const kNumStrings = 1000;
  int const kBlockSize = 100;

  mt19937 rng(kSeed);

  vector<string> expected(kNumStrings);
  for (auto & s : expected)
    s = GenerateRandomString(rng);

  vector<uint8_t> storage;
  DumpStrings(expected, kBlockSize, storage);

  MemReader storageReader(storage.data(), storage.size());
  BlockedTextStorageReader<MemReader> textStorage(storageReader);

  TEST_EQUAL(textStorage.GetNumStrings(), expected.size(), ());

  // Forward pass.
  for (size_t i = 0; i != textStorage.GetNumStrings(); ++i)
    TEST_EQUAL(textStorage.ExtractString(i), expected[i], ());

  textStorage.ClearCache();

  // Backward pass over the same indices.
  for (size_t i = textStorage.GetNumStrings(); i-- > 0;)
    TEST_EQUAL(textStorage.ExtractString(i), expected[i], ());
}
|
||||
} // namespace
|
282
coding/text_storage.hpp
Normal file
282
coding/text_storage.hpp
Normal file
|
@ -0,0 +1,282 @@
|
|||
#pragma once
|
||||
|
||||
#include "coding/bwt_coder.hpp"
#include "coding/reader.hpp"
#include "coding/varint.hpp"
#include "coding/write_to_sink.hpp"

#include "base/assert.hpp"

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
|
||||
|
||||
namespace coding
|
||||
{
|
||||
// Writes set of strings in a format that allows to access blocks of
|
||||
// strings. The size of each block roughly equals to the |blockSize|,
|
||||
// because the whole number of strings is packed into a single block.
|
||||
//
|
||||
// Format description:
|
||||
// * first 8 bytes - little endian-encoded offset of the index section
|
||||
// * data section - represents a catenated sequence of BWT-compressed blocks with
|
||||
// the sequence of individual string lengths in the block
|
||||
// * index section - represents a delta-encoded sequence of
|
||||
// BWT-compressed blocks offsets intermixed with the number of
|
||||
// strings inside each block.
|
||||
//
|
||||
// All numbers except the first offset are varints.
|
||||
template <typename Writer>
|
||||
class BlockedTextStorageWriter
|
||||
{
|
||||
public:
|
||||
BlockedTextStorageWriter(Writer & writer, uint64_t blockSize)
|
||||
: m_writer(writer), m_blockSize(blockSize), m_startOffset(writer.Pos()), m_blocks(1)
|
||||
{
|
||||
CHECK(m_blockSize != 0, ());
|
||||
WriteToSink(m_writer, static_cast<uint64_t>(0));
|
||||
m_dataOffset = m_writer.Pos();
|
||||
}
|
||||
|
||||
~BlockedTextStorageWriter()
|
||||
{
|
||||
if (!m_lengths.empty())
|
||||
FlushPool(m_lengths, m_pool);
|
||||
|
||||
if (m_blocks.back().IsEmpty())
|
||||
m_blocks.pop_back();
|
||||
|
||||
{
|
||||
auto const currentOffset = m_writer.Pos();
|
||||
ASSERT_GREATER_OR_EQUAL(currentOffset, m_startOffset, ());
|
||||
m_writer.Seek(m_startOffset);
|
||||
WriteToSink(m_writer, static_cast<uint64_t>(currentOffset - m_startOffset));
|
||||
m_writer.Seek(currentOffset);
|
||||
}
|
||||
|
||||
WriteVarUint(m_writer, m_blocks.size());
|
||||
|
||||
uint64_t prevOffset = 0;
|
||||
for (auto const & block : m_blocks)
|
||||
{
|
||||
ASSERT_GREATER_OR_EQUAL(block.m_offset, prevOffset, ());
|
||||
WriteVarUint(m_writer, block.m_offset - prevOffset);
|
||||
|
||||
ASSERT(!block.IsEmpty(), ());
|
||||
WriteVarUint(m_writer, block.m_subs);
|
||||
|
||||
prevOffset = block.m_offset;
|
||||
}
|
||||
}
|
||||
|
||||
void Append(std::string const & s)
|
||||
{
|
||||
ASSERT(!m_blocks.empty(), ());
|
||||
|
||||
ASSERT_LESS(m_pool.size(), m_blockSize, ());
|
||||
|
||||
++m_blocks.back().m_subs;
|
||||
m_pool.append(s);
|
||||
m_lengths.push_back(s.size());
|
||||
|
||||
if (m_pool.size() >= m_blockSize)
|
||||
{
|
||||
FlushPool(m_lengths, m_pool);
|
||||
m_pool.clear();
|
||||
m_lengths.clear();
|
||||
m_blocks.emplace_back(m_writer.Pos() - m_dataOffset /* offset */, 0 /* subs */);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
struct Block
|
||||
{
|
||||
Block() = default;
|
||||
Block(uint64_t offset, uint64_t subs) : m_offset(offset), m_subs(subs) {}
|
||||
|
||||
bool IsEmpty() const { return m_subs == 0; }
|
||||
|
||||
uint64_t m_offset = 0; // offset of the block inside the sequence of compressed blocks
|
||||
uint64_t m_subs = 0; // number of strings inside the block
|
||||
};
|
||||
|
||||
void FlushPool(vector<uint64_t> const & lengths, string const & pool)
|
||||
{
|
||||
for (auto const & length : lengths)
|
||||
WriteVarUint(m_writer, length);
|
||||
BWTCoder::EncodeAndWriteBlock(m_writer, pool.size(),
|
||||
reinterpret_cast<uint8_t const *>(pool.c_str()));
|
||||
}
|
||||
|
||||
Writer & m_writer;
|
||||
uint64_t const m_blockSize;
|
||||
uint64_t m_startOffset = 0;
|
||||
uint64_t m_dataOffset = 0;
|
||||
|
||||
vector<Block> m_blocks;
|
||||
|
||||
std::string m_pool; // concatenated strings
|
||||
vector<uint64_t> m_lengths; // lengths of strings inside the |m_pool|
|
||||
};
|
||||
|
||||
class BlockedTextStorageIndex
|
||||
{
|
||||
public:
|
||||
struct BlockInfo
|
||||
{
|
||||
// Returns the index of the first string belonging to the block.
|
||||
uint64_t From() const { return m_from; }
|
||||
|
||||
// Returns the index of the first string from the next block.
|
||||
uint64_t To() const { return m_from + m_subs; }
|
||||
|
||||
uint64_t m_offset = 0; // offset of the block from the beginning of the section
|
||||
uint64_t m_from = 0; // index of the first string in the block
|
||||
uint64_t m_subs = 0; // number of strings in the block
|
||||
};
|
||||
|
||||
size_t GetNumBlockInfos() const { return m_blocks.size(); }
|
||||
size_t GetNumStrings() const { return m_blocks.empty() ? 0 : m_blocks.back().To(); }
|
||||
|
||||
BlockInfo const & GetBlockInfo(size_t blockIx) const
|
||||
{
|
||||
ASSERT_LESS(blockIx, GetNumBlockInfos(), ());
|
||||
return m_blocks[blockIx];
|
||||
}
|
||||
|
||||
// Returns the index of the block the |stringIx| belongs to.
|
||||
// Returns the number of blocks if there're no such block.
|
||||
size_t GetBlockIx(size_t stringIx) const
|
||||
{
|
||||
if (m_blocks.empty() || stringIx >= m_blocks.back().To())
|
||||
return GetNumBlockInfos();
|
||||
if (stringIx >= m_blocks.back().From())
|
||||
return GetNumBlockInfos() - 1;
|
||||
|
||||
size_t lo = 0, hi = GetNumBlockInfos() - 1;
|
||||
while (lo + 1 != hi)
|
||||
{
|
||||
ASSERT_GREATER_OR_EQUAL(stringIx, m_blocks[lo].From(), ());
|
||||
ASSERT_LESS(stringIx, m_blocks[hi].From(), ());
|
||||
|
||||
auto const mi = lo + (hi - lo) / 2;
|
||||
if (stringIx >= m_blocks[mi].From())
|
||||
lo = mi;
|
||||
else
|
||||
hi = mi;
|
||||
}
|
||||
|
||||
ASSERT_GREATER_OR_EQUAL(stringIx, m_blocks[lo].From(), ());
|
||||
ASSERT_LESS(stringIx, m_blocks[hi].From(), ());
|
||||
return lo;
|
||||
}
|
||||
|
||||
template <typename Reader>
|
||||
void Read(Reader & reader)
|
||||
{
|
||||
auto const indexOffset = ReadPrimitiveFromPos<uint64_t, Reader>(reader, 0);
|
||||
|
||||
NonOwningReaderSource source(reader);
|
||||
source.Skip(indexOffset);
|
||||
|
||||
auto const numBlocks = ReadVarUint<uint64_t, NonOwningReaderSource>(source);
|
||||
m_blocks.assign(numBlocks, {});
|
||||
|
||||
uint64_t prevOffset = 8; // 8 bytes for the offset
|
||||
for (uint64_t i = 0; i < numBlocks; ++i)
|
||||
{
|
||||
auto const delta = ReadVarUint<uint64_t, NonOwningReaderSource>(source);
|
||||
CHECK_GREATER_OR_EQUAL(prevOffset + delta, prevOffset, ());
|
||||
prevOffset += delta;
|
||||
|
||||
auto & block = m_blocks[i];
|
||||
block.m_offset = prevOffset;
|
||||
block.m_from = i == 0 ? 0 : m_blocks[i - 1].To();
|
||||
block.m_subs = ReadVarUint<uint64_t, NonOwningReaderSource>(source);
|
||||
CHECK_GREATER_OR_EQUAL(block.m_from + block.m_subs, block.m_from, ());
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<BlockInfo> m_blocks;
|
||||
};
|
||||
|
||||
template <typename Reader>
|
||||
class BlockedTextStorageReader
|
||||
{
|
||||
public:
|
||||
explicit BlockedTextStorageReader(Reader & reader) : m_reader(reader) { m_index.Read(reader); }
|
||||
|
||||
size_t GetNumStrings() const { return m_index.GetNumStrings(); }
|
||||
|
||||
std::string ExtractString(size_t stringIx)
|
||||
{
|
||||
auto const blockIx = m_index.GetBlockIx(stringIx);
|
||||
CHECK_LESS(blockIx, m_index.GetNumBlockInfos(), ());
|
||||
|
||||
if (blockIx >= m_cache.size())
|
||||
m_cache.resize(blockIx + 1);
|
||||
ASSERT_LESS(blockIx, m_cache.size(), ());
|
||||
|
||||
auto const & bi = m_index.GetBlockInfo(blockIx);
|
||||
|
||||
auto & entry = m_cache[blockIx];
|
||||
if (!entry.m_valid)
|
||||
{
|
||||
NonOwningReaderSource source(m_reader);
|
||||
source.Skip(bi.m_offset);
|
||||
|
||||
entry.m_value.clear();
|
||||
entry.m_subs.resize(bi.m_subs);
|
||||
|
||||
uint64_t offset = 0;
|
||||
for (size_t i = 0; i < entry.m_subs.size(); ++i)
|
||||
{
|
||||
auto & sub = entry.m_subs[i];
|
||||
sub.m_offset = offset;
|
||||
sub.m_length = ReadVarUint<uint64_t>(source);
|
||||
CHECK_GREATER_OR_EQUAL(sub.m_offset + sub.m_length, sub.m_offset, ());
|
||||
offset += sub.m_length;
|
||||
}
|
||||
BWTCoder::ReadAndDecodeBlock(source, std::back_inserter(entry.m_value));
|
||||
entry.m_valid = true;
|
||||
}
|
||||
ASSERT(entry.m_valid, ());
|
||||
|
||||
ASSERT_GREATER_OR_EQUAL(stringIx, bi.From(), ());
|
||||
ASSERT_LESS(stringIx, bi.To(), ());
|
||||
|
||||
stringIx -= bi.From();
|
||||
ASSERT_LESS(stringIx, entry.m_subs.size(), ());
|
||||
|
||||
auto const & si = entry.m_subs[stringIx];
|
||||
auto const & value = entry.m_value;
|
||||
ASSERT_LESS_OR_EQUAL(si.m_offset + si.m_length, value.size(), ());
|
||||
return value.substr(si.m_offset, si.m_length);
|
||||
}
|
||||
|
||||
void ClearCache() { m_cache.clear(); }
|
||||
|
||||
private:
|
||||
struct StringInfo
|
||||
{
|
||||
StringInfo() = default;
|
||||
StringInfo(uint64_t offset, uint64_t length): m_offset(offset), m_length(length) {}
|
||||
|
||||
uint64_t m_offset = 0; // offset of the string inside the decompressed block
|
||||
uint64_t m_length = 0; // length of the string
|
||||
};
|
||||
|
||||
struct CacheEntry
|
||||
{
|
||||
std::string m_value; // concatenation of the strings
|
||||
std::vector<StringInfo> m_subs; // indices of individual strings
|
||||
bool m_valid = false;
|
||||
};
|
||||
|
||||
Reader & m_reader;
|
||||
BlockedTextStorageIndex m_index;
|
||||
std::vector<CacheEntry> m_cache;
|
||||
};
|
||||
} // namespace coding
|
Loading…
Add table
Reference in a new issue