forked from organicmaps/organicmaps
[coding] Implemented SDC (Simple Dense Coding).
This commit is contained in:
parent
d31d1a6a66
commit
122fdfda89
8 changed files with 484 additions and 0 deletions
|
@ -13,6 +13,14 @@
|
|||
|
||||
#include "intrinsics.hpp"
|
||||
|
||||
// Forward declarations of the visitors defined in coding/succinct_mapper.hpp,
// so that they can be named in friend declarations below.
namespace coding
{
template <typename TWriter> class FreezeVisitor;
class MapVisitor;
class ReverseMapVisitor;
}  // namespace coding
|
||||
|
||||
namespace succinct { namespace mapper {
|
||||
|
||||
namespace detail {
|
||||
|
@ -115,6 +123,11 @@ namespace succinct { namespace mapper {
|
|||
friend class detail::map_visitor;
|
||||
friend class detail::sizeof_visitor;
|
||||
|
||||
template<typename TWriter>
|
||||
friend class coding::FreezeVisitor;
|
||||
friend class coding::MapVisitor;
|
||||
friend class coding::ReverseMapVisitor;
|
||||
|
||||
protected:
|
||||
const T* m_data;
|
||||
uint64_t m_size;
|
||||
|
|
|
@ -30,6 +30,7 @@ SOURCES += \
|
|||
reader.cpp \
|
||||
reader_streambuf.cpp \
|
||||
reader_writer_ops.cpp \
|
||||
simple_dense_coding.cpp \
|
||||
sha2.cpp \
|
||||
uri.cpp \
|
||||
# varint_vector.cpp \
|
||||
|
@ -80,10 +81,12 @@ HEADERS += \
|
|||
reader_streambuf.hpp \
|
||||
reader_wrapper.hpp \
|
||||
reader_writer_ops.hpp \
|
||||
simple_dense_coding.hpp \
|
||||
sha2.hpp \
|
||||
streams.hpp \
|
||||
streams_common.hpp \
|
||||
streams_sink.hpp \
|
||||
succinct_mapper.hpp \
|
||||
succinct_trie.hpp \
|
||||
succinct_trie_builder.hpp \
|
||||
succinct_trie_reader.hpp \
|
||||
|
|
|
@ -35,7 +35,9 @@ SOURCES += ../../testing/testingmain.cpp \
|
|||
reader_cache_test.cpp \
|
||||
reader_test.cpp \
|
||||
reader_writer_ops_test.cpp \
|
||||
simple_dense_coding_test.cpp \
|
||||
sha2_test.cpp \
|
||||
succinct_mapper_test.cpp \
|
||||
succinct_trie_test.cpp \
|
||||
trie_test.cpp \
|
||||
uri_test.cpp \
|
||||
|
|
50
coding/coding_tests/simple_dense_coding_test.cpp
Normal file
50
coding/coding_tests/simple_dense_coding_test.cpp
Normal file
|
@ -0,0 +1,50 @@
|
|||
#include "testing/testing.hpp"
|
||||
|
||||
#include "coding/file_writer.hpp"
|
||||
#include "coding/mmap_reader.hpp"
|
||||
#include "coding/simple_dense_coding.hpp"
|
||||
#include "coding/succinct_mapper.hpp"
|
||||
|
||||
#include "base/logging.hpp"
|
||||
#include "base/scope_guard.hpp"
|
||||
|
||||
#include "std/limits.hpp"
|
||||
#include "std/string.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
using namespace coding;
|
||||
|
||||
namespace
|
||||
{
|
||||
void TestSDC(vector<uint8_t> const & data, SimpleDenseCoding const & coding)
|
||||
{
|
||||
TEST_EQUAL(data.size(), coding.Size(), ());
|
||||
for (size_t i = 0; i < data.size(); ++i)
|
||||
TEST_EQUAL(data[i], coding.Get(i), ());
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Round-trips a byte string through SimpleDenseCoding: builds the coding in
// memory, freezes it into a file, maps the file back and checks that both the
// in-memory and the mapped instances decode to the original bytes.
UNIT_TEST(SimpleDenseCoding_Smoke)
{
  // Input is the sequence 0, 1, ..., kNumBytes - 1, so every symbol is unique.
  size_t const kNumBytes = numeric_limits<uint8_t>::max();
  vector<uint8_t> bytes(kNumBytes);
  size_t next = 0;
  for (uint8_t & byte : bytes)
    byte = next++;

  string const kTestFile = "test.tmp";
  MY_SCOPE_GUARD(cleanup, bind(&FileWriter::DeleteFileX, kTestFile));

  // Encode and serialize. The writer lives only inside this scope.
  {
    SimpleDenseCoding encoded(bytes);
    TestSDC(bytes, encoded);
    FileWriter writer(kTestFile);
    Freeze(encoded, writer);
  }

  // Map the serialized representation back and verify it.
  {
    MmapReader reader(kTestFile);
    SimpleDenseCoding mapped;
    Map(mapped, reader.Data());
    TestSDC(bytes, mapped);
  }
}
|
38
coding/coding_tests/succinct_mapper_test.cpp
Normal file
38
coding/coding_tests/succinct_mapper_test.cpp
Normal file
|
@ -0,0 +1,38 @@
|
|||
#include "testing/testing.hpp"
|
||||
|
||||
#include "coding/succinct_mapper.hpp"
|
||||
#include "coding/writer.hpp"
|
||||
|
||||
#include "std/vector.hpp"
|
||||
|
||||
#include "3party/succinct/mapper.hpp"
|
||||
|
||||
using namespace coding;
|
||||
|
||||
// ReverseMap of a single uint64_t must consume 8 bytes, yield the
// byte-reversed value and leave the source buffer byte-reversed as well.
UNIT_TEST(ReverseMapper_Smoke)
{
  uint64_t buffer = 0x0123456789abcdef;
  uint64_t result = 0x0;
  uint8_t * const raw = reinterpret_cast<uint8_t *>(&buffer);
  TEST_EQUAL(8, ReverseMap(result, raw, "uint64_t"), ());

  // Test that reversed uint64_t was read.
  TEST_EQUAL(0xefcdab8967452301, result, ());

  // Test that underlying buffer was modified.
  TEST_EQUAL(0xefcdab8967452301, buffer, ());
}
|
||||
|
||||
// Freezes a single uint64_t into a memory buffer and maps it back: both
// directions must process exactly 8 bytes and the value must survive.
UNIT_TEST(Freeze_Smoke)
{
  vector<uint8_t> buffer;
  {
    MemWriter<vector<uint8_t>> writer(buffer);
    uint64_t const frozen = 0x0123456789abcdef;
    Freeze(frozen, writer);
  }
  TEST_EQUAL(8, buffer.size(), ());

  uint64_t restored = 0x0;
  uint8_t const * const base = buffer.data();
  TEST_EQUAL(8, Map(restored, base), ());
  TEST_EQUAL(0x0123456789abcdef, restored, ());
}
|
124
coding/simple_dense_coding.cpp
Normal file
124
coding/simple_dense_coding.cpp
Normal file
|
@ -0,0 +1,124 @@
|
|||
#include "coding/simple_dense_coding.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/limits.hpp"
|
||||
|
||||
namespace coding
|
||||
{
|
||||
namespace
{
// A single codeword: the bit pattern |m_code| that is |m_length| bits long.
struct Code
{
  Code() : m_code(0), m_length(0) {}

  uint8_t m_code;
  uint8_t m_length;
};

// Number of distinct byte values.
size_t const kAlphabetSize = static_cast<size_t>(numeric_limits<uint8_t>::max()) + 1;

// Maps a rank (position of a symbol in the list of symbols ordered by
// frequency) to its codeword. Ranks are assigned the code words
// 0, 1, 00, 01, 10, 11, 000, 001, ... in this order.
struct CodeTable
{
  CodeTable()
  {
    unsigned length = 1;
    size_t rank = 0;
    while (rank < kAlphabetSize)
    {
      // Number of codes with the same bit length.
      size_t const numCodes = static_cast<size_t>(1) << length;

      // Rank of the first code of the current length.
      size_t const base = rank;
      while (rank - base < numCodes && rank < kAlphabetSize)
      {
        m_codes[rank].m_code = static_cast<uint8_t>(rank - base);
        m_codes[rank].m_length = static_cast<uint8_t>(length);
        ++rank;
      }

      ++length;
    }
  }

  Code m_codes[kAlphabetSize];
};

// Returns pointer to an initialized code table of kAlphabetSize
// entries. The table is built on the first call; a function-local
// static is used instead of a hand-rolled "initialized" flag, so
// concurrent first calls do not race on the table.
Code const * GetCodeTable()
{
  static CodeTable const table;
  return table.m_codes;
}

// Computes frequencies for data symbols: after the call frequency[s]
// is the number of occurrences of the byte |s| in |data|. |frequency|
// must point to at least kAlphabetSize elements, all of them are
// overwritten.
void GetFrequences(vector<uint8_t> const & data, uint64_t frequency[])
{
  for (size_t i = 0; i < kAlphabetSize; ++i)
    frequency[i] = 0;
  for (uint8_t symbol : data)
    ++frequency[symbol];
}
}  // namespace
|
||||
|
||||
// Builds the coding for |data|: symbols are ranked by frequency, every
// symbol of |data| is replaced by the codeword of its rank, and a helper
// bit-vector marks the bit position at which every codeword starts.
SimpleDenseCoding::SimpleDenseCoding(vector<uint8_t> const & data)
{
  uint64_t frequency[kAlphabetSize];  // Maps symbols to frequencies.
  GetFrequences(data, frequency);

  // Maps ranks to symbols: all byte values ordered by non-increasing
  // frequency.
  uint8_t symbols[kAlphabetSize];
  for (size_t s = 0; s < kAlphabetSize; ++s)
    symbols[s] = s;
  sort(symbols, symbols + kAlphabetSize, [&frequency](uint8_t lhs, uint8_t rhs)
  {
    return frequency[rhs] < frequency[lhs];
  });

  // Maps symbols to ranks, i.e. the inverse of |symbols|.
  uint8_t rank[kAlphabetSize];
  for (size_t r = 0; r < kAlphabetSize; ++r)
    rank[symbols[r]] = r;

  Code const * codeTable = GetCodeTable();
  ASSERT(codeTable, ());

  // Total length of the encoded string, in bits.
  uint64_t bitLength = 0;
  for (size_t symbol = 0; symbol < kAlphabetSize; ++symbol)
  {
    uint64_t const codeLength = codeTable[rank[symbol]].m_length;
    bitLength += frequency[symbol] * codeLength;
  }

  succinct::bit_vector_builder bitsBuilder;
  bitsBuilder.reserve(bitLength);
  vector<bool> indexBuilder(bitLength);

  // |offset| is the bit position where the next codeword starts.
  size_t offset = 0;
  for (uint8_t symbol : data)
  {
    Code const & code = codeTable[rank[symbol]];
    ASSERT_LESS(offset, bitLength, ());
    indexBuilder[offset] = true;

    bitsBuilder.append_bits(code.m_code, code.m_length);
    offset += code.m_length;
  }
  ASSERT_EQUAL(offset, bitLength, ());

  succinct::bit_vector(&bitsBuilder).swap(m_bits);
  succinct::rs_bit_vector(indexBuilder, true /* with_select_hints */).swap(m_index);
  m_symbols.assign(symbols);
}
|
||||
|
||||
uint8_t SimpleDenseCoding::Get(uint64_t i) const
|
||||
{
|
||||
ASSERT_LESS(i, Size(), ());
|
||||
uint64_t const start = m_index.select(i);
|
||||
uint64_t const end = i + 1 == Size() ? m_index.size() : m_index.select(i + 1);
|
||||
|
||||
ASSERT_LESS(start, end, ());
|
||||
|
||||
uint8_t const length = static_cast<uint8_t>(end - start);
|
||||
ASSERT_LESS_OR_EQUAL(length, 8, ());
|
||||
|
||||
uint8_t const code = m_bits.get_bits(start, length);
|
||||
uint8_t const rank = (1 << length) - 2 + code;
|
||||
return m_symbols[rank];
|
||||
}
|
||||
} // namespace coding
|
58
coding/simple_dense_coding.hpp
Normal file
58
coding/simple_dense_coding.hpp
Normal file
|
@ -0,0 +1,58 @@
|
|||
#pragma once
|
||||
|
||||
#include "std/vector.hpp"
|
||||
|
||||
#include "3party/succinct/bit_vector.hpp"
|
||||
#include "3party/succinct/mappable_vector.hpp"
|
||||
#include "3party/succinct/rs_bit_vector.hpp"
|
||||
|
||||
namespace coding
|
||||
{
|
||||
// This class represents so-called simple dense coding for byte
|
||||
// strings. It can be used when it's necessary to compress strings
|
||||
// with skewed entropy and nevertheless efficient access to the
|
||||
// string's elements is needed.
|
||||
//
|
||||
// The main idea is to assign codewords from the set { 0, 1, 00, 01,
|
||||
// 10, 11, 000, ... } to string's symbols in accordance with their
|
||||
// frequencies and to create a helper bit-vector for starting
|
||||
// positions of the codewords in compressed string.
|
||||
//
|
||||
// Memory complexity: 2 * n * (H_0(T) + 1) bits for a text T, but note
|
||||
// that this is an upper bound and too pessimistic.
|
||||
|
||||
// Time complexity: O(log(n * H_0(T))) to access i-th element of the
|
||||
// string, because of logarithmic complexity of
|
||||
// rs_bit_vector::select. This will be fixed when RRR will be
|
||||
// implemented.
|
||||
//
|
||||
// For details, see Kimmo Fredriksson, Fedor Nikitin, "Simple Random
|
||||
// Access Compression", Fundamenta Informaticae 2009,
|
||||
// http://www.cs.uku.fi/~fredriks/pub/papers/fi09.pdf.
|
||||
class SimpleDenseCoding
|
||||
{
|
||||
public:
|
||||
SimpleDenseCoding() = default;
|
||||
|
||||
SimpleDenseCoding(vector<uint8_t> const & data);
|
||||
|
||||
uint8_t Get(uint64_t i) const;
|
||||
|
||||
inline uint64_t Size() const { return m_index.num_ones(); }
|
||||
|
||||
// map is used here (instead of Map) for compatibility with succinct
|
||||
// structures.
|
||||
template <typename TVisitor>
|
||||
void map(TVisitor & visitor)
|
||||
{
|
||||
visitor(m_bits, "m_bits");
|
||||
visitor(m_index, "m_index");
|
||||
visitor(m_symbols, "m_symbols");
|
||||
}
|
||||
|
||||
private:
|
||||
succinct::bit_vector m_bits;
|
||||
succinct::rs_bit_vector m_index;
|
||||
succinct::mapper::mappable_vector<uint8_t> m_symbols;
|
||||
};
|
||||
} // namespace coding
|
196
coding/succinct_mapper.hpp
Normal file
196
coding/succinct_mapper.hpp
Normal file
|
@ -0,0 +1,196 @@
|
|||
#pragma once
|
||||
|
||||
#include "coding/endianness.hpp"
|
||||
|
||||
#include "std/type_traits.hpp"
|
||||
|
||||
#include "3party/succinct/mappable_vector.hpp"
|
||||
#include "3party/succinct/mapper.hpp"
|
||||
|
||||
namespace coding
|
||||
{
|
||||
// Rounds the address stored in |ptr| up to the nearest multiple of 8.
// A pointer that is already 8-byte aligned is returned unchanged.
template <typename T>
static T * Align8Ptr(T * ptr)
{
  uint64_t const kLowBits = 0x7;
  uint64_t const address = reinterpret_cast<uint64_t>(ptr);
  uint64_t const aligned = (address + kLowBits) & ~kLowBits;
  return reinterpret_cast<T *>(aligned);
}
|
||||
|
||||
// Returns the number of padding bytes, in range [0, 7], that must follow
// |written| bytes to make the total a multiple of 8. The result is masked so
// that an already aligned |written| yields 0 rather than 8.
inline uint32_t NeedToAlign8(uint64_t written)
{
  return static_cast<uint32_t>((0x8 - (written & 0x7)) & 0x7);
}
|
||||
|
||||
class MapVisitor
|
||||
{
|
||||
public:
|
||||
MapVisitor(uint8_t const * base) : m_base(base), m_cur(m_base) {}
|
||||
|
||||
template <typename T>
|
||||
typename enable_if<!is_pod<T>::value, MapVisitor &>::type operator()(
|
||||
T & val, char const * /* friendlyName */)
|
||||
{
|
||||
val.map(*this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename enable_if<is_pod<T>::value, MapVisitor &>::type operator()(
|
||||
T & val, char const * /* friendlyName */)
|
||||
{
|
||||
T const * valPtr = reinterpret_cast<T const *>(m_cur);
|
||||
val = *valPtr;
|
||||
m_cur += sizeof(T);
|
||||
|
||||
m_cur = Align8Ptr(m_cur);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
MapVisitor & operator()(succinct::mapper::mappable_vector<T> & vec,
|
||||
char const * /* friendlyName */)
|
||||
{
|
||||
vec.clear();
|
||||
(*this)(vec.m_size, "size");
|
||||
|
||||
vec.m_data = reinterpret_cast<const T *>(m_cur);
|
||||
size_t const bytes = vec.m_size * sizeof(T);
|
||||
|
||||
m_cur += bytes;
|
||||
m_cur = Align8Ptr(m_cur);
|
||||
return *this;
|
||||
}
|
||||
|
||||
size_t BytesRead() const { return static_cast<size_t>(m_cur - m_base); }
|
||||
|
||||
private:
|
||||
uint8_t const * const m_base;
|
||||
uint8_t const * m_cur;
|
||||
};
|
||||
|
||||
class ReverseMapVisitor
|
||||
{
|
||||
public:
|
||||
ReverseMapVisitor(uint8_t * base) : m_base(base), m_cur(m_base) {}
|
||||
|
||||
template <typename T>
|
||||
typename enable_if<!is_pod<T>::value, ReverseMapVisitor &>::type operator()(
|
||||
T & val, char const * /* friendlyName */)
|
||||
{
|
||||
val.map(*this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename enable_if<is_pod<T>::value, ReverseMapVisitor &>::type operator()(
|
||||
T & val, char const * /* friendlyName */)
|
||||
{
|
||||
T * valPtr = reinterpret_cast<T *>(m_cur);
|
||||
*valPtr = ReverseByteOrder(*valPtr);
|
||||
val = *valPtr;
|
||||
m_cur += sizeof(T);
|
||||
|
||||
m_cur = Align8Ptr(m_cur);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
ReverseMapVisitor & operator()(succinct::mapper::mappable_vector<T> & vec,
|
||||
char const * /* friendlyName */)
|
||||
{
|
||||
vec.clear();
|
||||
(*this)(vec.m_size, "size");
|
||||
|
||||
vec.m_data = reinterpret_cast<const T *>(m_cur);
|
||||
for (auto const it = vec.begin(); it != vec.end(); ++it)
|
||||
*it = ReverseByteOrder(*it);
|
||||
size_t const bytes = vec.m_size * sizeof(T);
|
||||
|
||||
m_cur += bytes;
|
||||
m_cur = Align8Ptr(m_cur);
|
||||
return *this;
|
||||
}
|
||||
|
||||
size_t BytesRead() const { return static_cast<size_t>(m_cur - m_base); }
|
||||
|
||||
private:
|
||||
uint8_t * m_base;
|
||||
uint8_t * m_cur;
|
||||
};
|
||||
|
||||
template <typename TWriter>
|
||||
class FreezeVisitor
|
||||
{
|
||||
public:
|
||||
FreezeVisitor(TWriter & writer) : m_writer(writer), m_written(0) {}
|
||||
|
||||
template <typename T>
|
||||
typename enable_if<!is_pod<T>::value, FreezeVisitor &>::type operator()(
|
||||
T & val, char const * /* friendlyName */)
|
||||
{
|
||||
val.map(*this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename enable_if<is_pod<T>::value, FreezeVisitor &>::type operator()(
|
||||
T & val, char const * /* friendlyName */)
|
||||
{
|
||||
m_writer.Write(reinterpret_cast<void const *>(&val), sizeof(T));
|
||||
m_written += sizeof(T);
|
||||
WritePadding();
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
FreezeVisitor & operator()(succinct::mapper::mappable_vector<T> & vec,
|
||||
char const * /* friendlyName */)
|
||||
{
|
||||
(*this)(vec.m_size, "size");
|
||||
|
||||
size_t const bytes = static_cast<size_t>(vec.m_size * sizeof(T));
|
||||
m_writer.Write(vec.m_data, static_cast<size_t>(bytes));
|
||||
m_written += bytes;
|
||||
WritePadding();
|
||||
return *this;
|
||||
}
|
||||
|
||||
size_t Written() const { return m_written; }
|
||||
|
||||
private:
|
||||
void WritePadding()
|
||||
{
|
||||
uint32_t const padding = NeedToAlign8(m_written);
|
||||
static uint64_t const zero = 0;
|
||||
if (padding > 0 && padding < 8)
|
||||
{
|
||||
m_writer.Write(reinterpret_cast<void const *>(&zero), padding);
|
||||
m_written += padding;
|
||||
}
|
||||
}
|
||||
|
||||
TWriter & m_writer;
|
||||
uint64_t m_written;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
size_t Map(T & value, uint8_t const * base, char const * friendlyName = "<TOP>")
|
||||
{
|
||||
MapVisitor visitor(base);
|
||||
visitor(value, friendlyName);
|
||||
return visitor.BytesRead();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t ReverseMap(T & value, uint8_t * base, char const * friendlyName = "<TOP>")
|
||||
{
|
||||
ReverseMapVisitor visitor(base);
|
||||
visitor(value, friendlyName);
|
||||
return visitor.BytesRead();
|
||||
}
|
||||
|
||||
template <typename T, typename TWriter>
|
||||
size_t Freeze(T & val, TWriter & writer, char const * friendlyName = "<TOP>")
|
||||
{
|
||||
FreezeVisitor<TWriter> visitor(writer);
|
||||
visitor(val, friendlyName);
|
||||
return visitor.Written();
|
||||
}
|
||||
} // namespace coding
|
Loading…
Add table
Reference in a new issue