[coding] Implemented SDC (Simple Dense Coding).

This commit is contained in:
Yuri Gorshenin 2015-09-01 11:09:25 +03:00 committed by Sergey Yershov
parent d31d1a6a66
commit 122fdfda89
8 changed files with 484 additions and 0 deletions

View file

@ -13,6 +13,14 @@
#include "intrinsics.hpp"
namespace coding
{
// Forward declarations of the visitor classes, so that they can be
// named in the friend declarations of the succinct containers below
// before their definitions are visible.
template<typename TWriter>
class FreezeVisitor;
class MapVisitor;
class ReverseMapVisitor;
}
namespace succinct { namespace mapper {
namespace detail {
@ -115,6 +123,11 @@ namespace succinct { namespace mapper {
friend class detail::map_visitor;
friend class detail::sizeof_visitor;
// Visitors from the coding namespace are granted access to the
// protected m_data and m_size members declared below.
template<typename TWriter>
friend class coding::FreezeVisitor;
friend class coding::MapVisitor;
friend class coding::ReverseMapVisitor;
protected:
const T* m_data;
uint64_t m_size;

View file

@ -30,6 +30,7 @@ SOURCES += \
reader.cpp \
reader_streambuf.cpp \
reader_writer_ops.cpp \
simple_dense_coding.cpp \
sha2.cpp \
uri.cpp \
# varint_vector.cpp \
@ -80,10 +81,12 @@ HEADERS += \
reader_streambuf.hpp \
reader_wrapper.hpp \
reader_writer_ops.hpp \
simple_dense_coding.hpp \
sha2.hpp \
streams.hpp \
streams_common.hpp \
streams_sink.hpp \
succinct_mapper.hpp \
succinct_trie.hpp \
succinct_trie_builder.hpp \
succinct_trie_reader.hpp \

View file

@ -35,7 +35,9 @@ SOURCES += ../../testing/testingmain.cpp \
reader_cache_test.cpp \
reader_test.cpp \
reader_writer_ops_test.cpp \
simple_dense_coding_test.cpp \
sha2_test.cpp \
succinct_mapper_test.cpp \
succinct_trie_test.cpp \
trie_test.cpp \
uri_test.cpp \

View file

@ -0,0 +1,50 @@
#include "testing/testing.hpp"
#include "coding/file_writer.hpp"
#include "coding/mmap_reader.hpp"
#include "coding/simple_dense_coding.hpp"
#include "coding/succinct_mapper.hpp"
#include "base/logging.hpp"
#include "base/scope_guard.hpp"
#include "std/limits.hpp"
#include "std/string.hpp"
#include "std/vector.hpp"
using namespace coding;
namespace
{
// Checks that |coding| holds exactly the bytes of |data|: the sizes
// must be equal and Get(i) must return data[i] for every index i.
void TestSDC(vector<uint8_t> const & data, SimpleDenseCoding const & coding)
{
TEST_EQUAL(data.size(), coding.Size(), ());
for (size_t i = 0; i < data.size(); ++i)
TEST_EQUAL(data[i], coding.Get(i), ());
}
} // namespace
// Round-trip test: encodes a byte string, verifies it in memory,
// freezes it to a temporary file, maps the file back and verifies the
// mapped coding against the same byte string.
UNIT_TEST(SimpleDenseCoding_Smoke)
{
// 255 elements holding the values 0, 1, ..., 254, each exactly once.
size_t const kSize = numeric_limits<uint8_t>::max();
vector<uint8_t> data(kSize);
for (size_t i = 0; i < data.size(); ++i)
data[i] = i;
string const kTestFile = "test.tmp";
// NOTE(review): presumably deletes the temporary file when the guard
// goes out of scope - confirm against base/scope_guard.hpp.
MY_SCOPE_GUARD(cleanup, bind(&FileWriter::DeleteFileX, kTestFile));
{
// Build the coding from the raw bytes and serialize it. The writer
// lives only inside this scope, so it is destroyed before the file
// is opened for reading below.
SimpleDenseCoding coding(data);
TestSDC(data, coding);
FileWriter writer(kTestFile);
Freeze(coding, writer);
}
{
// Map a default-constructed coding onto the file contents and check
// that it decodes to the original bytes.
MmapReader reader(kTestFile);
SimpleDenseCoding coding;
Map(coding, reader.Data());
TestSDC(data, coding);
}
}

View file

@ -0,0 +1,38 @@
#include "testing/testing.hpp"
#include "coding/succinct_mapper.hpp"
#include "coding/writer.hpp"
#include "std/vector.hpp"
#include "3party/succinct/mapper.hpp"
using namespace coding;
// Reverse-maps a single uint64_t: 8 bytes must be consumed, the value
// read must be the byte-swapped one, and the source buffer itself must
// end up byte-swapped because ReverseMap rewrites it in place.
UNIT_TEST(ReverseMapper_Smoke)
{
uint64_t data = 0x0123456789abcdef;
uint64_t rdata = 0x0;
TEST_EQUAL(8, ReverseMap(rdata, reinterpret_cast<uint8_t *>(&data), "uint64_t"), ());
// Test that reversed uint64_t was read.
TEST_EQUAL(0xefcdab8967452301, rdata, ());
// Test that underlying buffer was modified.
TEST_EQUAL(0xefcdab8967452301, data, ());
}
// Freezes a single uint64_t into a memory buffer and maps it back:
// exactly 8 bytes must be written and read, and the value must
// survive the round trip unchanged.
UNIT_TEST(Freeze_Smoke)
{
  vector<uint8_t> data;

  {
    // The constant gets its own name so that it does not shadow the
    // output buffer |data|.
    uint64_t const original = 0x0123456789abcdef;
    MemWriter<decltype(data)> writer(data);
    Freeze(original, writer);
  }
  TEST_EQUAL(8, data.size(), ());

  uint64_t value = 0x0;
  TEST_EQUAL(8, Map(value, reinterpret_cast<uint8_t const *>(data.data())), ());
  TEST_EQUAL(0x0123456789abcdef, value, ());
}

View file

@ -0,0 +1,124 @@
#include "coding/simple_dense_coding.hpp"
#include "base/assert.hpp"
#include "std/algorithm.hpp"
#include "std/limits.hpp"
namespace coding
{
namespace
{
// A single codeword: |m_code| holds its bits and |m_length| the
// number of bits in it. A default-constructed Code has both fields
// set to zero.
struct Code
{
  Code() = default;

  uint8_t m_code = 0;
  uint8_t m_length = 0;
};
// Number of distinct byte values.
size_t const kAlphabetSize = static_cast<size_t>(numeric_limits<uint8_t>::max()) + 1;

// Codeword table indexed by rank, filled lazily by GetCodeTable().
Code g_codeTable[kAlphabetSize];
bool g_codeTableInitialized = false;

// Returns a pointer to the code table, filling it on the first call.
// The entry at rank r is the r-th codeword of the sequence
// 0, 1, 00, 01, 10, 11, 000, 001, ... i.e. the codewords are grouped
// by bit length and there are 2^length codewords of every length.
//
// NOTE: the "already initialized" flag is a plain bool which is read
// and written here without any synchronization.
Code const * GetCodeTable()
{
  if (!g_codeTableInitialized)
  {
    size_t rank = 0;
    for (unsigned length = 1; rank < kAlphabetSize; ++length)
    {
      // Ranks [groupBegin, groupEnd) share the bit length |length|;
      // the last group is cut short when the alphabet is exhausted.
      size_t const groupBegin = rank;
      size_t groupEnd = groupBegin + (static_cast<size_t>(1) << length);
      if (groupEnd > kAlphabetSize)
        groupEnd = kAlphabetSize;

      for (; rank < groupEnd; ++rank)
      {
        Code & entry = g_codeTable[rank];
        entry.m_code = rank - groupBegin;
        entry.m_length = length;
      }
    }
    g_codeTableInitialized = true;
  }
  return g_codeTable;
}
// Counts the occurrences of every byte value in |data|. |frequency|
// must point to at least kAlphabetSize counters; all of them are
// reset first, then frequency[s] is incremented once per occurrence
// of the symbol s.
void GetFrequences(vector<uint8_t> const & data, uint64_t frequency[])
{
  for (size_t symbol = 0; symbol < kAlphabetSize; ++symbol)
    frequency[symbol] = 0;

  for (size_t i = 0; i < data.size(); ++i)
    ++frequency[data[i]];
}
} // namespace
// Encodes |data|. Byte values are ranked by decreasing frequency and
// the value of rank r is encoded with the r-th entry of the code
// table. The codewords are concatenated into |m_bits|, |m_index| gets
// a set bit at the starting position of every codeword, and
// |m_symbols| stores the rank -> symbol mapping needed for decoding.
SimpleDenseCoding::SimpleDenseCoding(vector<uint8_t> const & data)
{
  // Maps symbols to frequences.
  uint64_t frequency[kAlphabetSize];
  GetFrequences(data, frequency);

  // Maps ranks to symbols: the identity permutation reordered so that
  // more frequent symbols come first.
  uint8_t symbols[kAlphabetSize];
  for (size_t i = 0; i < kAlphabetSize; ++i)
    symbols[i] = i;
  sort(symbols, symbols + kAlphabetSize, [&frequency](uint8_t lsym, uint8_t rsym)
  {
    return frequency[rsym] < frequency[lsym];
  });

  // Maps symbols to ranks: the inverse of |symbols|.
  uint8_t rank[kAlphabetSize];
  for (size_t r = 0; r < kAlphabetSize; ++r)
    rank[symbols[r]] = r;

  Code const * codeTable = GetCodeTable();
  ASSERT(codeTable, ());

  // Total length of the encoded string, accumulated rank by rank:
  // every occurrence of the symbol of rank r takes
  // codeTable[r].m_length bits.
  uint64_t bitLength = 0;
  for (size_t r = 0; r < kAlphabetSize; ++r)
    bitLength += frequency[symbols[r]] * codeTable[r].m_length;

  succinct::bit_vector_builder bitsBuilder;
  bitsBuilder.reserve(bitLength);
  vector<bool> indexBuilder(bitLength);

  size_t pos = 0;
  for (size_t i = 0; i < data.size(); ++i)
  {
    Code const & code = codeTable[rank[data[i]]];
    ASSERT_LESS(pos, bitLength, ());

    // Mark the first bit of the codeword, then emit the codeword.
    indexBuilder[pos] = true;
    bitsBuilder.append_bits(code.m_code, code.m_length);
    pos += code.m_length;
  }
  ASSERT_EQUAL(pos, bitLength, ());

  succinct::bit_vector(&bitsBuilder).swap(m_bits);
  succinct::rs_bit_vector(indexBuilder, true /* with_select_hints */).swap(m_index);
  m_symbols.assign(symbols);
}
// Returns the |i|-th byte of the encoded string; |i| must be less
// than Size(). The codeword occupies the bits between the i-th set
// bit of |m_index| and the next set bit (or the end of the index for
// the last element).
uint8_t SimpleDenseCoding::Get(uint64_t i) const
{
  ASSERT_LESS(i, Size(), ());

  uint64_t const start = m_index.select(i);
  uint64_t end;
  if (i + 1 == Size())
    end = m_index.size();
  else
    end = m_index.select(i + 1);
  ASSERT_LESS(start, end, ());

  uint8_t const length = static_cast<uint8_t>(end - start);
  ASSERT_LESS_OR_EQUAL(length, 8, ());

  uint8_t const code = m_bits.get_bits(start, length);

  // Codewords shorter than |length| bits take the first
  // 2 + 4 + ... + 2^(length - 1) = 2^length - 2 ranks, and |code| is
  // the offset inside the group of codewords of this length.
  uint8_t const rank = (1 << length) - 2 + code;
  return m_symbols[rank];
}
} // namespace coding

View file

@ -0,0 +1,58 @@
#pragma once
#include "std/vector.hpp"
#include "3party/succinct/bit_vector.hpp"
#include "3party/succinct/mappable_vector.hpp"
#include "3party/succinct/rs_bit_vector.hpp"
namespace coding
{
// This class represents the so-called simple dense coding for byte
// strings. It can be used when it's necessary to compress strings
// with a skewed symbol distribution while still providing efficient
// access to the string's elements.
//
// The main idea is to assign codewords from the set { 0, 1, 00, 01,
// 10, 11, 000, ... } to string's symbols in accordance with their
// frequencies and to create a helper bit-vector for starting
// positions of the codewords in compressed string.
//
// Memory complexity: 2 * n * (H_0(T) + 1) bits for a text T, but note
// that this is an upper bound and too pessimistic.
// Time complexity: O(log(n * H_0(T))) to access the i-th element of
// the string, because of the logarithmic complexity of
// rs_bit_vector::select. This will be fixed once RRR is
// implemented.
//
// For details, see Kimmo Fredriksson, Fedor Nikitin, "Simple Random
// Access Compression", Fundamenta Informaticae 2009,
// http://www.cs.uku.fi/~fredriks/pub/papers/fi09.pdf.
class SimpleDenseCoding
{
public:
  // Creates an empty coding; its contents can be supplied later
  // through map().
  SimpleDenseCoding() = default;

  // Encodes the byte string |data|.
  SimpleDenseCoding(vector<uint8_t> const & data);

  // Returns the |i|-th byte of the encoded string.
  uint8_t Get(uint64_t i) const;

  // Returns the number of encoded bytes, which equals the number of
  // set bits in the index of codeword starting positions.
  uint64_t Size() const { return m_index.num_ones(); }

  // Passes every member to |visitor| together with its name. The
  // lower-case name (map instead of Map) is kept for compatibility
  // with succinct structures.
  template <typename TVisitor>
  void map(TVisitor & visitor)
  {
    visitor(m_bits, "m_bits");
    visitor(m_index, "m_index");
    visitor(m_symbols, "m_symbols");
  }

private:
  // Concatenated codewords.
  succinct::bit_vector m_bits;
  // Bit vector with set bits at the starting positions of codewords.
  succinct::rs_bit_vector m_index;
  // Maps ranks to symbols.
  succinct::mapper::mappable_vector<uint8_t> m_symbols;
};
} // namespace coding

196
coding/succinct_mapper.hpp Normal file
View file

@ -0,0 +1,196 @@
#pragma once
#include "coding/endianness.hpp"
#include "std/type_traits.hpp"
#include "3party/succinct/mappable_vector.hpp"
#include "3party/succinct/mapper.hpp"
namespace coding
{
// Rounds |ptr| up to the nearest address that is a multiple of 8. An
// already aligned pointer is returned unchanged.
template <typename T>
static T * Align8Ptr(T * ptr)
{
  uint64_t const kMask = 0x7;
  uint64_t const address = reinterpret_cast<uint64_t>(ptr);
  return reinterpret_cast<T *>((address + kMask) & ~kMask);
}
// Returns the number of padding bytes, in the range [0, 7], that must
// be appended after |written| bytes to make the total a multiple of
// 8. Zero is returned when |written| is already a multiple of 8, so
// callers do not have to special-case a result of 8.
inline uint32_t NeedToAlign8(uint64_t written)
{
  return static_cast<uint32_t>((0x8 - (written & 0x7)) & 0x7);
}
// Visitor that maps a structure onto a read-only serialized image
// starting at |base|. POD values are copied out of the image, while
// mappable vectors are pointed directly into it without copying their
// elements. After every value the cursor is advanced to the next
// 8-byte aligned address.
class MapVisitor
{
public:
  MapVisitor(uint8_t const * base) : m_base(base), m_cur(base) {}

  // Compound (non-POD) values are traversed through their own map().
  template <typename T>
  typename enable_if<!is_pod<T>::value, MapVisitor &>::type operator()(
      T & val, char const * /* friendlyName */)
  {
    val.map(*this);
    return *this;
  }

  // POD values are read from the current position of the image.
  template <typename T>
  typename enable_if<is_pod<T>::value, MapVisitor &>::type operator()(
      T & val, char const * /* friendlyName */)
  {
    val = *reinterpret_cast<T const *>(m_cur);
    m_cur = Align8Ptr(m_cur + sizeof(T));
    return *this;
  }

  // Vectors are stored as the element count followed by the elements;
  // |vec| is made to reference the elements inside the image.
  template <typename T>
  MapVisitor & operator()(succinct::mapper::mappable_vector<T> & vec,
                          char const * /* friendlyName */)
  {
    vec.clear();
    (*this)(vec.m_size, "size");

    vec.m_data = reinterpret_cast<const T *>(m_cur);
    size_t const payloadSize = vec.m_size * sizeof(T);
    m_cur = Align8Ptr(m_cur + payloadSize);
    return *this;
  }

  // Distance between the current position and the start of the image.
  size_t BytesRead() const { return static_cast<size_t>(m_cur - m_base); }

private:
  uint8_t const * const m_base;
  uint8_t const * m_cur;
};
// Visitor that maps a structure onto a writable serialized image
// starting at |base| while reversing the byte order of every value.
// Values are byte-swapped in place, so the image itself is modified
// by the traversal. After every value the cursor is advanced to the
// next 8-byte aligned address.
class ReverseMapVisitor
{
public:
  ReverseMapVisitor(uint8_t * base) : m_base(base), m_cur(m_base) {}

  // Compound (non-POD) values are traversed through their own map().
  template <typename T>
  typename enable_if<!is_pod<T>::value, ReverseMapVisitor &>::type operator()(
      T & val, char const * /* friendlyName */)
  {
    val.map(*this);
    return *this;
  }

  // POD values are byte-swapped inside the image and then copied out.
  template <typename T>
  typename enable_if<is_pod<T>::value, ReverseMapVisitor &>::type operator()(
      T & val, char const * /* friendlyName */)
  {
    T * valPtr = reinterpret_cast<T *>(m_cur);
    *valPtr = ReverseByteOrder(*valPtr);
    val = *valPtr;

    m_cur += sizeof(T);
    m_cur = Align8Ptr(m_cur);
    return *this;
  }

  // Vectors are stored as the element count followed by the elements.
  // Every element is byte-swapped inside the image and |vec| is made
  // to reference the swapped elements.
  template <typename T>
  ReverseMapVisitor & operator()(succinct::mapper::mappable_vector<T> & vec,
                                 char const * /* friendlyName */)
  {
    vec.clear();
    (*this)(vec.m_size, "size");

    // The elements are swapped through a mutable pointer into the
    // image: the vector only exposes read-only access to its
    // elements, so they cannot be modified through the vector itself.
    T * data = reinterpret_cast<T *>(m_cur);
    for (uint64_t i = 0; i < vec.m_size; ++i)
      data[i] = ReverseByteOrder(data[i]);
    vec.m_data = data;

    size_t const bytes = vec.m_size * sizeof(T);
    m_cur += bytes;
    m_cur = Align8Ptr(m_cur);
    return *this;
  }

  // Distance between the current position and the start of the image.
  size_t BytesRead() const { return static_cast<size_t>(m_cur - m_base); }

private:
  uint8_t * m_base;
  uint8_t * m_cur;
};
// Visitor that serializes a structure to |writer|. POD values are
// written as their raw bytes and vectors as the element count followed
// by the raw bytes of the elements. Zero bytes are appended after
// every value so that the number of bytes written stays a multiple of
// 8.
template <typename TWriter>
class FreezeVisitor
{
public:
  FreezeVisitor(TWriter & writer) : m_writer(writer), m_written(0) {}

  // Compound (non-POD) values are traversed through their own map().
  template <typename T>
  typename enable_if<!is_pod<T>::value, FreezeVisitor &>::type operator()(
      T & val, char const * /* friendlyName */)
  {
    val.map(*this);
    return *this;
  }

  template <typename T>
  typename enable_if<is_pod<T>::value, FreezeVisitor &>::type operator()(
      T & val, char const * /* friendlyName */)
  {
    WriteBlock(reinterpret_cast<void const *>(&val), sizeof(T));
    return *this;
  }

  template <typename T>
  FreezeVisitor & operator()(succinct::mapper::mappable_vector<T> & vec,
                             char const * /* friendlyName */)
  {
    (*this)(vec.m_size, "size");
    WriteBlock(vec.m_data, static_cast<size_t>(vec.m_size * sizeof(T)));
    return *this;
  }

  // Total number of bytes written so far, padding included.
  size_t Written() const { return m_written; }

private:
  // Writes |size| bytes starting at |data|, followed by the padding.
  void WriteBlock(void const * data, size_t size)
  {
    m_writer.Write(data, size);
    m_written += size;
    WritePadding();
  }

  // Appends zero bytes until the number of written bytes is a
  // multiple of 8. Nothing is written when no padding is needed.
  void WritePadding()
  {
    uint32_t const padding = NeedToAlign8(m_written);
    if (padding == 0 || padding >= 8)
      return;

    static uint64_t const zero = 0;
    m_writer.Write(reinterpret_cast<void const *>(&zero), padding);
    m_written += padding;
  }

  TWriter & m_writer;
  uint64_t m_written;
};
template <typename T>
size_t Map(T & value, uint8_t const * base, char const * friendlyName = "<TOP>")
{
MapVisitor visitor(base);
visitor(value, friendlyName);
return visitor.BytesRead();
}
template <typename T>
size_t ReverseMap(T & value, uint8_t * base, char const * friendlyName = "<TOP>")
{
ReverseMapVisitor visitor(base);
visitor(value, friendlyName);
return visitor.BytesRead();
}
// Serializes |val| to |writer| and returns the number of bytes
// written, padding included.
template <typename T, typename TWriter>
size_t Freeze(T & val, TWriter & writer, char const * friendlyName = "<TOP>")
{
  return FreezeVisitor<TWriter>(writer)(val, friendlyName).Written();
}
} // namespace coding