diff --git a/3party/succinct/mappable_vector.hpp b/3party/succinct/mappable_vector.hpp
index 0c1c9a76ea..0f45f427ec 100644
--- a/3party/succinct/mappable_vector.hpp
+++ b/3party/succinct/mappable_vector.hpp
@@ -13,6 +13,14 @@
 
 #include "intrinsics.hpp"
 
+namespace coding
+{
+template <typename TWriter>
+class FreezeVisitor;
+class MapVisitor;
+class ReverseMapVisitor;
+}
+
 namespace succinct { namespace mapper {
 
     namespace detail {
@@ -115,6 +123,11 @@ namespace succinct { namespace mapper {
         friend class detail::map_visitor;
         friend class detail::sizeof_visitor;
 
+        template <typename TWriter>
+        friend class coding::FreezeVisitor;
+        friend class coding::MapVisitor;
+        friend class coding::ReverseMapVisitor;
+
     protected:
         const T* m_data;
         uint64_t m_size;
diff --git a/coding/coding.pro b/coding/coding.pro
index 9f8adce637..71b5bdee05 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -30,6 +30,7 @@ SOURCES += \
     reader.cpp \
     reader_streambuf.cpp \
     reader_writer_ops.cpp \
+    simple_dense_coding.cpp \
     sha2.cpp \
     uri.cpp \
 #    varint_vector.cpp \
@@ -80,10 +81,12 @@ HEADERS += \
     reader_streambuf.hpp \
     reader_wrapper.hpp \
     reader_writer_ops.hpp \
+    simple_dense_coding.hpp \
     sha2.hpp \
     streams.hpp \
     streams_common.hpp \
     streams_sink.hpp \
+    succinct_mapper.hpp \
     succinct_trie.hpp \
     succinct_trie_builder.hpp \
     succinct_trie_reader.hpp \
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro
index c055b04b88..3cffa4501f 100644
--- a/coding/coding_tests/coding_tests.pro
+++ b/coding/coding_tests/coding_tests.pro
@@ -35,7 +35,9 @@ SOURCES += ../../testing/testingmain.cpp \
            reader_cache_test.cpp \
            reader_test.cpp \
            reader_writer_ops_test.cpp \
+           simple_dense_coding_test.cpp \
            sha2_test.cpp \
+           succinct_mapper_test.cpp \
            succinct_trie_test.cpp \
            trie_test.cpp \
            uri_test.cpp \
diff --git a/coding/coding_tests/simple_dense_coding_test.cpp b/coding/coding_tests/simple_dense_coding_test.cpp
new file mode 100644
index 0000000000..79bbd1f880
--- /dev/null
+++ b/coding/coding_tests/simple_dense_coding_test.cpp
@@ -0,0 +1,53 @@
+#include "testing/testing.hpp"
+
+#include "coding/file_writer.hpp"
+#include "coding/mmap_reader.hpp"
+#include "coding/simple_dense_coding.hpp"
+#include "coding/succinct_mapper.hpp"
+
+#include "base/logging.hpp"
+#include "base/scope_guard.hpp"
+
+#include "std/limits.hpp"
+#include "std/string.hpp"
+#include "std/vector.hpp"
+
+using namespace coding;
+
+namespace
+{
+// Checks that |coding| has the same size and the same symbols as |data|.
+void TestSDC(vector<uint8_t> const & data, SimpleDenseCoding const & coding)
+{
+  TEST_EQUAL(data.size(), coding.Size(), ());
+  for (size_t i = 0; i < data.size(); ++i)
+    TEST_EQUAL(data[i], coding.Get(i), ());
+}
+}  // namespace
+
+UNIT_TEST(SimpleDenseCoding_Smoke)
+{
+  size_t const kSize = numeric_limits<uint8_t>::max();
+  vector<uint8_t> data(kSize);
+  for (size_t i = 0; i < data.size(); ++i)
+    data[i] = static_cast<uint8_t>(i);
+
+  string const kTestFile = "test.tmp";
+  MY_SCOPE_GUARD(cleanup, bind(&FileWriter::DeleteFileX, kTestFile));
+
+  // Encode in memory, check, then freeze the structure to the file.
+  {
+    SimpleDenseCoding coding(data);
+    TestSDC(data, coding);
+    FileWriter writer(kTestFile);
+    Freeze(coding, writer);
+  }
+
+  // Map the frozen structure back from the file and check it again.
+  {
+    MmapReader reader(kTestFile);
+    SimpleDenseCoding coding;
+    Map(coding, reader.Data());
+    TestSDC(data, coding);
+  }
+}
diff --git a/coding/coding_tests/succinct_mapper_test.cpp b/coding/coding_tests/succinct_mapper_test.cpp
new file mode 100644
index 0000000000..97dec21539
--- /dev/null
+++ b/coding/coding_tests/succinct_mapper_test.cpp
@@ -0,0 +1,38 @@
+#include "testing/testing.hpp"
+
+#include "coding/succinct_mapper.hpp"
+#include "coding/writer.hpp"
+
+#include "std/vector.hpp"
+
+#include "3party/succinct/mapper.hpp"
+
+using namespace coding;
+
+UNIT_TEST(ReverseMapper_Smoke)
+{
+  uint64_t data = 0x0123456789abcdef;
+  uint64_t rdata = 0x0;
+  TEST_EQUAL(8, ReverseMap(rdata, reinterpret_cast<uint8_t *>(&data), "uint64_t"), ());
+
+  // Test that reversed uint64_t was read.
+  TEST_EQUAL(0xefcdab8967452301, rdata, ());
+
+  // Test that underlying buffer was modified.
+  TEST_EQUAL(0xefcdab8967452301, data, ());
+}
+
+UNIT_TEST(Freeze_Smoke)
+{
+  vector<uint8_t> data;
+  {
+    MemWriter<vector<uint8_t>> writer(data);
+    uint64_t const kValue = 0x0123456789abcdef;
+    Freeze(kValue, writer);
+  }
+  TEST_EQUAL(8, data.size(), ());
+
+  uint64_t value = 0x0;
+  TEST_EQUAL(8, Map(value, reinterpret_cast<uint8_t const *>(data.data())), ());
+  TEST_EQUAL(0x0123456789abcdef, value, ());
+}
diff --git a/coding/simple_dense_coding.cpp b/coding/simple_dense_coding.cpp
new file mode 100644
index 0000000000..f2e4acfb95
--- /dev/null
+++ b/coding/simple_dense_coding.cpp
@@ -0,0 +1,130 @@
+#include "coding/simple_dense_coding.hpp"
+
+#include "base/assert.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/limits.hpp"
+
+namespace coding
+{
+namespace
+{
+// A code word: |m_code| is its value, |m_length| is its length in bits.
+struct Code
+{
+  Code() : m_code(0), m_length(0) {}
+
+  uint8_t m_code;
+  uint8_t m_length;
+};
+
+size_t const kAlphabetSize = static_cast<size_t>(numeric_limits<uint8_t>::max()) + 1;
+
+// Code words indexed by rank, in the following order: 0, 1, 00, 01,
+// 10, 11, 000, 001, ...
+struct CodeTable
+{
+  CodeTable()
+  {
+    unsigned length = 1;
+    size_t rank = 0;
+    while (rank < kAlphabetSize)
+    {
+      // Number of codes with the same bit length.
+      size_t const numCodes = static_cast<size_t>(1) << length;
+
+      size_t const base = rank;
+      while (rank - base < numCodes && rank < kAlphabetSize)
+      {
+        m_codes[rank].m_code = static_cast<uint8_t>(rank - base);
+        m_codes[rank].m_length = static_cast<uint8_t>(length);
+        ++rank;
+      }
+
+      ++length;
+    }
+  }
+
+  Code m_codes[kAlphabetSize];
+};
+
+// Returns pointer to the code table. The table is a function-local
+// static, so it is built on the first call and, as of C++11, that
+// initialization is safe even if several threads get here at once.
+Code const * GetCodeTable()
+{
+  static CodeTable const table;
+  return table.m_codes;
+}
+
+// Computes frequencies for data symbols.
+void GetFrequences(vector<uint8_t> const & data, uint64_t frequency[])
+{
+  std::fill(frequency, frequency + kAlphabetSize, 0);
+  for (uint8_t symbol : data)
+    ++frequency[symbol];
+}
+}  // namespace
+
+SimpleDenseCoding::SimpleDenseCoding(vector<uint8_t> const & data)
+{
+  uint64_t frequency[kAlphabetSize];  // Maps symbols to frequencies.
+  GetFrequences(data, frequency);
+
+  uint8_t symbols[kAlphabetSize];  // Maps ranks to symbols.
+  uint8_t rank[kAlphabetSize];     // Maps symbols to ranks.
+  // Rank symbols by decreasing frequency, so that frequent symbols get short codes.
+  for (size_t i = 0; i < kAlphabetSize; ++i)
+    symbols[i] = static_cast<uint8_t>(i);
+  sort(symbols, symbols + kAlphabetSize, [&frequency](uint8_t lsym, uint8_t rsym)
+  {
+    return frequency[lsym] > frequency[rsym];
+  });
+  for (size_t r = 0; r < kAlphabetSize; ++r)
+    rank[symbols[r]] = static_cast<uint8_t>(r);
+
+  Code const * codeTable = GetCodeTable();
+  ASSERT(codeTable, ());
+  // Total size of the encoded data in bits.
+  uint64_t bitLength = 0;
+  for (size_t symbol = 0; symbol < kAlphabetSize; ++symbol)
+    bitLength += static_cast<uint64_t>(frequency[symbol]) * codeTable[rank[symbol]].m_length;
+
+  succinct::bit_vector_builder bitsBuilder;
+  bitsBuilder.reserve(bitLength);
+  vector<bool> indexBuilder(bitLength);
+  size_t pos = 0;
+  {
+    for (uint8_t symbol : data)
+    {
+      Code const & code = codeTable[rank[symbol]];
+      ASSERT_LESS(pos, bitLength, ());
+      indexBuilder[pos] = 1;  // Marks the first bit of the code word.
+
+      bitsBuilder.append_bits(code.m_code, code.m_length);
+      pos += code.m_length;
+    }
+  }
+  ASSERT_EQUAL(pos, bitLength, ());
+
+  succinct::bit_vector(&bitsBuilder).swap(m_bits);
+  succinct::rs_bit_vector(indexBuilder, true /* with_select_hints */).swap(m_index);
+  m_symbols.assign(symbols);
+}
+
+uint8_t SimpleDenseCoding::Get(uint64_t i) const
+{
+  ASSERT_LESS(i, Size(), ());
+  uint64_t const start = m_index.select(i);
+  uint64_t const end = i + 1 == Size() ? m_index.size() : m_index.select(i + 1);
+  // The code word of the i-th symbol occupies bits [start, end).
+  ASSERT_LESS(start, end, ());
+
+  uint8_t const length = static_cast<uint8_t>(end - start);
+  ASSERT_LESS_OR_EQUAL(length, 8, ());
+  // Code words of the same length have consecutive ranks starting at 2^length - 2.
+  uint8_t const code = m_bits.get_bits(start, length);
+  uint8_t const rank = static_cast<uint8_t>((1 << length) - 2 + code);
+  return m_symbols[rank];
+}
+}  // namespace coding
diff --git a/coding/simple_dense_coding.hpp b/coding/simple_dense_coding.hpp
new file mode 100644
index 0000000000..f6ee0f438c
--- /dev/null
+++ b/coding/simple_dense_coding.hpp
@@ -0,0 +1,58 @@
+#pragma once
+
+#include "std/vector.hpp"
+
+#include "3party/succinct/bit_vector.hpp"
+#include "3party/succinct/mappable_vector.hpp"
+#include "3party/succinct/rs_bit_vector.hpp"
+
+namespace coding
+{
+// This class represents so-called simple dense coding for byte
+// strings. It can be used when it's necessary to compress strings
+// with skewed entropy and nevertheless efficient access to the
+// string's elements is needed.
+//
+// The main idea is to assign codewords from the set { 0, 1, 00, 01,
+// 10, 11, 000, ... } to string's symbols in accordance with their
+// frequencies and to create a helper bit-vector for starting
+// positions of the codewords in compressed string.
+//
+// Memory complexity: 2 * n * (H_0(T) + 1) bits for a text T, but note
+// that this is an upper bound and too pessimistic.
+//
+// Time complexity: O(log(n * H_0(T))) to access i-th element of the
+// string, because of logarithmic complexity of
+// rs_bit_vector::select. This will be fixed once RRR is
+// implemented.
+//
+// For details, see Kimmo Fredriksson, Fedor Nikitin, "Simple Random
+// Access Compression", Fundamenta Informaticae 2009,
+// http://www.cs.uku.fi/~fredriks/pub/papers/fi09.pdf.
+class SimpleDenseCoding
+{
+public:
+  SimpleDenseCoding() = default;
+
+  SimpleDenseCoding(vector<uint8_t> const & data);
+
+  uint8_t Get(uint64_t i) const;  // Returns the i-th symbol, i < Size().
+
+  inline uint64_t Size() const { return m_index.num_ones(); }
+
+  // map is used here (instead of Map) for compatibility with succinct
+  // structures.
+  template <typename TVisitor>
+  void map(TVisitor & visitor)
+  {
+    visitor(m_bits, "m_bits");
+    visitor(m_index, "m_index");
+    visitor(m_symbols, "m_symbols");
+  }
+
+private:
+  succinct::bit_vector m_bits;                           // Concatenated code words.
+  succinct::rs_bit_vector m_index;                       // Ones mark code word starts.
+  succinct::mapper::mappable_vector<uint8_t> m_symbols;  // Maps ranks to symbols.
+};
+}  // namespace coding
diff --git a/coding/succinct_mapper.hpp b/coding/succinct_mapper.hpp
new file mode 100644
index 0000000000..3800f3fea0
--- /dev/null
+++ b/coding/succinct_mapper.hpp
@@ -0,0 +1,223 @@
+#pragma once
+
+#include "coding/endianness.hpp"
+
+#include "std/type_traits.hpp"
+
+#include "3party/succinct/mappable_vector.hpp"
+#include "3party/succinct/mapper.hpp"
+
+namespace coding
+{
+// Rounds |ptr| up to the nearest address that is a multiple of 8.
+template <typename T>
+static T * Align8Ptr(T * ptr)
+{
+  uint64_t value = (reinterpret_cast<uint64_t>(ptr) + 0x7) & 0xfffffffffffffff8;
+  return reinterpret_cast<T *>(value);
+}
+
+// Returns the number of bytes between |written| and the next multiple
+// of 8 that is strictly greater than |written|, i.e. a value in
+// [1, 8]. Note that 8 (not 0) is returned for an aligned |written|.
+inline uint32_t NeedToAlign8(uint64_t written) { return 0x8 - (written & 0x7); }
+
+// Visitor that restores an object from a memory image starting at
+// |base|. PODs are copied out of the image, mappable vectors are
+// pointed into it, so the image must outlive the mapped object.
+// NOTE: the cursor is aligned by absolute address (Align8Ptr) while
+// FreezeVisitor pads by stream offset, so the two layouts agree only
+// when |base| itself is 8-byte aligned.
+class MapVisitor
+{
+public:
+  MapVisitor(uint8_t const * base) : m_base(base), m_cur(m_base) {}
+
+  template <typename T>
+  typename enable_if<!is_pod<T>::value, MapVisitor &>::type operator()(
+      T & val, char const * /* friendlyName */)
+  {
+    val.map(*this);
+    return *this;
+  }
+
+  template <typename T>
+  typename enable_if<is_pod<T>::value, MapVisitor &>::type operator()(
+      T & val, char const * /* friendlyName */)
+  {
+    T const * valPtr = reinterpret_cast<T const *>(m_cur);
+    val = *valPtr;
+    m_cur += sizeof(T);
+
+    m_cur = Align8Ptr(m_cur);
+    return *this;
+  }
+
+  template <typename T>
+  MapVisitor & operator()(succinct::mapper::mappable_vector<T> & vec,
+                          char const * /* friendlyName */)
+  {
+    vec.clear();
+    (*this)(vec.m_size, "size");
+
+    vec.m_data = reinterpret_cast<T const *>(m_cur);
+    size_t const bytes = vec.m_size * sizeof(T);
+
+    m_cur += bytes;
+    m_cur = Align8Ptr(m_cur);
+    return *this;
+  }
+
+  size_t BytesRead() const { return static_cast<size_t>(m_cur - m_base); }
+
+private:
+  uint8_t const * const m_base;
+  uint8_t const * m_cur;
+};
+
+// Same as MapVisitor, but for images written with the opposite byte
+// order: every POD and every vector element is byte-swapped in place
+// inside the buffer before it is used, so the buffer must be writable.
+class ReverseMapVisitor
+{
+public:
+  ReverseMapVisitor(uint8_t * base) : m_base(base), m_cur(m_base) {}
+
+  template <typename T>
+  typename enable_if<!is_pod<T>::value, ReverseMapVisitor &>::type operator()(
+      T & val, char const * /* friendlyName */)
+  {
+    val.map(*this);
+    return *this;
+  }
+
+  template <typename T>
+  typename enable_if<is_pod<T>::value, ReverseMapVisitor &>::type operator()(
+      T & val, char const * /* friendlyName */)
+  {
+    T * valPtr = reinterpret_cast<T *>(m_cur);
+    *valPtr = ReverseByteOrder(*valPtr);
+    val = *valPtr;
+    m_cur += sizeof(T);
+
+    m_cur = Align8Ptr(m_cur);
+    return *this;
+  }
+
+  template <typename T>
+  ReverseMapVisitor & operator()(succinct::mapper::mappable_vector<T> & vec,
+                                 char const * /* friendlyName */)
+  {
+    vec.clear();
+    (*this)(vec.m_size, "size");
+
+    // Swap elements through a mutable pointer into the buffer:
+    // mappable_vector keeps its data as a pointer to const (m_data).
+    T * data = reinterpret_cast<T *>(m_cur);
+    for (uint64_t i = 0; i < vec.m_size; ++i)
+      data[i] = ReverseByteOrder(data[i]);
+    vec.m_data = data;
+    size_t const bytes = vec.m_size * sizeof(T);
+
+    m_cur += bytes;
+    m_cur = Align8Ptr(m_cur);
+    return *this;
+  }
+
+  size_t BytesRead() const { return static_cast<size_t>(m_cur - m_base); }
+
+private:
+  uint8_t * const m_base;
+  uint8_t * m_cur;
+};
+
+// Visitor that serializes an object to |writer| in the layout read
+// back by MapVisitor: every POD and every vector payload is followed
+// by zero padding up to the next multiple of 8 written bytes.
+template <class TWriter>
+class FreezeVisitor
+{
+public:
+  FreezeVisitor(TWriter & writer) : m_writer(writer), m_written(0) {}
+
+  template <typename T>
+  typename enable_if<!is_pod<T>::value, FreezeVisitor &>::type operator()(
+      T & val, char const * /* friendlyName */)
+  {
+    val.map(*this);
+    return *this;
+  }
+
+  template <typename T>
+  typename enable_if<is_pod<T>::value, FreezeVisitor &>::type operator()(
+      T & val, char const * /* friendlyName */)
+  {
+    m_writer.Write(reinterpret_cast<void const *>(&val), sizeof(T));
+    m_written += sizeof(T);
+    WritePadding();
+    return *this;
+  }
+
+  template <typename T>
+  FreezeVisitor & operator()(succinct::mapper::mappable_vector<T> & vec,
+                             char const * /* friendlyName */)
+  {
+    (*this)(vec.m_size, "size");
+
+    size_t const bytes = static_cast<size_t>(vec.m_size * sizeof(T));
+    m_writer.Write(vec.m_data, bytes);
+    m_written += bytes;
+    WritePadding();
+    return *this;
+  }
+
+  size_t Written() const { return m_written; }
+
+private:
+  // Pads the output with zero bytes up to the next multiple of 8.
+  void WritePadding()
+  {
+    uint32_t const padding = NeedToAlign8(m_written);
+    static uint64_t const zero = 0;
+    // NeedToAlign8() yields 8 when already aligned: nothing to write.
+    if (padding > 0 && padding < 8)
+    {
+      m_writer.Write(reinterpret_cast<void const *>(&zero), padding);
+      m_written += padding;
+    }
+  }
+
+  TWriter & m_writer;
+  uint64_t m_written;
+};
+
+// Maps |value| onto the memory image at |base| (see MapVisitor).
+// Returns the number of bytes consumed, including alignment padding.
+template <typename T>
+size_t Map(T & value, uint8_t const * base, char const * friendlyName = "")
+{
+  MapVisitor visitor(base);
+  visitor(value, friendlyName);
+  return visitor.BytesRead();
+}
+
+// Same as Map(), but byte-swaps the image in place (see
+// ReverseMapVisitor). Returns the number of bytes consumed.
+template <typename T>
+size_t ReverseMap(T & value, uint8_t * base, char const * friendlyName = "")
+{
+  ReverseMapVisitor visitor(base);
+  visitor(value, friendlyName);
+  return visitor.BytesRead();
+}
+
+// Serializes |val| to |writer| (see FreezeVisitor). Returns the
+// number of bytes written, including alignment padding.
+template <typename T, typename TWriter>
+size_t Freeze(T & val, TWriter & writer, char const * friendlyName = "")
+{
+  FreezeVisitor<TWriter> visitor(writer);
+  visitor(val, friendlyName);
+  return visitor.Written();
+}
+}  // namespace coding