Add FixedBitsDDVector, a reader-backed vector of fixed-width values with an exceptions table.

- BitWriter: Write() takes the bit count as uint8_t; new WriteAtMost32Bits() for wider values.
- The bit reader's Read() takes the bit count as uint8_t.
- std/random.hpp exports std::default_random_engine.
- Register the new header and its unit test in the qmake project files.

Note: the "index" lines keep the blob ids of the original commit.

diff --git a/coding/bit_streams.hpp b/coding/bit_streams.hpp
index 5bc00a9419..702e31145d 100644
--- a/coding/bit_streams.hpp
+++ b/coding/bit_streams.hpp
@@ -1,11 +1,13 @@
 #pragma once
 
-#include "std/cstdint.hpp"
-#include "std/limits.hpp"
-
 #include "base/assert.hpp"
 #include "base/logging.hpp"
 
+#include "std/algorithm.hpp"
+#include "std/cstdint.hpp"
+#include "std/limits.hpp"
+
+
 namespace
 {
 uint8_t const kByteMask = 0xFF;
@@ -39,7 +41,7 @@ public:
   // Writes n bits starting with the least significant bit.
   // They are written one byte at a time so endianness is of no concern.
   // All the other bits except for the first n must be set to zero.
-  void Write(uint8_t bits, uint32_t n)
+  void Write(uint8_t bits, uint8_t n)
   {
     if (n == 0)
       return;
@@ -68,6 +70,17 @@ public:
     }
   }
 
+  // Same as Write but accepts up to 32 bits; they are emitted one byte at a time, lowest first.
+  void WriteAtMost32Bits(uint32_t bits, uint8_t n)
+  {
+    ASSERT_LESS_OR_EQUAL(n, 32, ());
+
+    uint8_t constexpr kMinBits = CHAR_BIT;
+    Write(static_cast<uint8_t>(bits), min(n, kMinBits));
+    if (n > kMinBits)
+      WriteAtMost32Bits(bits >> kMinBits, n - kMinBits);
+  }
+
 private:
   // Writes up to CHAR_BIT-1 last bits if they have not been written yet
   // and pads them with zeros.
@@ -96,7 +109,7 @@ public:
   // The underlying m_src is supposed to be byte-aligned (which is the
   // case when it reads from the place that was written to using BitWriter).
   // Read may use one lookahead byte.
-  uint8_t Read(uint32_t n)
+  uint8_t Read(uint8_t n)
   {
     if (n == 0)
       return 0;
diff --git a/coding/coding.pro b/coding/coding.pro
index adf2dd5515..44c6581d70 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -67,10 +67,12 @@ HEADERS += \
     file_sort.hpp \
     file_writer.hpp \
     file_writer_stream.hpp \
+    fixed_bits_ddvector.hpp \
     hex.hpp \
     huffman.hpp \
     internal/file64_api.hpp \
     internal/file_data.hpp \
+    internal/xmlparser.hpp \
     matrix_traversal.hpp \
     mmap_reader.hpp \
     multilang_utf8_string.hpp \
@@ -101,4 +103,3 @@ HEADERS += \
     writer.hpp \
     zip_creator.hpp \
     zip_reader.hpp \
-    internal/xmlparser.hpp \
diff --git a/coding/coding_tests/bit_streams_test.cpp b/coding/coding_tests/bit_streams_test.cpp
index b2426e8c73..ac60178ecb 100644
--- a/coding/coding_tests/bit_streams_test.cpp
+++ b/coding/coding_tests/bit_streams_test.cpp
@@ -44,3 +44,21 @@ UNIT_TEST(BitStreams_Smoke)
     TEST_EQUAL(num, nums[i].first, (i));
   }
 }
+
+UNIT_TEST(BitStreams_T1)
+{
+  using TBuffer = vector<uint8_t>;
+  using TWriter = MemWriter<TBuffer>;
+
+  TBuffer buf;
+  {
+    TWriter w(buf);
+    BitWriter<TWriter> bits(w);
+
+    bits.Write(0, 3);
+    bits.Write(3, 3);
+    bits.Write(6, 3);
+  }
+
+  TEST_EQUAL(buf.size(), 2, ());
+}
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro
index adb3676b54..9e4f3996e2 100644
--- a/coding/coding_tests/coding_tests.pro
+++ b/coding/coding_tests/coding_tests.pro
@@ -27,6 +27,7 @@ SOURCES += ../../testing/testingmain.cpp \
     file_data_test.cpp \
     file_sort_test.cpp \
     file_utils_test.cpp \
+    fixed_bits_ddvector_test.cpp \
     hex_test.cpp \
     huffman_test.cpp \
     mem_file_reader_test.cpp \
diff --git a/coding/coding_tests/fixed_bits_ddvector_test.cpp b/coding/coding_tests/fixed_bits_ddvector_test.cpp
new file mode 100644
index 0000000000..4a8a998952
--- /dev/null
+++ b/coding/coding_tests/fixed_bits_ddvector_test.cpp
@@ -0,0 +1,74 @@
+#include "testing/testing.hpp"
+
+#include "coding/fixed_bits_ddvector.hpp"
+#include "coding/writer.hpp"
+
+#include "std/initializer_list.hpp"
+#include "std/random.hpp"
+
+
+namespace
+{
+
+template <size_t Bits> void TestWithData(vector<uint32_t> const & lst)
+{
+  using TVector = FixedBitsDDVector<Bits, MemReader>;
+  using TBuffer = vector<uint8_t>;
+  using TWriter = MemWriter<TBuffer>;
+
+  TBuffer buf;
+  {
+    TWriter writer(buf);
+    typename TVector::template Builder<TWriter> builder(writer);
+
+    uint32_t optCount = 0;
+    uint32_t const optBound = (1 << Bits) - 1;
+
+    for (uint32_t v : lst)
+    {
+      if (v < optBound)
+        ++optCount;
+
+      builder.PushBack(v);
+    }
+
+    pair<uint32_t, uint32_t> exp(optCount, lst.size());
+    TEST_EQUAL(builder.GetCount(), exp, ());
+  }
+
+  MemReader reader(buf.data(), buf.size());
+  auto const vec = TVector::Create(reader);
+
+  size_t i = 0;
+  for (uint32_t v : lst)
+    TEST_EQUAL(vec->Get(i++), v, ());
+}
+
+} // namespace
+
+UNIT_TEST(FixedBitsDDVector_Smoke)
+{
+  TestWithData<3>({0, 3, 6});
+  TestWithData<3>({7, 20, 50});
+  TestWithData<3>({1, 0, 4, 30, 5, 3, 6, 7, 2, 8, 0});
+}
+
+UNIT_TEST(FixedBitsDDVector_Rand)
+{
+  vector<uint32_t> v;
+
+  default_random_engine gen;
+  uniform_int_distribution<uint32_t> distribution(0, 1000);
+
+  size_t constexpr kMaxCount = 1000;
+  for (size_t i = 0; i < kMaxCount; ++i)
+    v.push_back(distribution(gen));
+
+  TestWithData<3>(v);
+  TestWithData<4>(v);
+  TestWithData<5>(v);
+  TestWithData<6>(v);
+  TestWithData<7>(v);
+  TestWithData<8>(v);
+  TestWithData<9>(v);
+}
diff --git a/coding/fixed_bits_ddvector.hpp b/coding/fixed_bits_ddvector.hpp
new file mode 100644
index 0000000000..b535be9b73
--- /dev/null
+++ b/coding/fixed_bits_ddvector.hpp
@@ -0,0 +1,175 @@
+#pragma once
+
+#include "bit_streams.hpp"
+#include "byte_stream.hpp"
+#include "dd_vector.hpp"
+#include "reader.hpp"
+#include "write_to_sink.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/unique_ptr.hpp"
+#include "std/vector.hpp"
+
+
+/// Disk driven vector for compact storage of small values with rare big values.
+/// Format:
+/// 4 bytes to store vector's size
+/// Buffer of max(ceil(Size * Bits / 8), 4) bytes, i.e. vector of Bits-sized elements.
+///  - values in range [0, (1 << Bits) - 2] stored as is
+///  - value (1 << Bits) - 1 tells that actual value is stored in the exceptions table below.
+/// Buffer with exceptions table, i.e. vector of (index, value) pairs till the end of the reader,
+/// sorted by index parameter.
+/// Component works in little endian without any host conversions.
+
+template
+<
+  size_t Bits,                /// number of fixed bits
+  class TReader,              /// reader with random offset read functions
+  typename TSize = uint32_t,  /// vector index type (platform independent)
+  typename TValue = uint32_t  /// vector value type (platform independent)
+>
+class FixedBitsDDVector
+{
+  static_assert(is_unsigned<TSize>::value, "");
+  static_assert(is_unsigned<TValue>::value, "");
+  // 16 - is the maximum bits count to get all needed bits in random access within uint32_t.
+  static_assert(Bits <= 16, "");
+
+  using TSelf = FixedBitsDDVector<Bits, TReader, TSize, TValue>;
+
+  struct IndexValue
+  {
+    TSize m_index;
+    TValue m_value;
+    bool operator<(IndexValue const & rhs) const { return m_index < rhs.m_index; }
+  };
+
+  TReader m_bits;
+  DDVector<IndexValue, TReader, TSize> m_vector;
+
+#ifdef DEBUG
+  TSize const m_size;
+#endif
+
+  using TBlock = uint32_t;
+
+  // Raises the bits buffer size to at least sizeof(TBlock), because Get() reads a whole TBlock.
+  static uint64_t AlignBytesCount(uint64_t count)
+  {
+    return max(count, static_cast<uint64_t>(sizeof(TBlock)));
+  }
+
+  static TBlock constexpr kMask = (1 << Bits) - 1;
+
+  TValue FindInVector(TSize index) const
+  {
+    auto const it = lower_bound(m_vector.begin(), m_vector.end(), IndexValue{index, 0});
+    ASSERT(it != m_vector.end() && it->m_index == index, ());
+    return it->m_value;
+  }
+
+  FixedBitsDDVector(TReader const & bitsReader, TReader const & vecReader, TSize size)
+    : m_bits(bitsReader)
+    , m_vector(vecReader)
+  #ifdef DEBUG
+    , m_size(size)
+  #endif
+  {
+  }
+
+public:
+  /// Reads the element count and splits |reader| into the bits buffer and the exceptions table.
+  static unique_ptr<TSelf> Create(TReader const & reader)
+  {
+    TSize const size = ReadPrimitiveFromPos<TSize>(reader, 0);
+
+    uint64_t const off1 = sizeof(TSize);
+    // Multiply in 64 bits: size * Bits may not fit into 32 bits where size_t is 32-bit.
+    uint64_t const bitsCount = static_cast<uint64_t>(size) * Bits;
+    uint64_t const off2 = AlignBytesCount((bitsCount + CHAR_BIT - 1) / CHAR_BIT) + off1;
+    return unique_ptr<TSelf>(new TSelf(reader.SubReader(off1, off2 - off1),
+                                       reader.SubReader(off2, reader.Size() - off2),
+                                       size));
+  }
+
+  /// @return value of the element at |index|; escaped values are taken from the exceptions table.
+  TValue Get(TSize index) const
+  {
+    ASSERT_LESS(index, m_size, ());
+    uint64_t const bitsOffset = static_cast<uint64_t>(index) * Bits;
+
+    uint64_t bytesOffset = bitsOffset / CHAR_BIT;
+    size_t constexpr kBlockSize = sizeof(TBlock);
+    // Move the read window back so a whole TBlock is read without passing the end of the buffer.
+    if (bytesOffset + kBlockSize > m_bits.Size())
+      bytesOffset = m_bits.Size() - kBlockSize;
+
+    TBlock v = ReadPrimitiveFromPos<TBlock>(m_bits, bytesOffset);
+    v >>= (bitsOffset - bytesOffset * CHAR_BIT);
+    v &= kMask;
+    return (v == kMask ? FindInVector(index) : v);
+  }
+
+  /// Accumulates values in memory and serializes them into |TWriter| on destruction.
+  template <class TWriter> class Builder
+  {
+    using TData = vector<uint8_t>;
+    using TempWriter = PushBackByteSink<TData>;
+    using TBits = BitWriter<TempWriter>;
+
+    TData m_data;
+    TempWriter m_writer;
+    unique_ptr<TBits> m_bits;
+
+    vector<IndexValue> m_excepts;
+    TSize m_count = 0;
+    TSize m_optCount = 0;
+
+    TWriter & m_finalWriter;
+
+  public:
+    explicit Builder(TWriter & writer)
+      : m_writer(m_data), m_bits(new TBits(m_writer)), m_finalWriter(writer)
+    {
+    }
+
+    ~Builder()
+    {
+      // Final serialization is in dtor only.
+      // You can't do any intermediate flushes during building vector.
+
+      // Reset the bit stream first.
+      m_bits.reset();
+
+      // Write size of vector.
+      WriteToSink(m_finalWriter, m_count);
+
+      // Write bits vector, padded to at least 4 bytes.
+      m_data.resize(AlignBytesCount(m_data.size()));
+      m_finalWriter.Write(m_data.data(), m_data.size());
+
+      // Write exceptions table.
+      m_finalWriter.Write(m_excepts.data(), m_excepts.size() * sizeof(IndexValue));
+    }
+
+    /// Appends |v|; values >= kMask are written as kMask and stored in the exceptions table.
+    void PushBack(TValue v)
+    {
+      if (v >= kMask)
+      {
+        m_bits->WriteAtMost32Bits(kMask, Bits);
+        m_excepts.push_back({m_count, v});
+      }
+      else
+      {
+        ++m_optCount;
+        m_bits->WriteAtMost32Bits(v, Bits);
+      }
+
+      ++m_count;
+    }
+
+    /// @return (number of stored as-is elements, number of all elements)
+    pair<TSize, TSize> GetCount() const { return make_pair(m_optCount, m_count); }
+  };
+};
diff --git a/std/random.hpp b/std/random.hpp
index 9c43314450..e90db2357c 100644
--- a/std/random.hpp
+++ b/std/random.hpp
@@ -6,6 +6,7 @@
 
 #include <random>
 
+using std::default_random_engine;
 using std::mt19937;
 using std::uniform_int_distribution;
 