From 097618caddc3734e141ef18275f2ab3672f1f8f4 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Fri, 3 Jul 2015 20:25:19 +0300 Subject: [PATCH] [omim] [coding] A Huffman coding implementation. --- coding/coding.pro | 2 + coding/coding_tests/coding_tests.pro | 1 + coding/coding_tests/huffman_test.cpp | 99 +++++++++++ coding/huffman.cpp | 78 ++++++++ coding/huffman.hpp | 254 +++++++++++++++++++++++++++ 5 files changed, 434 insertions(+) create mode 100644 coding/coding_tests/huffman_test.cpp create mode 100644 coding/huffman.cpp create mode 100644 coding/huffman.hpp diff --git a/coding/coding.pro b/coding/coding.pro index 9c48d33449..6fa5ba57f7 100644 --- a/coding/coding.pro +++ b/coding/coding.pro @@ -37,6 +37,7 @@ SOURCES += \ compressed_bit_vector.cpp \ # compressed_varnum_vector.cpp \ png_memory_encoder.cpp \ + huffman.cpp \ HEADERS += \ internal/xmlparser.hpp \ @@ -104,3 +105,4 @@ HEADERS += \ varint_misc.hpp \ bit_streams.hpp \ png_memory_encoder.hpp \ + huffman.hpp \ diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro index 4f92622d01..a8f5845170 100644 --- a/coding/coding_tests/coding_tests.pro +++ b/coding/coding_tests/coding_tests.pro @@ -30,6 +30,7 @@ SOURCES += ../../testing/testingmain.cpp \ file_utils_test.cpp \ gzip_test.cpp \ hex_test.cpp \ + huffman_test.cpp \ mem_file_reader_test.cpp \ mem_file_writer_test.cpp \ multilang_utf8_string_test.cpp \ diff --git a/coding/coding_tests/huffman_test.cpp b/coding/coding_tests/huffman_test.cpp new file mode 100644 index 0000000000..0aff25126a --- /dev/null +++ b/coding/coding_tests/huffman_test.cpp @@ -0,0 +1,99 @@ +#include "testing/testing.hpp" + +#include "coding/huffman.hpp" +#include "coding/reader.hpp" +#include "coding/writer.hpp" + +#include "base/string_utils.hpp" +#include "std/vector.hpp" + +namespace +{ +vector MakeUniStringVector(vector const & v) +{ + vector result(v.size()); + for (size_t i = 0; i < v.size(); ++i) + result[i] = strings::UniString(v[i].begin(), v[i].end()); + return result; +} + +void TestDecode(coding::HuffmanCoder const & h, uint32_t bits, uint32_t len, uint32_t expected) +{ + coding::HuffmanCoder::Code code(bits, len); + uint32_t received; + TEST(h.Decode(code, received), ("Could not decode", code.bits, "( length", code.len, ")")); + TEST_EQUAL(expected, received, ()); +} +} // namespace + +namespace coding +{ +UNIT_TEST(Huffman_Smoke) +{ + HuffmanCoder h; + h.Init(MakeUniStringVector(vector{"ab", "ac"})); + + TestDecode(h, 0, 1, static_cast('a')); // 0 + TestDecode(h, 1, 2, static_cast('b')); // 10 + TestDecode(h, 3, 2, static_cast('c')); // 11 +} + +UNIT_TEST(Huffman_OneSymbol) +{ + HuffmanCoder h; + h.Init(MakeUniStringVector(vector{string(5, 0)})); + + TestDecode(h, 0, 0, 0); +} + +UNIT_TEST(Huffman_Serialization_Encoding) +{ + HuffmanCoder hW; + hW.Init(MakeUniStringVector( + vector{"aaaaaaaaaa", "bbbbbbbbbb", "ccccc", "ddddd"})); // 10, 10, 5, 5 + vector buf; + MemWriter> writer(buf); + hW.WriteEncoding(writer); + + HuffmanCoder hR; + MemReader memReader(&buf[0], buf.size()); + ReaderSource reader(memReader); + hR.ReadEncoding(reader); + + TEST_EQUAL(reader.Pos(), writer.Pos(), ()); + + TestDecode(hW, 0, 2, static_cast('a')); // 00 + TestDecode(hW, 2, 2, static_cast('b')); // 01 + TestDecode(hW, 1, 2, static_cast('c')); // 10 + TestDecode(hW, 3, 2, static_cast('d')); // 11 + + TestDecode(hR, 0, 2, static_cast('a')); + TestDecode(hR, 2, 2, static_cast('b')); + TestDecode(hR, 1, 2, static_cast('c')); + TestDecode(hR, 3, 2, static_cast('d')); +} + +UNIT_TEST(Huffman_Serialization_Data) +{ + HuffmanCoder hW; + hW.Init(MakeUniStringVector( + vector{"aaaaaaaaaa", "bbbbbbbbbb", "ccccc", "ddddd"})); // 10, 10, 5, 5 + vector buf; + + string const data = "abacabaddddaaabbcabacabadbabd"; + strings::UniString expected = strings::UniString(data.begin(), data.end()); + + MemWriter> writer(buf); + hW.WriteEncoding(writer); + hW.EncodeAndWrite(writer, expected); + + HuffmanCoder hR; + MemReader memReader(&buf[0], buf.size()); + ReaderSource reader(memReader); + hR.ReadEncoding(reader); + strings::UniString received = hR.ReadAndDecode(reader); + + TEST_EQUAL(expected, received, ()); +} + +} // namespace coding diff --git a/coding/huffman.cpp b/coding/huffman.cpp new file mode 100644 index 0000000000..acdd945303 --- /dev/null +++ b/coding/huffman.cpp @@ -0,0 +1,78 @@ +#include "coding/huffman.hpp" + +#include "base/logging.hpp" + +namespace coding +{ +HuffmanCoder::~HuffmanCoder() +{ + try + { + DeleteHuffmanTree(m_root); + } + catch (...) + { + LOG(LWARNING, ("Caught an exception when deleting a Huffman tree.")); + } +} + +void HuffmanCoder::Init(vector const & data) +{ + BuildHuffmanTree(data.begin(), data.end()); + BuildTables(m_root, 0); + DeleteHuffmanTree(m_root); + m_root = nullptr; +} + +bool HuffmanCoder::Encode(uint32_t symbol, Code & code) const +{ + auto it = m_encoderTable.find(symbol); + if (it == m_encoderTable.end()) + return false; + code = it->second; + return true; +} + +bool HuffmanCoder::Decode(Code const & code, uint32_t & symbol) const +{ + auto it = m_decoderTable.find(code); + if (it == m_decoderTable.end()) + return false; + symbol = it->second; + return true; +} + +void HuffmanCoder::BuildTables(Node * root, uint32_t path) +{ + if (!root) + return; + if (root->isLeaf) + { + Code code(path, root->depth); + m_encoderTable[root->symbol] = code; + m_decoderTable[code] = root->symbol; + return; + } + BuildTables(root->l, path); + BuildTables(root->r, path + (static_cast(1) << root->depth)); +} + +void HuffmanCoder::DeleteHuffmanTree(Node * root) +{ + if (!root) + return; + DeleteHuffmanTree(root->l); + DeleteHuffmanTree(root->r); + delete root; +} + +void HuffmanCoder::SetDepths(Node * root, uint32_t depth) +{ + if (!root) + return; + root->depth = depth; + SetDepths(root->l, depth + 1); + SetDepths(root->r, depth + 1); +} + +} // namespace coding diff --git a/coding/huffman.hpp b/coding/huffman.hpp new file mode 100644 index 0000000000..b004a00268 --- /dev/null +++ b/coding/huffman.hpp @@ -0,0 +1,254 @@ +#pragma once + +#include "coding/bit_streams.hpp" +#include "coding/varint.hpp" + +#include "base/assert.hpp" +#include "base/string_utils.hpp" + +#include "std/algorithm.hpp" +#include "std/map.hpp" +#include "std/queue.hpp" +#include "std/vector.hpp" + +namespace coding +{ +class HuffmanCoder +{ +public: + // A Code encodes a path to a leaf. It is read starting from + // the least significant bit. + struct Code + { + uint32_t bits; + uint32_t len; + + Code() = default; + Code(uint32_t bits, uint32_t len) : bits(bits), len(len) {} + + bool operator<(const Code & o) const + { + if (bits != o.bits) + return bits < o.bits; + return len < o.len; + } + }; + + HuffmanCoder() : m_root(nullptr) {} + ~HuffmanCoder(); + + // Internally builds a Huffman tree and makes + // the EncodeAndWrite and ReadAndDecode methods available. + void Init(vector const & data); + + // One way to store the encoding would be + // -- the succinct representation of the topology of Huffman tree; + // -- the list of symbols that are stored in the leaves, as varuints in delta encoding. + // This would probably be an overkill. + template + void WriteEncoding(TWriter & writer) + { + // @todo Do not waste space, use BitWriter. + WriteVarUint(writer, m_decoderTable.size()); + for (auto const & kv : m_decoderTable) + { + WriteVarUint(writer, kv.first.bits); + WriteVarUint(writer, kv.first.len); + WriteVarUint(writer, kv.second); + } + } + + template + void ReadEncoding(TReader & reader) + { + DeleteHuffmanTree(m_root); + m_root = new Node(0 /* symbol */, 0 /* freq */, false /* isLeaf */); + + m_encoderTable.clear(); + m_decoderTable.clear(); + + size_t sz = static_cast(ReadVarUint(reader)); + for (size_t i = 0; i < sz; ++i) + { + uint32_t bits = ReadVarUint(reader); + uint32_t len = ReadVarUint(reader); + uint32_t symbol = ReadVarUint(reader); + Code code(bits, len); + + m_encoderTable[symbol] = code; + m_decoderTable[code] = symbol; + + Node * cur = m_root; + for (size_t j = 0; j < len; ++j) + { + if (((bits >> j) & 1) == 0) + { + if (!cur->l) + cur->l = new Node(0 /* symbol */, 0 /* freq */, false /* isLeaf */); + cur = cur->l; + } + else + { + if (!cur->r) + cur->r = new Node(0 /* symbol */, 0 /* freq */, false /* isLeaf */); + cur = cur->r; + } + cur->depth = j + 1; + } + cur->isLeaf = true; + cur->symbol = symbol; + } + } + + bool Encode(uint32_t symbol, Code & code) const; + bool Decode(Code const & code, uint32_t & symbol) const; + + template + void EncodeAndWrite(TWriter & writer, strings::UniString const & s) + { + BitWriter bitWriter(writer); + WriteVarUint(writer, s.size()); + for (size_t i = 0; i < s.size(); ++i) + EncodeAndWrite(bitWriter, static_cast(s[i])); + } + + template + strings::UniString ReadAndDecode(TReader & reader) const + { + BitReader bitReader(reader); + size_t sz = static_cast(ReadVarUint(reader)); + vector v(sz); + for (size_t i = 0; i < sz; ++i) + v[i] = static_cast(ReadAndDecode(bitReader)); + return strings::UniString(v.begin(), v.end()); + } + +private: + // One would need more than 2^32 symbols to build a code that long. + // On the other hand, 32 is short enough for our purposes, so do not + // try to shrink the trees beyond this threshold. + const uint32_t kMaxDepth = 32; + + struct Node + { + Node *l, *r; + uint32_t symbol; + uint32_t freq; + uint32_t depth; + bool isLeaf; + + Node(uint32_t symbol, uint32_t freq, bool isLeaf) + : l(nullptr), r(nullptr), symbol(symbol), freq(freq), depth(0), isLeaf(isLeaf) + { + } + }; + + struct NodeComparator + { + bool operator()(Node const * const a, Node const * const b) const + { + if (a->freq != b->freq) + return a->freq > b->freq; + return a->symbol > b->symbol; + } + }; + + // No need to clump the interface: keep private the methods + // that encode one symbol only. + template + void EncodeAndWrite(BitWriter & bitWriter, uint32_t symbol) + { + Code code; + CHECK(Encode(symbol, code), ()); + for (size_t i = 0; i < code.len; ++i) + bitWriter.Write((code.bits >> i) & 1, 1); + } + + template + uint32_t ReadAndDecode(BitReader & bitReader) const + { + Node * cur = m_root; + while (cur) + { + if (cur->isLeaf) + return cur->symbol; + uint8_t bit = bitReader.Read(1); + if (bit == 0) + cur = cur->l; + else + cur = cur->r; + } + CHECK(false, ("Could not decode a Huffman-encoded symbol.")); + return 0; + } + + // Converts a Huffman tree into the more convenient representation + // of encoding and decoding tables. + void BuildTables(Node * root, uint32_t path); + + void DeleteHuffmanTree(Node * root); + + // Builds a fixed Huffman tree for a collection of strings::UniStrings. + // UniString is always UTF-32. Every code point is treated as a symbol for the encoder. + template + void BuildHuffmanTree(TIter beg, TIter end) + { + if (beg == end) + { + m_root = nullptr; + return; + } + + map freqs; + for (auto it = beg; it != end; ++it) + { + auto const & e = *it; + for (size_t i = 0; i < e.size(); ++i) + { + ++freqs[static_cast(e[i])]; + } + } + + priority_queue, NodeComparator> pq; + for (auto const & e : freqs) + pq.push(new Node(e.first, e.second, true /* isLeaf */)); + + if (pq.empty()) + { + m_root = nullptr; + return; + } + + while (pq.size() > 1) + { + auto a = pq.top(); + pq.pop(); + auto b = pq.top(); + pq.pop(); + if (a->symbol > b->symbol) + swap(a, b); + // Give it the smaller symbol to make the resulting encoding more predictable. + auto ab = new Node(a->symbol, a->freq + b->freq, false /* isLeaf */); + ab->l = a; + ab->r = b; + CHECK_LESS_OR_EQUAL(a->depth, kMaxDepth, ()); + CHECK_LESS_OR_EQUAL(b->depth, kMaxDepth, ()); + pq.push(ab); + } + + m_root = pq.top(); + pq.pop(); + + SetDepths(m_root, 0); + } + + // Properly sets the depth field in the subtree rooted at root. + // It is easier to do it after the tree is built. + void SetDepths(Node * root, uint32_t depth); + + Node * m_root; // m_pRoot? + map m_decoderTable; + map m_encoderTable; +}; + +} // namespace coding