forked from organicmaps/organicmaps-tmp
[omim] [coding] A Huffman coding implementation.
This commit is contained in:
parent
84d1eb88a1
commit
097618cadd
5 changed files with 434 additions and 0 deletions
|
@ -37,6 +37,7 @@ SOURCES += \
|
|||
compressed_bit_vector.cpp \
|
||||
# compressed_varnum_vector.cpp \
|
||||
png_memory_encoder.cpp \
|
||||
huffman.cpp \
|
||||
|
||||
HEADERS += \
|
||||
internal/xmlparser.hpp \
|
||||
|
@ -104,3 +105,4 @@ HEADERS += \
|
|||
varint_misc.hpp \
|
||||
bit_streams.hpp \
|
||||
png_memory_encoder.hpp \
|
||||
huffman.hpp \
|
||||
|
|
|
@ -30,6 +30,7 @@ SOURCES += ../../testing/testingmain.cpp \
|
|||
file_utils_test.cpp \
|
||||
gzip_test.cpp \
|
||||
hex_test.cpp \
|
||||
huffman_test.cpp \
|
||||
mem_file_reader_test.cpp \
|
||||
mem_file_writer_test.cpp \
|
||||
multilang_utf8_string_test.cpp \
|
||||
|
|
99
coding/coding_tests/huffman_test.cpp
Normal file
99
coding/coding_tests/huffman_test.cpp
Normal file
|
@ -0,0 +1,99 @@
|
|||
#include "testing/testing.hpp"
|
||||
|
||||
#include "coding/huffman.hpp"
|
||||
#include "coding/reader.hpp"
|
||||
#include "coding/writer.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
namespace
|
||||
{
|
||||
vector<strings::UniString> MakeUniStringVector(vector<string> const & v)
|
||||
{
|
||||
vector<strings::UniString> result(v.size());
|
||||
for (size_t i = 0; i < v.size(); ++i)
|
||||
result[i] = strings::UniString(v[i].begin(), v[i].end());
|
||||
return result;
|
||||
}
|
||||
|
||||
void TestDecode(coding::HuffmanCoder const & h, uint32_t bits, uint32_t len, uint32_t expected)
|
||||
{
|
||||
coding::HuffmanCoder::Code code(bits, len);
|
||||
uint32_t received;
|
||||
TEST(h.Decode(code, received), ("Could not decode", code.bits, "( length", code.len, ")"));
|
||||
TEST_EQUAL(expected, received, ());
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace coding
|
||||
{
|
||||
UNIT_TEST(Huffman_Smoke)
|
||||
{
|
||||
HuffmanCoder h;
|
||||
h.Init(MakeUniStringVector(vector<string>{"ab", "ac"}));
|
||||
|
||||
TestDecode(h, 0, 1, static_cast<uint32_t>('a')); // 0
|
||||
TestDecode(h, 1, 2, static_cast<uint32_t>('b')); // 10
|
||||
TestDecode(h, 3, 2, static_cast<uint32_t>('c')); // 11
|
||||
}
|
||||
|
||||
UNIT_TEST(Huffman_OneSymbol)
|
||||
{
|
||||
HuffmanCoder h;
|
||||
h.Init(MakeUniStringVector(vector<string>{string(5, 0)}));
|
||||
|
||||
TestDecode(h, 0, 0, 0);
|
||||
}
|
||||
|
||||
UNIT_TEST(Huffman_Serialization_Encoding)
|
||||
{
|
||||
HuffmanCoder hW;
|
||||
hW.Init(MakeUniStringVector(
|
||||
vector<string>{"aaaaaaaaaa", "bbbbbbbbbb", "ccccc", "ddddd"})); // 10, 10, 5, 5
|
||||
vector<uint8_t> buf;
|
||||
MemWriter<vector<uint8_t>> writer(buf);
|
||||
hW.WriteEncoding(writer);
|
||||
|
||||
HuffmanCoder hR;
|
||||
MemReader memReader(&buf[0], buf.size());
|
||||
ReaderSource<MemReader> reader(memReader);
|
||||
hR.ReadEncoding(reader);
|
||||
|
||||
TEST_EQUAL(reader.Pos(), writer.Pos(), ());
|
||||
|
||||
TestDecode(hW, 0, 2, static_cast<uint32_t>('a')); // 00
|
||||
TestDecode(hW, 2, 2, static_cast<uint32_t>('b')); // 01
|
||||
TestDecode(hW, 1, 2, static_cast<uint32_t>('c')); // 10
|
||||
TestDecode(hW, 3, 2, static_cast<uint32_t>('d')); // 11
|
||||
|
||||
TestDecode(hR, 0, 2, static_cast<uint32_t>('a'));
|
||||
TestDecode(hR, 2, 2, static_cast<uint32_t>('b'));
|
||||
TestDecode(hR, 1, 2, static_cast<uint32_t>('c'));
|
||||
TestDecode(hR, 3, 2, static_cast<uint32_t>('d'));
|
||||
}
|
||||
|
||||
UNIT_TEST(Huffman_Serialization_Data)
|
||||
{
|
||||
HuffmanCoder hW;
|
||||
hW.Init(MakeUniStringVector(
|
||||
vector<string>{"aaaaaaaaaa", "bbbbbbbbbb", "ccccc", "ddddd"})); // 10, 10, 5, 5
|
||||
vector<uint8_t> buf;
|
||||
|
||||
string const data = "abacabaddddaaabbcabacabadbabd";
|
||||
strings::UniString expected = strings::UniString(data.begin(), data.end());
|
||||
|
||||
MemWriter<vector<uint8_t>> writer(buf);
|
||||
hW.WriteEncoding(writer);
|
||||
hW.EncodeAndWrite(writer, expected);
|
||||
|
||||
HuffmanCoder hR;
|
||||
MemReader memReader(&buf[0], buf.size());
|
||||
ReaderSource<MemReader> reader(memReader);
|
||||
hR.ReadEncoding(reader);
|
||||
strings::UniString received = hR.ReadAndDecode(reader);
|
||||
|
||||
TEST_EQUAL(expected, received, ());
|
||||
}
|
||||
|
||||
} // namespace coding
|
78
coding/huffman.cpp
Normal file
78
coding/huffman.cpp
Normal file
|
@ -0,0 +1,78 @@
|
|||
#include "coding/huffman.hpp"
|
||||
|
||||
#include "base/logging.hpp"
|
||||
|
||||
namespace coding
|
||||
{
|
||||
HuffmanCoder::~HuffmanCoder()
|
||||
{
|
||||
try
|
||||
{
|
||||
DeleteHuffmanTree(m_root);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
LOG(LWARNING, ("Caught an exception when deleting a Huffman tree."));
|
||||
}
|
||||
}
|
||||
|
||||
void HuffmanCoder::Init(vector<strings::UniString> const & data)
|
||||
{
|
||||
BuildHuffmanTree(data.begin(), data.end());
|
||||
BuildTables(m_root, 0);
|
||||
DeleteHuffmanTree(m_root);
|
||||
m_root = nullptr;
|
||||
}
|
||||
|
||||
bool HuffmanCoder::Encode(uint32_t symbol, Code & code) const
|
||||
{
|
||||
auto it = m_encoderTable.find(symbol);
|
||||
if (it == m_encoderTable.end())
|
||||
return false;
|
||||
code = it->second;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HuffmanCoder::Decode(Code const & code, uint32_t & symbol) const
|
||||
{
|
||||
auto it = m_decoderTable.find(code);
|
||||
if (it == m_decoderTable.end())
|
||||
return false;
|
||||
symbol = it->second;
|
||||
return true;
|
||||
}
|
||||
|
||||
void HuffmanCoder::BuildTables(Node * root, uint32_t path)
|
||||
{
|
||||
if (!root)
|
||||
return;
|
||||
if (root->isLeaf)
|
||||
{
|
||||
Code code(path, root->depth);
|
||||
m_encoderTable[root->symbol] = code;
|
||||
m_decoderTable[code] = root->symbol;
|
||||
return;
|
||||
}
|
||||
BuildTables(root->l, path);
|
||||
BuildTables(root->r, path + (static_cast<uint32_t>(1) << root->depth));
|
||||
}
|
||||
|
||||
void HuffmanCoder::DeleteHuffmanTree(Node * root)
|
||||
{
|
||||
if (!root)
|
||||
return;
|
||||
DeleteHuffmanTree(root->l);
|
||||
DeleteHuffmanTree(root->r);
|
||||
delete root;
|
||||
}
|
||||
|
||||
void HuffmanCoder::SetDepths(Node * root, uint32_t depth)
|
||||
{
|
||||
if (!root)
|
||||
return;
|
||||
root->depth = depth;
|
||||
SetDepths(root->l, depth + 1);
|
||||
SetDepths(root->r, depth + 1);
|
||||
}
|
||||
|
||||
} // namespace coding
|
254
coding/huffman.hpp
Normal file
254
coding/huffman.hpp
Normal file
|
@ -0,0 +1,254 @@
|
|||
#pragma once
|
||||
|
||||
#include "coding/bit_streams.hpp"
|
||||
#include "coding/varint.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/map.hpp"
|
||||
#include "std/queue.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
namespace coding
|
||||
{
|
||||
class HuffmanCoder
|
||||
{
|
||||
public:
|
||||
// A Code encodes a path to a leaf. It is read starting from
|
||||
// the least significant bit.
|
||||
struct Code
|
||||
{
|
||||
uint32_t bits;
|
||||
uint32_t len;
|
||||
|
||||
Code() = default;
|
||||
Code(uint32_t bits, uint32_t len) : bits(bits), len(len) {}
|
||||
|
||||
bool operator<(const Code & o) const
|
||||
{
|
||||
if (bits != o.bits)
|
||||
return bits < o.bits;
|
||||
return len < o.len;
|
||||
}
|
||||
};
|
||||
|
||||
HuffmanCoder() : m_root(nullptr) {}
|
||||
~HuffmanCoder();
|
||||
|
||||
// Internally builds a Huffman tree and makes
|
||||
// the EncodeAndWrite and ReadAndDecode methods available.
|
||||
void Init(vector<strings::UniString> const & data);
|
||||
|
||||
// One way to store the encoding would be
|
||||
// -- the succinct representation of the topology of Huffman tree;
|
||||
// -- the list of symbols that are stored in the leaves, as varuints in delta encoding.
|
||||
// This would probably be an overkill.
|
||||
template <typename TWriter>
|
||||
void WriteEncoding(TWriter & writer)
|
||||
{
|
||||
// @todo Do not waste space, use BitWriter.
|
||||
WriteVarUint(writer, m_decoderTable.size());
|
||||
for (auto const & kv : m_decoderTable)
|
||||
{
|
||||
WriteVarUint(writer, kv.first.bits);
|
||||
WriteVarUint(writer, kv.first.len);
|
||||
WriteVarUint(writer, kv.second);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TReader>
|
||||
void ReadEncoding(TReader & reader)
|
||||
{
|
||||
DeleteHuffmanTree(m_root);
|
||||
m_root = new Node(0 /* symbol */, 0 /* freq */, false /* isLeaf */);
|
||||
|
||||
m_encoderTable.clear();
|
||||
m_decoderTable.clear();
|
||||
|
||||
size_t sz = static_cast<size_t>(ReadVarUint<uint32_t, TReader>(reader));
|
||||
for (size_t i = 0; i < sz; ++i)
|
||||
{
|
||||
uint32_t bits = ReadVarUint<uint32_t, TReader>(reader);
|
||||
uint32_t len = ReadVarUint<uint32_t, TReader>(reader);
|
||||
uint32_t symbol = ReadVarUint<uint32_t, TReader>(reader);
|
||||
Code code(bits, len);
|
||||
|
||||
m_encoderTable[symbol] = code;
|
||||
m_decoderTable[code] = symbol;
|
||||
|
||||
Node * cur = m_root;
|
||||
for (size_t j = 0; j < len; ++j)
|
||||
{
|
||||
if (((bits >> j) & 1) == 0)
|
||||
{
|
||||
if (!cur->l)
|
||||
cur->l = new Node(0 /* symbol */, 0 /* freq */, false /* isLeaf */);
|
||||
cur = cur->l;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!cur->r)
|
||||
cur->r = new Node(0 /* symbol */, 0 /* freq */, false /* isLeaf */);
|
||||
cur = cur->r;
|
||||
}
|
||||
cur->depth = j + 1;
|
||||
}
|
||||
cur->isLeaf = true;
|
||||
cur->symbol = symbol;
|
||||
}
|
||||
}
|
||||
|
||||
bool Encode(uint32_t symbol, Code & code) const;
|
||||
bool Decode(Code const & code, uint32_t & symbol) const;
|
||||
|
||||
template <typename TWriter>
|
||||
void EncodeAndWrite(TWriter & writer, strings::UniString const & s)
|
||||
{
|
||||
BitWriter<TWriter> bitWriter(writer);
|
||||
WriteVarUint(writer, s.size());
|
||||
for (size_t i = 0; i < s.size(); ++i)
|
||||
EncodeAndWrite(bitWriter, static_cast<uint32_t>(s[i]));
|
||||
}
|
||||
|
||||
template <typename TReader>
|
||||
strings::UniString ReadAndDecode(TReader & reader) const
|
||||
{
|
||||
BitReader<TReader> bitReader(reader);
|
||||
size_t sz = static_cast<size_t>(ReadVarUint<uint32_t, TReader>(reader));
|
||||
vector<strings::UniChar> v(sz);
|
||||
for (size_t i = 0; i < sz; ++i)
|
||||
v[i] = static_cast<strings::UniChar>(ReadAndDecode(bitReader));
|
||||
return strings::UniString(v.begin(), v.end());
|
||||
}
|
||||
|
||||
private:
|
||||
// One would need more than 2^32 symbols to build a code that long.
|
||||
// On the other hand, 32 is short enough for our purposes, so do not
|
||||
// try to shrink the trees beyond this threshold.
|
||||
const uint32_t kMaxDepth = 32;
|
||||
|
||||
struct Node
|
||||
{
|
||||
Node *l, *r;
|
||||
uint32_t symbol;
|
||||
uint32_t freq;
|
||||
uint32_t depth;
|
||||
bool isLeaf;
|
||||
|
||||
Node(uint32_t symbol, uint32_t freq, bool isLeaf)
|
||||
: l(nullptr), r(nullptr), symbol(symbol), freq(freq), depth(0), isLeaf(isLeaf)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
struct NodeComparator
|
||||
{
|
||||
bool operator()(Node const * const a, Node const * const b) const
|
||||
{
|
||||
if (a->freq != b->freq)
|
||||
return a->freq > b->freq;
|
||||
return a->symbol > b->symbol;
|
||||
}
|
||||
};
|
||||
|
||||
// No need to clump the interface: keep private the methods
|
||||
// that encode one symbol only.
|
||||
template <typename TWriter>
|
||||
void EncodeAndWrite(BitWriter<TWriter> & bitWriter, uint32_t symbol)
|
||||
{
|
||||
Code code;
|
||||
CHECK(Encode(symbol, code), ());
|
||||
for (size_t i = 0; i < code.len; ++i)
|
||||
bitWriter.Write((code.bits >> i) & 1, 1);
|
||||
}
|
||||
|
||||
template <typename TReader>
|
||||
uint32_t ReadAndDecode(BitReader<TReader> & bitReader) const
|
||||
{
|
||||
Node * cur = m_root;
|
||||
while (cur)
|
||||
{
|
||||
if (cur->isLeaf)
|
||||
return cur->symbol;
|
||||
uint8_t bit = bitReader.Read(1);
|
||||
if (bit == 0)
|
||||
cur = cur->l;
|
||||
else
|
||||
cur = cur->r;
|
||||
}
|
||||
CHECK(false, ("Could not decode a Huffman-encoded symbol."));
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Converts a Huffman tree into the more convenient representation
|
||||
// of encoding and decoding tables.
|
||||
void BuildTables(Node * root, uint32_t path);
|
||||
|
||||
void DeleteHuffmanTree(Node * root);
|
||||
|
||||
// Builds a fixed Huffman tree for a collection of strings::UniStrings.
|
||||
// UniString is always UTF-32. Every code point is treated as a symbol for the encoder.
|
||||
template <typename TIter>
|
||||
void BuildHuffmanTree(TIter beg, TIter end)
|
||||
{
|
||||
if (beg == end)
|
||||
{
|
||||
m_root = nullptr;
|
||||
return;
|
||||
}
|
||||
|
||||
map<uint32_t, uint32_t> freqs;
|
||||
for (auto it = beg; it != end; ++it)
|
||||
{
|
||||
auto const & e = *it;
|
||||
for (size_t i = 0; i < e.size(); ++i)
|
||||
{
|
||||
++freqs[static_cast<uint32_t>(e[i])];
|
||||
}
|
||||
}
|
||||
|
||||
priority_queue<Node *, vector<Node *>, NodeComparator> pq;
|
||||
for (auto const & e : freqs)
|
||||
pq.push(new Node(e.first, e.second, true /* isLeaf */));
|
||||
|
||||
if (pq.empty())
|
||||
{
|
||||
m_root = nullptr;
|
||||
return;
|
||||
}
|
||||
|
||||
while (pq.size() > 1)
|
||||
{
|
||||
auto a = pq.top();
|
||||
pq.pop();
|
||||
auto b = pq.top();
|
||||
pq.pop();
|
||||
if (a->symbol > b->symbol)
|
||||
swap(a, b);
|
||||
// Give it the smaller symbol to make the resulting encoding more predictable.
|
||||
auto ab = new Node(a->symbol, a->freq + b->freq, false /* isLeaf */);
|
||||
ab->l = a;
|
||||
ab->r = b;
|
||||
CHECK_LESS_OR_EQUAL(a->depth, kMaxDepth, ());
|
||||
CHECK_LESS_OR_EQUAL(b->depth, kMaxDepth, ());
|
||||
pq.push(ab);
|
||||
}
|
||||
|
||||
m_root = pq.top();
|
||||
pq.pop();
|
||||
|
||||
SetDepths(m_root, 0);
|
||||
}
|
||||
|
||||
// Properly sets the depth field in the subtree rooted at root.
|
||||
// It is easier to do it after the tree is built.
|
||||
void SetDepths(Node * root, uint32_t depth);
|
||||
|
||||
Node * m_root; // m_pRoot?
|
||||
map<Code, uint32_t> m_decoderTable;
|
||||
map<uint32_t, Code> m_encoderTable;
|
||||
};
|
||||
|
||||
} // namespace coding
|
Loading…
Add table
Reference in a new issue