From 64067f1917d5b3b69d8909e06ac481398650c686 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Tue, 21 Jul 2015 13:23:57 +0300 Subject: [PATCH] Review fixes. --- coding/coding.pro | 150 +++++++++++++-------------- coding/coding_tests/huffman_test.cpp | 33 +++++- coding/huffman.cpp | 12 +-- coding/huffman.hpp | 23 ++-- 4 files changed, 124 insertions(+), 94 deletions(-) diff --git a/coding/coding.pro b/coding/coding.pro index 6fa5ba57f7..b4980f9289 100644 --- a/coding/coding.pro +++ b/coding/coding.pro @@ -10,99 +10,99 @@ include($$ROOT_DIR/common.pri) INCLUDEPATH *= $$ROOT_DIR/3party/tomcrypt/src/headers $$ROOT_DIR/3party/zlib $$ROOT_DIR/3party/expat/lib SOURCES += \ - internal/file_data.cpp \ - hex.cpp \ - file_reader.cpp \ - file_writer.cpp \ - lodepng.cpp \ - file_container.cpp \ - bzip2_compressor.cpp \ - gzip_compressor.cpp \ - timsort/timsort.cpp \ + arithmetic_codec.cpp \ base64.cpp \ - sha2.cpp \ - multilang_utf8_string.cpp \ - reader.cpp \ - zip_reader.cpp \ - mmap_reader.cpp \ - reader_streambuf.cpp \ - reader_writer_ops.cpp \ # blob_indexer.cpp \ # blob_storage.cpp \ - uri.cpp \ - zip_creator.cpp \ - file_name_utils.cpp \ -# varint_vector.cpp \ - arithmetic_codec.cpp \ + bzip2_compressor.cpp \ compressed_bit_vector.cpp \ # compressed_varnum_vector.cpp \ - png_memory_encoder.cpp \ + file_container.cpp \ + file_name_utils.cpp \ + file_reader.cpp \ + file_writer.cpp \ + gzip_compressor.cpp \ + hex.cpp \ huffman.cpp \ + internal/file_data.cpp \ + lodepng.cpp \ + mmap_reader.cpp \ + multilang_utf8_string.cpp \ + png_memory_encoder.cpp \ + reader.cpp \ + reader_streambuf.cpp \ + reader_writer_ops.cpp \ + sha2.cpp \ + timsort/timsort.cpp \ + uri.cpp \ +# varint_vector.cpp \ + zip_creator.cpp \ + zip_reader.cpp \ HEADERS += \ - internal/xmlparser.hpp \ - internal/expat_impl.h \ - internal/file_data.hpp \ - internal/file64_api.hpp \ - parse_xml.hpp \ - varint.hpp \ - endianness.hpp \ + arithmetic_codec.hpp \ + base64.hpp \ + bit_streams.hpp \ +# blob_indexer.hpp \ +# blob_storage.hpp \ + buffer_reader.hpp \ byte_stream.hpp \ - var_serial_vector.hpp \ - hex.hpp \ + bzip2_compressor.hpp \ + coder.hpp \ + coder_util.hpp \ + compressed_bit_vector.hpp \ +# compressed_varnum_vector.hpp \ + constants.hpp \ dd_vector.hpp \ - writer.hpp \ - write_to_sink.hpp \ - reader.hpp \ diff.hpp \ diff_patch_common.hpp \ + endianness.hpp \ + file_container.hpp \ + file_name_utils.hpp \ + file_reader.hpp \ + file_reader_stream.hpp \ + file_sort.hpp \ + file_writer.hpp \ + file_writer_stream.hpp \ + gzip_compressor.hpp \ + hex.hpp \ + huffman.hpp \ + internal/expat_impl.h \ + internal/file64_api.hpp \ + internal/file_data.hpp \ lodepng.hpp \ lodepng_io.hpp \ lodepng_io_private.hpp \ - var_record_reader.hpp \ - file_sort.hpp \ - file_reader.hpp \ - file_writer.hpp \ - reader_cache.hpp \ - buffer_reader.hpp \ - streams.hpp \ - streams_sink.hpp \ - streams_common.hpp \ - file_container.hpp \ - polymorph_reader.hpp \ - coder.hpp \ - coder_util.hpp \ - bzip2_compressor.hpp \ - gzip_compressor.hpp \ - timsort/timsort.hpp \ - base64.hpp \ - sha2.hpp \ - value_opt_string.hpp \ + matrix_traversal.hpp \ + mmap_reader.hpp \ multilang_utf8_string.hpp \ - url_encode.hpp \ - zip_reader.hpp \ + parse_xml.hpp \ + png_memory_encoder.hpp \ + polymorph_reader.hpp \ + read_write_utils.hpp \ + reader.hpp \ + reader_cache.hpp \ + reader_streambuf.hpp \ + reader_wrapper.hpp \ + reader_writer_ops.hpp \ + sha2.hpp \ + streams.hpp \ + streams_common.hpp \ + streams_sink.hpp \ + timsort/timsort.hpp \ trie.hpp \ trie_builder.hpp \ trie_reader.hpp \ - mmap_reader.hpp \ - read_write_utils.hpp \ - file_reader_stream.hpp \ - file_writer_stream.hpp \ - reader_streambuf.hpp \ - reader_writer_ops.hpp \ - reader_wrapper.hpp \ -# blob_indexer.hpp \ -# blob_storage.hpp \ uri.hpp \ - zip_creator.hpp \ - file_name_utils.hpp \ - constants.hpp \ - matrix_traversal.hpp \ -# varint_vector.hpp \ - arithmetic_codec.hpp \ - compressed_bit_vector.hpp \ -# compressed_varnum_vector.hpp \ + url_encode.hpp \ + value_opt_string.hpp \ + var_record_reader.hpp \ + var_serial_vector.hpp \ + varint.hpp \ varint_misc.hpp \ - bit_streams.hpp \ - png_memory_encoder.hpp \ - huffman.hpp \ +# varint_vector.hpp \ + write_to_sink.hpp \ + writer.hpp \ + zip_creator.hpp \ + zip_reader.hpp \ + internal/xmlparser.hpp \ diff --git a/coding/coding_tests/huffman_test.cpp b/coding/coding_tests/huffman_test.cpp index 0aff25126a..fbf0630401 100644 --- a/coding/coding_tests/huffman_test.cpp +++ b/coding/coding_tests/huffman_test.cpp @@ -5,6 +5,7 @@ #include "coding/writer.hpp" #include "base/string_utils.hpp" + #include "std/vector.hpp" namespace @@ -13,7 +14,7 @@ vector MakeUniStringVector(vector const & v) { vector result(v.size()); for (size_t i = 0; i < v.size(); ++i) - result[i] = strings::UniString(v[i].begin(), v[i].end()); + result[i] = strings::MakeUniString(v[i]); return result; } @@ -46,6 +47,36 @@ UNIT_TEST(Huffman_OneSymbol) TestDecode(h, 0, 0, 0); } +UNIT_TEST(Huffman_NonAscii) +{ + HuffmanCoder h; + string const data = "2πΩ"; + strings::UniString const uniData = strings::MakeUniString(data); + h.Init(vector{uniData}); + + TestDecode(h, 0, 2, static_cast(uniData[0])); // 00 + TestDecode(h, 1, 1, static_cast(uniData[1])); // 1 + TestDecode(h, 2, 2, static_cast(uniData[2])); // 01 +} + +UNIT_TEST(Huffman_Init) +{ + HuffmanCoder h; + h.Init(MakeUniStringVector(vector{"ab"})); + + vector buf; + buf.push_back(16); // size + buf.push_back(105); // 01101001 + buf.push_back(150); // 10010110 + + MemReader memReader(&buf[0], buf.size()); + ReaderSource reader(memReader); + strings::UniString received = h.ReadAndDecode(reader); + strings::UniString expected = strings::MakeUniString("baababbaabbabaab"); + + TEST_EQUAL(expected, received, ()); +} + UNIT_TEST(Huffman_Serialization_Encoding) { HuffmanCoder hW; diff --git a/coding/huffman.cpp b/coding/huffman.cpp index acdd945303..a8d8a1275a 100644 --- a/coding/huffman.cpp +++ b/coding/huffman.cpp @@ -6,22 +6,14 @@ namespace coding { HuffmanCoder::~HuffmanCoder() { - try - { - DeleteHuffmanTree(m_root); - } - catch (...) - { - LOG(LWARNING, ("Caught an exception when deleting a Huffman tree.")); - } + DeleteHuffmanTree(m_root); } void HuffmanCoder::Init(vector const & data) { + DeleteHuffmanTree(m_root); BuildHuffmanTree(data.begin(), data.end()); BuildTables(m_root, 0); - DeleteHuffmanTree(m_root); - m_root = nullptr; } bool HuffmanCoder::Encode(uint32_t symbol, Code & code) const diff --git a/coding/huffman.hpp b/coding/huffman.hpp index b004a00268..e3526afb82 100644 --- a/coding/huffman.hpp +++ b/coding/huffman.hpp @@ -23,7 +23,7 @@ public: uint32_t bits; uint32_t len; - Code() = default; + Code() : bits(0), len(0) {} Code(uint32_t bits, uint32_t len) : bits(bits), len(len) {} bool operator<(const Code & o) const @@ -124,10 +124,6 @@ public: } private: - // One would need more than 2^32 symbols to build a code that long. - // On the other hand, 32 is short enough for our purposes, so do not - // try to shrink the trees beyond this threshold. - const uint32_t kMaxDepth = 32; struct Node { @@ -160,8 +156,14 @@ private: { Code code; CHECK(Encode(symbol, code), ()); - for (size_t i = 0; i < code.len; ++i) - bitWriter.Write((code.bits >> i) & 1, 1); + size_t fullBytes = code.len / CHAR_BIT; + size_t rem = code.len % CHAR_BIT; + for (size_t i = 0; i < fullBytes; ++i) + { + bitWriter.Write(code.bits & 0xFF, CHAR_BIT); + code.bits >>= CHAR_BIT; + } + bitWriter.Write(code.bits, rem); } template @@ -191,7 +193,7 @@ private: // Builds a fixed Huffman tree for a collection of strings::UniStrings. // UniString is always UTF-32. Every code point is treated as a symbol for the encoder. template - void BuildHuffmanTree(TIter beg, TIter end) + void BuildHuffmanTree(TIter const & beg, TIter const & end) { if (beg == end) { @@ -199,6 +201,11 @@ private: return; } + // One would need more than 2^32 symbols to build a code that long. + // On the other hand, 32 is short enough for our purposes, so do not + // try to shrink the trees beyond this threshold. + uint32_t const kMaxDepth = 32; + map freqs; for (auto it = beg; it != end; ++it) {