Add BlobIndexer / BlobStorage: writer and reader for chunk-compressed blob files.

What this patch adds
  * coding/blob_indexer.{hpp,cpp}
      BlobIndexer collects blobs into an in-memory chunk. When the next blob
      would push the chunk past maxUncompressedChunkSize, the chunk is passed
      to the supplied compressor and written to the Writer, followed by its
      uncompressed size. The destructor flushes the last chunk and appends the
      chunk-offset table, the blob-info table and the blob count.
  * coding/blob_storage.{hpp,cpp}
      BlobStorage checks the "Blb" magic, reads the bits-per-offset byte,
      locates both tables from the end of the file, and on each GetBlob() call
      reads and decompresses the single chunk that holds the requested blob.
  * coding/coding_tests/blob_storage_test.cpp
      Round-trip tests that also pin the exact serialized bytes.
  * coding/coding.pro, coding/coding_tests/coding_tests.pro
      Register the new sources, headers and the test.

Reconstruction notes (please verify against the original commit)
  * Template argument lists (function<...>, vector<...>, static_cast<...>,
    ReadPrimitiveFromPos<...>, scoped_ptr<...>, DDVector<...>, MemWriter<...>)
    and the "<...>" payloads in the test byte strings are inferred from the
    call sites and from the byte layout the tests pin.
  * Leading whitespace is reconstructed. The context lines of the two .pro
    hunks may need a whitespace-insensitive apply.
  * The index hashes are those of the original commit and will not match
    this text.

NOTE(review): FlushChunk() skips an empty chunk. If the last blob(s) added are
empty and fall into a fresh chunk (for example a single AddBlob("")), their
blob info references a chunk that is never written, and BlobStorage::Init()
derives a chunk count one larger than the offset table actually stored.

diff --git a/coding/blob_indexer.cpp b/coding/blob_indexer.cpp
new file mode 100644
index 0000000000..05912e6ab4
--- /dev/null
+++ b/coding/blob_indexer.cpp
@@ -0,0 +1,76 @@
+#include "blob_indexer.hpp"
+#include "../coding/byte_stream.hpp"
+#include "../coding/endianness.hpp"
+#include "../coding/varint.hpp"
+#include "../coding/writer.hpp"
+#include "../coding/write_to_sink.hpp"
+#include "../base/assert.hpp"
+#include "../base/base.hpp"
+#include "../base/logging.hpp"
+#include "../std/algorithm.hpp"
+#include "../std/set.hpp"
+#include "../std/string.hpp"
+
+BlobIndexer::BlobIndexer(Writer & writer,
+                         size_t maxUncompressedChunkSize,
+                         function<void (char const *, size_t, string &)> const & compressor) :
+  m_writer(writer),
+  m_maxUncompressedChunkSize(min(int(maxUncompressedChunkSize), (1 << BITS_IN_CHUNK_SIZE) - 1)),
+  m_compressor(compressor),
+  m_totalBlobSizeUncompressed(0),
+  m_maxBlobSize(0),
+  m_largeBlobCount(0)
+{
+  ASSERT_LESS(maxUncompressedChunkSize, (1 << BITS_IN_CHUNK_SIZE), ());
+  CHECK_EQUAL(m_writer.Pos(), 0, ("Writer should not have something written already"));
+
+  // Write header.
+  char const header[] = "Blb";
+  m_writer.Write(header, 3);
+  WriteToSink(m_writer, static_cast<uint8_t>(BITS_IN_CHUNK_SIZE));
+}
+
+uint64_t BlobIndexer::AddBlob(string const & blob)
+{
+  if (blob.size() > m_maxUncompressedChunkSize)
+  {
+    LOG(LINFO, ("Blob bigger than chunk:", m_blobChunkAndOffset.size(), blob.size(),
+                blob.substr(0, 64)));
+    ++m_largeBlobCount;
+  }
+
+  if (m_currentChunk.size() + blob.size() > m_maxUncompressedChunkSize)
+    FlushChunk();
+
+  m_blobChunkAndOffset.push_back(
+      (m_chunkOffset.size() << BITS_IN_CHUNK_SIZE) + m_currentChunk.size());
+
+  m_currentChunk.insert(m_currentChunk.end(), blob.begin(), blob.end());
+
+  return m_blobChunkAndOffset.size() - 1;
+}
+
+void BlobIndexer::FlushChunk()
+{
+  if (!m_currentChunk.empty())
+  {
+    string compressedChunk;
+    m_compressor(m_currentChunk.data(), m_currentChunk.size(), compressedChunk);
+    m_writer.Write(compressedChunk.data(), compressedChunk.size());
+    WriteToSink(m_writer, static_cast<uint32_t>(m_currentChunk.size()));
+    uint32_t const chunkPrevOffset = (m_chunkOffset.empty() ? 0 : m_chunkOffset.back());
+    m_chunkOffset.push_back(compressedChunk.size() + 4 + chunkPrevOffset);
+    m_currentChunk.clear();
+  }
+}
+
+BlobIndexer::~BlobIndexer()
+{
+  FlushChunk();
+
+  for (size_t i = 0; i < m_chunkOffset.size(); ++i)
+    WriteToSink(m_writer, m_chunkOffset[i]);
+  for (size_t i = 0; i < m_blobChunkAndOffset.size(); ++i)
+    WriteToSink(m_writer, m_blobChunkAndOffset[i]);
+  WriteToSink(m_writer, static_cast<uint32_t>(m_blobChunkAndOffset.size()));
+}
diff --git a/coding/blob_indexer.hpp b/coding/blob_indexer.hpp
new file mode 100644
index 0000000000..4aaa247028
--- /dev/null
+++ b/coding/blob_indexer.hpp
@@ -0,0 +1,39 @@
+#pragma once
+#include "../std/function.hpp"
+#include "../std/string.hpp"
+#include "../std/vector.hpp"
+#include "../base/base.hpp"
+
+class Writer;
+
+class BlobIndexer
+{
+public:
+  BlobIndexer(Writer & writer,
+              size_t maxUncompressedChunkSize,
+              function<void (char const *, size_t, string &)> const & compressor);
+  ~BlobIndexer();
+
+  // Add blob and return its id.
+  uint64_t AddBlob(string const & blob);
+
+  void LogStats() const;
+
+private:
+  void FlushChunk();
+
+  Writer & m_writer;
+  size_t const m_maxUncompressedChunkSize;
+  function<void (char const *, size_t, string &)> const m_compressor;
+
+  static uint32_t const BITS_IN_CHUNK_SIZE = 20;
+
+  vector<uint32_t> m_chunkOffset;
+  vector<uint32_t> m_blobChunkAndOffset;
+  vector<char> m_currentChunk;
+
+  // Just for stats.
+  uint64_t m_totalBlobSizeUncompressed;
+  uint32_t m_maxBlobSize;
+  uint32_t m_largeBlobCount;
+};
diff --git a/coding/blob_storage.cpp b/coding/blob_storage.cpp
new file mode 100644
index 0000000000..0838838e98
--- /dev/null
+++ b/coding/blob_storage.cpp
@@ -0,0 +1,92 @@
+#include "blob_storage.hpp"
+#include "reader.hpp"
+
+// File Format:
+// Blobs are grouped together in chunks and then chunks are compressed.
+// nb - number of blobs
+// nc - number of chunks
+//
+// [3| Magic = "Blb"] [1| BITS_IN_CHUNK_SIZE]
+// [*| Chunk 0 ] [*| Chunk 1 ] ... [*| Chunk nc-1]
+// [4| Chunk 1 pos] [4| Chunk 2 pos] ... [4| Pos after the last chunk]
+// [4| Blob info 0] [4| Blob info 1] ... [4| Blob info nb-1]
+// [4| nb]
+// Chunk positions are counted from the end of the 4-byte header (START_OFFSET).
+//
+// Chunk Format:
+// [*| Chunk data]
+// [4| Uncompressed chunk size]
+//
+// Blob Info Format:
+// [ Chunk number ] [Offset in uncompressed chunk]
+// | 32 - BITS_IN_CHUNK_SIZE | | BITS_IN_CHUNK_SIZE |
+
+
+BlobStorage::BlobStorage(Reader const * pReader,
+                         function<void (char const *, size_t, char *, size_t)> decompressor) :
+  m_pReader(pReader), m_decompressor(decompressor)
+{
+  Init();
+}
+
+BlobStorage::~BlobStorage()
+{
+}
+
+void BlobStorage::Init()
+{
+  string header(3, ' ');
+  ReadFromPos(*m_pReader, 0, &header[0], 3);
+  if (header != "Blb")
+    MYTHROW(BlobStorage::OpenException, (header));
+  m_bitsInChunkSize = ReadPrimitiveFromPos<uint8_t>(*m_pReader, 3);
+
+  uint64_t const fileSize = m_pReader->Size();
+  uint32_t const blobCount = ReadPrimitiveFromPos<uint32_t>(*m_pReader, fileSize - 4);
+  m_blobInfo.Init(PolymorphReader(m_pReader->CreateSubReader(
+                                    fileSize - 4 - 4 * blobCount,
+                                    4 * blobCount)));
+  uint32_t const chunkCount =
+      (blobCount > 0 ? (m_blobInfo[blobCount - 1] >> m_bitsInChunkSize) + 1 : 0);
+  m_chunkOffset.Init(PolymorphReader(m_pReader->CreateSubReader(
+                                       fileSize - 4 - 4 * blobCount - 4 * chunkCount,
+                                       4 * chunkCount)));
+}
+
+uint32_t BlobStorage::Size() const
+{
+  return m_blobInfo.size();
+}
+
+uint32_t BlobStorage::GetChunkFromBI(uint32_t blobInfo) const
+{
+  return blobInfo >> m_bitsInChunkSize;
+}
+
+uint32_t BlobStorage::GetOffsetFromBI(uint32_t blobInfo) const
+{
+  return blobInfo & ((1 << m_bitsInChunkSize) - 1);
+}
+
+void BlobStorage::GetBlob(uint32_t i, string & blob) const
+{
+  ASSERT_LESS(i, Size(), ());
+  uint32_t const blobInfo = m_blobInfo[i];
+  uint32_t const chunk = GetChunkFromBI(blobInfo);
+  uint32_t const chunkBeg = (chunk == 0 ? 0 : m_chunkOffset[chunk - 1]);
+  uint32_t const chunkEnd = m_chunkOffset[chunk];
+  vector<char> compressedData(chunkEnd - chunkBeg);
+  ASSERT_GREATER(compressedData.size(), 4, ());
+  m_pReader->Read(START_OFFSET + chunkBeg, &compressedData[0], compressedData.size());
+  uint32_t const decompressedSize = ReadPrimitiveFromPos<uint32_t>(
+      MemReader(&compressedData[0], compressedData.size()), compressedData.size() - 4);
+
+  vector<char> data(decompressedSize);
+  m_decompressor(&compressedData[0], compressedData.size() - 4, &data[0], data.size());
+
+  uint32_t const blobOffset = GetOffsetFromBI(blobInfo);
+  if (i != m_blobInfo.size() - 1 && chunk == GetChunkFromBI(m_blobInfo[i+1]))
+    blob.assign(data.begin() + blobOffset, data.begin() + GetOffsetFromBI(m_blobInfo[i+1]));
+  else
+    blob.assign(data.begin() + blobOffset, data.end());
+}
diff --git a/coding/blob_storage.hpp b/coding/blob_storage.hpp
new file mode 100644
index 0000000000..e1aa0ba194
--- /dev/null
+++ b/coding/blob_storage.hpp
@@ -0,0 +1,42 @@
+#pragma once
+#include "dd_vector.hpp"
+#include "polymorph_reader.hpp"
+#include "../std/function.hpp"
+#include "../std/scoped_ptr.hpp"
+#include "../std/string.hpp"
+#include "../base/base.hpp"
+#include "../base/exception.hpp"
+
+class Reader;
+
+class BlobStorage
+{
+public:
+  DECLARE_EXCEPTION(OpenException, RootException);
+
+  // Takes ownership of pReader and deletes it, even if exception is thrown.
+  BlobStorage(Reader const * pReader,
+              function<void (char const *, size_t, char *, size_t)> decompressor);
+  ~BlobStorage();
+
+  // Get blob by its number, starting from 0.
+  void GetBlob(uint32_t i, string & blob) const;
+
+  // Returns the number of blobs.
+  uint32_t Size() const;
+
+private:
+  void Init();
+
+  uint32_t GetChunkFromBI(uint32_t blobInfo) const;
+  uint32_t GetOffsetFromBI(uint32_t blobInfo) const;
+
+  uint32_t m_bitsInChunkSize;
+  static uint32_t const START_OFFSET = 4;
+
+  scoped_ptr<Reader const> m_pReader;
+  function<void (char const *, size_t, char *, size_t)> m_decompressor;
+
+  DDVector<uint32_t, PolymorphReader> m_blobInfo;
+  DDVector<uint32_t, PolymorphReader> m_chunkOffset;
+};
diff --git a/coding/coding.pro b/coding/coding.pro
index b769bb07b5..fac05657ed 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -28,6 +28,8 @@ SOURCES += \
     mmap_reader.cpp \
     reader_streambuf.cpp \
     reader_writer_ops.cpp \
+    blob_indexer.cpp \
+    blob_storage.cpp \
 
 HEADERS += \
     internal/xmlparser.h \
@@ -83,3 +85,5 @@ HEADERS += \
     reader_streambuf.hpp \
     reader_writer_ops.hpp \
     reader_wrapper.hpp \
+    blob_indexer.hpp \
+    blob_storage.hpp \
diff --git a/coding/coding_tests/blob_storage_test.cpp b/coding/coding_tests/blob_storage_test.cpp
new file mode 100644
index 0000000000..bd1f9b5f3f
--- /dev/null
+++ b/coding/coding_tests/blob_storage_test.cpp
@@ -0,0 +1,108 @@
+#include "../../testing/testing.hpp"
+#include "../blob_storage.hpp"
+#include "../blob_indexer.hpp"
+
+#include "compressor_test_utils.hpp"
+
+#include "../../coding/reader.hpp"
+#include "../../coding/writer.hpp"
+#include "../../base/logging.hpp"
+#include "../../base/macros.hpp"
+#include "../../std/string.hpp"
+#include "../../std/vector.hpp"
+
+namespace
+{
+
+string GetBlob(BlobStorage const & bs, uint32_t i)
+{
+  string blob;
+  bs.GetBlob(i, blob);
+  return blob;
+}
+
+}  // unnamed namespace
+
+UNIT_TEST(BlobIndexerEmptyTest)
+{
+  string serial;
+  {
+    MemWriter<string> writer(serial);
+    BlobIndexer indexer(writer, 20, &coding::TestCompressor);
+  }
+  char const expected[] = "Blb\x14\0\0\0\0";
+  TEST_EQUAL(serial, string(&expected[0], &expected[ARRAY_SIZE(expected)-1]), ());
+  BlobStorage storage(new MemReader(&serial[0], serial.size()), &coding::TestDecompressor);
+}
+
+UNIT_TEST(BlobIndexerSimpleSerialTest)
+{
+  string serial;
+  {
+    MemWriter<string> writer(serial);
+    BlobIndexer indexer(writer, 20, &coding::TestCompressor);
+    indexer.AddBlob("abc");
+  }
+  char const expected[] = "Blb\x14"           // Header
+                          "<abc>\3\0\0\0"     // Chunk 0 with its decompressed size
+                          "\x9\0\0\0"         // Chunk 0 end offset
+                          "\0\0\0\0"          // Blob 0 info
+                          "\1\0\0\0";         // Number of blobs
+  TEST_EQUAL(serial, string(&expected[0], &expected[ARRAY_SIZE(expected)-1]), ());
+  BlobStorage bs(new MemReader(&serial[0], serial.size()), &coding::TestDecompressor);
+  TEST_EQUAL(bs.Size(), 1, ());
+  TEST_EQUAL(GetBlob(bs, 0), "abc", ());
+}
+
+UNIT_TEST(BlobIndexerSerialTest)
+{
+  string serial;
+  {
+    MemWriter<string> writer(serial);
+    BlobIndexer indexer(writer, 5, &coding::TestCompressor);
+    indexer.AddBlob("abc");         // Chunk 0
+    indexer.AddBlob("d");           // Chunk 0
+    indexer.AddBlob("ef");          // Chunk 1
+    indexer.AddBlob("1234567890");  // Chunk 2
+    indexer.AddBlob("0987654321");  // Chunk 3
+    indexer.AddBlob("Hello");       // Chunk 4
+    indexer.AddBlob("World");       // Chunk 5
+    indexer.AddBlob("!");           // Chunk 6
+  }
+  char const expected[] = "Blb\x14"                   // Header
+                          "<abcd>\x4\0\0\0"           // Chunk 0
+                          "<ef>\x2\0\0\0"             // Chunk 1
+                          "<1234567890>\xA\0\0\0"     // Chunk 2
+                          "<0987654321>\xA\0\0\0"     // Chunk 3
+                          "<Hello>\x5\0\0\0"          // Chunk 4
+                          "<World>\x5\0\0\0"          // Chunk 5
+                          "<!>\x1\0\0\0"              // Chunk 6
+                          "\x0A\0\0\0"                // Chunk 0 end pos
+                          "\x12\0\0\0"                // Chunk 1 end pos
+                          "\x22\0\0\0"                // Chunk 2 end pos
+                          "\x32\0\0\0"                // Chunk 3 end pos
+                          "\x3D\0\0\0"                // Chunk 4 end pos
+                          "\x48\0\0\0"                // Chunk 5 end pos
+                          "\x4F\0\0\0"                // Chunk 6 end pos
+                          "\x0\0\x00\0"               // Blob 0 info
+                          "\x3\0\x00\0"               // Blob 1 info
+                          "\x0\0\x10\0"               // Blob 2 info
+                          "\x0\0\x20\0"               // Blob 3 info
+                          "\x0\0\x30\0"               // Blob 4 info
+                          "\x0\0\x40\0"               // Blob 5 info
+                          "\x0\0\x50\0"               // Blob 6 info
+                          "\x0\0\x60\0"               // Blob 7 info
+                          "\x8\0\0\0"                 // Number of blobs
+                          ;
+  TEST_EQUAL(serial, string(&expected[0], ARRAY_SIZE(expected) - 1), ());
+  BlobStorage bs(new MemReader(&serial[0], serial.size()), &coding::TestDecompressor);
+  TEST_EQUAL(bs.Size(), 8, ());
+  TEST_EQUAL(GetBlob(bs, 0), "abc", ());
+  TEST_EQUAL(GetBlob(bs, 1), "d", ());
+  TEST_EQUAL(GetBlob(bs, 2), "ef", ());
+  TEST_EQUAL(GetBlob(bs, 3), "1234567890", ());
+  TEST_EQUAL(GetBlob(bs, 4), "0987654321", ());
+  TEST_EQUAL(GetBlob(bs, 5), "Hello", ());
+  TEST_EQUAL(GetBlob(bs, 6), "World", ());
+  TEST_EQUAL(GetBlob(bs, 7), "!", ());
+}
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro
index 4cc8e33c45..2db011c1aa 100644
--- a/coding/coding_tests/coding_tests.pro
+++ b/coding/coding_tests/coding_tests.pro
@@ -37,7 +37,8 @@ SOURCES += ../../testing/testingmain.cpp \
     file_data_test.cpp \
     zip_reader_test.cpp \
     trie_test.cpp \
-    reader_writer_ops_test.cpp
+    reader_writer_ops_test.cpp \
+    blob_storage_test.cpp \
 
 HEADERS += \
     reader_test.hpp \