From 5fea1a57db9eab92eca9a430f0c3e905030d542a Mon Sep 17 00:00:00 2001
From: Denis Koronchik
Date: Thu, 16 Oct 2014 21:44:21 +0300
Subject: [PATCH] [coding] Add varint vectors

---
 coding/coding.pro                          |   2 +
 coding/coding_tests/coding_tests.pro       |   1 +
 coding/coding_tests/varint_vector_test.cpp | 180 ++++++++++++++++++
 coding/varint_vector.cpp                   | 221 ++++++++++++++++++++++
 coding/varint_vector.hpp                   |  99 ++++++++++
 5 files changed, 503 insertions(+)
 create mode 100644 coding/coding_tests/varint_vector_test.cpp
 create mode 100644 coding/varint_vector.cpp
 create mode 100644 coding/varint_vector.hpp

diff --git a/coding/coding.pro b/coding/coding.pro
index fa85a59d49..9477bde9e8 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -32,6 +32,7 @@ SOURCES += \
     uri.cpp \
     zip_creator.cpp \
     file_name_utils.cpp \
+    varint_vector.cpp \
 
 HEADERS += \
     internal/xmlparser.hpp \
@@ -93,3 +94,4 @@ HEADERS += \
     file_name_utils.hpp \
     constants.hpp \
     matrix_traversal.hpp \
+    varint_vector.hpp \
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro
index 61e8807cf3..3f4db0243c 100644
--- a/coding/coding_tests/coding_tests.pro
+++ b/coding/coding_tests/coding_tests.pro
@@ -45,6 +45,7 @@ SOURCES += ../../testing/testingmain.cpp \
     uri_test.cpp \
     zip_creator_test.cpp \
     file_utils_test.cpp \
+    varint_vector_test.cpp \
 
 HEADERS += \
     reader_test.hpp \
diff --git a/coding/coding_tests/varint_vector_test.cpp b/coding/coding_tests/varint_vector_test.cpp
new file mode 100644
index 0000000000..f82d079adc
--- /dev/null
+++ b/coding/coding_tests/varint_vector_test.cpp
@@ -0,0 +1,180 @@
+#include "../../testing/testing.hpp"
+
+#include "../varint_vector.hpp"
+#include "../writer.hpp"
+#include "../reader.hpp"
+
+#include "../../base/pseudo_random.hpp"
+
+
+using namespace varint;
+
+
+UNIT_TEST(VarintVector_Use)
+{
+  vector<uint8_t> buffer;
+  MemWriter<vector<uint8_t>> writer(buffer);
+
+  vector<uint64_t> g_nums;
+  vector<uint64_t> g_nums_sums;
+
+  uint32_t const c_nums_count = 12345;
+  uint32_t const c_index_tests_count = 50000;
+  uint32_t const c_sum_tests_count = 20000;
+
+  PseudoRNG32 rnd;
+
+  // Generate vector.
+  {
+    uint64_t sum = 0;
+    VectorBuilder builder;
+    for (uint32_t i = 0; i < c_nums_count; ++i)
+    {
+      g_nums_sums.push_back(sum);
+      uint8_t const byte_size = rnd.Generate() % 6 + 1;
+      uint64_t const num = rnd.Generate() & ((uint64_t(1) << (byte_size * 7)) - 1);
+
+      g_nums.push_back(num);
+      builder.AddNum(num);
+      sum += num;
+    }
+
+    TEST_EQUAL(g_nums.size(), c_nums_count, ());
+    TEST_EQUAL(g_nums_sums.size(), c_nums_count, ());
+
+    builder.Finalize(&writer);
+  }
+
+  MemReader reader(buffer.data(), buffer.size());
+
+  // Test sequential access by index.
+  {
+    Vector v(&reader);
+    for (uint32_t i = 0; i < c_nums_count; ++i)
+    {
+      uint32_t serial_pos = 0;
+      uint64_t sum_before = 0;
+      v.FindByIndex(i, serial_pos, sum_before);
+
+      TEST_EQUAL(sum_before, g_nums_sums[i], ());
+      uint64_t num = 0;
+      v.Read(serial_pos, num);
+
+      TEST_EQUAL(g_nums[i], num, ());
+    }
+  }
+
+  // Test random access by index.
+  {
+    Vector v(&reader);
+    for (uint32_t i = 0; i < c_index_tests_count; ++i)
+    {
+      uint64_t const index = rnd.Generate() % g_nums.size();
+
+      uint32_t serial_pos = 0;
+      uint64_t sum_before = 0;
+      v.FindByIndex(index, serial_pos, sum_before);
+
+      TEST_EQUAL(sum_before, g_nums_sums[index], ());
+
+      uint64_t num = 0;
+      v.Read(serial_pos, num);
+      TEST_EQUAL(g_nums[index], num, ());
+    }
+  }
+
+  // Test sequential access by precise sum.
+  {
+    Vector v(&reader);
+    for (uint32_t i = 0; i < c_nums_count-1; ++i)
+    {
+      if (g_nums_sums[i] == g_nums_sums[i + 1])
+        continue;
+
+      uint64_t const sum = g_nums_sums[i];
+
+      uint32_t serial_pos = 0;
+      uint64_t sum_before = 0;
+      uint64_t count_before = 0;
+      v.FindBySum(sum, serial_pos, sum_before, count_before);
+
+      TEST_EQUAL(count_before, i, ());
+      TEST_EQUAL(sum, sum_before, ());
+
+      uint64_t num = 0;
+      v.Read(serial_pos, num);
+      TEST_EQUAL(g_nums[i], num, ());
+    }
+  }
+
+  // Test random access by precise sum.
+  {
+    Vector v(&reader);
+    for (uint32_t i = 0; i < c_sum_tests_count; ++i)
+    {
+      uint64_t index = rnd.Generate() % (g_nums_sums.size() - 2);
+      while (g_nums_sums[index] == g_nums_sums[index + 1])
+      {
+        ++index;
+        TEST_LESS(index+1, g_nums.size(), ());
+      }
+
+      uint64_t const sum = g_nums_sums[index];
+
+      uint32_t serial_pos = 0;
+      uint64_t sum_before = 0;
+      uint64_t count_before = 0;
+      v.FindBySum(sum, serial_pos, sum_before, count_before);
+
+      TEST_EQUAL(count_before, index, ());
+      TEST_EQUAL(sum, sum_before, ());
+
+      uint64_t num = 0;
+      v.Read(serial_pos, num);
+      TEST_EQUAL(g_nums[index], num, ());
+    }
+  }
+
+  // Test random access by intermediate sum.
+  {
+    Vector v(&reader);
+    for (uint32_t i = 0; i < c_sum_tests_count; ++i)
+    {
+      uint64_t index = rnd.Generate() % (g_nums_sums.size() - 2);
+      while (g_nums_sums[index] + 1 >= g_nums_sums[index + 1])
+      {
+        ++index;
+        TEST_LESS(index+1, g_nums_sums.size(), ());
+      }
+
+      uint64_t const sum = (g_nums_sums[index] + g_nums_sums[index + 1]) / 2;
+
+      uint32_t serial_pos = 0;
+      uint64_t sum_before = 0;
+      uint64_t count_before = 0;
+      v.FindBySum(sum, serial_pos, sum_before, count_before);
+
+      TEST_EQUAL(count_before, index, ());
+      TEST_GREATER(sum, sum_before, ());
+      TEST_LESS(sum, g_nums_sums[index + 1], ());
+
+      uint64_t num = 0;
+      v.Read(serial_pos, num);
+      TEST_EQUAL(g_nums[index], num, ());
+    }
+  }
+
+  // Test access by a sum that no element covers (the total of all elements).
+  {
+    Vector v(&reader);
+    uint64_t const total = g_nums_sums.back() + g_nums.back();
+
+    uint32_t serial_pos = 0;
+    uint64_t sum_before = 0;
+    uint64_t count_before = 0;
+    v.FindBySum(total, serial_pos, sum_before, count_before);
+
+    TEST_EQUAL(count_before, g_nums.size(), ());
+    TEST_EQUAL(sum_before, total, ());
+  }
+}
diff --git a/coding/varint_vector.cpp b/coding/varint_vector.cpp
new file mode 100644
index 0000000000..af47dce535
--- /dev/null
+++ b/coding/varint_vector.cpp
@@ -0,0 +1,221 @@
+#include "varint_vector.hpp"
+#include "writer.hpp"
+#include "reader.hpp"
+
+#include "../base/assert.hpp"
+
+
+namespace varint
+{
+
+namespace
+{
+  // Appends |n| to |dst| as a little-endian base-128 varint: 7 payload bits per
+  // byte, the high bit set on every byte except the last one.
+  void VarintEncode(vector<uint8_t> & dst, uint64_t n)
+  {
+    do
+    {
+      uint8_t b = n & 0x7F;
+      n >>= 7;
+      if (n != 0)
+        b |= 0x80;
+      dst.push_back(b);
+    } while (n != 0);
+  }
+
+  // Decodes one varint starting at |offset| and advances |offset| past it.
+  uint64_t VarintDecode(Reader * reader, uint64_t & offset)
+  {
+    uint64_t n = 0;
+    // A uint64_t takes at most 10 groups of 7 bits. Bounding the loop keeps the
+    // shift below 64 (larger shifts are undefined) on malformed input.
+    for (int shift = 0; shift < 64; shift += 7)
+    {
+      uint8_t b = 0;
+      reader->Read(offset, &b, 1);
+      ++offset;
+      n |= uint64_t(b & 0x7F) << shift;
+      if ((b & 0x80) == 0)
+        break;
+    }
+    return n;
+  }
+}
+
+VectorBuilder::VectorBuilder(uint64_t numElemPerTableEntry)
+  : m_numElemPerTableEntry(numElemPerTableEntry), m_numsCount(0), m_sum(0)
+{
+  // AddNum() takes the element count modulo this value.
+  ASSERT(m_numElemPerTableEntry != 0, ());
+}
+
+void VectorBuilder::AddNum(uint64_t num)
+{
+  // Every m_numElemPerTableEntry-th element gets a select table entry that
+  // records where it starts and the sum of everything before it.
+  if (m_numsCount % m_numElemPerTableEntry == 0)
+  {
+    // TableEntry::pos is 32 bits wide, see the size expectation in the header.
+    ASSERT(m_serialNums.size() <= 0xFFFFFFFFu, ());
+
+    TableEntry tableEntry;
+    tableEntry.pos = static_cast<uint32_t>(m_serialNums.size());
+    tableEntry.sum = m_sum;
+    m_selectTable.push_back(tableEntry);
+  }
+  VarintEncode(m_serialNums, num);
+  ++m_numsCount;
+  m_sum += num;
+}
+
+void VectorBuilder::Finalize(Writer * writer)
+{
+  // Layout: varint header (4 values), raw select table, varint-encoded numbers.
+  vector<uint8_t> header;
+  VarintEncode(header, m_numsCount);
+  VarintEncode(header, m_numElemPerTableEntry);
+  VarintEncode(header, m_selectTable.size());
+  VarintEncode(header, m_serialNums.size());
+
+  writer->Write(header.data(), header.size());
+  // NOTE: the table is written as the in-memory image of packed TableEntry
+  // structs, so the byte order of the file follows the host.
+  writer->Write(m_selectTable.data(), m_selectTable.size() * sizeof(TableEntry));
+  writer->Write(m_serialNums.data(), m_serialNums.size());
+}
+
+
+void VectorBuilderDelayedLast::AddLast()
+{
+  if (m_hasLast)
+  {
+    BaseT::AddNum(m_last);
+    m_hasLast = false;
+  }
+}
+
+void VectorBuilderDelayedLast::AddNum(uint64_t num)
+{
+  AddLast();
+
+  m_last = num;
+  m_hasLast = true;
+}
+
+void VectorBuilderDelayedLast::ReplaceLast(uint64_t num)
+{
+  ASSERT(m_hasLast, ());
+  m_last = num;
+}
+
+void VectorBuilderDelayedLast::Finalize(Writer * writer)
+{
+  AddLast();
+
+  BaseT::Finalize(writer);
+}
+
+
+Vector::Vector(Reader * reader)
+  : m_reader(reader), m_numsCount(0), m_numElemPerTableEntry(0), m_numTableEntries(0),
+    m_serialNumsSize(0), m_selectTableOffset(0), m_serialNumsOffset(0)
+{
+  // The header fields are read in the order VectorBuilder::Finalize() writes them.
+  uint64_t parseOffset = 0;
+  m_numsCount = VarintDecode(m_reader, parseOffset);
+  m_numElemPerTableEntry = VarintDecode(m_reader, parseOffset);
+  m_numTableEntries = VarintDecode(m_reader, parseOffset);
+  m_serialNumsSize = VarintDecode(m_reader, parseOffset);
+  m_selectTableOffset = parseOffset;
+  m_serialNumsOffset = m_selectTableOffset + sizeof(TableEntry) * m_numTableEntries;
+}
+
+void Vector::FindByIndex(uint64_t index, uint32_t & serialPos, uint64_t & sumBefore)
+{
+  ASSERT_LESS(index, m_numsCount, ());
+  uint64_t tableEntryIndex = index / m_numElemPerTableEntry;
+
+  ASSERT_LESS(tableEntryIndex, m_numTableEntries, ());
+
+  uint64_t tableEntryOffset = m_selectTableOffset + tableEntryIndex * sizeof(TableEntry);
+  uint64_t indexWithinRange = index % m_numElemPerTableEntry;
+
+  TableEntry tableEntry;
+  m_reader->Read(tableEntryOffset, &tableEntry, sizeof(TableEntry));
+
+  // Walk forward from the table entry, accumulating the skipped numbers.
+  uint64_t sum = tableEntry.sum;
+  uint64_t numOffset = m_serialNumsOffset + tableEntry.pos;
+  for (uint64_t i = 0; i < indexWithinRange; ++i)
+    sum += VarintDecode(m_reader, numOffset);
+
+  serialPos = numOffset - m_serialNumsOffset;
+  sumBefore = sum;
+}
+
+void Vector::FindBySum(uint64_t sum, uint32_t & serialPos, uint64_t & sumBefore, uint64_t & countBefore)
+{
+  // An empty vector has no table entry to start from.
+  if (m_numTableEntries == 0)
+  {
+    serialPos = 0;
+    sumBefore = 0;
+    countBefore = 0;
+    return;
+  }
+
+  // First do binary search over select table to find the biggest
+  // sum that is less or equal to ours.
+  uint64_t l = 0, r = m_numTableEntries;
+  while (r - l > 1)
+  {
+    uint64_t m = (l + r) / 2;
+    uint64_t tableEntryOffset = m_selectTableOffset + m * sizeof(TableEntry);
+
+    TableEntry tableEntry;
+    m_reader->Read(tableEntryOffset, &tableEntry, sizeof(TableEntry));
+    if (sum >= tableEntry.sum)
+      l = m;
+    else
+      r = m;
+  }
+
+  uint64_t tableEntryIndex = l;
+  countBefore = tableEntryIndex * m_numElemPerTableEntry;
+
+  uint64_t tableEntryOffset = m_selectTableOffset + tableEntryIndex * sizeof(TableEntry);
+  TableEntry tableEntry;
+  m_reader->Read(tableEntryOffset, &tableEntry, sizeof(TableEntry));
+
+  uint64_t numsSum = tableEntry.sum;
+  // At this point numsSum <= sum. Scan forward, but never past the last
+  // stored number: |sum| may be at or beyond the total of all elements.
+  uint64_t numOffset = m_serialNumsOffset + tableEntry.pos;
+  while (countBefore < m_numsCount)
+  {
+    uint64_t nextOffset = numOffset;
+    uint64_t num = VarintDecode(m_reader, nextOffset);
+
+    if (numsSum + num > sum)
+      break;
+
+    numOffset = nextOffset;
+    numsSum += num;
+    ++countBefore;
+  }
+
+  serialPos = numOffset - m_serialNumsOffset;
+  sumBefore = numsSum;
+}
+
+void Vector::Read(uint32_t & serialPos, uint64_t & num)
+{
+  ASSERT_LESS(serialPos, m_serialNumsSize, ());
+
+  uint64_t numOffset = m_serialNumsOffset + serialPos;
+  num = VarintDecode(m_reader, numOffset);
+  serialPos = numOffset - m_serialNumsOffset;
+}
+
+}
diff --git a/coding/varint_vector.hpp b/coding/varint_vector.hpp
new file mode 100644
index 0000000000..8201fadb92
--- /dev/null
+++ b/coding/varint_vector.hpp
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "../std/vector.hpp"
+
+
+class Writer;
+class Reader;
+
+namespace varint
+{
+
+// Select table entry. Packed because VectorBuilder::Finalize() writes the
+// table as raw memory and Vector reads it back with sizeof(TableEntry) stride.
+#pragma pack(push, 1)
+struct TableEntry
+{
+  uint32_t pos;  // Offset of the element's varint inside the serialized numbers.
+  uint64_t sum;  // Sum of all elements preceding this one.
+};
+#pragma pack(pop)
+
+
+// Accumulates numbers as varints plus a sparse select table and serializes
+// them with Finalize().
+class VectorBuilder
+{
+protected:
+  // Implicit expectation: total compressed size should be within 4GB.
+  static uint64_t const DEF_NUM_ELEMENTS_PER_TABLE_ENTRY = 1024;
+
+public:
+  // |numElemPerTableEntry| must not be zero.
+  VectorBuilder(uint64_t numElemPerTableEntry = DEF_NUM_ELEMENTS_PER_TABLE_ENTRY);
+
+  void AddNum(uint64_t num);
+  void Finalize(Writer * writer);
+
+protected:
+  uint64_t m_numElemPerTableEntry;
+  uint64_t m_numsCount;
+  uint64_t m_sum;
+  vector<TableEntry> m_selectTable;
+  vector<uint8_t> m_serialNums;
+};
+
+// Builder that holds back the most recently added number so that it can still
+// be changed with ReplaceLast() before it is encoded.
+// NOTE: AddNum() and Finalize() hide the non-virtual base methods, so use this
+// class through its own type, not through a VectorBuilder pointer or reference.
+class VectorBuilderDelayedLast : public VectorBuilder
+{
+  typedef VectorBuilder BaseT;
+  uint64_t m_last;
+  bool m_hasLast;
+
+  void AddLast();
+public:
+  VectorBuilderDelayedLast(uint64_t numElemPerTableEntry = DEF_NUM_ELEMENTS_PER_TABLE_ENTRY)
+    : BaseT(numElemPerTableEntry), m_last(0), m_hasLast(false)
+  {
+  }
+
+  void AddNum(uint64_t num);
+  void ReplaceLast(uint64_t num);
+  void Finalize(Writer * writer);
+
+  bool HasLast() const { return m_hasLast; }
+  // Returns the held-back number; check HasLast() first.
+  uint64_t GetLast() const { return m_last; }
+  uint64_t GetNumsCount() const { return m_numsCount + (m_hasLast ? 1 : 0); }
+};
+
+// Read-only view over data produced by VectorBuilder::Finalize().
+class Vector
+{
+public:
+  Vector(Reader * reader);
+
+  // Finds element |index|: |serialPos| is its position for Read(), |sumBefore|
+  // is the sum of all elements preceding it.
+  void FindByIndex(uint64_t index, uint32_t & serialPos, uint64_t & sumBefore);
+  // Finds the element whose range [sumBefore, sumBefore + value) contains |sum|.
+  // If |sum| is not less than the total of all elements, |countBefore| is the
+  // number of elements and |serialPos| must not be passed to Read().
+  void FindBySum(uint64_t sum, uint32_t & serialPos, uint64_t & sumBefore, uint64_t & countBefore);
+  // Decodes the number at |serialPos| and advances |serialPos| to the next one.
+  void Read(uint32_t & serialPos, uint64_t & num);
+
+private:
+  Reader * m_reader;
+  uint64_t m_numsCount;
+  uint64_t m_numElemPerTableEntry;
+  uint64_t m_numTableEntries;
+  uint64_t m_serialNumsSize;
+  uint64_t m_selectTableOffset;
+  uint64_t m_serialNumsOffset;
+};
+
+}