diff --git a/base/base_tests/bits_test.cpp b/base/base_tests/bits_test.cpp index ecbaa08d8e..310e02ba2e 100644 --- a/base/base_tests/bits_test.cpp +++ b/base/base_tests/bits_test.cpp @@ -20,8 +20,8 @@ UNIT_TEST(Popcount32) { for (uint32_t i = 0; i < 10000; ++i) { - TEST_EQUAL(bits::popcount(i), PopCountSimple(i), (i)); - TEST_EQUAL(bits::popcount(0xC2000000 | i), PopCountSimple(0xC2000000 | i), (0xC2000000 | i)); + TEST_EQUAL(bits::PopCount(i), PopCountSimple(i), (i)); + TEST_EQUAL(bits::PopCount(0xC2000000 | i), PopCountSimple(0xC2000000 | i), (0xC2000000 | i)); } } @@ -36,7 +36,7 @@ UNIT_TEST(PopcountArray32) uint32_t expectedPopCount = 0; for (size_t i = 0; i < v.size(); ++i) expectedPopCount += PopCountSimple(v[i]); - TEST_EQUAL(bits::popcount(v.empty() ? NULL : &v[0], v.size()), expectedPopCount, + TEST_EQUAL(bits::PopCount(v.empty() ? NULL : &v[0], v.size()), expectedPopCount, (j, v.size(), expectedPopCount)); } } diff --git a/base/bits.hpp b/base/bits.hpp index 712c5e0593..cbae1baa4f 100644 --- a/base/bits.hpp +++ b/base/bits.hpp @@ -8,7 +8,7 @@ namespace bits { // Count the number of 1 bits. Implementation: see Hacker's delight book. - inline uint32_t popcount(uint32_t x) + inline uint32_t PopCount(uint32_t x) { x -= ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); @@ -18,14 +18,14 @@ namespace bits return x & 0x3F; } - inline uint32_t popcount(uint8_t x) + inline uint32_t PopCount(uint8_t x) { - return popcount(static_cast(x)); + return PopCount(static_cast(x)); } // Count the number of 1 bits in array p, length n bits. 
// There is a better implementation at hackersdelight.org - inline uint32_t popcount(uint32_t const * p, uint32_t n) + inline uint32_t PopCount(uint32_t const * p, uint32_t n) { uint32_t s = 0; for (uint32_t i = 0; i < n; i += 31) @@ -61,10 +61,15 @@ namespace bits return static_cast(SELECT1_ERROR); } + inline uint32_t PopCount(uint64_t x) + { + uint32_t lower = static_cast(x); + uint32_t higher = static_cast(x >> 32); + return PopCount(lower) + PopCount(higher); + } + // Will be implemented when needed. - uint64_t popcount(uint64_t x); - // Will be implemented when needed. - uint64_t popcount(uint64_t const * p, uint64_t n); + uint64_t PopCount(uint64_t const * p, uint64_t n); template T RoundLastBitsUpAndShiftRight(T x, T bits) { diff --git a/coding/coding.pro b/coding/coding.pro index 237fb51f21..f159e3fb3a 100644 --- a/coding/coding.pro +++ b/coding/coding.pro @@ -52,6 +52,7 @@ HEADERS += \ byte_stream.hpp \ coder.hpp \ coder_util.hpp \ + compressed_bit_vector.hpp \ old_compressed_bit_vector.hpp \ # compressed_varnum_vector.hpp \ constants.hpp \ diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro index 634cb43432..f782173851 100644 --- a/coding/coding_tests/coding_tests.pro +++ b/coding/coding_tests/coding_tests.pro @@ -17,6 +17,7 @@ SOURCES += ../../testing/testingmain.cpp \ bit_streams_test.cpp \ # blob_storage_test.cpp \ coder_util_test.cpp \ + compressed_bit_vector_test.cpp \ old_compressed_bit_vector_test.cpp \ # compressed_varnum_vector_test.cpp \ dd_vector_test.cpp \ diff --git a/coding/coding_tests/compressed_bit_vector_test.cpp b/coding/coding_tests/compressed_bit_vector_test.cpp new file mode 100644 index 0000000000..eae105ef79 --- /dev/null +++ b/coding/coding_tests/compressed_bit_vector_test.cpp @@ -0,0 +1,165 @@ +#include "testing/testing.hpp" + +#include "coding/compressed_bit_vector.hpp" +#include "coding/writer.hpp" + +#include "std/algorithm.hpp" +#include "std/iterator.hpp" + +namespace +{ +void 
CheckIntersection(vector & setBits1, vector & setBits2, + unique_ptr const & cbv) +{ + TEST(cbv.get(), ()); + vector expected; + sort(setBits1.begin(), setBits1.end()); + sort(setBits2.begin(), setBits2.end()); + set_intersection(setBits1.begin(), setBits1.end(), setBits2.begin(), setBits2.end(), + back_inserter(expected)); + TEST_EQUAL(expected.size(), cbv->PopCount(), ()); + for (size_t i = 0; i < expected.size(); ++i) + TEST(cbv->GetBit(expected[i]), ()); +} +} // namespace + +UNIT_TEST(CompressedBitVector_Smoke) {} + +UNIT_TEST(CompressedBitVector_Intersect1) +{ + size_t const n = 100; + vector setBits1; + vector setBits2; + for (size_t i = 0; i < n; ++i) + { + if (i > 0) + setBits1.push_back(i); + if (i + 1 < n) + setBits2.push_back(i); + } + auto cbv1 = coding::CompressedBitVectorBuilder::Build(setBits1); + auto cbv2 = coding::CompressedBitVectorBuilder::Build(setBits2); + TEST(cbv1.get(), ()); + TEST(cbv2.get(), ()); + auto cbv3 = coding::CompressedBitVector::Intersect(*cbv1, *cbv2); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Dense, cbv3->GetStorageStrategy(), ()); + CheckIntersection(setBits1, setBits2, cbv3); +} + +UNIT_TEST(CompressedBitVector_Intersect2) +{ + size_t const n = 100; + vector setBits1; + vector setBits2; + for (size_t i = 0; i < n; ++i) + { + if (i <= n / 2) + setBits1.push_back(i); + if (i >= n / 2) + setBits2.push_back(i); + } + auto cbv1 = coding::CompressedBitVectorBuilder::Build(setBits1); + auto cbv2 = coding::CompressedBitVectorBuilder::Build(setBits2); + TEST(cbv1.get(), ()); + TEST(cbv2.get(), ()); + auto cbv3 = coding::CompressedBitVector::Intersect(*cbv1, *cbv2); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv3->GetStorageStrategy(), ()); + CheckIntersection(setBits1, setBits2, cbv3); +} + +UNIT_TEST(CompressedBitVector_Intersect3) +{ + size_t const n = 100; + vector setBits1; + vector setBits2; + for (size_t i = 0; i < n; ++i) + { + if (i % 2 == 0) + setBits1.push_back(i); + if (i % 3 == 0) 
+ setBits2.push_back(i); + } + auto cbv1 = coding::CompressedBitVectorBuilder::Build(setBits1); + auto cbv2 = coding::CompressedBitVectorBuilder::Build(setBits2); + TEST(cbv1.get(), ()); + TEST(cbv2.get(), ()); + auto cbv3 = coding::CompressedBitVector::Intersect(*cbv1, *cbv2); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv3->GetStorageStrategy(), ()); + for (size_t i = 0; i < n; ++i) + { + bool expected = i % 6 == 0; + TEST_EQUAL(expected, cbv3->GetBit(i), (i)); + } +} + +UNIT_TEST(CompressedBitVector_Intersect4) +{ + size_t const n = 1000; + vector setBits1; + vector setBits2; + for (size_t i = 0; i < n; ++i) + { + if (i % 100 == 0) + setBits1.push_back(i); + if (i % 150 == 0) + setBits2.push_back(i); + } + auto cbv1 = coding::CompressedBitVectorBuilder::Build(setBits1); + auto cbv2 = coding::CompressedBitVectorBuilder::Build(setBits2); + TEST(cbv1.get(), ()); + TEST(cbv2.get(), ()); + auto cbv3 = coding::CompressedBitVector::Intersect(*cbv1, *cbv2); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv3->GetStorageStrategy(), ()); + for (size_t i = 0; i < n; ++i) + { + bool expected = i % 300 == 0; + TEST_EQUAL(expected, cbv3->GetBit(i), (i)); + } +} + +UNIT_TEST(CompressedBitVector_SerializationDense) +{ + int const n = 100; + vector setBits; + for (size_t i = 0; i < n; ++i) + setBits.push_back(i); + vector buf; + { + MemWriter> writer(buf); + auto cbv = coding::CompressedBitVectorBuilder::Build(setBits); + cbv->Serialize(writer); + } + MemReader reader(buf.data(), buf.size()); + ReaderSource src(reader); + auto cbv = coding::CompressedBitVectorBuilder::Deserialize(src); + TEST(cbv.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Dense, cbv->GetStorageStrategy(), ()); + TEST_EQUAL(setBits.size(), cbv->PopCount(), ()); + for (size_t i = 0; i < setBits.size(); ++i) + TEST(cbv->GetBit(setBits[i]), ()); +} + +UNIT_TEST(CompressedBitVector_SerializationSparse) +{ + int const n = 100; + vector setBits; + 
for (size_t i = 0; i < n; ++i) + { + if (i % 10 == 0) + setBits.push_back(i); + } + vector buf; + { + MemWriter> writer(buf); + auto cbv = coding::CompressedBitVectorBuilder::Build(setBits); + cbv->Serialize(writer); + } + MemReader reader(buf.data(), buf.size()); + ReaderSource src(reader); + auto cbv = coding::CompressedBitVectorBuilder::Deserialize(src); + TEST(cbv.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv->GetStorageStrategy(), ()); + TEST_EQUAL(setBits.size(), cbv->PopCount(), ()); + for (size_t i = 0; i < setBits.size(); ++i) + TEST(cbv->GetBit(setBits[i]), ()); +} diff --git a/coding/compressed_bit_vector.cpp b/coding/compressed_bit_vector.cpp new file mode 100644 index 0000000000..5f4336b324 --- /dev/null +++ b/coding/compressed_bit_vector.cpp @@ -0,0 +1,218 @@ +#include "coding/compressed_bit_vector.hpp" +#include "coding/writer.hpp" +#include "coding/write_to_sink.hpp" + +#include "std/algorithm.hpp" + +namespace +{ +unique_ptr IntersectImpl(coding::DenseCBV const & a, + coding::DenseCBV const & b) +{ + size_t sizeA = a.NumBitGroups(); + size_t sizeB = b.NumBitGroups(); + vector resBits; + for (size_t i = 0; i < min(sizeA, sizeB); ++i) + { + uint64_t bitGroup = a.GetBitGroup(i) & b.GetBitGroup(i); + for (size_t j = 0; j < 64; j++) + if (((bitGroup >> j) & 1) > 0) + resBits.push_back(64 * i + j); + } + return coding::CompressedBitVectorBuilder::Build(resBits); +} + +// The intersection of dense and sparse is always sparse. 
+unique_ptr IntersectImpl(coding::DenseCBV const & a, + coding::SparseCBV const & b) +{ + vector resPos; + for (size_t i = 0; i < b.PopCount(); ++i) + { + auto pos = b.Select(i); + if (a.GetBit(pos)) + resPos.push_back(pos); + } + return make_unique(move(resPos)); +} + +unique_ptr IntersectImpl(coding::SparseCBV const & a, + coding::DenseCBV const & b) +{ + return IntersectImpl(b, a); +} + +unique_ptr IntersectImpl(coding::SparseCBV const & a, + coding::SparseCBV const & b) +{ + size_t sizeA = a.PopCount(); + size_t sizeB = b.PopCount(); + vector resPos; + size_t i = 0; + size_t j = 0; + while (i < sizeA && j < sizeB) + { + auto posA = a.Select(i); + auto posB = b.Select(j); + if (posA == posB) + { + resPos.push_back(posA); + ++i; + ++j; + } + else if (posA < posB) + { + ++i; + } + else + { + ++j; + } + } + return make_unique(move(resPos)); +} +} // namespace + +namespace coding +{ +DenseCBV::DenseCBV(vector const & setBits) +{ + if (setBits.empty()) + { + m_bits.resize(0); + m_popCount = 0; + return; + } + uint64_t maxBit = setBits[0]; + for (size_t i = 1; i < setBits.size(); ++i) + maxBit = max(maxBit, setBits[i]); + size_t sz = maxBit / 64 + 1; // bit maxBit lives in group maxBit / 64, so one more group than that index is needed (covers maxBit == 0 and multiples of 64) + m_bits.resize(sz); + m_popCount = static_cast(setBits.size()); + for (uint64_t pos : setBits) + m_bits[pos / 64] |= static_cast(1) << (pos % 64); +} + +uint32_t DenseCBV::PopCount() const { return m_popCount; } + +uint32_t SparseCBV::PopCount() const { return m_positions.size(); } + +bool DenseCBV::GetBit(uint32_t pos) const +{ + uint64_t bitGroup = GetBitGroup(pos / 64); + return ((bitGroup >> (pos % 64)) & 1) > 0; +} + +bool SparseCBV::GetBit(uint32_t pos) const +{ + auto it = lower_bound(m_positions.begin(), m_positions.end(), pos); + return it != m_positions.end() && *it == pos; +} + +CompressedBitVector::StorageStrategy DenseCBV::GetStorageStrategy() const +{ + return CompressedBitVector::StorageStrategy::Dense; +} + +CompressedBitVector::StorageStrategy SparseCBV::GetStorageStrategy() const +{ + return 
CompressedBitVector::StorageStrategy::Sparse; +} + +template +void DenseCBV::ForEach(F && f) const +{ + for (size_t i = 0; i < m_bits.size(); ++i) + for (size_t j = 0; j < 64; ++j) + if (((m_bits[i] >> j) & 1) > 0) + f(64 * i + j); +} + +template +void SparseCBV::ForEach(F && f) const +{ + for (size_t i = 0; i < m_positions.size(); ++i) + f(m_positions[i]); +} + +string DebugPrint(CompressedBitVector::StorageStrategy strat) +{ + switch (strat) + { + case CompressedBitVector::StorageStrategy::Dense: + return "Dense"; + case CompressedBitVector::StorageStrategy::Sparse: + return "Sparse"; + } +} + +void DenseCBV::Serialize(Writer & writer) const +{ + uint8_t header = static_cast(GetStorageStrategy()); + WriteToSink(writer, header); + WriteToSink(writer, static_cast(NumBitGroups())); + for (size_t i = 0; i < NumBitGroups(); ++i) + WriteToSink(writer, GetBitGroup(i)); +} + +void SparseCBV::Serialize(Writer & writer) const +{ + uint8_t header = static_cast(GetStorageStrategy()); + WriteToSink(writer, header); + WriteToSink(writer, PopCount()); + ForEach([&](uint64_t bitPos) + { + WriteToSink(writer, bitPos); + }); +} + +// static +unique_ptr CompressedBitVectorBuilder::Build(vector const & setBits) +{ + if (setBits.empty()) + return make_unique(setBits); + uint64_t maxBit = setBits[0]; + for (size_t i = 1; i < setBits.size(); ++i) + maxBit = max(maxBit, setBits[i]); + // 30% occupied is dense enough + if (10 * setBits.size() >= 3 * maxBit) + return make_unique(setBits); + return make_unique(setBits); +} + +// static +unique_ptr CompressedBitVector::Intersect(CompressedBitVector const & lhs, + CompressedBitVector const & rhs) +{ + auto stratA = lhs.GetStorageStrategy(); + auto stratB = rhs.GetStorageStrategy(); + auto stratDense = CompressedBitVector::StorageStrategy::Dense; + auto stratSparse = CompressedBitVector::StorageStrategy::Sparse; + if (stratA == stratDense && stratB == stratDense) + { + DenseCBV const & a = static_cast(lhs); + DenseCBV const & b = 
static_cast(rhs); + return IntersectImpl(a, b); + } + if (stratA == stratDense && stratB == stratSparse) + { + DenseCBV const & a = static_cast(lhs); + SparseCBV const & b = static_cast(rhs); + return IntersectImpl(a, b); + } + if (stratA == stratSparse && stratB == stratDense) + { + SparseCBV const & a = static_cast(lhs); + DenseCBV const & b = static_cast(rhs); + return IntersectImpl(a, b); + } + if (stratA == stratSparse && stratB == stratSparse) + { + SparseCBV const & a = static_cast(lhs); + SparseCBV const & b = static_cast(rhs); + return IntersectImpl(a, b); + } + + return nullptr; +} +} // namespace coding diff --git a/coding/compressed_bit_vector.hpp b/coding/compressed_bit_vector.hpp new file mode 100644 index 0000000000..b856f9fc72 --- /dev/null +++ b/coding/compressed_bit_vector.hpp @@ -0,0 +1,182 @@ +#include "std/vector.hpp" + +#include "base/assert.hpp" +#include "base/bits.hpp" + +#include "coding/reader.hpp" +#include "coding/writer.hpp" + +#include "std/algorithm.hpp" +#include "std/unique_ptr.hpp" + +#include "base/assert.hpp" + +namespace coding +{ +class CompressedBitVector +{ +public: + enum class StorageStrategy + { + Dense, + Sparse + }; + + virtual ~CompressedBitVector() = default; + + // Executes f for each bit that is set to one using + // the bit's 0-based position as argument. + template + void ForEach(F && f) const; + + // Intersects two bit vectors. + static unique_ptr Intersect(CompressedBitVector const &, + CompressedBitVector const &); + + // Returns the number of set bits (population count). + virtual uint32_t PopCount() const = 0; + + // todo(@pimenov) How long will 32 bits be enough here? + // Would operator[] look better? + virtual bool GetBit(uint32_t pos) const = 0; + + // Returns the strategy used when storing this bit vector. + virtual StorageStrategy GetStorageStrategy() const = 0; + + // Writes the contents of a bit vector to writer. + // The first byte is always the header that defines the format. 
+ // Currently the header is 0 or 1 for Dense and Sparse strategies respectively. + // It is easier to dispatch via virtual method calls and not bother + // with template TWriters here as we do in similar places in our code. + // This should not pose too much of a problem because commonly + // used writers are inherited from Writer anyway. + // todo(@pimenov). Think about rewriting Serialize and Deserialize to use the + // code in old_compressed_bit_vector.{c,h}pp. + virtual void Serialize(Writer & writer) const = 0; +}; + +string DebugPrint(CompressedBitVector::StorageStrategy strat); + +class DenseCBV : public CompressedBitVector +{ +public: + // Builds a dense CBV from a list of positions of set bits. + DenseCBV(vector const & setBits); + + // Builds a dense CBV from a packed bitmap of set bits. + // todo(@pimenov) This behaviour of & and && constructors is extremely error-prone. + DenseCBV(vector && bitMasks) : m_bits(move(bitMasks)) + { + m_popCount = 0; + for (size_t i = 0; i < m_bits.size(); ++i) + m_popCount += bits::PopCount(m_bits[i]); + } + + ~DenseCBV() = default; + + size_t NumBitGroups() const { return m_bits.size(); } + + template + void ForEach(F && f) const; + + uint64_t GetBitGroup(size_t i) const + { + if (i < m_bits.size()) + return m_bits[i]; + return 0; + } + + // CompressedBitVector overrides: + + uint32_t PopCount() const override; + + bool GetBit(uint32_t pos) const override; + + StorageStrategy GetStorageStrategy() const override; + + void Serialize(Writer & writer) const override; + +private: + vector m_bits; + uint32_t m_popCount; +}; + +class SparseCBV : public CompressedBitVector +{ +public: + SparseCBV(vector const & setBits) : m_positions(setBits) + { + ASSERT(is_sorted(m_positions.begin(), m_positions.end()), ()); + } + + SparseCBV(vector && setBits) : m_positions(move(setBits)) + { + ASSERT(is_sorted(m_positions.begin(), m_positions.end()), ()); + } + + ~SparseCBV() = default; + + // Returns the position of the i'th set bit. 
+ uint64_t Select(size_t i) const + { + ASSERT_LESS(i, m_positions.size(), ()); + return m_positions[i]; + } + + template + void ForEach(F && f) const; + + // CompressedBitVector overrides: + + uint32_t PopCount() const override; + + bool GetBit(uint32_t pos) const override; + + StorageStrategy GetStorageStrategy() const override; + + void Serialize(Writer & writer) const override; + +private: + // 0-based positions of the set bits. + vector m_positions; +}; + +class CompressedBitVectorBuilder +{ +public: + // Chooses a strategy to store the bit vector with bits from setBits set to one + // and returns a pointer to a class that fits best. + static unique_ptr Build(vector const & setBits); + + // Reads a bit vector from reader which must contain a valid + // bit vector representation (see CompressedBitVector::Serialize for the format). + template + static unique_ptr Deserialize(TReader & reader) + { + ReaderSource src(reader); + uint8_t header = ReadPrimitiveFromSource(reader); + CompressedBitVector::StorageStrategy strat = + static_cast(header); + switch (strat) + { + case CompressedBitVector::StorageStrategy::Dense: + { + uint32_t numBitGroups = ReadPrimitiveFromSource(reader); + vector bitGroups(numBitGroups); + for (size_t i = 0; i < numBitGroups; ++i) + bitGroups[i] = ReadPrimitiveFromSource(reader); + return make_unique(move(bitGroups)); + } + case CompressedBitVector::StorageStrategy::Sparse: + { + uint32_t numBits = ReadPrimitiveFromSource(reader); + vector setBits(numBits); + for (size_t i = 0; i < numBits; ++i) + setBits[i] = ReadPrimitiveFromSource(reader); + return make_unique(move(setBits)); // setBits is not used afterwards: move it as the Dense branch does instead of copying + } + } + return nullptr; + } +}; +} // namespace coding diff --git a/std/algorithm.hpp b/std/algorithm.hpp index 27a647d675..9e7b7e75f8 100644 --- a/std/algorithm.hpp +++ b/std/algorithm.hpp @@ -8,11 +8,14 @@ using std::all_of; using std::binary_search; +using std::copy; using std::equal; +using std::equal_range; using std::fill; using std::find; -using std::find_if; 
using std::find_first_of; +using std::find_if; +using std::for_each; using std::is_sorted; using std::lexicographical_compare; using std::lower_bound; @@ -20,20 +23,17 @@ using std::max; using std::max_element; using std::min; using std::next_permutation; -using std::sort; -using std::stable_sort; using std::partial_sort; -using std::swap; -using std::upper_bound; -using std::unique; -using std::equal_range; -using std::for_each; -using std::copy; using std::remove_if; using std::replace; using std::reverse; -using std::set_union; using std::set_intersection; +using std::set_union; +using std::sort; +using std::stable_sort; +using std::swap; +using std::unique; +using std::upper_bound; // Bug workaround, see http://connect.microsoft.com/VisualStudio/feedbackdetail/view/840578/algorithm-possible-c-compiler-bug-when-using-std-set-difference-with-custom-comperator #ifdef _MSC_VER namespace vs_bug @@ -81,14 +81,14 @@ OutputIt set_difference(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputI #else using std::set_difference; #endif -using std::set_symmetric_difference; -using std::transform; -using std::push_heap; -using std::pop_heap; -using std::sort_heap; using std::distance; -using std::remove_copy_if; using std::generate; +using std::pop_heap; +using std::push_heap; +using std::remove_copy_if; +using std::set_symmetric_difference; +using std::sort_heap; +using std::transform; #ifdef DEBUG_NEW #define new DEBUG_NEW