From f2d961225399371a657b909829f19be86f6ab925 Mon Sep 17 00:00:00 2001 From: Yury Melnichek Date: Tue, 17 May 2011 20:27:26 +0200 Subject: [PATCH] One more new index format. :) Smaller than previous --- indexer/indexer_tests/interval_index_test.cpp | 148 +++++-- indexer/interval_index.hpp | 200 ++++----- indexer/interval_index_builder.hpp | 378 +++++++++++------- 3 files changed, 447 insertions(+), 279 deletions(-) diff --git a/indexer/indexer_tests/interval_index_test.cpp b/indexer/indexer_tests/interval_index_test.cpp index d8e2a3c0f0..9842fbe361 100644 --- a/indexer/indexer_tests/interval_index_test.cpp +++ b/indexer/indexer_tests/interval_index_test.cpp @@ -20,32 +20,110 @@ struct CellIdFeaturePairForTest }; } +UNIT_TEST(IntervalIndex_LevelCount) +{ + TEST_EQUAL(IntervalIndexBuilder(10, 1, 3).GetLevelCount(), 1, ()); + TEST_EQUAL(IntervalIndexBuilder(11, 1, 3).GetLevelCount(), 1, ()); + TEST_EQUAL(IntervalIndexBuilder(12, 1, 3).GetLevelCount(), 2, ()); + TEST_EQUAL(IntervalIndexBuilder(19, 2, 3).GetLevelCount(), 1, ()); + TEST_EQUAL(IntervalIndexBuilder(19, 1, 3).GetLevelCount(), 4, ()); + TEST_EQUAL(IntervalIndexBuilder(20, 1, 3).GetLevelCount(), 4, ()); +} + +UNIT_TEST(IntervalIndex_SerializedNodeBitmap) +{ + uint32_t const offset = 350; // == 0x15E + uint32_t childSizes[8] = { 0, 0, 0, 10, 0, 0, 1000, 0 }; + char const expSerial [] = + "\xBD\x05" // (350 << 1) + 1 == 701 == 0x2BD - offset encoded as varuint. + "\x48" // (1 << 3) | (1 << 6) == 72 == 0x48 - bitmap. + "\x0A" // 10 - childSizes[3] encoded as varuint. + "\xE8\x07" // 1000 = 0x3E8 - childSizes[6] encoded as varuint. + ""; + vector serializedNode; + MemWriter > writer(serializedNode); + IntervalIndexBuilder(11, 1, 3).WriteNode(writer, offset, childSizes); + TEST_EQUAL(serializedNode, vector(expSerial, expSerial + ARRAY_SIZE(expSerial) - 1), ()); +} + +UNIT_TEST(IntervalIndex_SerializedNodeList) +{ + uint32_t const offset = 350; // == 0x15E + uint32_t childSizes[16] = { 0, 0, 0, 0, 0, 0, 1000, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + char const expSerial [] = + "\xBC\x05" // (350 << 1) + 0 == 700 == 0x2BC - offset encoded as varuint. + "\x06" "\xE8\x07" // 6, 1000 + ""; + vector serializedNode; + MemWriter > writer(serializedNode); + IntervalIndexBuilder(11, 1, 4).WriteNode(writer, offset, childSizes); + TEST_EQUAL(serializedNode, vector(expSerial, expSerial + ARRAY_SIZE(expSerial) - 1), ()); +} + +UNIT_TEST(IntervalIndex_SerializedLeaves) +{ + vector data; + data.push_back(CellIdFeaturePairForTest(0x1537U, 0)); + data.push_back(CellIdFeaturePairForTest(0x1538U, 1)); + data.push_back(CellIdFeaturePairForTest(0x1637U, 2)); + vector serialLeaves; + MemWriter > writer(serialLeaves); + vector sizes; + IntervalIndexBuilder(16, 1, 4).BuildLeaves(writer, data.begin(), data.end(), sizes); + char const expSerial [] = "\x37\x00" "\x38\x02" "\x37\x04"; // 0x1537 0x1538 0x1637 + uint32_t const expSizes [] = { 4, 2 }; + TEST_EQUAL(serialLeaves, vector(expSerial, expSerial + ARRAY_SIZE(expSerial) - 1), ()); + TEST_EQUAL(sizes, vector(expSizes, expSizes + ARRAY_SIZE(expSizes)), ()); +} + +UNIT_TEST(IntervalIndex_SerializedNodes) +{ + vector data; + data.push_back(CellIdFeaturePairForTest(0x1537U, 0)); + data.push_back(CellIdFeaturePairForTest(0x1538U, 1)); + data.push_back(CellIdFeaturePairForTest(0x1637U, 2)); + uint32_t const leavesSizes [] = { 4, 2 }; + vector serialNodes; + MemWriter > writer(serialNodes); + vector sizes; + IntervalIndexBuilder(16, 1, 4).BuildLevel(writer, data.begin(), data.end(), 1, + leavesSizes, leavesSizes + ARRAY_SIZE(leavesSizes), + sizes); + char const expSerial [] = "\x01\x60\x00\x04\x02"; + uint32_t const expSizes [] = { ARRAY_SIZE(expSerial) - 1 }; + TEST_EQUAL(serialNodes, vector(expSerial, expSerial + ARRAY_SIZE(expSerial) - 1), ()); + TEST_EQUAL(sizes, vector(expSizes, expSizes + ARRAY_SIZE(expSizes)), ()); +} + UNIT_TEST(IntervalIndex_Serialized) { vector data; - data.push_back(CellIdFeaturePairForTest(0x21U, 0)); - data.push_back(CellIdFeaturePairForTest(0x22U, 1)); - data.push_back(CellIdFeaturePairForTest(0x41U, 2)); - vector serializedIndex; - MemWriter > writer(serializedIndex); - BuildIntervalIndex(data.begin(), data.end(), writer, 10); + data.push_back(CellIdFeaturePairForTest(0x1537U, 0)); + data.push_back(CellIdFeaturePairForTest(0x1538U, 1)); + data.push_back(CellIdFeaturePairForTest(0x1637U, 2)); + vector serialIndex; + MemWriter > writer(serialIndex); + IntervalIndexBuilder(16, 1, 4).BuildIndex(writer, data.begin(), data.end()); char const expSerial [] = - "\x02\x05" // Header - "\x00\x00\x00\x00" "\x06\x00\x00\x00" // Root - "\x10\x00\x00\x00" "\x06\x00\x00\x00" // 0x21 and 0x22 - "\x0A\x00\x00\x00" "\x02\x00\x00\x00" // 0x41 - "\x03\x00\x00\x00" "\x00\x00\x00\x00" // Dummy last internal node - "\x01" "\x05" "\x09" // (0x21, 0), (0x22, 1), (0x31, 2) + "\x01\x02\x04\x01" // Header + "\x14\x00\x00\x00" // Leaves level offset + "\x1A\x00\x00\x00" // Level 1 offset + "\x1F\x00\x00\x00" // Root level offset + "\x22\x00\x00\x00" // Root level offset + "\x37\x00" "\x38\x02" "\x37\x04" // 0x1537 0x1538 0x1637 + "\x01\x60\x00\x04\x02" // 0x15, 0x16 node + "\x00\x01\x05" // Root ""; - TEST_EQUAL(serializedIndex, vector(expSerial, expSerial + ARRAY_SIZE(expSerial) - 1), ()); + TEST_EQUAL(serialIndex, vector(expSerial, expSerial + ARRAY_SIZE(expSerial) - 1), ()); - MemReader reader(&serializedIndex[0], serializedIndex.size()); + MemReader reader(&serialIndex[0], serialIndex.size()); IntervalIndex index(reader); uint32_t expected [] = {0, 1, 2}; vector values; - index.ForEach(MakeBackInsertFunctor(values), 0, 0xFF); + TEST_EQUAL(index.KeyEnd(), 0x10000, ()); + index.ForEach(MakeBackInsertFunctor(values), 0, 0x10000); TEST_EQUAL(values, vector(expected, expected + ARRAY_SIZE(expected)), ()); } @@ -55,15 +133,16 @@ UNIT_TEST(IntervalIndex_Simple) data.push_back(CellIdFeaturePairForTest(0xA0B1C2D100ULL, 0)); data.push_back(CellIdFeaturePairForTest(0xA0B1C2D200ULL, 1)); data.push_back(CellIdFeaturePairForTest(0xA0B2C2D100ULL, 2)); - vector serializedIndex; - MemWriter > writer(serializedIndex); + vector serialIndex; + MemWriter > writer(serialIndex); BuildIntervalIndex(data.begin(), data.end(), writer, 40); - MemReader reader(&serializedIndex[0], serializedIndex.size()); + MemReader reader(&serialIndex[0], serialIndex.size()); IntervalIndex index(reader); + TEST_EQUAL(index.KeyEnd(), 0x10000000000ULL, ()); { uint32_t expected [] = {0, 1, 2}; vector values; - index.ForEach(MakeBackInsertFunctor(values), 0ULL, 0xFFFFFFFFFFULL); + index.ForEach(MakeBackInsertFunctor(values), 0ULL, index.KeyEnd()); TEST_EQUAL(values, vector(expected, expected + ARRAY_SIZE(expected)), ()); } { @@ -105,10 +184,10 @@ UNIT_TEST(IntervalIndex_Simple) UNIT_TEST(IntervalIndex_Empty) { vector data; - vector serializedIndex; - MemWriter > writer(serializedIndex); + vector serialIndex; + MemWriter > writer(serialIndex); BuildIntervalIndex(data.begin(), data.end(), writer, 40); - MemReader reader(&serializedIndex[0], serializedIndex.size()); + MemReader reader(&serialIndex[0], serialIndex.size()); IntervalIndex index(reader); { vector values; @@ -124,10 +203,10 @@ UNIT_TEST(IntervalIndex_Simple2) data.push_back(CellIdFeaturePairForTest(0xA0B1C2D200ULL, 1)); data.push_back(CellIdFeaturePairForTest(0xA0B1C2D200ULL, 3)); data.push_back(CellIdFeaturePairForTest(0xA0B2C2D200ULL, 2)); - vector serializedIndex; - MemWriter > writer(serializedIndex); + vector serialIndex; + MemWriter > writer(serialIndex); BuildIntervalIndex(data.begin(), data.end(), writer, 40); - MemReader reader(&serializedIndex[0], serializedIndex.size()); + MemReader reader(&serialIndex[0], serialIndex.size()); IntervalIndex index(reader); { uint32_t expected [] = {0, 1, 2, 3}; @@ -143,10 +222,10 @@ UNIT_TEST(IntervalIndex_Simple3) vector data; data.push_back(CellIdFeaturePairForTest(0x0100ULL, 0)); data.push_back(CellIdFeaturePairForTest(0x0200ULL, 1)); - vector serializedIndex; - MemWriter > writer(serializedIndex); + vector serialIndex; + MemWriter > writer(serialIndex); BuildIntervalIndex(data.begin(), data.end(), writer, 40); - MemReader reader(&serializedIndex[0], serializedIndex.size()); + MemReader reader(&serialIndex[0], serialIndex.size()); IntervalIndex index(reader); { uint32_t expected [] = {0, 1}; @@ -162,10 +241,10 @@ UNIT_TEST(IntervalIndex_Simple4) vector data; data.push_back(CellIdFeaturePairForTest(0x01030400ULL, 0)); data.push_back(CellIdFeaturePairForTest(0x02030400ULL, 1)); - vector serializedIndex; - MemWriter > writer(serializedIndex); + vector serialIndex; + MemWriter > writer(serialIndex); BuildIntervalIndex(data.begin(), data.end(), writer, 40); - MemReader reader(&serializedIndex[0], serializedIndex.size()); + MemReader reader(&serialIndex[0], serialIndex.size()); IntervalIndex index(reader); { uint32_t expected [] = {0, 1}; @@ -183,10 +262,10 @@ UNIT_TEST(IntervalIndex_Simple5) data.push_back(CellIdFeaturePairForTest(0xA0B1C2D200ULL, 1)); data.push_back(CellIdFeaturePairForTest(0xA0B1C2D200ULL, 3)); data.push_back(CellIdFeaturePairForTest(0xA0B2C2D200ULL, 2)); - vector serializedIndex; - MemWriter > writer(serializedIndex); + vector serialIndex; + MemWriter > writer(serialIndex); BuildIntervalIndex(data.begin(), data.end(), writer, 40); - MemReader reader(&serializedIndex[0], serializedIndex.size()); + MemReader reader(&serialIndex[0], serialIndex.size()); IntervalIndex index(reader); { uint32_t expected [] = {0, 1, 2, 3}; @@ -196,3 +275,4 @@ UNIT_TEST(IntervalIndex_Simple5) TEST_EQUAL(values, vector(expected, expected + ARRAY_SIZE(expected)), ()); } } + diff --git a/indexer/interval_index.hpp b/indexer/interval_index.hpp index 48c485f09f..35301eb7a4 100644 --- a/indexer/interval_index.hpp +++ b/indexer/interval_index.hpp @@ -1,6 +1,7 @@ #pragma once #include "../coding/endianness.hpp" #include "../coding/byte_stream.hpp" +#include "../coding/reader.hpp" #include "../coding/varint.hpp" #include "../base/assert.hpp" #include "../base/base.hpp" @@ -8,6 +9,7 @@ #include "../base/bitset.hpp" #include "../base/buffer_vector.hpp" #include "../base/macros.hpp" +#include "../std/algorithm.hpp" #include "../std/memcpy.hpp" #include "../std/static_assert.hpp" @@ -17,27 +19,21 @@ public: #pragma pack(push, 1) struct Header { + uint8_t m_Version; uint8_t m_Levels; uint8_t m_BitsPerLevel; + uint8_t m_LeafBytes; }; #pragma pack(pop) - STATIC_ASSERT(sizeof(Header) == 2); + STATIC_ASSERT(sizeof(Header) == 4); - struct Index - { - enum { MAX_BITS_PER_LEVEL = 256 }; - inline uint32_t GetOffset() const { return m_Offset; } - inline uint32_t Bit(uint32_t i) const { return m_Bitset.Bit(i); } - - uint32_t m_Offset; - Bitset m_Bitset; - }; - - static inline uint32_t BitsetSize(uint32_t bitsPerLevel) + static inline uint32_t BitmapSize(uint32_t bitsPerLevel) { ASSERT_GREATER(bitsPerLevel, 3, ()); return 1 << (bitsPerLevel - 3); } + + enum { kVersion = 1 }; }; template @@ -50,29 +46,34 @@ public: public: void Clear() {} private: - friend class IntervalIndex; - vector m_IntervalIndexCache; + // TODO: Add IntervalIndex cache here. }; - explicit IntervalIndex(ReaderT const & reader) - : m_Reader(reader) + explicit IntervalIndex(ReaderT const & reader) : m_Reader(reader) { - m_Reader.Read(0, &m_Header, sizeof(m_Header)); - m_NodeSize = 4 + BitsetSize(m_Header.m_BitsPerLevel); - ReadIndex(sizeof(m_Header), m_Level0Index); + ReaderSource src(reader); + src.Read(&m_Header, sizeof(Header)); + CHECK_EQUAL(m_Header.m_Version, static_cast(kVersion), ()); + if (m_Header.m_Levels != 0) + for (int i = 0; i <= m_Header.m_Levels + 1; ++i) + m_LevelOffsets.push_back(ReadPrimitiveFromSource(src)); + } + + uint64_t KeyEnd() const + { + return 1ULL << (m_Header.m_Levels * m_Header.m_BitsPerLevel + m_Header.m_LeafBytes * 8); } template - void ForEach(F const & f, uint64_t beg, uint64_t end, Query & query) const + void ForEach(F const & f, uint64_t beg, uint64_t end, Query &) const { - ASSERT_LESS(beg, 1ULL << m_Header.m_Levels * m_Header.m_BitsPerLevel, (beg, end)); - ASSERT_LESS_OR_EQUAL(end, 1ULL << m_Header.m_Levels * m_Header.m_BitsPerLevel, (beg, end)); - if (beg != end) + if (m_Header.m_Levels != 0 && beg != end) { - // end is inclusive in ForEachImpl(). - --end; - ForEachImpl(f, beg, end, m_Level0Index, sizeof(m_Header) + m_NodeSize, - (m_Header.m_Levels - 1) * m_Header.m_BitsPerLevel, query); + ASSERT_LESS_OR_EQUAL(beg, KeyEnd(), (end)); + ASSERT_LESS_OR_EQUAL(end, KeyEnd(), (beg)); + --end; // end is inclusive in ForEachImpl(). + ForEachNode(f, beg, end, m_Header.m_Levels, 0, + m_LevelOffsets[m_Header.m_Levels + 1] - m_LevelOffsets[m_Header.m_Levels]); } } @@ -84,78 +85,97 @@ public: } private: + template - void ForEachImpl(F const & f, uint64_t beg, uint64_t end, Index const & index, - uint32_t baseOffset, int skipBits, Query & query) const + void ForEachLeaf(F const & f, uint64_t const beg, uint64_t const end, + uint32_t const offset, uint32_t const size) const { - uint32_t const beg0 = static_cast(beg >> skipBits); - uint32_t const end0 = static_cast(end >> skipBits); - ASSERT_GREATER_OR_EQUAL(skipBits, 0, (beg, end, baseOffset)); - ASSERT_LESS_OR_EQUAL(beg, end, (baseOffset, skipBits)); - ASSERT_LESS_OR_EQUAL(beg0, end0, (beg, end, baseOffset, skipBits)); - ASSERT_LESS(end0, (1 << m_Header.m_BitsPerLevel), (beg0, beg, end, baseOffset, skipBits)); - - if (skipBits > 0) + buffer_vector data(size); + m_Reader.Read(offset, &data[0], size); + ArrayByteSource src(&data[0]); + void const * pEnd = &data[0] + size; + uint32_t value = 0; + while (src.Ptr() < pEnd) { - uint32_t cumCount = 0; - for (uint32_t i = 0; i < beg0; ++i) - cumCount += index.Bit(i); - for (uint32_t i = beg0; i <= end0; ++i) - { - if (index.Bit(i)) - { - uint64_t const levelBytesFF = (1ULL << skipBits) - 1; - uint64_t const b1 = (i == beg0) ? (beg & levelBytesFF) : 0; - uint64_t const e1 = (i == end0) ? (end & levelBytesFF) : levelBytesFF; - - Index index1; - uint32_t const offset = baseOffset + index.GetOffset() + (cumCount * m_NodeSize); - ReadIndex(offset, index1); - ForEachImpl(f, b1, e1, index1, offset + m_NodeSize, - skipBits - m_Header.m_BitsPerLevel, query); - ++cumCount; - } - } - } - else - { - Index nextIndex; - ReadIndex(baseOffset, nextIndex); - uint32_t const begOffset = baseOffset + index.GetOffset(); - uint32_t const endOffset = baseOffset + m_NodeSize + nextIndex.GetOffset(); - ASSERT_LESS(begOffset, endOffset, (beg, end, baseOffset, skipBits)); - buffer_vector data(endOffset - begOffset); - m_Reader.Read(begOffset, &data[0], data.size()); - ArrayByteSource src(&data[0]); - void const * const pEnd = &data[0] + data.size(); - uint32_t key = -1; - uint32_t value = 0; - while (src.Ptr() < pEnd) - { - uint32_t const x = ReadVarUint(src); - int32_t const delta = bits::ZigZagDecode(x >> 1); - ASSERT_GREATER_OR_EQUAL(static_cast(value) + delta, 0, ()); - value += delta; - if (x & 1) - for (++key; key < (1 << m_Header.m_BitsPerLevel) && !index.Bit(key); ++key) ; - ASSERT_LESS(key, 1 << m_Header.m_BitsPerLevel, (key)); - if (key > end0) - break; - if (key >= beg0) - f(value); - } + uint32_t key = 0; + src.Read(&key, m_Header.m_LeafBytes); + key = SwapIfBigEndian(key); + if (key > end) + break; + value += ReadVarInt(src); + if (key >= beg) + f(value); } } - void ReadIndex(uint64_t pos, Index & index) const + template + void ForEachNode(F const & f, uint64_t beg, uint64_t end, int level, + uint32_t offset, uint32_t size) const { - m_Reader.Read(pos, &index, m_NodeSize); - index.m_Offset = SwapIfBigEndian(index.m_Offset); + offset += m_LevelOffsets[level]; + + if (level == 0) + { + ForEachLeaf(f, beg, end, offset, size); + return; + } + + uint8_t const skipBits = (m_Header.m_LeafBytes << 3) + (level - 1) * m_Header.m_BitsPerLevel; + ASSERT_LESS_OR_EQUAL(beg, end, (skipBits)); + + uint64_t const levelBytesFF = (1ULL << skipBits) - 1; + uint32_t const beg0 = static_cast(beg >> skipBits); + uint32_t const end0 = static_cast(end >> skipBits); + ASSERT_LESS(end0, (1 << m_Header.m_BitsPerLevel), (beg, end, skipBits)); + + buffer_vector data(size); + m_Reader.Read(offset, &data[0], size); + ArrayByteSource src(&data[0]); + uint32_t const offsetAndFlag = ReadVarUint(src); + uint32_t childOffset = offsetAndFlag >> 1; + if (offsetAndFlag & 1) + { + // Reading bitmap. + uint8_t const * pBitmap = static_cast(src.Ptr()); + src.Advance(BitmapSize(m_Header.m_BitsPerLevel)); + for (uint32_t i = 0; i <= end0; ++i) + { + if (bits::GetBit(pBitmap, i)) + { + uint32_t childSize = ReadVarUint(src); + if (i >= beg0) + { + uint64_t const beg1 = (i == beg0) ? (beg & levelBytesFF) : 0; + uint64_t const end1 = (i == end0) ? (end & levelBytesFF) : levelBytesFF; + ForEachNode(f, beg1, end1, level - 1, childOffset, childSize); + } + childOffset += childSize; + } + } + ASSERT_EQUAL(static_cast(src.Ptr()) - &data[0], size, \ + (beg, end, offset, size)); + } + else + { + void const * pEnd = &data[0] + size; + while (src.Ptr() < pEnd) + { + uint8_t const i = src.ReadByte(); + if (i > end0) + break; + uint32_t childSize = ReadVarUint(src); + if (i >= beg0) + { + uint64_t const beg1 = (i == beg0) ? (beg & levelBytesFF) : 0; + uint64_t const end1 = (i == end0) ? (end & levelBytesFF) : levelBytesFF; + ForEachNode(f, beg1, end1, level - 1, childOffset, childSize); + } + childOffset += childSize; + } + } } ReaderT m_Reader; Header m_Header; - uint32_t m_NodeSize; - Index m_Level0Index; - int m_CellIdBytes; + buffer_vector m_LevelOffsets; }; diff --git a/indexer/interval_index_builder.hpp b/indexer/interval_index_builder.hpp index cbbeb0857b..853de38f48 100644 --- a/indexer/interval_index_builder.hpp +++ b/indexer/interval_index_builder.hpp @@ -7,195 +7,263 @@ #include "../base/assert.hpp" #include "../base/base.hpp" #include "../base/bits.hpp" -#include "../base/bitset.hpp" #include "../base/logging.hpp" #include "../std/vector.hpp" #include "../std/memcpy.hpp" -namespace impl -{ +// +------------------------------+ +// | Header | +// +------------------------------+ +// | Leaves offset | +// +------------------------------+ +// | Level 1 offset | +// +------------------------------+ +// | ... | +// +------------------------------+ +// | Level N offset | +// +------------------------------+ +// | Leaves data | +// +------------------------------+ +// | Level 1 data | +// +------------------------------+ +// | ... | +// +------------------------------+ +// | Level N data | +// +------------------------------+ -template -void WriteIntervalIndexNode(WriterT & writer, uint64_t offset, uint32_t bitsPerLevel, - Bitset const & bitMask) +class IntervalIndexBuilder { - int const bitsetSize = IntervalIndexBase::BitsetSize(bitsPerLevel); - CHECK_GREATER_OR_EQUAL(offset, writer.Pos() + 4 + bitsetSize, ()); - WriteToSink(writer, static_cast(offset - writer.Pos() - 4 - bitsetSize)); - writer.Write(&bitMask, IntervalIndexBase::BitsetSize(bitsPerLevel)); -} - -template void WriteIntervalIndexLeaf(SinkT & sink, uint32_t bitsPerLevel, - uint64_t prevKey, uint64_t prevValue, - uint64_t key, uint64_t value) -{ - uint64_t const lastBitsZeroMask = (1ULL << bitsPerLevel) - 1; - if ((key & ~lastBitsZeroMask) != (prevKey & ~lastBitsZeroMask)) - prevValue = 0; - - int64_t const delta = static_cast(value) - static_cast(prevValue); - uint64_t const encodedDelta = bits::ZigZagEncode(delta); - uint64_t const code = (encodedDelta << 1) + (key == prevKey ? 0 : 1); - WriteVarUint(sink, code); -} - -inline uint32_t IntervalIndexLeafSize(uint32_t bitsPerLevel, - uint64_t prevKey, uint64_t prevValue, - uint64_t key, uint64_t value) -{ - CountingSink sink; - WriteIntervalIndexLeaf(sink, bitsPerLevel, prevKey, prevValue, key, value); - return sink.GetCount(); -} - -template -bool CheckIntervalIndexInputSequence(CellIdValueIterT const & beg, - CellIdValueIterT const & end, - uint32_t keyBits) -{ - // Check that [beg, end) is sorted and log most populous cell. - if (beg != end) +public: + IntervalIndexBuilder(uint32_t keyBits, uint32_t leafBytes, uint32_t bitsPerLevel = 8) + : m_BitsPerLevel(bitsPerLevel), m_LeafBytes(leafBytes) { - uint32_t count = 0; - uint32_t maxCount = 0; - typename CellIdValueIterT::value_type mostPopulousCell = *beg; - CellIdValueIterT it = beg; - uint64_t prev = it->GetCell(); - for (++it; it != end; ++it) + CHECK_GREATER(leafBytes, 0, ()); + CHECK_LESS(keyBits, 63, ()); + int const nodeKeyBits = keyBits - (m_LeafBytes << 3); + CHECK_GREATER(nodeKeyBits, 0, (keyBits, leafBytes)); + m_Levels = (nodeKeyBits + m_BitsPerLevel - 1) / m_BitsPerLevel; + m_LastBitsMask = (1 << m_BitsPerLevel) - 1; + } + + uint32_t GetLevelCount() const { return m_Levels; } + + template + void BuildIndex(WriterT & writer, CellIdValueIterT const & beg, CellIdValueIterT const & end) + { + CHECK(CheckIntervalIndexInputSequence(beg, end), ()); + + if (beg == end) { - CHECK_GREATER(it->GetCell(), 0, ()); - CHECK_LESS_OR_EQUAL(prev, it->GetCell(), ()); - count = (prev == it->GetCell() ? count + 1 : 0); - if (count > maxCount) + IntervalIndexBase::Header header; + header.m_Version = IntervalIndexBase::kVersion; + header.m_BitsPerLevel = 0; + header.m_Levels = 0; + header.m_LeafBytes = 0; + writer.Write(&header, sizeof(header)); + return; + } + + uint64_t const initialPos = writer.Pos(); + WriteZeroesToSink(writer, sizeof(IntervalIndexBase::Header)); + WriteZeroesToSink(writer, 4 * (m_Levels + 2)); + uint64_t const afterHeaderPos = writer.Pos(); + + vector levelOffset; + { + vector offsets; + levelOffset.push_back(static_cast(writer.Pos())); + BuildLeaves(writer, beg, end, offsets); + levelOffset.push_back(static_cast(writer.Pos())); + for (int i = 1; i <= m_Levels; ++i) { - maxCount = count; - mostPopulousCell = *it; + vector nextOffsets; + BuildLevel(writer, beg, end, i, &offsets[0], &offsets[0] + offsets.size(), nextOffsets); + nextOffsets.swap(offsets); + levelOffset.push_back(static_cast(writer.Pos())); } - prev = it->GetCell(); } - if (maxCount > 0) + + uint64_t const lastPos = writer.Pos(); + writer.Seek(initialPos); + + // Write header. { - LOG(LINFO, ("Most populous cell:", maxCount, - mostPopulousCell.GetCell(), mostPopulousCell.GetFeature())); + IntervalIndexBase::Header header; + header.m_Version = IntervalIndexBase::kVersion; + header.m_BitsPerLevel = m_BitsPerLevel; + header.m_Levels = m_Levels; + header.m_LeafBytes = m_LeafBytes; + writer.Write(&header, sizeof(header)); } - } - for (CellIdValueIterT it = beg; it != end; ++it) - CHECK_LESS(it->GetCell(), 1ULL << keyBits, ()); - return true; -} -} + // Write level offsets. + for (size_t i = 0; i < levelOffset.size(); ++i) + WriteToSink(writer, levelOffset[i]); -// TODO: BuildIntervalIndex() shouldn't rely on indexing cellid-feature pairs. -template -void BuildIntervalIndex(CellIdValueIterT const & beg, CellIdValueIterT const & end, - SinkT & writer, uint8_t const keyBits) -{ - CHECK_LESS(keyBits, 63, ()); - CHECK(impl::CheckIntervalIndexInputSequence(beg, end, keyBits), ()); - - typedef Bitset BitsetType; - uint32_t const bitsPerLevel = 5; - uint32_t const lastBitsMask = (1 << bitsPerLevel) - 1; - uint32_t const nodeSize = 4 + IntervalIndexBase::BitsetSize(bitsPerLevel); - int const levelCount = (keyBits + bitsPerLevel - 1) / bitsPerLevel; - - // Write header. - { - IntervalIndexBase::Header header; - header.m_BitsPerLevel = bitsPerLevel; - header.m_Levels = levelCount; - writer.Write(&header, sizeof(header)); + uint64_t const pos = writer.Pos(); + CHECK_EQUAL(pos, afterHeaderPos, ()); + writer.Seek(lastPos); } - if (beg == end) + template + bool CheckIntervalIndexInputSequence(CellIdValueIterT const & beg, CellIdValueIterT const & end) { - // Write empty index. - CHECK_GREATER(levelCount, 1, ()); - impl::WriteIntervalIndexNode(writer, writer.Pos() + nodeSize, bitsPerLevel, BitsetType()); - LOG(LWARNING, ("Written empty index.")); - return; - } + // Check that [beg, end) is sorted and log most populous cell. + if (beg != end) + { + uint32_t count = 0; + uint32_t maxCount = 0; + typename CellIdValueIterT::value_type mostPopulousCell = *beg; + CellIdValueIterT it = beg; + uint64_t prev = it->GetCell(); + for (++it; it != end; ++it) + { + CHECK_GREATER(it->GetCell(), 0, ()); + CHECK_LESS_OR_EQUAL(prev, it->GetCell(), ()); + count = (prev == it->GetCell() ? count + 1 : 0); + if (count > maxCount) + { + maxCount = count; + mostPopulousCell = *it; + } + prev = it->GetCell(); + } + if (maxCount > 0) + { + LOG(LINFO, ("Most populous cell:", maxCount, + mostPopulousCell.GetCell(), mostPopulousCell.GetFeature())); + } + } - // Write internal nodes. - uint64_t childOffset = writer.Pos() + nodeSize; - uint64_t nextChildOffset = childOffset; - for (int level = levelCount - 1; level >= 0; --level) - { - // LOG(LINFO, ("Building interval index, level", level)); - uint64_t const initialLevelWriterPos = writer.Pos(); - uint64_t totalPopcount = 0; - uint32_t maxPopCount = 0; - uint64_t nodesWritten = 0; - - BitsetType bitMask = BitsetType(); - uint64_t prevKey = 0; - uint64_t prevValue = 0; + uint32_t const keyBits = 8 * m_LeafBytes + m_Levels * m_BitsPerLevel; for (CellIdValueIterT it = beg; it != end; ++it) { - uint64_t const key = it->GetCell() >> (level * bitsPerLevel); - uint32_t const value = it->GetFeature(); - - if (it != beg && (prevKey & ~lastBitsMask) != (key & ~lastBitsMask)) - { - // Write node for the previous parent. - impl::WriteIntervalIndexNode(writer, childOffset, bitsPerLevel, bitMask); - uint32_t const popCount = bitMask.PopCount(); - totalPopcount += popCount; - maxPopCount = max(maxPopCount, popCount); - ++nodesWritten; - childOffset = nextChildOffset; - bitMask = BitsetType(); - } - - bitMask.SetBit(key & lastBitsMask); - - if (level == 0) - nextChildOffset += impl::IntervalIndexLeafSize(bitsPerLevel, - prevKey, prevValue, key, value); - else if (it == beg || prevKey != key) - nextChildOffset += nodeSize; - - prevKey = key; - prevValue = value; + CHECK_LESS(it->GetCell(), 1ULL << keyBits, ()); + CHECK_EQUAL(it->GetFeature(), static_cast(it->GetFeature()), ()); } - // Write the last node. - impl::WriteIntervalIndexNode(writer, childOffset, bitsPerLevel, bitMask); - uint32_t const popCount = bitMask.PopCount(); - totalPopcount += popCount; - maxPopCount = max(maxPopCount, popCount); - ++nodesWritten; - - if (level == 1) - nextChildOffset += nodeSize; - - childOffset = nextChildOffset; - - LOG(LINFO, ("Level:", level, "size:", writer.Pos() - initialLevelWriterPos, \ - "density:", double(totalPopcount) / nodesWritten, "max density:", maxPopCount)); + return true; } - // Write the dummy one-after-last node. - impl::WriteIntervalIndexNode(writer, nextChildOffset, bitsPerLevel, BitsetType()); - - // Write leaves. + template + uint32_t WriteNode(SinkT & sink, uint32_t offset, uint32_t * childSizes) { - uint64_t const initialLevelWriterPos = writer.Pos(); + vector bitmapSerial, listSerial; + bitmapSerial.reserve(1024); + listSerial.reserve(1024); + PushBackByteSink > bitmapSink(bitmapSerial), listSink(listSerial); + WriteBitmapNode(bitmapSink, offset, childSizes); + WriteListNode(listSink, offset, childSizes); + if (bitmapSerial.size() <= listSerial.size()) + { + sink.Write(&bitmapSerial[0], bitmapSerial.size()); + return bitmapSerial.size(); + } + else + { + sink.Write(&listSerial[0], listSerial.size()); + return listSerial.size(); + } + } + template + void BuildLevel(WriterT & writer, CellIdValueIterT const & beg, CellIdValueIterT const & end, + int level, uint32_t const * childSizesBeg, uint32_t const * childSizesEnd, + vector & sizes) + { + UNUSED_VALUE(childSizesEnd); + ASSERT_GREATER(level, 0, ()); + uint32_t const skipBits = m_LeafBytes * 8 + (level - 1) * m_BitsPerLevel; + vector expandedSizes(1 << m_BitsPerLevel); uint64_t prevKey = -1; - uint32_t prevValue = 0; + uint32_t childOffset = 0; + uint32_t nextChildOffset = 0; + for (CellIdValueIterT it = beg; it != end; ++it) + { + uint64_t const key = it->GetCell() >> skipBits; + if (key == prevKey) + continue; + + if (it != beg && (key >> m_BitsPerLevel) != (prevKey >> m_BitsPerLevel)) + { + sizes.push_back(WriteNode(writer, childOffset, &expandedSizes[0])); + childOffset = nextChildOffset; + expandedSizes.assign(expandedSizes.size(), 0); + } + + nextChildOffset += *childSizesBeg; + expandedSizes[key & m_LastBitsMask] += *childSizesBeg; + ++childSizesBeg; + prevKey = key; + } + sizes.push_back(WriteNode(writer, childOffset, &expandedSizes[0])); + ASSERT_EQUAL(childSizesBeg, childSizesEnd, ()); + } + + template + void BuildLeaves(WriterT & writer, CellIdValueIterT const & beg, CellIdValueIterT const & end, + vector & sizes) + { + uint32_t const skipBits = 8 * m_LeafBytes; + uint64_t prevKey = 0; + uint64_t prevValue = 0; + uint64_t prevPos = writer.Pos(); for (CellIdValueIterT it = beg; it != end; ++it) { uint64_t const key = it->GetCell(); - uint32_t const value = it->GetFeature(); - impl::WriteIntervalIndexLeaf(writer, bitsPerLevel, prevKey, prevValue, key, value); + uint64_t const value = it->GetFeature(); + if (it != beg && (key >> skipBits) != (prevKey >> skipBits)) + { + sizes.push_back(static_cast(writer.Pos() - prevPos)); + prevValue = 0; + prevPos = writer.Pos(); + } + uint64_t const keySerial = SwapIfBigEndian(key); + writer.Write(&keySerial, m_LeafBytes); + WriteVarInt(writer, static_cast(value) - static_cast(prevValue)); prevKey = key; prevValue = value; } - - LOG(LINFO, ("Leaves size:", writer.Pos() - initialLevelWriterPos)); + sizes.push_back(static_cast(writer.Pos() - prevPos)); } - LOG(LINFO, ("Interval index building done.")); + template + void WriteBitmapNode(SinkT & sink, uint32_t offset, uint32_t * childSizes) + { + ASSERT_GREATER_OR_EQUAL(m_BitsPerLevel, 3, ()); + WriteVarUint(sink, (offset << 1) + 1); + buffer_vector bitMask(1 << (m_BitsPerLevel - 3)); + for (uint32_t i = 0; i < 1 << m_BitsPerLevel; ++i) + if (childSizes[i]) + bits::SetBitTo1(&bitMask[0], i); + sink.Write(&bitMask[0], bitMask.size()); + for (uint32_t i = 0; i < 1 << m_BitsPerLevel; ++i) + if (childSizes[i]) + WriteVarUint(sink, childSizes[i]); + } + + template + void WriteListNode(SinkT & sink, uint32_t offset, uint32_t * childSizes) + { + ASSERT_LESS_OR_EQUAL(m_BitsPerLevel, 8, ()); + WriteVarUint(sink, (offset << 1)); + for (uint32_t i = 0; i < 1 << m_BitsPerLevel; ++i) + { + if (childSizes[i]) + { + WriteToSink(sink, static_cast(i)); + WriteVarUint(sink, childSizes[i]); + } + } + } + +private: + uint32_t m_Levels, m_BitsPerLevel, m_LeafBytes, m_LastBitsMask; +}; + +template +void BuildIntervalIndex(CellIdValueIterT const & beg, CellIdValueIterT const & end, + WriterT & writer, uint32_t keyBits) +{ + IntervalIndexBuilder(keyBits, 1).BuildIndex(writer, beg, end); }