[indexer] Build index in one pass

This commit is contained in:
Anatoly Serdtcev 2019-12-20 11:17:51 +03:00 committed by LaGrunge
parent 917387ce20
commit de9a1c60e1
2 changed files with 119 additions and 154 deletions

View file

@ -106,41 +106,6 @@ UNIT_TEST(IntervalIndexV2_SerializedNodeList)
(DebugPrint(serializedNode), DebugPrint(expSerial)));
}
// Serializes three (cell, value) pairs with BuildLeaves() and verifies both the produced bytes
// and the reported per-leaf-node sizes.
UNIT_TEST(IntervalIndex_SerializedLeaves)
{
  // Cells 0x1537 and 0x1538 share the bits above the lowest byte, 0x1637 does not.
  vector<CellIdFeaturePairForTest> data = {
      CellIdFeaturePairForTest(0x1537U, 0),
      CellIdFeaturePairForTest(0x1538U, 1),
      CellIdFeaturePairForTest(0x1637U, 2),
  };

  vector<uint8_t> serialLeaves;
  MemWriter<vector<uint8_t>> writer(serialLeaves);
  vector<uint64_t> sizes;

  // NOTE(review): the constructor arguments are presumably (key bits, leaf bytes, bits per level)
  // - confirm against the IntervalIndexBuilder declaration.
  IntervalIndexBuilder builder(16, 1, 4);
  builder.BuildLeaves(writer, data.begin(), data.end(), sizes);

  // Each entry: the low byte of the cell followed by the encoded value delta.
  char const expSerial [] =
      "\x37\x00"   // 0x1537
      "\x38\x02"   // 0x1538
      "\x37\x04";  // 0x1637
  // Two entries (4 bytes) in the first leaf node, one entry (2 bytes) in the second.
  uint32_t const expSizes [] = { 4, 2 };

  TEST_EQUAL(serialLeaves, vector<uint8_t>(expSerial, expSerial + ARRAY_SIZE(expSerial) - 1), ());
  TEST_EQUAL(sizes, vector<uint64_t>(expSizes, expSizes + ARRAY_SIZE(expSizes)), ());
}
// Builds a single inner level with BuildLevel() on top of two already-sized leaf nodes and
// verifies the serialized node bytes and the reported node sizes.
UNIT_TEST(IntervalIndex_SerializedNodes)
{
  vector<CellIdFeaturePairForTest> data = {
      CellIdFeaturePairForTest(0x1537U, 0),
      CellIdFeaturePairForTest(0x1538U, 1),
      CellIdFeaturePairForTest(0x1637U, 2),
  };

  // Sizes of the child (leaf) nodes the level is built over.
  uint64_t const leavesSizes [] = { 4, 2 };
  uint64_t const * const leavesSizesBeg = leavesSizes;
  uint64_t const * const leavesSizesEnd = leavesSizes + ARRAY_SIZE(leavesSizes);

  vector<uint8_t> serialNodes;
  MemWriter<vector<uint8_t>> writer(serialNodes);
  vector<uint64_t> sizes;

  IntervalIndexBuilder builder(16, 1, 4);
  builder.BuildLevel(writer, data.begin(), data.end(), 1 /* level */, leavesSizesBeg,
                     leavesSizesEnd, sizes);

  // A single node is expected, so its size equals the length of the whole output.
  char const expSerial [] = "\x01\x60\x00\x04\x02";
  uint32_t const expSizes [] = { ARRAY_SIZE(expSerial) - 1 };

  TEST_EQUAL(serialNodes, vector<uint8_t>(expSerial, expSerial + ARRAY_SIZE(expSerial) - 1), ());
  TEST_EQUAL(sizes, vector<uint64_t>(expSizes, expSizes + ARRAY_SIZE(expSizes)), ());
}
UNIT_TEST(IntervalIndex_Serialized)
{
vector<CellIdFeaturePairForTest> data;

View file

@ -6,6 +6,7 @@
#include "coding/endianness.hpp"
#include "coding/varint.hpp"
#include "coding/write_to_sink.hpp"
#include "coding/writer.hpp"
#include "base/assert.hpp"
#include "base/base.hpp"
@ -64,8 +65,6 @@ public:
template <class Writer, typename CellIdValueIter>
void BuildIndex(Writer & writer, CellIdValueIter const & beg, CellIdValueIter const & end)
{
CHECK(CheckIntervalIndexInputSequence(beg, end), ());
if (beg == end)
{
IntervalIndexBase::Header header;
@ -77,24 +76,29 @@ public:
return;
}
m_levelsAssembly.clear();
for (int i = 0; i <= static_cast<int>(m_Levels); ++i)
m_levelsAssembly.emplace_back(*this, i);
uint64_t const initialPos = writer.Pos();
WriteZeroesToSink(writer, sizeof(IntervalIndexBase::Header));
WriteZeroesToSink(writer, (m_version == IntervalIndexVersion::V1 ? 4 : 8) * (m_Levels + 2));
uint64_t const afterHeaderPos = writer.Pos();
std::vector<uint64_t> levelOffset;
levelOffset.push_back(writer.Pos());
BuildAllLevels(writer, beg, end);
levelOffset.push_back(writer.Pos());
// Write levels.
for (int i = 1; i <= static_cast<int>(m_Levels); ++i)
{
std::vector<uint64_t> offsets;
auto const & levelAssembly = m_levelsAssembly[i];
auto const & levelData = levelAssembly.GetLevelData();
writer.Write(levelData.data(), levelData.size());
levelOffset.push_back(writer.Pos());
BuildLeaves(writer, beg, end, offsets);
levelOffset.push_back(writer.Pos());
for (int i = 1; i <= static_cast<int>(m_Levels); ++i)
{
std::vector<uint64_t> nextOffsets;
BuildLevel(writer, beg, end, i, &offsets[0], &offsets[0] + offsets.size(), nextOffsets);
nextOffsets.swap(offsets);
levelOffset.push_back(writer.Pos());
}
}
uint64_t const lastPos = writer.Pos();
@ -127,47 +131,6 @@ public:
writer.Seek(lastPos);
}
// Validates the input sequence [beg, end) before the index is built.
//
// Aborts through CHECK_* when:
//  - cells are not in non-decreasing order;
//  - any cell except the first one is zero;
//  - a cell does not fit into 8 * m_LeafBytes + m_Levels * m_BitsPerLevel bits;
//  - a value exceeds the maximum of int64_t.
// Additionally logs the cell with the longest run of repetitions, if there is any repetition.
// Always returns true, so the call can be wrapped into CHECK().
//
// NOTE(review): the first element is never checked for being greater than zero.
template <typename CellIdValueIter>
bool CheckIntervalIndexInputSequence(CellIdValueIter const & beg, CellIdValueIter const & end)
{
  // Nothing to validate in an empty sequence.
  if (beg == end)
    return true;

  // Pass 1: ordering check and search for the most populous cell.
  // |count| is the number of consecutive repetitions of the current cell (0 for a fresh cell).
  uint64_t count = 0;
  uint64_t maxCount = 0;
  typename CellIdValueIter::value_type mostPopulousCell = *beg;
  CellIdValueIter it = beg;
  uint64_t prev = it->GetCell();
  ++it;
  while (it != end)
  {
    CHECK_GREATER(it->GetCell(), 0, ());
    CHECK_LESS_OR_EQUAL(prev, it->GetCell(), ());

    if (prev == it->GetCell())
      ++count;
    else
      count = 0;

    if (count > maxCount)
    {
      maxCount = count;
      mostPopulousCell = *it;
    }

    prev = it->GetCell();
    ++it;
  }

  if (maxCount > 0)
  {
    LOG(LINFO, ("Most populous cell:", maxCount, mostPopulousCell.GetCell(),
                mostPopulousCell.GetValue()));
  }

  // Pass 2: range checks for every cell and value.
  uint32_t const keyBits = 8 * m_LeafBytes + m_Levels * m_BitsPerLevel;
  for (it = beg; it != end; ++it)
  {
    CHECK_LESS(it->GetCell(), 1ULL << keyBits, ());
    // Values are cast to int64_t in BuildLeaves to store their differences as VarInt.
    CHECK_LESS_OR_EQUAL(it->GetValue(), static_cast<uint64_t>(std::numeric_limits<int64_t>::max()), ());
  }

  return true;
}
template <class SinkT>
uint64_t WriteNode(SinkT & sink, uint64_t offset, uint64_t * childSizes)
{
@ -191,72 +154,6 @@ public:
}
}
// Serializes the inner nodes of level |level| (> 0) into |writer|.
//
// [beg, end) is the sorted input sequence. [childSizesBeg, childSizesEnd) holds the serialized
// sizes of the nodes of the level below, one entry per distinct child key, in input order.
// The serialized size of every node written here is appended to |sizes|.
template <class Writer, typename CellIdValueIter>
void BuildLevel(Writer & writer, CellIdValueIter const & beg, CellIdValueIter const & end,
                int level, uint64_t const * childSizesBeg, uint64_t const * childSizesEnd,
                std::vector<uint64_t> & sizes)
{
  UNUSED_VALUE(childSizesEnd);
  ASSERT_GREATER(level, 0, ());

  // Number of low cell bits that belong to the levels below the children of this level.
  uint32_t const skipBits = m_LeafBytes * 8 + (level - 1) * m_BitsPerLevel;

  // Child sizes of the node being assembled, indexed by the low m_BitsPerLevel bits of the
  // child key; zero means "no such child".
  std::vector<uint64_t> expandedSizes(1 << m_BitsPerLevel);

  uint64_t lastChildKey = std::numeric_limits<uint64_t>::max();
  uint64_t nodeChildOffset = 0;  // Offset of the first child of the node being assembled.
  uint64_t totalChildBytes = 0;  // Sum of the sizes of all children consumed so far.

  auto const flushNode = [&]()
  {
    sizes.push_back(WriteNode(writer, nodeChildOffset, expandedSizes.data()));
  };

  for (CellIdValueIter cur = beg; cur != end; ++cur)
  {
    uint64_t const key = cur->GetCell() >> skipBits;
    if (key != lastChildKey)
    {
      // The child belongs to another parent: finish the current node and start a new one.
      bool const parentChanged = (key >> m_BitsPerLevel) != (lastChildKey >> m_BitsPerLevel);
      if (cur != beg && parentChanged)
      {
        flushNode();
        nodeChildOffset = totalChildBytes;
        expandedSizes.assign(expandedSizes.size(), 0);
      }

      uint64_t const childSize = *childSizesBeg;
      totalChildBytes += childSize;
      CHECK_EQUAL(expandedSizes[key & m_LastBitsMask], 0, ());
      expandedSizes[key & m_LastBitsMask] = childSize;
      ++childSizesBeg;
      lastChildKey = key;
    }
  }

  // The last node is always pending at this point.
  flushNode();
  ASSERT_EQUAL(childSizesBeg, childSizesEnd, ());
}
// Serializes the leaf level for the sorted sequence [beg, end) into |writer|.
//
// Every distinct (cell, value) pair is stored as the m_LeafBytes low bytes of the cell followed
// by the value written with WriteVarInt as a difference from the previous value of the same leaf
// node. A new leaf node starts whenever the cell bits above the low 8 * m_LeafBytes bits change.
// The byte size of every leaf node is appended to |sizes|.
//
// NOTE(review): because the "previous" pair starts as (0, 0), a leading entry with cell 0 and
// value 0 is treated as a duplicate and is not written.
template <class Writer, typename CellIdValueIter>
void BuildLeaves(Writer & writer, CellIdValueIter const & beg, CellIdValueIter const & end,
                 std::vector<uint64_t> & sizes)
{
  using Value = typename CellIdValueIter::value_type::ValueType;

  uint32_t const leafBits = 8 * m_LeafBytes;
  uint64_t lastKey = 0;
  uint64_t lastValue = 0;
  uint64_t nodeStartPos = writer.Pos();

  for (CellIdValueIter cur = beg; cur != end; ++cur)
  {
    uint64_t const key = cur->GetCell();
    Value const value = cur->GetValue();

    // Skip exact repetitions of the previous pair.
    bool const isDuplicate = key == lastKey && value == lastValue;
    if (isDuplicate)
      continue;

    bool const startsNewNode = cur != beg && (key >> leafBits) != (lastKey >> leafBits);
    if (startsNewNode)
    {
      sizes.push_back(writer.Pos() - nodeStartPos);
      // Value differences are restarted from zero in every leaf node.
      lastValue = 0;
      nodeStartPos = writer.Pos();
    }

    // Only the first m_LeafBytes bytes of the byte-swapped (if big endian) key are stored.
    uint64_t const keySerial = SwapIfBigEndianMacroBased(key);
    writer.Write(&keySerial, m_LeafBytes);
    WriteVarInt(writer, static_cast<int64_t>(value) - static_cast<int64_t>(lastValue));

    lastKey = key;
    lastValue = value;
  }

  // Size of the last leaf node.
  sizes.push_back(writer.Pos() - nodeStartPos);
}
template <class SinkT>
void WriteBitmapNode(SinkT & sink, uint64_t offset, uint64_t * childSizes)
{
@ -314,8 +211,111 @@ public:
}
private:
// Assembles the nodes of one index level in an in-memory buffer.
//
// Child nodes are reported one by one through NewChildNode(). When the parent of the reported
// child differs from the parent of the previous one, the finished node is serialized into the
// internal buffer and reported to the owning builder through IntervalIndexBuilder::NewNode().
class LevelAssembly
{
public:
  LevelAssembly(IntervalIndexBuilder & indexBuilder, int level)
    : m_indexBuilder{indexBuilder}
    , m_level{level}
    , m_expandedSizes(1 << indexBuilder.m_BitsPerLevel)
  { }

  // Registers a child node with key |childNodeKey| and serialized size |childNodeSize|.
  // |childNodeKey| must differ from the previously registered key.
  // |last| must be true for the final child of the level: the pending node is then flushed.
  void NewChildNode(uint64_t childNodeKey, uint64_t childNodeSize, bool last = false)
  {
    CHECK(childNodeKey != m_prevChildNodeKey, ());

    auto const bitsPerLevel = m_indexBuilder.m_BitsPerLevel;
    uint64_t const nodeKey = childNodeKey >> bitsPerLevel;
    uint64_t const prevNodeKey = m_prevChildNodeKey >> bitsPerLevel;

    // A non-zero |m_nextChildOffset| means at least one child has already been accumulated.
    bool const hasPendingChildren = m_nextChildOffset != 0;
    if (nodeKey != prevNodeKey && hasPendingChildren)
    {
      FlushNode(prevNodeKey, false /* last */);
      m_childOffset = m_nextChildOffset;
      m_expandedSizes.assign(m_expandedSizes.size(), 0);
    }

    m_nextChildOffset += childNodeSize;

    auto const lastBitsMask = m_indexBuilder.m_LastBitsMask;
    CHECK_EQUAL(m_expandedSizes[childNodeKey & lastBitsMask], 0, ());
    m_expandedSizes[childNodeKey & lastBitsMask] = childNodeSize;
    m_prevChildNodeKey = childNodeKey;

    if (last)
      FlushNode(nodeKey, true /* last */);
  }

  // Returns the bytes of all nodes serialized for this level so far.
  std::vector<char> const & GetLevelData() const
  {
    return *m_buffer;
  }

private:
  // Serializes the node being assembled into the level buffer and reports it to the builder
  // as a node of |m_level| with key |nodeKey|.
  void FlushNode(uint64_t nodeKey, bool last)
  {
    auto const nodeSize = m_indexBuilder.WriteNode(m_writer, m_childOffset, m_expandedSizes.data());
    m_indexBuilder.NewNode(m_level, nodeKey, nodeSize, last);
  }

  IntervalIndexBuilder & m_indexBuilder;
  int m_level;
  uint64_t m_prevChildNodeKey = std::numeric_limits<uint64_t>::max();
  // Offset of the first child of the node being assembled.
  uint64_t m_childOffset{0};
  // Total size of all children registered so far.
  uint64_t m_nextChildOffset{0};
  // Child sizes of the node being assembled, indexed by the low bits of the child key.
  std::vector<uint64_t> m_expandedSizes;
  // |m_buffer| is allocated on the heap because |m_writer| holds a reference to it, and that
  // reference must stay valid wherever this object itself is placed.
  std::unique_ptr<std::vector<char>> m_buffer = std::make_unique<std::vector<char>>();
  MemWriter<std::vector<char>> m_writer{*m_buffer};
};
// Reports a finished node of level |nodeLevel| to the assembly of the level above it, where it
// becomes a child node. Nodes of the topmost level (|nodeLevel| == m_Levels) have no parent
// level and are ignored here.
void NewNode(int nodeLevel, uint64_t nodeKey, uint64_t nodeSize, bool last = false)
{
  bool const isTopLevel = nodeLevel == static_cast<int>(m_Levels);
  if (!isTopLevel)
    m_levelsAssembly[nodeLevel + 1].NewChildNode(nodeKey, nodeSize, last);
}
// Builds the whole index in one pass over the sorted sequence [beg, end).
//
// Leaf entries are written directly into |writer|: the m_LeafBytes low bytes of the cell followed
// by the value stored with WriteVarInt as a difference from the previous value of the same leaf
// node. Every finished leaf node is reported through NewNode() as a node of level 0, so the
// upper levels are assembled in |m_levelsAssembly| while the leaves are being written.
//
// Input is validated on the fly with CHECK_*: cells must be non-zero, fit into
// 8 * m_LeafBytes + m_Levels * m_BitsPerLevel bits and be non-decreasing; values must fit into
// int64_t. Zero is reserved as the "no previous cell" marker, hence the non-zero requirement.
//
// NOTE(review): for an empty range a zero-sized last node would be reported; the visible part of
// BuildIndex returns early when beg == end - confirm there are no other callers.
template <class Writer, typename CellIdValueIter>
void BuildAllLevels(Writer & writer, CellIdValueIter const & beg, CellIdValueIter const & end)
{
  using Value = typename CellIdValueIter::value_type::ValueType;

  uint32_t const keyBits = 8 * m_LeafBytes + m_Levels * m_BitsPerLevel;
  uint32_t const skipBits = 8 * m_LeafBytes;
  uint64_t prevKey = 0;
  uint64_t prevValue = 0;
  uint64_t prevPos = writer.Pos();
  for (CellIdValueIter it = beg; it != end; ++it)
  {
    uint64_t const key = it->GetCell();
    CHECK_GREATER(key, 0, ());
    CHECK_LESS(key, 1ULL << keyBits, ());
    CHECK_GREATER_OR_EQUAL(key, prevKey, ());

    Value const value = it->GetValue();
    // The difference below is computed in int64_t. Both operands must be representable as
    // non-negative int64_t, otherwise the subtraction may overflow a signed type.
    CHECK_LESS_OR_EQUAL(value, static_cast<uint64_t>(std::numeric_limits<int64_t>::max()), ());

    // Skip exact repetitions of the previous (cell, value) pair.
    if (key == prevKey && value == prevValue)
      continue;

    // The bits above the leaf bytes changed: the previous leaf node is complete.
    // |prevKey| is zero only before the first entry, when there is nothing to report yet.
    if ((key >> skipBits) != (prevKey >> skipBits) && prevKey)
    {
      auto const nodeSize = writer.Pos() - prevPos;
      NewNode(0 /* nodeLevel */, prevKey >> skipBits, nodeSize);
      // Value differences are restarted from zero in every leaf node.
      prevValue = 0;
      prevPos = writer.Pos();
    }

    // Only the first m_LeafBytes bytes of the byte-swapped (if big endian) key are stored.
    uint64_t const keySerial = SwapIfBigEndianMacroBased(key);
    writer.Write(&keySerial, m_LeafBytes);
    WriteVarInt(writer, static_cast<int64_t>(value) - static_cast<int64_t>(prevValue));
    prevKey = key;
    prevValue = value;
  }

  // Report the last leaf node; |last| makes every upper level flush its pending node too.
  auto const nodeSize = writer.Pos() - prevPos;
  NewNode(0 /* nodeLevel */, prevKey >> skipBits, nodeSize, true /* last */);
}
IntervalIndexVersion m_version;
uint32_t m_Levels, m_BitsPerLevel, m_LeafBytes, m_LastBitsMask;
std::vector<LevelAssembly> m_levelsAssembly;
};
template <class Writer, typename CellIdValueIter>