forked from organicmaps/organicmaps
Add Trie.
This commit is contained in:
parent
c2b6907ed6
commit
786dd9af11
5 changed files with 552 additions and 59 deletions
|
@ -86,4 +86,5 @@ HEADERS += \
|
|||
zip_reader.hpp \
|
||||
trie.hpp \
|
||||
trie_builder.hpp \
|
||||
trie_reader.hpp \
|
||||
mmap_reader.hpp \
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
#include "../../testing/testing.hpp"
|
||||
#include "../trie.hpp"
|
||||
#include "../trie_builder.hpp"
|
||||
#include "../trie_reader.hpp"
|
||||
#include "../byte_stream.hpp"
|
||||
#include "../write_to_sink.hpp"
|
||||
#include "../../base/logging.hpp"
|
||||
#include "../../std/algorithm.hpp"
|
||||
#include "../../std/scoped_ptr.hpp"
|
||||
#include "../../std/string.hpp"
|
||||
#include "../../std/vector.hpp"
|
||||
#include <boost/utility/binary.hpp>
|
||||
|
||||
|
@ -13,10 +18,68 @@ struct ChildNodeInfo
|
|||
{
|
||||
bool m_isLeaf;
|
||||
uint32_t m_size;
|
||||
char const * m_edge;
|
||||
vector<uint32_t> m_edge;
|
||||
string m_edgeValue;
|
||||
ChildNodeInfo(bool isLeaf, uint32_t size, char const * edge, char const * edgeValue)
|
||||
: m_isLeaf(isLeaf), m_size(size), m_edgeValue(edgeValue)
|
||||
{
|
||||
while (*edge)
|
||||
m_edge.push_back(*edge++);
|
||||
}
|
||||
|
||||
uint32_t Size() const { return m_size; }
|
||||
bool IsLeaf() const { return m_isLeaf; }
|
||||
strings::UniString GetEdge() const { return strings::MakeUniString(m_edge); }
|
||||
uint32_t const * GetEdge() const { return m_edge.data(); }
|
||||
uint32_t GetEdgeSize() const { return m_edge.size(); }
|
||||
void const * GetEdgeValue() const { return m_edgeValue.data(); }
|
||||
uint32_t GetEdgeValueSize() const { return m_edgeValue.size(); }
|
||||
};
|
||||
|
||||
struct KeyValuePair
|
||||
{
|
||||
vector<trie::TrieChar> m_key;
|
||||
uint32_t m_value;
|
||||
|
||||
template <class StringT>
|
||||
KeyValuePair(StringT const & key, int value) : m_key(key.begin(), key.end()), m_value(value) {}
|
||||
|
||||
uint32_t GetKeySize() const { return m_key.size(); }
|
||||
trie::TrieChar const * GetKeyData() const { return m_key.data(); }
|
||||
uint32_t GetValueSize() const { return 4; }
|
||||
void const * GetValueData() const { return &m_value; }
|
||||
|
||||
bool operator == (KeyValuePair const & p) const
|
||||
{
|
||||
return m_key == p.m_key && m_value == p.m_value;
|
||||
}
|
||||
|
||||
bool operator < (KeyValuePair const & p) const
|
||||
{
|
||||
if (m_key != p.m_key)
|
||||
return m_key < p.m_key;
|
||||
return m_value < p.m_value;
|
||||
}
|
||||
};
|
||||
|
||||
// Formats a KeyValuePair as "KVP(<key>, <value>)" for test failure messages.
string debug_print(KeyValuePair const & p)
{
  ostringstream stream;
  stream << "KVP(" << ::debug_print(p.m_key) << ", " << p.m_value << ")";
  return stream.str();
}
|
||||
|
||||
struct KeyValuePairBackInserter
|
||||
{
|
||||
vector<KeyValuePair> m_v;
|
||||
template <class StringT>
|
||||
void operator() (StringT const & s,
|
||||
trie::reader::FixedSizeValueReader<4>::ValueType const & rawValue)
|
||||
{
|
||||
uint32_t value;
|
||||
memcpy(&value, &rawValue, 4);
|
||||
m_v.push_back(KeyValuePair(s, value));
|
||||
}
|
||||
};
|
||||
|
||||
} // unnamed namespace
|
||||
|
@ -29,37 +92,98 @@ UNIT_TEST(TrieBuilder_WriteNode_Smoke)
|
|||
PushBackByteSink<vector<uint8_t> > sink(serial);
|
||||
ChildNodeInfo children[] =
|
||||
{
|
||||
{true, 1, "1A"},
|
||||
{false, 2, "B"},
|
||||
{false, 3, "zz"},
|
||||
{true, 4, "abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij"}
|
||||
ChildNodeInfo(true, 1, "1A", "i1"),
|
||||
ChildNodeInfo(false, 2, "B", "ii2"),
|
||||
ChildNodeInfo(false, 3, "zz", ""),
|
||||
ChildNodeInfo(true, 4,
|
||||
"abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij", "i4"),
|
||||
ChildNodeInfo(true, 5, "a", "5z")
|
||||
};
|
||||
trie::builder::WriteNode(sink, 0, 3, "123", 3,
|
||||
&children[0], &children[0] + ARRAY_SIZE(children));
|
||||
unsigned char const expected [] =
|
||||
{
|
||||
BOOST_BINARY(11000100), // Header: [0b11] [0b000100]
|
||||
BOOST_BINARY(11000101), // Header: [0b11] [0b000101]
|
||||
3, // Number of values
|
||||
'1', '2', '3', // Values
|
||||
1, // Child 1: size
|
||||
BOOST_BINARY(10000010), // Child 1: header: [+leaf] [-supershort] [2 symbols]
|
||||
BOOST_BINARY(10000001), // Child 1: header: [+leaf] [-supershort] [2 symbols]
|
||||
ZENC('1'), ZENC('A' - '1'), // Child 1: edge
|
||||
2, // Child 2: size
|
||||
'i', '1', // Child 1: intermediate data
|
||||
1, // Child 1: size
|
||||
64 | ZENC('B' - '1'), // Child 2: header: [-leaf] [+supershort]
|
||||
3, // Child 3: size
|
||||
BOOST_BINARY(00000010), // Child 3: header: [-leaf] [-supershort] [2 symbols]
|
||||
'i', 'i', '2', // Child 2: intermediate data
|
||||
2, // Child 2: size
|
||||
BOOST_BINARY(00000001), // Child 3: header: [-leaf] [-supershort] [2 symbols]
|
||||
ZENC('z' - 'B'), 0, // Child 3: edge
|
||||
4, // Child 4: size
|
||||
3, // Child 3: size
|
||||
BOOST_BINARY(10111111), // Child 4: header: [+leaf] [-supershort] [>= 63 symbols]
|
||||
70, // Child 4: edge size
|
||||
69, // Child 4: edgeSize - 1
|
||||
ZENC('a' - 'z'), 2,2,2,2,2,2,2,2,2, // Child 4: edge
|
||||
ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge
|
||||
ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge
|
||||
ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge
|
||||
ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge
|
||||
ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge
|
||||
ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2 // Child 4: edge
|
||||
ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge
|
||||
'i', '4', // Child 4: intermediate data
|
||||
4, // Child 4: size
|
||||
BOOST_BINARY(11000000) | ZENC(0), // Child 5: header: [+leaf] [+supershort]
|
||||
'5', 'z' // Child 5: intermediate data
|
||||
};
|
||||
|
||||
TEST_EQUAL(serial, vector<uint8_t>(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ());
|
||||
}
|
||||
|
||||
// Round-trip test for the trie builder and reader. For every non-decreasing choice of up to
// three keys (index -1 means "no key") from all strings over {'A', 'B', 'C'} of length
// 0..maxLen, it builds a trie, reverses the serialized bytes, reads the trie back and checks
// that the enumerated key-value pairs equal the input.
UNIT_TEST(TrieBuilder_Build)
{
  int const base = 3;
  int const maxLen = 3;

  vector<string> possibleStrings(1, string());
  // Number of strings of the current length (base^len), tracked in integers so that the loop
  // bound does not go through floating-point pow() on every iteration.
  int stringCount = 1;
  for (int len = 1; len <= maxLen; ++len)
  {
    stringCount *= base;
    for (int i = 0; i < stringCount; ++i)
    {
      string s(len, 'A');
      int t = i;
      // Write i in base-|base| notation, most significant digit first.
      for (int l = len - 1; l >= 0; --l, t /= base)
        s[l] += (t % base);
      possibleStrings.push_back(s);
    }
  }
  sort(possibleStrings.begin(), possibleStrings.end());
  // LOG(LINFO, (possibleStrings));

  int const count = static_cast<int>(possibleStrings.size());
  for (int i0 = -1; i0 < count; ++i0)
    for (int i1 = i0; i1 < count; ++i1)
      for (int i2 = i1; i2 < count; ++i2)
      {
        // Indices are non-decreasing and possibleStrings is sorted, so v is sorted too.
        vector<KeyValuePair> v;
        if (i0 >= 0) v.push_back(KeyValuePair(possibleStrings[i0], i0));
        if (i1 >= 0) v.push_back(KeyValuePair(possibleStrings[i1], i1));
        if (i2 >= 0) v.push_back(KeyValuePair(possibleStrings[i2], i2));

        // Keys as strings; only consumed by the commented-out LOG below.
        vector<string> vs;
        for (size_t i = 0; i < v.size(); ++i)
          vs.push_back(string(v[i].m_key.begin(), v[i].m_key.end()));

        vector<uint8_t> serial;
        PushBackByteSink<vector<uint8_t> > sink(serial);
        trie::Build(sink, v.begin(), v.end());
        // The builder output is reversed before it is handed to the reader.
        reverse(serial.begin(), serial.end());
        // LOG(LINFO, (serial.size(), vs));

        MemReader memReader = MemReader(serial.data(), serial.size());
        typedef trie::Iterator<
            trie::reader::FixedSizeValueReader<4>::ValueType,
            trie::reader::EmptyValueReader::ValueType
            > IteratorType;
        scoped_ptr<IteratorType> root(trie::reader::ReadTrie(memReader,
                                                            trie::reader::FixedSizeValueReader<4>(),
                                                            trie::reader::EmptyValueReader()));

        KeyValuePairBackInserter f;
        trie::ForEachRef(*root, f, vector<trie::TrieChar>());
        sort(f.m_v.begin(), f.m_v.end());
        TEST_EQUAL(v, f.m_v, ());
      }
}
|
||||
|
|
|
@ -1,2 +1,79 @@
|
|||
#pragma once
|
||||
#include "../base/assert.hpp"
|
||||
#include "../base/base.hpp"
|
||||
#include "../base/buffer_vector.hpp"
|
||||
#include "../std/scoped_ptr.hpp"
|
||||
|
||||
namespace trie
|
||||
{
|
||||
|
||||
// Character type used for trie keys and edge labels.
typedef uint32_t TrieChar;

// Default base char. 95 is a good value because both small and capital latin letters differ
// from it by less than 32 in either direction, so each of them can fit into a supershort edge.
static TrieChar const DEFAULT_CHAR = 95;
|
||||
|
||||
template <typename ValueT, typename EdgeValueT>
|
||||
class Iterator
|
||||
{
|
||||
public:
|
||||
struct Edge
|
||||
{
|
||||
buffer_vector<TrieChar, 8> m_str;
|
||||
EdgeValueT m_value;
|
||||
};
|
||||
|
||||
buffer_vector<Edge, 8> m_edge;
|
||||
buffer_vector<ValueT, 2> m_value;
|
||||
|
||||
virtual ~Iterator() {}
|
||||
|
||||
virtual Iterator<ValueT, EdgeValueT> * GoToEdge(uint32_t i) const = 0;
|
||||
};
|
||||
|
||||
namespace reader
|
||||
{
|
||||
|
||||
// Value reader for tries that store no data in this slot: reads nothing from the source and
// always produces 0.
struct EmptyValueReader
{
  typedef unsigned char ValueType;

  template <typename SourceT>
  void operator() (SourceT &, ValueType & value) const
  {
    value = ValueType();
  }
};
|
||||
|
||||
// Value reader for values that occupy exactly N bytes: copies N raw bytes from the source.
template <unsigned int N>
struct FixedSizeValueReader
{
  // Raw, uninterpreted bytes of one value.
  struct ValueType
  {
    unsigned char m_data[N];
  };

  template <typename SourceT>
  void operator() (SourceT & src, ValueType & value) const
  {
    unsigned char * const dst = value.m_data;
    src.Read(dst, N);
  }
};
|
||||
|
||||
} // namespace trie::reader
|
||||
|
||||
template <typename ValueT, typename EdgeValueT, typename F, typename StringT>
|
||||
void ForEachRef(Iterator<ValueT, EdgeValueT> const & iter, F & f, StringT const & s)
|
||||
{
|
||||
for (size_t i = 0; i < iter.m_value.size(); ++i)
|
||||
f(s, iter.m_value[i]);
|
||||
for (size_t i = 0; i < iter.m_edge.size(); ++i)
|
||||
{
|
||||
StringT s1(s);
|
||||
s1.insert(s1.end(), iter.m_edge[i].m_str.begin(), iter.m_edge[i].m_str.end());
|
||||
scoped_ptr<Iterator<ValueT, EdgeValueT> > pIter1(iter.GoToEdge(i));
|
||||
ForEachRef(*pIter1, f, s1);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace trie
|
||||
|
|
|
@ -1,22 +1,55 @@
|
|||
#pragma once
|
||||
#include "../coding/byte_stream.hpp"
|
||||
#include "../coding/varint.hpp"
|
||||
#include "../base/string_utils.hpp"
|
||||
#include "../base/buffer_vector.hpp"
|
||||
#include "../std/algorithm.hpp"
|
||||
|
||||
// Trie format:
|
||||
// [1: header]
|
||||
// [node] ... [node]
|
||||
|
||||
// Nodes are written in post-order (first child, last child, parent). Contents of nodes are written
|
||||
// reversed. The resulting file should be reversed before use! Then its contents will appear in
|
||||
// pre-order alphabetically reversed (parent, last child, first child).
|
||||
|
||||
// Leaf node format:
|
||||
// [value] ... [value]
|
||||
|
||||
// Internal node format:
|
||||
// [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)]
|
||||
// [vu valueCount]: if valueCount in header == 3
|
||||
// [vu childCount]: if childCount in header == 63
|
||||
// [value] ... [value]
|
||||
// [childInfo] ... [childInfo]
|
||||
|
||||
// Child info format:
|
||||
// Every char of the edge is encoded as varint difference from the previous char. First char is
|
||||
// encoded as varint difference from the base char, which is the last char of the current prefix.
|
||||
//
|
||||
// [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)]
|
||||
// [vu edgeLen-1]: if edgeLen-1 in header == 63
|
||||
// [vi edgeChar0 - baseChar]
|
||||
// [vi edgeChar1 - edgeChar0]
|
||||
// ...
|
||||
// [vi edgeCharN - edgeCharN-1]
|
||||
// [edge value]
|
||||
// [child size]: if the child is not the first one (last one when reading)
|
||||
|
||||
namespace trie
|
||||
{
|
||||
namespace builder
|
||||
{
|
||||
|
||||
template <typename SinkT, typename ChildIterT>
|
||||
void WriteNode(SinkT & sink, strings::UniChar baseChar,
|
||||
uint32_t const valueCount, void const * valuesData, uint32_t const valuesSize,
|
||||
ChildIterT const begChild, ChildIterT const endChild)
|
||||
void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
|
||||
void const * const valuesDataSize, uint32_t const valuesSize,
|
||||
ChildIterT const begChild, ChildIterT const endChild,
|
||||
bool isRoot = false)
|
||||
{
|
||||
if (begChild == endChild)
|
||||
if (begChild == endChild && !isRoot)
|
||||
{
|
||||
// Leaf node.
|
||||
sink.Write(valuesData, valuesSize);
|
||||
sink.Write(valuesDataSize, valuesSize);
|
||||
return;
|
||||
}
|
||||
uint32_t const childCount = endChild - begChild;
|
||||
|
@ -26,15 +59,16 @@ void WriteNode(SinkT & sink, strings::UniChar baseChar,
|
|||
WriteVarUint(sink, valueCount);
|
||||
if (childCount >= 63)
|
||||
WriteVarUint(sink, childCount);
|
||||
sink.Write(valuesData, valuesSize);
|
||||
for (ChildIterT it = begChild; it != endChild; ++it)
|
||||
sink.Write(valuesDataSize, valuesSize);
|
||||
for (ChildIterT it = begChild; it != endChild; /*++it*/)
|
||||
{
|
||||
WriteVarUint(sink, it->Size());
|
||||
uint8_t header = (it->IsLeaf() ? 128 : 0);
|
||||
strings::UniString const edge = it->GetEdge();
|
||||
CHECK(!edge.empty(), ());
|
||||
TrieChar const * const edge = it->GetEdge();
|
||||
uint32_t const edgeSize = it->GetEdgeSize();
|
||||
CHECK_NOT_EQUAL(edgeSize, 0, ());
|
||||
CHECK_LESS(edgeSize, 100000, ());
|
||||
uint32_t const diff0 = bits::ZigZagEncode(int32_t(edge[0] - baseChar));
|
||||
if (edge.size() == 1 && (diff0 & ~63U) == 0)
|
||||
if (edgeSize == 1 && (diff0 & ~63U) == 0)
|
||||
{
|
||||
header |= 64;
|
||||
header |= diff0;
|
||||
|
@ -42,90 +76,143 @@ void WriteNode(SinkT & sink, strings::UniChar baseChar,
|
|||
}
|
||||
else
|
||||
{
|
||||
if (edge.size() < 63)
|
||||
if (edgeSize - 1 < 63)
|
||||
{
|
||||
header |= edge.size();
|
||||
header |= edgeSize - 1;
|
||||
WriteToSink(sink, header);
|
||||
}
|
||||
else
|
||||
{
|
||||
header |= 63;
|
||||
WriteToSink(sink, header);
|
||||
WriteVarUint(sink, static_cast<uint32_t>(edge.size()));
|
||||
WriteVarUint(sink, edgeSize - 1);
|
||||
}
|
||||
for (size_t i = 0; i < edge.size(); ++i)
|
||||
for (size_t i = 0; i < edgeSize; ++i)
|
||||
{
|
||||
WriteVarInt(sink, int32_t(edge[i] - baseChar));
|
||||
baseChar = edge[i];
|
||||
}
|
||||
}
|
||||
baseChar = edge[0];
|
||||
sink.Write(it->GetEdgeValue(), it->GetEdgeValueSize());
|
||||
|
||||
uint32_t const childSize = it->Size();
|
||||
if (++it != endChild)
|
||||
WriteVarUint(sink, childSize);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename SinkT, typename ChildIterT>
|
||||
void WriteNodeReverse(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
|
||||
void const * const valuesDataSize, uint32_t const valuesSize,
|
||||
ChildIterT const begChild, ChildIterT const endChild,
|
||||
bool isRoot = false)
|
||||
{
|
||||
typedef buffer_vector<uint8_t, 64> OutStorageType;
|
||||
OutStorageType out;
|
||||
PushBackByteSink<OutStorageType> outSink(out);
|
||||
WriteNode(outSink, baseChar, valueCount, valuesDataSize, valuesSize, begChild, endChild, isRoot);
|
||||
reverse(out.begin(), out.end());
|
||||
sink.Write(out.data(), out.size());
|
||||
}
|
||||
|
||||
struct ChildInfo
|
||||
{
|
||||
bool m_isLeaf;
|
||||
uint32_t m_size;
|
||||
char const * m_edge;
|
||||
buffer_vector<TrieChar, 8> m_edge;
|
||||
buffer_vector<uint8_t, 8> m_edgeValue;
|
||||
|
||||
ChildInfo() {}
|
||||
ChildInfo(bool isLeaf, uint32_t size, TrieChar c) : m_isLeaf(isLeaf), m_size(size), m_edge(1, c)
|
||||
{
|
||||
}
|
||||
|
||||
uint32_t Size() const { return m_size; }
|
||||
bool IsLeaf() const { return m_isLeaf; }
|
||||
strings::UniString GetEdge() const { return strings::MakeUniString(m_edge); }
|
||||
TrieChar const * GetEdge() const { return m_edge.data(); }
|
||||
uint32_t GetEdgeSize() const { return m_edge.size(); }
|
||||
void const * GetEdgeValue() const { return m_edgeValue.data(); }
|
||||
uint32_t GetEdgeValueSize() const { return m_edgeValue.size(); }
|
||||
};
|
||||
|
||||
struct NodeInfo
|
||||
{
|
||||
NodeInfo(uint64_t pos, strings::UniChar uniChar) : m_begPos(pos), m_char(uniChar) {}
|
||||
uint64_t m_begPos;
|
||||
strings::UniChar m_char;
|
||||
buffer_vector<ChildInfo, 4> m_children;
|
||||
TrieChar m_char;
|
||||
vector<ChildInfo> m_children;
|
||||
buffer_vector<uint8_t, 32> m_values;
|
||||
uint32_t m_valueCount;
|
||||
|
||||
NodeInfo() : m_valueCount(0) {}
|
||||
NodeInfo(uint64_t pos, TrieChar trieChar) : m_begPos(pos), m_char(trieChar), m_valueCount(0) {}
|
||||
};
|
||||
|
||||
void PopNodes(vector<builder::NodeInfo> & nodes, int nodesToPop)
|
||||
template <typename SinkT, class NodesT>
|
||||
void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop)
|
||||
{
|
||||
if (nodesToPop == 0)
|
||||
return;
|
||||
ASSERT_GREATER_OR_EQUAL(nodes.size(), nodesToPop, ());
|
||||
strings::UniString reverseEdge;
|
||||
while (nodesToPop > 0)
|
||||
ASSERT_GREATER(nodes.size(), nodesToPop, ());
|
||||
for (; nodesToPop > 0; --nodesToPop)
|
||||
{
|
||||
reverseEdge.push_back(nodes.back().m_char);reverseEdge.push_back(nodes.back().m_char);
|
||||
if (nodes.back().m_values.empty() && nodes.back().m_children.size() <= 1)
|
||||
NodeInfo & node = nodes.back();
|
||||
NodeInfo & prevNode = nodes[nodes.size() - 2];
|
||||
|
||||
if (node.m_valueCount == 0 && node.m_children.size() <= 1)
|
||||
{
|
||||
ASSERT_EQUAL(nodes.back().m_children.size(), 1, ());
|
||||
continue;
|
||||
ASSERT(node.m_values.empty(), ());
|
||||
ASSERT_EQUAL(node.m_children.size(), 1, ());
|
||||
ChildInfo & child = node.m_children[0];
|
||||
prevNode.m_children.push_back(ChildInfo(child.m_isLeaf, child.m_size, node.m_char));
|
||||
prevNode.m_children.back().m_edge.append(child.m_edge.begin(), child.m_edge.end());
|
||||
}
|
||||
else
|
||||
{
|
||||
WriteNodeReverse(sink, node.m_char, node.m_valueCount,
|
||||
node.m_values.data(), node.m_values.size(),
|
||||
node.m_children.rbegin(), node.m_children.rend());
|
||||
prevNode.m_children.push_back(ChildInfo(node.m_children.empty(),
|
||||
static_cast<uint32_t>(sink.Pos() - node.m_begPos),
|
||||
node.m_char));
|
||||
}
|
||||
|
||||
nodes.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace builder
|
||||
|
||||
/*
|
||||
template <typename SinkT, typename IterT>
|
||||
void Build(SinkT & sink, IterT const beg, IterT const end)
|
||||
{
|
||||
vector<builder::NodeInfo> nodes;
|
||||
strings::UniString prevKey;
|
||||
typedef buffer_vector<TrieChar, 32> TrieString;
|
||||
buffer_vector<builder::NodeInfo, 32> nodes(1, builder::NodeInfo(sink.Pos(), DEFAULT_CHAR));
|
||||
TrieString prevKey;
|
||||
for (IterT it = beg; it != end; ++it)
|
||||
{
|
||||
strings::UniString const key = it->Key();
|
||||
CHECK(!key.empty(), ());
|
||||
CHECK_LESS_OR_EQUAL(prevKey, key, ());
|
||||
TrieChar const * const pKeyData = it->GetKeyData();
|
||||
TrieString key(pKeyData, pKeyData + it->GetKeySize());
|
||||
CHECK(!(key < prevKey), (key, prevKey));
|
||||
int nCommon = 0;
|
||||
while (nCommon < min(key.size(),prevKey.size()) && prevKey[nCommon] == key[nCommon])
|
||||
while (nCommon < min(key.size(), prevKey.size()) && prevKey[nCommon] == key[nCommon])
|
||||
++nCommon;
|
||||
builder::PopNodes(nodes, nodes.size() - nCommon);
|
||||
builder::PopNodes(sink, nodes, nodes.size() - nCommon - 1); // Root is also a common node.
|
||||
uint64_t const pos = sink.Pos();
|
||||
for (int i = nCommon; i < key.size(); ++i)
|
||||
nodes.push_back(NodeInfo(pos, key[i]));
|
||||
uint8_t const * pValue = static_cast<uint8_t const *>(it->ValueData());
|
||||
nodes.back().m_values.insert(nodes.back().m_values.end(), pValue, pValue + it->ValueSize());
|
||||
nodes.push_back(builder::NodeInfo(pos, key[i]));
|
||||
uint8_t const * const pValue = static_cast<uint8_t const *>(it->GetValueData());
|
||||
nodes.back().m_values.insert(nodes.back().m_values.end(), pValue, pValue + it->GetValueSize());
|
||||
nodes.back().m_valueCount += 1;
|
||||
prevKey.swap(key);
|
||||
}
|
||||
builder::PopNodes(nodes.size());
|
||||
|
||||
// Pop all the nodes from the stack.
|
||||
builder::PopNodes(sink, nodes, nodes.size() - 1);
|
||||
|
||||
// Write the root.
|
||||
WriteNodeReverse(sink, DEFAULT_CHAR, nodes.back().m_valueCount,
|
||||
nodes.back().m_values.data(), nodes.back().m_values.size(),
|
||||
nodes.back().m_children.rbegin(), nodes.back().m_children.rend(),
|
||||
true);
|
||||
}
|
||||
*/
|
||||
|
||||
} // namespace trie
|
||||
|
|
204
coding/trie_reader.hpp
Normal file
204
coding/trie_reader.hpp
Normal file
|
@ -0,0 +1,204 @@
|
|||
#pragma once
|
||||
#include "trie.hpp"
|
||||
#include "../coding/reader.hpp"
|
||||
#include "../coding/varint.hpp"
|
||||
#include "../base/assert.hpp"
|
||||
#include "../base/bits.hpp"
|
||||
#include "../base/macros.hpp"
|
||||
|
||||
namespace trie
|
||||
{
|
||||
namespace reader
|
||||
{
|
||||
|
||||
template <class ReaderT, class ValueReaderT, typename EdgeValueT>
|
||||
class LeafIterator0 : public Iterator<typename ValueReaderT::ValueType, EdgeValueT>
|
||||
{
|
||||
public:
|
||||
typedef typename ValueReaderT::ValueType ValueType;
|
||||
typedef EdgeValueT EdgeValueType;
|
||||
|
||||
LeafIterator0(ReaderT const & reader, ValueReaderT const & valueReader)
|
||||
{
|
||||
uint32_t const size = static_cast<uint32_t>(reader.Size());
|
||||
ReaderSource<ReaderT> src(reader);
|
||||
while (src.Pos() < size)
|
||||
{
|
||||
this->m_value.push_back(ValueType());
|
||||
#ifdef DEBUG
|
||||
uint64_t const pos = src.Pos();
|
||||
#endif
|
||||
valueReader(src, this->m_value.back());
|
||||
ASSERT_NOT_EQUAL(pos, src.Pos(), ());
|
||||
}
|
||||
ASSERT_EQUAL(size, src.Pos(), ());
|
||||
}
|
||||
|
||||
Iterator<ValueType, EdgeValueType> * GoToEdge(uint32_t i) const
|
||||
{
|
||||
ASSERT(false, (i));
|
||||
UNUSED_VALUE(i);
|
||||
return NULL;
|
||||
}
|
||||
};
|
||||
|
||||
template <class ReaderT, class ValueReaderT, class EdgeValueReaderT>
|
||||
class IteratorImplBase :
|
||||
public Iterator<typename ValueReaderT::ValueType, typename EdgeValueReaderT::ValueType>
|
||||
{
|
||||
protected:
|
||||
enum { IS_READER_IN_MEMORY = 0 };
|
||||
};
|
||||
|
||||
template <class ValueReaderT, class EdgeValueReaderT>
|
||||
class IteratorImplBase<SharedMemReader, ValueReaderT, EdgeValueReaderT> :
|
||||
public Iterator<typename ValueReaderT::ValueType, typename EdgeValueReaderT::ValueType>
|
||||
{
|
||||
protected:
|
||||
enum { IS_READER_IN_MEMORY = 1 };
|
||||
};
|
||||
|
||||
|
||||
template <class ReaderT, class ValueReaderT, class EdgeValueReaderT>
|
||||
class Iterator0 : public IteratorImplBase<ReaderT, ValueReaderT, EdgeValueReaderT>
|
||||
{
|
||||
public:
|
||||
typedef typename ValueReaderT::ValueType ValueType;
|
||||
typedef typename EdgeValueReaderT::ValueType EdgeValueType;
|
||||
|
||||
Iterator0(ReaderT const & reader,
|
||||
ValueReaderT const & valueReader,
|
||||
EdgeValueReaderT const & edgeValueReader,
|
||||
TrieChar baseChar)
|
||||
: m_reader(reader), m_valueReader(valueReader), m_edgeValueReader(edgeValueReader)
|
||||
{
|
||||
ParseNode(baseChar);
|
||||
}
|
||||
|
||||
Iterator<ValueType, EdgeValueType> * GoToEdge(uint32_t i) const
|
||||
{
|
||||
ASSERT_LESS(i, this->m_edge.size(), ());
|
||||
uint32_t const offset = m_edgeInfo[i].m_offset;
|
||||
uint32_t const size = m_edgeInfo[i+1].m_offset - offset;
|
||||
|
||||
// TODO: Profile to check that MemReader optimization helps?
|
||||
/*
|
||||
if (!IteratorImplBase<ReaderT, ValueReaderT, EdgeValueReaderT>::IS_READER_IN_MEMORY &&
|
||||
size < 1024)
|
||||
{
|
||||
SharedMemReader memReader(size);
|
||||
m_reader.Read(offset, memReader.Data(), size);
|
||||
if (m_edgeInfo[i].m_isLeaf)
|
||||
return new LeafIterator0<SharedMemReader, ValueReaderT, EdgeValueType>(
|
||||
memReader, m_valueReader);
|
||||
else
|
||||
return new Iterator0<SharedMemReader, ValueReaderT, EdgeValueReaderT>(
|
||||
memReader, m_valueReader, m_edgeValueReader,
|
||||
this->m_edge[i].m_str.back());
|
||||
}
|
||||
else
|
||||
*/
|
||||
{
|
||||
if (m_edgeInfo[i].m_isLeaf)
|
||||
return new LeafIterator0<ReaderT, ValueReaderT, EdgeValueType>(
|
||||
m_reader.SubReader(offset, size), m_valueReader);
|
||||
else
|
||||
return new Iterator0<ReaderT, ValueReaderT, EdgeValueReaderT>(
|
||||
m_reader.SubReader(offset, size), m_valueReader, m_edgeValueReader,
|
||||
this->m_edge[i].m_str.back());
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void ParseNode(TrieChar baseChar)
|
||||
{
|
||||
ReaderSource<ReaderT> src(m_reader);
|
||||
|
||||
// [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)]
|
||||
uint8_t const header = ReadPrimitiveFromSource<uint8_t>(src);
|
||||
uint32_t valueCount = (header >> 6);
|
||||
uint32_t childCount = (header & 63);
|
||||
|
||||
// [vu valueCount]: if valueCount in header == 3
|
||||
if (valueCount == 3)
|
||||
valueCount = ReadVarUint<uint32_t>(src);
|
||||
|
||||
// [vu childCount]: if childCount in header == 63
|
||||
if (childCount == 63)
|
||||
childCount = ReadVarUint<uint32_t>(src);
|
||||
|
||||
// [value] ... [value]
|
||||
this->m_value.resize(valueCount);
|
||||
for (uint32_t i = 0; i < valueCount; ++i)
|
||||
m_valueReader(src, this->m_value[i]);
|
||||
|
||||
// [childInfo] ... [childInfo]
|
||||
this->m_edge.resize(childCount);
|
||||
m_edgeInfo.resize(childCount + 1);
|
||||
m_edgeInfo[0].m_offset = 0;
|
||||
for (uint32_t i = 0; i < childCount; ++i)
|
||||
{
|
||||
typename Iterator<ValueType, EdgeValueType>::Edge & e = this->m_edge[i];
|
||||
|
||||
// [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)]
|
||||
uint8_t const header = ReadPrimitiveFromSource<uint8_t>(src);
|
||||
m_edgeInfo[i].m_isLeaf = (header & 128);
|
||||
if (header & 64)
|
||||
e.m_str.push_back(baseChar + bits::ZigZagDecode(header & 63U));
|
||||
else
|
||||
{
|
||||
// [vu edgeLen-1]: if edgeLen-1 in header == 63
|
||||
uint32_t edgeLen = (header & 63);
|
||||
if (edgeLen == 63)
|
||||
edgeLen = ReadVarUint<uint32_t>(src);
|
||||
edgeLen += 1;
|
||||
|
||||
// [vi edgeChar0 - baseChar] [vi edgeChar1 - edgeChar0] ... [vi edgeCharN - edgeCharN-1]
|
||||
e.m_str.reserve(edgeLen);
|
||||
for (uint32_t i = 0; i < edgeLen; ++i)
|
||||
e.m_str.push_back(baseChar += ReadVarInt<int32_t>(src));
|
||||
}
|
||||
|
||||
// [edge value]
|
||||
m_edgeValueReader(src, e.m_value);
|
||||
|
||||
// [child size]: if the child is not the last one
|
||||
m_edgeInfo[i + 1].m_offset = m_edgeInfo[i].m_offset;
|
||||
if (i != childCount - 1)
|
||||
m_edgeInfo[i + 1].m_offset += ReadVarUint<uint32_t>(src);
|
||||
|
||||
baseChar = e.m_str[0];
|
||||
}
|
||||
|
||||
uint32_t const currentOffset = static_cast<uint32_t>(src.Pos());
|
||||
for (size_t i = 0; i < m_edgeInfo.size(); ++i)
|
||||
m_edgeInfo[i].m_offset += currentOffset;
|
||||
m_edgeInfo.back().m_offset = static_cast<uint32_t>(m_reader.Size());
|
||||
}
|
||||
|
||||
struct EdgeInfo
|
||||
{
|
||||
uint32_t m_offset;
|
||||
bool m_isLeaf;
|
||||
};
|
||||
|
||||
buffer_vector<EdgeInfo, 9> m_edgeInfo;
|
||||
|
||||
ReaderT m_reader;
|
||||
ValueReaderT m_valueReader;
|
||||
EdgeValueReaderT m_edgeValueReader;
|
||||
};
|
||||
|
||||
// Returns iterator to the root of the trie.
|
||||
template <class ReaderT, class ValueReaderT, class EdgeValueReaderT>
|
||||
Iterator<typename ValueReaderT::ValueType, typename EdgeValueReaderT::ValueType> *
|
||||
ReadTrie(ReaderT const & reader,
|
||||
ValueReaderT valueReader = ValueReaderT(),
|
||||
EdgeValueReaderT edgeValueReader = EdgeValueReaderT())
|
||||
{
|
||||
return new Iterator0<ReaderT, ValueReaderT, EdgeValueReaderT>(
|
||||
reader, valueReader, edgeValueReader, DEFAULT_CHAR);
|
||||
}
|
||||
|
||||
} // namespace trie::reader
|
||||
} // namespace trie
|
Loading…
Add table
Reference in a new issue