diff --git a/coding/coding.pro b/coding/coding.pro index 12374ec59d..c3ceeaa944 100644 --- a/coding/coding.pro +++ b/coding/coding.pro @@ -86,4 +86,5 @@ HEADERS += \ zip_reader.hpp \ trie.hpp \ trie_builder.hpp \ + trie_reader.hpp \ mmap_reader.hpp \ diff --git a/coding/coding_tests/trie_test.cpp b/coding/coding_tests/trie_test.cpp index 81db627517..13cc4b8d93 100644 --- a/coding/coding_tests/trie_test.cpp +++ b/coding/coding_tests/trie_test.cpp @@ -1,8 +1,13 @@ #include "../../testing/testing.hpp" #include "../trie.hpp" #include "../trie_builder.hpp" +#include "../trie_reader.hpp" #include "../byte_stream.hpp" #include "../write_to_sink.hpp" +#include "../../base/logging.hpp" +#include "../../std/algorithm.hpp" +#include "../../std/scoped_ptr.hpp" +#include "../../std/string.hpp" #include "../../std/vector.hpp" #include @@ -13,10 +18,68 @@ struct ChildNodeInfo { bool m_isLeaf; uint32_t m_size; - char const * m_edge; + vector m_edge; + string m_edgeValue; + ChildNodeInfo(bool isLeaf, uint32_t size, char const * edge, char const * edgeValue) + : m_isLeaf(isLeaf), m_size(size), m_edgeValue(edgeValue) + { + while (*edge) + m_edge.push_back(*edge++); + } + uint32_t Size() const { return m_size; } bool IsLeaf() const { return m_isLeaf; } - strings::UniString GetEdge() const { return strings::MakeUniString(m_edge); } + uint32_t const * GetEdge() const { return m_edge.data(); } + uint32_t GetEdgeSize() const { return m_edge.size(); } + void const * GetEdgeValue() const { return m_edgeValue.data(); } + uint32_t GetEdgeValueSize() const { return m_edgeValue.size(); } +}; + +struct KeyValuePair +{ + vector m_key; + uint32_t m_value; + + template + KeyValuePair(StringT const & key, int value) : m_key(key.begin(), key.end()), m_value(value) {} + + uint32_t GetKeySize() const { return m_key.size(); } + trie::TrieChar const * GetKeyData() const { return m_key.data(); } + uint32_t GetValueSize() const { return 4; } + void const * GetValueData() const { return &m_value; } + + bool 
operator == (KeyValuePair const & p) const + { + return m_key == p.m_key && m_value == p.m_value; + } + + bool operator < (KeyValuePair const & p) const + { + if (m_key != p.m_key) + return m_key < p.m_key; + return m_value < p.m_value; + } +}; + +string debug_print(KeyValuePair const & p) +{ + string keyS = ::debug_print(p.m_key); + ostringstream out; + out << "KVP(" << keyS << ", " << p.m_value << ")"; + return out.str(); +} + +struct KeyValuePairBackInserter +{ + vector m_v; + template + void operator() (StringT const & s, + trie::reader::FixedSizeValueReader<4>::ValueType const & rawValue) + { + uint32_t value; + memcpy(&value, &rawValue, 4); + m_v.push_back(KeyValuePair(s, value)); + } }; } // unnamed namespace @@ -29,37 +92,98 @@ UNIT_TEST(TrieBuilder_WriteNode_Smoke) PushBackByteSink > sink(serial); ChildNodeInfo children[] = { - {true, 1, "1A"}, - {false, 2, "B"}, - {false, 3, "zz"}, - {true, 4, "abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij"} + ChildNodeInfo(true, 1, "1A", "i1"), + ChildNodeInfo(false, 2, "B", "ii2"), + ChildNodeInfo(false, 3, "zz", ""), + ChildNodeInfo(true, 4, + "abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij", "i4"), + ChildNodeInfo(true, 5, "a", "5z") }; trie::builder::WriteNode(sink, 0, 3, "123", 3, &children[0], &children[0] + ARRAY_SIZE(children)); unsigned char const expected [] = { - BOOST_BINARY(11000100), // Header: [0b11] [0b000100] + BOOST_BINARY(11000101), // Header: [0b11] [0b000101] 3, // Number of values '1', '2', '3', // Values - 1, // Child 1: size - BOOST_BINARY(10000010), // Child 1: header: [+leaf] [-supershort] [2 symbols] + BOOST_BINARY(10000001), // Child 1: header: [+leaf] [-supershort] [2 symbols] ZENC('1'), ZENC('A' - '1'), // Child 1: edge - 2, // Child 2: size + 'i', '1', // Child 1: intermediate data + 1, // Child 1: size 64 | ZENC('B' - '1'), // Child 2: header: [-leaf] [+supershort] - 3, // Child 3: size - BOOST_BINARY(00000010), // Child 3: header: [-leaf] 
[-supershort] [2 symbols] + 'i', 'i', '2', // Child 2: intermediate data + 2, // Child 2: size + BOOST_BINARY(00000001), // Child 3: header: [-leaf] [-supershort] [2 symbols] ZENC('z' - 'B'), 0, // Child 3: edge - 4, // Child 4: size + 3, // Child 3: size BOOST_BINARY(10111111), // Child 4: header: [+leaf] [-supershort] [>= 63 symbols] - 70, // Child 4: edge size + 69, // Child 4: edgeSize - 1 ZENC('a' - 'z'), 2,2,2,2,2,2,2,2,2, // Child 4: edge ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge - ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2 // Child 4: edge + ZENC('a' - 'j'), 2,2,2,2,2,2,2,2,2, // Child 4: edge + 'i', '4', // Child 4: intermediate data + 4, // Child 4: size + BOOST_BINARY(11000000) | ZENC(0), // Child 5: header: [+leaf] [+supershort] + '5', 'z' // Child 5: intermediate data }; TEST_EQUAL(serial, vector(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ()); } + +UNIT_TEST(TrieBuilder_Build) +{ + int const base = 3; + int const maxLen = 3; + + vector possibleStrings(1, string()); + for (int len = 1; len <= maxLen; ++len) + { + for (int i = 0; i < pow(base, len); ++i) + { + string s(len, 'A'); + int t = i; + for (int l = len - 1; l >= 0; --l, t /= base) + s[l] += (t % base); + possibleStrings.push_back(s); + } + } + sort(possibleStrings.begin(), possibleStrings.end()); + // LOG(LINFO, (possibleStrings)); + + for (int i0 = -1; i0 < static_cast(possibleStrings.size()); ++i0) + for (int i1 = i0; i1 < static_cast(possibleStrings.size()); ++i1) + for (int i2 = i1; i2 < static_cast(possibleStrings.size()); ++i2) + { + vector v; + if (i0 >= 0) v.push_back(KeyValuePair(possibleStrings[i0], i0)); + if (i1 >= 0) v.push_back(KeyValuePair(possibleStrings[i1], i1)); + if (i2 >= 0) v.push_back(KeyValuePair(possibleStrings[i2], i2)); + vector vs; + for (size_t i 
= 0; i < v.size(); ++i) + vs.push_back(string(v[i].m_key.begin(), v[i].m_key.end())); + + vector serial; + PushBackByteSink > sink(serial); + trie::Build(sink, v.begin(), v.end()); + reverse(serial.begin(), serial.end()); + // LOG(LINFO, (serial.size(), vs)); + + MemReader memReader = MemReader(serial.data(), serial.size()); + typedef trie::Iterator< + trie::reader::FixedSizeValueReader<4>::ValueType, + trie::reader::EmptyValueReader::ValueType + > IteratorType; + scoped_ptr root(trie::reader::ReadTrie(memReader, + trie::reader::FixedSizeValueReader<4>(), + trie::reader::EmptyValueReader())); + vector res; + KeyValuePairBackInserter f; + trie::ForEachRef(*root, f, vector()); + sort(f.m_v.begin(), f.m_v.end()); + TEST_EQUAL(v, f.m_v, ()); + } +} diff --git a/coding/trie.hpp b/coding/trie.hpp index 610cff93ca..ee1940a659 100644 --- a/coding/trie.hpp +++ b/coding/trie.hpp @@ -1,2 +1,79 @@ #pragma once +#include "../base/assert.hpp" #include "../base/base.hpp" +#include "../base/buffer_vector.hpp" +#include "../std/scoped_ptr.hpp" + +namespace trie +{ + +typedef uint32_t TrieChar; + +// 95 is a good value for the default baseChar, since both small and capital latin letters +// are less than +/- 32 from it and thus can fit into supershort edge. 
+static uint32_t const DEFAULT_CHAR = 95; + +template +class Iterator +{ +public: + struct Edge + { + buffer_vector m_str; + EdgeValueT m_value; + }; + + buffer_vector m_edge; + buffer_vector m_value; + + virtual ~Iterator() {} + + virtual Iterator * GoToEdge(uint32_t i) const = 0; +}; + +namespace reader +{ + +struct EmptyValueReader +{ + typedef unsigned char ValueType; + + template + void operator() (SourceT &, ValueType & value) const + { + value = 0; + } +}; + +template +struct FixedSizeValueReader +{ + struct ValueType + { + unsigned char m_data[N]; + }; + + template + void operator() (SourceT & src, ValueType & value) const + { + src.Read(&value.m_data[0], N); + } +}; + +} // namespace trie::reader + +template +void ForEachRef(Iterator const & iter, F & f, StringT const & s) +{ + for (size_t i = 0; i < iter.m_value.size(); ++i) + f(s, iter.m_value[i]); + for (size_t i = 0; i < iter.m_edge.size(); ++i) + { + StringT s1(s); + s1.insert(s1.end(), iter.m_edge[i].m_str.begin(), iter.m_edge[i].m_str.end()); + scoped_ptr > pIter1(iter.GoToEdge(i)); + ForEachRef(*pIter1, f, s1); + } +} + +} // namespace trie diff --git a/coding/trie_builder.hpp b/coding/trie_builder.hpp index 633eb34d9d..7761b695bc 100644 --- a/coding/trie_builder.hpp +++ b/coding/trie_builder.hpp @@ -1,22 +1,55 @@ #pragma once +#include "../coding/byte_stream.hpp" #include "../coding/varint.hpp" -#include "../base/string_utils.hpp" +#include "../base/buffer_vector.hpp" #include "../std/algorithm.hpp" +// Trie format: +// [1: header] +// [node] ... [node] + +// Nodes are written in post-order (first child, last child, parent). Contents of nodes are written +// reversed. The resulting file should be reversed before use! Then its contents will appear in +// pre-order alphabetically reversed (parent, last child, first child). + +// Leaf node format: +// [value] ...
[value] + +// Internal node format: +// [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)] +// [vu valueCount]: if valueCount in header == 3 +// [vu childCount]: if childCount in header == 63 +// [value] ... [value] +// [childInfo] ... [childInfo] + +// Child info format: +// Every char of the edge is encoded as varint difference from the previous char. First char is +// encoded as varint difference from the base char, which is the last char of the current prefix. +// +// [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)] +// [vu edgeLen-1]: if edgeLen-1 in header == 63 +// [vi edgeChar0 - baseChar] +// [vi edgeChar1 - edgeChar0] +// ... +// [vi edgeCharN - edgeCharN-1] +// [edge value] +// [child size]: if the child is not the first one (last one when reading) + namespace trie { namespace builder { template -void WriteNode(SinkT & sink, strings::UniChar baseChar, - uint32_t const valueCount, void const * valuesData, uint32_t const valuesSize, - ChildIterT const begChild, ChildIterT const endChild) +void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount, + void const * const valuesDataSize, uint32_t const valuesSize, + ChildIterT const begChild, ChildIterT const endChild, + bool isRoot = false) { - if (begChild == endChild) + if (begChild == endChild && !isRoot) { // Leaf node. - sink.Write(valuesData, valuesSize); + sink.Write(valuesDataSize, valuesSize); return; } uint32_t const childCount = endChild - begChild; @@ -26,15 +59,16 @@ void WriteNode(SinkT & sink, strings::UniChar baseChar, WriteVarUint(sink, valueCount); if (childCount >= 63) WriteVarUint(sink, childCount); - sink.Write(valuesData, valuesSize); - for (ChildIterT it = begChild; it != endChild; ++it) + sink.Write(valuesDataSize, valuesSize); + for (ChildIterT it = begChild; it != endChild; /*++it*/) { - WriteVarUint(sink, it->Size()); uint8_t header = (it->IsLeaf() ? 
128 : 0); - strings::UniString const edge = it->GetEdge(); - CHECK(!edge.empty(), ()); + TrieChar const * const edge = it->GetEdge(); + uint32_t const edgeSize = it->GetEdgeSize(); + CHECK_NOT_EQUAL(edgeSize, 0, ()); + CHECK_LESS(edgeSize, 100000, ()); uint32_t const diff0 = bits::ZigZagEncode(int32_t(edge[0] - baseChar)); - if (edge.size() == 1 && (diff0 & ~63U) == 0) + if (edgeSize == 1 && (diff0 & ~63U) == 0) { header |= 64; header |= diff0; @@ -42,90 +76,143 @@ void WriteNode(SinkT & sink, strings::UniChar baseChar, } else { - if (edge.size() < 63) + if (edgeSize - 1 < 63) { - header |= edge.size(); + header |= edgeSize - 1; WriteToSink(sink, header); } else { header |= 63; WriteToSink(sink, header); - WriteVarUint(sink, static_cast(edge.size())); + WriteVarUint(sink, edgeSize - 1); } - for (size_t i = 0; i < edge.size(); ++i) + for (size_t i = 0; i < edgeSize; ++i) { WriteVarInt(sink, int32_t(edge[i] - baseChar)); baseChar = edge[i]; } } baseChar = edge[0]; + sink.Write(it->GetEdgeValue(), it->GetEdgeValueSize()); + + uint32_t const childSize = it->Size(); + if (++it != endChild) + WriteVarUint(sink, childSize); } } +template +void WriteNodeReverse(SinkT & sink, TrieChar baseChar, uint32_t const valueCount, + void const * const valuesDataSize, uint32_t const valuesSize, + ChildIterT const begChild, ChildIterT const endChild, + bool isRoot = false) +{ + typedef buffer_vector OutStorageType; + OutStorageType out; + PushBackByteSink outSink(out); + WriteNode(outSink, baseChar, valueCount, valuesDataSize, valuesSize, begChild, endChild, isRoot); + reverse(out.begin(), out.end()); + sink.Write(out.data(), out.size()); +} + struct ChildInfo { bool m_isLeaf; uint32_t m_size; - char const * m_edge; + buffer_vector m_edge; + buffer_vector m_edgeValue; + + ChildInfo() {} + ChildInfo(bool isLeaf, uint32_t size, TrieChar c) : m_isLeaf(isLeaf), m_size(size), m_edge(1, c) + { + } + uint32_t Size() const { return m_size; } bool IsLeaf() const { return m_isLeaf; } - 
strings::UniString GetEdge() const { return strings::MakeUniString(m_edge); } + TrieChar const * GetEdge() const { return m_edge.data(); } + uint32_t GetEdgeSize() const { return m_edge.size(); } + void const * GetEdgeValue() const { return m_edgeValue.data(); } + uint32_t GetEdgeValueSize() const { return m_edgeValue.size(); } }; struct NodeInfo { - NodeInfo(uint64_t pos, strings::UniChar uniChar) : m_begPos(pos), m_char(uniChar) {} uint64_t m_begPos; - strings::UniChar m_char; - buffer_vector m_children; + TrieChar m_char; + vector m_children; buffer_vector m_values; + uint32_t m_valueCount; + + NodeInfo() : m_valueCount(0) {} + NodeInfo(uint64_t pos, TrieChar trieChar) : m_begPos(pos), m_char(trieChar), m_valueCount(0) {} }; -void PopNodes(vector & nodes, int nodesToPop) +template +void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop) { - if (nodesToPop == 0) - return; - ASSERT_GREATER_OR_EQUAL(nodes.size(), nodesToPop, ()); - strings::UniString reverseEdge; - while (nodesToPop > 0) + ASSERT_GREATER(nodes.size(), nodesToPop, ()); + for (; nodesToPop > 0; --nodesToPop) { - reverseEdge.push_back(nodes.back().m_char);reverseEdge.push_back(nodes.back().m_char); - if (nodes.back().m_values.empty() && nodes.back().m_children.size() <= 1) + NodeInfo & node = nodes.back(); + NodeInfo & prevNode = nodes[nodes.size() - 2]; + + if (node.m_valueCount == 0 && node.m_children.size() <= 1) { - ASSERT_EQUAL(nodes.back().m_children.size(), 1, ()); - continue; + ASSERT(node.m_values.empty(), ()); + ASSERT_EQUAL(node.m_children.size(), 1, ()); + ChildInfo & child = node.m_children[0]; + prevNode.m_children.push_back(ChildInfo(child.m_isLeaf, child.m_size, node.m_char)); + prevNode.m_children.back().m_edge.append(child.m_edge.begin(), child.m_edge.end()); + } + else + { + WriteNodeReverse(sink, node.m_char, node.m_valueCount, + node.m_values.data(), node.m_values.size(), + node.m_children.rbegin(), node.m_children.rend()); + 
prevNode.m_children.push_back(ChildInfo(node.m_children.empty(), + static_cast(sink.Pos() - node.m_begPos), + node.m_char)); } + nodes.pop_back(); } } } // namespace builder -/* template void Build(SinkT & sink, IterT const beg, IterT const end) { - vector nodes; - strings::UniString prevKey; + typedef buffer_vector TrieString; + buffer_vector nodes(1, builder::NodeInfo(sink.Pos(), DEFAULT_CHAR)); + TrieString prevKey; for (IterT it = beg; it != end; ++it) { - strings::UniString const key = it->Key(); - CHECK(!key.empty(), ()); - CHECK_LESS_OR_EQUAL(prevKey, key, ()); + TrieChar const * const pKeyData = it->GetKeyData(); + TrieString key(pKeyData, pKeyData + it->GetKeySize()); + CHECK(!(key < prevKey), (key, prevKey)); int nCommon = 0; - while (nCommon < min(key.size(),prevKey.size()) && prevKey[nCommon] == key[nCommon]) + while (nCommon < min(key.size(), prevKey.size()) && prevKey[nCommon] == key[nCommon]) ++nCommon; - builder::PopNodes(nodes, nodes.size() - nCommon); + builder::PopNodes(sink, nodes, nodes.size() - nCommon - 1); // Root is also a common node. uint64_t const pos = sink.Pos(); for (int i = nCommon; i < key.size(); ++i) - nodes.push_back(NodeInfo(pos, key[i])); - uint8_t const * pValue = static_cast(it->ValueData()); - nodes.back().m_values.insert(nodes.back().m_values.end(), pValue, pValue + it->ValueSize()); + nodes.push_back(builder::NodeInfo(pos, key[i])); + uint8_t const * const pValue = static_cast(it->GetValueData()); + nodes.back().m_values.insert(nodes.back().m_values.end(), pValue, pValue + it->GetValueSize()); + nodes.back().m_valueCount += 1; prevKey.swap(key); } - builder::PopNodes(nodes.size()); + + // Pop all the nodes from the stack. + builder::PopNodes(sink, nodes, nodes.size() - 1); + + // Write the root. 
+ WriteNodeReverse(sink, DEFAULT_CHAR, nodes.back().m_valueCount, + nodes.back().m_values.data(), nodes.back().m_values.size(), + nodes.back().m_children.rbegin(), nodes.back().m_children.rend(), + true); } -*/ } // namespace trie diff --git a/coding/trie_reader.hpp b/coding/trie_reader.hpp new file mode 100644 index 0000000000..e34cf18e97 --- /dev/null +++ b/coding/trie_reader.hpp @@ -0,0 +1,204 @@ +#pragma once +#include "trie.hpp" +#include "../coding/reader.hpp" +#include "../coding/varint.hpp" +#include "../base/assert.hpp" +#include "../base/bits.hpp" +#include "../base/macros.hpp" + +namespace trie +{ +namespace reader +{ + +template +class LeafIterator0 : public Iterator +{ +public: + typedef typename ValueReaderT::ValueType ValueType; + typedef EdgeValueT EdgeValueType; + + LeafIterator0(ReaderT const & reader, ValueReaderT const & valueReader) + { + uint32_t const size = static_cast(reader.Size()); + ReaderSource src(reader); + while (src.Pos() < size) + { + this->m_value.push_back(ValueType()); +#ifdef DEBUG + uint64_t const pos = src.Pos(); +#endif + valueReader(src, this->m_value.back()); + ASSERT_NOT_EQUAL(pos, src.Pos(), ()); + } + ASSERT_EQUAL(size, src.Pos(), ()); + } + + Iterator * GoToEdge(uint32_t i) const + { + ASSERT(false, (i)); + UNUSED_VALUE(i); + return NULL; + } +}; + +template +class IteratorImplBase : + public Iterator +{ +protected: + enum { IS_READER_IN_MEMORY = 0 }; +}; + +template +class IteratorImplBase : + public Iterator +{ +protected: + enum { IS_READER_IN_MEMORY = 1 }; +}; + + +template +class Iterator0 : public IteratorImplBase +{ +public: + typedef typename ValueReaderT::ValueType ValueType; + typedef typename EdgeValueReaderT::ValueType EdgeValueType; + + Iterator0(ReaderT const & reader, + ValueReaderT const & valueReader, + EdgeValueReaderT const & edgeValueReader, + TrieChar baseChar) + : m_reader(reader), m_valueReader(valueReader), m_edgeValueReader(edgeValueReader) + { + ParseNode(baseChar); + } + + Iterator * 
GoToEdge(uint32_t i) const + { + ASSERT_LESS(i, this->m_edge.size(), ()); + uint32_t const offset = m_edgeInfo[i].m_offset; + uint32_t const size = m_edgeInfo[i+1].m_offset - offset; + + // TODO: Profile to check that MemReader optimization helps? + /* + if (!IteratorImplBase::IS_READER_IN_MEMORY && + size < 1024) + { + SharedMemReader memReader(size); + m_reader.Read(offset, memReader.Data(), size); + if (m_edgeInfo[i].m_isLeaf) + return new LeafIterator0( + memReader, m_valueReader); + else + return new Iterator0( + memReader, m_valueReader, m_edgeValueReader, + this->m_edge[i].m_str.back()); + } + else + */ + { + if (m_edgeInfo[i].m_isLeaf) + return new LeafIterator0( + m_reader.SubReader(offset, size), m_valueReader); + else + return new Iterator0( + m_reader.SubReader(offset, size), m_valueReader, m_edgeValueReader, + this->m_edge[i].m_str.back()); + } + } + +private: + void ParseNode(TrieChar baseChar) + { + ReaderSource src(m_reader); + + // [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)] + uint8_t const header = ReadPrimitiveFromSource(src); + uint32_t valueCount = (header >> 6); + uint32_t childCount = (header & 63); + + // [vu valueCount]: if valueCount in header == 3 + if (valueCount == 3) + valueCount = ReadVarUint(src); + + // [vu childCount]: if childCount in header == 63 + if (childCount == 63) + childCount = ReadVarUint(src); + + // [value] ... [value] + this->m_value.resize(valueCount); + for (uint32_t i = 0; i < valueCount; ++i) + m_valueReader(src, this->m_value[i]); + + // [childInfo] ... 
[childInfo] + this->m_edge.resize(childCount); + m_edgeInfo.resize(childCount + 1); + m_edgeInfo[0].m_offset = 0; + for (uint32_t i = 0; i < childCount; ++i) + { + typename Iterator::Edge & e = this->m_edge[i]; + + // [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)] + uint8_t const header = ReadPrimitiveFromSource(src); + m_edgeInfo[i].m_isLeaf = (header & 128); + if (header & 64) + e.m_str.push_back(baseChar + bits::ZigZagDecode(header & 63U)); + else + { + // [vu edgeLen-1]: if edgeLen-1 in header == 63 + uint32_t edgeLen = (header & 63); + if (edgeLen == 63) + edgeLen = ReadVarUint(src); + edgeLen += 1; + + // [vi edgeChar0 - baseChar] [vi edgeChar1 - edgeChar0] ... [vi edgeCharN - edgeCharN-1] + e.m_str.reserve(edgeLen); + for (uint32_t i = 0; i < edgeLen; ++i) + e.m_str.push_back(baseChar += ReadVarInt(src)); + } + + // [edge value] + m_edgeValueReader(src, e.m_value); + + // [child size]: if the child is not the last one + m_edgeInfo[i + 1].m_offset = m_edgeInfo[i].m_offset; + if (i != childCount - 1) + m_edgeInfo[i + 1].m_offset += ReadVarUint(src); + + baseChar = e.m_str[0]; + } + + uint32_t const currentOffset = static_cast(src.Pos()); + for (size_t i = 0; i < m_edgeInfo.size(); ++i) + m_edgeInfo[i].m_offset += currentOffset; + m_edgeInfo.back().m_offset = static_cast(m_reader.Size()); + } + + struct EdgeInfo + { + uint32_t m_offset; + bool m_isLeaf; + }; + + buffer_vector m_edgeInfo; + + ReaderT m_reader; + ValueReaderT m_valueReader; + EdgeValueReaderT m_edgeValueReader; +}; + +// Returns iterator to the root of the trie. +template +Iterator * +ReadTrie(ReaderT const & reader, + ValueReaderT valueReader = ValueReaderT(), + EdgeValueReaderT edgeValueReader = EdgeValueReaderT()) +{ + return new Iterator0( + reader, valueReader, edgeValueReader, DEFAULT_CHAR); +} + +} // namespace trie::reader +} // namespace trie