Review fixes.

Maxim Pimenov 2015-11-06 18:45:54 +03:00 committed by Sergey Yershov
parent a818bb4b37
commit ef97e8dbaf
13 changed files with 123 additions and 126 deletions

View file

@ -229,7 +229,7 @@ UNIT_TEST(CompressedBitVector_SerializationDense)
cbv->Serialize(writer);
}
MemReader reader(buf.data(), buf.size());
auto cbv = coding::CompressedBitVectorBuilder::Deserialize(reader);
auto cbv = coding::CompressedBitVectorBuilder::DeserializeFromReader(reader);
TEST(cbv.get(), ());
TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Dense, cbv->GetStorageStrategy(), ());
TEST_EQUAL(setBits.size(), cbv->PopCount(), ());
@ -254,7 +254,7 @@ UNIT_TEST(CompressedBitVector_SerializationSparse)
cbv->Serialize(writer);
}
MemReader reader(buf.data(), buf.size());
auto cbv = coding::CompressedBitVectorBuilder::Deserialize(reader);
auto cbv = coding::CompressedBitVectorBuilder::DeserializeFromReader(reader);
TEST(cbv.get(), ());
TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv->GetStorageStrategy(), ());
TEST_EQUAL(setBits.size(), cbv->PopCount(), ());

View file

@ -231,6 +231,14 @@ void DenseCBV::Serialize(Writer & writer) const
rw::WriteVectorOfPOD(writer, m_bitGroups);
}
unique_ptr<CompressedBitVector> DenseCBV::Clone() const
{
DenseCBV * cbv = new DenseCBV();
cbv->m_popCount = m_popCount;
cbv->m_bitGroups = m_bitGroups;
return unique_ptr<CompressedBitVector>(cbv);
}
SparseCBV::SparseCBV(vector<uint64_t> const & setBits) : m_positions(setBits)
{
ASSERT(is_sorted(m_positions.begin(), m_positions.end()), ());
@ -267,6 +275,13 @@ void SparseCBV::Serialize(Writer & writer) const
rw::WriteVectorOfPOD(writer, m_positions);
}
unique_ptr<CompressedBitVector> SparseCBV::Clone() const
{
SparseCBV * cbv = new SparseCBV();
cbv->m_positions = m_positions;
return unique_ptr<CompressedBitVector>(cbv);
}
// static
unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitPositions(
vector<uint64_t> const & setBits)
@ -290,7 +305,7 @@ unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitGroups(
while (!bitGroups.empty() && bitGroups.back() == 0)
bitGroups.pop_back();
if (bitGroups.empty())
return make_unique<SparseCBV>(bitGroups);
return make_unique<SparseCBV>(move(bitGroups));
uint64_t const maxBit = kBlockSize * (bitGroups.size() - 1) + bits::CeilLog(bitGroups.back());
uint64_t popCount = 0;
@ -312,28 +327,6 @@ unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitGroups(
return make_unique<SparseCBV>(setBits);
}
// static
unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromCBV(CompressedBitVector const & cbv)
{
auto strat = cbv.GetStorageStrategy();
switch (strat)
{
case CompressedBitVector::StorageStrategy::Dense:
{
DenseCBV const & dense = static_cast<DenseCBV const &>(cbv);
auto bitGroups = dense.m_bitGroups;
return CompressedBitVectorBuilder::FromBitGroups(move(bitGroups));
}
case CompressedBitVector::StorageStrategy::Sparse:
{
SparseCBV const & sparse = static_cast<SparseCBV const &>(cbv);
return CompressedBitVectorBuilder::FromBitPositions(sparse.m_positions);
}
}
CHECK(false, ("Unknown strategy when building a compressed bit vector."));
return unique_ptr<CompressedBitVector>();
}
string DebugPrint(CompressedBitVector::StorageStrategy strat)
{
switch (strat)
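
With FromCBV removed above, copying a CompressedBitVector now goes through the new virtual Clone() instead of the builder. A minimal sketch of an updated call site; the CopyCBV helper name is illustrative and not part of this commit, only the coding:: API is.

#include "coding/compressed_bit_vector.hpp"

#include "std/unique_ptr.hpp"

// Illustrative helper: copy a compressed bit vector without inspecting its
// storage strategy; virtual dispatch selects DenseCBV::Clone or SparseCBV::Clone.
unique_ptr<coding::CompressedBitVector> CopyCBV(coding::CompressedBitVector const & cbv)
{
  // Before this commit: coding::CompressedBitVectorBuilder::FromCBV(cbv).
  return cbv.Clone();
}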

View file

@ -63,6 +63,9 @@ public:
// todo(@pimenov). Think about rewriting Serialize and Deserialize to use the
// code in old_compressed_bit_vector.{c,h}pp.
virtual void Serialize(Writer & writer) const = 0;
// Copies a bit vector and returns a pointer to the copy.
virtual unique_ptr<CompressedBitVector> Clone() const = 0;
};
string DebugPrint(CompressedBitVector::StorageStrategy strat);
@ -105,6 +108,7 @@ public:
bool GetBit(uint64_t pos) const override;
StorageStrategy GetStorageStrategy() const override;
void Serialize(Writer & writer) const override;
unique_ptr<CompressedBitVector> Clone() const override;
private:
vector<uint64_t> m_bitGroups;
@ -117,6 +121,8 @@ public:
friend class CompressedBitVectorBuilder;
using TIterator = vector<uint64_t>::const_iterator;
SparseCBV() = default;
explicit SparseCBV(vector<uint64_t> const & setBits);
explicit SparseCBV(vector<uint64_t> && setBits);
@ -136,6 +142,7 @@ public:
bool GetBit(uint64_t pos) const override;
StorageStrategy GetStorageStrategy() const override;
void Serialize(Writer & writer) const override;
unique_ptr<CompressedBitVector> Clone() const override;
inline TIterator Begin() const { return m_positions.cbegin(); }
inline TIterator End() const { return m_positions.cend(); }
@ -155,15 +162,13 @@ public:
// Chooses a strategy to store the bit vector with bits from a bitmap obtained
// by concatenating the elements of bitGroups.
static unique_ptr<CompressedBitVector> FromBitGroups(vector<uint64_t> & bitGroups);
static unique_ptr<CompressedBitVector> FromBitGroups(vector<uint64_t> && bitGroups);
// Copies a CBV.
static unique_ptr<CompressedBitVector> FromCBV(CompressedBitVector const & cbv);
// Reads a bit vector from reader which must contain a valid
// bit vector representation (see CompressedBitVector::Serialize for the format).
template <typename TReader>
static unique_ptr<CompressedBitVector> Deserialize(TReader & reader)
static unique_ptr<CompressedBitVector> DeserializeFromReader(TReader & reader)
{
ReaderSource<TReader> src(reader);
return DeserializeFromSource(src);
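
The rename from Deserialize to DeserializeFromReader distinguishes this entry point from DeserializeFromSource. A minimal sketch of a caller after the rename, mirroring the tests above; RoundTrip is an illustrative name, not project code.

#include "coding/compressed_bit_vector.hpp"
#include "coding/reader.hpp"
#include "coding/writer.hpp"

#include "std/unique_ptr.hpp"
#include "std/vector.hpp"

// Illustrative round trip: serialize into a buffer, then read it back.
// DeserializeFromReader wraps the reader in a ReaderSource and forwards to
// DeserializeFromSource, as declared above.
unique_ptr<coding::CompressedBitVector> RoundTrip(coding::CompressedBitVector const & cbv)
{
  vector<uint8_t> buf;
  {
    MemWriter<vector<uint8_t>> writer(buf);
    cbv.Serialize(writer);
  }
  MemReader reader(buf.data(), buf.size());
  return coding::CompressedBitVectorBuilder::DeserializeFromReader(reader);
}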

View file

@ -1,9 +1,9 @@
#include "generator/dumper.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "indexer/classificator.hpp"
#include "indexer/feature_processor.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "indexer/search_trie.hpp"
#include "coding/multilang_utf8_string.hpp"
@ -12,9 +12,9 @@
#include "std/algorithm.hpp"
#include "std/bind.hpp"
#include "std/functional.hpp"
#include "std/iostream.hpp"
#include "std/map.hpp"
#include "std/queue.hpp"
#include "std/vector.hpp"
namespace
@ -22,40 +22,30 @@ namespace
template <typename TValue>
struct SearchTokensCollector
{
priority_queue<pair<uint32_t, strings::UniString>> tokens;
strings::UniString m_currentS;
uint32_t m_currentCount;
SearchTokensCollector() : m_currentS(), m_currentCount(0) {}
void operator()(strings::UniString const & s, TValue const & /* value */)
{
if (m_currentS == s)
{
++m_currentCount;
}
else
if (m_currentS != s)
{
if (m_currentCount > 0)
{
tokens.push(make_pair(m_currentCount, m_currentS));
if (tokens.size() > 100)
tokens.pop();
}
m_tokens.emplace_back(m_currentCount, m_currentS);
m_currentS = s;
m_currentCount = 0;
}
++m_currentCount;
}
void Finish()
{
if (m_currentCount > 0)
{
tokens.push(make_pair(m_currentCount, m_currentS));
if (tokens.size() > 100)
tokens.pop();
}
m_tokens.emplace_back(m_currentCount, m_currentS);
sort(m_tokens.begin(), m_tokens.end(), greater<pair<uint32_t, strings::UniString>>());
}
vector<pair<uint32_t, strings::UniString>> m_tokens;
strings::UniString m_currentS;
uint32_t m_currentCount;
};
} // namespace
@ -198,7 +188,7 @@ namespace feature
}
}
void DumpSearchTokens(string const & fPath)
void DumpSearchTokens(string const & fPath, size_t maxTokensToShow)
{
using TValue = FeatureIndexValue;
@ -213,11 +203,11 @@ namespace feature
trie::ForEachRef(*trieRoot, f, strings::UniString());
f.Finish();
while (!f.tokens.empty())
auto freqTokenPairs = f.m_tokens;
for (size_t i = 0; i < min(maxTokensToShow, freqTokenPairs.size()); ++i)
{
strings::UniString const & s = f.tokens.top().second;
cout << f.tokens.top().first << " '" << strings::ToUtf8(s) << "'" << endl;
f.tokens.pop();
auto const & s = f.m_tokens[i].second;
cout << f.m_tokens[i].first << " " << strings::ToUtf8(s) << endl;
}
}
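
The collector now stores every (frequency, token) pair in a vector, sorts once in Finish(), and the dumper prints only the first maxTokensToShow entries. A self-contained sketch of that selection, with std::string standing in for strings::UniString and made-up data:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main()
{
  // Illustrative data; in the dumper the pairs come from SearchTokensCollector::m_tokens.
  std::vector<std::pair<uint32_t, std::string>> tokens = {
      {3, "street"}, {7, "cafe"}, {5, "park"}};
  // Sort by frequency, highest first (ties broken by the token itself).
  std::sort(tokens.begin(), tokens.end(),
            std::greater<std::pair<uint32_t, std::string>>());
  size_t const maxTokensToShow = 2;
  for (size_t i = 0; i < std::min(maxTokensToShow, tokens.size()); ++i)
    std::cout << tokens[i].first << " " << tokens[i].second << std::endl;
  return 0;
}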

View file

@ -6,5 +6,9 @@ namespace feature
{
void DumpTypes(string const & fPath);
void DumpPrefixes(string const & fPath);
void DumpSearchTokens(string const & fPath);
// Writes top maxTokensToShow tokens sorted by their
// frequency, i.e. by the number of features in
// an mwm that contain the token in their name.
void DumpSearchTokens(string const & fPath, size_t maxTokensToShow);
}

View file

@ -245,7 +245,7 @@ int main(int argc, char ** argv)
feature::DumpPrefixes(datFile);
if (FLAGS_dump_search_tokens)
feature::DumpSearchTokens(datFile);
feature::DumpSearchTokens(datFile, 100 /* maxTokensToShow */);
if (FLAGS_unpack_mwm)
UnpackMwm(datFile);

View file

@ -47,7 +47,8 @@ struct KeyValuePair
template <class TString>
KeyValuePair(TString const & key, int value)
: m_key(key.begin(), key.end()), m_value(value)
{}
{
}
uint32_t GetKeySize() const { return m_key.size(); }
trie::TrieChar const * GetKeyData() const { return m_key.data(); }
@ -57,12 +58,12 @@ struct KeyValuePair
inline size_t value_size() const { return sizeof(m_value); }
bool operator == (KeyValuePair const & p) const
bool operator==(KeyValuePair const & p) const
{
return (m_key == p.m_key && m_value == p.m_value);
}
bool operator < (KeyValuePair const & p) const
bool operator<(KeyValuePair const & p) const
{
return ((m_key != p.m_key) ? m_key < p.m_key : m_value < p.m_value);
}
@ -94,8 +95,7 @@ struct KeyValuePairBackInserter
};
// The SingleValueSerializer and ValueList classes are similar to
// those in indexer/string_file_values.hpp but that file
// is not included to avoid coding_tests's dependency from indexer.
// those in indexer/string_file_values.hpp.
template <typename TPrimitive>
class SingleValueSerializer
{
@ -179,30 +179,29 @@ UNIT_TEST(TrieBuilder_WriteNode_Smoke)
valueList.Init({'1', '2', '3'});
trie::WriteNode(sink, SingleValueSerializer<char>(), 0, valueList, &children[0],
&children[0] + ARRAY_SIZE(children));
uint8_t const expected [] =
{
BOOST_BINARY(11000101), // Header: [0b11] [0b000101]
3, // Number of values
'1', '2', '3', // Values
BOOST_BINARY(10000001), // Child 1: header: [+leaf] [-supershort] [2 symbols]
MKUC(ZENC(MKSC('1'))), MKUC(ZENC(MKSC('A') - MKSC('1'))), // Child 1: edge
1, // Child 1: size
MKUC(64 | ZENC(MKSC('B') - MKSC('1'))), // Child 2: header: [-leaf] [+supershort]
2, // Child 2: size
BOOST_BINARY(00000001), // Child 3: header: [-leaf] [-supershort] [2 symbols]
MKUC(ZENC(MKSC('z') - MKSC('B'))), 0, // Child 3: edge
3, // Child 3: size
BOOST_BINARY(10111111), // Child 4: header: [+leaf] [-supershort] [>= 63 symbols]
69, // Child 4: edgeSize - 1
MKUC(ZENC(MKSC('a') - MKSC('z'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
4, // Child 4: size
MKUC(BOOST_BINARY(11000000) | ZENC(0)), // Child 5: header: [+leaf] [+supershort]
uint8_t const expected[] = {
BOOST_BINARY(11000101), // Header: [0b11] [0b000101]
3, // Number of values
'1', '2', '3', // Values
BOOST_BINARY(10000001), // Child 1: header: [+leaf] [-supershort] [2 symbols]
MKUC(ZENC(MKSC('1'))), MKUC(ZENC(MKSC('A') - MKSC('1'))), // Child 1: edge
1, // Child 1: size
MKUC(64 | ZENC(MKSC('B') - MKSC('1'))), // Child 2: header: [-leaf] [+supershort]
2, // Child 2: size
BOOST_BINARY(00000001), // Child 3: header: [-leaf] [-supershort] [2 symbols]
MKUC(ZENC(MKSC('z') - MKSC('B'))), 0, // Child 3: edge
3, // Child 3: size
BOOST_BINARY(10111111), // Child 4: header: [+leaf] [-supershort] [>= 63 symbols]
69, // Child 4: edgeSize - 1
MKUC(ZENC(MKSC('a') - MKSC('z'))), 2, 2, 2, 2, 2, 2, 2, 2, 2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2, 2, 2, 2, 2, 2, 2, 2, 2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2, 2, 2, 2, 2, 2, 2, 2, 2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2, 2, 2, 2, 2, 2, 2, 2, 2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2, 2, 2, 2, 2, 2, 2, 2, 2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2, 2, 2, 2, 2, 2, 2, 2, 2, // Child 4: edge
MKUC(ZENC(MKSC('a') - MKSC('j'))), 2, 2, 2, 2, 2, 2, 2, 2, 2, // Child 4: edge
4, // Child 4: size
MKUC(BOOST_BINARY(11000000) | ZENC(0)), // Child 5: header: [+leaf] [+supershort]
};
TEST_EQUAL(buf, vector<uint8_t>(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ());
@ -216,7 +215,7 @@ UNIT_TEST(TrieBuilder_Build)
vector<string> possibleStrings(1, string());
for (int len = 1; len <= maxLen; ++len)
{
for (int i = 0, p = static_cast<int>(pow((double) base, len)); i < p; ++i)
for (int i = 0, p = static_cast<int>(pow((double)base, len)); i < p; ++i)
{
string s(len, 'A');
int t = i;
@ -232,28 +231,31 @@ UNIT_TEST(TrieBuilder_Build)
for (int i0 = -1; i0 < count; ++i0)
for (int i1 = i0; i1 < count; ++i1)
for (int i2 = i1; i2 < count; ++i2)
{
vector<KeyValuePair> v;
if (i0 >= 0) v.push_back(KeyValuePair(possibleStrings[i0], i0));
if (i1 >= 0) v.push_back(KeyValuePair(possibleStrings[i1], i1 + 10));
if (i2 >= 0) v.push_back(KeyValuePair(possibleStrings[i2], i2 + 100));
vector<string> vs;
for (size_t i = 0; i < v.size(); ++i)
vs.push_back(string(v[i].m_key.begin(), v[i].m_key.end()));
{
vector<KeyValuePair> v;
if (i0 >= 0)
v.push_back(KeyValuePair(possibleStrings[i0], i0));
if (i1 >= 0)
v.push_back(KeyValuePair(possibleStrings[i1], i1 + 10));
if (i2 >= 0)
v.push_back(KeyValuePair(possibleStrings[i2], i2 + 100));
vector<string> vs;
for (size_t i = 0; i < v.size(); ++i)
vs.push_back(string(v[i].m_key.begin(), v[i].m_key.end()));
vector<uint8_t> buf;
PushBackByteSink<vector<uint8_t>> sink(buf);
SingleValueSerializer<uint32_t> serializer;
trie::Build<PushBackByteSink<vector<uint8_t>>, typename vector<KeyValuePair>::iterator,
ValueList<uint32_t>>(sink, serializer, v.begin(), v.end());
reverse(buf.begin(), buf.end());
vector<uint8_t> buf;
PushBackByteSink<vector<uint8_t>> sink(buf);
SingleValueSerializer<uint32_t> serializer;
trie::Build<PushBackByteSink<vector<uint8_t>>, typename vector<KeyValuePair>::iterator,
ValueList<uint32_t>>(sink, serializer, v.begin(), v.end());
reverse(buf.begin(), buf.end());
MemReader memReader = MemReader(&buf[0], buf.size());
auto const root = trie::ReadTrie<MemReader, ValueList<uint32_t>>(memReader, serializer);
vector<KeyValuePair> res;
KeyValuePairBackInserter f;
trie::ForEachRef(*root, f, vector<trie::TrieChar>());
sort(f.m_v.begin(), f.m_v.end());
TEST_EQUAL(v, f.m_v, ());
}
MemReader memReader = MemReader(&buf[0], buf.size());
auto const root = trie::ReadTrie<MemReader, ValueList<uint32_t>>(memReader, serializer);
vector<KeyValuePair> res;
KeyValuePairBackInserter f;
trie::ForEachRef(*root, f, vector<trie::TrieChar>());
sort(f.m_v.begin(), f.m_v.end());
TEST_EQUAL(v, f.m_v, ());
}
}

View file

@ -14,7 +14,7 @@
#include "indexer/trie_builder.hpp"
#include "indexer/types_skipper.hpp"
#include "search/search_common.hpp" // for MAX_TOKENS constant
#include "search/search_common.hpp"
#include "defines.hpp"

View file

@ -31,7 +31,7 @@ struct FeatureIndexValue
bool operator==(FeatureIndexValue const & o) const { return m_featureId == o.m_featureId; }
void Swap(FeatureIndexValue & o) { ::swap(m_featureId, o.m_featureId); }
void Swap(FeatureIndexValue & o) { swap(m_featureId, o.m_featureId); }
uint64_t m_featureId;
};
@ -150,7 +150,7 @@ public:
ValueList(ValueList<FeatureIndexValue> const & o)
{
if (o.m_cbv)
m_cbv = coding::CompressedBitVectorBuilder::FromCBV(*o.m_cbv);
m_cbv = o.m_cbv->Clone();
}
void Init(vector<FeatureIndexValue> const & values)
@ -158,7 +158,7 @@ public:
vector<uint64_t> ids(values.size());
for (size_t i = 0; i < ids.size(); ++i)
ids[i] = values[i].m_featureId;
m_cbv = coding::CompressedBitVectorBuilder::FromBitPositions(ids);
m_cbv = coding::CompressedBitVectorBuilder::FromBitPositions(move(ids));
}
// This method returns number of values in the current instance of
@ -172,7 +172,7 @@ public:
return (m_cbv && m_cbv->PopCount() != 0) ? 1 : 0;
}
bool IsEmpty() const { return !m_cbv || m_cbv->PopCount() == 0; }
bool IsEmpty() const { return Size() == 0; }
template <typename TSink>
void Serialize(TSink & sink, SingleValueSerializer<TValue> const & /* serializer */) const

View file

@ -18,7 +18,8 @@
// -- Serialized Huffman encoding.
// -- Topology of the trie built on Huffman-encoded input strings [2 bits per node, level-order representation].
// -- List of pairs (node id, offset). One pair per final node (i.e. a node where a string ends).
// The lists of node ids and offsets are both non-decreasing and are delta-encoded with varuints.
// The lists of node ids and offsets are both non-decreasing and are delta-encoded with
// varuints.
// -- Values of final nodes in level-order. The values for final node |id| start at offset |offset|
// if there is a pair (id, offset) in the list above.
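
The format comment above relies on delta-encoding non-decreasing lists with varuints. A self-contained sketch of that idea, using only the standard library; this is not the project's own varint code, just an illustration of the encoding.

#include <cstdint>
#include <vector>

// Write x as a base-128 varuint: 7 data bits per byte, high bit marks continuation.
void WriteVarUint(std::vector<uint8_t> & out, uint64_t x)
{
  while (x >= 0x80)
  {
    out.push_back(static_cast<uint8_t>(x & 0x7F) | 0x80);  // 7 data bits + continuation bit
    x >>= 7;
  }
  out.push_back(static_cast<uint8_t>(x));
}

// A non-decreasing list is stored as its first element followed by the
// consecutive differences, each written as a varuint.
std::vector<uint8_t> DeltaEncode(std::vector<uint64_t> const & sorted)
{
  std::vector<uint8_t> out;
  uint64_t prev = 0;
  for (uint64_t v : sorted)
  {
    WriteVarUint(out, v - prev);  // non-decreasing input keeps every delta small
    prev = v;
  }
  return out;
}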

View file

@ -8,7 +8,6 @@
namespace trie
{
typedef uint32_t TrieChar;
// 95 is a good value for the default baseChar, since both small and capital latin letters
@ -19,7 +18,7 @@ static uint32_t const DEFAULT_CHAR = 0;
template <typename TValueList>
class Iterator
{
//dbg::ObjectTracker m_tracker;
// dbg::ObjectTracker m_tracker;
public:
using TValue = typename TValueList::TValue;
@ -46,7 +45,7 @@ struct EmptyValueReader
EmptyValueReader() = default;
template <typename SourceT>
void operator() (SourceT &, ValueType & value) const
void operator()(SourceT &, ValueType & value) const
{
value = 0;
}
@ -61,7 +60,7 @@ struct FixedSizeValueReader
};
template <typename SourceT>
void operator() (SourceT & src, ValueType & value) const
void operator()(SourceT & src, ValueType & value) const
{
src.Read(&value.m_data[0], N);
}

View file

@ -49,15 +49,18 @@ void WriteNode(TSink & sink, TSerializer const & serializer, TrieChar baseChar,
uint32_t const valueCount = valueList.Size();
if (begChild == endChild && !isRoot)
{
// Leaf node.
// Leaf node.
#ifdef DEBUG
auto posBefore = sink.Pos();
#endif
valueList.Serialize(sink, serializer);
#ifdef DEBUG
if (valueCount == 0)
ASSERT_EQUAL(sink.Pos(), posBefore, ("Empty valueList must produce an empty serialization."));
#endif
return;
}
uint32_t const childCount = endChild - begChild;
@ -215,7 +218,8 @@ void AppendValue(TNodeInfo & node, TValue const & value)
// sorted order and we can avoid sorting them before doing
// further operations such as ValueList construction.
using namespace std::rel_ops;
ASSERT(node.m_temporaryValueList.empty() || node.m_temporaryValueList.back() <= value, ());
ASSERT(node.m_temporaryValueList.empty() || node.m_temporaryValueList.back() <= value,
(node.m_temporaryValueList.size()));
if (!node.m_temporaryValueList.empty() && node.m_temporaryValueList.back() == value)
return;
if (node.m_mayAppend)
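
The assertion above documents why AppendValue can skip sorting: callers pass values in non-decreasing order, so duplicates can only be consecutive. A self-contained sketch of that append-with-dedup pattern, with uint32_t standing in for the templated value type:

#include <cassert>
#include <cstdint>
#include <vector>

// Values arrive in non-decreasing order, so the list stays sorted without an
// explicit sort and a duplicate can only equal the last appended element.
void AppendSortedUnique(std::vector<uint32_t> & values, uint32_t value)
{
  assert(values.empty() || values.back() <= value);
  if (!values.empty() && values.back() == value)
    return;  // skip consecutive duplicate
  values.push_back(value);
}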

View file

@ -31,10 +31,9 @@ public:
return make_unique<LeafIterator0<TValueList, TSerializer>>(*this);
}
unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const override
unique_ptr<Iterator<TValueList>> GoToEdge(size_t /* i */) const override
{
ASSERT(false, (i));
UNUSED_VALUE(i);
ASSERT(false, ());
return nullptr;
}
};
@ -63,7 +62,7 @@ public:
{
ASSERT_LESS(i, this->m_edge.size(), ());
uint32_t const offset = m_edgeInfo[i].m_offset;
uint32_t const size = m_edgeInfo[i+1].m_offset - offset;
uint32_t const size = m_edgeInfo[i + 1].m_offset - offset;
if (m_edgeInfo[i].m_isLeaf)
{