Switched the search index to use compressed bit vectors.

This commit is contained in:
Maxim Pimenov 2015-10-21 14:46:54 +03:00 committed by Sergey Yershov
parent 7e2abfba0b
commit 2ef3f59f92
14 changed files with 116 additions and 48 deletions

View file

@ -277,7 +277,7 @@ UNIT_TEST(TrieBuilder_Build)
trie::ReadTrie<MemReader, ValueList<uint32_t>>(memReader, serial::CodingParams());
vector<KeyValuePair> res;
KeyValuePairBackInserter f;
trie::ForEachRef(*root, f, vector<trie::TrieChar>());
trie::ForEachRefWithValues(*root, f, vector<trie::TrieChar>());
sort(f.m_v.begin(), f.m_v.end());
TEST_EQUAL(v, f.m_v, ());
}

View file

@ -312,6 +312,27 @@ unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitGroups(
return make_unique<SparseCBV>(setBits);
}
// static
// Builds a new bit vector from the contents of |cbv|.
//
// The source's storage strategy selects how its contents are read:
//   * Dense  - the bit groups are copied and handed to FromBitGroups;
//   * Sparse - the stored positions are passed to FromBitPositions.
// An empty pointer is returned if the strategy is neither of the above.
unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromCBV(CompressedBitVector const & cbv)
{
  using Strategy = CompressedBitVector::StorageStrategy;

  auto const strategy = cbv.GetStorageStrategy();

  if (strategy == Strategy::Dense)
  {
    // FromBitGroups takes its argument by rvalue reference, so make a
    // private copy of the groups and give that copy away.
    auto groupsCopy = static_cast<DenseCBV const &>(cbv).m_bitGroups;
    return CompressedBitVectorBuilder::FromBitGroups(move(groupsCopy));
  }

  if (strategy == Strategy::Sparse)
  {
    auto const & source = static_cast<SparseCBV const &>(cbv);
    return CompressedBitVectorBuilder::FromBitPositions(source.m_positions);
  }

  // Not a recognised storage strategy: nothing to copy.
  return unique_ptr<CompressedBitVector>();
}
string DebugPrint(CompressedBitVector::StorageStrategy strat)
{
switch (strat)

View file

@ -2,8 +2,11 @@
#include "coding/reader.hpp"
#include "coding/writer.hpp"
#include "base/assert.hpp"
#include "std/algorithm.hpp"
#include "std/unique_ptr.hpp"
#include "std/utility.hpp"
#include "std/vector.hpp"
namespace coding
@ -150,6 +153,9 @@ public:
// by concatenating the elements of bitGroups.
static unique_ptr<CompressedBitVector> FromBitGroups(vector<uint64_t> && bitGroups);
// Copies a CBV.
static unique_ptr<CompressedBitVector> FromCBV(CompressedBitVector const & cbv);
// Reads a bit vector from reader which must contain a valid
// bit vector representation (see CompressedBitVector::Serialize for the format).
template <typename TReader>

View file

@ -24,6 +24,8 @@ class Iterator
//dbg::ObjectTracker m_tracker;
public:
using TValue = typename TValueList::TValue;
struct Edge
{
typedef buffer_vector<TrieChar, 8> EdgeStrT;
@ -70,9 +72,9 @@ struct FixedSizeValueReader
template <typename TValueList, typename TF, typename TString>
void ForEachRef(Iterator<TValueList> const & iter, TF && f, TString const & s)
{
iter.m_valueList.ForEach([&f, &s](typename TValueList::TValue value)
iter.m_valueList.ForEach([&f, &s](typename TValueList::TValue const & /* value */)
{
f(s, value);
f(s);
});
for (size_t i = 0; i < iter.m_edge.size(); ++i)
{
@ -83,4 +85,19 @@ void ForEachRef(Iterator<TValueList> const & iter, TF && f, TString const & s)
}
}
// Walks the subtree rooted at |iter| and calls f(key, value) for every value
// found, where |key| is |s| extended with the labels of the edges followed
// from |iter| down to the node that holds the value.
//
// The values of a node are reported before any of its children are visited;
// children are visited in the order of iter.m_edge.
template <typename TValueList, typename TF, typename TString>
void ForEachRefWithValues(Iterator<TValueList> const & iter, TF && f, TString const & s)
{
  // Values stored directly at this node share the key accumulated so far.
  iter.m_valueList.ForEach([&](typename TValueList::TValue const & value) { f(s, value); });

  for (size_t edgeIdx = 0; edgeIdx < iter.m_edge.size(); ++edgeIdx)
  {
    // The child's key is this node's key followed by the edge label.
    auto const & label = iter.m_edge[edgeIdx].m_str;
    TString childKey(s);
    childKey.insert(childKey.end(), label.begin(), label.end());

    auto child = iter.GoToEdge(edgeIdx);
    ForEachRefWithValues(*child, f, childKey);
  }
}
} // namespace trie

View file

@ -17,13 +17,13 @@
// pre-order alphabetically reversed (parent, last child, first child).
// Leaf node format:
// [value] ... [value]
// [valueList]
// Internal node format:
// [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)]
// [vu valueCount]: if valueCount in header == 3
// [vu childCount]: if childCount in header == 63
// [value] ... [value]
// [valueList]
// [childInfo] ... [childInfo]
// Child info format:

View file

@ -94,7 +94,7 @@ private:
if (childCount == 63)
childCount = ReadVarUint<uint32_t>(src);
// [value] ... [value]
// [valueList]
m_valueList.Deserialize(src, valueCount);
// [childInfo] ... [childInfo]

View file

@ -164,7 +164,7 @@ namespace feature
SearchTokensCollector() : m_currentS(), m_currentCount(0) {}
void operator()(strings::UniString const & s, FeatureWithRankAndCenter const &)
void operator()(strings::UniString const & s)
{
if (m_currentS == s)
{
@ -200,7 +200,7 @@ namespace feature
feature::DataHeader header(container);
serial::CodingParams codingParams(trie::GetCodingParams(header.GetDefCodingParams()));
auto const pTrieRoot = trie::ReadTrie<ModelReaderPtr, ValueList<FeatureWithRankAndCenter>>(
auto const pTrieRoot = trie::ReadTrie<ModelReaderPtr, ValueList<FeatureIndexValue>>(
container.GetReader(SEARCH_INDEX_FILE_TAG), codingParams);
SearchTokensCollector f;

View file

@ -177,11 +177,15 @@ struct ValueBuilder<FeatureWithRankAndCenter>
// Specialization that produces FeatureIndexValue entries.
// The value is built from the feature's index alone; the feature itself and
// its types are accepted for interface compatibility but not used.
template <>
struct ValueBuilder<FeatureIndexValue>
{
// Stores a copy of the coding params in m_cp.
ValueBuilder(serial::CodingParams const & cp) : m_cp(cp) {}
// Writes |index| into |value|.
void MakeValue(FeatureType const & /* f */, feature::TypesHolder const & /* types */,
uint32_t index, FeatureIndexValue & value) const
{
value.m_value = index;
value.m_featureId = index;
}
// Not read by MakeValue in this specialization.
serial::CodingParams m_cp;
};
template <typename TStringsFile>
@ -258,22 +262,23 @@ public:
}
};
// Runs a FeatureInserter over every feature of |container|, passing it
// |stringsFile| and |categoriesHolder| together with a ValueBuilder that is
// constructed from the trie coding params derived from the data header.
template <typename TValue>
void AddFeatureNameIndexPairs(FilesContainerR const & container,
CategoriesHolder & categoriesHolder,
StringsFile<FeatureWithRankAndCenter> & stringsFile)
StringsFile<TValue> & stringsFile)
{
FeaturesVectorTest features(container);
feature::DataHeader const & header = features.GetHeader();
serial::CodingParams codingParams(trie::GetCodingParams(header.GetDefCodingParams()));
ValueBuilder<FeatureWithRankAndCenter> valueBuilder(codingParams);
ValueBuilder<TValue> valueBuilder(codingParams);
// Synonyms are loaded only when the header type is world; otherwise the
// inserter receives a null pointer.
unique_ptr<SynonymsHolder> synonyms;
if (header.GetType() == feature::DataHeader::world)
synonyms.reset(new SynonymsHolder(GetPlatform().WritablePathForFile(SYNONYMS_FILE)));
features.GetVector().ForEach(FeatureInserter<StringsFile<FeatureWithRankAndCenter>>(
features.GetVector().ForEach(FeatureInserter<StringsFile<TValue>>(
synonyms.get(), stringsFile, categoriesHolder, header.GetScaleRange(), valueBuilder));
}
} // namespace
@ -326,12 +331,14 @@ bool BuildSearchIndexFromDatFile(string const & datFile, bool forceRebuild)
void BuildSearchIndex(FilesContainerR & container, Writer & indexWriter,
string const & stringsFilePath)
{
using TValue = FeatureIndexValue;
Platform & platform = GetPlatform();
LOG(LINFO, ("Start building search index for", container.GetFileName()));
my::Timer timer;
StringsFile<FeatureWithRankAndCenter> stringsFile(stringsFilePath);
StringsFile<TValue> stringsFile(stringsFilePath);
CategoriesHolder categoriesHolder(platform.GetReader(SEARCH_CATEGORIES_FILE_NAME));
@ -341,9 +348,8 @@ void BuildSearchIndex(FilesContainerR & container, Writer & indexWriter,
LOG(LINFO, ("End sorting strings:", timer.ElapsedSeconds()));
stringsFile.OpenForRead();
trie::Build<Writer, typename StringsFile<FeatureWithRankAndCenter>::IteratorT,
ValueList<FeatureWithRankAndCenter>>(indexWriter, stringsFile.Begin(),
stringsFile.End());
trie::Build<Writer, typename StringsFile<TValue>::IteratorT, ValueList<TValue>>(
indexWriter, stringsFile.Begin(), stringsFile.End());
LOG(LINFO, ("End building search index, elapsed seconds:", timer.ElapsedSeconds()));
}

View file

@ -16,7 +16,7 @@ static const uint8_t kPointCodingBits = 20;
namespace trie
{
using DefaultIterator = trie::Iterator<ValueList<FeatureWithRankAndCenter>>;
using DefaultIterator = trie::Iterator<ValueList<FeatureIndexValue>>;
inline serial::CodingParams GetCodingParams(serial::CodingParams const & orig)
{

View file

@ -23,31 +23,42 @@
/// A wrapper around feature index.
// Holds a single 64-bit integer. Ordering and equality are defined by
// comparing that integer.
struct FeatureIndexValue
{
FeatureIndexValue() : m_value(0) {}
FeatureIndexValue() : m_featureId(0) {}
FeatureIndexValue(uint64_t featureId) : m_featureId(featureId) {}
// The serialization and deserialization is needed for StringsFile.
// Use ValueList for group serialization in CBVs.
template <typename TWriter>
void Write(TWriter & writer) const
void Serialize(TWriter & writer) const
{
WriteToSink(writer, m_value);
WriteToSink(writer, m_featureId);
}
template <typename TReader>
void Read(TReader & reader)
void Deserialize(TReader & reader)
{
m_value = ReadPrimitiveFromSource<uint64_t>(reader);
// Wrap the reader in a ReaderSource and delegate to DeserializeFromSource.
ReaderSource<TReader> src(reader);
DeserializeFromSource(src);
}
// data()/size(): pointer to the stored integer and its size in bytes.
inline void const * data() const { return &m_value; }
// Reads the stored integer as a uint64_t primitive from |src|.
template <typename TSource>
void DeserializeFromSource(TSource & src)
{
m_featureId = ReadPrimitiveFromSource<uint64_t>(src);
}
inline size_t size() const { return sizeof(m_value); }
inline void const * data() const { return &m_featureId; }
bool operator<(FeatureIndexValue const & value) const { return m_value < value.m_value; }
inline size_t size() const { return sizeof(m_featureId); }
bool operator==(FeatureIndexValue const & value) const { return m_value == value.m_value; }
bool operator<(FeatureIndexValue const & o) const { return m_featureId < o.m_featureId; }
// Exchanges the stored integers of the two objects.
void swap(FeatureIndexValue & value) { ::swap(m_value, value.m_value); }
bool operator==(FeatureIndexValue const & o) const { return m_featureId == o.m_featureId; }
uint64_t m_value;
void Swap(FeatureIndexValue & o) { ::swap(m_featureId, o.m_featureId); }
uint64_t m_featureId;
};
struct FeatureWithRankAndCenter
@ -117,12 +128,20 @@ public:
ValueList() : m_cbv(unique_ptr<coding::CompressedBitVector>()) {}
// Copy constructor: copies the coding params and, when |o| holds a bit
// vector, builds a separate bit vector from it via FromCBV. If |o| holds no
// bit vector, this list ends up holding none as well.
ValueList(ValueList<FeatureIndexValue> const & o) : m_codingParams(o.m_codingParams)
{
  m_cbv = o.m_cbv ? coding::CompressedBitVectorBuilder::FromCBV(*o.m_cbv)
                  : unique_ptr<coding::CompressedBitVector>();
}
// Replaces m_cbv with a bit vector built from |values|: the integer stored in
// each value is collected into a vector that is passed to FromBitPositions.
// NOTE(review): any ordering or uniqueness requirement on |values| is imposed
// by FromBitPositions and is not visible here - confirm against its contract.
void Init(vector<FeatureIndexValue> const & values)
{
vector<uint64_t> offsets(values.size());
for (size_t i = 0; i < offsets.size(); ++i)
offsets[i] = values[i].m_value;
m_cbv = coding::CompressedBitVectorBuilder::FromBitPositions(offsets);
vector<uint64_t> ids(values.size());
for (size_t i = 0; i < ids.size(); ++i)
ids[i] = values[i].m_featureId;
m_cbv = coding::CompressedBitVectorBuilder::FromBitPositions(ids);
}
// This method returns number of values in the current instance of
@ -161,7 +180,10 @@ public:
// Enumerates m_cbv through CompressedBitVectorEnumerator::ForEach and reports
// each enumerated bit position to |f|. The wrapping lambda converts the bit
// position to a TValue before calling |f|.
// NOTE(review): m_cbv is dereferenced without a null check - confirm callers
// never invoke this on a list that holds no bit vector.
template <typename TF>
void ForEach(TF && f) const
{
coding::CompressedBitVectorEnumerator::ForEach(*m_cbv, forward<TF>(f));
coding::CompressedBitVectorEnumerator::ForEach(*m_cbv, [&](uint64_t const bitPosition)
{
f(TValue(bitPosition));
});
}
void SetCodingParams(serial::CodingParams const & codingParams) { m_codingParams = codingParams; }

View file

@ -113,7 +113,6 @@ void FullMatchInTrie(trie::DefaultIterator const & trieRoot, strings::UniChar co
ASSERT_EQUAL ( symbolsMatched, s.size(), () );
LOG(LINFO, ("foreach`ing", it->m_valueList.Size()));
it->m_valueList.ForEach(f);
}
@ -152,11 +151,9 @@ void PrefixMatchInTrie(trie::DefaultIterator const & trieRoot, strings::UniChar
}
}
template <class TFilter>
template <typename TFilter, typename TValue>
class OffsetIntersecter
{
using TValue = FeatureWithRankAndCenter;
struct HashFn
{
size_t operator()(TValue const & v) const { return v.m_featureId; }
@ -232,7 +229,7 @@ struct TrieRootPrefix
}
};
template <class TFilter>
template <typename TFilter, typename TValue>
class TrieValuesHolder
{
public:
@ -246,7 +243,7 @@ public:
m_index = index;
}
void operator()(Query::TTrieValue const & v)
void operator()(TValue const & v)
{
if (m_filter(v.m_featureId))
m_holder[m_index].push_back(v);
@ -261,7 +258,7 @@ public:
}
private:
vector<vector<Query::TTrieValue>> m_holder;
vector<vector<TValue>> m_holder;
size_t m_index;
TFilter const & m_filter;
};
@ -380,10 +377,11 @@ template <typename TFilter, typename ToDo>
void MatchFeaturesInTrie(SearchQueryParams const & params, trie::DefaultIterator const & trieRoot,
TFilter const & filter, ToDo && toDo)
{
using TValue = trie::DefaultIterator::TValue;
TrieValuesHolder<TFilter> categoriesHolder(filter);
bool const categoriesMatched = MatchCategoriesInTrie(params, trieRoot, categoriesHolder);
impl::OffsetIntersecter<TFilter> intersecter(filter);
impl::OffsetIntersecter<TFilter, TValue> intersecter(filter);
for (size_t i = 0; i < params.m_tokens.size(); ++i)
{
ForEachLangPrefix(params, trieRoot, [&](TrieRootPrefix & langRoot, int8_t lang)

View file

@ -67,9 +67,8 @@ unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(MwmSet::MwmHandl
ASSERT(value, ());
serial::CodingParams codingParams(trie::GetCodingParams(value->GetHeader().GetDefCodingParams()));
ModelReaderPtr searchReader = value->m_cont.GetReader(SEARCH_INDEX_FILE_TAG);
auto const trieRoot =
trie::ReadTrie<SubReaderWrapper<Reader>, ValueList<FeatureWithRankAndCenter>>(
SubReaderWrapper<Reader>(searchReader.GetPtr()), codingParams);
auto const trieRoot = trie::ReadTrie<SubReaderWrapper<Reader>, ValueList<FeatureIndexValue>>(
SubReaderWrapper<Reader>(searchReader.GetPtr()), codingParams);
auto emptyFilter = [](uint32_t /* featureId */)
{

View file

@ -1611,9 +1611,8 @@ void Query::SearchLocality(MwmValue const * pMwm, Locality & res1, Region & res2
ModelReaderPtr searchReader = pMwm->m_cont.GetReader(SEARCH_INDEX_FILE_TAG);
auto const trieRoot =
trie::ReadTrie<SubReaderWrapper<Reader>, ValueList<FeatureWithRankAndCenter>>(
SubReaderWrapper<Reader>(searchReader.GetPtr()), cp);
auto const trieRoot = trie::ReadTrie<SubReaderWrapper<Reader>, ValueList<FeatureIndexValue>>(
SubReaderWrapper<Reader>(searchReader.GetPtr()), cp);
ForEachLangPrefix(params, *trieRoot, [&](TrieRootPrefix & langRoot, int8_t lang)
{

View file

@ -110,7 +110,7 @@ public:
/// @name This stuff is public for implementation classes in search_query.cpp
/// Do not use it in client code.
//@{
using TTrieValue = FeatureWithRankAndCenter;
using TTrieValue = FeatureIndexValue;
void InitParams(bool localitySearch, SearchQueryParams & params);