[search] Refactored trie-walking routines.

This commit is contained in:
Yuri Gorshenin 2015-07-13 16:28:06 +03:00 committed by Alex Zolotarev
parent 5b1bfb8701
commit d7af959ce6
3 changed files with 221 additions and 266 deletions

View file

@ -1,5 +1,6 @@
#pragma once
#include "search/search_common.hpp"
#include "search/search_query_params.hpp"
#include "indexer/search_trie.hpp"
@ -221,16 +222,15 @@ public:
m_set->clear();
}
// Calls toDo for every value stored in m_prevSet.
// Does nothing when m_prevSet is null.
template <class ToDo> void ForEachResult(ToDo & toDo) const
template <class ToDo>
void ForEachResult(ToDo && toDo) const
{
if (m_prevSet)
{
for (typename SetType::const_iterator i = m_prevSet->begin(); i != m_prevSet->end(); ++i)
toDo(*i);
}
// Nothing to report when there is no previous set.
if (!m_prevSet)
return;
for (auto const & value : *m_prevSet)
toDo(value);
}
};
} // namespace search::impl
struct TrieRootPrefix
@ -255,91 +255,175 @@ struct TrieRootPrefix
}
};
/// Return features set for each token.
template <class HolderT>
void GetFeaturesInTrie(vector<vector<strings::UniString> > const & tokens,
vector<strings::UniString> const & prefixTokens,
TrieRootPrefix const & trieRoot,
HolderT & holder)
// Collects trie values into indexed slots. A value is stored only when the
// filter accepts its m_featureId.
template <class TFilter>
class TrieValuesHolder
{
// Match tokens.
size_t const count = tokens.size();
holder.Resize(count + 1);
public:
// Keeps a reference to |filter| (m_filter is a reference member), so the
// filter has to outlive the holder.
TrieValuesHolder(TFilter const & filter) : m_filter(filter) {}
for (size_t i = 0; i < count; ++i)
// Sets the number of slots to |count|.
void Resize(size_t count) { m_holder.resize(count); }
// Selects the slot that subsequent operator() calls append to.
// |index| must be less than the current number of slots (asserted).
void StartNew(size_t index)
{
ASSERT_LESS(index, m_holder.size(), ());
m_index = index;
}
// Appends |v| to the currently selected slot if the filter accepts
// v.m_featureId; otherwise the value is dropped.
void operator()(Query::TrieValueT const & v)
{
if (m_filter(v.m_featureId))
m_holder[m_index].push_back(v);
}
// Calls toDo for each value stored in slot |index|, in insertion order.
// No bounds check is performed on |index| here.
template <class ToDo>
void GetValues(size_t index, ToDo && toDo) const
{
for (auto const & value : m_holder[index])
toDo(value);
}
private:
// One vector of values per slot.
vector<vector<Query::TrieValueT> > m_holder;
// Index of the slot selected by StartNew().
// NOTE(review): not initialized by the constructor, so StartNew() has to be
// called before the first operator() call.
size_t m_index;
TFilter const & m_filter;
};
// Passes every synonym from |syms| to impl::FullMatchInTrie, so that toDo is
// called for each feature corresponding to at least one synonym.
// *NOTE* toDo may be called several times for the same feature.
template <typename ToDo>
void MatchTokenInTrie(SearchQueryParams::TSynonymsVector const & syms,
                      TrieRootPrefix const & trieRoot, ToDo && toDo)
{
  auto const last = syms.end();
  for (auto it = syms.begin(); it != last; ++it)
  {
    auto const & sym = *it;
    ASSERT(!sym.empty(), ());
    // toDo is passed as an lvalue on purpose: it is reused on every iteration.
    impl::FullMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, sym, toDo);
  }
}
// Passes every synonym from |syms| to impl::PrefixMatchInTrie, so that toDo is
// called for each feature whose tokens contain at least one synonym as a
// prefix.
// *NOTE* toDo may be called several times for the same feature.
template <typename ToDo>
void MatchTokenPrefixInTrie(SearchQueryParams::TSynonymsVector const & syms,
                            TrieRootPrefix const & trieRoot, ToDo && toDo)
{
  auto const last = syms.end();
  for (auto it = syms.begin(); it != last; ++it)
  {
    auto const & sym = *it;
    ASSERT(!sym.empty(), ());
    // toDo is passed as an lvalue on purpose: it is reused on every iteration.
    impl::PrefixMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, sym, toDo);
  }
}
// Fills holder with features whose names correspond to tokens list up to synonyms.
// Slot i of the holder receives the matches for tokens[i].
// *NOTE* the same feature may be put in the same holder's slot several times.
template <typename THolder>
void MatchTokensInTrie(vector<SearchQueryParams::TSynonymsVector> const & tokens,
TrieRootPrefix const & trieRoot, THolder && holder)
{
// One holder slot per token.
holder.Resize(tokens.size());
for (size_t i = 0; i < tokens.size(); ++i)
{
// Everything matched below goes to slot i.
holder.StartNew(i);
for (size_t j = 0; j < tokens[i].size(); ++j)
{
ASSERT ( !tokens[i][j].empty(), () );
impl::FullMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize,
tokens[i][j], holder);
}
}
// Match prefix.
holder.StartNew(count);
for (size_t i = 0; i < prefixTokens.size(); ++i)
{
ASSERT ( !prefixTokens[i].empty(), () );
impl::FullMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize,
prefixTokens[i], holder);
MatchTokenInTrie(tokens[i], trieRoot, holder);
}
}
/// Do set intersection of features for each token.
template <class ToDo, class FilterT, class HolderT>
void MatchFeaturesInTrie(vector<vector<strings::UniString> > const & tokens,
vector<strings::UniString> const & prefixTokens,
TrieRootPrefix const & trieRoot,
FilterT const & filter,
HolderT const & addHolder,
ToDo & toDo)
// Fills holder with features whose names correspond to tokens list up to synonyms,
// also, last holder's slot will be filled with features corresponding to prefixTokens.
// *NOTE* the same feature may be put in the same holder's slot several times.
template <typename THolder>
void MatchTokensAndPrefixInTrie(vector<SearchQueryParams::TSynonymsVector> const & tokens,
SearchQueryParams::TSynonymsVector const & prefixTokens,
TrieRootPrefix const & trieRoot, THolder && holder)
{
impl::OffsetIntersecter<FilterT> intersecter(filter);
// Slots [0, tokens.size()) are filled with the matches of the complete tokens.
MatchTokensInTrie(tokens, trieRoot, holder);
// Match tokens.
size_t const count = tokens.size();
for (size_t i = 0; i < count; ++i)
{
for (size_t j = 0; j < tokens[i].size(); ++j)
{
ASSERT ( !tokens[i][j].empty(), () );
// match in trie
impl::FullMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize,
tokens[i][j], intersecter);
}
// get additional features for 'i' token
addHolder.GetValues(i, intersecter);
intersecter.NextStep();
}
// Match prefix.
size_t const prefixCount = prefixTokens.size();
for (size_t i = 0; i < prefixCount; ++i)
{
ASSERT ( !prefixTokens[i].empty(), () );
// match in trie
impl::PrefixMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize,
prefixTokens[i], intersecter);
}
if (prefixCount > 0)
{
// get additional features for prefix token
addHolder.GetValues(count, intersecter);
intersecter.NextStep();
}
intersecter.ForEachResult(toDo);
// Add one more slot after the token slots and collect the prefix matches there.
holder.Resize(tokens.size() + 1);
holder.StartNew(tokens.size());
MatchTokenPrefixInTrie(prefixTokens, trieRoot, holder);
}
// Looks for the edge marked with search::CATEGORIES_LANG among the root's
// edges and, when it is found, fills holder with categories whose description
// matches at least one token from a search query.
// Returns true if the categories edge was found, false otherwise.
// *NOTE* the query prefix is treated as a complete token in this function.
template <typename THolder>
bool MatchCategoriesInTrie(SearchQueryParams const & params, TrieIterator const & trieRoot,
                           THolder && holder)
{
  ASSERT_LESS(trieRoot.m_edge.size(), numeric_limits<uint32_t>::max(), ());
  uint32_t const edgesCount = static_cast<uint32_t>(trieRoot.m_edge.size());

  for (uint32_t edgeIx = 0; edgeIx < edgesCount; ++edgeIx)
  {
    auto const & edge = trieRoot.m_edge[edgeIx].m_str;
    ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ());
    if (edge[0] != search::CATEGORIES_LANG)
      continue;

    unique_ptr<TrieIterator> const catRoot(trieRoot.GoToEdge(edgeIx));
    TrieRootPrefix const catPrefix(*catRoot, edge);

    MatchTokensInTrie(params.m_tokens, catPrefix, holder);

    // The last token's prefix is matched as a complete token here, to limit the
    // number of features in the last bucket of the holder. Probably, this is a
    // false optimization.
    size_t const prefixSlot = params.m_tokens.size();
    holder.Resize(prefixSlot + 1);
    holder.StartNew(prefixSlot);
    MatchTokenInTrie(params.m_prefixTokens, catPrefix, holder);
    return true;
  }

  return false;
}
// Walks over the root's edges and calls toDo with the trie root prefix and
// the language code for each language allowed by params. Edges whose first
// symbol is not below search::CATEGORIES_LANG are skipped.
template <typename ToDo>
void ForEachLangPrefix(SearchQueryParams const & params, TrieIterator const & trieRoot,
                       ToDo && toDo)
{
  ASSERT_LESS(trieRoot.m_edge.size(), numeric_limits<uint32_t>::max(), ());
  uint32_t const edgesCount = static_cast<uint32_t>(trieRoot.m_edge.size());

  for (uint32_t edgeIx = 0; edgeIx < edgesCount; ++edgeIx)
  {
    auto const & edge = trieRoot.m_edge[edgeIx].m_str;
    ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ());

    int8_t const lang = static_cast<int8_t>(edge[0]);
    bool const isLangEdge = edge[0] < search::CATEGORIES_LANG;
    // IsLangExist() is consulted only for language edges.
    if (!isLangEdge || !params.IsLangExist(lang))
      continue;

    unique_ptr<TrieIterator> const langRoot(trieRoot.GoToEdge(edgeIx));
    TrieRootPrefix langPrefix(*langRoot, edge);
    toDo(langPrefix, lang);
  }
}
// Calls toDo for each feature whose description contains *ALL* tokens from a
// search query. Each feature will be passed to toDo only once.
//
// Category matches are collected first; then, for every token (and for the
// prefix, if any), matches from all allowed languages together with the
// category matches for that token are fed to the intersecter as one step.
template <typename TFilter, typename ToDo>
void MatchFeaturesInTrie(SearchQueryParams const & params, TrieIterator const & trieRoot,
                         TFilter const & filter, ToDo && toDo)
{
  TrieValuesHolder<TFilter> categoriesHolder(filter);
  CHECK(MatchCategoriesInTrie(params, trieRoot, categoriesHolder), ("Can't find categories."));

  impl::OffsetIntersecter<TFilter> intersecter(filter);

  size_t const tokensCount = params.m_tokens.size();
  for (size_t tokenIx = 0; tokenIx < tokensCount; ++tokenIx)
  {
    auto const & synonyms = params.m_tokens[tokenIx];
    ForEachLangPrefix(params, trieRoot, [&](TrieRootPrefix & langRoot, int8_t /* lang */)
    {
      MatchTokenInTrie(synonyms, langRoot, intersecter);
    });
    categoriesHolder.GetValues(tokenIx, intersecter);
    intersecter.NextStep();
  }

  if (!params.m_prefixTokens.empty())
  {
    ForEachLangPrefix(params, trieRoot, [&](TrieRootPrefix & langRoot, int8_t /* lang */)
    {
      MatchTokenPrefixInTrie(params.m_prefixTokens, langRoot, intersecter);
    });
    // Category matches for the prefix live in the slot right after the tokens.
    categoriesHolder.GetValues(tokensCount, intersecter);
    intersecter.NextStep();
  }

  intersecter.ForEachResult(forward<ToDo>(toDo));
}
} // namespace search

View file

@ -1066,33 +1066,6 @@ void Query::MakeResultHighlight(Result & res) const
SearchStringTokensIntersectionRanges(res.GetString(), beg, end, AssignHighlightRange(res));
}
namespace impl
{
// Functor that hands every trie value over to Query::AddResultFromTrie for a
// fixed mwm id and viewport id, and counts how many values it has seen.
class FeatureLoader
{
public:
  FeatureLoader(Query & query, MwmSet::MwmId const & mwmID, Query::ViewportID viewportID)
    : m_query(query), m_mwmID(mwmID), m_count(0), m_viewportID(viewportID)
  {
  }

  // Counts the value and passes it to the query.
  void operator()(Query::TrieValueT const & value)
  {
    ++m_count;
    m_query.AddResultFromTrie(value, m_mwmID, m_viewportID);
  }

  // Number of values seen since construction or the last Reset().
  size_t GetCount() const { return m_count; }

  // Drops the counter back to zero.
  void Reset() { m_count = 0; }

private:
  // Declaration order is kept: it defines the member initialization order.
  Query & m_query;
  MwmSet::MwmId m_mwmID;
  size_t m_count;
  Query::ViewportID m_viewportID;
};
}
namespace
{
int GetOldTypeFromIndex(size_t index)
@ -1768,54 +1741,42 @@ void Query::SearchLocality(MwmValue * pMwm, impl::Locality & res1, impl::Region
serial::CodingParams cp(GetCPForTrie(pMwm->GetHeader().GetDefCodingParams()));
ModelReaderPtr searchReader = pMwm->m_cont.GetReader(SEARCH_INDEX_FILE_TAG);
unique_ptr<TrieIterator> const pTrieRoot(::trie::reader::ReadTrie(
SubReaderWrapper<Reader>(searchReader.GetPtr()),
trie::ValueReader(cp),
trie::EdgeValueReader()));
unique_ptr<TrieIterator> const trieRoot(
::trie::reader::ReadTrie(SubReaderWrapper<Reader>(searchReader.GetPtr()),
trie::ValueReader(cp), trie::EdgeValueReader()));
ASSERT_LESS(pTrieRoot->m_edge.size(), numeric_limits<uint32_t>::max(), ());
uint32_t const count = static_cast<uint32_t>(pTrieRoot->m_edge.size());
for (uint32_t i = 0; i < count; ++i)
{
TrieIterator::Edge::EdgeStrT const & edge = pTrieRoot->m_edge[i].m_str;
ForEachLangPrefix(params, *trieRoot, [&](TrieRootPrefix & langRoot, int8_t lang)
{
impl::DoFindLocality doFind(*this, pMwm, lang);
MatchTokensInTrie(params.m_tokens, langRoot, doFind);
/// We do search countries, states and cities for one language.
/// @todo Do combine countries and cities for different languages.
int8_t const lang = static_cast<int8_t>(edge[0]);
if (edge[0] < search::CATEGORIES_LANG && params.IsLangExist(lang))
// Last token's prefix is used as a complete token here, to limit number of results.
doFind.Resize(params.m_tokens.size() + 1);
doFind.StartNew(params.m_tokens.size());
MatchTokenInTrie(params.m_prefixTokens, langRoot, doFind);
doFind.SortLocalities();
// Get regions from STATE and COUNTRY localities
vector<impl::Region> regions;
doFind.GetRegions(regions);
// Get best CITY locality.
impl::Locality loc;
doFind.GetBestCity(loc, regions);
if (res1 < loc)
{
unique_ptr<TrieIterator> const pLangRoot(pTrieRoot->GoToEdge(i));
// gel all localities from mwm
impl::DoFindLocality doFind(*this, pMwm, lang);
GetFeaturesInTrie(params.m_tokens, params.m_prefixTokens,
TrieRootPrefix(*pLangRoot, edge), doFind);
// sort localities by priority
doFind.SortLocalities();
// get Region's from STATE and COUNTRY localities
vector<impl::Region> regions;
doFind.GetRegions(regions);
// get best CITY locality
impl::Locality loc;
doFind.GetBestCity(loc, regions);
if (res1 < loc)
{
LOG(LDEBUG, ("Better location ", loc, " for language ", lang));
res1.Swap(loc);
}
// get best region
if (!regions.empty())
{
sort(regions.begin(), regions.end());
if (res2 < regions.back())
res2.Swap(regions.back());
}
LOG(LDEBUG, ("Better location ", loc, " for language ", lang));
res1.Swap(loc);
}
}
// Get best region.
if (!regions.empty())
{
sort(regions.begin(), regions.end());
if (res2 < regions.back())
res2.Swap(regions.back());
}
});
}
void Query::SearchFeatures()
@ -1860,34 +1821,6 @@ namespace
binary_search(m_offsets->begin(), m_offsets->end(), offset));
}
};
// Collects trie values into indexed slots; a value is stored only when the
// filter accepts its m_featureId.
template <class FilterT>
class TrieValuesHolder
{
public:
  // Keeps a reference to |filter|, so the filter has to outlive the holder.
  TrieValuesHolder(FilterT const & filter) : m_filter(filter) {}

  // Sets the number of slots to |count|.
  void Resize(size_t count) { m_holder.resize(count); }

  // Selects the slot that subsequent operator() calls append to.
  void StartNew(size_t ind)
  {
    ASSERT_LESS ( ind, m_holder.size(), () );
    m_ind = ind;
  }

  // Appends |v| to the selected slot if the filter accepts v.m_featureId.
  void operator() (Query::TrieValueT const & v)
  {
    if (!m_filter(v.m_featureId))
      return;
    m_holder[m_ind].push_back(v);
  }

  // Calls toDo for each value stored in slot |ind|, in insertion order.
  template <class ToDo> void GetValues(size_t ind, ToDo & toDo) const
  {
    for (auto const & stored : m_holder[ind])
      toDo(stored);
  }

private:
  vector<vector<Query::TrieValueT> > m_holder;
  size_t m_ind;
  FilterT const & m_filter;
};
}
void Query::SearchFeatures(SearchQueryParams const & params, MWMVectorT const & mwmsInfo,
@ -1904,93 +1837,31 @@ void Query::SearchFeatures(SearchQueryParams const & params, MWMVectorT const &
}
}
namespace
{
// Finds the first root edge marked with search::CATEGORIES_LANG and fills
// categoriesHolder with the features matched under it by GetFeaturesInTrie.
// The categories edge is expected to exist (asserted).
void FillCategories(SearchQueryParams const & params, TrieIterator const * pTrieRoot,
                    TrieValuesHolder<FeaturesFilter> & categoriesHolder)
{
  using TEdge = TrieIterator::Edge::EdgeStrT;

  unique_ptr<TrieIterator> pCategoriesRoot;
  TEdge categoriesEdge;

  ASSERT_LESS(pTrieRoot->m_edge.size(), numeric_limits<uint32_t>::max(), ());
  uint32_t const edgesCount = static_cast<uint32_t>(pTrieRoot->m_edge.size());

  for (uint32_t edgeIx = 0; edgeIx < edgesCount; ++edgeIx)
  {
    TEdge const & edge = pTrieRoot->m_edge[edgeIx].m_str;
    ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ());
    if (edge[0] != search::CATEGORIES_LANG)
      continue;

    // Remember the edge and descend into it; only the first match is used.
    categoriesEdge = edge;
    pCategoriesRoot.reset(pTrieRoot->GoToEdge(edgeIx));
    break;
  }

  ASSERT(pCategoriesRoot != 0, ());

  GetFeaturesInTrie(params.m_tokens, params.m_prefixTokens,
                    TrieRootPrefix(*pCategoriesRoot, categoriesEdge),
                    categoriesHolder);
}
}
// Searches a single map (mwmHandle): reads the trie from the map's search
// index section and passes every value reported by MatchFeaturesInTrie to
// AddResultFromTrie together with the map id and the viewport id.
void Query::SearchInMWM(Index::MwmHandle const & mwmHandle, SearchQueryParams const & params,
ViewportID vID /*= DEFAULT_V*/)
ViewportID viewportId /*= DEFAULT_V*/)
{
if (MwmValue const * const pMwm = mwmHandle.GetValue<MwmValue>())
{
if (pMwm->m_cont.IsExist(SEARCH_INDEX_FILE_TAG))
{
FHeaderT const & header = pMwm->GetHeader();
MwmValue const * const value = mwmHandle.GetValue<MwmValue>();
// Nothing to do without a map value or without a search index section.
if (!value || !value->m_cont.IsExist(SEARCH_INDEX_FILE_TAG))
return;
/// @todo do not process World.mwm here - do it in SearchLocality
bool const isWorld = (header.GetType() == FHeaderT::world);
if (isWorld && !m_worldSearch)
return;
FHeaderT const & header = value->GetHeader();
/// @todo do not process World.mwm here - do it in SearchLocality
bool const isWorld = (header.GetType() == FHeaderT::world);
// The world map is skipped when m_worldSearch is not set.
if (isWorld && !m_worldSearch)
return;
serial::CodingParams cp(GetCPForTrie(header.GetDefCodingParams()));
ModelReaderPtr searchReader = pMwm->m_cont.GetReader(SEARCH_INDEX_FILE_TAG);
unique_ptr<TrieIterator> const pTrieRoot(::trie::reader::ReadTrie(
SubReaderWrapper<Reader>(searchReader.GetPtr()),
trie::ValueReader(cp),
trie::EdgeValueReader()));
MwmSet::MwmId const mwmId = mwmHandle.GetId();
FeaturesFilter filter((vID == DEFAULT_V || isWorld) ? 0 : &m_offsetsInViewport[vID][mwmId], *this);
// Get categories for each token separately - find needed edge with categories.
TrieValuesHolder<FeaturesFilter> categoriesHolder(filter);
FillCategories(params, pTrieRoot.get(), categoriesHolder);
// Match tokens to feature for each language - iterate through first edges.
impl::FeatureLoader emitter(*this, mwmId, vID);
ASSERT_LESS(pTrieRoot->m_edge.size(), numeric_limits<uint32_t>::max(), ());
uint32_t const count = static_cast<uint32_t>(pTrieRoot->m_edge.size());
for (uint32_t i = 0; i < count; ++i)
{
TrieIterator::Edge::EdgeStrT const & edge = pTrieRoot->m_edge[i].m_str;
int8_t const lang = static_cast<int8_t>(edge[0]);
if (edge[0] < search::CATEGORIES_LANG && params.IsLangExist(lang))
{
unique_ptr<TrieIterator> const pLangRoot(pTrieRoot->GoToEdge(i));
MatchFeaturesInTrie(params.m_tokens, params.m_prefixTokens,
TrieRootPrefix(*pLangRoot, edge),
filter, categoriesHolder, emitter);
LOG(LDEBUG, ("Country", pMwm->GetCountryFile().GetNameWithoutExt(), "Lang",
StringUtf8Multilang::GetLangByCode(lang), "Matched", emitter.GetCount()));
emitter.Reset();
}
}
}
}
serial::CodingParams cp(GetCPForTrie(header.GetDefCodingParams()));
ModelReaderPtr searchReader = value->m_cont.GetReader(SEARCH_INDEX_FILE_TAG);
unique_ptr<TrieIterator> const trieRoot(
::trie::reader::ReadTrie(SubReaderWrapper<Reader>(searchReader.GetPtr()),
trie::ValueReader(cp), trie::EdgeValueReader()));
MwmSet::MwmId const mwmId = mwmHandle.GetId();
// A null offsets pointer is passed to the filter for the default viewport
// and for the world map; otherwise the offsets stored for this viewport
// and map are used.
FeaturesFilter filter(
(viewportId == DEFAULT_V || isWorld) ? 0 : &m_offsetsInViewport[viewportId][mwmId], *this);
MatchFeaturesInTrie(params, *trieRoot, filter, [&](TrieValueT const & value)
{
AddResultFromTrie(value, mwmId, viewportId);
});
}
void Query::SuggestStrings(Results & res)

View file

@ -179,7 +179,7 @@ private:
ViewportID vID);
/// Do search in particular map (mwmHandle).
void SearchInMWM(Index::MwmHandle const & mwmHandle, SearchQueryParams const & params,
ViewportID vID = DEFAULT_V);
ViewportID viewportId = DEFAULT_V);
//@}
void SuggestStrings(Results & res);