From d7af959ce6c52d949bbdd2161a507b02f5b6684a Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Mon, 13 Jul 2015 16:28:06 +0300 Subject: [PATCH] [search] Refactored trie-walking routines. --- search/feature_offset_match.hpp | 250 +++++++++++++++++++++----------- search/search_query.cpp | 235 +++++++----------------------- search/search_query.hpp | 2 +- 3 files changed, 221 insertions(+), 266 deletions(-) diff --git a/search/feature_offset_match.hpp b/search/feature_offset_match.hpp index 529a258f24..3474d69a23 100644 --- a/search/feature_offset_match.hpp +++ b/search/feature_offset_match.hpp @@ -1,5 +1,6 @@ #pragma once #include "search/search_common.hpp" +#include "search/search_query_params.hpp" #include "indexer/search_trie.hpp" @@ -221,16 +222,15 @@ public: m_set->clear(); } - template void ForEachResult(ToDo & toDo) const + template + void ForEachResult(ToDo && toDo) const { - if (m_prevSet) - { - for (typename SetType::const_iterator i = m_prevSet->begin(); i != m_prevSet->end(); ++i) - toDo(*i); - } + if (!m_prevSet) + return; + for (auto const & value : *m_prevSet) + toDo(value); } }; - } // namespace search::impl struct TrieRootPrefix @@ -255,91 +255,175 @@ struct TrieRootPrefix } }; -/// Return features set for each token. -template -void GetFeaturesInTrie(vector > const & tokens, - vector const & prefixTokens, - TrieRootPrefix const & trieRoot, - HolderT & holder) +template +class TrieValuesHolder { - // Match tokens. 
- size_t const count = tokens.size(); - holder.Resize(count + 1); +public: + TrieValuesHolder(TFilter const & filter) : m_filter(filter) {} - for (size_t i = 0; i < count; ++i) + void Resize(size_t count) { m_holder.resize(count); } + + void StartNew(size_t index) + { + ASSERT_LESS(index, m_holder.size(), ()); + m_index = index; + } + + void operator()(Query::TrieValueT const & v) + { + if (m_filter(v.m_featureId)) + m_holder[m_index].push_back(v); + } + + template + void GetValues(size_t index, ToDo && toDo) const + { + for (auto const & value : m_holder[index]) + toDo(value); + } + +private: + vector > m_holder; + size_t m_index; + TFilter const & m_filter; +}; + +// Calls toDo for each feature corresponding to at least one sym. +// *NOTE* toDo may be called several times for the same feature. +template +void MatchTokenInTrie(SearchQueryParams::TSynonymsVector const & syms, + TrieRootPrefix const & trieRoot, ToDo && toDo) +{ + for (auto const & sym : syms) + { + ASSERT(!sym.empty(), ()); + impl::FullMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, sym, toDo); + } +} + +// Calls toDo for each feature whose tokens contain at least one sym +// as a prefix. +// *NOTE* toDo may be called several times for the same feature. +template +void MatchTokenPrefixInTrie(SearchQueryParams::TSynonymsVector const & syms, + TrieRootPrefix const & trieRoot, ToDo && toDo) +{ + for (auto const & sym : syms) + { + ASSERT(!sym.empty(), ()); + impl::PrefixMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, sym, toDo); + } +} + +// Fills holder with features whose names correspond to tokens list up to synonyms. +// *NOTE* the same feature may be put in the same holder's slot several times. 
+template +void MatchTokensInTrie(vector const & tokens, + TrieRootPrefix const & trieRoot, THolder && holder) +{ + holder.Resize(tokens.size()); + for (size_t i = 0; i < tokens.size(); ++i) { holder.StartNew(i); - - for (size_t j = 0; j < tokens[i].size(); ++j) - { - ASSERT ( !tokens[i][j].empty(), () ); - - impl::FullMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, - tokens[i][j], holder); - } - } - - // Match prefix. - holder.StartNew(count); - for (size_t i = 0; i < prefixTokens.size(); ++i) - { - ASSERT ( !prefixTokens[i].empty(), () ); - - impl::FullMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, - prefixTokens[i], holder); + MatchTokenInTrie(tokens[i], trieRoot, holder); } } -/// Do set intersection of features for each token. -template -void MatchFeaturesInTrie(vector > const & tokens, - vector const & prefixTokens, - TrieRootPrefix const & trieRoot, - FilterT const & filter, - HolderT const & addHolder, - ToDo & toDo) +// Fills holder with features whose names correspond to tokens list up to synonyms, +// also, last holder's slot will be filled with features corresponding to prefixTokens. +// *NOTE* the same feature may be put in the same holder's slot several times. +template +void MatchTokensAndPrefixInTrie(vector const & tokens, + SearchQueryParams::TSynonymsVector const & prefixTokens, + TrieRootPrefix const & trieRoot, THolder && holder) { - impl::OffsetIntersecter intersecter(filter); + MatchTokensInTrie(tokens, trieRoot, holder); - // Match tokens. - size_t const count = tokens.size(); - for (size_t i = 0; i < count; ++i) - { - for (size_t j = 0; j < tokens[i].size(); ++j) - { - ASSERT ( !tokens[i][j].empty(), () ); - - // match in trie - impl::FullMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, - tokens[i][j], intersecter); - } - - // get additional features for 'i' token - addHolder.GetValues(i, intersecter); - - intersecter.NextStep(); - } - - // Match prefix. 
- size_t const prefixCount = prefixTokens.size(); - for (size_t i = 0; i < prefixCount; ++i) - { - ASSERT ( !prefixTokens[i].empty(), () ); - - // match in trie - impl::PrefixMatchInTrie(trieRoot.m_root, trieRoot.m_prefix, trieRoot.m_prefixSize, - prefixTokens[i], intersecter); - } - - if (prefixCount > 0) - { - // get additional features for prefix token - addHolder.GetValues(count, intersecter); - - intersecter.NextStep(); - } - - intersecter.ForEachResult(toDo); + holder.Resize(tokens.size() + 1); + holder.StartNew(tokens.size()); + MatchTokenPrefixInTrie(prefixTokens, trieRoot, holder); } +// Fills holder with categories whose description matches to at least one +// token from a search query. +// *NOTE* query prefix will be treated as a complete token in the function. +template +bool MatchCategoriesInTrie(SearchQueryParams const & params, TrieIterator const & trieRoot, + THolder && holder) +{ + ASSERT_LESS(trieRoot.m_edge.size(), numeric_limits::max(), ()); + uint32_t const numLangs = static_cast(trieRoot.m_edge.size()); + for (uint32_t langIx = 0; langIx < numLangs; ++langIx) + { + auto const & edge = trieRoot.m_edge[langIx].m_str; + ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ()); + if (edge[0] == search::CATEGORIES_LANG) + { + unique_ptr const catRoot(trieRoot.GoToEdge(langIx)); + MatchTokensInTrie(params.m_tokens, TrieRootPrefix(*catRoot, edge), holder); + + // Last token's prefix is used as a complete token here, to limit a number of + // features in the last bucket of a holder. Probably, this is a false optimization. + holder.Resize(params.m_tokens.size() + 1); + holder.StartNew(params.m_tokens.size()); + MatchTokenInTrie(params.m_prefixTokens, TrieRootPrefix(*catRoot, edge), holder); + return true; + } + } + return false; +} + +// Calls toDo with trie root prefix and language code on each language allowed by params. 
+template +void ForEachLangPrefix(SearchQueryParams const & params, TrieIterator const & trieRoot, + ToDo && toDo) +{ + ASSERT_LESS(trieRoot.m_edge.size(), numeric_limits::max(), ()); + uint32_t const numLangs = static_cast(trieRoot.m_edge.size()); + for (uint32_t langIx = 0; langIx < numLangs; ++langIx) + { + auto const & edge = trieRoot.m_edge[langIx].m_str; + ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ()); + int8_t const lang = static_cast(edge[0]); + if (edge[0] < search::CATEGORIES_LANG && params.IsLangExist(lang)) + { + unique_ptr const langRoot(trieRoot.GoToEdge(langIx)); + TrieRootPrefix langPrefix(*langRoot, edge); + toDo(langPrefix, lang); + } + } +} + +// Calls toDo for each feature whose description contains *ALL* tokens from a search query. +// Each feature will be passed to toDo only once. +template +void MatchFeaturesInTrie(SearchQueryParams const & params, TrieIterator const & trieRoot, + TFilter const & filter, ToDo && toDo) +{ + TrieValuesHolder categoriesHolder(filter); + CHECK(MatchCategoriesInTrie(params, trieRoot, categoriesHolder), ("Can't find categories.")); + + impl::OffsetIntersecter intersecter(filter); + for (size_t i = 0; i < params.m_tokens.size(); ++i) + { + ForEachLangPrefix(params, trieRoot, [&](TrieRootPrefix & langRoot, int8_t /* lang */) + { + MatchTokenInTrie(params.m_tokens[i], langRoot, intersecter); + }); + categoriesHolder.GetValues(i, intersecter); + intersecter.NextStep(); + } + + if (!params.m_prefixTokens.empty()) + { + ForEachLangPrefix(params, trieRoot, [&](TrieRootPrefix & langRoot, int8_t /* lang */) + { + MatchTokenPrefixInTrie(params.m_prefixTokens, langRoot, intersecter); + }); + categoriesHolder.GetValues(params.m_tokens.size(), intersecter); + intersecter.NextStep(); + } + + intersecter.ForEachResult(forward(toDo)); +} } // namespace search diff --git a/search/search_query.cpp b/search/search_query.cpp index 722c084822..cbc9732b2f 100644 --- a/search/search_query.cpp +++ b/search/search_query.cpp @@ -1066,33 
+1066,6 @@ void Query::MakeResultHighlight(Result & res) const SearchStringTokensIntersectionRanges(res.GetString(), beg, end, AssignHighlightRange(res)); } -namespace impl -{ -class FeatureLoader -{ - Query & m_query; - MwmSet::MwmId m_mwmID; - size_t m_count; - Query::ViewportID m_viewportID; - -public: - FeatureLoader(Query & query, MwmSet::MwmId const & mwmID, Query::ViewportID viewportID) - : m_query(query), m_mwmID(mwmID), m_count(0), m_viewportID(viewportID) - { - } - - void operator()(Query::TrieValueT const & value) - { - ++m_count; - m_query.AddResultFromTrie(value, m_mwmID, m_viewportID); - } - - size_t GetCount() const { return m_count; } - - void Reset() { m_count = 0; } -}; -} - namespace { int GetOldTypeFromIndex(size_t index) @@ -1768,54 +1741,42 @@ void Query::SearchLocality(MwmValue * pMwm, impl::Locality & res1, impl::Region serial::CodingParams cp(GetCPForTrie(pMwm->GetHeader().GetDefCodingParams())); ModelReaderPtr searchReader = pMwm->m_cont.GetReader(SEARCH_INDEX_FILE_TAG); - unique_ptr const pTrieRoot(::trie::reader::ReadTrie( - SubReaderWrapper(searchReader.GetPtr()), - trie::ValueReader(cp), - trie::EdgeValueReader())); + unique_ptr const trieRoot( + ::trie::reader::ReadTrie(SubReaderWrapper(searchReader.GetPtr()), + trie::ValueReader(cp), trie::EdgeValueReader())); - ASSERT_LESS(pTrieRoot->m_edge.size(), numeric_limits::max(), ()); - uint32_t const count = static_cast(pTrieRoot->m_edge.size()); - for (uint32_t i = 0; i < count; ++i) - { - TrieIterator::Edge::EdgeStrT const & edge = pTrieRoot->m_edge[i].m_str; + ForEachLangPrefix(params, *trieRoot, [&](TrieRootPrefix & langRoot, int8_t lang) + { + impl::DoFindLocality doFind(*this, pMwm, lang); + MatchTokensInTrie(params.m_tokens, langRoot, doFind); - /// We do search countries, states and cities for one language. - /// @todo Do combine countries and cities for different languages. 
- int8_t const lang = static_cast(edge[0]); - if (edge[0] < search::CATEGORIES_LANG && params.IsLangExist(lang)) + // Last token's prefix is used as a complete token here, to limit number of results. + doFind.Resize(params.m_tokens.size() + 1); + doFind.StartNew(params.m_tokens.size()); + MatchTokenInTrie(params.m_prefixTokens, langRoot, doFind); + doFind.SortLocalities(); + + // Get regions from STATE and COUNTRY localities + vector regions; + doFind.GetRegions(regions); + + // Get best CITY locality. + impl::Locality loc; + doFind.GetBestCity(loc, regions); + if (res1 < loc) { - unique_ptr const pLangRoot(pTrieRoot->GoToEdge(i)); - - // gel all localities from mwm - impl::DoFindLocality doFind(*this, pMwm, lang); - GetFeaturesInTrie(params.m_tokens, params.m_prefixTokens, - TrieRootPrefix(*pLangRoot, edge), doFind); - - // sort localities by priority - doFind.SortLocalities(); - - // get Region's from STATE and COUNTRY localities - vector regions; - doFind.GetRegions(regions); - - // get best CITY locality - impl::Locality loc; - doFind.GetBestCity(loc, regions); - if (res1 < loc) - { - LOG(LDEBUG, ("Better location ", loc, " for language ", lang)); - res1.Swap(loc); - } - - // get best region - if (!regions.empty()) - { - sort(regions.begin(), regions.end()); - if (res2 < regions.back()) - res2.Swap(regions.back()); - } + LOG(LDEBUG, ("Better location ", loc, " for language ", lang)); + res1.Swap(loc); } - } + + // Get best region. 
+ if (!regions.empty()) + { + sort(regions.begin(), regions.end()); + if (res2 < regions.back()) + res2.Swap(regions.back()); + } + }); } void Query::SearchFeatures() @@ -1860,34 +1821,6 @@ namespace binary_search(m_offsets->begin(), m_offsets->end(), offset)); } }; - - template class TrieValuesHolder - { - vector > m_holder; - size_t m_ind; - FilterT const & m_filter; - - public: - TrieValuesHolder(FilterT const & filter) : m_filter(filter) {} - - void Resize(size_t count) { m_holder.resize(count); } - void StartNew(size_t ind) - { - ASSERT_LESS ( ind, m_holder.size(), () ); - m_ind = ind; - } - void operator() (Query::TrieValueT const & v) - { - if (m_filter(v.m_featureId)) - m_holder[m_ind].push_back(v); - } - - template void GetValues(size_t ind, ToDo & toDo) const - { - for (size_t i = 0; i < m_holder[ind].size(); ++i) - toDo(m_holder[ind][i]); - } - }; } void Query::SearchFeatures(SearchQueryParams const & params, MWMVectorT const & mwmsInfo, @@ -1904,93 +1837,31 @@ void Query::SearchFeatures(SearchQueryParams const & params, MWMVectorT const & } } -namespace -{ -void FillCategories(SearchQueryParams const & params, TrieIterator const * pTrieRoot, - TrieValuesHolder & categoriesHolder) -{ - unique_ptr pCategoriesRoot; - typedef TrieIterator::Edge::EdgeStrT EdgeT; - EdgeT categoriesEdge; - - ASSERT_LESS(pTrieRoot->m_edge.size(), numeric_limits::max(), ()); - uint32_t const count = static_cast(pTrieRoot->m_edge.size()); - for (uint32_t i = 0; i < count; ++i) - { - EdgeT const & edge = pTrieRoot->m_edge[i].m_str; - ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ()); - - if (edge[0] == search::CATEGORIES_LANG) - { - categoriesEdge = edge; - pCategoriesRoot.reset(pTrieRoot->GoToEdge(i)); - break; - } - } - ASSERT(pCategoriesRoot != 0, ()); - - GetFeaturesInTrie(params.m_tokens, params.m_prefixTokens, - TrieRootPrefix(*pCategoriesRoot, categoriesEdge), - categoriesHolder); -} - -} - void Query::SearchInMWM(Index::MwmHandle const & mwmHandle, SearchQueryParams const & 
params, - ViewportID vID /*= DEFAULT_V*/) + ViewportID viewportId /*= DEFAULT_V*/) { - if (MwmValue const * const pMwm = mwmHandle.GetValue()) - { - if (pMwm->m_cont.IsExist(SEARCH_INDEX_FILE_TAG)) - { - FHeaderT const & header = pMwm->GetHeader(); + MwmValue const * const value = mwmHandle.GetValue(); + if (!value || !value->m_cont.IsExist(SEARCH_INDEX_FILE_TAG)) + return; - /// @todo do not process World.mwm here - do it in SearchLocality - bool const isWorld = (header.GetType() == FHeaderT::world); - if (isWorld && !m_worldSearch) - return; + FHeaderT const & header = value->GetHeader(); + /// @todo do not process World.mwm here - do it in SearchLocality + bool const isWorld = (header.GetType() == FHeaderT::world); + if (isWorld && !m_worldSearch) + return; - serial::CodingParams cp(GetCPForTrie(header.GetDefCodingParams())); - - ModelReaderPtr searchReader = pMwm->m_cont.GetReader(SEARCH_INDEX_FILE_TAG); - unique_ptr const pTrieRoot(::trie::reader::ReadTrie( - SubReaderWrapper(searchReader.GetPtr()), - trie::ValueReader(cp), - trie::EdgeValueReader())); - - MwmSet::MwmId const mwmId = mwmHandle.GetId(); - FeaturesFilter filter((vID == DEFAULT_V || isWorld) ? 0 : &m_offsetsInViewport[vID][mwmId], *this); - - // Get categories for each token separately - find needed edge with categories. - TrieValuesHolder categoriesHolder(filter); - FillCategories(params, pTrieRoot.get(), categoriesHolder); - - // Match tokens to feature for each language - iterate through first edges. 
- impl::FeatureLoader emitter(*this, mwmId, vID); - - ASSERT_LESS(pTrieRoot->m_edge.size(), numeric_limits::max(), ()); - uint32_t const count = static_cast(pTrieRoot->m_edge.size()); - for (uint32_t i = 0; i < count; ++i) - { - TrieIterator::Edge::EdgeStrT const & edge = pTrieRoot->m_edge[i].m_str; - int8_t const lang = static_cast(edge[0]); - - if (edge[0] < search::CATEGORIES_LANG && params.IsLangExist(lang)) - { - unique_ptr const pLangRoot(pTrieRoot->GoToEdge(i)); - - MatchFeaturesInTrie(params.m_tokens, params.m_prefixTokens, - TrieRootPrefix(*pLangRoot, edge), - filter, categoriesHolder, emitter); - - LOG(LDEBUG, ("Country", pMwm->GetCountryFile().GetNameWithoutExt(), "Lang", - StringUtf8Multilang::GetLangByCode(lang), "Matched", emitter.GetCount())); - - emitter.Reset(); - } - } - } - } + serial::CodingParams cp(GetCPForTrie(header.GetDefCodingParams())); + ModelReaderPtr searchReader = value->m_cont.GetReader(SEARCH_INDEX_FILE_TAG); + unique_ptr const trieRoot( + ::trie::reader::ReadTrie(SubReaderWrapper(searchReader.GetPtr()), + trie::ValueReader(cp), trie::EdgeValueReader())); + MwmSet::MwmId const mwmId = mwmHandle.GetId(); + FeaturesFilter filter( + (viewportId == DEFAULT_V || isWorld) ? 0 : &m_offsetsInViewport[viewportId][mwmId], *this); + MatchFeaturesInTrie(params, *trieRoot, filter, [&](TrieValueT const & value) + { + AddResultFromTrie(value, mwmId, viewportId); + }); } void Query::SuggestStrings(Results & res) diff --git a/search/search_query.hpp b/search/search_query.hpp index 77dac826ed..043f0b7564 100644 --- a/search/search_query.hpp +++ b/search/search_query.hpp @@ -179,7 +179,7 @@ private: ViewportID vID); /// Do search in particular map (mwmHandle). void SearchInMWM(Index::MwmHandle const & mwmHandle, SearchQueryParams const & params, - ViewportID vID = DEFAULT_V); + ViewportID viewportId = DEFAULT_V); //@} void SuggestStrings(Results & res);