forked from organicmaps/organicmaps-tmp
[search] Fuzzy search by category.
This commit is contained in:
parent
05b55c1b6a
commit
8808823db1
12 changed files with 197 additions and 52 deletions
|
@ -15,7 +15,12 @@ namespace my
|
|||
template <typename String, typename Value>
|
||||
class MemTrie
|
||||
{
|
||||
private:
|
||||
struct Node;
|
||||
|
||||
public:
|
||||
using Char = typename String::value_type;
|
||||
|
||||
MemTrie() = default;
|
||||
MemTrie(MemTrie && rhs) { *this = std::move(rhs); }
|
||||
|
||||
|
@ -27,6 +32,31 @@ public:
|
|||
return *this;
|
||||
}
|
||||
|
||||
// A read-only iterator. Any modification to the
|
||||
// underlying trie is assumed to invalidate the iterator.
|
||||
class Iterator
|
||||
{
|
||||
public:
|
||||
Iterator(MemTrie::Node const & node) : m_node(node) {}
|
||||
|
||||
template <typename ToDo>
|
||||
void ForEachMove(ToDo && todo) const
|
||||
{
|
||||
for (auto const & move : m_node.m_moves)
|
||||
todo(move.first, Iterator(*move.second));
|
||||
}
|
||||
|
||||
template <typename ToDo>
|
||||
void ForEachInNode(ToDo && todo) const
|
||||
{
|
||||
for (auto const & value : m_node.m_values)
|
||||
todo(value);
|
||||
}
|
||||
|
||||
private:
|
||||
MemTrie::Node const & m_node;
|
||||
};
|
||||
|
||||
// Adds a key-value pair to the trie.
|
||||
void Add(String const & key, Value const & value)
|
||||
{
|
||||
|
@ -41,40 +71,42 @@ public:
|
|||
cur->AddValue(value);
|
||||
}
|
||||
|
||||
// Traverses all key-value pairs in the trie and calls |toDo| on each of them.
|
||||
// Traverses all key-value pairs in the trie and calls |todo| on each of them.
|
||||
template <typename ToDo>
|
||||
void ForEachInTrie(ToDo && toDo) const
|
||||
void ForEachInTrie(ToDo && todo) const
|
||||
{
|
||||
String prefix;
|
||||
ForEachInSubtree(m_root, prefix, std::forward<ToDo>(toDo));
|
||||
ForEachInSubtree(m_root, prefix, std::forward<ToDo>(todo));
|
||||
}
|
||||
|
||||
// Calls |toDo| for each key-value pair in the node that is reachable
|
||||
// Calls |todo| for each key-value pair in the node that is reachable
|
||||
// by |prefix| from the trie root. Does nothing if such node does
|
||||
// not exist.
|
||||
template <typename ToDo>
|
||||
void ForEachInNode(String const & prefix, ToDo && toDo) const
|
||||
void ForEachInNode(String const & prefix, ToDo && todo) const
|
||||
{
|
||||
if (auto const * root = MoveTo(prefix))
|
||||
ForEachInNode(*root, prefix, std::forward<ToDo>(toDo));
|
||||
ForEachInNode(*root, prefix, std::forward<ToDo>(todo));
|
||||
}
|
||||
|
||||
// Calls |toDo| for each key-value pair in a subtree that is
|
||||
// Calls |todo| for each key-value pair in a subtree that is
|
||||
// reachable by |prefix| from the trie root. Does nothing if such
|
||||
// subtree does not exist.
|
||||
template <typename ToDo>
|
||||
void ForEachInSubtree(String prefix, ToDo && toDo) const
|
||||
void ForEachInSubtree(String prefix, ToDo && todo) const
|
||||
{
|
||||
if (auto const * root = MoveTo(prefix))
|
||||
ForEachInSubtree(*root, prefix, std::forward<ToDo>(toDo));
|
||||
ForEachInSubtree(*root, prefix, std::forward<ToDo>(todo));
|
||||
}
|
||||
|
||||
size_t GetNumNodes() const { return m_numNodes; }
|
||||
Iterator GetRootIterator() const { return Iterator(m_root); }
|
||||
Node const & GetRoot() const { return m_root; }
|
||||
|
||||
private:
|
||||
struct Node
|
||||
{
|
||||
using Char = typename String::value_type;
|
||||
friend class MemTrie<String, Value>::Iterator;
|
||||
|
||||
Node() = default;
|
||||
Node(Node && /* rhs */) = default;
|
||||
|
@ -117,27 +149,27 @@ private:
|
|||
return cur;
|
||||
}
|
||||
|
||||
// Calls |toDo| for each key-value pair in a |node| that is
|
||||
// Calls |todo| for each key-value pair in a |node| that is
|
||||
// reachable by |prefix| from the trie root.
|
||||
template <typename ToDo>
|
||||
void ForEachInNode(Node const & node, String const & prefix, ToDo && toDo) const
|
||||
void ForEachInNode(Node const & node, String const & prefix, ToDo && todo) const
|
||||
{
|
||||
for (auto const & value : node.m_values)
|
||||
toDo(prefix, value);
|
||||
todo(prefix, value);
|
||||
}
|
||||
|
||||
// Calls |toDo| for each key-value pair in subtree where |node| is a
|
||||
// Calls |todo| for each key-value pair in subtree where |node| is a
|
||||
// root of the subtree. |prefix| is a path from the trie root to the
|
||||
// |node|.
|
||||
template <typename ToDo>
|
||||
void ForEachInSubtree(Node const & node, String & prefix, ToDo && toDo) const
|
||||
void ForEachInSubtree(Node const & node, String & prefix, ToDo && todo) const
|
||||
{
|
||||
ForEachInNode(node, prefix, toDo);
|
||||
ForEachInNode(node, prefix, todo);
|
||||
|
||||
for (auto const & move : node.m_moves)
|
||||
{
|
||||
prefix.push_back(move.first);
|
||||
ForEachInSubtree(*move.second, prefix, toDo);
|
||||
ForEachInSubtree(*move.second, prefix, todo);
|
||||
prefix.pop_back();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -125,6 +125,15 @@ public:
|
|||
/// @returns raw classificator type if it's not localized in categories.txt.
|
||||
string GetReadableFeatureType(uint32_t type, int8_t locale) const;
|
||||
|
||||
// Exposes the tries that map category tokens to types.
|
||||
Trie const * GetNameToTypesTrie(int8_t locale) const
|
||||
{
|
||||
auto const it = m_name2type.find(locale);
|
||||
if (it == m_name2type.end())
|
||||
return nullptr;
|
||||
return it->second.get();
|
||||
}
|
||||
|
||||
bool IsTypeExist(uint32_t type) const;
|
||||
|
||||
inline void Swap(CategoriesHolder & r)
|
||||
|
|
|
@ -123,6 +123,7 @@ set(
|
|||
token_slice.hpp
|
||||
types_skipper.cpp
|
||||
types_skipper.hpp
|
||||
utils.cpp
|
||||
utils.hpp
|
||||
viewport_search_callback.cpp
|
||||
viewport_search_callback.hpp
|
||||
|
|
|
@ -5,16 +5,4 @@ namespace search
|
|||
/// Upper bound for max count of tokens for indexing and scoring.
|
||||
int constexpr MAX_TOKENS = 32;
|
||||
int constexpr MAX_SUGGESTS_COUNT = 5;
|
||||
|
||||
template <typename IterT1, typename IterT2>
|
||||
bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix)
|
||||
{
|
||||
while (beg != end && begPrefix != endPrefix && *beg == *begPrefix)
|
||||
{
|
||||
++beg;
|
||||
++begPrefix;
|
||||
}
|
||||
return begPrefix == endPrefix;
|
||||
}
|
||||
|
||||
} // namespace search
|
||||
|
|
|
@ -221,7 +221,7 @@ struct SearchTrieRequest
|
|||
QueryParams::Langs m_langs;
|
||||
};
|
||||
|
||||
// Calls |toDo| for each feature accepted but at least one DFA.
|
||||
// Calls |toDo| for each feature accepted by at least one DFA.
|
||||
//
|
||||
// *NOTE* |toDo| may be called several times for the same feature.
|
||||
template <typename DFA, typename Value, typename ToDo>
|
||||
|
|
|
@ -315,18 +315,6 @@ size_t OrderCountries(m2::RectD const & pivot, vector<shared_ptr<MwmInfo>> & inf
|
|||
auto const sep = stable_partition(infos.begin(), infos.end(), intersects);
|
||||
return distance(infos.begin(), sep);
|
||||
}
|
||||
|
||||
size_t GetMaxErrorsForToken(UniString const & token)
|
||||
{
|
||||
bool const digitsOnly = all_of(token.begin(), token.end(), isdigit);
|
||||
if (digitsOnly)
|
||||
return 0;
|
||||
if (token.size() < 4)
|
||||
return 0;
|
||||
if (token.size() < 8)
|
||||
return 1;
|
||||
return 2;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Geocoder::Params --------------------------------------------------------------------------------
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#include "keyword_matcher.hpp"
|
||||
#include "search/keyword_matcher.hpp"
|
||||
|
||||
#include "search/utils.hpp"
|
||||
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
|
|
@ -128,8 +128,8 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res
|
|||
GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {});
|
||||
}
|
||||
|
||||
// Removes all full-token stop words from |params|, unless |params|
|
||||
// consists of all such tokens.
|
||||
// Removes all full-token stop words from |params|.
|
||||
// Does nothing if all tokens in |params| are stop words.
|
||||
void RemoveStopWordsIfNeeded(QueryParams & params)
|
||||
{
|
||||
size_t numStopWords = 0;
|
||||
|
@ -331,6 +331,7 @@ int8_t Processor::GetLanguage(int id) const
|
|||
{
|
||||
return m_ranker.GetLanguage(GetLangIndex(id));
|
||||
}
|
||||
|
||||
m2::PointD Processor::GetPivotPoint() const
|
||||
{
|
||||
bool const viewportSearch = m_mode == Mode::Viewport;
|
||||
|
@ -413,6 +414,13 @@ void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && todo)
|
|||
::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward<ToDo>(todo));
|
||||
}
|
||||
|
||||
template <typename ToDo>
|
||||
void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const
|
||||
{
|
||||
::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories,
|
||||
forward<ToDo>(todo));
|
||||
}
|
||||
|
||||
void Processor::Search(SearchParams const & params, m2::RectD const & viewport)
|
||||
{
|
||||
SetMode(params.m_mode);
|
||||
|
@ -671,11 +679,9 @@ void Processor::InitParams(QueryParams & params)
|
|||
}
|
||||
}
|
||||
};
|
||||
ForEachCategoryType(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms);
|
||||
|
||||
auto & langs = params.GetLangs();
|
||||
for (int i = 0; i < LANG_COUNT; ++i)
|
||||
langs.Insert(GetLanguage(i));
|
||||
// todo(@m, @y). Shall we match prefix tokens for categories?
|
||||
ForEachCategoryTypeFuzzy(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms);
|
||||
|
||||
RemoveStopWordsIfNeeded(params);
|
||||
|
||||
|
@ -687,6 +693,12 @@ void Processor::InitParams(QueryParams & params)
|
|||
if (IsStreetSynonym(token.m_original))
|
||||
params.GetTypeIndices(i).clear();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < params.GetNumTokens(); ++i)
|
||||
my::SortUnique(params.GetTypeIndices(i));
|
||||
|
||||
for (int i = 0; i < LANG_COUNT; ++i)
|
||||
params.GetLangs().Insert(GetLanguage(i));
|
||||
}
|
||||
|
||||
void Processor::InitGeocoder(Geocoder::Params & params)
|
||||
|
|
|
@ -141,6 +141,9 @@ protected:
|
|||
template <typename ToDo>
|
||||
void ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) const;
|
||||
|
||||
template <typename ToDo>
|
||||
void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const;
|
||||
|
||||
m2::PointD GetPivotPoint() const;
|
||||
m2::RectD GetPivotRect() const;
|
||||
|
||||
|
|
|
@ -135,4 +135,5 @@ SOURCES += \
|
|||
streets_matcher.cpp \
|
||||
token_slice.cpp \
|
||||
types_skipper.cpp \
|
||||
utils.cpp \
|
||||
viewport_search_callback.cpp
|
||||
|
|
|
@ -754,6 +754,9 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
|
|||
TestPOI bar(m2::PointD(0, 0), "Черчилль", "ru");
|
||||
bar.SetTypes({{"amenity", "pub"}});
|
||||
|
||||
TestPOI metro(m2::PointD(5.0, 5.0), "Liceu", "es");
|
||||
metro.SetTypes({{"railway", "subway_entrance"}});
|
||||
|
||||
BuildWorld([&](TestMwmBuilder & builder) {
|
||||
builder.Add(country);
|
||||
builder.Add(city);
|
||||
|
@ -762,6 +765,7 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
|
|||
auto id = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
|
||||
builder.Add(street);
|
||||
builder.Add(bar);
|
||||
builder.Add(metro);
|
||||
});
|
||||
|
||||
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
|
||||
|
@ -778,6 +782,17 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
|
|||
TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ());
|
||||
|
||||
TEST(ResultsMatch("моксва ленинргадский черчиль", "ru", rules), ());
|
||||
|
||||
TEST(ResultsMatch("food", "ru", rules), ());
|
||||
TEST(ResultsMatch("foood", "ru", rules), ());
|
||||
TEST(ResultsMatch("fod", "ru", TRules{}), ());
|
||||
|
||||
TRules rulesMetro = {ExactMatch(id, metro)};
|
||||
TEST(ResultsMatch("transporte", "es", rulesMetro), ());
|
||||
TEST(ResultsMatch("transport", "es", rulesMetro), ());
|
||||
TEST(ResultsMatch("transpurt", "en", rulesMetro), ());
|
||||
TEST(ResultsMatch("transpurrt", "es", rulesMetro), ());
|
||||
TEST(ResultsMatch("transportation", "en", TRules{}), ());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
100
search/utils.hpp
100
search/utils.hpp
|
@ -3,14 +3,74 @@
|
|||
#include "search/token_slice.hpp"
|
||||
|
||||
#include "indexer/categories_holder.hpp"
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "base/buffer_vector.hpp"
|
||||
#include "base/levenshtein_dfa.hpp"
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <functional>
|
||||
#include <queue>
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace
|
||||
{
|
||||
// my::MemTrie<strings::UniString, uint32_t>
|
||||
// todo(@m, @y). Unite with the similar function in search/feature_offset_match.hpp.
|
||||
template <typename Trie, typename DFA, typename ToDo>
|
||||
bool MatchInTrie(Trie const & trie, DFA const & dfa, ToDo && toDo)
|
||||
{
|
||||
using Char = typename Trie::Char;
|
||||
using TrieIt = typename Trie::Iterator;
|
||||
using DFAIt = typename DFA::Iterator;
|
||||
using State = pair<TrieIt, DFAIt>;
|
||||
|
||||
std::queue<State> q;
|
||||
|
||||
{
|
||||
auto it = dfa.Begin();
|
||||
if (it.Rejects())
|
||||
return false;
|
||||
q.emplace(trie.GetRootIterator(), it);
|
||||
}
|
||||
|
||||
bool found = false;
|
||||
|
||||
while (!q.empty())
|
||||
{
|
||||
auto const p = q.front();
|
||||
q.pop();
|
||||
|
||||
auto const & trieIt = p.first;
|
||||
auto const & dfaIt = p.second;
|
||||
|
||||
if (dfaIt.Accepts())
|
||||
{
|
||||
trieIt.ForEachInNode(toDo);
|
||||
found = true;
|
||||
}
|
||||
|
||||
trieIt.ForEachMove([&](Char const & c, TrieIt const & nextTrieIt) {
|
||||
auto nextDfaIt = dfaIt;
|
||||
nextDfaIt.Move(c);
|
||||
if (!nextDfaIt.Rejects())
|
||||
q.emplace(nextTrieIt, nextDfaIt);
|
||||
});
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
using TLocales = buffer_vector<int8_t, 3>;
|
||||
|
||||
size_t GetMaxErrorsForToken(strings::UniString const & token);
|
||||
|
||||
template <typename ToDo>
|
||||
void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales,
|
||||
CategoriesHolder const & categories, ToDo && todo)
|
||||
|
@ -18,16 +78,50 @@ void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales
|
|||
for (size_t i = 0; i < slice.Size(); ++i)
|
||||
{
|
||||
auto const & token = slice.Get(i);
|
||||
for (size_t j = 0; j < locales.size(); ++j)
|
||||
categories.ForEachTypeByName(locales[j], token, bind<void>(todo, i, _1));
|
||||
for (int8_t const locale : locales)
|
||||
categories.ForEachTypeByName(locale, token, std::bind<void>(todo, i, std::placeholders::_1));
|
||||
|
||||
// Special case processing of 2 codepoints emoji (e.g. black guy on a bike).
|
||||
// Only emoji synonyms can have one codepoint.
|
||||
if (token.size() > 1)
|
||||
{
|
||||
categories.ForEachTypeByName(CategoriesHolder::kEnglishCode, strings::UniString(1, token[0]),
|
||||
bind<void>(todo, i, _1));
|
||||
std::bind<void>(todo, i, std::placeholders::_1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Unlike ForEachCategoryType which extracts types by a token
|
||||
// from |slice| by exactly matching it to a token in the name
|
||||
// of a category, in the worst case this function has to loop through the tokens
|
||||
// in all category synonyms in all |locales| in order to find a token
|
||||
// whose edit distance is close enough to the required token from |slice|.
|
||||
template <typename ToDo>
|
||||
void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & locales,
|
||||
CategoriesHolder const & categories, ToDo && todo)
|
||||
{
|
||||
for (size_t i = 0; i < slice.Size(); ++i)
|
||||
{
|
||||
auto const & token = slice.Get(i);
|
||||
auto const & dfa =
|
||||
strings::LevenshteinDFA(token, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(token));
|
||||
for (int8_t const locale : locales)
|
||||
{
|
||||
auto const * trie = categories.GetNameToTypesTrie(locale);
|
||||
if (trie != nullptr)
|
||||
MatchInTrie(*trie, dfa, std::bind<void>(todo, i, std::placeholders::_1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename IterT1, typename IterT2>
|
||||
bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix)
|
||||
{
|
||||
while (beg != end && begPrefix != endPrefix && *beg == *begPrefix)
|
||||
{
|
||||
++beg;
|
||||
++begPrefix;
|
||||
}
|
||||
return begPrefix == endPrefix;
|
||||
}
|
||||
} // namespace search
|
||||
|
|
Loading…
Add table
Reference in a new issue