[search] Fuzzy search by category.

This commit is contained in:
Maxim Pimenov 2017-02-01 19:14:01 +03:00
parent 05b55c1b6a
commit 8808823db1
12 changed files with 197 additions and 52 deletions

View file

@ -15,7 +15,12 @@ namespace my
template <typename String, typename Value>
class MemTrie
{
private:
struct Node;
public:
using Char = typename String::value_type;
MemTrie() = default;
MemTrie(MemTrie && rhs) { *this = std::move(rhs); }
@ -27,6 +32,31 @@ public:
return *this;
}
// A read-only iterator that points to a single node of the trie.
// Any modification to the underlying trie is assumed to invalidate
// the iterator.
//
// The node is held by pointer rather than by reference: a reference
// member would delete the copy assignment operator, which makes the
// iterator impossible to reassign or to keep in containers that
// assign their elements.
class Iterator
{
public:
  // Creates an iterator that points to |node|. |node| must outlive
  // the iterator.
  Iterator(MemTrie::Node const & node) : m_node(&node) {}

  // Calls |todo| once for every outgoing move of the current node,
  // passing the character of the move and an iterator that points
  // to the node the move leads to.
  template <typename ToDo>
  void ForEachMove(ToDo && todo) const
  {
    for (auto const & move : m_node->m_moves)
      todo(move.first, Iterator(*move.second));
  }

  // Calls |todo| once for every value stored in the current node.
  template <typename ToDo>
  void ForEachInNode(ToDo && todo) const
  {
    for (auto const & value : m_node->m_values)
      todo(value);
  }

private:
  // Never null: it is always initialized from a reference.
  MemTrie::Node const * m_node;
};
// Adds a key-value pair to the trie.
void Add(String const & key, Value const & value)
{
@ -41,40 +71,42 @@ public:
cur->AddValue(value);
}
// Traverses all key-value pairs in the trie and calls |toDo| on each of them.
// Traverses all key-value pairs in the trie and calls |todo| on each of them.
template <typename ToDo>
void ForEachInTrie(ToDo && toDo) const
void ForEachInTrie(ToDo && todo) const
{
String prefix;
ForEachInSubtree(m_root, prefix, std::forward<ToDo>(toDo));
ForEachInSubtree(m_root, prefix, std::forward<ToDo>(todo));
}
// Calls |toDo| for each key-value pair in the node that is reachable
// Calls |todo| for each key-value pair in the node that is reachable
// by |prefix| from the trie root. Does nothing if such node does
// not exist.
template <typename ToDo>
void ForEachInNode(String const & prefix, ToDo && toDo) const
void ForEachInNode(String const & prefix, ToDo && todo) const
{
if (auto const * root = MoveTo(prefix))
ForEachInNode(*root, prefix, std::forward<ToDo>(toDo));
ForEachInNode(*root, prefix, std::forward<ToDo>(todo));
}
// Calls |toDo| for each key-value pair in a subtree that is
// Calls |todo| for each key-value pair in a subtree that is
// reachable by |prefix| from the trie root. Does nothing if such
// subtree does not exist.
template <typename ToDo>
void ForEachInSubtree(String prefix, ToDo && toDo) const
void ForEachInSubtree(String prefix, ToDo && todo) const
{
if (auto const * root = MoveTo(prefix))
ForEachInSubtree(*root, prefix, std::forward<ToDo>(toDo));
ForEachInSubtree(*root, prefix, std::forward<ToDo>(todo));
}
size_t GetNumNodes() const { return m_numNodes; }
Iterator GetRootIterator() const { return Iterator(m_root); }
Node const & GetRoot() const { return m_root; }
private:
struct Node
{
using Char = typename String::value_type;
friend class MemTrie<String, Value>::Iterator;
Node() = default;
Node(Node && /* rhs */) = default;
@ -117,27 +149,27 @@ private:
return cur;
}
// Calls |toDo| for each key-value pair in a |node| that is
// Calls |todo| for each key-value pair in a |node| that is
// reachable by |prefix| from the trie root.
template <typename ToDo>
void ForEachInNode(Node const & node, String const & prefix, ToDo && toDo) const
void ForEachInNode(Node const & node, String const & prefix, ToDo && todo) const
{
for (auto const & value : node.m_values)
toDo(prefix, value);
todo(prefix, value);
}
// Calls |toDo| for each key-value pair in subtree where |node| is a
// Calls |todo| for each key-value pair in subtree where |node| is a
// root of the subtree. |prefix| is a path from the trie root to the
// |node|.
template <typename ToDo>
void ForEachInSubtree(Node const & node, String & prefix, ToDo && toDo) const
void ForEachInSubtree(Node const & node, String & prefix, ToDo && todo) const
{
ForEachInNode(node, prefix, toDo);
ForEachInNode(node, prefix, todo);
for (auto const & move : node.m_moves)
{
prefix.push_back(move.first);
ForEachInSubtree(*move.second, prefix, toDo);
ForEachInSubtree(*move.second, prefix, todo);
prefix.pop_back();
}
}

View file

@ -125,6 +125,15 @@ public:
/// @returns raw classificator type if it's not localized in categories.txt.
string GetReadableFeatureType(uint32_t type, int8_t locale) const;
// Exposes the tries that map category tokens to types.
// Returns the trie registered in |m_name2type| for |locale|,
// or nullptr when there is no entry for this locale.
Trie const * GetNameToTypesTrie(int8_t locale) const
{
  auto const found = m_name2type.find(locale);
  return found != m_name2type.end() ? found->second.get() : nullptr;
}
bool IsTypeExist(uint32_t type) const;
inline void Swap(CategoriesHolder & r)

View file

@ -123,6 +123,7 @@ set(
token_slice.hpp
types_skipper.cpp
types_skipper.hpp
utils.cpp
utils.hpp
viewport_search_callback.cpp
viewport_search_callback.hpp

View file

@ -5,16 +5,4 @@ namespace search
/// Upper bound for max count of tokens for indexing and scoring.
int constexpr MAX_TOKENS = 32;
int constexpr MAX_SUGGESTS_COUNT = 5;
// Returns true iff the range [begPrefix, endPrefix) is a prefix of
// the range [beg, end). An empty prefix matches any range.
template <typename IterT1, typename IterT2>
bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix)
{
  while (begPrefix != endPrefix)
  {
    // The sequence ended before the prefix did, or the elements differ.
    if (!(beg != end) || !(*beg == *begPrefix))
      return false;
    ++beg;
    ++begPrefix;
  }
  return true;
}
} // namespace search

View file

@ -221,7 +221,7 @@ struct SearchTrieRequest
QueryParams::Langs m_langs;
};
// Calls |toDo| for each feature accepted but at least one DFA.
// Calls |toDo| for each feature accepted by at least one DFA.
//
// *NOTE* |toDo| may be called several times for the same feature.
template <typename DFA, typename Value, typename ToDo>

View file

@ -315,18 +315,6 @@ size_t OrderCountries(m2::RectD const & pivot, vector<shared_ptr<MwmInfo>> & inf
auto const sep = stable_partition(infos.begin(), infos.end(), intersects);
return distance(infos.begin(), sep);
}
// Returns the maximum number of errors allowed for |token|:
//   * 0 if the token consists of ASCII digits only;
//   * 0 if the token has fewer than 4 characters;
//   * 1 if the token has fewer than 8 characters;
//   * 2 otherwise.
size_t GetMaxErrorsForToken(UniString const & token)
{
  // An explicit range check is used instead of isdigit(): the elements
  // of |token| are not guaranteed to fit into unsigned char, and calling
  // isdigit() with such a value is undefined behavior.
  bool const digitsOnly =
      all_of(token.begin(), token.end(),
             [](UniString::value_type c) { return c >= '0' && c <= '9'; });
  if (digitsOnly)
    return 0;
  if (token.size() < 4)
    return 0;
  if (token.size() < 8)
    return 1;
  return 2;
}
} // namespace
// Geocoder::Params --------------------------------------------------------------------------------

View file

@ -1,4 +1,6 @@
#include "keyword_matcher.hpp"
#include "search/keyword_matcher.hpp"
#include "search/utils.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"

View file

@ -128,8 +128,8 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res
GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {});
}
// Removes all full-token stop words from |params|, unless |params|
// consists of all such tokens.
// Removes all full-token stop words from |params|.
// Does nothing if all tokens in |params| are stop words.
void RemoveStopWordsIfNeeded(QueryParams & params)
{
size_t numStopWords = 0;
@ -331,6 +331,7 @@ int8_t Processor::GetLanguage(int id) const
{
return m_ranker.GetLanguage(GetLangIndex(id));
}
m2::PointD Processor::GetPivotPoint() const
{
bool const viewportSearch = m_mode == Mode::Viewport;
@ -413,6 +414,13 @@ void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && todo)
::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward<ToDo>(todo));
}
// Forwards to ::search::ForEachCategoryTypeFuzzy using the category
// locales and the categories holder of this processor.
template <typename ToDo>
void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const
{
  auto const & locales = GetCategoryLocales();
  ::search::ForEachCategoryTypeFuzzy(slice, locales, m_categories, forward<ToDo>(todo));
}
void Processor::Search(SearchParams const & params, m2::RectD const & viewport)
{
SetMode(params.m_mode);
@ -671,11 +679,9 @@ void Processor::InitParams(QueryParams & params)
}
}
};
ForEachCategoryType(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms);
auto & langs = params.GetLangs();
for (int i = 0; i < LANG_COUNT; ++i)
langs.Insert(GetLanguage(i));
// todo(@m, @y). Shall we match prefix tokens for categories?
ForEachCategoryTypeFuzzy(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms);
RemoveStopWordsIfNeeded(params);
@ -687,6 +693,12 @@ void Processor::InitParams(QueryParams & params)
if (IsStreetSynonym(token.m_original))
params.GetTypeIndices(i).clear();
}
for (size_t i = 0; i < params.GetNumTokens(); ++i)
my::SortUnique(params.GetTypeIndices(i));
for (int i = 0; i < LANG_COUNT; ++i)
params.GetLangs().Insert(GetLanguage(i));
}
void Processor::InitGeocoder(Geocoder::Params & params)

View file

@ -141,6 +141,9 @@ protected:
template <typename ToDo>
void ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) const;
template <typename ToDo>
void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const;
m2::PointD GetPivotPoint() const;
m2::RectD GetPivotRect() const;

View file

@ -135,4 +135,5 @@ SOURCES += \
streets_matcher.cpp \
token_slice.cpp \
types_skipper.cpp \
utils.cpp \
viewport_search_callback.cpp

View file

@ -754,6 +754,9 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
TestPOI bar(m2::PointD(0, 0), "Черчилль", "ru");
bar.SetTypes({{"amenity", "pub"}});
TestPOI metro(m2::PointD(5.0, 5.0), "Liceu", "es");
metro.SetTypes({{"railway", "subway_entrance"}});
BuildWorld([&](TestMwmBuilder & builder) {
builder.Add(country);
builder.Add(city);
@ -762,6 +765,7 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
auto id = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
builder.Add(street);
builder.Add(bar);
builder.Add(metro);
});
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
@ -778,6 +782,17 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ());
TEST(ResultsMatch("моксва ленинргадский черчиль", "ru", rules), ());
TEST(ResultsMatch("food", "ru", rules), ());
TEST(ResultsMatch("foood", "ru", rules), ());
TEST(ResultsMatch("fod", "ru", TRules{}), ());
TRules rulesMetro = {ExactMatch(id, metro)};
TEST(ResultsMatch("transporte", "es", rulesMetro), ());
TEST(ResultsMatch("transport", "es", rulesMetro), ());
TEST(ResultsMatch("transpurt", "en", rulesMetro), ());
TEST(ResultsMatch("transpurrt", "es", rulesMetro), ());
TEST(ResultsMatch("transportation", "en", TRules{}), ());
}
}

View file

@ -3,14 +3,74 @@
#include "search/token_slice.hpp"
#include "indexer/categories_holder.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "base/buffer_vector.hpp"
#include "base/levenshtein_dfa.hpp"
#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
#include <algorithm>
#include <cctype>
#include <functional>
#include <queue>
#include <utility>
namespace search
{
namespace
{
// Walks |trie| and |dfa| in lockstep, breadth-first, starting from the
// trie root and the initial state of the DFA. Calls |toDo| for every
// value stored in a trie node that is reached while the DFA is in an
// accepting state. Branches on which the DFA rejects are not explored.
//
// What the body requires from the template arguments:
//   * Trie provides Char, Iterator and GetRootIterator(); the iterator
//     provides ForEachInNode() and ForEachMove()
//     (e.g. my::MemTrie<strings::UniString, uint32_t>).
//   * DFA provides Iterator and Begin(); the iterator provides
//     Accepts(), Rejects() and Move().
//
// Returns true iff at least one accepting state was reached, even if
// the corresponding trie node holds no values.
//
// todo(@m, @y). Unite with the similar function in search/feature_offset_match.hpp.
template <typename Trie, typename DFA, typename ToDo>
bool MatchInTrie(Trie const & trie, DFA const & dfa, ToDo && toDo)
{
  using Char = typename Trie::Char;
  using TrieIt = typename Trie::Iterator;
  using DFAIt = typename DFA::Iterator;
  // Explicitly qualified: this is a header and must not depend on a
  // using-declaration for pair leaking from some other header.
  using State = std::pair<TrieIt, DFAIt>;

  std::queue<State> q;

  {
    auto it = dfa.Begin();
    if (it.Rejects())
      return false;
    q.emplace(trie.GetRootIterator(), it);
  }

  bool found = false;

  while (!q.empty())
  {
    // A copy is taken because pop() destroys the front element.
    auto const p = q.front();
    q.pop();

    auto const & trieIt = p.first;
    auto const & dfaIt = p.second;

    if (dfaIt.Accepts())
    {
      trieIt.ForEachInNode(toDo);
      found = true;
    }

    trieIt.ForEachMove([&](Char const & c, TrieIt const & nextTrieIt) {
      // Advance a copy of the DFA state so that sibling moves start
      // from the same state.
      auto nextDfaIt = dfaIt;
      nextDfaIt.Move(c);
      if (!nextDfaIt.Rejects())
        q.emplace(nextTrieIt, nextDfaIt);
    });
  }

  return found;
}
} // namespace
using TLocales = buffer_vector<int8_t, 3>;
size_t GetMaxErrorsForToken(strings::UniString const & token);
template <typename ToDo>
void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales,
CategoriesHolder const & categories, ToDo && todo)
@ -18,16 +78,50 @@ void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales
for (size_t i = 0; i < slice.Size(); ++i)
{
auto const & token = slice.Get(i);
for (size_t j = 0; j < locales.size(); ++j)
categories.ForEachTypeByName(locales[j], token, bind<void>(todo, i, _1));
for (int8_t const locale : locales)
categories.ForEachTypeByName(locale, token, std::bind<void>(todo, i, std::placeholders::_1));
// Special case processing of 2 codepoints emoji (e.g. black guy on a bike).
// Only emoji synonyms can have one codepoint.
if (token.size() > 1)
{
categories.ForEachTypeByName(CategoriesHolder::kEnglishCode, strings::UniString(1, token[0]),
bind<void>(todo, i, _1));
std::bind<void>(todo, i, std::placeholders::_1));
}
}
}
// Fuzzy counterpart of ForEachCategoryType.
//
// ForEachCategoryType extracts types by matching a token from |slice|
// exactly to a token in the name of a category. This function instead
// builds a Levenshtein DFA for every token of |slice| (with
// prefixCharsToKeep = 1 and the number of errors returned by
// GetMaxErrorsForToken) and matches it against the name-to-types trie
// of every locale in |locales|. Locales that have no trie are skipped.
// |todo| is called with the index of the token and the matched value.
//
// In the worst case this has to loop through the tokens in all category
// synonyms in all |locales| in order to find a token whose edit distance
// is close enough to the required token from |slice|.
template <typename ToDo>
void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & locales,
                              CategoriesHolder const & categories, ToDo && todo)
{
  for (size_t tokenId = 0; tokenId < slice.Size(); ++tokenId)
  {
    auto const & token = slice.Get(tokenId);
    auto const maxErrors = GetMaxErrorsForToken(token);
    // The DFA depends only on the token, so it is shared by all locales.
    strings::LevenshteinDFA const dfa(token, 1 /* prefixCharsToKeep */, maxErrors);

    for (int8_t const locale : locales)
    {
      auto const * trie = categories.GetNameToTypesTrie(locale);
      if (trie == nullptr)
        continue;

      MatchInTrie(*trie, dfa, std::bind<void>(todo, tokenId, std::placeholders::_1));
    }
  }
}
// Checks whether the sequence [beg, end) starts with the sequence
// [begPrefix, endPrefix). Returns true for an empty prefix.
template <typename IterT1, typename IterT2>
bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix)
{
  for (;;)
  {
    // The whole prefix has been matched.
    if (begPrefix == endPrefix)
      return true;
    // The sequence is shorter than the prefix or differs from it.
    if (!(beg != end) || !(*beg == *begPrefix))
      return false;
    ++beg;
    ++begPrefix;
  }
}
} // namespace search