Merge pull request #5412 from mpimenov/categories

[search] Fuzzy search by category.
This commit is contained in:
ygorshenin 2017-02-14 18:24:32 +03:00 committed by GitHub
commit 4973685785
16 changed files with 220 additions and 62 deletions

View file

@ -3,6 +3,7 @@
#include "base/macros.hpp"
#include "base/stl_add.hpp"
#include <algorithm>
#include <cstdint>
#include <map>
#include <memory>
@ -15,7 +16,12 @@ namespace my
template <typename String, typename Value>
class MemTrie
{
private:
struct Node;
public:
using Char = typename String::value_type;
MemTrie() = default;
MemTrie(MemTrie && rhs) { *this = std::move(rhs); }
@ -23,10 +29,38 @@ public:
{
m_root = std::move(rhs.m_root);
m_numNodes = rhs.m_numNodes;
rhs.m_numNodes = 1;
rhs.Clear();
return *this;
}
// A read-only iterator wrapping a Node. Any modification to the
// underlying trie is assumed to invalidate the iterator.
//
// The iterator stores only a const reference to the node, so it is
// cheap to copy but cannot be re-seated (assigned) after construction
// and must not outlive the trie it was obtained from.
class Iterator
{
public:
// Wraps |node|; no ownership is taken.
Iterator(MemTrie::Node const & node) : m_node(node) {}
// Iterates over all possible moves from this Iterator's node
// and calls |toDo| with two arguments:
// (Char of the move, Iterator wrapping the node of the move).
// Moves are stored in a std::map keyed by Char, so they are
// visited in ascending order of Char.
template <typename ToDo>
void ForEachMove(ToDo && toDo) const
{
for (auto const & move : m_node.m_moves)
toDo(move.first, Iterator(*move.second));
}
// Calls |toDo| for every value in this Iterator's node.
// |toDo| is handed to std::for_each, which takes its functor by value.
template <typename ToDo>
void ForEachInNode(ToDo && toDo) const
{
std::for_each(m_node.m_values.begin(), m_node.m_values.end(), std::forward<ToDo>(toDo));
}
private:
// The wrapped node; never modified through the iterator.
MemTrie::Node const & m_node;
};
// Adds a key-value pair to the trie.
void Add(String const & key, Value const & value)
{
@ -69,12 +103,20 @@ public:
ForEachInSubtree(*root, prefix, std::forward<ToDo>(toDo));
}
// Drops every move and value stored under the root and resets the
// node counter to 1 (presumably counting the remaining root node).
void Clear()
{
m_root.Clear();
m_numNodes = 1;
}
// Returns the current value of the trie's node counter.
size_t GetNumNodes() const { return m_numNodes; }
// Returns a read-only Iterator wrapping the root node.
Iterator GetRootIterator() const { return Iterator(m_root); }
// Returns a const reference to the root node itself.
Node const & GetRoot() const { return m_root; }
private:
struct Node
{
using Char = typename String::value_type;
friend class MemTrie<String, Value>::Iterator;
Node() = default;
Node(Node && /* rhs */) = default;
@ -98,6 +140,12 @@ private:
// Appends a copy of |value| to this node's list of values; duplicates are kept.
void AddValue(Value const & value) { m_values.push_back(value); }
// Empties this node: erasing |m_moves| releases the owned child nodes
// (they are held by unique_ptr), and all values of the node are dropped.
void Clear()
{
m_moves.clear();
m_values.clear();
}
std::map<Char, std::unique_ptr<Node>> m_moves;
std::vector<Value> m_values;

View file

@ -237,19 +237,14 @@ bool IsASCIIString(std::string const & str)
// Returns true iff |c| lies in the ASCII range '0'..'9'.
bool IsASCIIDigit(UniChar c)
{
  return '0' <= c && c <= '9';
}
// Returns true iff |c| is an ASCII Latin letter of either case.
bool IsASCIILatin(UniChar c)
{
  bool const isLower = 'a' <= c && c <= 'z';
  bool const isUpper = 'A' <= c && c <= 'Z';
  return isLower || isUpper;
}
bool StartsWith(UniString const & s, UniString const & p)
{
if (p.size() > s.size())
return false;
for (size_t i = 0; i < p.size(); ++i)
{
if (s[i] != p[i])
return false;
}
return true;
return StartsWith(s.begin(), s.end(), p.begin(), p.end());
}
// Returns true iff the NUL-terminated string |s2| is a prefix of |s1|.
// An empty |s2| is a prefix of every string.
bool StartsWith(std::string const & s1, char const * s2)
{
  size_t const prefixLength = strlen(s2);
  int const order = s1.compare(0, prefixLength, s2);
  return order == 0;
}
bool EndsWith(std::string const & s1, char const * s2)
{
size_t const n = s1.size();

View file

@ -439,6 +439,17 @@ std::string to_string_dac(double d, int dac);
// Long-named alias of to_string_dac(): forwards |d| and |dac| unchanged
// and returns whatever to_string_dac() returns.
inline std::string to_string_with_digits_after_comma(double d, int dac) { return to_string_dac(d, dac); }
//@}
// Returns true iff the sequence [begPrefix, endPrefix) is a prefix of
// the sequence [beg, end). Elements are compared with operator==.
// An empty prefix matches any sequence.
template <typename IterT1, typename IterT2>
bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix)
{
  for (; begPrefix != endPrefix; ++beg, ++begPrefix)
  {
    // The prefix still has elements left: the main sequence must
    // provide a matching element at the same position.
    bool const matches = beg != end && *beg == *begPrefix;
    if (!matches)
      return false;
  }
  return true;
}
bool StartsWith(UniString const & s, UniString const & p);
bool StartsWith(std::string const & s1, char const * s2);

View file

@ -69,7 +69,7 @@ bool ParseEmoji(CategoriesHolder::Category::Name & name)
return false;
}
name.m_name = ToUtf8(UniString(1, static_cast<UniChar>(c)));
name.m_name = ToUtf8(UniString(1 /* numChars */, static_cast<UniChar>(c)));
if (IsASCIIString(ToUtf8(search::NormalizeAndSimplifyString(name.m_name))))
{
@ -203,6 +203,8 @@ void CategoriesHolder::AddCategory(Category & cat, vector<uint32_t> & types)
auto const locale = synonym.m_locale;
ASSERT_NOT_EQUAL(locale, kUnsupportedLocaleCode, ());
auto const localePrefix = String(1, static_cast<strings::UniChar>(locale));
auto const uniName = search::NormalizeAndSimplifyString(synonym.m_name);
vector<String> tokens;
@ -213,10 +215,7 @@ void CategoriesHolder::AddCategory(Category & cat, vector<uint32_t> & types)
if (!ValidKeyToken(token))
continue;
for (uint32_t const t : types)
{
auto it = m_name2type.emplace(locale, make_unique<Trie>()).first;
it->second->Add(token, t);
}
m_name2type.Add(localePrefix + token, t);
}
}
}
@ -243,7 +242,7 @@ bool CategoriesHolder::ValidKeyToken(String const & s)
void CategoriesHolder::LoadFromStream(istream & s)
{
m_type2cat.clear();
m_name2type.clear();
m_name2type.Clear();
m_groupTranslations.clear();
State state = EParseTypes;

View file

@ -4,6 +4,7 @@
#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
#include "std/algorithm.hpp"
#include "std/deque.hpp"
#include "std/iostream.hpp"
#include "std/map.hpp"
@ -56,7 +57,8 @@ private:
Type2CategoryCont m_type2cat;
// Maps locale and category token to the list of corresponding types.
map<int8_t, unique_ptr<Trie>> m_name2type;
// Locale is treated as a special symbol prepended to the token.
Trie m_name2type;
GroupTranslations m_groupTranslations;
@ -109,10 +111,9 @@ public:
template <class ToDo>
void ForEachTypeByName(int8_t locale, String const & name, ToDo && toDo) const
{
auto const it = m_name2type.find(locale);
if (it == m_name2type.end())
return;
it->second->ForEachInNode(name, my::MakeIgnoreFirstArgument(forward<ToDo>(toDo)));
auto const localePrefix = String(1, static_cast<strings::UniChar>(locale));
m_name2type.ForEachInNode(localePrefix + name,
my::MakeIgnoreFirstArgument(forward<ToDo>(toDo)));
}
// Returns a read-only reference to the stored group translations.
inline GroupTranslations const & GetGroupTranslations() const { return m_groupTranslations; }
@ -125,12 +126,14 @@ public:
/// @returns raw classificator type if it's not localized in categories.txt.
string GetReadableFeatureType(uint32_t type, int8_t locale) const;
// Exposes the trie that maps category tokens to types. There is a single
// trie for all locales: every key is a token with its locale prepended
// as a special first symbol.
Trie const & GetNameToTypesTrie() const { return m_name2type; }
bool IsTypeExist(uint32_t type) const;
inline void Swap(CategoriesHolder & r)
{
m_type2cat.swap(r.m_type2cat);
m_name2type.swap(r.m_name2type);
std::swap(m_name2type, r.m_name2type);
}
// Converts any language |locale| from UI to the corresponding

View file

@ -123,6 +123,7 @@ set(
token_slice.hpp
types_skipper.cpp
types_skipper.hpp
utils.cpp
utils.hpp
viewport_search_callback.cpp
viewport_search_callback.hpp

View file

@ -5,16 +5,4 @@ namespace search
/// Upper bound for max count of tokens for indexing and scoring.
int constexpr MAX_TOKENS = 32;
int constexpr MAX_SUGGESTS_COUNT = 5;
// Checks whether [begPrefix, endPrefix) is a prefix of [beg, end),
// comparing elements pairwise with operator==. An exhausted prefix
// means success; an exhausted main sequence or a mismatch means failure.
template <typename IterT1, typename IterT2>
bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix)
{
  while (begPrefix != endPrefix)
  {
    if (!(beg != end))
      return false;
    if (!(*beg == *begPrefix))
      return false;
    ++beg;
    ++begPrefix;
  }
  return true;
}
} // namespace search

View file

@ -221,7 +221,7 @@ struct SearchTrieRequest
QueryParams::Langs m_langs;
};
// Calls |toDo| for each feature accepted but at least one DFA.
// Calls |toDo| for each feature accepted by at least one DFA.
//
// *NOTE* |toDo| may be called several times for the same feature.
template <typename DFA, typename Value, typename ToDo>

View file

@ -315,18 +315,6 @@ size_t OrderCountries(m2::RectD const & pivot, vector<shared_ptr<MwmInfo>> & inf
auto const sep = stable_partition(infos.begin(), infos.end(), intersects);
return distance(infos.begin(), sep);
}
// Returns the maximum number of errors tolerated when fuzzy-matching |token|:
//   * 0 for tokens consisting only of ASCII digits (including the empty token);
//   * 0 for tokens shorter than 4 symbols;
//   * 1 for tokens of 4..7 symbols;
//   * 2 for longer tokens.
size_t GetMaxErrorsForToken(UniString const & token)
{
  // Code points are compared directly instead of being passed to isdigit():
  // isdigit() has undefined behaviour for arguments that are not representable
  // as unsigned char, and the symbols of a UniString are not limited to that range.
  bool const digitsOnly =
      all_of(token.begin(), token.end(),
             [](UniString::value_type c) { return c >= '0' && c <= '9'; });
  if (digitsOnly)
    return 0;
  if (token.size() < 4)
    return 0;
  if (token.size() < 8)
    return 1;
  return 2;
}
} // namespace
// Geocoder::Params --------------------------------------------------------------------------------

View file

@ -1,9 +1,10 @@
#include "keyword_matcher.hpp"
#include "search/keyword_matcher.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "base/stl_add.hpp"
#include "base/string_utils.hpp"
#include "std/algorithm.hpp"
#include "std/sstream.hpp"
@ -67,7 +68,7 @@ KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t coun
bPrefixMatched = false;
for (int j = 0; j < count && !bPrefixMatched; ++j)
if (!isNameTokenMatched[j] &&
StartsWith(tokens[j].begin(), tokens[j].end(), m_prefix.begin(), m_prefix.end()))
strings::StartsWith(tokens[j].begin(), tokens[j].end(), m_prefix.begin(), m_prefix.end()))
{
isNameTokenMatched[j] = bPrefixMatched = true;
int8_t const tokenMatchDistance = int(m_keywords.size()) - j;

View file

@ -128,8 +128,8 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res
GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {});
}
// Removes all full-token stop words from |params|, unless |params|
// consists of all such tokens.
// Removes all full-token stop words from |params|.
// Does nothing if all tokens in |params| are non-prefix stop words.
void RemoveStopWordsIfNeeded(QueryParams & params)
{
size_t numStopWords = 0;
@ -331,6 +331,7 @@ int8_t Processor::GetLanguage(int id) const
{
return m_ranker.GetLanguage(GetLangIndex(id));
}
m2::PointD Processor::GetPivotPoint() const
{
bool const viewportSearch = m_mode == Mode::Viewport;
@ -408,9 +409,16 @@ TLocales Processor::GetCategoryLocales() const
}
template <typename ToDo>
void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) const
void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && toDo) const
{
::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward<ToDo>(todo));
::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward<ToDo>(toDo));
}
// Thin wrapper around ::search::ForEachCategoryTypeFuzzy: passes |slice|
// through and supplies the locales from GetCategoryLocales() together
// with this processor's |m_categories|. |toDo| is forwarded unchanged.
template <typename ToDo>
void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && toDo) const
{
::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories,
forward<ToDo>(toDo));
}
void Processor::Search(SearchParams const & params, m2::RectD const & viewport)
@ -671,11 +679,9 @@ void Processor::InitParams(QueryParams & params)
}
}
};
ForEachCategoryType(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms);
auto & langs = params.GetLangs();
for (int i = 0; i < LANG_COUNT; ++i)
langs.Insert(GetLanguage(i));
// todo(@m, @y). Shall we match prefix tokens for categories?
ForEachCategoryTypeFuzzy(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms);
RemoveStopWordsIfNeeded(params);
@ -687,6 +693,12 @@ void Processor::InitParams(QueryParams & params)
if (IsStreetSynonym(token.m_original))
params.GetTypeIndices(i).clear();
}
for (size_t i = 0; i < params.GetNumTokens(); ++i)
my::SortUnique(params.GetTypeIndices(i));
for (int i = 0; i < LANG_COUNT; ++i)
params.GetLangs().Insert(GetLanguage(i));
}
void Processor::InitGeocoder(Geocoder::Params & params)

View file

@ -139,7 +139,10 @@ protected:
TLocales GetCategoryLocales() const;
template <typename ToDo>
void ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) const;
void ForEachCategoryType(StringSliceBase const & slice, ToDo && toDo) const;
template <typename ToDo>
void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && toDo) const;
m2::PointD GetPivotPoint() const;
m2::RectD GetPivotRect() const;

View file

@ -8,6 +8,7 @@
#include "indexer/feature_algo.hpp"
#include "base/logging.hpp"
#include "base/string_utils.hpp"
#include "std/algorithm.hpp"
#include "std/iterator.hpp"
@ -433,7 +434,7 @@ void Ranker::MatchForSuggestions(strings::UniString const & token, int8_t locale
if ((suggest.m_prefixLength <= token.size()) &&
(token != s) && // do not push suggestion if it already equals to token
(suggest.m_locale == locale) && // push suggestions only for needed language
StartsWith(s.begin(), s.end(), token.begin(), token.end()))
strings::StartsWith(s.begin(), s.end(), token.begin(), token.end()))
{
string const utf8Str = strings::ToUtf8(s);
Result r(utf8Str, prologue + utf8Str + " ");

View file

@ -135,4 +135,5 @@ SOURCES += \
streets_matcher.cpp \
token_slice.cpp \
types_skipper.cpp \
utils.cpp \
viewport_search_callback.cpp

View file

@ -754,6 +754,9 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
TestPOI bar(m2::PointD(0, 0), "Черчилль", "ru");
bar.SetTypes({{"amenity", "pub"}});
TestPOI metro(m2::PointD(5.0, 5.0), "Liceu", "es");
metro.SetTypes({{"railway", "subway_entrance"}});
BuildWorld([&](TestMwmBuilder & builder) {
builder.Add(country);
builder.Add(city);
@ -762,6 +765,7 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
auto id = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
builder.Add(street);
builder.Add(bar);
builder.Add(metro);
});
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
@ -778,6 +782,17 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ());
TEST(ResultsMatch("моксва ленинргадский черчиль", "ru", rules), ());
TEST(ResultsMatch("food", "ru", rules), ());
TEST(ResultsMatch("foood", "ru", rules), ());
TEST(ResultsMatch("fod", "ru", TRules{}), ());
TRules rulesMetro = {ExactMatch(id, metro)};
TEST(ResultsMatch("transporte", "es", rulesMetro), ());
TEST(ResultsMatch("transport", "es", rulesMetro), ());
TEST(ResultsMatch("transpurt", "en", rulesMetro), ());
TEST(ResultsMatch("transpurrt", "es", rulesMetro), ());
TEST(ResultsMatch("transportation", "en", TRules{}), ());
}
}

View file

@ -3,14 +3,72 @@
#include "search/token_slice.hpp"
#include "indexer/categories_holder.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "base/buffer_vector.hpp"
#include "base/levenshtein_dfa.hpp"
#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
#include <algorithm>
#include <cctype>
#include <functional>
#include <queue>
#include <utility>
#include <vector>
namespace search
{
// todo(@m, @y). Unite with the similar function in search/feature_offset_match.hpp.
//
// Walks the trie and |dfa| simultaneously, breadth-first, starting from the
// trie node wrapped by |trieStartIt| and from the initial state of |dfa|.
// A trie move is followed only while the DFA does not reject the characters
// read so far. For every visited trie node at which the DFA accepts, |toDo|
// is called with each value stored in that node.
//
// The first parameter is unused; it exists only so that the Trie type
// (and hence Trie::Iterator) can be deduced at the call site.
//
// Returns true iff at least one accepting state was reached, even if the
// corresponding trie nodes hold no values. Returns false immediately when
// the initial DFA state already rejects.
template <typename Trie, typename DFA, typename ToDo>
bool MatchInTrie(Trie const & /* trie */, typename Trie::Iterator const & trieStartIt,
                 DFA const & dfa, ToDo && toDo)
{
  using Char = typename Trie::Char;
  using TrieIt = typename Trie::Iterator;
  using DFAIt = typename DFA::Iterator;
  using State = std::pair<TrieIt, DFAIt>;

  std::queue<State> q;

  {
    auto it = dfa.Begin();
    if (it.Rejects())
      return false;
    q.emplace(trieStartIt, it);
  }

  bool found = false;

  while (!q.empty())
  {
    // Copy the state out before popping: the reference returned by front()
    // does not survive pop().
    auto const p = q.front();
    q.pop();

    auto const & trieIt = p.first;
    auto const & dfaIt = p.second;

    if (dfaIt.Accepts())
    {
      trieIt.ForEachInNode(toDo);
      found = true;
    }

    trieIt.ForEachMove([&](Char const & c, TrieIt const & nextTrieIt) {
      // Advance a copy of the DFA state so that sibling moves start
      // from the same state.
      auto nextDfaIt = dfaIt;
      nextDfaIt.Move(c);
      if (!nextDfaIt.Rejects())
        q.emplace(nextTrieIt, nextDfaIt);
    });
  }

  return found;
}
using TLocales = buffer_vector<int8_t, 3>;
size_t GetMaxErrorsForToken(strings::UniString const & token);
template <typename ToDo>
void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales,
CategoriesHolder const & categories, ToDo && todo)
@ -18,16 +76,50 @@ void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales
for (size_t i = 0; i < slice.Size(); ++i)
{
auto const & token = slice.Get(i);
for (size_t j = 0; j < locales.size(); ++j)
categories.ForEachTypeByName(locales[j], token, bind<void>(todo, i, _1));
for (int8_t const locale : locales)
categories.ForEachTypeByName(locale, token, std::bind<void>(todo, i, std::placeholders::_1));
// Special case processing of 2 codepoints emoji (e.g. black guy on a bike).
// Only emoji synonyms can have one codepoint.
if (token.size() > 1)
{
categories.ForEachTypeByName(CategoriesHolder::kEnglishCode, strings::UniString(1, token[0]),
bind<void>(todo, i, _1));
std::bind<void>(todo, i, std::placeholders::_1));
}
}
}
// Unlike ForEachCategoryType which extracts types by a token
// from |slice| by exactly matching it to a token in the name
// of a category, in the worst case this function has to loop through the tokens
// in all category synonyms in all |locales| in order to find a token
// whose edit distance is close enough to the required token from |slice|.
//
// For every token index i of |slice| and every type found for that token,
// |todo| is called as todo(i, type). The same (i, type) pair may be reported
// more than once, e.g. when it is reachable through several locales.
template <typename ToDo>
void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & locales,
                              CategoriesHolder const & categories, ToDo && todo)
{
  using Trie = my::MemTrie<strings::UniString, uint32_t>;

  auto const & trie = categories.GetNameToTypesTrie();
  auto const trieRootIt = trie.GetRootIterator();

  // Sorted and deduplicated so that membership can be tested with a binary search.
  std::vector<int8_t> sortedLocales(locales.begin(), locales.end());
  my::SortUnique(sortedLocales);

  for (size_t i = 0; i < slice.Size(); ++i)
  {
    auto const & token = slice.Get(i);
    // todo(@m, @y). We build dfa twice for each token: here and in geocoder.cpp.
    // A possible optimization is to build each dfa once and save it. Note that
    // dfas for the prefix tokens differ, i.e. we ignore slice.IsPrefix(i) here.
    strings::LevenshteinDFA const dfa(token, 1 /* prefixCharsToKeep */,
                                      GetMaxErrorsForToken(token));

    // The first symbol of every key in the trie is the locale of the token,
    // so the moves from the root select a locale. Only the subtrees of the
    // requested locales are matched against the dfa.
    trieRootIt.ForEachMove([&](Trie::Char const & c, Trie::Iterator const & moveIt) {
      if (std::binary_search(sortedLocales.begin(), sortedLocales.end(), static_cast<int8_t>(c)))
      {
        MatchInTrie(trie /* passed to infer the iterator's type */, moveIt, dfa,
                    std::bind<void>(todo, i, std::placeholders::_1));
      }
    });
  }
}
} // namespace search