[search] Avoid fancy categories matching. #4276
7 changed files with 72 additions and 27 deletions
|
@ -4,11 +4,9 @@
|
|||
#include "base/macros.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace base
|
||||
|
@ -221,7 +219,7 @@ public:
|
|||
// Passes |toDo| to m_node.m_values.ForEach.
// NOTE(review): this text is a rendered diff hunk without +/- markers. The two
// ForEach lines below look like the removed (std::forward) and the added
// (plain lvalue) versions of a single statement, not two consecutive calls -
// confirm against the real file.
template <typename ToDo>
|
||||
void ForEachInNode(ToDo && toDo) const
|
||||
{
|
||||
m_node.m_values.ForEach(std::forward<ToDo>(toDo));
|
||||
m_node.m_values.ForEach(toDo);
|
||||
}
|
||||
|
||||
String GetLabel() const { return m_node.m_edge.template As<String>(); }
|
||||
|
@ -291,7 +289,7 @@ public:
|
|||
void ForEachInTrie(ToDo && toDo) const
|
||||
{
|
||||
String prefix;
|
||||
ForEachInSubtree(m_root, prefix, std::forward<ToDo>(toDo));
|
||||
ForEachInSubtree(m_root, prefix, toDo);
|
||||
}
|
||||
|
||||
// Calls |toDo| for each key-value pair in the node that is reachable
|
||||
|
@ -302,7 +300,7 @@ public:
|
|||
{
|
||||
MoveTo(prefix, true /* fullMatch */,
|
||||
[&](Node const & node, Edge const & /* edge */, size_t /* offset */) {
|
||||
node.m_values.ForEach(std::forward<ToDo>(toDo));
|
||||
node.m_values.ForEach(toDo);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -323,7 +321,7 @@ public:
|
|||
String p = prefix;
|
||||
for (; offset < edge.Size(); ++offset)
|
||||
p.push_back(edge[offset]);
|
||||
ForEachInSubtree(node, p, std::forward<ToDo>(toDo));
|
||||
ForEachInSubtree(node, p, toDo);
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -1,18 +1,13 @@
|
|||
#pragma once
|
||||
|
||||
#include "base/mem_trie.hpp"
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
class Reader;
|
||||
|
@ -173,7 +168,7 @@ public:
|
|||
void ForEachTypeByName(int8_t locale, strings::UniString const & name, ToDo && toDo) const
|
||||
{
|
||||
auto const localePrefix = strings::UniString(1, static_cast<strings::UniChar>(locale));
|
||||
m_name2type.ForEachInNode(localePrefix + name, std::forward<ToDo>(toDo));
|
||||
m_name2type.ForEachInNode(localePrefix + name, toDo);
|
||||
}
|
||||
|
||||
GroupTranslations const & GetGroupTranslations() const { return m_groupTranslations; }
|
||||
|
|
|
@ -55,12 +55,26 @@ size_t GetMaxErrorsForToken(strings::UniString const & token)
|
|||
|
||||
// Builds a LevenshteinDFA for the token |s| with a prefix size of 1, the
// kAllowedMisprints table and the error budget returned by GetMaxErrorsForToken(s).
// |s| must not be empty (checked with ASSERT).
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s)
|
||||
{
|
||||
ASSERT(!s.empty(), ());
|
||||
// In search we use LevenshteinDFAs for fuzzy matching. For
|
||||
// performance reasons, misprints in the prefix are limited to a fixed set of substitutions
|
||||
// defined in kAllowedMisprints, plus skipped letters.
|
||||
return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, GetMaxErrorsForToken(s));
|
||||
}
|
||||
|
||||
// Builds a LevenshteinDFA for matching the token |s| against category names.
// Same construction as a LevenshteinDFA with prefix size 1 and kAllowedMisprints,
// but the error budget comes from GetMaxErrorsForTokenLength(s.size() - 1), i.e. it is
// computed as for a token one symbol shorter than |s|.
// |s| must not be empty (checked with ASSERT).
strings::LevenshteinDFA BuildLevenshteinDFA_Category(strings::UniString const & s)
|
||||
{
|
||||
// https://github.com/organicmaps/organicmaps/issues/3655
|
||||
// Separate DFA for categories (a token of length <= 4 means no errors are allowed) to avoid unwanted matches like:
|
||||
// cafe <-> care
|
||||
// ecco -> eco
|
||||
// shop <-> shoe
|
||||
/// @todo "hote" doesn't match "hotel" now. Should allow _adding_ symbols when size == 4.
|
||||
|
||||
// NOTE(review): the check below is what keeps s.size() - 1 from wrapping around for an
// empty token; presumably ASSERT is a debug-only check - confirm release behaviour.
ASSERT(!s.empty(), ());
|
||||
return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, GetMaxErrorsForTokenLength(s.size() - 1));
|
||||
}
|
||||
|
||||
UniString NormalizeAndSimplifyString(string_view s)
|
||||
{
|
||||
UniString uniString = MakeUniString(s);
|
||||
|
|
|
@ -22,6 +22,7 @@ inline constexpr size_t GetMaxErrorsForTokenLength(size_t length)
|
|||
size_t GetMaxErrorsForToken(strings::UniString const & token);
|
||||
|
||||
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s);
|
||||
strings::LevenshteinDFA BuildLevenshteinDFA_Category(strings::UniString const & s);
|
||||
|
||||
// This function should be used for all search strings normalization.
|
||||
// It does some magic text transformation which greatly helps us to improve our search.
|
||||
|
|
|
@ -494,14 +494,13 @@ Locales Processor::GetCategoryLocales() const
|
|||
// Thin wrapper: calls the free function ::search::ForEachCategoryType with |slice|,
// the result of GetCategoryLocales(), m_categories and |toDo|.
// NOTE(review): this text is a rendered diff hunk without +/- markers. The two calls
// below look like the removed (forward<ToDo>) and the added (plain lvalue) versions
// of one statement, not two consecutive calls - confirm against the real file.
template <typename ToDo>
|
||||
void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && toDo) const
|
||||
{
|
||||
::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward<ToDo>(toDo));
|
||||
::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, toDo);
|
||||
}
|
||||
|
||||
// Thin wrapper: calls the free function ::search::ForEachCategoryTypeFuzzy with |slice|,
// the result of GetCategoryLocales(), m_categories and |toDo|.
// NOTE(review): this text is a rendered diff hunk without +/- markers. The first call
// (split over two lines, using forward<ToDo>) looks like the removed version and the
// single-line call after it like the added version of the same statement - confirm
// against the real file.
template <typename ToDo>
|
||||
void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && toDo) const
|
||||
{
|
||||
::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories,
|
||||
forward<ToDo>(toDo));
|
||||
::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories, toDo);
|
||||
}
|
||||
|
||||
void Processor::Search(SearchParams params)
|
||||
|
|
|
@ -917,7 +917,8 @@ UNIT_CLASS_TEST(ProcessorTest, TestCategorialSearch)
|
|||
}
|
||||
|
||||
{
|
||||
Rules const rules = {ExactMatch(wonderlandId, hotel1), ExactMatch(wonderlandId, hotel2),
|
||||
/// @todo We updated fuzzy match for categories: hote -> hotel is not matched now (4 letters input token).
|
||||
Rules const rules = {/*ExactMatch(wonderlandId, hotel1),*/ ExactMatch(wonderlandId, hotel2),
|
||||
ExactMatch(wonderlandId, hotelCafe), ExactMatch(testWorldId, homel),
|
||||
ExactMatch(wonderlandId, hotelDeVille)};
|
||||
// A prefix token.
|
||||
|
@ -3207,4 +3208,45 @@ UNIT_CLASS_TEST(ProcessorTest, Place_Region)
|
|||
TEST(ResultsMatch("carth", rules, "en"), ());
|
||||
}
|
||||
|
||||
// Checks category matching for short (4-letter) queries: each query below must return
// exactly the listed POI and nothing else.
UNIT_CLASS_TEST(ProcessorTest, FuzzyCategories)
|
||||
{
|
||||
// Fixtures: four POIs with different types. The names "xxx", "yyy" and "zzz" are
// presumably placeholders so that these POIs can only be found via their types.
TestPOI cafe({0, 0.01}, "xxx", "en");
|
||||
cafe.SetTypes({{"amenity", "cafe"}});
|
||||
|
||||
TestPOI cosmetics({0, 0.02}, "yyy", "en");
|
||||
cosmetics.SetTypes({{"shop", "cosmetics"}});
|
||||
|
||||
// The only POI with a meaningful name ("ecco"); it is a shoe shop.
TestPOI shoes({0, 0.03}, "ecco", "en");
|
||||
shoes.SetTypes({{"shop", "shoes"}});
|
||||
|
||||
TestPOI organic({0, 0.04}, "zzz", "en");
|
||||
organic.SetTypes({{"shop", "grocery"}, {"organic", "yes"}});
|
||||
|
||||
// Put all four POIs into a single test country.
auto wonderlandId = BuildCountry("Wonderland", [&](TestMwmBuilder & builder)
|
||||
{
|
||||
builder.Add(cafe);
|
||||
builder.Add(cosmetics);
|
||||
builder.Add(shoes);
|
||||
builder.Add(organic);
|
||||
});
|
||||
|
||||
// The viewport covers all POI coordinates used above.
SetViewport(m2::RectD(-0.5, -0.5, 0.5, 0.5));
|
||||
|
||||
// "cafe" must return only the cafe.
// NOTE(review): presumably guards against a fuzzy "cafe" <-> "care" match pulling in
// the cosmetics POI - confirm against the category translations.
{
|
||||
Rules const rules = {ExactMatch(wonderlandId, cafe)};
|
||||
TEST(ResultsMatch("cafe", rules), ());
|
||||
}
|
||||
|
||||
// Both "shoe" and "shoes" must return only the shoe shop.
{
|
||||
Rules const rules = {ExactMatch(wonderlandId, shoes)};
|
||||
TEST(ResultsMatch("shoe", rules), ());
|
||||
TEST(ResultsMatch("shoes", rules), ());
|
||||
}
|
||||
|
||||
// "ecco" must return only the POI named "ecco".
// NOTE(review): presumably guards against a fuzzy "ecco" -> "eco" category match
// pulling in the organic POI - confirm against the category translations.
{
|
||||
Rules const rules = {ExactMatch(wonderlandId, shoes)};
|
||||
TEST(ResultsMatch("ecco", rules), ());
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace processor_test
|
||||
|
|
|
@ -6,19 +6,14 @@
|
|||
|
||||
#include "indexer/categories_holder.hpp"
|
||||
#include "indexer/feature_decl.hpp"
|
||||
#include "indexer/mwm_set.hpp"
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
#include "indexer/trie.hpp"
|
||||
|
||||
#include "geometry/rect2d.hpp"
|
||||
|
||||
#include "base/levenshtein_dfa.hpp"
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
@ -36,14 +31,14 @@ void ForEachCategoryType(StringSliceBase const & slice, Locales const & locales,
|
|||
{
|
||||
auto const & token = slice.Get(i);
|
||||
for (int8_t const locale : locales)
|
||||
categories.ForEachTypeByName(locale, token, std::bind<void>(todo, i, std::placeholders::_1));
|
||||
categories.ForEachTypeByName(locale, token, [&todo, i](uint32_t type) { todo(i, type); });
|
||||
|
||||
// Special case processing of 2 codepoints emoji (e.g. black guy on a bike).
|
||||
// Only emoji synonyms can have one codepoint.
|
||||
if (token.size() > 1)
|
||||
{
|
||||
categories.ForEachTypeByName(CategoriesHolder::kEnglishCode, strings::UniString(1, token[0]),
|
||||
std::bind<void>(todo, i, std::placeholders::_1));
|
||||
[&todo, i](uint32_t type) { todo(i, type); });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -68,11 +63,12 @@ void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, Locales const & loc
|
|||
// A possible optimization is to build each dfa once and save it. Note that
|
||||
// dfas for the prefix tokens differ, i.e. we ignore slice.IsPrefix(i) here.
|
||||
SearchTrieRequest<strings::LevenshteinDFA> request;
|
||||
request.m_names.push_back(BuildLevenshteinDFA(slice.Get(i)));
|
||||
request.m_names.push_back(BuildLevenshteinDFA_Category(slice.Get(i)));
|
||||
request.SetLangs(locales);
|
||||
|
||||
MatchFeaturesInTrie(request, iterator, [&](uint32_t /* type */) { return true; } /* filter */,
|
||||
std::bind<void>(todo, i, std::placeholders::_1));
|
||||
MatchFeaturesInTrie(request, iterator,
|
||||
[](uint32_t) { return true; } /* filter */,
|
||||
[&todo, i](uint32_t type, bool) { todo(i, type); } /* todo */);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Reference in a new issue