[search] Avoid fancy categories matching. #4276

Merged
vng merged 2 commits from vng-fix into master 2023-01-19 18:12:32 +00:00
7 changed files with 72 additions and 27 deletions

View file

@ -4,11 +4,9 @@
#include "base/macros.hpp"
#include <algorithm>
#include <cstddef>
#include <functional>
#include <map>
#include <memory>
#include <utility>
#include <string>
#include <vector>
namespace base
@ -221,7 +219,7 @@ public:
template <typename ToDo>
void ForEachInNode(ToDo && toDo) const
{
m_node.m_values.ForEach(std::forward<ToDo>(toDo));
m_node.m_values.ForEach(toDo);
}
String GetLabel() const { return m_node.m_edge.template As<String>(); }
@ -291,7 +289,7 @@ public:
void ForEachInTrie(ToDo && toDo) const
{
String prefix;
ForEachInSubtree(m_root, prefix, std::forward<ToDo>(toDo));
ForEachInSubtree(m_root, prefix, toDo);
}
// Calls |toDo| for each key-value pair in the node that is reachable
@ -302,7 +300,7 @@ public:
{
MoveTo(prefix, true /* fullMatch */,
[&](Node const & node, Edge const & /* edge */, size_t /* offset */) {
node.m_values.ForEach(std::forward<ToDo>(toDo));
node.m_values.ForEach(toDo);
});
}
@ -323,7 +321,7 @@ public:
String p = prefix;
for (; offset < edge.Size(); ++offset)
p.push_back(edge[offset]);
ForEachInSubtree(node, p, std::forward<ToDo>(toDo));
ForEachInSubtree(node, p, toDo);
});
}

View file

@ -1,18 +1,13 @@
#pragma once
#include "base/mem_trie.hpp"
#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
class Reader;
@ -173,7 +168,7 @@ public:
void ForEachTypeByName(int8_t locale, strings::UniString const & name, ToDo && toDo) const
{
auto const localePrefix = strings::UniString(1, static_cast<strings::UniChar>(locale));
m_name2type.ForEachInNode(localePrefix + name, std::forward<ToDo>(toDo));
m_name2type.ForEachInNode(localePrefix + name, toDo);
}
GroupTranslations const & GetGroupTranslations() const { return m_groupTranslations; }

View file

@ -55,12 +55,26 @@ size_t GetMaxErrorsForToken(strings::UniString const & token)
// Builds the Levenshtein automaton for a single non-empty search token.
// The error budget is taken from GetMaxErrorsForToken(s).
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s)
{
  ASSERT(!s.empty(), ());

  // Fuzzy matching in search is done with LevenshteinDFAs. For performance reasons,
  // misprints in the prefix are limited to skipped letters and to the fixed set of
  // substitutions defined in kAllowedMisprints.
  auto const maxErrors = GetMaxErrorsForToken(s);
  return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, maxErrors);
}
// Builds the Levenshtein automaton used for fuzzy matching of category tokens.
// Unlike BuildLevenshteinDFA, the error budget is computed for the token length minus one.
strings::LevenshteinDFA BuildLevenshteinDFA_Category(strings::UniString const & s)
{
  // https://github.com/organicmaps/organicmaps/issues/3655
  // Separate DFA for categories (token's length <= 4 means no errors allowed) to avoid fancy matchings like:
  // cafe <-> care
  // ecco -> eco
  // shop <-> shoe
  /// @todo "hote" doesn't match "hotel" now. Should allow _adding_ symbols when size == 4.
  ASSERT(!s.empty(), ());

  // |s.size() - 1| is unsigned arithmetic and would wrap around to a huge value for an empty token.
  // NOTE(review): ASSERT is presumably compiled out in release builds - confirm. The clamp keeps
  // the error budget computed for length 0 in that case instead of for a near-maximal length.
  auto const reducedLength = s.empty() ? 0 : s.size() - 1;
  return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints,
                                 GetMaxErrorsForTokenLength(reducedLength));
}
UniString NormalizeAndSimplifyString(string_view s)
{
UniString uniString = MakeUniString(s);

View file

@ -22,6 +22,7 @@ inline constexpr size_t GetMaxErrorsForTokenLength(size_t length)
size_t GetMaxErrorsForToken(strings::UniString const & token);
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s);
strings::LevenshteinDFA BuildLevenshteinDFA_Category(strings::UniString const & s);
// This function should be used for all search strings normalization.
// It does some magic text transformation which greatly helps us to improve our search.

View file

@ -494,14 +494,13 @@ Locales Processor::GetCategoryLocales() const
template <typename ToDo>
void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && toDo) const
{
::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward<ToDo>(toDo));
::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, toDo);
}
template <typename ToDo>
void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && toDo) const
{
::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories,
forward<ToDo>(toDo));
::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories, toDo);
}
void Processor::Search(SearchParams params)

View file

@ -917,7 +917,8 @@ UNIT_CLASS_TEST(ProcessorTest, TestCategorialSearch)
}
{
Rules const rules = {ExactMatch(wonderlandId, hotel1), ExactMatch(wonderlandId, hotel2),
/// @todo Fuzzy matching for categories was tightened: "hote" no longer matches "hotel" (4-letter input token).
Rules const rules = {/*ExactMatch(wonderlandId, hotel1),*/ ExactMatch(wonderlandId, hotel2),
ExactMatch(wonderlandId, hotelCafe), ExactMatch(testWorldId, homel),
ExactMatch(wonderlandId, hotelDeVille)};
// A prefix token.
@ -3207,4 +3208,45 @@ UNIT_CLASS_TEST(ProcessorTest, Place_Region)
TEST(ResultsMatch("carth", rules, "en"), ());
}
// Checks category matching for short query tokens: every query below lists exactly one expected POI.
// NOTE(review): assumes ResultsMatch compares the complete result set against |rules| - confirm.
UNIT_CLASS_TEST(ProcessorTest, FuzzyCategories)
{
// Four POIs of different types; all of them lie inside the viewport set below.
TestPOI cafe({0, 0.01}, "xxx", "en");
cafe.SetTypes({{"amenity", "cafe"}});
TestPOI cosmetics({0, 0.02}, "yyy", "en");
cosmetics.SetTypes({{"shop", "cosmetics"}});
// The only POI here whose name ("ecco") is used as a query below.
TestPOI shoes({0, 0.03}, "ecco", "en");
shoes.SetTypes({{"shop", "shoes"}});
TestPOI organic({0, 0.04}, "zzz", "en");
organic.SetTypes({{"shop", "grocery"}, {"organic", "yes"}});
auto wonderlandId = BuildCountry("Wonderland", [&](TestMwmBuilder & builder)
{
builder.Add(cafe);
builder.Add(cosmetics);
builder.Add(shoes);
builder.Add(organic);
});
SetViewport(m2::RectD(-0.5, -0.5, 0.5, 0.5));
// "cafe" is expected to return the cafe POI only.
{
Rules const rules = {ExactMatch(wonderlandId, cafe)};
TEST(ResultsMatch("cafe", rules), ());
}
// Both "shoe" and "shoes" are expected to return the shoes POI only.
{
Rules const rules = {ExactMatch(wonderlandId, shoes)};
TEST(ResultsMatch("shoe", rules), ());
TEST(ResultsMatch("shoes", rules), ());
}
// "ecco" is the name of the shoes POI and is expected to return it alone
// (presumably guarding against a fuzzy "ecco" -> "eco" category match pulling in |organic|).
{
Rules const rules = {ExactMatch(wonderlandId, shoes)};
TEST(ResultsMatch("ecco", rules), ());
}
}
} // namespace processor_test

View file

@ -6,19 +6,14 @@
#include "indexer/categories_holder.hpp"
#include "indexer/feature_decl.hpp"
#include "indexer/mwm_set.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "indexer/trie.hpp"
#include "geometry/rect2d.hpp"
#include "base/levenshtein_dfa.hpp"
#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <vector>
@ -36,14 +31,14 @@ void ForEachCategoryType(StringSliceBase const & slice, Locales const & locales,
{
auto const & token = slice.Get(i);
for (int8_t const locale : locales)
categories.ForEachTypeByName(locale, token, std::bind<void>(todo, i, std::placeholders::_1));
categories.ForEachTypeByName(locale, token, [&todo, i](uint32_t type) { todo(i, type); });
// Special case processing of 2 codepoints emoji (e.g. black guy on a bike).
// Only emoji synonyms can have one codepoint.
if (token.size() > 1)
{
categories.ForEachTypeByName(CategoriesHolder::kEnglishCode, strings::UniString(1, token[0]),
std::bind<void>(todo, i, std::placeholders::_1));
[&todo, i](uint32_t type) { todo(i, type); });
}
}
}
@ -68,11 +63,12 @@ void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, Locales const & loc
// A possible optimization is to build each dfa once and save it. Note that
// dfas for the prefix tokens differ, i.e. we ignore slice.IsPrefix(i) here.
SearchTrieRequest<strings::LevenshteinDFA> request;
request.m_names.push_back(BuildLevenshteinDFA(slice.Get(i)));
request.m_names.push_back(BuildLevenshteinDFA_Category(slice.Get(i)));
request.SetLangs(locales);
MatchFeaturesInTrie(request, iterator, [&](uint32_t /* type */) { return true; } /* filter */,
std::bind<void>(todo, i, std::placeholders::_1));
MatchFeaturesInTrie(request, iterator,
[](uint32_t) { return true; } /* filter */,
[&todo, i](uint32_t type, bool) { todo(i, type); } /* todo */);
}
}