[search] Avoid fancy category matching (like cafe <-> care).

Signed-off-by: Viktor Govako <viktor.govako@gmail.com>
Viktor Govako 2023-01-14 13:37:12 -03:00
parent 939811a031
commit 37fd63e05f
4 changed files with 59 additions and 2 deletions


@@ -55,12 +55,26 @@ size_t GetMaxErrorsForToken(strings::UniString const & token)
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s)
{
  ASSERT(!s.empty(), ());
  // In search we use LevenshteinDFAs for fuzzy matching, but for performance reasons we limit
  // prefix misprints to the fixed set of substitutions defined in kAllowedMisprints and to skipped letters.
  return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, GetMaxErrorsForToken(s));
}

strings::LevenshteinDFA BuildLevenshteinDFA_Category(strings::UniString const & s)
{
  // https://github.com/organicmaps/organicmaps/issues/3655
  // A separate DFA for categories (tokens of length <= 4 allow no errors) to avoid fancy matches like:
  // cafe <-> care
  // ecco -> eco
  // shop <-> shoe
  /// @todo "hote" doesn't match "hotel" now. Should allow _adding_ symbols when size == 4.
  ASSERT(!s.empty(), ());
  return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, GetMaxErrorsForTokenLength(s.size() - 1));
}

UniString NormalizeAndSimplifyString(string_view s)
{
  UniString uniString = MakeUniString(s);

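For context, here is how the per-length error budget plays out for the two builders above. The thresholds inside GetMaxErrorsForTokenLength are not shown in this commit, so the values in this sketch are an assumption (as is the idea that GetMaxErrorsForToken uses the full token length); the point is only that passing s.size() - 1 for category tokens shifts the budget down by one step, which is what turns 4-letter category tokens like "cafe" into exact-match-only tokens.

// Hypothetical thresholds, for illustration only; the real ones live in
// GetMaxErrorsForTokenLength (declared in the header below) and may differ.
inline constexpr size_t SketchMaxErrorsForTokenLength(size_t length)
{
  if (length < 4)
    return 0;  // very short tokens: exact match only
  if (length < 8)
    return 1;  // medium tokens: one misprint allowed
  return 2;    // long tokens: up to two misprints
}

// With these assumed thresholds:
//   regular token "cafe":   budget for length 4     -> 1 error, so "care" used to match;
//   category token "cafe":  budget for length 4 - 1 -> 0 errors, so "care" is rejected;
//   category token "hotel": budget for length 5 - 1 -> 1 error, one misprint is still tolerated.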

@@ -22,6 +22,7 @@ inline constexpr size_t GetMaxErrorsForTokenLength(size_t length)
size_t GetMaxErrorsForToken(strings::UniString const & token);
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s);
strings::LevenshteinDFA BuildLevenshteinDFA_Category(strings::UniString const & s);
// This function should be used to normalize all search strings.
// It applies several text transformations that greatly improve search quality.

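The comment above refers to NormalizeAndSimplifyString, whose definition appears in the previous file. A rough usage sketch, under the assumption that the function lives in the search namespace and that lower-casing is among its transformations (the full list is not shown in this diff): both the query and the indexed names are expected to pass through the same normalization before any DFA matching happens.

// Illustration only; the exact transformations performed by
// NormalizeAndSimplifyString are an assumption here.
strings::UniString const a = search::NormalizeAndSimplifyString("Cafe");
strings::UniString const b = search::NormalizeAndSimplifyString("cafe");
bool const same = (a == b);  // expected to be true after normalization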

@@ -917,7 +917,8 @@ UNIT_CLASS_TEST(ProcessorTest, TestCategorialSearch)
  }
  {
    Rules const rules = {ExactMatch(wonderlandId, hotel1), ExactMatch(wonderlandId, hotel2),
    /// @todo Fuzzy matching for categories was updated: "hote" -> "hotel" is no longer matched (4-letter input token).
    Rules const rules = {/*ExactMatch(wonderlandId, hotel1),*/ ExactMatch(wonderlandId, hotel2),
                         ExactMatch(wonderlandId, hotelCafe), ExactMatch(testWorldId, homel),
                         ExactMatch(wonderlandId, hotelDeVille)};
    // A prefix token.
@@ -3207,4 +3208,45 @@ UNIT_CLASS_TEST(ProcessorTest, Place_Region)
TEST(ResultsMatch("carth", rules, "en"), ());
}
UNIT_CLASS_TEST(ProcessorTest, FuzzyCategories)
{
TestPOI cafe({0, 0.01}, "xxx", "en");
cafe.SetTypes({{"amenity", "cafe"}});
TestPOI cosmetics({0, 0.02}, "yyy", "en");
cosmetics.SetTypes({{"shop", "cosmetics"}});
TestPOI shoes({0, 0.03}, "ecco", "en");
shoes.SetTypes({{"shop", "shoes"}});
TestPOI organic({0, 0.04}, "zzz", "en");
organic.SetTypes({{"shop", "grocery"}, {"organic", "yes"}});
auto wonderlandId = BuildCountry("Wonderland", [&](TestMwmBuilder & builder)
{
builder.Add(cafe);
builder.Add(cosmetics);
builder.Add(shoes);
builder.Add(organic);
});
SetViewport(m2::RectD(-0.5, -0.5, 0.5, 0.5));
{
Rules const rules = {ExactMatch(wonderlandId, cafe)};
TEST(ResultsMatch("cafe", rules), ());
}
{
Rules const rules = {ExactMatch(wonderlandId, shoes)};
TEST(ResultsMatch("shoe", rules), ());
TEST(ResultsMatch("shoes", rules), ());
}
{
Rules const rules = {ExactMatch(wonderlandId, shoes)};
TEST(ResultsMatch("ecco", rules), ());
}
}
} // namespace processor_test

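To make the motivation behind FuzzyCategories concrete: every pair called out in the commit (cafe/care, ecco/eco, shop/shoe) is within a single edit of the other, so the old one-misprint budget for 4-letter tokens was enough to conflate them. A small self-contained check, independent of the search code:

#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

// Plain textbook Levenshtein distance, used only to show that the problematic
// category pairs differ by exactly one edit.
size_t EditDistance(std::string const & a, std::string const & b)
{
  std::vector<std::vector<size_t>> d(a.size() + 1, std::vector<size_t>(b.size() + 1));
  for (size_t i = 0; i <= a.size(); ++i)
    d[i][0] = i;
  for (size_t j = 0; j <= b.size(); ++j)
    d[0][j] = j;
  for (size_t i = 1; i <= a.size(); ++i)
  {
    for (size_t j = 1; j <= b.size(); ++j)
    {
      size_t const subst = d[i - 1][j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1);
      d[i][j] = std::min({d[i - 1][j] + 1, d[i][j - 1] + 1, subst});
    }
  }
  return d[a.size()][b.size()];
}

// EditDistance("cafe", "care") == 1, EditDistance("ecco", "eco") == 1,
// EditDistance("shop", "shoe") == 1: with a one-error budget all of these
// collide, which is exactly what the zero-error rule for short category
// tokens prevents.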

@@ -63,7 +63,7 @@ void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, Locales const & loc
    // A possible optimization is to build each DFA once and save it. Note that
    // DFAs for the prefix tokens differ, i.e. we ignore slice.IsPrefix(i) here.
    SearchTrieRequest<strings::LevenshteinDFA> request;
    request.m_names.push_back(BuildLevenshteinDFA(slice.Get(i)));
    request.m_names.push_back(BuildLevenshteinDFA_Category(slice.Get(i)));
    request.SetLangs(locales);
    MatchFeaturesInTrie(request, iterator,
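The comment at the top of this hunk mentions building each DFA once and saving it. A minimal sketch of that idea, under several assumptions: that the same tokens repeat often enough to make caching worthwhile, that strings::ToUtf8 gives a reasonable map key, and that strings::LevenshteinDFA can be stored by value. None of this caching exists in the current code, and the names below are hypothetical.

#include <map>
#include <string>
// (plus the search string utils headers for the strings:: and Build* symbols)

// Hypothetical per-query cache for category DFAs; not part of this commit.
class CategoryDFACache
{
public:
  strings::LevenshteinDFA const & Get(strings::UniString const & token)
  {
    std::string const key = strings::ToUtf8(token);
    auto it = m_cache.find(key);
    if (it == m_cache.end())
      it = m_cache.emplace(key, BuildLevenshteinDFA_Category(token)).first;
    return it->second;
  }

private:
  std::map<std::string, strings::LevenshteinDFA> m_cache;
};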