From e958e56792abf6f485be369ede92f3306a444937 Mon Sep 17 00:00:00 2001 From: tatiana-yan Date: Wed, 26 Jun 2019 17:00:35 +0300 Subject: [PATCH] [search] Add IsStreetSynonymWithMisprints method. --- .../search_string_utils_test.cpp | 39 +++++++++ indexer/search_string_utils.cpp | 85 ++++++++++++++++++- indexer/search_string_utils.hpp | 7 ++ search/bookmarks/processor.hpp | 2 + search/utils.cpp | 34 -------- search/utils.hpp | 4 - 6 files changed, 132 insertions(+), 39 deletions(-) diff --git a/indexer/indexer_tests/search_string_utils_test.cpp b/indexer/indexer_tests/search_string_utils_test.cpp index dea12767c9..30463dbd27 100644 --- a/indexer/indexer_tests/search_string_utils_test.cpp +++ b/indexer/indexer_tests/search_string_utils_test.cpp @@ -47,6 +47,16 @@ bool TestStreetPrefixMatch(char const * s) return IsStreetSynonymPrefix(MakeUniString(s)); } +bool TestStreetSynonymWithMisprints(char const * s) +{ + return IsStreetSynonymWithMisprints(MakeUniString(s)); +} + +bool TestStreetPrefixMatchWithMisprints(char const * s) +{ + return IsStreetSynonymPrefixWithMisprints(MakeUniString(s)); +} + string NormalizeAndSimplifyStringUtf8(string const & s) { return strings::ToUtf8(NormalizeAndSimplifyString(s)); @@ -106,8 +116,23 @@ UNIT_TEST(StreetSynonym) { TEST(TestStreetSynonym("street"), ()); TEST(TestStreetSynonym("улица"), ()); + TEST(TestStreetSynonym("strasse"), ()); + TEST(TestStreetSynonymWithMisprints("strasse"), ()); TEST(!TestStreetSynonym("strase"), ()); + TEST(TestStreetSynonymWithMisprints("strase"), ()); + + TEST(TestStreetSynonym("boulevard"), ()); + TEST(TestStreetSynonymWithMisprints("boulevard"), ()); + TEST(!TestStreetSynonym("boulevrd"), ()); + TEST(TestStreetSynonymWithMisprints("boulevrd"), ()); + + TEST(TestStreetSynonym("avenue"), ()); + TEST(TestStreetSynonymWithMisprints("avenue"), ()); + TEST(!TestStreetSynonym("aveneu"), ()); + TEST(TestStreetSynonymWithMisprints("aveneu"), ()); + + TEST(!TestStreetSynonymWithMisprints("abcdefg"), ()); } UNIT_TEST(StreetPrefixMatch) @@ -119,6 +144,20 @@ UNIT_TEST(StreetPrefixMatch) TEST(TestStreetPrefixMatch("проез"), ()); TEST(TestStreetPrefixMatch("проезд"), ()); TEST(!TestStreetPrefixMatch("проездд"), ()); + + TEST(TestStreetPrefixMatchWithMisprints("пр"), ()); + TEST(!TestStreetPrefixMatch("пре"), ()); + TEST(!TestStreetPrefixMatchWithMisprints("пре"), ()); + TEST(!TestStreetPrefixMatch("преу"), ()); + TEST(TestStreetPrefixMatchWithMisprints("преу"), ()); + TEST(!TestStreetPrefixMatch("преул"), ()); + TEST(TestStreetPrefixMatchWithMisprints("преул"), ()); + TEST(!TestStreetPrefixMatch("преуло"), ()); + TEST(TestStreetPrefixMatchWithMisprints("преуло"), ()); + TEST(!TestStreetPrefixMatch("преулок"), ()); + TEST(TestStreetPrefixMatchWithMisprints("преулок"), ()); + TEST(!TestStreetPrefixMatch("преулак"), ()); + TEST(!TestStreetPrefixMatchWithMisprints("преулак"), ()); } UNIT_TEST(StreetTokensFilter) diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp index fb1298dc7a..314aed12ea 100644 --- a/indexer/search_string_utils.cpp +++ b/indexer/search_string_utils.cpp @@ -2,12 +2,16 @@ #include "indexer/string_set.hpp" #include "base/assert.hpp" +#include "base/dfa_helpers.hpp" #include "base/macros.hpp" #include "base/mem_trie.hpp" #include "3party/utfcpp/source/utf8/unchecked.h" #include +#include +#include +#include using namespace std; using namespace strings; @@ -16,6 +20,17 @@ namespace search { namespace { +vector const kAllowedMisprints = { + strings::MakeUniString("ckq"), + strings::MakeUniString("eyjiu"), + strings::MakeUniString("gh"), + strings::MakeUniString("pf"), + strings::MakeUniString("vw"), + strings::MakeUniString("ао"), + strings::MakeUniString("еиэ"), + strings::MakeUniString("шщ"), +}; + // Replaces '#' followed by an end-of-string or a digit with space. void RemoveNumeroSigns(UniString & s) { @@ -42,6 +57,26 @@ void RemoveNumeroSigns(UniString & s) } } // namespace +size_t GetMaxErrorsForToken(strings::UniString const & token) +{ + bool const digitsOnly = all_of(token.begin(), token.end(), ::isdigit); + if (digitsOnly) + return 0; + if (token.size() < 4) + return 0; + if (token.size() < 8) + return 1; + return 2; +} + +strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s) +{ + // In search we use LevenshteinDFAs for fuzzy matching. But due to + // performance reasons, we limit prefix misprints to fixed set of substitutions defined in + // kAllowedMisprints and skipped letters. + return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, GetMaxErrorsForToken(s)); +} + UniString NormalizeAndSimplifyString(string const & s) { UniString uniString = MakeUniString(s); @@ -205,6 +240,8 @@ public: bool m_empty; }; + using Trie = base::MemTrie; + StreetsSynonymsHolder() { char const * affics[] = @@ -283,8 +320,42 @@ public: bool MatchPrefix(UniString const & s) const { return m_strings.HasPrefix(s); } bool FullMatch(UniString const & s) const { return m_strings.HasKey(s); } + template + bool MatchWithMisprints(DFA const & dfa) const + { + using TrieIt = Trie::Iterator; + using State = pair; + + auto const trieRoot = m_strings.GetRootIterator(); + + queue q; + q.emplace(trieRoot, dfa.Begin()); + + while (!q.empty()) + { + auto const p = q.front(); + q.pop(); + + auto const & currTrieIt = p.first; + auto const & currDfaIt = p.second; + + if (currDfaIt.Accepts()) + return true; + + currTrieIt.ForEachMove([&q, &currDfaIt](UniChar const & c, TrieIt const & nextTrieIt) { + auto nextDfaIt = currDfaIt; + nextDfaIt.Move(c); + strings::DFAMove(nextDfaIt, nextTrieIt.GetLabel()); + if (!nextDfaIt.Rejects()) + q.emplace(nextTrieIt, nextDfaIt); + }); + } + + return false; + } + private: - base::MemTrie m_strings; + Trie m_strings; }; StreetsSynonymsHolder g_streets; @@ -342,6 +413,18 @@ bool IsStreetSynonymPrefix(UniString const & s) return g_streets.MatchPrefix(s); } +bool IsStreetSynonymWithMisprints(UniString const & s) +{ + auto const dfa = BuildLevenshteinDFA(s); + return g_streets.MatchWithMisprints(dfa); +} + +bool IsStreetSynonymPrefixWithMisprints(UniString const & s) +{ + auto const dfa = strings::PrefixDFAModifier(BuildLevenshteinDFA(s)); + return g_streets.MatchWithMisprints(dfa); +} + bool ContainsNormalized(string const & str, string const & substr) { UniString const ustr = NormalizeAndSimplifyString(str); diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index 7f3f2fbe41..87017c4c09 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -2,6 +2,7 @@ #include "indexer/search_delimiters.hpp" +#include "base/levenshtein_dfa.hpp" #include "base/stl_helpers.hpp" #include "base/string_utils.hpp" @@ -12,6 +13,10 @@ namespace search { +size_t GetMaxErrorsForToken(strings::UniString const & token); + +strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s); + // This function should be used for all search strings normalization. // It does some magic text transformation which greatly helps us to improve our search. strings::UniString NormalizeAndSimplifyString(std::string const & s); @@ -80,6 +85,8 @@ strings::UniString GetStreetNameAsKey(std::string const & name, bool ignoreStree // *NOTE* The argument string must be normalized and simplified. bool IsStreetSynonym(strings::UniString const & s); bool IsStreetSynonymPrefix(strings::UniString const & s); +bool IsStreetSynonymWithMisprints(strings::UniString const & s); +bool IsStreetSynonymPrefixWithMisprints(strings::UniString const & s); /// Normalizes both str and substr, and then returns true if substr is found in str. /// Used in native platform code for search in localized strings (cuisines, categories, strings etc.). diff --git a/search/bookmarks/processor.hpp b/search/bookmarks/processor.hpp index a2c670ccf4..5141472030 100644 --- a/search/bookmarks/processor.hpp +++ b/search/bookmarks/processor.hpp @@ -10,6 +10,8 @@ #include "search/query_params.hpp" #include "search/utils.hpp" +#include "indexer/search_string_utils.hpp" + #include #include #include diff --git a/search/utils.cpp b/search/utils.cpp index 6da595cb9d..f8b55db544 100644 --- a/search/utils.cpp +++ b/search/utils.cpp @@ -14,42 +14,8 @@ using namespace std; -namespace -{ -vector const kAllowedMisprints = { - strings::MakeUniString("ckq"), - strings::MakeUniString("eyjiu"), - strings::MakeUniString("gh"), - strings::MakeUniString("pf"), - strings::MakeUniString("vw"), - strings::MakeUniString("ао"), - strings::MakeUniString("еиэ"), - strings::MakeUniString("шщ"), -}; -} // namespace - namespace search { -size_t GetMaxErrorsForToken(strings::UniString const & token) -{ - bool const digitsOnly = all_of(token.begin(), token.end(), ::isdigit); - if (digitsOnly) - return 0; - if (token.size() < 4) - return 0; - if (token.size() < 8) - return 1; - return 2; -} - -strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s) -{ - // In search we use LevenshteinDFAs for fuzzy matching. But due to - // performance reasons, we limit prefix misprints to fixed set of substitutions defined in - // kAllowedMisprints and skipped letters. - return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, GetMaxErrorsForToken(s)); -} - vector GetCategoryTypes(string const & name, string const & locale, CategoriesHolder const & categories) { diff --git a/search/utils.hpp b/search/utils.hpp index bba9fcef98..4d05d41874 100644 --- a/search/utils.hpp +++ b/search/utils.hpp @@ -28,10 +28,6 @@ class MwmInfo; namespace search { -size_t GetMaxErrorsForToken(strings::UniString const & token); - -strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s); - template void ForEachCategoryType(StringSliceBase const & slice, Locales const & locales, CategoriesHolder const & categories, ToDo && todo)