[search] Add IsStreetSynonymWithMisprints method.

This commit is contained in:
tatiana-yan 2019-06-26 17:00:35 +03:00 committed by mpimenov
parent 8e1a57bb88
commit e958e56792
6 changed files with 132 additions and 39 deletions

View file

@ -47,6 +47,16 @@ bool TestStreetPrefixMatch(char const * s)
return IsStreetSynonymPrefix(MakeUniString(s));
}
// Test helper: converts |s| with MakeUniString() and passes the result to
// IsStreetSynonymWithMisprints().
bool TestStreetSynonymWithMisprints(char const * s)
{
  auto const token = MakeUniString(s);
  return IsStreetSynonymWithMisprints(token);
}
// Test helper: converts |s| with MakeUniString() and passes the result to
// IsStreetSynonymPrefixWithMisprints().
bool TestStreetPrefixMatchWithMisprints(char const * s)
{
  auto const token = MakeUniString(s);
  return IsStreetSynonymPrefixWithMisprints(token);
}
string NormalizeAndSimplifyStringUtf8(string const & s)
{
return strings::ToUtf8(NormalizeAndSimplifyString(s));
@ -106,8 +116,23 @@ UNIT_TEST(StreetSynonym)
{
// Exact synonyms are recognized by the strict matcher.
TEST(TestStreetSynonym("street"), ());
TEST(TestStreetSynonym("улица"), ());
// For each synonym below the exact spelling must pass both the strict and
// the misprint-tolerant matcher, while the misspelled variant must be
// rejected by the strict matcher and accepted by the misprint-tolerant one.
// "strase": a letter of "strasse" is missing.
TEST(TestStreetSynonym("strasse"), ());
TEST(TestStreetSynonymWithMisprints("strasse"), ());
TEST(!TestStreetSynonym("strase"), ());
TEST(TestStreetSynonymWithMisprints("strase"), ());
// "boulevrd": a letter of "boulevard" is missing.
TEST(TestStreetSynonym("boulevard"), ());
TEST(TestStreetSynonymWithMisprints("boulevard"), ());
TEST(!TestStreetSynonym("boulevrd"), ());
TEST(TestStreetSynonymWithMisprints("boulevrd"), ());
// "aveneu": the last two letters of "avenue" are swapped.
TEST(TestStreetSynonym("avenue"), ());
TEST(TestStreetSynonymWithMisprints("avenue"), ());
TEST(!TestStreetSynonym("aveneu"), ());
TEST(TestStreetSynonymWithMisprints("aveneu"), ());
// A string that is not a misspelling of any synonym must be rejected even
// when misprints are allowed.
TEST(!TestStreetSynonymWithMisprints("abcdefg"), ());
}
UNIT_TEST(StreetPrefixMatch)
@ -119,6 +144,20 @@ UNIT_TEST(StreetPrefixMatch)
TEST(TestStreetPrefixMatch("проез"), ());
TEST(TestStreetPrefixMatch("проезд"), ());
TEST(!TestStreetPrefixMatch("проездд"), ());
TEST(TestStreetPrefixMatchWithMisprints("пр"), ());
TEST(!TestStreetPrefixMatch("пре"), ());
TEST(!TestStreetPrefixMatchWithMisprints("пре"), ());
TEST(!TestStreetPrefixMatch("преу"), ());
TEST(TestStreetPrefixMatchWithMisprints("преу"), ());
TEST(!TestStreetPrefixMatch("преул"), ());
TEST(TestStreetPrefixMatchWithMisprints("преул"), ());
TEST(!TestStreetPrefixMatch("преуло"), ());
TEST(TestStreetPrefixMatchWithMisprints("преуло"), ());
TEST(!TestStreetPrefixMatch("преулок"), ());
TEST(TestStreetPrefixMatchWithMisprints("преулок"), ());
TEST(!TestStreetPrefixMatch("преулак"), ());
TEST(!TestStreetPrefixMatchWithMisprints("преулак"), ());
}
UNIT_TEST(StreetTokensFilter)

View file

@ -2,12 +2,16 @@
#include "indexer/string_set.hpp"
#include "base/assert.hpp"
#include "base/dfa_helpers.hpp"
#include "base/macros.hpp"
#include "base/mem_trie.hpp"
#include "3party/utfcpp/source/utf8/unchecked.h"
#include <algorithm>
#include <memory>
#include <queue>
#include <vector>
using namespace std;
using namespace strings;
@ -16,6 +20,17 @@ namespace search
{
namespace
{
// Groups of characters handed to strings::LevenshteinDFA as the misprints
// allowed in the token prefix (see BuildLevenshteinDFA()).
// NOTE(review): presumably the characters inside one group may be
// substituted for one another - confirm against base/levenshtein_dfa.hpp.
vector<strings::UniString> const kAllowedMisprints = {
strings::MakeUniString("ckq"),
strings::MakeUniString("eyjiu"),
strings::MakeUniString("gh"),
strings::MakeUniString("pf"),
strings::MakeUniString("vw"),
strings::MakeUniString("ао"),
strings::MakeUniString("еиэ"),
strings::MakeUniString("шщ"),
};
// Replaces '#' followed by an end-of-string or a digit with space.
void RemoveNumeroSigns(UniString & s)
{
@ -42,6 +57,26 @@ void RemoveNumeroSigns(UniString & s)
}
} // namespace
// Returns the maximum number of misprints tolerated when matching |token|:
//   * 0 for tokens made only of ASCII digits (an empty token included) and
//     for tokens shorter than 4 characters;
//   * 1 for tokens shorter than 8 characters;
//   * 2 for all longer tokens.
size_t GetMaxErrorsForToken(strings::UniString const & token)
{
  // A UniChar may hold a value that is not representable as unsigned char,
  // and passing such a value to ::isdigit is undefined behaviour. An explicit
  // range check classifies the same characters and is safe for any code point.
  auto const isAsciiDigit = [](strings::UniChar c) { return c >= '0' && c <= '9'; };
  if (all_of(token.begin(), token.end(), isAsciiDigit))
    return 0;

  auto const length = token.size();
  if (length < 4)
    return 0;
  if (length < 8)
    return 1;
  return 2;
}
// Creates the LevenshteinDFA that search uses for fuzzy matching of |s|.
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s)
{
  // For performance reasons misprints in the token prefix are limited to
  // skipped letters and to the fixed set of substitutions listed in
  // kAllowedMisprints.
  auto const maxErrors = GetMaxErrorsForToken(s);
  return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, maxErrors);
}
UniString NormalizeAndSimplifyString(string const & s)
{
UniString uniString = MakeUniString(s);
@ -205,6 +240,8 @@ public:
bool m_empty;
};
using Trie = base::MemTrie<UniString, BooleanSum, base::VectorMoves>;
StreetsSynonymsHolder()
{
char const * affics[] =
@ -283,8 +320,42 @@ public:
// Returns the result of the HasPrefix() query on |m_strings| for |s|.
bool MatchPrefix(UniString const & s) const
{
  return m_strings.HasPrefix(s);
}
// Returns the result of the HasKey() query on |m_strings| for |s|.
bool FullMatch(UniString const & s) const
{
  return m_strings.HasKey(s);
}
// Returns true iff at least one key stored in |m_strings| is accepted by |dfa|.
//
// The trie and the automaton are traversed together, breadth-first: every
// queue entry is a (trie node, DFA state) pair, and a branch is dropped as
// soon as the automaton rejects it.
template <typename DFA>
bool MatchWithMisprints(DFA const & dfa) const
{
  using TrieIt = Trie::Iterator;
  using State = pair<TrieIt, typename DFA::Iterator>;

  auto const trieRoot = m_strings.GetRootIterator();

  queue<State> q;
  q.emplace(trieRoot, dfa.Begin());

  while (!q.empty())
  {
    auto const p = q.front();
    q.pop();

    auto const & currTrieIt = p.first;
    auto const & currDfaIt = p.second;

    if (currDfaIt.Accepts())
    {
      // An accepting DFA state alone is not a match: the current trie node
      // may be an inner node that only spells a prefix of a synonym. Report
      // success only when a key really ends in this node; otherwise keep
      // descending, since a longer key may still be accepted.
      // NOTE(review): assumes Trie::Iterator::ForEachInNode() visits the
      // node's BooleanSum value and that keys are added with |true| -
      // confirm against base/mem_trie.hpp and the constructor above.
      bool endsKey = false;
      currTrieIt.ForEachInNode([&endsKey](bool value) { endsKey = endsKey || value; });
      if (endsKey)
        return true;
    }

    currTrieIt.ForEachMove([&q, &currDfaIt](UniChar const & c, TrieIt const & nextTrieIt) {
      // Feed the automaton with the move character and then with the rest of
      // the edge label leading to the child node.
      auto nextDfaIt = currDfaIt;
      nextDfaIt.Move(c);
      strings::DFAMove(nextDfaIt, nextTrieIt.GetLabel());
      if (!nextDfaIt.Rejects())
        q.emplace(nextTrieIt, nextDfaIt);
    });
  }

  return false;
}
private:
base::MemTrie<UniString, BooleanSum, base::VectorMoves> m_strings;
Trie m_strings;
};
StreetsSynonymsHolder g_streets;
@ -342,6 +413,18 @@ bool IsStreetSynonymPrefix(UniString const & s)
return g_streets.MatchPrefix(s);
}
// Matches |s| against the street synonyms held by |g_streets| using the
// automaton produced by BuildLevenshteinDFA(s).
bool IsStreetSynonymWithMisprints(UniString const & s)
{
  return g_streets.MatchWithMisprints(BuildLevenshteinDFA(s));
}
// Matches |s| against the street synonyms held by |g_streets| using the
// automaton produced by BuildLevenshteinDFA(s) wrapped into
// strings::PrefixDFAModifier.
bool IsStreetSynonymPrefixWithMisprints(UniString const & s)
{
  using PrefixDFA = strings::PrefixDFAModifier<strings::LevenshteinDFA>;
  auto const prefixDfa = PrefixDFA(BuildLevenshteinDFA(s));
  return g_streets.MatchWithMisprints(prefixDfa);
}
bool ContainsNormalized(string const & str, string const & substr)
{
UniString const ustr = NormalizeAndSimplifyString(str);

View file

@ -2,6 +2,7 @@
#include "indexer/search_delimiters.hpp"
#include "base/levenshtein_dfa.hpp"
#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
@ -12,6 +13,10 @@
namespace search
{
// Returns the number of misprints tolerated for |token|: 0 for digit-only
// tokens and for tokens shorter than 4 characters, 1 for tokens shorter than
// 8 characters, 2 otherwise.
size_t GetMaxErrorsForToken(strings::UniString const & token);
// Builds the LevenshteinDFA used for fuzzy matching of |s|; the error limit
// is GetMaxErrorsForToken(s).
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s);
// This function should be used for all search strings normalization.
// It does some magic text transformation which greatly helps us to improve our search.
strings::UniString NormalizeAndSimplifyString(std::string const & s);
@ -80,6 +85,8 @@ strings::UniString GetStreetNameAsKey(std::string const & name, bool ignoreStree
// *NOTE* The argument string must be normalized and simplified.
bool IsStreetSynonym(strings::UniString const & s);
bool IsStreetSynonymPrefix(strings::UniString const & s);
// Misprint-tolerant counterparts of the two checks above: |s| is matched
// against the street synonyms through the automaton built by
// BuildLevenshteinDFA(s).
bool IsStreetSynonymWithMisprints(strings::UniString const & s);
bool IsStreetSynonymPrefixWithMisprints(strings::UniString const & s);
/// Normalizes both str and substr, and then returns true if substr is found in str.
/// Used in native platform code for search in localized strings (cuisines, categories, strings etc.).

View file

@ -10,6 +10,8 @@
#include "search/query_params.hpp"
#include "search/utils.hpp"
#include "indexer/search_string_utils.hpp"
#include <cstdint>
#include <unordered_map>
#include <utility>

View file

@ -14,42 +14,8 @@
using namespace std;
namespace
{
// Groups of characters handed to strings::LevenshteinDFA as the misprints
// allowed in the token prefix (see BuildLevenshteinDFA()).
// NOTE(review): presumably the characters inside one group may be
// substituted for one another - confirm against base/levenshtein_dfa.hpp.
vector<strings::UniString> const kAllowedMisprints = {
strings::MakeUniString("ckq"),
strings::MakeUniString("eyjiu"),
strings::MakeUniString("gh"),
strings::MakeUniString("pf"),
strings::MakeUniString("vw"),
strings::MakeUniString("ао"),
strings::MakeUniString("еиэ"),
strings::MakeUniString("шщ"),
};
} // namespace
namespace search
{
// Returns the maximum number of misprints tolerated when matching |token|:
//   * 0 for tokens made only of ASCII digits (an empty token included) and
//     for tokens shorter than 4 characters;
//   * 1 for tokens shorter than 8 characters;
//   * 2 for all longer tokens.
size_t GetMaxErrorsForToken(strings::UniString const & token)
{
  // A UniChar may hold a value that is not representable as unsigned char,
  // and passing such a value to ::isdigit is undefined behaviour. An explicit
  // range check classifies the same characters and is safe for any code point.
  auto const isAsciiDigit = [](strings::UniChar c) { return c >= '0' && c <= '9'; };
  if (all_of(token.begin(), token.end(), isAsciiDigit))
    return 0;

  auto const length = token.size();
  if (length < 4)
    return 0;
  if (length < 8)
    return 1;
  return 2;
}
// Creates the LevenshteinDFA that search uses for fuzzy matching of |s|.
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s)
{
  // For performance reasons misprints in the token prefix are limited to
  // skipped letters and to the fixed set of substitutions listed in
  // kAllowedMisprints.
  auto const maxErrors = GetMaxErrorsForToken(s);
  return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, maxErrors);
}
vector<uint32_t> GetCategoryTypes(string const & name, string const & locale,
CategoriesHolder const & categories)
{

View file

@ -28,10 +28,6 @@ class MwmInfo;
namespace search
{
// Returns the number of misprints tolerated for |token|: 0 for digit-only
// tokens and for tokens shorter than 4 characters, 1 for tokens shorter than
// 8 characters, 2 otherwise.
size_t GetMaxErrorsForToken(strings::UniString const & token);
// Builds the LevenshteinDFA used for fuzzy matching of |s|; the error limit
// is GetMaxErrorsForToken(s).
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s);
template <typename ToDo>
void ForEachCategoryType(StringSliceBase const & slice, Locales const & locales,
CategoriesHolder const & categories, ToDo && todo)