forked from organicmaps/organicmaps
[search] Add IsStreetSynonymWithMisprints method.
This commit is contained in:
parent
8e1a57bb88
commit
e958e56792
6 changed files with 132 additions and 39 deletions
|
@ -47,6 +47,16 @@ bool TestStreetPrefixMatch(char const * s)
|
|||
return IsStreetSynonymPrefix(MakeUniString(s));
|
||||
}
|
||||
|
||||
bool TestStreetSynonymWithMisprints(char const * s)
|
||||
{
|
||||
return IsStreetSynonymWithMisprints(MakeUniString(s));
|
||||
}
|
||||
|
||||
bool TestStreetPrefixMatchWithMisprints(char const * s)
|
||||
{
|
||||
return IsStreetSynonymPrefixWithMisprints(MakeUniString(s));
|
||||
}
|
||||
|
||||
string NormalizeAndSimplifyStringUtf8(string const & s)
|
||||
{
|
||||
return strings::ToUtf8(NormalizeAndSimplifyString(s));
|
||||
|
@ -106,8 +116,23 @@ UNIT_TEST(StreetSynonym)
|
|||
{
|
||||
TEST(TestStreetSynonym("street"), ());
|
||||
TEST(TestStreetSynonym("улица"), ());
|
||||
|
||||
TEST(TestStreetSynonym("strasse"), ());
|
||||
TEST(TestStreetSynonymWithMisprints("strasse"), ());
|
||||
TEST(!TestStreetSynonym("strase"), ());
|
||||
TEST(TestStreetSynonymWithMisprints("strase"), ());
|
||||
|
||||
TEST(TestStreetSynonym("boulevard"), ());
|
||||
TEST(TestStreetSynonymWithMisprints("boulevard"), ());
|
||||
TEST(!TestStreetSynonym("boulevrd"), ());
|
||||
TEST(TestStreetSynonymWithMisprints("boulevrd"), ());
|
||||
|
||||
TEST(TestStreetSynonym("avenue"), ());
|
||||
TEST(TestStreetSynonymWithMisprints("avenue"), ());
|
||||
TEST(!TestStreetSynonym("aveneu"), ());
|
||||
TEST(TestStreetSynonymWithMisprints("aveneu"), ());
|
||||
|
||||
TEST(!TestStreetSynonymWithMisprints("abcdefg"), ());
|
||||
}
|
||||
|
||||
UNIT_TEST(StreetPrefixMatch)
|
||||
|
@ -119,6 +144,20 @@ UNIT_TEST(StreetPrefixMatch)
|
|||
TEST(TestStreetPrefixMatch("проез"), ());
|
||||
TEST(TestStreetPrefixMatch("проезд"), ());
|
||||
TEST(!TestStreetPrefixMatch("проездд"), ());
|
||||
|
||||
TEST(TestStreetPrefixMatchWithMisprints("пр"), ());
|
||||
TEST(!TestStreetPrefixMatch("пре"), ());
|
||||
TEST(!TestStreetPrefixMatchWithMisprints("пре"), ());
|
||||
TEST(!TestStreetPrefixMatch("преу"), ());
|
||||
TEST(TestStreetPrefixMatchWithMisprints("преу"), ());
|
||||
TEST(!TestStreetPrefixMatch("преул"), ());
|
||||
TEST(TestStreetPrefixMatchWithMisprints("преул"), ());
|
||||
TEST(!TestStreetPrefixMatch("преуло"), ());
|
||||
TEST(TestStreetPrefixMatchWithMisprints("преуло"), ());
|
||||
TEST(!TestStreetPrefixMatch("преулок"), ());
|
||||
TEST(TestStreetPrefixMatchWithMisprints("преулок"), ());
|
||||
TEST(!TestStreetPrefixMatch("преулак"), ());
|
||||
TEST(!TestStreetPrefixMatchWithMisprints("преулак"), ());
|
||||
}
|
||||
|
||||
UNIT_TEST(StreetTokensFilter)
|
||||
|
|
|
@ -2,12 +2,16 @@
|
|||
#include "indexer/string_set.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
#include "base/dfa_helpers.hpp"
|
||||
#include "base/macros.hpp"
|
||||
#include "base/mem_trie.hpp"
|
||||
|
||||
#include "3party/utfcpp/source/utf8/unchecked.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
using namespace strings;
|
||||
|
@ -16,6 +20,17 @@ namespace search
|
|||
{
|
||||
namespace
|
||||
{
|
||||
// Letter groups handed to LevenshteinDFA as the set of misprints tolerated in a token prefix
// (see BuildLevenshteinDFA).
// NOTE(review): presumably letters inside one group may be substituted for each other -
// confirm against the LevenshteinDFA implementation.
vector<strings::UniString> const kAllowedMisprints = {
    // Latin groups.
    strings::MakeUniString("ckq"),
    strings::MakeUniString("eyjiu"),
    strings::MakeUniString("gh"),
    strings::MakeUniString("pf"),
    strings::MakeUniString("vw"),
    // Cyrillic groups.
    strings::MakeUniString("ао"),
    strings::MakeUniString("еиэ"),
    strings::MakeUniString("шщ"),
};
|
||||
|
||||
// Replaces '#' followed by an end-of-string or a digit with space.
|
||||
void RemoveNumeroSigns(UniString & s)
|
||||
{
|
||||
|
@ -42,6 +57,26 @@ void RemoveNumeroSigns(UniString & s)
|
|||
}
|
||||
} // namespace
|
||||
|
||||
// Returns the number of misprints tolerated for |token|:
//   * 0 for tokens consisting only of ASCII digits (this includes the empty token);
//   * 0 for tokens shorter than 4 symbols;
//   * 1 for tokens shorter than 8 symbols;
//   * 2 otherwise.
size_t GetMaxErrorsForToken(strings::UniString const & token)
{
  // The token holds Unicode code points, so digits are detected by a direct range check.
  // ::isdigit must not be used here: its behaviour is undefined for values that are not
  // representable as unsigned char (e.g. any Cyrillic letter).
  bool digitsOnly = true;
  for (auto const c : token)
  {
    if (c < '0' || c > '9')
    {
      digitsOnly = false;
      break;
    }
  }

  if (digitsOnly)
    return 0;
  if (token.size() < 4)
    return 0;
  if (token.size() < 8)
    return 1;
  return 2;
}
|
||||
|
||||
// Builds the fuzzy-matching automaton used by search for the token |s|.
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s)
{
  // Search relies on LevenshteinDFAs for fuzzy matching. For performance reasons misprints
  // in the prefix are restricted to the fixed substitutions listed in kAllowedMisprints and
  // to skipped letters.
  auto const maxErrors = GetMaxErrorsForToken(s);
  return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, maxErrors);
}
|
||||
|
||||
UniString NormalizeAndSimplifyString(string const & s)
|
||||
{
|
||||
UniString uniString = MakeUniString(s);
|
||||
|
@ -205,6 +240,8 @@ public:
|
|||
bool m_empty;
|
||||
};
|
||||
|
||||
using Trie = base::MemTrie<UniString, BooleanSum, base::VectorMoves>;
|
||||
|
||||
StreetsSynonymsHolder()
|
||||
{
|
||||
char const * affics[] =
|
||||
|
@ -283,8 +320,42 @@ public:
|
|||
// Returns the result of the trie's HasPrefix() query for |s|.
bool MatchPrefix(UniString const & s) const
{
  return m_strings.HasPrefix(s);
}
|
||||
// Returns the result of the trie's HasKey() query for |s|.
bool FullMatch(UniString const & s) const
{
  return m_strings.HasKey(s);
}
|
||||
|
||||
template <typename DFA>
|
||||
bool MatchWithMisprints(DFA const & dfa) const
|
||||
{
|
||||
using TrieIt = Trie::Iterator;
|
||||
using State = pair<TrieIt, typename DFA::Iterator>;
|
||||
|
||||
auto const trieRoot = m_strings.GetRootIterator();
|
||||
|
||||
queue<State> q;
|
||||
q.emplace(trieRoot, dfa.Begin());
|
||||
|
||||
while (!q.empty())
|
||||
{
|
||||
auto const p = q.front();
|
||||
q.pop();
|
||||
|
||||
auto const & currTrieIt = p.first;
|
||||
auto const & currDfaIt = p.second;
|
||||
|
||||
if (currDfaIt.Accepts())
|
||||
return true;
|
||||
|
||||
currTrieIt.ForEachMove([&q, &currDfaIt](UniChar const & c, TrieIt const & nextTrieIt) {
|
||||
auto nextDfaIt = currDfaIt;
|
||||
nextDfaIt.Move(c);
|
||||
strings::DFAMove(nextDfaIt, nextTrieIt.GetLabel());
|
||||
if (!nextDfaIt.Rejects())
|
||||
q.emplace(nextTrieIt, nextDfaIt);
|
||||
});
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
base::MemTrie<UniString, BooleanSum, base::VectorMoves> m_strings;
|
||||
Trie m_strings;
|
||||
};
|
||||
|
||||
StreetsSynonymsHolder g_streets;
|
||||
|
@ -342,6 +413,18 @@ bool IsStreetSynonymPrefix(UniString const & s)
|
|||
return g_streets.MatchPrefix(s);
|
||||
}
|
||||
|
||||
// Matches |s| against the global street synonyms holder using the Levenshtein automaton
// built for |s|.
bool IsStreetSynonymWithMisprints(UniString const & s)
{
  return g_streets.MatchWithMisprints(BuildLevenshteinDFA(s));
}
|
||||
|
||||
// Same as IsStreetSynonymWithMisprints, but the Levenshtein automaton for |s| is wrapped into
// a PrefixDFAModifier before matching against the global street synonyms holder.
bool IsStreetSynonymPrefixWithMisprints(UniString const & s)
{
  using PrefixDFA = strings::PrefixDFAModifier<strings::LevenshteinDFA>;

  PrefixDFA const prefixDfa(BuildLevenshteinDFA(s));
  return g_streets.MatchWithMisprints(prefixDfa);
}
|
||||
|
||||
bool ContainsNormalized(string const & str, string const & substr)
|
||||
{
|
||||
UniString const ustr = NormalizeAndSimplifyString(str);
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#include "indexer/search_delimiters.hpp"
|
||||
|
||||
#include "base/levenshtein_dfa.hpp"
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
|
@ -12,6 +13,10 @@
|
|||
|
||||
namespace search
|
||||
{
|
||||
size_t GetMaxErrorsForToken(strings::UniString const & token);
|
||||
|
||||
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s);
|
||||
|
||||
// This function should be used for all search strings normalization.
|
||||
// It does some magic text transformation which greatly helps us to improve our search.
|
||||
strings::UniString NormalizeAndSimplifyString(std::string const & s);
|
||||
|
@ -80,6 +85,8 @@ strings::UniString GetStreetNameAsKey(std::string const & name, bool ignoreStree
|
|||
// *NOTE* The argument string must be normalized and simplified.
|
||||
bool IsStreetSynonym(strings::UniString const & s);
|
||||
bool IsStreetSynonymPrefix(strings::UniString const & s);
|
||||
bool IsStreetSynonymWithMisprints(strings::UniString const & s);
|
||||
bool IsStreetSynonymPrefixWithMisprints(strings::UniString const & s);
|
||||
|
||||
/// Normalizes both str and substr, and then returns true if substr is found in str.
|
||||
/// Used in native platform code for search in localized strings (cuisines, categories, strings etc.).
|
||||
|
|
|
@ -10,6 +10,8 @@
|
|||
#include "search/query_params.hpp"
|
||||
#include "search/utils.hpp"
|
||||
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
|
|
@ -14,42 +14,8 @@
|
|||
|
||||
using namespace std;
|
||||
|
||||
namespace
|
||||
{
|
||||
// Letter groups handed to LevenshteinDFA as the set of misprints tolerated in a token prefix
// (see BuildLevenshteinDFA).
// NOTE(review): presumably letters inside one group may be substituted for each other -
// confirm against the LevenshteinDFA implementation.
vector<strings::UniString> const kAllowedMisprints = {
    // Latin groups.
    strings::MakeUniString("ckq"),
    strings::MakeUniString("eyjiu"),
    strings::MakeUniString("gh"),
    strings::MakeUniString("pf"),
    strings::MakeUniString("vw"),
    // Cyrillic groups.
    strings::MakeUniString("ао"),
    strings::MakeUniString("еиэ"),
    strings::MakeUniString("шщ"),
};
|
||||
} // namespace
|
||||
|
||||
namespace search
|
||||
{
|
||||
// Returns the number of misprints tolerated for |token|:
//   * 0 for tokens consisting only of ASCII digits (this includes the empty token);
//   * 0 for tokens shorter than 4 symbols;
//   * 1 for tokens shorter than 8 symbols;
//   * 2 otherwise.
size_t GetMaxErrorsForToken(strings::UniString const & token)
{
  // The token holds Unicode code points, so digits are detected by a direct range check.
  // ::isdigit must not be used here: its behaviour is undefined for values that are not
  // representable as unsigned char (e.g. any Cyrillic letter).
  bool digitsOnly = true;
  for (auto const c : token)
  {
    if (c < '0' || c > '9')
    {
      digitsOnly = false;
      break;
    }
  }

  if (digitsOnly)
    return 0;
  if (token.size() < 4)
    return 0;
  if (token.size() < 8)
    return 1;
  return 2;
}
|
||||
|
||||
// Builds the fuzzy-matching automaton used by search for the token |s|.
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s)
{
  // Search relies on LevenshteinDFAs for fuzzy matching. For performance reasons misprints
  // in the prefix are restricted to the fixed substitutions listed in kAllowedMisprints and
  // to skipped letters.
  auto const maxErrors = GetMaxErrorsForToken(s);
  return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, maxErrors);
}
|
||||
|
||||
vector<uint32_t> GetCategoryTypes(string const & name, string const & locale,
|
||||
CategoriesHolder const & categories)
|
||||
{
|
||||
|
|
|
@ -28,10 +28,6 @@ class MwmInfo;
|
|||
|
||||
namespace search
|
||||
{
|
||||
size_t GetMaxErrorsForToken(strings::UniString const & token);
|
||||
|
||||
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s);
|
||||
|
||||
template <typename ToDo>
|
||||
void ForEachCategoryType(StringSliceBase const & slice, Locales const & locales,
|
||||
CategoriesHolder const & categories, ToDo && todo)
|
||||
|
|
Loading…
Add table
Reference in a new issue