diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp index d1f88423b7..2d7cab01b1 100644 --- a/base/base_tests/levenshtein_dfa_test.cpp +++ b/base/base_tests/levenshtein_dfa_test.cpp @@ -5,6 +5,7 @@ #include #include +#include using namespace std; using namespace strings; @@ -21,11 +22,12 @@ enum class Status struct Result { Result() = default; - Result(Status status, size_t errorsMade) : m_status(status), m_errorsMade(errorsMade) {} + Result(Status status, size_t errorsMade = 0) : m_status(status), m_errorsMade(errorsMade) {} bool operator==(Result const & rhs) const { - return m_status == rhs.m_status && m_errorsMade == rhs.m_errorsMade; + return m_status == rhs.m_status && + (m_errorsMade == rhs.m_errorsMade || m_status == Status::Rejects); } Status m_status = Status::Accepts; @@ -132,14 +134,14 @@ UNIT_TEST(LevenshteinDFA_Smoke) UNIT_TEST(LevenshteinDFA_Prefix) { { - LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 1 /* maxErrors */); + LevenshteinDFA dfa("москва", 1 /* prefixSize */, 1 /* maxErrors */); TEST(Accepts(dfa, "москва"), ()); TEST(Accepts(dfa, "масква"), ()); TEST(Accepts(dfa, "моска"), ()); TEST(Rejects(dfa, "иосква"), ()); } { - LevenshteinDFA dfa("москва", 0 /* prefixCharsToKeep */, 1 /* maxErrors */); + LevenshteinDFA dfa("москва", 0 /* prefixSize */, 1 /* maxErrors */); TEST(Accepts(dfa, "москва"), ()); TEST(Accepts(dfa, "иосква"), ()); TEST(Accepts(dfa, "моксва"), ()); @@ -149,7 +151,7 @@ UNIT_TEST(LevenshteinDFA_Prefix) UNIT_TEST(LevenshteinDFA_ErrorsMade) { { - LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 2 /* maxErrors */); + LevenshteinDFA dfa("москва", 1 /* prefixSize */, 2 /* maxErrors */); TEST_EQUAL(GetResult(dfa, "москва"), Result(Status::Accepts, 0 /* errorsMade */), ()); TEST_EQUAL(GetResult(dfa, "москв"), Result(Status::Accepts, 1 /* errorsMade */), ()); @@ -165,19 +167,37 @@ UNIT_TEST(LevenshteinDFA_ErrorsMade) } { - LevenshteinDFA dfa("aa", 0 /* prefixCharsToKeep */, 2 
/* maxErrors */); + LevenshteinDFA dfa("aa", 0 /* prefixSize */, 2 /* maxErrors */); TEST_EQUAL(GetResult(dfa, "abab"), Result(Status::Accepts, 2 /* errorsMade */), ()); } { - LevenshteinDFA dfa("mississippi", 0 /* prefixCharsToKeep */, 0 /* maxErrors */); + LevenshteinDFA dfa("mississippi", 0 /* prefixSize */, 0 /* maxErrors */); TEST_EQUAL(GetResult(dfa, "misisipi").m_status, Status::Rejects, ()); TEST_EQUAL(GetResult(dfa, "mississipp").m_status, Status::Intermediate, ()); TEST_EQUAL(GetResult(dfa, "mississippi"), Result(Status::Accepts, 0 /* errorsMade */), ()); } { - LevenshteinDFA dfa("кафе", 1 /* prefixCharsToKeep */, 1 /* maxErrors */); + vector const allowedMisprints = {MakeUniString("yj")}; + size_t const prefixSize = 1; + size_t const maxErrors = 1; + string const str = "yekaterinburg"; + vector> const queries = { + {"yekaterinburg", Result(Status::Accepts, 0 /* errorsMade */)}, + {"ekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)}, + {"jekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)}, + {"iekaterinburg", Result(Status::Rejects)}}; + + for (auto const & q : queries) + { + LevenshteinDFA dfa(MakeUniString(q.first), prefixSize, allowedMisprints, maxErrors); + TEST_EQUAL(GetResult(dfa, str), q.second, ("Query:", q.first, "string:", str)); + } + } + + { + LevenshteinDFA dfa("кафе", 1 /* prefixSize */, 1 /* maxErrors */); TEST_EQUAL(GetResult(dfa, "кафе"), Result(Status::Accepts, 0 /* errorsMade */), ()); TEST_EQUAL(GetResult(dfa, "кафер"), Result(Status::Accepts, 1 /* errorsMade */), ()); } diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp index 40454192f8..9c0b09b00c 100644 --- a/base/levenshtein_dfa.cpp +++ b/base/levenshtein_dfa.cpp @@ -18,20 +18,22 @@ inline size_t AbsDiff(size_t a, size_t b) { return a > b ? 
a - b : b - a; } class TransitionTable { public: - TransitionTable(UniString const & s) : m_s(s), m_size(s.size()) {} + TransitionTable(UniString const & s, std::vector const & prefixMisprints, + size_t prefixSize) + : m_s(s), m_size(s.size()), m_prefixMisprints(prefixMisprints), m_prefixSize(prefixSize) + { + } - void Move(LevenshteinDFA::State const & s, size_t prefixCharsToKeep, UniChar c, - LevenshteinDFA::State & t) + void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t) { t.Clear(); for (auto const & p : s.m_positions) - GetMoves(p, prefixCharsToKeep, c, t); + GetMoves(p, c, t); t.Normalize(); } private: - void GetMoves(LevenshteinDFA::Position const & p, size_t prefixCharsToKeep, UniChar c, - LevenshteinDFA::State & t) + void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t) { auto & ps = t.m_positions; @@ -53,11 +55,16 @@ private: if (p.m_errorsLeft == 0) return; - if (p.m_offset < prefixCharsToKeep) - return; - ps.emplace_back(p.m_offset, p.m_errorsLeft - 1, false /* transposed */); + if (p.m_offset < m_prefixSize) + { + // Allow only prefixMisprints for prefix. 
+      if (IsAllowedPrefixMisprint(c, p.m_offset))
+        ps.emplace_back(p.m_offset + 1, p.m_errorsLeft - 1, false /* transposed */);
+      return;
+    }
+
     if (p.m_offset == m_size)
       return;
 
@@ -87,8 +94,25 @@ private:
     return false;
   }
 
+  bool IsAllowedPrefixMisprint(UniChar c, size_t position) const
+  {
+    CHECK_LESS(position, m_prefixSize, ());
+
+    for (auto const & misprints : m_prefixMisprints)
+    {
+      if (std::find(misprints.begin(), misprints.end(), c) != misprints.end() &&
+          std::find(misprints.begin(), misprints.end(), m_s[position]) != misprints.end())
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
   UniString const & m_s;
   size_t const m_size;
+  std::vector<UniString> const m_prefixMisprints;
+  size_t const m_prefixSize;
 };
 }  // namespace
 
@@ -169,10 +193,21 @@ void LevenshteinDFA::State::Normalize()
 
 // LevenshteinDFA ----------------------------------------------------------------------------------
 // static
-LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors)
+LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixSize,
+                               std::vector<UniString> const & prefixMisprints, size_t maxErrors)
   : m_size(s.size()), m_maxErrors(maxErrors)
 {
   m_alphabet.assign(s.begin(), s.end());
+  CHECK_LESS_OR_EQUAL(prefixSize, s.size(), ());
+  // Extend the alphabet with every misprint group that contains one of the prefix characters.
+  for (size_t i = 0; i < prefixSize; ++i)
+  {
+    for (auto const & misprints : prefixMisprints)
+    {
+      if (std::find(misprints.begin(), misprints.end(), s[i]) != misprints.end())
+        m_alphabet.insert(m_alphabet.end(), misprints.begin(), misprints.end());
+    }
+  }
   my::SortUnique(m_alphabet);
 
   UniChar missed = 0;
@@ -204,7 +239,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
   pushState(MakeStart(), kStartingState);
   pushState(MakeRejecting(), kRejectingState);
 
-  TransitionTable table(s);
+  TransitionTable table(s, prefixMisprints, prefixSize);
 
   while (!states.empty())
   {
@@ -222,7 +257,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
for (size_t i = 0; i < m_alphabet.size(); ++i) { State next; - table.Move(curr, prefixCharsToKeep, m_alphabet[i], next); + table.Move(curr, m_alphabet[i], next); size_t nid; @@ -242,18 +277,18 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si } } -LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors) - : LevenshteinDFA(MakeUniString(s), prefixCharsToKeep, maxErrors) +LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors) + : LevenshteinDFA(MakeUniString(s), prefixSize, {} /* prefixMisprints */, maxErrors) { } LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t maxErrors) - : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors) + : LevenshteinDFA(s, 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors) { } LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t maxErrors) - : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors) + : LevenshteinDFA(MakeUniString(s), 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors) { } diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp index 88569d0b65..5505c94b23 100644 --- a/base/levenshtein_dfa.hpp +++ b/base/levenshtein_dfa.hpp @@ -3,6 +3,7 @@ #include "base/string_utils.hpp" #include +#include #include namespace strings @@ -94,8 +95,9 @@ public: LevenshteinDFA(LevenshteinDFA const &) = default; LevenshteinDFA(LevenshteinDFA &&) = default; - LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors); - LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors); + LevenshteinDFA(UniString const & s, size_t prefixSize, + std::vector const & prefixMisprints, size_t maxErrors); + LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors); LevenshteinDFA(UniString const & s, size_t maxErrors); LevenshteinDFA(std::string const & s, size_t maxErrors); diff --git a/search/search_integration_tests/processor_test.cpp 
b/search/search_integration_tests/processor_test.cpp index 5c2becc049..446bf11967 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -432,16 +432,27 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade) TestCity chekhov(m2::PointD(0, 0), "Чеховъ Антонъ Павловичъ", "ru", 100 /* rank */); + TestStreet yesenina( + vector{m2::PointD(0.5, -0.5), m2::PointD(0, 0), m2::PointD(-0.5, 0.5)}, + "Yesenina street", "en"); + TestStreet pushkinskaya( vector{m2::PointD(-0.5, -0.5), m2::PointD(0, 0), m2::PointD(0.5, 0.5)}, "Улица Пушкинская", "ru"); + + TestStreet ostrovskogo( + vector{m2::PointD(-0.5, 0.0), m2::PointD(0, 0), m2::PointD(0.5, 0.0)}, + "улица Островского", "ru"); + TestPOI lermontov(m2::PointD(0, 0), "Трактиръ Лермонтовъ", "ru"); lermontov.SetTypes({{"amenity", "cafe"}}); auto worldId = BuildWorld([&](TestMwmBuilder & builder) { builder.Add(chekhov); }); auto wonderlandId = BuildCountry(countryName, [&](TestMwmBuilder & builder) { + builder.Add(yesenina); builder.Add(pushkinskaya); + builder.Add(ostrovskogo); builder.Add(lermontov); }); @@ -460,6 +471,14 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade) checkErrors("кафе лермонтов", ErrorsMade(1)); checkErrors("трактир лермонтов", ErrorsMade(2)); checkErrors("кафе", ErrorsMade()); + + checkErrors("Yesenina cafe", ErrorsMade(0)); + checkErrors("Esenina cafe", ErrorsMade(1)); + checkErrors("Jesenina cafe", ErrorsMade(1)); + + checkErrors("Островского кафе", ErrorsMade(0)); + checkErrors("Астровского кафе", ErrorsMade(1)); + checkErrors("пушкенская трактир лермонтов", ErrorsMade(3)); checkErrors("пушкенская кафе", ErrorsMade(1)); checkErrors("пушкинская трактиръ лермонтовъ", ErrorsMade(0)); diff --git a/search/utils.cpp b/search/utils.cpp index 12dd9293d4..4f330fc517 100644 --- a/search/utils.cpp +++ b/search/utils.cpp @@ -6,6 +6,20 @@ using namespace std; +namespace +{ +vector const kAllowedMisprints = { + 
strings::MakeUniString("ckq"), + strings::MakeUniString("eyjiu"), + strings::MakeUniString("gh"), + strings::MakeUniString("pf"), + strings::MakeUniString("vw"), + strings::MakeUniString("ао"), + strings::MakeUniString("еиэ"), + strings::MakeUniString("шщ"), +}; +} // namespace + namespace search { size_t GetMaxErrorsForToken(strings::UniString const & token) @@ -23,9 +37,9 @@ size_t GetMaxErrorsForToken(strings::UniString const & token) strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s) { // In search we use LevenshteinDFAs for fuzzy matching. But due to - // performance reasons, we assume that the first letter is always - // correct. - return strings::LevenshteinDFA(s, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(s)); + // performance reasons, we limit prefix misprints to fixed set of substitutions defined in + // kAllowedMisprints and skipped letters. + return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, GetMaxErrorsForToken(s)); } MwmSet::MwmHandle FindWorld(Index const & index, vector> const & infos)