forked from organicmaps/organicmaps
Allow predefined set of misprints in prefix.
This commit is contained in:
parent
48635d9780
commit
f76abf201c
5 changed files with 119 additions and 29 deletions
|
@ -5,6 +5,7 @@
|
|||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
using namespace strings;
|
||||
|
@ -21,11 +22,12 @@ enum class Status
|
|||
struct Result
|
||||
{
|
||||
Result() = default;
|
||||
Result(Status status, size_t errorsMade) : m_status(status), m_errorsMade(errorsMade) {}
|
||||
Result(Status status, size_t errorsMade = 0) : m_status(status), m_errorsMade(errorsMade) {}
|
||||
|
||||
bool operator==(Result const & rhs) const
|
||||
{
|
||||
return m_status == rhs.m_status && m_errorsMade == rhs.m_errorsMade;
|
||||
return m_status == rhs.m_status &&
|
||||
(m_errorsMade == rhs.m_errorsMade || m_status == Status::Rejects);
|
||||
}
|
||||
|
||||
Status m_status = Status::Accepts;
|
||||
|
@ -132,14 +134,14 @@ UNIT_TEST(LevenshteinDFA_Smoke)
|
|||
UNIT_TEST(LevenshteinDFA_Prefix)
|
||||
{
|
||||
{
|
||||
LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 1 /* maxErrors */);
|
||||
LevenshteinDFA dfa("москва", 1 /* prefixSize */, 1 /* maxErrors */);
|
||||
TEST(Accepts(dfa, "москва"), ());
|
||||
TEST(Accepts(dfa, "масква"), ());
|
||||
TEST(Accepts(dfa, "моска"), ());
|
||||
TEST(Rejects(dfa, "иосква"), ());
|
||||
}
|
||||
{
|
||||
LevenshteinDFA dfa("москва", 0 /* prefixCharsToKeep */, 1 /* maxErrors */);
|
||||
LevenshteinDFA dfa("москва", 0 /* prefixSize */, 1 /* maxErrors */);
|
||||
TEST(Accepts(dfa, "москва"), ());
|
||||
TEST(Accepts(dfa, "иосква"), ());
|
||||
TEST(Accepts(dfa, "моксва"), ());
|
||||
|
@ -149,7 +151,7 @@ UNIT_TEST(LevenshteinDFA_Prefix)
|
|||
UNIT_TEST(LevenshteinDFA_ErrorsMade)
|
||||
{
|
||||
{
|
||||
LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 2 /* maxErrors */);
|
||||
LevenshteinDFA dfa("москва", 1 /* prefixSize */, 2 /* maxErrors */);
|
||||
|
||||
TEST_EQUAL(GetResult(dfa, "москва"), Result(Status::Accepts, 0 /* errorsMade */), ());
|
||||
TEST_EQUAL(GetResult(dfa, "москв"), Result(Status::Accepts, 1 /* errorsMade */), ());
|
||||
|
@ -165,19 +167,37 @@ UNIT_TEST(LevenshteinDFA_ErrorsMade)
|
|||
}
|
||||
|
||||
{
|
||||
LevenshteinDFA dfa("aa", 0 /* prefixCharsToKeep */, 2 /* maxErrors */);
|
||||
LevenshteinDFA dfa("aa", 0 /* prefixSize */, 2 /* maxErrors */);
|
||||
TEST_EQUAL(GetResult(dfa, "abab"), Result(Status::Accepts, 2 /* errorsMade */), ());
|
||||
}
|
||||
|
||||
{
|
||||
LevenshteinDFA dfa("mississippi", 0 /* prefixCharsToKeep */, 0 /* maxErrors */);
|
||||
LevenshteinDFA dfa("mississippi", 0 /* prefixSize */, 0 /* maxErrors */);
|
||||
TEST_EQUAL(GetResult(dfa, "misisipi").m_status, Status::Rejects, ());
|
||||
TEST_EQUAL(GetResult(dfa, "mississipp").m_status, Status::Intermediate, ());
|
||||
TEST_EQUAL(GetResult(dfa, "mississippi"), Result(Status::Accepts, 0 /* errorsMade */), ());
|
||||
}
|
||||
|
||||
{
|
||||
LevenshteinDFA dfa("кафе", 1 /* prefixCharsToKeep */, 1 /* maxErrors */);
|
||||
vector<UniString> const allowedMisprints = {MakeUniString("yj")};
|
||||
size_t const prefixSize = 1;
|
||||
size_t const maxErrors = 1;
|
||||
string const str = "yekaterinburg";
|
||||
vector<pair<string, Result>> const queries = {
|
||||
{"yekaterinburg", Result(Status::Accepts, 0 /* errorsMade */)},
|
||||
{"ekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)},
|
||||
{"jekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)},
|
||||
{"iekaterinburg", Result(Status::Rejects)}};
|
||||
|
||||
for (auto const & q : queries)
|
||||
{
|
||||
LevenshteinDFA dfa(MakeUniString(q.first), prefixSize, allowedMisprints, maxErrors);
|
||||
TEST_EQUAL(GetResult(dfa, str), q.second, ("Query:", q.first, "string:", str));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
LevenshteinDFA dfa("кафе", 1 /* prefixSize */, 1 /* maxErrors */);
|
||||
TEST_EQUAL(GetResult(dfa, "кафе"), Result(Status::Accepts, 0 /* errorsMade */), ());
|
||||
TEST_EQUAL(GetResult(dfa, "кафер"), Result(Status::Accepts, 1 /* errorsMade */), ());
|
||||
}
|
||||
|
|
|
@ -18,20 +18,22 @@ inline size_t AbsDiff(size_t a, size_t b) { return a > b ? a - b : b - a; }
|
|||
class TransitionTable
|
||||
{
|
||||
public:
|
||||
TransitionTable(UniString const & s) : m_s(s), m_size(s.size()) {}
|
||||
// Builds a transition table for the pattern |s|.
// |s| is bound to the reference member m_s, so it has to outlive this object;
// |prefixMisprints| is copied into m_prefixMisprints.
// |prefixSize| is stored in m_prefixSize and is compared against position offsets
// in GetMoves() and IsAllowedPrefixMisprint().
TransitionTable(UniString const & s, std::vector<UniString> const & prefixMisprints,
|
||||
size_t prefixSize)
|
||||
: m_s(s), m_size(s.size()), m_prefixMisprints(prefixMisprints), m_prefixSize(prefixSize)
|
||||
{
|
||||
}
|
||||
|
||||
void Move(LevenshteinDFA::State const & s, size_t prefixCharsToKeep, UniChar c,
|
||||
LevenshteinDFA::State & t)
|
||||
void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t)
|
||||
{
|
||||
t.Clear();
|
||||
for (auto const & p : s.m_positions)
|
||||
GetMoves(p, prefixCharsToKeep, c, t);
|
||||
GetMoves(p, c, t);
|
||||
t.Normalize();
|
||||
}
|
||||
|
||||
private:
|
||||
void GetMoves(LevenshteinDFA::Position const & p, size_t prefixCharsToKeep, UniChar c,
|
||||
LevenshteinDFA::State & t)
|
||||
void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t)
|
||||
{
|
||||
auto & ps = t.m_positions;
|
||||
|
||||
|
@ -53,11 +55,16 @@ private:
|
|||
if (p.m_errorsLeft == 0)
|
||||
return;
|
||||
|
||||
if (p.m_offset < prefixCharsToKeep)
|
||||
return;
|
||||
|
||||
ps.emplace_back(p.m_offset, p.m_errorsLeft - 1, false /* transposed */);
|
||||
|
||||
if (p.m_offset < m_prefixSize)
|
||||
{
|
||||
// Allow only prefixMisprints for prefix.
|
||||
if (IsAllowedPrefixMisprint(c, p.m_offset))
|
||||
ps.emplace_back(p.m_offset + 1, p.m_errorsLeft - 1, false /* transposed */);
|
||||
return;
|
||||
}
|
||||
|
||||
if (p.m_offset == m_size)
|
||||
return;
|
||||
|
||||
|
@ -87,8 +94,25 @@ private:
|
|||
return false;
|
||||
}
|
||||
|
||||
// Tells whether typing |c| instead of the pattern character at |position| is a permitted
// prefix misprint, i.e. whether some group in m_prefixMisprints contains both characters.
// |position| must lie inside the prefix (position < m_prefixSize).
bool IsAllowedPrefixMisprint(UniChar c, size_t position) const
{
  CHECK_LESS(position, m_prefixSize, ());

  auto const holdsBoth = [&](UniString const & group) {
    bool const hasTyped = std::find(group.begin(), group.end(), c) != group.end();
    // The pattern character is looked up only when the typed one belongs to the group.
    return hasTyped && std::find(group.begin(), group.end(), m_s[position]) != group.end();
  };

  return std::any_of(m_prefixMisprints.begin(), m_prefixMisprints.end(), holdsBoth);
}
|
||||
|
||||
UniString const & m_s;
|
||||
size_t const m_size;
|
||||
std::vector<UniString> const m_prefixMisprints;
|
||||
size_t const m_prefixSize;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
|
@ -169,10 +193,21 @@ void LevenshteinDFA::State::Normalize()
|
|||
|
||||
// LevenshteinDFA ----------------------------------------------------------------------------------
|
||||
// static
|
||||
LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors)
|
||||
LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixSize,
|
||||
std::vector<UniString> const & prefixMisprints, size_t maxErrors)
|
||||
: m_size(s.size()), m_maxErrors(maxErrors)
|
||||
{
|
||||
m_alphabet.assign(s.begin(), s.end());
|
||||
CHECK_LESS_OR_EQUAL(prefixSize, s.size(), ());
|
||||
|
||||
for (auto it = s.begin(); std::distance(it, s.begin()) < prefixSize; ++it)
|
||||
{
|
||||
for (auto const & misprints : prefixMisprints)
|
||||
{
|
||||
if (std::find(misprints.begin(), misprints.end(), *it) != misprints.end())
|
||||
m_alphabet.insert(m_alphabet.end(), misprints.begin(), misprints.end());
|
||||
}
|
||||
}
|
||||
my::SortUnique(m_alphabet);
|
||||
|
||||
UniChar missed = 0;
|
||||
|
@ -204,7 +239,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
|
|||
pushState(MakeStart(), kStartingState);
|
||||
pushState(MakeRejecting(), kRejectingState);
|
||||
|
||||
TransitionTable table(s);
|
||||
TransitionTable table(s, prefixMisprints, prefixSize);
|
||||
|
||||
while (!states.empty())
|
||||
{
|
||||
|
@ -222,7 +257,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
|
|||
for (size_t i = 0; i < m_alphabet.size(); ++i)
|
||||
{
|
||||
State next;
|
||||
table.Move(curr, prefixCharsToKeep, m_alphabet[i], next);
|
||||
table.Move(curr, m_alphabet[i], next);
|
||||
|
||||
size_t nid;
|
||||
|
||||
|
@ -242,18 +277,18 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
|
|||
}
|
||||
}
|
||||
|
||||
LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors)
|
||||
: LevenshteinDFA(MakeUniString(s), prefixCharsToKeep, maxErrors)
|
||||
LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors)
|
||||
: LevenshteinDFA(MakeUniString(s), prefixSize, {} /* prefixMisprints */, maxErrors)
|
||||
{
|
||||
}
|
||||
|
||||
LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t maxErrors)
|
||||
: LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors)
|
||||
: LevenshteinDFA(s, 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors)
|
||||
{
|
||||
}
|
||||
|
||||
LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t maxErrors)
|
||||
: LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors)
|
||||
: LevenshteinDFA(MakeUniString(s), 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors)
|
||||
{
|
||||
}
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace strings
|
||||
|
@ -94,8 +95,9 @@ public:
|
|||
LevenshteinDFA(LevenshteinDFA const &) = default;
|
||||
LevenshteinDFA(LevenshteinDFA &&) = default;
|
||||
|
||||
LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors);
|
||||
LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors);
|
||||
LevenshteinDFA(UniString const & s, size_t prefixSize,
|
||||
std::vector<UniString> const & prefixMisprints, size_t maxErrors);
|
||||
LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors);
|
||||
LevenshteinDFA(UniString const & s, size_t maxErrors);
|
||||
LevenshteinDFA(std::string const & s, size_t maxErrors);
|
||||
|
||||
|
|
|
@ -432,16 +432,27 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade)
|
|||
|
||||
TestCity chekhov(m2::PointD(0, 0), "Чеховъ Антонъ Павловичъ", "ru", 100 /* rank */);
|
||||
|
||||
TestStreet yesenina(
|
||||
vector<m2::PointD>{m2::PointD(0.5, -0.5), m2::PointD(0, 0), m2::PointD(-0.5, 0.5)},
|
||||
"Yesenina street", "en");
|
||||
|
||||
TestStreet pushkinskaya(
|
||||
vector<m2::PointD>{m2::PointD(-0.5, -0.5), m2::PointD(0, 0), m2::PointD(0.5, 0.5)},
|
||||
"Улица Пушкинская", "ru");
|
||||
|
||||
TestStreet ostrovskogo(
|
||||
vector<m2::PointD>{m2::PointD(-0.5, 0.0), m2::PointD(0, 0), m2::PointD(0.5, 0.0)},
|
||||
"улица Островского", "ru");
|
||||
|
||||
TestPOI lermontov(m2::PointD(0, 0), "Трактиръ Лермонтовъ", "ru");
|
||||
lermontov.SetTypes({{"amenity", "cafe"}});
|
||||
|
||||
auto worldId = BuildWorld([&](TestMwmBuilder & builder) { builder.Add(chekhov); });
|
||||
|
||||
auto wonderlandId = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
|
||||
builder.Add(yesenina);
|
||||
builder.Add(pushkinskaya);
|
||||
builder.Add(ostrovskogo);
|
||||
builder.Add(lermontov);
|
||||
});
|
||||
|
||||
|
@ -460,6 +471,14 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade)
|
|||
checkErrors("кафе лермонтов", ErrorsMade(1));
|
||||
checkErrors("трактир лермонтов", ErrorsMade(2));
|
||||
checkErrors("кафе", ErrorsMade());
|
||||
|
||||
checkErrors("Yesenina cafe", ErrorsMade(0));
|
||||
checkErrors("Esenina cafe", ErrorsMade(1));
|
||||
checkErrors("Jesenina cafe", ErrorsMade(1));
|
||||
|
||||
checkErrors("Островского кафе", ErrorsMade(0));
|
||||
checkErrors("Астровского кафе", ErrorsMade(1));
|
||||
|
||||
checkErrors("пушкенская трактир лермонтов", ErrorsMade(3));
|
||||
checkErrors("пушкенская кафе", ErrorsMade(1));
|
||||
checkErrors("пушкинская трактиръ лермонтовъ", ErrorsMade(0));
|
||||
|
|
|
@ -6,6 +6,20 @@
|
|||
|
||||
using namespace std;
|
||||
|
||||
namespace
|
||||
{
|
||||
// Groups of characters passed to strings::LevenshteinDFA as the prefix misprints
// (see BuildLevenshteinDFA). Each string is one group: Latin groups first, then Cyrillic ones.
// NOTE(review): presumably characters within one group may replace each other in the
// token prefix - confirm against the LevenshteinDFA implementation.
vector<strings::UniString> const kAllowedMisprints = {
|
||||
strings::MakeUniString("ckq"),
|
||||
strings::MakeUniString("eyjiu"),
|
||||
strings::MakeUniString("gh"),
|
||||
strings::MakeUniString("pf"),
|
||||
strings::MakeUniString("vw"),
|
||||
strings::MakeUniString("ао"),
|
||||
strings::MakeUniString("еиэ"),
|
||||
strings::MakeUniString("шщ"),
|
||||
};
|
||||
} // namespace
|
||||
|
||||
namespace search
|
||||
{
|
||||
size_t GetMaxErrorsForToken(strings::UniString const & token)
|
||||
|
@ -23,9 +37,9 @@ size_t GetMaxErrorsForToken(strings::UniString const & token)
|
|||
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s)
|
||||
{
|
||||
// In search we use LevenshteinDFAs for fuzzy matching. But due to
|
||||
// performance reasons, we assume that the first letter is always
|
||||
// correct.
|
||||
return strings::LevenshteinDFA(s, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(s));
|
||||
// performance reasons, we limit prefix misprints to fixed set of substitutions defined in
|
||||
// kAllowedMisprints and skipped letters.
|
||||
return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, GetMaxErrorsForToken(s));
|
||||
}
|
||||
|
||||
MwmSet::MwmHandle FindWorld(Index const & index, vector<shared_ptr<MwmInfo>> const & infos)
|
||||
|
|
Loading…
Add table
Reference in a new issue