Allow predefined set of misprints in prefix.

This commit is contained in:
tatiana-kondakova 2018-03-23 20:09:23 +03:00 committed by mpimenov
parent 48635d9780
commit f76abf201c
5 changed files with 119 additions and 29 deletions

View file

@ -5,6 +5,7 @@
#include <sstream>
#include <string>
#include <vector>
using namespace std;
using namespace strings;
@ -21,11 +22,12 @@ enum class Status
struct Result
{
Result() = default;
Result(Status status, size_t errorsMade) : m_status(status), m_errorsMade(errorsMade) {}
Result(Status status, size_t errorsMade = 0) : m_status(status), m_errorsMade(errorsMade) {}
bool operator==(Result const & rhs) const
{
return m_status == rhs.m_status && m_errorsMade == rhs.m_errorsMade;
return m_status == rhs.m_status &&
(m_errorsMade == rhs.m_errorsMade || m_status == Status::Rejects);
}
Status m_status = Status::Accepts;
@ -132,14 +134,14 @@ UNIT_TEST(LevenshteinDFA_Smoke)
UNIT_TEST(LevenshteinDFA_Prefix)
{
{
LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 1 /* maxErrors */);
LevenshteinDFA dfa("москва", 1 /* prefixSize */, 1 /* maxErrors */);
TEST(Accepts(dfa, "москва"), ());
TEST(Accepts(dfa, "масква"), ());
TEST(Accepts(dfa, "моска"), ());
TEST(Rejects(dfa, "иосква"), ());
}
{
LevenshteinDFA dfa("москва", 0 /* prefixCharsToKeep */, 1 /* maxErrors */);
LevenshteinDFA dfa("москва", 0 /* prefixSize */, 1 /* maxErrors */);
TEST(Accepts(dfa, "москва"), ());
TEST(Accepts(dfa, "иосква"), ());
TEST(Accepts(dfa, "моксва"), ());
@ -149,7 +151,7 @@ UNIT_TEST(LevenshteinDFA_Prefix)
UNIT_TEST(LevenshteinDFA_ErrorsMade)
{
{
LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 2 /* maxErrors */);
LevenshteinDFA dfa("москва", 1 /* prefixSize */, 2 /* maxErrors */);
TEST_EQUAL(GetResult(dfa, "москва"), Result(Status::Accepts, 0 /* errorsMade */), ());
TEST_EQUAL(GetResult(dfa, "москв"), Result(Status::Accepts, 1 /* errorsMade */), ());
@ -165,19 +167,37 @@ UNIT_TEST(LevenshteinDFA_ErrorsMade)
}
{
LevenshteinDFA dfa("aa", 0 /* prefixCharsToKeep */, 2 /* maxErrors */);
LevenshteinDFA dfa("aa", 0 /* prefixSize */, 2 /* maxErrors */);
TEST_EQUAL(GetResult(dfa, "abab"), Result(Status::Accepts, 2 /* errorsMade */), ());
}
{
LevenshteinDFA dfa("mississippi", 0 /* prefixCharsToKeep */, 0 /* maxErrors */);
LevenshteinDFA dfa("mississippi", 0 /* prefixSize */, 0 /* maxErrors */);
TEST_EQUAL(GetResult(dfa, "misisipi").m_status, Status::Rejects, ());
TEST_EQUAL(GetResult(dfa, "mississipp").m_status, Status::Intermediate, ());
TEST_EQUAL(GetResult(dfa, "mississippi"), Result(Status::Accepts, 0 /* errorsMade */), ());
}
{
LevenshteinDFA dfa("кафе", 1 /* prefixCharsToKeep */, 1 /* maxErrors */);
vector<UniString> const allowedMisprints = {MakeUniString("yj")};
size_t const prefixSize = 1;
size_t const maxErrors = 1;
string const str = "yekaterinburg";
vector<pair<string, Result>> const queries = {
{"yekaterinburg", Result(Status::Accepts, 0 /* errorsMade */)},
{"ekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)},
{"jekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)},
{"iekaterinburg", Result(Status::Rejects)}};
for (auto const & q : queries)
{
LevenshteinDFA dfa(MakeUniString(q.first), prefixSize, allowedMisprints, maxErrors);
TEST_EQUAL(GetResult(dfa, str), q.second, ("Query:", q.first, "string:", str));
}
}
{
LevenshteinDFA dfa("кафе", 1 /* prefixSize */, 1 /* maxErrors */);
TEST_EQUAL(GetResult(dfa, "кафе"), Result(Status::Accepts, 0 /* errorsMade */), ());
TEST_EQUAL(GetResult(dfa, "кафер"), Result(Status::Accepts, 1 /* errorsMade */), ());
}

View file

@ -18,20 +18,22 @@ inline size_t AbsDiff(size_t a, size_t b) { return a > b ? a - b : b - a; }
class TransitionTable
{
public:
TransitionTable(UniString const & s) : m_s(s), m_size(s.size()) {}
TransitionTable(UniString const & s, std::vector<UniString> const & prefixMisprints,
size_t prefixSize)
: m_s(s), m_size(s.size()), m_prefixMisprints(prefixMisprints), m_prefixSize(prefixSize)
{
}
void Move(LevenshteinDFA::State const & s, size_t prefixCharsToKeep, UniChar c,
LevenshteinDFA::State & t)
void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t)
{
t.Clear();
for (auto const & p : s.m_positions)
GetMoves(p, prefixCharsToKeep, c, t);
GetMoves(p, c, t);
t.Normalize();
}
private:
void GetMoves(LevenshteinDFA::Position const & p, size_t prefixCharsToKeep, UniChar c,
LevenshteinDFA::State & t)
void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t)
{
auto & ps = t.m_positions;
@ -53,11 +55,16 @@ private:
if (p.m_errorsLeft == 0)
return;
if (p.m_offset < prefixCharsToKeep)
return;
ps.emplace_back(p.m_offset, p.m_errorsLeft - 1, false /* transposed */);
if (p.m_offset < m_prefixSize)
{
// Allow only prefixMisprints for prefix.
if (IsAllowedPrefixMisprint(c, p.m_offset))
ps.emplace_back(p.m_offset + 1, p.m_errorsLeft - 1, false /* transposed */);
return;
}
if (p.m_offset == m_size)
return;
@ -87,8 +94,25 @@ private:
return false;
}
bool IsAllowedPrefixMisprint(UniChar c, size_t position) const
{
CHECK_LESS(position, m_prefixSize, ());
for (auto const & misprints : m_prefixMisprints)
{
if (std::find(misprints.begin(), misprints.end(), c) != misprints.end() &&
std::find(misprints.begin(), misprints.end(), m_s[position]) != misprints.end())
{
return true;
}
}
return false;
}
UniString const & m_s;
size_t const m_size;
std::vector<UniString> const m_prefixMisprints;
size_t const m_prefixSize;
};
} // namespace
@ -169,10 +193,21 @@ void LevenshteinDFA::State::Normalize()
// LevenshteinDFA ----------------------------------------------------------------------------------
// static
LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors)
LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixSize,
std::vector<UniString> const & prefixMisprints, size_t maxErrors)
: m_size(s.size()), m_maxErrors(maxErrors)
{
m_alphabet.assign(s.begin(), s.end());
CHECK_LESS_OR_EQUAL(prefixSize, s.size(), ());
for (auto it = s.begin(); std::distance(it, s.begin()) < prefixSize; ++it)
{
for (auto const & misprints : prefixMisprints)
{
if (std::find(misprints.begin(), misprints.end(), *it) != misprints.end())
m_alphabet.insert(m_alphabet.end(), misprints.begin(), misprints.end());
}
}
my::SortUnique(m_alphabet);
UniChar missed = 0;
@ -204,7 +239,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
pushState(MakeStart(), kStartingState);
pushState(MakeRejecting(), kRejectingState);
TransitionTable table(s);
TransitionTable table(s, prefixMisprints, prefixSize);
while (!states.empty())
{
@ -222,7 +257,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
for (size_t i = 0; i < m_alphabet.size(); ++i)
{
State next;
table.Move(curr, prefixCharsToKeep, m_alphabet[i], next);
table.Move(curr, m_alphabet[i], next);
size_t nid;
@ -242,18 +277,18 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
}
}
LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors)
: LevenshteinDFA(MakeUniString(s), prefixCharsToKeep, maxErrors)
LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors)
: LevenshteinDFA(MakeUniString(s), prefixSize, {} /* prefixMisprints */, maxErrors)
{
}
LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t maxErrors)
: LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors)
: LevenshteinDFA(s, 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors)
{
}
LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t maxErrors)
: LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors)
: LevenshteinDFA(MakeUniString(s), 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors)
{
}

View file

@ -3,6 +3,7 @@
#include "base/string_utils.hpp"
#include <cstddef>
#include <string>
#include <vector>
namespace strings
@ -94,8 +95,9 @@ public:
LevenshteinDFA(LevenshteinDFA const &) = default;
LevenshteinDFA(LevenshteinDFA &&) = default;
LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors);
LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors);
LevenshteinDFA(UniString const & s, size_t prefixSize,
std::vector<UniString> const & prefixMisprints, size_t maxErrors);
LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors);
LevenshteinDFA(UniString const & s, size_t maxErrors);
LevenshteinDFA(std::string const & s, size_t maxErrors);

View file

@ -432,16 +432,27 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade)
TestCity chekhov(m2::PointD(0, 0), "Чеховъ Антонъ Павловичъ", "ru", 100 /* rank */);
TestStreet yesenina(
vector<m2::PointD>{m2::PointD(0.5, -0.5), m2::PointD(0, 0), m2::PointD(-0.5, 0.5)},
"Yesenina street", "en");
TestStreet pushkinskaya(
vector<m2::PointD>{m2::PointD(-0.5, -0.5), m2::PointD(0, 0), m2::PointD(0.5, 0.5)},
"Улица Пушкинская", "ru");
TestStreet ostrovskogo(
vector<m2::PointD>{m2::PointD(-0.5, 0.0), m2::PointD(0, 0), m2::PointD(0.5, 0.0)},
"улица Островского", "ru");
TestPOI lermontov(m2::PointD(0, 0), "Трактиръ Лермонтовъ", "ru");
lermontov.SetTypes({{"amenity", "cafe"}});
auto worldId = BuildWorld([&](TestMwmBuilder & builder) { builder.Add(chekhov); });
auto wonderlandId = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
builder.Add(yesenina);
builder.Add(pushkinskaya);
builder.Add(ostrovskogo);
builder.Add(lermontov);
});
@ -460,6 +471,14 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade)
checkErrors("кафе лермонтов", ErrorsMade(1));
checkErrors("трактир лермонтов", ErrorsMade(2));
checkErrors("кафе", ErrorsMade());
checkErrors("Yesenina cafe", ErrorsMade(0));
checkErrors("Esenina cafe", ErrorsMade(1));
checkErrors("Jesenina cafe", ErrorsMade(1));
checkErrors("Островского кафе", ErrorsMade(0));
checkErrors("Астровского кафе", ErrorsMade(1));
checkErrors("пушкенская трактир лермонтов", ErrorsMade(3));
checkErrors("пушкенская кафе", ErrorsMade(1));
checkErrors("пушкинская трактиръ лермонтовъ", ErrorsMade(0));

View file

@ -6,6 +6,20 @@
using namespace std;
namespace
{
vector<strings::UniString> const kAllowedMisprints = {
strings::MakeUniString("ckq"),
strings::MakeUniString("eyjiu"),
strings::MakeUniString("gh"),
strings::MakeUniString("pf"),
strings::MakeUniString("vw"),
strings::MakeUniString("ао"),
strings::MakeUniString("еиэ"),
strings::MakeUniString("шщ"),
};
} // namespace
namespace search
{
size_t GetMaxErrorsForToken(strings::UniString const & token)
@ -23,9 +37,9 @@ size_t GetMaxErrorsForToken(strings::UniString const & token)
strings::LevenshteinDFA BuildLevenshteinDFA(strings::UniString const & s)
{
// In search we use LevenshteinDFAs for fuzzy matching. But due to
// performance reasons, we assume that the first letter is always
// correct.
return strings::LevenshteinDFA(s, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(s));
// performance reasons, we limit prefix misprints to fixed set of substitutions defined in
// kAllowedMisprints and skipped letters.
return strings::LevenshteinDFA(s, 1 /* prefixSize */, kAllowedMisprints, GetMaxErrorsForToken(s));
}
MwmSet::MwmHandle FindWorld(Index const & index, vector<shared_ptr<MwmInfo>> const & infos)