[search] Match street synonyms with misprints.

This commit is contained in:
tatiana-yan 2019-06-28 15:38:13 +03:00 committed by mpimenov
parent 07c656942f
commit b2a8add9bc
6 changed files with 145 additions and 59 deletions

View file

@ -203,10 +203,9 @@ struct FeatureNameInserter
if (m_hasStreetType)
{
search::StreetTokensFilter filter([&](strings::UniString const & token, size_t /* tag */)
{
AddToken(lang, token);
});
search::StreetTokensFilter filter(
[&](strings::UniString const & token, size_t /* tag */) { AddToken(lang, token); },
false /* withMisprints */);
for (auto const & token : tokens)
filter.Put(token, false /* isPrefix */, 0 /* tag */);

View file

@ -18,16 +18,15 @@ namespace
class Utf8StreetTokensFilter
{
public:
explicit Utf8StreetTokensFilter(vector<pair<string, size_t>> & cont)
explicit Utf8StreetTokensFilter(vector<pair<string, size_t>> & cont, bool withMisprints = false)
: m_cont(cont)
, m_filter([&](UniString const & token, size_t tag)
{
m_cont.emplace_back(ToUtf8(token), tag);
})
, m_filter(
[&](UniString const & token, size_t tag) { m_cont.emplace_back(ToUtf8(token), tag); },
withMisprints)
{
}
inline void Put(string const & token, bool isPrefix, size_t tag)
void Put(string const & token, bool isPrefix, size_t tag)
{
m_filter.Put(MakeUniString(token), isPrefix, tag);
}
@ -162,11 +161,11 @@ UNIT_TEST(StreetPrefixMatch)
UNIT_TEST(StreetTokensFilter)
{
using TList = vector<pair<string, size_t>>;
using List = vector<pair<string, size_t>>;
{
TList expected = {};
TList actual;
List expected = {};
List actual;
Utf8StreetTokensFilter filter(actual);
filter.Put("ули", true /* isPrefix */, 0 /* tag */);
@ -175,8 +174,8 @@ UNIT_TEST(StreetTokensFilter)
}
{
TList expected = {};
TList actual;
List expected = {};
List actual;
Utf8StreetTokensFilter filter(actual);
filter.Put("улица", false /* isPrefix */, 0 /* tag */);
@ -185,8 +184,8 @@ UNIT_TEST(StreetTokensFilter)
}
{
TList expected = {{"генерала", 1}, {"антонова", 2}};
TList actual;
List expected = {{"генерала", 1}, {"антонова", 2}};
List actual;
Utf8StreetTokensFilter filter(actual);
filter.Put("ул", false /* isPrefix */, 0 /* tag */);
@ -197,8 +196,8 @@ UNIT_TEST(StreetTokensFilter)
}
{
TList expected = {{"улица", 100}, {"набережная", 50}};
TList actual;
List expected = {{"улица", 100}, {"набережная", 50}};
List actual;
Utf8StreetTokensFilter filter(actual);
filter.Put("улица", false /* isPrefix */, 100 /* tag */);
@ -208,8 +207,8 @@ UNIT_TEST(StreetTokensFilter)
}
{
TList expected = {{"улица", 0}, {"набережная", 1}, {"проспект", 2}};
TList actual;
List expected = {{"улица", 0}, {"набережная", 1}, {"проспект", 2}};
List actual;
Utf8StreetTokensFilter filter(actual);
filter.Put("улица", false /* isPrefix */, 0 /* tag */);
@ -218,6 +217,42 @@ UNIT_TEST(StreetTokensFilter)
TEST_EQUAL(expected, actual, ());
}
{
List expectedWithMisprints = {{"ленинский", 0}};
List expectedWithoutMisprints = {{"ленинский", 0}, {"пропект", 1}};
List actualWithMisprints;
List actualWithoutMisprints;
Utf8StreetTokensFilter filterWithMisprints(actualWithMisprints, true /* withMisprints */);
Utf8StreetTokensFilter filterWithoutMisprints(actualWithoutMisprints,
false /* withMisprints */);
filterWithMisprints.Put("ленинский", false /* isPrefix */, 0 /* tag */);
filterWithoutMisprints.Put("ленинский", false /* isPrefix */, 0 /* tag */);
filterWithMisprints.Put("пропект", false /* isPrefix */, 1 /* tag */);
filterWithoutMisprints.Put("пропект", false /* isPrefix */, 1 /* tag */);
TEST_EQUAL(expectedWithMisprints, actualWithMisprints, ());
TEST_EQUAL(expectedWithoutMisprints, actualWithoutMisprints, ());
}
{
List expectedWithMisprints = {{"улица", 0}, {"набрежная", 1}};
List expectedWithoutMisprints = {{"набрежная", 1}};
List actualWithMisprints;
List actualWithoutMisprints;
Utf8StreetTokensFilter filterWithMisprints(actualWithMisprints, true /* withMisprints */);
Utf8StreetTokensFilter filterWithoutMisprints(actualWithoutMisprints,
false /* withMisprints */);
filterWithMisprints.Put("улица", false /* isPrefix */, 0 /* tag */);
filterWithoutMisprints.Put("улица", false /* isPrefix */, 0 /* tag */);
filterWithMisprints.Put("набрежная", false /* isPrefix */, 1 /* tag */);
filterWithoutMisprints.Put("набрежная", false /* isPrefix */, 1 /* tag */);
TEST_EQUAL(expectedWithMisprints, actualWithMisprints, ());
TEST_EQUAL(expectedWithoutMisprints, actualWithoutMisprints, ());
}
}
UNIT_TEST(NormalizeAndSimplifyString_Numero)

View file

@ -435,7 +435,14 @@ bool ContainsNormalized(string const & str, string const & substr)
// StreetTokensFilter ------------------------------------------------------------------------------
void StreetTokensFilter::Put(strings::UniString const & token, bool isPrefix, size_t tag)
{
if ((isPrefix && IsStreetSynonymPrefix(token)) || (!isPrefix && IsStreetSynonym(token)))
using IsStreetChecker = std::function<bool(strings::UniString const &)>;
IsStreetChecker isStreet = m_withMisprints ? IsStreetSynonymWithMisprints : IsStreetSynonym;
IsStreetChecker isStreetPrefix =
m_withMisprints ? IsStreetSynonymPrefixWithMisprints : IsStreetSynonymPrefix;
auto const isStreetSynonym = isStreet(token);
if ((isPrefix && isStreetPrefix(token)) || (!isPrefix && isStreetSynonym))
{
++m_numSynonyms;
if (m_numSynonyms == 1)
@ -446,7 +453,7 @@ void StreetTokensFilter::Put(strings::UniString const & token, bool isPrefix, si
}
// Do not emit delayed token for incomplete street synonym.
if ((!isPrefix || IsStreetSynonym(token)) && m_numSynonyms == 2)
if ((!isPrefix || isStreetSynonym) && m_numSynonyms == 2)
EmitToken(m_delayedToken, m_delayedTag);
}
EmitToken(token, tag);

View file

@ -103,8 +103,9 @@ class StreetTokensFilter
public:
using Callback = std::function<void(strings::UniString const & token, size_t tag)>;
template <typename TC>
explicit StreetTokensFilter(TC && callback) : m_callback(std::forward<TC>(callback))
template <typename C>
StreetTokensFilter(C && callback, bool withMisprints)
: m_callback(std::forward<C>(callback)), m_withMisprints(withMisprints)
{
}
@ -125,5 +126,6 @@ private:
size_t m_numSynonyms = 0;
Callback m_callback;
bool m_withMisprints = false;
};
} // namespace search

View file

@ -2018,5 +2018,41 @@ UNIT_CLASS_TEST(ProcessorTest, Strasse)
checkNoErrors("xyz", rules);
}
}
// Exercises street search when either the street synonym in the query or the
// street name itself is written with a misprint.
UNIT_CLASS_TEST(ProcessorTest, StreetSynonymsWithMisprints)
{
  string const countryName = "Wonderland";

  // Three streets, each running from y = -1 to y = 1, at x = 0, 1 and 2.
  TestStreet leninskyProspekt(
      vector<m2::PointD>{m2::PointD(0.0, -1.0), m2::PointD(0.0, 1.0)},
      "Ленинский проспект", "ru");
  // Same as the next street, but with a letter missing from the name.
  TestStreet misspelledEmbankment(
      vector<m2::PointD>{m2::PointD(1.0, -1.0), m2::PointD(1.0, 1.0)},
      "улица набрежная", "ru");
  TestStreet embankment(
      vector<m2::PointD>{m2::PointD(2.0, -1.0), m2::PointD(2.0, 1.0)},
      "улица набережная", "ru");

  auto const addStreets = [&](TestMwmBuilder & builder)
  {
    builder.Add(leninskyProspekt);
    builder.Add(misspelledEmbankment);
    builder.Add(embankment);
  };
  auto countryId = BuildCountry(countryName, addStreets);

  // The viewport spans all three streets.
  m2::RectD const viewport(m2::PointD(0.0, -1.0), m2::PointD(2.0, 1.0));
  SetViewport(viewport);

  // A correctly spelled synonym, the misspelled synonym "пропект" and no
  // synonym at all are each expected to return only the first street.
  {
    Rules rules = {ExactMatch(countryId, leninskyProspekt)};
    TEST(ResultsMatch("ленинский проспект", rules), ());
    TEST(ResultsMatch("ленинский пропект", rules), ());
    TEST(ResultsMatch("ленинский", rules), ());
  }

  // The misspelled name, with or without "улица", is expected to return both
  // the misspelled street and the correctly spelled one.
  {
    Rules rules = {ExactMatch(countryId, misspelledEmbankment),
                   ExactMatch(countryId, embankment)};
    TEST(ResultsMatch("улица набрежная", rules), ());
    TEST(ResultsMatch("набрежная", rules), ());
  }

  // The correctly spelled name is expected to return only the correctly
  // spelled street.
  {
    Rules rules = {ExactMatch(countryId, embankment)};
    TEST(ResultsMatch("улица набережная", rules), ());
  }
}
} // namespace
} // namespace search

View file

@ -124,48 +124,55 @@ void StreetsMatcher::FindStreets(BaseContext const & ctx, FeaturesFilter const &
}
};
StreetTokensFilter filter([&](strings::UniString const & /* token */, size_t tag)
{
auto buffer = streets.Intersect(ctx.m_features[tag].m_features);
if (tag < curToken)
auto findStreets = [&](bool withMisprints)
{
StreetTokensFilter filter([&](strings::UniString const & /* token */, size_t tag)
{
// This is the case for delayed
// street synonym. Therefore,
// |streets| is temporarily in the
// incomplete state.
auto buffer = streets.Intersect(ctx.m_features[tag].m_features);
if (tag < curToken)
{
// This is the case for delayed
// street synonym. Therefore,
// |streets| is temporarily in the
// incomplete state.
streets = buffer;
all = all.Intersect(ctx.m_features[tag].m_features);
emptyIntersection = false;
incomplete = true;
return;
}
ASSERT_EQUAL(tag, curToken, ());
// |streets| will become empty after
// the intersection. Therefore we need
// to create streets layer right now.
if (buffer.IsEmpty())
emit();
streets = buffer;
all = all.Intersect(ctx.m_features[tag].m_features);
emptyIntersection = false;
incomplete = false;
},
withMisprints);
incomplete = true;
return;
}
ASSERT_EQUAL(tag, curToken, ());
for (; curToken < ctx.m_numTokens && !ctx.IsTokenUsed(curToken) && !streets.IsEmpty();
++curToken)
{
auto const & token = params.GetToken(curToken).GetOriginal();
bool const isPrefix = params.IsPrefixToken(curToken);
// |streets| will become empty after
// the intersection. Therefore we need
// to create streets layer right now.
if (buffer.IsEmpty())
emit();
if (house_numbers::LooksLikeHouseNumber(token, isPrefix))
emit();
streets = buffer;
all = all.Intersect(ctx.m_features[tag].m_features);
emptyIntersection = false;
incomplete = false;
});
filter.Put(token, isPrefix, curToken);
}
emit();
};
for (; curToken < ctx.m_numTokens && !ctx.IsTokenUsed(curToken) && !streets.IsEmpty();
++curToken)
{
auto const & token = params.GetToken(curToken).GetOriginal();
bool const isPrefix = params.IsPrefixToken(curToken);
if (house_numbers::LooksLikeHouseNumber(token, isPrefix))
emit();
filter.Put(token, isPrefix, curToken);
}
emit();
findStreets(false /* withMisprints */);
findStreets(true /* withMisprints */);
}
}
} // namespace search