[search] Fix street synonyms with misprints matching.

This commit is contained in:
tatiana-yan 2019-07-02 14:37:44 +03:00 committed by mpimenov
parent b71fb6a256
commit 12583814f0
3 changed files with 196 additions and 40 deletions

View file

@ -560,9 +560,9 @@ UNIT_CLASS_TEST(ProcessorTest, TestRankingInfo_ErrorsMade)
checkErrors("трактир лермонтов", ErrorsMade(2));
checkErrors("кафе", ErrorsMade());
checkErrors("Yesenina cafe", ErrorsMade(0));
checkErrors("Esenina cafe", ErrorsMade(1));
checkErrors("Jesenina cafe", ErrorsMade(1));
checkErrors("Cafe Yesenina", ErrorsMade(0));
checkErrors("Cafe Esenina", ErrorsMade(1));
checkErrors("Cafe Jesenina", ErrorsMade(1));
checkErrors("Островского кафе", ErrorsMade(0));
checkErrors("Астровского кафе", ErrorsMade(1));
@ -2052,5 +2052,106 @@ UNIT_CLASS_TEST(ProcessorTest, StreetSynonymsWithMisprints)
}
}
// Checks that a house is found by "<street> <house number>" queries when the
// street-type word (boulevard / avenue / проспект) is omitted, spelled in full,
// misspelled or shortened.
UNIT_CLASS_TEST(ProcessorTest, HouseOnStreetSynonymsWithMisprints)
{
string const countryName = "Wonderland";
// Three parallel streets at x = 1.0, 0.0 and -1.0. The first two have English
// names, the third one has a Russian name.
TestStreet tverskoi(vector<m2::PointD>{m2::PointD(1.0, -1.0), m2::PointD(1.0, 1.0)},
"Tverskoi Boulevard", "en");
TestStreet leninsky(vector<m2::PointD>{m2::PointD(0.0, -1.0), m2::PointD(0.0, 1.0)},
"Leninsky Avenue", "en");
TestStreet mira(vector<m2::PointD>{m2::PointD(-1.0, -1.0), m2::PointD(-1.0, 1.0)},
"Проспект Мира", "ru");
// One nameless POI per street, placed at the middle point of the street and
// bound to it through the house number and the street name.
TestPOI houseTverskoi(m2::PointD(1.0, 0.0), "", "en");
houseTverskoi.SetHouseNumber("3");
houseTverskoi.SetStreetName(tverskoi.GetName("en"));
TestPOI houseLeninsky(m2::PointD(0.0, 0.0), "", "en");
houseLeninsky.SetHouseNumber("5");
houseLeninsky.SetStreetName(leninsky.GetName("en"));
TestPOI houseMira(m2::PointD(-1.0, 0.0), "", "en");
houseMira.SetHouseNumber("7");
houseMira.SetStreetName(mira.GetName("ru"));
auto countryId = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
builder.Add(tverskoi);
builder.Add(leninsky);
builder.Add(mira);
builder.Add(houseTverskoi);
builder.Add(houseLeninsky);
builder.Add(houseMira);
});
// Runs |query| once and returns true when the results satisfy |rules1| or,
// failing that, |rules2|.
auto alternativeMatch = [this](string const & query, Rules const & rules1, Rules const & rules2) {
TestSearchRequest request(m_engine, query, "en", Mode::Everywhere, m_viewport);
request.Run();
return MatchResults(m_dataSource, rules1, request.Results()) ||
MatchResults(m_dataSource, rules2, request.Results());
};
// NOTE(review): |alternativeMatch| reads m_viewport at call time; presumably
// SetViewport() updates it, so the requests below use this rect - confirm.
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
// In every scope below two outcomes are accepted: the house alone, or the
// house together with its street.
{
Rules rules = {ExactMatch(countryId, houseTverskoi)};
Rules rulesWithStreet = {ExactMatch(countryId, houseTverskoi), ExactMatch(countryId, tverskoi)};
// Street name alone, full "boulevard", then misspelled and shortened variants.
TEST(alternativeMatch("tverskoi 3", rules, rulesWithStreet), ());
TEST(alternativeMatch("tverskoi boulevard 3", rules, rulesWithStreet), ());
TEST(alternativeMatch("tverskoi bulevard 3", rules, rulesWithStreet), ());
TEST(alternativeMatch("tverskoi blvd 3", rules, rulesWithStreet), ());
TEST(alternativeMatch("tverskoi blvrd 3", rules, rulesWithStreet), ());
TEST(alternativeMatch("tverskoi boulevrd 3", rules, rulesWithStreet), ());
TEST(alternativeMatch("tverskoi bolevard 3", rules, rulesWithStreet), ());
}
{
Rules rules = {ExactMatch(countryId, houseLeninsky)};
Rules rulesWithStreet = {ExactMatch(countryId, houseLeninsky), ExactMatch(countryId, leninsky)};
// Street name alone, full "avenue", then misspelled and shortened variants.
TEST(alternativeMatch("leninsky 5", rules, rulesWithStreet), ());
TEST(alternativeMatch("leninsky avenue 5", rules, rulesWithStreet), ());
TEST(alternativeMatch("leninsky avenu 5", rules, rulesWithStreet), ());
TEST(alternativeMatch("leninsky avneue 5", rules, rulesWithStreet), ());
TEST(alternativeMatch("leninsky av 5", rules, rulesWithStreet), ());
}
{
Rules rules = {ExactMatch(countryId, houseMira)};
Rules rulesWithStreet = {ExactMatch(countryId, houseMira), ExactMatch(countryId, mira)};
// Same checks for the Russian street; here the street-type word comes first
// in the query.
TEST(alternativeMatch("мира 7", rules, rulesWithStreet), ());
TEST(alternativeMatch("проспект мира 7", rules, rulesWithStreet), ());
TEST(alternativeMatch("пропект мира 7", rules, rulesWithStreet), ());
TEST(alternativeMatch("прсопект мира 7", rules, rulesWithStreet), ());
TEST(alternativeMatch("пр-т мира 7", rules, rulesWithStreet), ());
}
}
// Checks that a cafe lying on "Yesenina street" is still found by queries that
// combine the street name with the word "cafe", including the case when "cafe"
// is the last token typed without a trailing space.
UNIT_CLASS_TEST(ProcessorTest, StreetSynonymPrefixMatch)
{
string const countryName = "Wonderland";
// The street polyline passes through (0, 0), the point where the cafe is placed.
TestStreet yesenina(
vector<m2::PointD>{m2::PointD(0.5, -0.5), m2::PointD(0, 0), m2::PointD(-0.5, 0.5)},
"Yesenina street", "en");
// Nameless POI that is identified only by its amenity-cafe type.
TestPOI cafe(m2::PointD(0, 0), "", "en");
cafe.SetTypes({{"amenity", "cafe"}});
auto countryId = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
builder.Add(yesenina);
builder.Add(cafe);
});
SetViewport(m2::RectD(-1, -1, 1, 1));
{
// For these queries the cafe must be the only result.
// NOTE(review): the trailing space in the first two queries presumably marks
// the last token as complete rather than as a prefix - confirm.
Rules rules = {ExactMatch(countryId, cafe)};
TEST(ResultsMatch("Yesenina cafe ", rules), ());
TEST(ResultsMatch("Cafe Yesenina ", rules), ());
TEST(ResultsMatch("Cafe Yesenina", rules), ());
}
{
Rules rules = {ExactMatch(countryId, cafe), ExactMatch(countryId, yesenina)};
// Prefix match with misprints to street synonym gives street as additional result
// but we still can find the cafe.
TEST(ResultsMatch("Yesenina cafe", rules), ());
}
}
} // namespace
} // namespace search

View file

@ -16,11 +16,43 @@ namespace search
{
namespace
{
bool LessByHash(StreetsMatcher::Prediction const & lhs, StreetsMatcher::Prediction const & rhs)
// Ordering used to group predictions that share the same features hash and
// the same token range (number of tokens and first token). Inside such a
// group the more probable prediction comes first and, on equal probability,
// the one matched without misprints comes first.
bool LessByHashAndRange(StreetsMatcher::Prediction const & lhs,
                        StreetsMatcher::Prediction const & rhs)
{
  // Keys are examined one by one; the first key that differs decides the order.
  return lhs.m_hash != rhs.m_hash
             ? lhs.m_hash < rhs.m_hash                              // hash, ascending
         : lhs.GetNumTokens() != rhs.GetNumTokens()
             ? lhs.GetNumTokens() > rhs.GetNumTokens()              // longer range first
         : lhs.m_tokenRange.Begin() != rhs.m_tokenRange.Begin()
             ? lhs.m_tokenRange.Begin() < rhs.m_tokenRange.Begin()  // earlier start first
         : lhs.m_prob != rhs.m_prob
             ? lhs.m_prob > rhs.m_prob                              // more probable first
             : (!lhs.m_withMisprints && rhs.m_withMisprints);       // exact match first
}
// Returns true when both predictions cover the same token range (same number
// of tokens, same first token) and have the same features hash.
bool EqualsByHashAndRange(StreetsMatcher::Prediction const & lhs,
                          StreetsMatcher::Prediction const & rhs)
{
  if (lhs.GetNumTokens() != rhs.GetNumTokens())
    return false;

  if (lhs.m_tokenRange.Begin() != rhs.m_tokenRange.Begin())
    return false;

  return lhs.m_hash == rhs.m_hash;
}
bool LessByHashAndMisprints(StreetsMatcher::Prediction const & lhs,
StreetsMatcher::Prediction const & rhs)
{
if (lhs.m_hash != rhs.m_hash)
return lhs.m_hash < rhs.m_hash;
if (lhs.m_withMisprints != rhs.m_withMisprints)
return rhs.m_withMisprints;
if (lhs.m_prob != rhs.m_prob)
return lhs.m_prob > rhs.m_prob;
@ -29,6 +61,12 @@ bool LessByHash(StreetsMatcher::Prediction const & lhs, StreetsMatcher::Predicti
return lhs.m_tokenRange.Begin() < rhs.m_tokenRange.Begin();
}
// Returns true when both predictions agree on the misprints flag and have the
// same features hash; token range and probability are not compared.
bool EqualsByHashAndMisprints(StreetsMatcher::Prediction const & lhs,
                              StreetsMatcher::Prediction const & rhs)
{
  bool const sameMisprintsFlag = lhs.m_withMisprints == rhs.m_withMisprints;
  bool const sameFeatures = lhs.m_hash == rhs.m_hash;
  return sameMisprintsFlag && sameFeatures;
}
} // namespace
// static
@ -44,10 +82,25 @@ void StreetsMatcher::Go(BaseContext const & ctx, FeaturesFilter const & filter,
if (predictions.empty())
return;
sort(predictions.begin(), predictions.end(), &LessByHash);
predictions.erase(
unique(predictions.begin(), predictions.end(), base::EqualsBy(&Prediction::m_hash)),
predictions.end());
// Remove predictions with the same m_hash (features) and token range.
sort(predictions.begin(), predictions.end(), &LessByHashAndRange);
predictions.erase(unique(predictions.begin(), predictions.end(), &EqualsByHashAndRange),
predictions.end());
// Leave the most probable and longest prediction for predictions with the same m_hash (features)
// and m_withMisprints.
// We will still distinguish parses with the same m_hash (features) but different range and m_withMisprints.
// For example, for "Paramount dive" we will have two parses:
// STREET UNUSED (can be matched to poi later)
// Paramount dive
//
// STREET STREET ("drive" with misprints)
// Paramount dive
//
// The parses will have the same features and hash but we need both of them.
sort(predictions.begin(), predictions.end(), &LessByHashAndMisprints);
predictions.erase(unique(predictions.begin(), predictions.end(), &EqualsByHashAndMisprints),
predictions.end());
sort(predictions.rbegin(), predictions.rend(), base::LessBy(&Prediction::m_prob));
while (predictions.size() > kMaxNumOfImprobablePredictions &&
@ -66,33 +119,8 @@ void StreetsMatcher::FindStreets(BaseContext const & ctx, FeaturesFilter const &
if (ctx.IsTokenUsed(startToken))
continue;
// Here we try to match as many tokens as possible while
// intersection is a non-empty bit vector of streets. Single
// tokens that are synonyms to streets are ignored. Moreover,
// each time a token that looks like a beginning of a house number
// is met, we try to use current intersection of tokens as a
// street layer and try to match BUILDINGs or POIs.
CBV streets(ctx.m_streets);
CBV all;
all.SetFull();
size_t curToken = startToken;
// This variable is used for prevention of duplicate calls to
// CreateStreetsLayerAndMatchLowerLayers() with the same
// arguments.
size_t lastToken = startToken;
// When true, no bit vectors were intersected with |streets| at all.
bool emptyIntersection = true;
// When true, |streets| is in the incomplete state and can't be
// used for creation of street layers.
bool incomplete = false;
auto emit = [&]()
{
auto emit = [&](CBV const & streets, CBV const & all, size_t curToken, size_t lastToken,
bool emptyIntersection, bool incomplete, bool withMisprints) {
if (!streets.IsEmpty() && !emptyIntersection && !incomplete && lastToken != curToken)
{
CBV fs(streets);
@ -121,11 +149,36 @@ void StreetsMatcher::FindStreets(BaseContext const & ctx, FeaturesFilter const &
prediction.m_features = move(fs);
prediction.m_hash = prediction.m_features.Hash();
prediction.m_withMisprints = withMisprints;
}
};
auto findStreets = [&](bool withMisprints)
{
auto findStreets = [&](bool withMisprints) {
// Here we try to match as many tokens as possible while
// intersection is a non-empty bit vector of streets. Single
// tokens that are synonyms to streets are ignored. Moreover,
// each time a token that looks like a beginning of a house number
// is met, we try to use current intersection of tokens as a
// street layer and try to match BUILDINGs or POIs.
CBV streets(ctx.m_streets);
CBV all;
all.SetFull();
size_t curToken = startToken;
// This variable is used for prevention of duplicate calls to
// CreateStreetsLayerAndMatchLowerLayers() with the same
// arguments.
size_t lastToken = startToken;
// When true, no bit vectors were intersected with |streets| at all.
bool emptyIntersection = true;
// When true, |streets| is in the incomplete state and can't be
// used for creation of street layers.
bool incomplete = false;
StreetTokensFilter filter([&](strings::UniString const & /* token */, size_t tag)
{
auto buffer = streets.Intersect(ctx.m_features[tag].m_features);
@ -148,7 +201,8 @@ void StreetsMatcher::FindStreets(BaseContext const & ctx, FeaturesFilter const &
// the intersection. Therefore we need
// to create streets layer right now.
if (buffer.IsEmpty())
emit();
emit(streets, all, curToken, lastToken, emptyIntersection,
incomplete, withMisprints);
streets = buffer;
all = all.Intersect(ctx.m_features[tag].m_features);
@ -164,11 +218,11 @@ void StreetsMatcher::FindStreets(BaseContext const & ctx, FeaturesFilter const &
bool const isPrefix = params.IsPrefixToken(curToken);
if (house_numbers::LooksLikeHouseNumber(token, isPrefix))
emit();
emit(streets, all, curToken, lastToken, emptyIntersection, incomplete, withMisprints);
filter.Put(token, isPrefix, curToken);
}
emit();
emit(streets, all, curToken, lastToken, emptyIntersection, incomplete, withMisprints);
};
findStreets(false /* withMisprints */);

View file

@ -22,6 +22,7 @@ public:
CBV m_features;
TokenRange m_tokenRange;
bool m_withMisprints = false;
double m_prob = 0.0;
uint64_t m_hash = 0;
};