From b2a8add9bccf37c606afe045a6a9f04c987c089f Mon Sep 17 00:00:00 2001 From: tatiana-yan Date: Fri, 28 Jun 2019 15:38:13 +0300 Subject: [PATCH] [search] Match street synonyms with misprints. --- generator/search_index_builder.cpp | 7 +- .../search_string_utils_test.cpp | 69 ++++++++++++----- indexer/search_string_utils.cpp | 11 ++- indexer/search_string_utils.hpp | 6 +- .../processor_test.cpp | 36 +++++++++ search/streets_matcher.cpp | 75 ++++++++++--------- 6 files changed, 145 insertions(+), 59 deletions(-) diff --git a/generator/search_index_builder.cpp b/generator/search_index_builder.cpp index d3ec4a0f33..104a4fce05 100644 --- a/generator/search_index_builder.cpp +++ b/generator/search_index_builder.cpp @@ -203,10 +203,9 @@ struct FeatureNameInserter if (m_hasStreetType) { - search::StreetTokensFilter filter([&](strings::UniString const & token, size_t /* tag */) - { - AddToken(lang, token); - }); + search::StreetTokensFilter filter( + [&](strings::UniString const & token, size_t /* tag */) { AddToken(lang, token); }, + false /* withMisprints */); for (auto const & token : tokens) filter.Put(token, false /* isPrefix */, 0 /* tag */); diff --git a/indexer/indexer_tests/search_string_utils_test.cpp b/indexer/indexer_tests/search_string_utils_test.cpp index 30463dbd27..5a273c62fb 100644 --- a/indexer/indexer_tests/search_string_utils_test.cpp +++ b/indexer/indexer_tests/search_string_utils_test.cpp @@ -18,16 +18,15 @@ namespace class Utf8StreetTokensFilter { public: - explicit Utf8StreetTokensFilter(vector> & cont) + explicit Utf8StreetTokensFilter(vector> & cont, bool withMisprints = false) : m_cont(cont) - , m_filter([&](UniString const & token, size_t tag) - { - m_cont.emplace_back(ToUtf8(token), tag); - }) + , m_filter( + [&](UniString const & token, size_t tag) { m_cont.emplace_back(ToUtf8(token), tag); }, + withMisprints) { } - inline void Put(string const & token, bool isPrefix, size_t tag) + void Put(string const & token, bool isPrefix, size_t tag) { m_filter.Put(MakeUniString(token), isPrefix, tag); } @@ -162,11 +161,11 @@ UNIT_TEST(StreetPrefixMatch) UNIT_TEST(StreetTokensFilter) { - using TList = vector>; + using List = vector>; { - TList expected = {}; - TList actual; + List expected = {}; + List actual; Utf8StreetTokensFilter filter(actual); filter.Put("ули", true /* isPrefix */, 0 /* tag */); @@ -175,8 +174,8 @@ UNIT_TEST(StreetTokensFilter) } { - TList expected = {}; - TList actual; + List expected = {}; + List actual; Utf8StreetTokensFilter filter(actual); filter.Put("улица", false /* isPrefix */, 0 /* tag */); @@ -185,8 +184,8 @@ UNIT_TEST(StreetTokensFilter) } { - TList expected = {{"генерала", 1}, {"антонова", 2}}; - TList actual; + List expected = {{"генерала", 1}, {"антонова", 2}}; + List actual; Utf8StreetTokensFilter filter(actual); filter.Put("ул", false /* isPrefix */, 0 /* tag */); @@ -197,8 +196,8 @@ UNIT_TEST(StreetTokensFilter) } { - TList expected = {{"улица", 100}, {"набережная", 50}}; - TList actual; + List expected = {{"улица", 100}, {"набережная", 50}}; + List actual; Utf8StreetTokensFilter filter(actual); filter.Put("улица", false /* isPrefix */, 100 /* tag */); @@ -208,8 +207,8 @@ UNIT_TEST(StreetTokensFilter) } { - TList expected = {{"улица", 0}, {"набережная", 1}, {"проспект", 2}}; - TList actual; + List expected = {{"улица", 0}, {"набережная", 1}, {"проспект", 2}}; + List actual; Utf8StreetTokensFilter filter(actual); filter.Put("улица", false /* isPrefix */, 0 /* tag */); @@ -218,6 +217,42 @@ UNIT_TEST(StreetTokensFilter) TEST_EQUAL(expected, actual, ()); } + + { + List expectedWithMisprints = {{"ленинский", 0}}; + List expectedWithoutMisprints = {{"ленинский", 0}, {"пропект", 1}}; + List actualWithMisprints; + List actualWithoutMisprints; + + Utf8StreetTokensFilter filterWithMisprints(actualWithMisprints, true /* withMisprints */); + Utf8StreetTokensFilter filterWithoutMisprints(actualWithoutMisprints, + false /* withMisprints */); + filterWithMisprints.Put("ленинский", false /* isPrefix */, 0 /* tag */); + filterWithoutMisprints.Put("ленинский", false /* isPrefix */, 0 /* tag */); + filterWithMisprints.Put("пропект", false /* isPrefix */, 1 /* tag */); + filterWithoutMisprints.Put("пропект", false /* isPrefix */, 1 /* tag */); + + TEST_EQUAL(expectedWithMisprints, actualWithMisprints, ()); + TEST_EQUAL(expectedWithoutMisprints, actualWithoutMisprints, ()); + } + + { + List expectedWithMisprints = {{"улица", 0}, {"набрежная", 1}}; + List expectedWithoutMisprints = {{"набрежная", 1}}; + List actualWithMisprints; + List actualWithoutMisprints; + + Utf8StreetTokensFilter filterWithMisprints(actualWithMisprints, true /* withMisprints */); + Utf8StreetTokensFilter filterWithoutMisprints(actualWithoutMisprints, + false /* withMisprints */); + filterWithMisprints.Put("улица", false /* isPrefix */, 0 /* tag */); + filterWithoutMisprints.Put("улица", false /* isPrefix */, 0 /* tag */); + filterWithMisprints.Put("набрежная", false /* isPrefix */, 1 /* tag */); + filterWithoutMisprints.Put("набрежная", false /* isPrefix */, 1 /* tag */); + + TEST_EQUAL(expectedWithMisprints, actualWithMisprints, ()); + TEST_EQUAL(expectedWithoutMisprints, actualWithoutMisprints, ()); + } } UNIT_TEST(NormalizeAndSimplifyString_Numero) diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp index 314aed12ea..2d5b5ece8c 100644 --- a/indexer/search_string_utils.cpp +++ b/indexer/search_string_utils.cpp @@ -435,7 +435,14 @@ bool ContainsNormalized(string const & str, string const & substr) // StreetTokensFilter ------------------------------------------------------------------------------ void StreetTokensFilter::Put(strings::UniString const & token, bool isPrefix, size_t tag) { - if ((isPrefix && IsStreetSynonymPrefix(token)) || (!isPrefix && IsStreetSynonym(token))) + using IsStreetChecker = std::function; + + IsStreetChecker isStreet = m_withMisprints ? IsStreetSynonymWithMisprints : IsStreetSynonym; + IsStreetChecker isStreetPrefix = + m_withMisprints ? IsStreetSynonymPrefixWithMisprints : IsStreetSynonymPrefix; + + auto const isStreetSynonym = isStreet(token); + if ((isPrefix && isStreetPrefix(token)) || (!isPrefix && isStreetSynonym)) { ++m_numSynonyms; if (m_numSynonyms == 1) @@ -446,7 +453,7 @@ void StreetTokensFilter::Put(strings::UniString const & token, bool isPrefix, si } // Do not emit delayed token for incomplete street synonym. - if ((!isPrefix || IsStreetSynonym(token)) && m_numSynonyms == 2) + if ((!isPrefix || isStreetSynonym) && m_numSynonyms == 2) EmitToken(m_delayedToken, m_delayedTag); } EmitToken(token, tag); diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index 87017c4c09..20415be63d 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -103,8 +103,9 @@ class StreetTokensFilter public: using Callback = std::function; - template - explicit StreetTokensFilter(TC && callback) : m_callback(std::forward(callback)) + template + StreetTokensFilter(C && callback, bool withMisprints) + : m_callback(std::forward(callback)), m_withMisprints(withMisprints) { } @@ -125,5 +126,6 @@ private: size_t m_numSynonyms = 0; Callback m_callback; + bool m_withMisprints = false; }; } // namespace search diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index b551a41b73..11fc40a90a 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -2018,5 +2018,41 @@ UNIT_CLASS_TEST(ProcessorTest, Strasse) checkNoErrors("xyz", rules); } } + +UNIT_CLASS_TEST(ProcessorTest, StreetSynonymsWithMisprints) +{ + string const countryName = "Wonderland"; + + TestStreet leninsky(vector{m2::PointD(0.0, -1.0), m2::PointD(0.0, 1.0)}, + "Ленинский проспект", "ru"); + TestStreet nabrezhnaya(vector{m2::PointD(1.0, -1.0), m2::PointD(1.0, 1.0)}, + "улица набрежная", "ru"); + TestStreet naberezhnaya(vector{m2::PointD(2.0, -1.0), m2::PointD(2.0, 1.0)}, + "улица набережная", "ru"); + + auto countryId = BuildCountry(countryName, [&](TestMwmBuilder & builder) { + builder.Add(leninsky); + builder.Add(nabrezhnaya); + builder.Add(naberezhnaya); + }); + + SetViewport(m2::RectD(m2::PointD(0.0, -1.0), m2::PointD(2.0, 1.0))); + { + Rules rules = {ExactMatch(countryId, leninsky)}; + TEST(ResultsMatch("ленинский проспект", rules), ()); + TEST(ResultsMatch("ленинский пропект", rules), ()); + TEST(ResultsMatch("ленинский", rules), ()); + } + { + Rules rules = {ExactMatch(countryId, nabrezhnaya), ExactMatch(countryId, naberezhnaya)}; + TEST(ResultsMatch("улица набрежная", rules), ()); + TEST(ResultsMatch("набрежная", rules), ()); + } + { + Rules rules = {ExactMatch(countryId, naberezhnaya)}; + TEST(ResultsMatch("улица набережная", rules), ()); + } +} + } // namespace } // namespace search diff --git a/search/streets_matcher.cpp b/search/streets_matcher.cpp index e65719aa31..e7cd4b4720 100644 --- a/search/streets_matcher.cpp +++ b/search/streets_matcher.cpp @@ -124,48 +124,55 @@ void StreetsMatcher::FindStreets(BaseContext const & ctx, FeaturesFilter const & } }; - StreetTokensFilter filter([&](strings::UniString const & /* token */, size_t tag) - { - auto buffer = streets.Intersect(ctx.m_features[tag].m_features); - if (tag < curToken) + auto findStreets = [&](bool withMisprints) + { + StreetTokensFilter filter([&](strings::UniString const & /* token */, size_t tag) { - // This is the case for delayed - // street synonym. Therefore, - // |streets| is temporarily in the - // incomplete state. + auto buffer = streets.Intersect(ctx.m_features[tag].m_features); + if (tag < curToken) + { + // This is the case for delayed + // street synonym. Therefore, + // |streets| is temporarily in the + // incomplete state. + streets = buffer; + all = all.Intersect(ctx.m_features[tag].m_features); + emptyIntersection = false; + + incomplete = true; + return; + } + ASSERT_EQUAL(tag, curToken, ()); + + // |streets| will become empty after + // the intersection. Therefore we need + // to create streets layer right now. + if (buffer.IsEmpty()) + emit(); + streets = buffer; all = all.Intersect(ctx.m_features[tag].m_features); emptyIntersection = false; + incomplete = false; + }, + withMisprints); - incomplete = true; - return; - } - ASSERT_EQUAL(tag, curToken, ()); + for (; curToken < ctx.m_numTokens && !ctx.IsTokenUsed(curToken) && !streets.IsEmpty(); + ++curToken) + { + auto const & token = params.GetToken(curToken).GetOriginal(); + bool const isPrefix = params.IsPrefixToken(curToken); - // |streets| will become empty after - // the intersection. Therefore we need - // to create streets layer right now. - if (buffer.IsEmpty()) - emit(); + if (house_numbers::LooksLikeHouseNumber(token, isPrefix)) + emit(); - streets = buffer; - all = all.Intersect(ctx.m_features[tag].m_features); - emptyIntersection = false; - incomplete = false; - }); + filter.Put(token, isPrefix, curToken); + } + emit(); + }; - for (; curToken < ctx.m_numTokens && !ctx.IsTokenUsed(curToken) && !streets.IsEmpty(); - ++curToken) - { - auto const & token = params.GetToken(curToken).GetOriginal(); - bool const isPrefix = params.IsPrefixToken(curToken); - - if (house_numbers::LooksLikeHouseNumber(token, isPrefix)) - emit(); - - filter.Put(token, isPrefix, curToken); - } - emit(); + findStreets(false /* withMisprints */); + findStreets(true /* withMisprints */); } } } // namespace search