From 683250079ac00a81920ed9cc101239cee4ed2f9c Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Thu, 16 Jun 2016 17:25:36 +0300 Subject: [PATCH] [search] Fixed streets matching. --- generator/search_index_builder.cpp | 34 ++----- .../search_string_utils_test.cpp | 70 ++++++++++++++ indexer/search_string_utils.cpp | 23 +++++ indexer/search_string_utils.hpp | 33 +++++++ search/geocoder.cpp | 91 ++++++++++++------- search/geocoder.hpp | 4 +- .../processor_test.cpp | 17 ++++ 7 files changed, 212 insertions(+), 60 deletions(-) diff --git a/generator/search_index_builder.cpp b/generator/search_index_builder.cpp index 49071e2019..d94930010e 100644 --- a/generator/search_index_builder.cpp +++ b/generator/search_index_builder.cpp @@ -176,33 +176,19 @@ public: tokens.resize(maxTokensCount); } - // Streets are a special case: we do not add the token "street" and its - // synonyms when the feature's name contains it because in - // the search phase this part of the query will be matched against the - // "street" in the categories branch of the search index. - // However, we still add it when there are two or more street tokens - // ("avenue st", "улица набережная"). - - size_t const tokensCount = tokens.size(); - size_t numStreetTokens = 0; - vector isStreet(tokensCount); - for (size_t i = 0; i < tokensCount; ++i) + if (m_hasStreetType) { - if (search::IsStreetSynonym(tokens[i])) - { - isStreet[i] = true; - ++numStreetTokens; - } + search::StreetTokensFilter filter([&](strings::UniString const & token, size_t /* tag */) + { + AddToken(lang, token); + }); + for (auto const & token : tokens) + filter.Put(token, false /* isPrefix */, 0 /* tag */); } - - for (size_t i = 0; i < tokensCount; ++i) + else { - if (numStreetTokens == 1 && isStreet[i] && m_hasStreetType) - { - //LOG(LDEBUG, ("Skipping token:", tokens[i], "in", name)); - continue; - } - AddToken(lang, tokens[i]); + for (auto const & token : tokens) + AddToken(lang, token); } return true; diff --git a/indexer/indexer_tests/search_string_utils_test.cpp b/indexer/indexer_tests/search_string_utils_test.cpp index 27f8ef7cb3..d02015469d 100644 --- a/indexer/indexer_tests/search_string_utils_test.cpp +++ b/indexer/indexer_tests/search_string_utils_test.cpp @@ -9,6 +9,28 @@ using namespace strings; namespace { +class Utf8StreetTokensFilter +{ +public: + Utf8StreetTokensFilter(vector> & cont) + : m_cont(cont) + , m_filter([&](UniString const & token, size_t tag) + { + m_cont.emplace_back(ToUtf8(token), tag); + }) + { + } + + inline void Put(string const & token, bool isPrefix, size_t tag) + { + m_filter.Put(MakeUniString(token), isPrefix, tag); + } + +private: + vector> & m_cont; + StreetTokensFilter m_filter; +}; + bool TestStreetPrefixMatch(char const * s) { return IsStreetSynonymPrefix(MakeUniString(s)); @@ -74,3 +96,51 @@ UNIT_TEST(StreetPrefixMatch) TEST(TestStreetPrefixMatch("проезд"), ()); TEST(!TestStreetPrefixMatch("проездд"), ()); } + +UNIT_TEST(StreetTokensFilter) +{ + using TList = vector>; + + { + TList expected = {}; + TList actual; + + Utf8StreetTokensFilter filter(actual); + filter.Put("ули", true /* isPrefix */, 0 /* tag */); + + TEST_EQUAL(expected, actual, ()); + } + + { + TList expected = {}; + TList actual; + + Utf8StreetTokensFilter filter(actual); + filter.Put("улица", false /* isPrefix */, 0 /* tag */); + + TEST_EQUAL(expected, actual, ()); + } + + { + TList expected = {{"генерала", 1}, {"антонова", 2}}; + TList actual; + + Utf8StreetTokensFilter filter(actual); + filter.Put("ул", false /* isPrefix */, 0 /* tag */); + filter.Put("генерала", false /* isPrefix */, 1 /* tag */); + filter.Put("антонова", false /* isPrefix */, 2 /* tag */); + + TEST_EQUAL(expected, actual, ()); + } + + { + TList expected = {{"улица", 100}, {"набережная", 50}}; + TList actual; + + Utf8StreetTokensFilter filter(actual); + filter.Put("улица", false /* isPrefix */, 100 /* tag */); + filter.Put("набережная", true /* isPrefix */, 50 /* tag */); + + TEST_EQUAL(expected, actual, ()); + } +} diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp index 24aaaa3764..ade404f93d 100644 --- a/indexer/search_string_utils.cpp +++ b/indexer/search_string_utils.cpp @@ -223,4 +223,27 @@ bool ContainsNormalized(string const & str, string const & substr) UniString const usubstr = NormalizeAndSimplifyString(substr); return std::search(ustr.begin(), ustr.end(), usubstr.begin(), usubstr.end()) != ustr.end(); } + +// StreetTokensFilter ------------------------------------------------------------------------------ +void StreetTokensFilter::Put(strings::UniString const & token, bool isPrefix, size_t tag) +{ + if ((isPrefix && IsStreetSynonymPrefix(token)) || (!isPrefix && IsStreetSynonym(token))) + { + ++m_numSynonyms; + if (m_numSynonyms == 1) + { + m_delayedToken = token; + m_delayedTag = tag; + } + else + { + EmitToken(m_delayedToken, m_delayedTag); + EmitToken(token, tag); + } + } + else + { + EmitToken(token, tag); + } +} } // namespace search diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index 87e529e0d0..46d93208cb 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -54,4 +54,37 @@ bool IsStreetSynonymPrefix(strings::UniString const & s); /// Normalizes both str and substr, and then returns true if substr is found in str. /// Used in native platform code for search in localized strings (cuisines, categories, strings etc.). bool ContainsNormalized(string const & str, string const & substr); + +// This class can be used as a filter for street tokens. As there can +// be street synonyms in the street name, single street synonym is +// skipped, but multiple synonyms are left as is. +class StreetTokensFilter +{ +public: + using TCallback = function; + + template + StreetTokensFilter(TC && callback) + : m_callback(forward(callback)) + { + } + + // Puts token to the filter. Filter checks following cases: + // * if |token| is the first street synonym met so far, it's delayed + // * if |token| is a street synonym, but not the first, callback is called + // for the |token| and for the previously delayed token + // * if |token| is not a street synonym, callback is called for the |token| + void Put(strings::UniString const & token, bool isPrefix, size_t tag); + +private: + using TCell = pair; + + inline void EmitToken(strings::UniString const & token, size_t tag) { m_callback(token, tag); } + + strings::UniString m_delayedToken; + size_t m_delayedTag = 0; + size_t m_numSynonyms = 0; + + TCallback m_callback; +}; } // namespace search diff --git a/search/geocoder.cpp b/search/geocoder.cpp index 1f0ac2b8fa..1cafb50e73 100644 --- a/search/geocoder.cpp +++ b/search/geocoder.cpp @@ -466,6 +466,8 @@ void Geocoder::SetParams(Params const & params) } } + LOG(LDEBUG, ("Tokens = ", m_params.m_tokens)); + LOG(LDEBUG, ("Prefix tokens = ", m_params.m_prefixTokens)); LOG(LDEBUG, ("Languages =", m_params.m_langs)); } @@ -1075,61 +1077,82 @@ void Geocoder::GreedilyMatchStreets() continue; // Here we try to match as many tokens as possible while - // intersection is a non-empty bit vector of streets. All tokens - // that are synonyms to streets are ignored. Moreover, each time - // a token that looks like a beginning of a house number is met, - // we try to use current intersection of tokens as a street layer - // and try to match buildings or pois. - unique_ptr allFeatures; + // intersection is a non-empty bit vector of streets. Single + // tokens that are synonyms to streets are ignored. Moreover, + // each time a token that looks like a beginning of a house number + // is met, we try to use current intersection of tokens as a + // street layer and try to match BUILDINGs or POIs. + CBVPtr allFeatures(m_streets, false /* isOwner */); size_t curToken = startToken; // This variable is used for prevention of duplicate calls to // CreateStreetsLayerAndMatchLowerLayers() with the same // arguments. - size_t lastStopToken = curToken; + size_t lastToken = startToken; - for (; curToken < m_numTokens && !m_usedTokens[curToken]; ++curToken) + bool emptyIntersection = true; + bool incomplete = false; + + auto createStreetsLayerAndMatchLowerLayers = [&]() + { + if (!allFeatures.IsEmpty() && !emptyIntersection && !incomplete && lastToken != curToken) + { + CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, *allFeatures); + lastToken = curToken; + } + }; + + StreetTokensFilter filter([&](strings::UniString const & /* token */, size_t tag) + { + auto buffer = coding::CompressedBitVector::Intersect( + *allFeatures, *m_addressFeatures[tag]); + if (tag < curToken) + { + allFeatures.Set(move(buffer)); + emptyIntersection = false; + incomplete = true; + return; + } + ASSERT_EQUAL(tag, curToken, ()); + + // |allFeatures| will become empty + // after the intersection. Therefore + // we need to create streets layer + // right now. + if (coding::CompressedBitVector::IsEmpty(buffer)) + createStreetsLayerAndMatchLowerLayers(); + + allFeatures.Set(move(buffer)); + emptyIntersection = false; + incomplete = false; + }); + + for (; curToken < m_numTokens && !m_usedTokens[curToken] && !allFeatures.IsEmpty(); ++curToken) { auto const & token = m_params.GetTokens(curToken).front(); - if (IsStreetSynonymPrefix(token)) - continue; - bool const isPrefix = curToken >= m_params.m_tokens.size(); + if (house_numbers::LooksLikeHouseNumber(token, isPrefix)) - { - CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures); - lastStopToken = curToken; - } + createStreetsLayerAndMatchLowerLayers(); - unique_ptr buffer; - if (startToken == curToken || coding::CompressedBitVector::IsEmpty(allFeatures)) - buffer = coding::CompressedBitVector::Intersect(*m_streets, *m_addressFeatures[curToken]); - else - buffer = coding::CompressedBitVector::Intersect(*allFeatures, *m_addressFeatures[curToken]); - - if (coding::CompressedBitVector::IsEmpty(buffer)) - break; - - allFeatures.swap(buffer); + filter.Put(token, isPrefix, curToken); } - - if (curToken != lastStopToken) - CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures); + createStreetsLayerAndMatchLowerLayers(); } } void Geocoder::CreateStreetsLayerAndMatchLowerLayers( - size_t startToken, size_t endToken, unique_ptr const & features) + size_t startToken, size_t endToken, coding::CompressedBitVector const & features) { ASSERT(m_layers.empty(), ()); - if (coding::CompressedBitVector::IsEmpty(features)) + if (coding::CompressedBitVector::IsEmpty(&features)) return; - CBVPtr filtered(features.get(), false /* isOwner */); - if (m_filter->NeedToFilter(*features)) - filtered.Set(m_filter->Filter(*features).release(), true /* isOwner */); + CBVPtr filtered(&features, false /* isOwner */); + if (m_filter->NeedToFilter(features)) + filtered.Set(m_filter->Filter(features).release(), true /* isOwner */); m_layers.emplace_back(); MY_SCOPE_GUARD(cleanupGuard, bind(&vector::pop_back, &m_layers)); @@ -1138,7 +1161,7 @@ void Geocoder::CreateStreetsLayerAndMatchLowerLayers( InitLayer(SearchModel::SEARCH_TYPE_STREET, startToken, endToken, layer); vector sortedFeatures; - sortedFeatures.reserve(features->PopCount()); + sortedFeatures.reserve(features.PopCount()); filtered.ForEach(MakeBackInsertFunctor(sortedFeatures)); layer.m_sortedFeatures = &sortedFeatures; diff --git a/search/geocoder.hpp b/search/geocoder.hpp index 439784a742..db8e905b98 100644 --- a/search/geocoder.hpp +++ b/search/geocoder.hpp @@ -245,8 +245,8 @@ private: // then performs geocoding in street vicinities. void GreedilyMatchStreets(); - void CreateStreetsLayerAndMatchLowerLayers( - size_t startToken, size_t endToken, unique_ptr const & features); + void CreateStreetsLayerAndMatchLowerLayers(size_t startToken, size_t endToken, + coding::CompressedBitVector const & features); // Tries to find all paths in a search tree, where each edge is // marked with some substring of the query tokens. These paths are diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index 89c9032634..419ed0025b 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -59,6 +59,8 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke) TestCity losAlamosCity(m2::PointD(10, 10), "Los Alamos", "en", 100 /* rank */); TestCity mskCity(m2::PointD(0, 0), "Moscow", "en", 100 /* rank */); + TestCity torontoCity(m2::PointD(-10, -10), "Toronto", "en", 100 /* rank */); + TestVillage longPondVillage(m2::PointD(15, 15), "Long Pond Village", "en", 10 /* rank */); TestStreet feynmanStreet( vector{m2::PointD(9.999, 9.999), m2::PointD(10, 10), m2::PointD(10.001, 10.001)}, @@ -94,16 +96,23 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke) TestPOI lantern1(m2::PointD(10.0005, 10.0005), "lantern 1", "en"); TestPOI lantern2(m2::PointD(10.0006, 10.0005), "lantern 2", "en"); + TestStreet stradaDrive(vector{m2::PointD(-10.001, -10.001), m2::PointD(-10, -10), + m2::PointD(-9.999, -9.999)}, + "Strada drive", "en"); + TestBuilding terranceHouse(m2::PointD(-10, -10), "", "155", stradaDrive, "en"); + BuildWorld([&](TestMwmBuilder & builder) { builder.Add(wonderlandCountry); builder.Add(losAlamosCity); builder.Add(mskCity); + builder.Add(torontoCity); }); auto wonderlandId = BuildCountry(countryName, [&](TestMwmBuilder & builder) { builder.Add(losAlamosCity); builder.Add(mskCity); + builder.Add(torontoCity); builder.Add(longPondVillage); builder.Add(feynmanStreet); @@ -125,6 +134,9 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke) builder.Add(quantumCafe); builder.Add(lantern1); builder.Add(lantern2); + + builder.Add(stradaDrive); + builder.Add(terranceHouse); }); SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0))); @@ -190,6 +202,11 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke) TRules rules = {ExactMatch(wonderlandId, bornHouse)}; TEST(ResultsMatch("long pond 1st april street 8", rules), ()); } + + { + TRules rules = {ExactMatch(wonderlandId, terranceHouse)}; + TEST(ResultsMatch("Toronto strada drive 155", rules), ()); + } } UNIT_CLASS_TEST(ProcessorTest, SearchInWorld)