From ca90764d85352aad99f01ce9fa1682ab258876bb Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Fri, 29 Jan 2016 15:34:04 +0300 Subject: [PATCH] [search] Fixed greedy street matching. --- base/buffer_vector.hpp | 6 ++ indexer/feature_impl.cpp | 4 + indexer/feature_impl.hpp | 2 + search/search_string_utils.cpp | 2 +- .../house_numbers_matcher_test.cpp | 17 ++++ search/v2/features_layer.cpp | 2 +- search/v2/features_layer.hpp | 4 +- search/v2/features_layer_matcher.hpp | 9 +- search/v2/features_layer_path_finder.cpp | 4 +- search/v2/geocoder.cpp | 87 +++++++++++++------ search/v2/geocoder.hpp | 3 + search/v2/house_numbers_matcher.cpp | 18 ++-- search/v2/house_numbers_matcher.hpp | 7 +- 13 files changed, 116 insertions(+), 49 deletions(-) diff --git a/base/buffer_vector.hpp b/base/buffer_vector.hpp index a8b121e145..fbf6a7d928 100644 --- a/base/buffer_vector.hpp +++ b/base/buffer_vector.hpp @@ -100,6 +100,12 @@ public: return *this; } + template + void append(buffer_vector const & v) + { + append(v.begin(), v.end()); + } + template void append(IterT beg, IterT end) { diff --git a/indexer/feature_impl.cpp b/indexer/feature_impl.cpp index 2ae4979a82..b2d53f55e4 100644 --- a/indexer/feature_impl.cpp +++ b/indexer/feature_impl.cpp @@ -67,4 +67,8 @@ bool IsHouseNumber(string const & s) return (!s.empty() && IsDigit(s[0])); } +bool IsHouseNumber(strings::UniString const & s) +{ + return (!s.empty() && IsDigit(s[0])); +} } diff --git a/indexer/feature_impl.hpp b/indexer/feature_impl.hpp index 66d7357e3f..4b24519de8 100644 --- a/indexer/feature_impl.hpp +++ b/indexer/feature_impl.hpp @@ -29,8 +29,10 @@ namespace feature return str; } + bool IsDigit(int c); bool IsNumber(strings::UniString const & s); bool IsHouseNumber(string const & s); + bool IsHouseNumber(strings::UniString const & s); bool IsHouseNumberDeepCheck(strings::UniString const & s); } diff --git a/search/search_string_utils.cpp b/search/search_string_utils.cpp index 67aed4beba..f955c1fb79 100644 --- a/search/search_string_utils.cpp +++ b/search/search_string_utils.cpp @@ -1,7 +1,7 @@ #include "search_string_utils.hpp" +#include "std/set.hpp" #include "std/transform_iterator.hpp" -#include "std/unordered_set.hpp" #include "base/macros.hpp" diff --git a/search/search_tests/house_numbers_matcher_test.cpp b/search/search_tests/house_numbers_matcher_test.cpp index d0138cd53c..524ed06c0e 100644 --- a/search/search_tests/house_numbers_matcher_test.cpp +++ b/search/search_tests/house_numbers_matcher_test.cpp @@ -9,6 +9,22 @@ using namespace strings; using namespace search::v2; +namespace +{ +void NormalizeHouseNumber(string const & s, vector & ts) +{ + vector tokens; + search::v2::NormalizeHouseNumber(strings::MakeUniString(s), tokens); + for (auto const & token : tokens) + ts.push_back(strings::ToUtf8(token)); +} + +bool HouseNumbersMatch(string const & houseNumber, string const & query) +{ + return search::v2::HouseNumbersMatch(strings::MakeUniString(houseNumber), + strings::MakeUniString(query)); +} + void CheckTokenizer(string const & utf8s, vector const & expected) { UniString utf32s = MakeUniString(utf8s); @@ -35,6 +51,7 @@ void CheckNormalizer(string const & utf8s, string const & expected) } TEST_EQUAL(actual, expected, ()); } +} // namespace UNIT_TEST(HouseNumberTokenizer_Smoke) { diff --git a/search/v2/features_layer.cpp b/search/v2/features_layer.cpp index 07cae07255..c18d42998b 100644 --- a/search/v2/features_layer.cpp +++ b/search/v2/features_layer.cpp @@ -25,7 +25,7 @@ string DebugPrint(FeaturesLayer const & layer) ostringstream os; os << "FeaturesLayer [ size of m_sortedFeatures: " << (layer.m_sortedFeatures ? layer.m_sortedFeatures->size() : 0) - << ", m_subQuery: " << layer.m_subQuery << ", m_startToken: " << layer.m_startToken + << ", m_subQuery: " << DebugPrint(layer.m_subQuery) << ", m_startToken: " << layer.m_startToken << ", m_endToken: " << layer.m_endToken << ", m_type: " << DebugPrint(layer.m_type) << " ]"; return os.str(); } diff --git a/search/v2/features_layer.hpp b/search/v2/features_layer.hpp index f0a750610f..cb031f8a89 100644 --- a/search/v2/features_layer.hpp +++ b/search/v2/features_layer.hpp @@ -2,6 +2,8 @@ #include "search/v2/search_model.hpp" +#include "base/string_utils.hpp" + #include "std/vector.hpp" namespace search @@ -20,7 +22,7 @@ struct FeaturesLayer // Non-owning ptr to a sorted vector of features. vector const * m_sortedFeatures; - string m_subQuery; + strings::UniString m_subQuery; size_t m_startToken; size_t m_endToken; diff --git a/search/v2/features_layer_matcher.hpp b/search/v2/features_layer_matcher.hpp index 822680257f..dfec608a15 100644 --- a/search/v2/features_layer_matcher.hpp +++ b/search/v2/features_layer_matcher.hpp @@ -24,6 +24,7 @@ #include "base/logging.hpp" #include "base/macros.hpp" #include "base/stl_helpers.hpp" +#include "base/string_utils.hpp" #include "std/algorithm.hpp" #include "std/bind.hpp" @@ -151,7 +152,7 @@ private: // |buildings| doesn't contain buildings matching by house number, // so following code reads buildings in POIs vicinities and checks // house numbers. - vector queryTokens; + vector queryTokens; NormalizeHouseNumber(parent.m_subQuery, queryTokens); if (queryTokens.empty()) return; @@ -168,7 +169,7 @@ private: { if (building.m_id.m_mwmId != m_context->m_id || building.m_distanceMeters > kBuildingRadiusMeters) continue; - if (HouseNumbersMatch(building.m_name, queryTokens)) + if (HouseNumbersMatch(strings::MakeUniString(building.m_name), queryTokens)) fn(pois[i], building.m_id.m_index); } } @@ -240,7 +241,7 @@ private: return; } - vector queryTokens; + vector queryTokens; NormalizeHouseNumber(child.m_subQuery, queryTokens); auto const & checker = ftypes::IsBuildingChecker::Instance(); @@ -272,7 +273,7 @@ private: if (!child.m_hasDelayedFeatures) return false; - string const houseNumber = feature.GetHouseNumber(); + strings::UniString const houseNumber(strings::MakeUniString(feature.GetHouseNumber())); if (!feature::IsHouseNumber(houseNumber)) return false; return HouseNumbersMatch(houseNumber, queryTokens); diff --git a/search/v2/features_layer_path_finder.cpp b/search/v2/features_layer_path_finder.cpp index f042709880..b84c8fbd30 100644 --- a/search/v2/features_layer_path_finder.cpp +++ b/search/v2/features_layer_path_finder.cpp @@ -45,9 +45,9 @@ uint64_t CalcBottomUpPassCost(vector const & layers) return CalcPassCost(layers.begin(), layers.end()); } -bool LooksLikeHouseNumber(string const & query) +bool LooksLikeHouseNumber(strings::UniString const & query) { - vector tokens; + vector tokens; NormalizeHouseNumber(query, tokens); return !tokens.empty() && feature::IsHouseNumber(tokens.front()); } diff --git a/search/v2/geocoder.cpp b/search/v2/geocoder.cpp index c49b9a50d5..6e4eefb6b7 100644 --- a/search/v2/geocoder.cpp +++ b/search/v2/geocoder.cpp @@ -56,6 +56,10 @@ size_t constexpr kMaxNumCities = 5; size_t constexpr kMaxNumStates = 5; size_t constexpr kMaxNumVillages = 5; size_t constexpr kMaxNumCountries = 5; + +// This constant limits number of localities that will be extracted +// from World map. Villages are not counted here as they're not +// included into World map. size_t constexpr kMaxNumLocalities = kMaxNumCities + kMaxNumStates + kMaxNumCountries; // List of countries we're supporting search by state. Elements of the @@ -63,6 +67,8 @@ size_t constexpr kMaxNumLocalities = kMaxNumCities + kMaxNumStates + kMaxNumCoun string const kCountriesWithStates[] = {"US_", "Canada_"}; double constexpr kComparePoints = MercatorBounds::GetCellID2PointAbsEpsilon(); +strings::UniString const kUniSpace(strings::MakeUniString(" ")); + template struct Id { @@ -149,19 +155,19 @@ private: }; void JoinQueryTokens(SearchQueryParams const & params, size_t curToken, size_t endToken, - string const & sep, string & res) + strings::UniString const & sep, strings::UniString & res) { ASSERT_LESS_OR_EQUAL(curToken, endToken, ()); for (size_t i = curToken; i < endToken; ++i) { if (i < params.m_tokens.size()) { - res.append(strings::ToUtf8(params.m_tokens[i].front())); + res.append(params.m_tokens[i].front()); } else { ASSERT_EQUAL(i, params.m_tokens.size(), ()); - res.append(strings::ToUtf8(params.m_prefixTokens.front())); + res.append(params.m_prefixTokens.front()); } if (i + 1 != endToken) @@ -375,7 +381,11 @@ void Geocoder::SetParams(Params const & params) for (size_t i = 0; i < m_numTokens; ++i) { auto & synonyms = m_params.GetTokens(i); - if (!synonyms.empty() && IsStreetSynonym(synonyms.front())) + ASSERT(!synonyms.empty(), ()); + + auto const & token = synonyms.front(); + + if (IsStreetSynonym(token)) { auto b = synonyms.begin(); auto e = synonyms.end(); @@ -971,24 +981,35 @@ void Geocoder::LimitedSearch(coding::CompressedBitVector const * filter, size_t void Geocoder::GreedilyMatchStreets() { - ASSERT(m_layers.empty(), ()); - m_layers.emplace_back(); - - MY_SCOPE_GUARD(cleanupGuard, bind(&vector::pop_back, &m_layers)); for (size_t startToken = 0; startToken < m_numTokens; ++startToken) { if (m_usedTokens[startToken]) continue; - unique_ptr buffer; + // Here we try to match as many tokens as possible while + // intersection is a non-empty bit vector of streets. All tokens + // that are synonyms to streets are ignored. Moreover, each time + // a token that looks like a beginning of a house number is met, + // we try to use current intersection of tokens as a street layer + // and try to match buildings or pois. unique_ptr allFeatures; size_t curToken = startToken; for (; curToken < m_numTokens && !m_usedTokens[curToken]; ++curToken) { - if (IsStreetSynonym(m_params.GetTokens(curToken).front())) + auto const & token = m_params.GetTokens(curToken).front(); + if (IsStreetSynonym(token)) continue; + if (feature::IsHouseNumber(token) && + !coding::CompressedBitVector::IsEmpty(allFeatures)) + { + if (m_filter.NeedToFilter(*allFeatures)) + allFeatures = m_filter.Filter(*allFeatures); + CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures); + } + + unique_ptr buffer; if (startToken == curToken || coding::CompressedBitVector::IsEmpty(allFeatures)) buffer = coding::CompressedBitVector::Intersect(*m_streets, *m_addressFeatures[curToken]); else @@ -1002,27 +1023,41 @@ void Geocoder::GreedilyMatchStreets() if (coding::CompressedBitVector::IsEmpty(allFeatures)) continue; - if (m_filter.NeedToFilter(*allFeatures)) allFeatures = m_filter.Filter(*allFeatures); - auto & layer = m_layers.back(); - layer.Clear(); - layer.m_type = SearchModel::SEARCH_TYPE_STREET; - layer.m_startToken = startToken; - layer.m_endToken = curToken; - JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, " " /* sep */, - layer.m_subQuery); - vector sortedFeatures; - coding::CompressedBitVectorEnumerator::ForEach(*allFeatures, - MakeBackInsertFunctor(sortedFeatures)); - layer.m_sortedFeatures = &sortedFeatures; - - ScopedMarkTokens mark(m_usedTokens, startToken, curToken); - MatchPOIsAndBuildings(0 /* curToken */); + CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures); } } +void Geocoder::CreateStreetsLayerAndMatchLowerLayers( + size_t startToken, size_t endToken, unique_ptr const & features) +{ + ASSERT(m_layers.empty(), ()); + + if (coding::CompressedBitVector::IsEmpty(features)) + return; + + m_layers.emplace_back(); + MY_SCOPE_GUARD(cleanupGuard, bind(&vector::pop_back, &m_layers)); + + auto & layer = m_layers.back(); + layer.Clear(); + layer.m_type = SearchModel::SEARCH_TYPE_STREET; + layer.m_startToken = startToken; + layer.m_endToken = endToken; + JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, kUniSpace /* sep */, + layer.m_subQuery); + + vector sortedFeatures; + sortedFeatures.reserve(features->PopCount()); + coding::CompressedBitVectorEnumerator::ForEach(*features, MakeBackInsertFunctor(sortedFeatures)); + layer.m_sortedFeatures = &sortedFeatures; + + ScopedMarkTokens mark(m_usedTokens, startToken, endToken); + MatchPOIsAndBuildings(0 /* curToken */); +} + void Geocoder::MatchPOIsAndBuildings(size_t curToken) { // Skip used tokens. @@ -1063,7 +1098,7 @@ void Geocoder::MatchPOIsAndBuildings(size_t curToken) layer.Clear(); layer.m_startToken = curToken; layer.m_endToken = curToken + n; - JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, " " /* sep */, + JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, kUniSpace /* sep */, layer.m_subQuery); } diff --git a/search/v2/geocoder.hpp b/search/v2/geocoder.hpp index 88053b7fa7..c3bb6c3646 100644 --- a/search/v2/geocoder.hpp +++ b/search/v2/geocoder.hpp @@ -186,6 +186,9 @@ private: // then performs geocoding in street vicinities. void GreedilyMatchStreets(); + void CreateStreetsLayerAndMatchLowerLayers( + size_t startToken, size_t endToken, unique_ptr const & features); + // Tries to find all paths in a search tree, where each edge is // marked with some substring of the query tokens. These paths are // called "layer sequence" and current path is stored in |m_layers|. diff --git a/search/v2/house_numbers_matcher.cpp b/search/v2/house_numbers_matcher.cpp index 51f0ac495c..377520f1b1 100644 --- a/search/v2/house_numbers_matcher.cpp +++ b/search/v2/house_numbers_matcher.cpp @@ -153,23 +153,19 @@ void HouseNumberTokenizer::Tokenize(UniString const & s, vector & ts) } } -void NormalizeHouseNumber(string const & s, vector & ts) +void NormalizeHouseNumber(strings::UniString const & s, vector & ts) { vector tokens; - HouseNumberTokenizer::Tokenize(MakeLowerCase(MakeUniString(s)), tokens); - - vector mergedTokens; - MergeTokens(tokens, mergedTokens); - - transform(mergedTokens.begin(), mergedTokens.end(), back_inserter(ts), &ToUtf8); + HouseNumberTokenizer::Tokenize(MakeLowerCase(s), tokens); + MergeTokens(tokens, ts); } -bool HouseNumbersMatch(string const & houseNumber, string const & query) +bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniString const & query) { if (houseNumber == query) return true; - vector queryTokens; + vector queryTokens; NormalizeHouseNumber(query, queryTokens); if (!queryTokens.empty()) @@ -178,14 +174,14 @@ bool HouseNumbersMatch(string const & houseNumber, string const & query) return HouseNumbersMatch(houseNumber, queryTokens); } -bool HouseNumbersMatch(string const & houseNumber, vector const & queryTokens) +bool HouseNumbersMatch(strings::UniString const & houseNumber, vector const & queryTokens) { if (houseNumber.empty() || queryTokens.empty()) return false; if (queryTokens[0][0] != houseNumber[0]) return false; - vector houseNumberTokens; + vector houseNumberTokens; NormalizeHouseNumber(houseNumber, houseNumberTokens); if (houseNumberTokens.empty()) diff --git a/search/v2/house_numbers_matcher.hpp b/search/v2/house_numbers_matcher.hpp index a3009b2033..77c046c9de 100644 --- a/search/v2/house_numbers_matcher.hpp +++ b/search/v2/house_numbers_matcher.hpp @@ -38,12 +38,13 @@ public: }; // Splits house number by tokens, removes blanks and separators. -void NormalizeHouseNumber(string const & s, vector & ts); +void NormalizeHouseNumber(strings::UniString const & s, vector & ts); // Returns true when |query| matches to |houseNumber|. -bool HouseNumbersMatch(string const & houseNumber, string const & query); +bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniString const & query); // Returns true when |queryTokens| match to |houseNumber|. -bool HouseNumbersMatch(string const & houseNumber, vector const & queryTokens); +bool HouseNumbersMatch(strings::UniString const & houseNumber, + vector const & queryTokens); } // namespace v2 } // namespace search