From 36ef1408f454de4fde543bf1b7577e0a59a326dd Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Wed, 22 Jan 2020 17:40:09 +0300 Subject: [PATCH] [search][search_quality] More liberal matching of linear features. --- indexer/search_string_utils.hpp | 8 + .../assessment_tool/main_model.cpp | 2 +- .../assessment_tool/search_request_runner.cpp | 2 +- .../features_collector_tool.cpp | 2 +- search/search_quality/matcher.cpp | 198 ++++++++++++++---- search/search_quality/matcher.hpp | 16 +- 6 files changed, 178 insertions(+), 50 deletions(-) diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index 7ac9e27d53..aea59822b3 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace search { @@ -54,6 +55,13 @@ void NormalizeAndTokenizeAsUtf8(std::string const & s, Tokens & tokens) SplitUniString(NormalizeAndSimplifyString(s), fn, search::Delimiters()); } +inline std::vector NormalizeAndTokenizeAsUtf8(std::string const & s) +{ + std::vector result; + NormalizeAndTokenizeAsUtf8(s, result); + return result; +} + template void ForEachNormalizedToken(std::string const & s, Fn && fn) { diff --git a/search/search_quality/assessment_tool/main_model.cpp b/search/search_quality/assessment_tool/main_model.cpp index 64d47644b1..da92cfe93c 100644 --- a/search/search_quality/assessment_tool/main_model.cpp +++ b/search/search_quality/assessment_tool/main_model.cpp @@ -399,7 +399,7 @@ void MainModel::ForAnyMatchingEntry(Context & context, FeatureID const & id, Fn for (size_t i = 0; i < nonFoundResults.size(); ++i) { auto const & result = context.m_nonFoundResults[i]; - if (matcher.Matches(result, *ft)) + if (matcher.Matches(context.m_sample.m_query, result, *ft)) return fn(context.m_nonFoundResultsEdits, i); } } diff --git a/search/search_quality/assessment_tool/search_request_runner.cpp b/search/search_quality/assessment_tool/search_request_runner.cpp index 78d0f7c645..ff12a7390f 100644 --- a/search/search_quality/assessment_tool/search_request_runner.cpp +++ b/search/search_quality/assessment_tool/search_request_runner.cpp @@ -103,7 +103,7 @@ void SearchRequestRunner::RunRequest(size_t index, bool background, size_t times search::Matcher matcher(loader); vector const actual(results.begin(), results.end()); - matcher.Match(sample.m_results, actual, goldenMatching, actualMatching); + matcher.Match(sample, actual, goldenMatching, actualMatching); relevances.resize(actual.size()); for (size_t i = 0; i < goldenMatching.size(); ++i) { diff --git a/search/search_quality/features_collector_tool/features_collector_tool.cpp b/search/search_quality/features_collector_tool/features_collector_tool.cpp index 34d6857b6d..44eb525531 100644 --- a/search/search_quality/features_collector_tool/features_collector_tool.cpp +++ b/search/search_quality/features_collector_tool/features_collector_tool.cpp @@ -170,7 +170,7 @@ int main(int argc, char * argv[]) vector goldenMatching; vector actualMatching; - matcher.Match(sample.m_results, results, goldenMatching, actualMatching); + matcher.Match(sample, results, goldenMatching, actualMatching); for (size_t j = 0; j < results.size(); ++j) { diff --git a/search/search_quality/matcher.cpp b/search/search_quality/matcher.cpp index 91f449dcd9..60ba76dda6 100644 --- a/search/search_quality/matcher.cpp +++ b/search/search_quality/matcher.cpp @@ -1,28 +1,121 @@ #include "search/search_quality/matcher.hpp" #include "search/feature_loader.hpp" +#include "search/house_numbers_matcher.hpp" #include "indexer/feature.hpp" #include "indexer/feature_algo.hpp" #include "indexer/search_string_utils.hpp" -#include "base/string_utils.hpp" - #include "geometry/mercator.hpp" +#include "geometry/parametrized_segment.hpp" +#include "geometry/point2d.hpp" +#include "geometry/polyline2d.hpp" +#include "base/assert.hpp" #include "base/control_flow.hpp" #include "base/stl_helpers.hpp" +#include "base/string_utils.hpp" + +#include + +namespace +{ +double DistanceToFeature(m2::PointD const & pt, FeatureType & ft) +{ + if (ft.GetGeomType() != feature::GeomType::Line) + return mercator::DistanceOnEarth(pt, feature::GetCenter(ft)); + + ft.ParseGeometry(FeatureType::BEST_GEOMETRY); + std::vector points(ft.GetPointsCount()); + for (size_t i = 0; i < points.size(); ++i) + points[i] = ft.GetPoint(i); + + auto const & [dummy, segId] = m2::CalcMinSquaredDistance(points.begin(), points.end(), pt); + CHECK_LESS(segId + 1, points.size(), ()); + m2::ParametrizedSegment segment(points[segId], points[segId + 1]); + + return mercator::DistanceOnEarth(pt, segment.ClosestPointTo(pt)); +} + +template +bool StartsWithHouseNumber(Iter beg, Iter end) +{ + using namespace search::house_numbers; + + std::string s; + for (auto it = beg; it != end; ++it) + { + s.append(*it); + if (LooksLikeHouseNumber(s, false /* isPrefix */)) + return true; + } + return false; +} + +// todo(@m) This function looks very slow. +template +bool EndsWithHouseNumber(Iter beg, Iter end) +{ + using namespace search::house_numbers; + + if (beg == end) + return false; + + std::string s; + for (auto it = --end;; --it) + { + s = *it + s; + if (LooksLikeHouseNumber(s, false /* isPrefix */)) + return true; + if (it == beg) + break; + } + return false; +} + +bool StreetMatches(std::string const & name, std::vector const & queryTokens) +{ + auto const nameTokens = search::NormalizeAndTokenizeAsUtf8(name); + + if (nameTokens.empty()) + return false; + + for (size_t i = 0; i + nameTokens.size() <= queryTokens.size(); ++i) + { + bool found = true; + for (size_t j = 0; j < nameTokens.size(); ++j) + { + if (queryTokens[i + j] != nameTokens[j]) + { + found = false; + break; + } + } + + if (!found) + continue; + + if (!EndsWithHouseNumber(queryTokens.begin(), queryTokens.begin() + i) && + !StartsWithHouseNumber(queryTokens.begin() + i + nameTokens.size(), queryTokens.end())) + { + return true; + } + } + + return false; +} +} // namespace namespace search { -// static -size_t constexpr Matcher::kInvalidId; - Matcher::Matcher(FeatureLoader & loader) : m_loader(loader) {} -void Matcher::Match(std::vector const & golden, std::vector const & actual, +void Matcher::Match(Sample const & goldenSample, std::vector const & actual, std::vector & goldenMatching, std::vector & actualMatching) { + auto const & golden = goldenSample.m_results; + auto const n = golden.size(); auto const m = actual.size(); @@ -42,7 +135,7 @@ void Matcher::Match(std::vector const & golden, std::vector const & golden, std::vector #include #include @@ -16,21 +19,22 @@ class FeatureLoader; class Matcher { public: - static size_t constexpr kInvalidId = std::numeric_limits::max(); + inline static size_t constexpr kInvalidId = std::numeric_limits::max(); explicit Matcher(FeatureLoader & loader); - // Matches the |golden| results loaded from a Sample with |actual| results + // Matches the results loaded from |goldenSample| with |actual| results // found by the search engine using the params from the Sample. // goldenMatching[i] is the index of the result in |actual| that matches // the sample result number i. // actualMatching[j] is the index of the sample in |golden| that matches // the golden result number j. - void Match(std::vector const & golden, std::vector const & actual, + void Match(Sample const & goldenSample, std::vector const & actual, std::vector & goldenMatching, std::vector & actualMatching); - bool Matches(Sample::Result const & golden, FeatureType & ft); - bool Matches(Sample::Result const & golden, Result const & actual); + bool Matches(strings::UniString const & query, Sample::Result const & golden, + Result const & actual); + bool Matches(strings::UniString const & query, Sample::Result const & golden, FeatureType & ft); private: FeatureLoader & m_loader;