From ddbedfde555ecf4ea13c529fe1e90e3801c8018a Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Thu, 19 May 2016 14:26:44 +0300 Subject: [PATCH] Review fixes. --- base/stl_helpers.hpp | 6 -- .../search_query_v2_test.cpp | 45 ++++++---- search/search_quality/download-maps.sh | 82 +++++++++++++++---- .../features_collector_tool.cpp | 5 +- search/search_quality/scoring_model.py | 4 +- search/search_query.cpp | 33 +++----- search/v2/geocoder.cpp | 4 +- search/v2/ranking_info.cpp | 23 +++--- search/v2/ranking_info.hpp | 16 ++-- 9 files changed, 134 insertions(+), 84 deletions(-) diff --git a/base/stl_helpers.hpp b/base/stl_helpers.hpp index 704c487c2e..3375867f75 100644 --- a/base/stl_helpers.hpp +++ b/base/stl_helpers.hpp @@ -73,10 +73,4 @@ impl::Comparer CompareBy(T (C::*p)() const) { return impl::Comparer(p); } - -template -struct Id -{ - T const & operator()(T const & t) const { return t; } -}; } // namespace my diff --git a/search/search_integration_tests/search_query_v2_test.cpp b/search/search_integration_tests/search_query_v2_test.cpp index d156d75b2f..3d2b498057 100644 --- a/search/search_integration_tests/search_query_v2_test.cpp +++ b/search/search_integration_tests/search_query_v2_test.cpp @@ -33,7 +33,7 @@ namespace class SearchQueryV2Test : public SearchTest { public: - unique_ptr DoRequest(string const & query) + unique_ptr MakeRequest(string const & query) { SearchParams params; params.m_query = query; @@ -332,7 +332,7 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo) SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5))); { - auto request = DoRequest("golden gate bridge "); + auto request = MakeRequest("golden gate bridge "); TRules rules = {ExactMatch(wonderlandId, goldenGateBridge), ExactMatch(wonderlandId, goldenGateStreet)}; @@ -342,15 +342,14 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo) { auto const & info = result.GetRankingInfo(); TEST_EQUAL(NAME_SCORE_FULL_MATCH, info.m_nameScore, (result)); - TEST(!info.m_matchByTrueCats, (result)); - TEST(!info.m_matchByFalseCats, (result)); - TEST(my::AlmostEqualAbs(1.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage)); + TEST(!info.m_pureCats, (result)); + TEST(!info.m_falseCats, (result)); } } // This test is quite important and must always pass. { - auto request = DoRequest("cafe лермонтов"); + auto request = MakeRequest("cafe лермонтов"); auto const & results = request->Results(); TRules rules{ExactMatch(wonderlandId, cafe1), ExactMatch(wonderlandId, cafe2), @@ -475,6 +474,9 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories) TestPOI named(m2::PointD(0.0001, 0.0001), "ATM", "en"); named.SetTypes({{"amenity", "atm"}}); + TestPOI busStop(m2::PointD(0.00005, 0.0005), "ATM Bus Stop", "en"); + busStop.SetTypes({{"highway", "bus_stop"}}); + BuildWorld([&](TestMwmBuilder & builder) { builder.Add(sanFrancisco); @@ -483,24 +485,42 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories) { builder.Add(named); builder.Add(noname); + builder.Add(busStop); }); SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5))); - TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)}; { - auto request = DoRequest("atm"); + TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named), + ExactMatch(wonderlandId, busStop)}; + + auto request = MakeRequest("atm"); TEST(MatchResults(rules, request->Results()), ()); for (auto const & result : request->Results()) { + Index::FeaturesLoaderGuard loader(m_engine, wonderlandId); + FeatureType ft; + loader.GetFeatureByIndex(result.GetFeatureID().m_index, ft); + auto const & info = result.GetRankingInfo(); - TEST(info.m_matchByTrueCats, (result)); - TEST(!info.m_matchByFalseCats, (result)); + + if (busStop.Matches(ft)) + { + TEST(!info.m_pureCats, (result)); + TEST(info.m_falseCats, (result)); + } + else + { + TEST(info.m_pureCats, (result)); + TEST(!info.m_falseCats, (result)); + } } } { - auto request = DoRequest("#atm"); + TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)}; + + auto request = MakeRequest("#atm"); TEST(MatchResults(rules, request->Results()), ()); for (auto const & result : request->Results()) @@ -510,9 +530,6 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories) // Token with a hashtag should not participate in name-score // calculations. TEST_EQUAL(NAME_SCORE_ZERO, info.m_nameScore, (result)); - - // TODO (@y): fix this. Name coverage calculations are flawed. - // TEST(my::AlmostEqualAbs(0.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage)); } } diff --git a/search/search_quality/download-maps.sh b/search/search_quality/download-maps.sh index 85ea1ab758..c41ac333eb 100755 --- a/search/search_quality/download-maps.sh +++ b/search/search_quality/download-maps.sh @@ -3,25 +3,77 @@ # Downloads all maps necessary for learning to rank to the current # directory. -case $# in - 1) VERSION="$1" - ;; - *) echo "Usage: $0 version" 2>&1 - exit -1 - ;; -esac +ALL= +VERSION= +BASE="http://direct.mapswithme.com/direct" + +display_usage() { + echo "Usage: $0 -v [version] -a -h" + echo " -v version of maps to download" + echo " -a download all maps of the specified version" + echo " -h display this message" +} + +while getopts ":av:h" opt +do + case "$opt" in + a) ALL=1 + ;; + v) VERSION="$OPTARG" + ;; + h) display_usage + exit -1 + ;; + \?) echo "Invalid option: -$OPTARG" 1>&2 + ;; + :) echo "Option -$OPTARG requires an argument" 1>&2 + ;; + esac +done + +if [ -z "$VERSION" ] +then + echo "Version of maps is not specified." 1>&2 + exit -1 +fi + +if ! curl "$BASE/" 2>/dev/null | + sed -n 's/^.*href="\(.*\)\/".*$/\1/p' | + grep -v "^../$" | grep -q "$VERSION" +then + echo "Invalid version: $VERSION" 1>&2 + exit -1 +fi -BASE="http://direct.mapswithme.com/direct/$VERSION/" NAMES=("Australia_Brisbane.mwm" "Belarus_Minsk*.mwm" "Germany_*.mwm" "Russia_*.mwm" "UK_England_*.mwm" - "US_California_*.mwm" "US_Maryland_*.mwm") + "US_California_*.mwm" + "US_Maryland_*.mwm") -set -e -set -x -for name in ${NAMES[@]} -do - wget -r -np -nd -A "$name" "$BASE" -done +DIR="$BASE/$VERSION" + +if [ "$ALL" ] +then + echo "Downloading all maps..." + + files=$(curl "$DIR/" 2>/dev/null | sed -n 's/^.*href="\(.*\.mwm\)".*$/\1/p') + + set -e + set -x + for file in $files + do + wget -np -nd "$DIR/$file" + done +else + echo "Downloading maps..." + + set -e + set -x + for name in ${NAMES[@]} + do + wget -r -np -nd -A "$name" "$DIR/" + done +fi diff --git a/search/search_quality/features_collector_tool/features_collector_tool.cpp b/search/search_quality/features_collector_tool/features_collector_tool.cpp index 9529b9d3fb..bf33021ab6 100644 --- a/search/search_quality/features_collector_tool/features_collector_tool.cpp +++ b/search/search_quality/features_collector_tool/features_collector_tool.cpp @@ -143,14 +143,15 @@ void DisplayStats(ostream & os, vector const & samples, vector co ASSERT_EQUAL(stats.size(), n, ()); size_t numWarnings = 0; - for (auto const & stat : stats) { + for (auto const & stat : stats) + { if (!stat.m_notFound.empty()) ++numWarnings; } if (numWarnings == 0) { - os << "All " << stats.size() << " queries OK." << endl; + os << "All " << stats.size() << " queries are OK." << endl; return; } diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index 244688d890..2409e3d715 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -38,7 +38,7 @@ def normalize_data(data): data['Rank'] = data['Rank'].apply(lambda v: v / MAX_RANK) data['Relevance'] = data['Relevance'].apply(lambda v: RELEVANCES[v]) - cats = data['MatchByTrueCats'].combine(data['MatchByFalseCats'], max) + cats = data['PureCats'].combine(data['FalseCats'], max) # Full prefix match is unified with a full match as these features # are collinear. But we need both of them as they're also used in @@ -49,8 +49,6 @@ def normalize_data(data): # the features too. data['NameScore'] = data['NameScore'].combine(cats, transform_name_score) - data['NameCoverage'] = data['NameCoverage'].combine(cats, lambda v, c: v if c == 0 else 0.0) - # Adds dummy variables to data for NAME_SCORES. for ns in NAME_SCORES: data[ns] = data['NameScore'].apply(lambda v: int(ns == v)) diff --git a/search/search_query.cpp b/search/search_query.cpp index cfbf486c89..0f78a1c64f 100644 --- a/search/search_query.cpp +++ b/search/search_query.cpp @@ -190,20 +190,11 @@ void UpdateNameScore(string const & name, TSlice const & slice, v2::NameScore & template void UpdateNameScore(vector const & tokens, TSlice const & slice, - v2::NameScore & bestScore, double & bestCoverage) + v2::NameScore & bestScore) { auto const score = v2::GetNameScore(tokens, slice); - auto const coverage = - tokens.empty() ? 0 : static_cast(slice.Size()) / static_cast(tokens.size()); if (score > bestScore) - { bestScore = score; - bestCoverage = coverage; - } - else if (score == bestScore && coverage > bestCoverage) - { - bestCoverage = coverage; - } } inline bool IsHashtagged(strings::UniString const & s) { return !s.empty() && s[0] == '#'; } @@ -663,8 +654,8 @@ class PreResult2Maker vector tokens; SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters()); - UpdateNameScore(tokens, slice, info.m_nameScore, info.m_nameCoverage); - UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore, info.m_nameCoverage); + UpdateNameScore(tokens, slice, info.m_nameScore); + UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore); } if (info.m_searchType == v2::SearchModel::SEARCH_TYPE_BUILDING) @@ -679,16 +670,14 @@ class PreResult2Maker ++matched[i].first; }); - info.m_matchByTrueCats = - all_of(matched.begin(), matched.end(), [](pair const & m) - { - return m.first != 0; - }); - info.m_matchByFalseCats = - all_of(matched.begin(), matched.end(), [](pair const & m) - { - return m.first == 0 && m.second != 0; - }); + info.m_pureCats = all_of(matched.begin(), matched.end(), [](pair const & m) + { + return m.first != 0; + }); + info.m_falseCats = all_of(matched.begin(), matched.end(), [](pair const & m) + { + return m.first == 0 && m.second != 0; + }); } uint8_t NormalizeRank(uint8_t rank, v2::SearchModel::SearchType type, m2::PointD const & center, diff --git a/search/v2/geocoder.cpp b/search/v2/geocoder.cpp index be83577b74..3f3a91b7dd 100644 --- a/search/v2/geocoder.cpp +++ b/search/v2/geocoder.cpp @@ -1557,12 +1557,12 @@ SearchModel::SearchType Geocoder::GetSearchTypeInGeocoding(uint32_t featureId) bool Geocoder::AllTokensUsed() const { - return all_of(m_usedTokens.begin(), m_usedTokens.end(), my::Id()); + return all_of(m_usedTokens.begin(), m_usedTokens.end(), IdFunctor()); } bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const { - return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, my::Id()); + return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, IdFunctor()); } size_t Geocoder::NumUnusedTokensGroups() const diff --git a/search/v2/ranking_info.cpp b/search/v2/ranking_info.cpp index 1a4cbb42bd..1e4dbbaf32 100644 --- a/search/v2/ranking_info.cpp +++ b/search/v2/ranking_info.cpp @@ -47,10 +47,9 @@ void RankingInfo::PrintCSVHeader(ostream & os) os << "DistanceToPivot" << ",Rank" << ",NameScore" - << ",NameCoverage" << ",SearchType" - << ",MatchByTrueCats" - << ",MatchByFalseCats"; + << ",PureCats" + << ",FalseCats"; } string DebugPrint(RankingInfo const & info) @@ -60,10 +59,9 @@ string DebugPrint(RankingInfo const & info) os << "m_distanceToPivot:" << info.m_distanceToPivot << ","; os << "m_rank:" << static_cast(info.m_rank) << ","; os << "m_nameScore:" << DebugPrint(info.m_nameScore) << ","; - os << "m_nameCoverage:" << info.m_nameCoverage << ","; os << "m_searchType:" << DebugPrint(info.m_searchType) << ","; - os << "m_matchByTrueCats:" << info.m_matchByTrueCats << ","; - os << "m_matchByFalseCats:" << info.m_matchByFalseCats; + os << "m_pureCats:" << info.m_pureCats << ","; + os << "m_falseCats:" << info.m_falseCats; os << "]"; return os.str(); } @@ -72,8 +70,7 @@ void RankingInfo::ToCSV(ostream & os) const { os << fixed; os << m_distanceToPivot << "," << static_cast(m_rank) << "," << DebugPrint(m_nameScore) - << "," << m_nameCoverage << "," << DebugPrint(m_searchType) << "," << m_matchByTrueCats << "," - << m_matchByFalseCats; + << "," << DebugPrint(m_searchType) << "," << m_pureCats << "," << m_falseCats; } double RankingInfo::GetLinearModelRank() const @@ -86,11 +83,15 @@ double RankingInfo::GetLinearModelRank() const double const rank = static_cast(m_rank) / numeric_limits::max(); auto nameScore = m_nameScore; - auto nameCoverage = m_nameCoverage; - if (m_matchByTrueCats || m_matchByFalseCats) + if (m_pureCats || m_falseCats) { + // If the feature was matched only by categorial tokens, it's + // better for ranking to set name score to zero. For example, + // when we're looking for a "cafe", cafes "Cafe Pushkin" and + // "Lermontov" both match to the request, but must be ranked in + // accordance to their distances to the user position or viewport, + // in spite of "Cafe Pushkin" has a non-zero name rank. nameScore = NAME_SCORE_ZERO; - nameCoverage = 0.0; } return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] + diff --git a/search/v2/ranking_info.hpp b/search/v2/ranking_info.hpp index 23d5b000c9..e86811a3ee 100644 --- a/search/v2/ranking_info.hpp +++ b/search/v2/ranking_info.hpp @@ -24,19 +24,17 @@ struct RankingInfo // Score for the feature's name. NameScore m_nameScore = NAME_SCORE_ZERO; - // Fraction of tokens from the query matched to a feature name. - double m_nameCoverage = 0; - // Search type for the feature. SearchModel::SearchType m_searchType = SearchModel::SEARCH_TYPE_COUNT; - // True if the feature was matched only by tokens corresponding to - // it's categories. - bool m_matchByTrueCats = false; + // True if all of the tokens that the feature was matched by + // correspond to this feature's categories. + bool m_pureCats = false; - // True if the feature was matched only by tokens don't - // corresponding to it's categories. - bool m_matchByFalseCats = false; + // True if none of the tokens that the feature was matched by + // corresponds to this feature's categories although all of the + // tokens are categorial ones. + bool m_falseCats = false; static void PrintCSVHeader(ostream & os);