From 04e5c0eb18c5213867e6d73723207fb496e09d01 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Mon, 16 May 2016 15:32:43 +0300 Subject: [PATCH] [search] Fixed ranking model. --- base/stl_helpers.hpp | 6 + .../search_query_v2_test.cpp | 77 +++--- search/search_quality/download-maps.sh | 27 +++ .../features_collector_tool.cpp | 20 +- search/search_quality/scoring_model.py | 223 ++++++++++++++---- search/search_query.cpp | 51 ++-- search/search_query.hpp | 4 +- search/search_tests/ranking_tests.cpp | 1 + search/v2/geocoder.cpp | 10 +- search/v2/ranking_info.cpp | 60 +++-- search/v2/ranking_info.hpp | 8 + search/v2/search_model.cpp | 16 +- search/v2/token_slice.hpp | 57 +++++ 13 files changed, 409 insertions(+), 151 deletions(-) create mode 100755 search/search_quality/download-maps.sh diff --git a/base/stl_helpers.hpp b/base/stl_helpers.hpp index 3375867f75..704c487c2e 100644 --- a/base/stl_helpers.hpp +++ b/base/stl_helpers.hpp @@ -73,4 +73,10 @@ impl::Comparer CompareBy(T (C::*p)() const) { return impl::Comparer(p); } + +template +struct Id +{ + T const & operator()(T const & t) const { return t; } +}; } // namespace my diff --git a/search/search_integration_tests/search_query_v2_test.cpp b/search/search_integration_tests/search_query_v2_test.cpp index f010e2135a..d156d75b2f 100644 --- a/search/search_integration_tests/search_query_v2_test.cpp +++ b/search/search_integration_tests/search_query_v2_test.cpp @@ -30,16 +30,27 @@ namespace search { namespace { -void MakeDefaultTestParams(string const & query, SearchParams & params) -{ - params.m_query = query; - params.m_inputLocale = "en"; - params.SetMode(Mode::Everywhere); - params.SetSuggestsEnabled(false); -} - class SearchQueryV2Test : public SearchTest { +public: + unique_ptr DoRequest(string const & query) + { + SearchParams params; + params.m_query = query; + params.m_inputLocale = "en"; + params.SetMode(Mode::Everywhere); + params.SetSuggestsEnabled(false); + + auto request = make_unique(m_engine, params, 
m_viewport); + request->Wait(); + return request; + } + + bool MatchResults(vector> rules, + vector const & actual) const + { + return ::MatchResults(m_engine, rules, actual); + } }; UNIT_CLASS_TEST(SearchQueryV2Test, Smoke) @@ -271,7 +282,7 @@ UNIT_CLASS_TEST(SearchQueryV2Test, DisableSuggests) request.Wait(); TRules rules = {ExactMatch(worldId, london1), ExactMatch(worldId, london2)}; - TEST(MatchResults(m_engine, rules, request.Results()), ()); + TEST(MatchResults(rules, request.Results()), ()); } } @@ -321,41 +332,34 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo) SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5))); { - SearchParams params; - MakeDefaultTestParams("golden gate bridge ", params); - - TestSearchRequest request(m_engine, params, m_viewport); - request.Wait(); + auto request = DoRequest("golden gate bridge "); TRules rules = {ExactMatch(wonderlandId, goldenGateBridge), ExactMatch(wonderlandId, goldenGateStreet)}; - TEST(MatchResults(m_engine, rules, request.Results()), ()); - for (auto const & result : request.Results()) + TEST(MatchResults(rules, request->Results()), ()); + for (auto const & result : request->Results()) { auto const & info = result.GetRankingInfo(); TEST_EQUAL(NAME_SCORE_FULL_MATCH, info.m_nameScore, (result)); + TEST(!info.m_matchByTrueCats, (result)); + TEST(!info.m_matchByFalseCats, (result)); TEST(my::AlmostEqualAbs(1.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage)); } } // This test is quite important and must always pass. 
{ - SearchParams params; - MakeDefaultTestParams("cafe лермонтов", params); - - TestSearchRequest request(m_engine, params, m_viewport); - request.Wait(); - - auto const & results = request.Results(); + auto request = DoRequest("cafe лермонтов"); + auto const & results = request->Results(); TRules rules{ExactMatch(wonderlandId, cafe1), ExactMatch(wonderlandId, cafe2), ExactMatch(wonderlandId, lermontov)}; - TEST(MatchResults(m_engine, rules, results), ()); + TEST(MatchResults(rules, results), ()); TEST_EQUAL(3, results.size(), ("Unexpected number of retrieved cafes.")); auto const & top = results.front(); - TEST(MatchResults(m_engine, {ExactMatch(wonderlandId, lermontov)}, {top}), ()); + TEST(MatchResults({ExactMatch(wonderlandId, lermontov)}, {top}), ()); } { @@ -482,19 +486,24 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories) }); SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5))); - TRules rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)}; - - TEST(ResultsMatch("atm", rules), ()); + TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)}; { - SearchParams params; - MakeDefaultTestParams("#atm", params); + auto request = DoRequest("atm"); + TEST(MatchResults(rules, request->Results()), ()); + for (auto const & result : request->Results()) + { + auto const & info = result.GetRankingInfo(); + TEST(info.m_matchByTrueCats, (result)); + TEST(!info.m_matchByFalseCats, (result)); + } + } - TestSearchRequest request(m_engine, params, m_viewport); - request.Wait(); + { + auto request = DoRequest("#atm"); - TEST(MatchResults(m_engine, rules, request.Results()), ()); - for (auto const & result : request.Results()) + TEST(MatchResults(rules, request->Results()), ()); + for (auto const & result : request->Results()) { auto const & info = result.GetRankingInfo(); diff --git a/search/search_quality/download-maps.sh b/search/search_quality/download-maps.sh new file mode 100755 index 
0000000000..85ea1ab758 --- /dev/null +++ b/search/search_quality/download-maps.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Downloads all maps necessary for learning to rank to the current +# directory. + +case $# in + 1) VERSION="$1" + ;; + *) echo "Usage: $0 version" 2>&1 + exit -1 + ;; +esac + +BASE="http://direct.mapswithme.com/direct/$VERSION/" +NAMES=("Australia_Brisbane.mwm" + "Belarus_Minsk*.mwm" + "Germany_*.mwm" + "Russia_*.mwm" + "UK_England_*.mwm" + "US_California_*.mwm" "US_Maryland_*.mwm") + +set -e +set -x +for name in ${NAMES[@]} +do + wget -r -np -nd -A "$name" "$BASE" +done diff --git a/search/search_quality/features_collector_tool/features_collector_tool.cpp b/search/search_quality/features_collector_tool/features_collector_tool.cpp index d844d94ebd..9529b9d3fb 100644 --- a/search/search_quality/features_collector_tool/features_collector_tool.cpp +++ b/search/search_quality/features_collector_tool/features_collector_tool.cpp @@ -141,15 +141,25 @@ void DisplayStats(ostream & os, vector const & samples, vector co { auto const n = samples.size(); ASSERT_EQUAL(stats.size(), n, ()); + + size_t numWarnings = 0; + for (auto const & stat : stats) { + if (!stat.m_notFound.empty()) + ++numWarnings; + } + + if (numWarnings == 0) + { + os << "All " << stats.size() << " queries OK." << endl; + return; + } + + os << numWarnings << " warnings." 
<< endl; for (size_t i = 0; i < n; ++i) { - os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\""; if (stats[i].m_notFound.empty()) - { - os << ": OK" << endl; continue; - } - os << ": WARNING" << endl; + os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\":" << endl; for (auto const & j : stats[i].m_notFound) os << "Not found: " << DebugPrint(samples[i].m_results[j]) << endl; } diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index b22b241e3b..244688d890 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -1,41 +1,69 @@ #!/usr/bin/env python3 from math import exp, log +from scipy.stats import pearsonr from sklearn import cross_validation, grid_search, svm import argparse import collections import itertools +import matplotlib.pyplot as plt import numpy as np import pandas as pd +import random import sys -FEATURES = ['DistanceToPivot', 'Rank', 'NameScore', 'NameCoverage', 'SearchType'] -MAX_DISTANCE_METERS = 2e7 +MAX_DISTANCE_METERS = 2e6 MAX_RANK = 255 RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3} NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match'] -SEARCH_TYPES = {'POI': 0, - 'BUILDING': 0, - 'STREET': 1, - 'UNCLASSIFIED': 2, - 'VILLAGE': 3, - 'CITY': 4, - 'STATE': 5, - 'COUNTRY': 6} +SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country'] + +FEATURES = ['DistanceToPivot', 'Rank'] + NAME_SCORES + SEARCH_TYPES + + +def transform_name_score(value, categories_match): + if categories_match == 1: + return 'Zero' + elif value == 'Full Match Prefix': + return 'Full Match' + else: + return value def normalize_data(data): - transform_distance = lambda d: 1 - min(d, MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS - - max_name_score = len(NAME_SCORES) - 1 - max_search_type = SEARCH_TYPES['COUNTRY'] + transform_distance = lambda v: min(v, 
MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS data['DistanceToPivot'] = data['DistanceToPivot'].apply(transform_distance) - data['Rank'] = data['Rank'].apply(lambda rank: rank / MAX_RANK) - data['NameScore'] = data['NameScore'].apply(lambda s: NAME_SCORES.index(s) / max_name_score) - data['SearchType'] = data['SearchType'].apply(lambda t: SEARCH_TYPES[t] / max_search_type) - data['Relevance'] = data['Relevance'].apply(lambda r: RELEVANCES[r]) + data['Rank'] = data['Rank'].apply(lambda v: v / MAX_RANK) + data['Relevance'] = data['Relevance'].apply(lambda v: RELEVANCES[v]) + + cats = data['MatchByTrueCats'].combine(data['MatchByFalseCats'], max) + + # Full prefix match is unified with a full match as these features + # are collinear. But we need both of them as they're also used in + # locality sorting. + # + # TODO (@y, @m): do forward/backward/subset selection of features + # instead of this merging. It would be great to conduct PCA on + # the features too. + data['NameScore'] = data['NameScore'].combine(cats, transform_name_score) + + data['NameCoverage'] = data['NameCoverage'].combine(cats, lambda v, c: v if c == 0 else 0.0) + + # Adds dummy variables to data for NAME_SCORES. + for ns in NAME_SCORES: + data[ns] = data['NameScore'].apply(lambda v: int(ns == v)) + + # Adds dummy variables to data for SEARCH_TYPES. + + # We unify BUILDING with POI here, as we don't have enough + # training data to distinguish between them. Remove following + # line as soon as the model will be changed or we will have enough + # training data. + data['SearchType'] = data['SearchType'].apply(lambda v: v if v != 'Building' else 'POI') + for st in SEARCH_TYPES: + data[st] = data['SearchType'].apply(lambda v: int(st == v)) def compute_ndcg(relevances): @@ -44,25 +72,12 @@ def compute_ndcg(relevances): array of scores. 
""" - relevances_summary = collections.defaultdict(int) - - dcg = 0 - for i, relevance in enumerate(relevances): - dcg += relevance / log(2 + i, 2) - relevances_summary[relevance] += 1 - - dcg_norm, i = 0, 0 - for relevance in sorted(relevances_summary.keys(), reverse=True): - for _ in range(relevances_summary[relevance]): - dcg_norm += relevance / log(2 + i, 2) - i += 1 - - if dcg_norm == 0: - return 0 - return dcg / dcg_norm + dcg = sum(r / log(2 + i, 2) for i, r in enumerate(relevances)) + dcg_norm = sum(r / log(2 + i, 2) for i, r in enumerate(sorted(relevances, reverse=True))) + return dcg / dcg_norm if dcg_norm != 0 else 0 -def compute_ndcg_without_w(data): +def compute_ndcgs_without_ws(data): """ Computes NDCG (Normalized Discounted Cumulative Gain) for a given data. Returns an array of ndcg scores in the shape [num groups of @@ -77,17 +92,17 @@ def compute_ndcg_without_w(data): relevances = np.array(data.ix[indices]['Relevance']) ndcgs.append(compute_ndcg(relevances)) - return np.array(ndcgs) + return ndcgs -def compute_ndcg_for_w(data, w): +def compute_ndcgs_for_ws(data, ws): """ Computes NDCG (Normalized Discounted Cumulative Gain) for a given data and an array of coeffs in a linear model. Returns an array of ndcg scores in the shape [num groups of features]. 
""" - data_scores = np.array([np.dot(data.ix[i][FEATURES], w) for i in data.index]) + data_scores = np.array([np.dot(data.ix[i][FEATURES], ws) for i in data.index]) grouped = data.groupby(data['SampleId'], sort=False).groups ndcgs = [] @@ -101,7 +116,7 @@ def compute_ndcg_for_w(data, w): relevances = relevances[scores.argsort()[::-1]] ndcgs.append(compute_ndcg(relevances)) - return np.array(ndcgs) + return ndcgs def transform_data(data): @@ -150,36 +165,144 @@ def transform_data(data): return xs, ys +def plot_diagrams(xs, ys, features): + """ + For each feature, plots histograms of x * sign(y), where x is a + slice on the feature of a list of pairwise differences between + input feature-vectors and y is a list of pairwise differences + between relevances of the input feature-vectors. Strong bias + toward positive or negative values in histograms indicates that + the current feature is important for ranking, as there is a + correlation between difference between features values and + relevancy. + """ + for i, f in enumerate(features): + x = [x[i] * np.sign(y) for x, y in zip(xs, ys)] + + l, r = min(x), max(x) + d = max(abs(l), abs(r)) + + plt.subplot(4, 4, i + 1) + plt.hist(x, bins=8, range=(-d, d)) + plt.title(f) + plt.show() + + +def show_pearson_statistics(xs, ys, features): + """ + Shows info about Pearson coefficient between features and + relevancy. + """ + + print('***** Correlation table *****') + print('H0 - feature not is correlated with relevancy') + print('H1 - feature is correlated with relevancy') + print() + + cs, ncs = [], [] + for i, f in enumerate(features): + zs = [x[i] for x in xs] + (c, p) = pearsonr(zs, ys) + + correlated = p < 0.05 + print('{}: pearson={:.3f}, P(H1)={}'.format(f, c, 1 - p)) + if correlated: + cs.append(f) + else: + ncs.append(f) + + print() + print('Correlated:', cs) + print('Non-correlated:', ncs) + + +def raw_output(features, ws): + """ + Prints feature-coeff pairs to the standard output. 
+ """ + + for f, w in zip(features, ws): + print('{}: {}'.format(f, w)) + + +def print_const(name, value): + print('double const k{} = {:.7f};'.format(name, value)) + + +def print_array(name, size, values): + print('double const {}[{}] = {{'.format(name, size)) + print(',\n'.join(' {:.7f} /* {} */'.format(w, f) for (f, w) in values)) + print('};') + +def cpp_output(features, ws): + """ + Prints feature-coeff pairs in the C++-compatible format. + """ + + ns, st = [], [] + + for f, w in zip(features, ws): + if f in NAME_SCORES: + ns.append((f, w)) + elif f in SEARCH_TYPES: + st.append((f, w)) + else: + print_const(f, w) + print_array('kNameScore', 'NameScore::NAME_SCORE_COUNT', ns) + print_array('kSearchType', 'SearchModel::SEARCH_TYPE_COUNT', st) + + def main(args): data = pd.read_csv(sys.stdin) normalize_data(data) - ndcg = compute_ndcg_without_w(data); - print('Current NDCG: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg))) + ndcgs = compute_ndcgs_without_ws(data); + print('Current NDCG: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs))) print() - x, y = transform_data(data) + xs, ys = transform_data(data) + + if args.plot: + plot_diagrams(xs, ys, FEATURES) clf = svm.LinearSVC(random_state=args.seed) - cv = cross_validation.KFold(len(y), n_folds=5, shuffle=True, random_state=args.seed) + cv = cross_validation.KFold(len(ys), n_folds=5, shuffle=True, random_state=args.seed) # "C" stands for the regularizer constant. grid = {'C': np.power(10.0, np.arange(-5, 6))} gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv) - gs.fit(x, y) + gs.fit(xs, ys) - w = gs.best_estimator_.coef_[0] - ndcg = compute_ndcg_for_w(data, w) + ws = gs.best_estimator_.coef_[0] + max_w = max(abs(w) for w in ws) + ws = np.divide(ws, max_w) + + # Following code restores coeffs for merged features. 
+ ws[FEATURES.index('Building')] = ws[FEATURES.index('POI')] + ws[FEATURES.index('Full Match Prefix')] = ws[FEATURES.index('Full Match')] + + ndcgs = compute_ndcgs_for_ws(data, ws) + + print('NDCG mean: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs))) + print('Accuracy: {}'.format(gs.best_score_)) + + if args.pearson: + print() + show_pearson_statistics(xs, ys, FEATURES) - print('NDCG mean: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg))) print() - print('Linear model weights:') - for f, c in zip(FEATURES, w): - print('{}: {}'.format(f, c)) + print('***** Linear model weights *****') + if args.cpp: + cpp_output(FEATURES, ws) + else: + raw_output(FEATURES, ws) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--seed', help='random seed', type=int) + parser.add_argument('--plot', help='plot diagrams', action='store_true') + parser.add_argument('--pearson', help='show pearson statistics', action='store_true') + parser.add_argument('--cpp', help='generate output in the C++ format', action='store_true') args = parser.parse_args() main(args) diff --git a/search/search_query.cpp b/search/search_query.cpp index 7ebce8aee8..cfbf486c89 100644 --- a/search/search_query.cpp +++ b/search/search_query.cpp @@ -12,7 +12,6 @@ #include "search/v2/pre_ranking_info.hpp" #include "search/v2/ranking_info.hpp" #include "search/v2/ranking_utils.hpp" -#include "search/v2/token_slice.hpp" #include "storage/country_info_getter.hpp" #include "storage/index.hpp" @@ -422,29 +421,18 @@ int Query::GetCategoryLocales(int8_t (&arr) [3]) const } template -void Query::ForEachCategoryTypes(ToDo toDo) const +void Query::ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const { int8_t arrLocales[3]; int const localesCount = GetCategoryLocales(arrLocales); - size_t const tokensCount = m_tokens.size(); - for (size_t i = 0; i < tokensCount; ++i) + for (size_t i = 0; i < slice.Size(); ++i) { - auto token = RemoveHashtag(m_tokens[i]); - + auto token = 
RemoveHashtag(slice.Get(i)); for (int j = 0; j < localesCount; ++j) m_categories.ForEachTypeByName(arrLocales[j], token, bind(ref(toDo), i, _1)); ProcessEmojiIfNeeded(token, i, toDo); } - - if (!m_prefix.empty()) - { - auto prefix = RemoveHashtag(m_prefix); - - for (int j = 0; j < localesCount; ++j) - m_categories.ForEachTypeByName(arrLocales[j], prefix, bind(ref(toDo), tokensCount, _1)); - ProcessEmojiIfNeeded(prefix, tokensCount, toDo); - } } template @@ -522,10 +510,11 @@ void Query::SetQuery(string const & query) // get preffered types to show in results m_prefferedTypes.clear(); - ForEachCategoryTypes([&] (size_t, uint32_t t) - { - m_prefferedTypes.insert(t); - }); + ForEachCategoryTypes(v2::QuerySliceOnRawStrings(m_tokens, m_prefix), + [&](size_t, uint32_t t) + { + m_prefferedTypes.insert(t); + }); } void Query::FlushViewportResults(v2::Geocoder::Params const & params, Results & res, @@ -660,7 +649,6 @@ class PreResult2Maker info.m_distanceToPivot = MercatorBounds::DistanceOnEarth(center, pivot); info.m_rank = preInfo.m_rank; info.m_searchType = preInfo.m_searchType; - info.m_nameScore = v2::NAME_SCORE_ZERO; v2::TokenSlice slice(m_params, preInfo.m_startToken, preInfo.m_endToken); @@ -681,6 +669,26 @@ class PreResult2Maker if (info.m_searchType == v2::SearchModel::SEARCH_TYPE_BUILDING) UpdateNameScore(ft.GetHouseNumber(), sliceNoCategories, info.m_nameScore); + + feature::TypesHolder holder(ft); + vector> matched(slice.Size()); + m_query.ForEachCategoryTypes(v2::QuerySliceOnTokens(slice), [&](size_t i, uint32_t t) + { + ++matched[i].second; + if (holder.Has(t)) + ++matched[i].first; + }); + + info.m_matchByTrueCats = + all_of(matched.begin(), matched.end(), [](pair const & m) + { + return m.first != 0; + }); + info.m_matchByFalseCats = + all_of(matched.begin(), matched.end(), [](pair const & m) + { + return m.first == 0 && m.second != 0; + }); } uint8_t NormalizeRank(uint8_t rank, v2::SearchModel::SearchType type, m2::PointD const & center, @@ -1259,7 +1267,8 
@@ void Query::InitParams(bool localitySearch, SearchQueryParams & params) } } }; - ForEachCategoryTypes(addSyms); + ForEachCategoryTypes(v2::QuerySliceOnRawStrings(m_tokens, m_prefix), + addSyms); } for (auto & tokens : params.m_tokens) diff --git a/search/search_query.hpp b/search/search_query.hpp index 8aca5e591d..a35732d43f 100644 --- a/search/search_query.hpp +++ b/search/search_query.hpp @@ -7,6 +7,7 @@ #include "search/suggest.hpp" #include "search/v2/geocoder.hpp" #include "search/v2/rank_table_cache.hpp" +#include "search/v2/token_slice.hpp" #include "indexer/ftypes_matcher.hpp" #include "indexer/index.hpp" @@ -145,7 +146,8 @@ protected: void ClearResults(); int GetCategoryLocales(int8_t (&arr) [3]) const; - template void ForEachCategoryTypes(ToDo toDo) const; + template + void ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const; template void ProcessEmojiIfNeeded( strings::UniString const & token, size_t ind, ToDo & toDo) const; diff --git a/search/search_tests/ranking_tests.cpp b/search/search_tests/ranking_tests.cpp index cb82359f11..61f966848c 100644 --- a/search/search_tests/ranking_tests.cpp +++ b/search/search_tests/ranking_tests.cpp @@ -44,5 +44,6 @@ UNIT_TEST(NameTest_Smoke) TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", 2, 3), NAME_SCORE_FULL_MATCH, ()); TEST_EQUAL(GetScore("San Francisco", "Fran", 0, 1), NAME_SCORE_SUBSTRING_PREFIX, ()); TEST_EQUAL(GetScore("San Francisco", "Fran ", 0, 1), NAME_SCORE_ZERO, ()); + TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", 0, 1), NAME_SCORE_FULL_MATCH_PREFIX, ()); } } // namespace diff --git a/search/v2/geocoder.cpp b/search/v2/geocoder.cpp index 2dc918a4e9..be83577b74 100644 --- a/search/v2/geocoder.cpp +++ b/search/v2/geocoder.cpp @@ -73,12 +73,6 @@ size_t constexpr kLocalityRectsCacheSize = 10; strings::UniString const kUniSpace(strings::MakeUniString(" ")); -template -struct Id -{ - T const & operator()(T const & t) const { return t; } -}; - struct ScopedMarkTokens { 
ScopedMarkTokens(vector & usedTokens, size_t from, size_t to) @@ -1563,12 +1557,12 @@ SearchModel::SearchType Geocoder::GetSearchTypeInGeocoding(uint32_t featureId) bool Geocoder::AllTokensUsed() const { - return all_of(m_usedTokens.begin(), m_usedTokens.end(), Id()); + return all_of(m_usedTokens.begin(), m_usedTokens.end(), my::Id()); } bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const { - return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, Id()); + return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, my::Id()); } size_t Geocoder::NumUnusedTokensGroups() const diff --git a/search/v2/ranking_info.cpp b/search/v2/ranking_info.cpp index 6c1765b766..1a4cbb42bd 100644 --- a/search/v2/ranking_info.cpp +++ b/search/v2/ranking_info.cpp @@ -12,20 +12,34 @@ namespace { // See search/search_quality/scoring_model.py for details. In short, // these coeffs correspond to coeffs in a linear model. -double const kDistanceToPivot = 0.19933969103335503; -double const kRank = 3.528698483480807; -double const kNameScore = 1.0050524496846687; -double const kNameCoverage = 0.33989660511789926; -double const kSearchType = 1.1949307125113533; +double const kDistanceToPivot = -1.0000000; +double const kRank = 0.5430747; +double const kNameScore[NameScore::NAME_SCORE_COUNT] = { + -0.3686323 /* Zero */, + 0.0977193 /* Substring Prefix */, + 0.1340500 /* Substring */, + 0.1368631 /* Full Match Prefix */, + 0.1368631 /* Full Match */ +}; +double const kSearchType[SearchModel::SEARCH_TYPE_COUNT] = { + -0.9195533 /* POI */, + -0.9195533 /* Building */, + -0.1470504 /* Street */, + -0.6392620 /* Unclassified */, + -0.0900970 /* Village */, + 0.4383605 /* City */, + 0.6296097 /* State */, + 0.7279924 /* Country */ +}; double TransformDistance(double distance) { - return 1.0 - min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters; + return min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters; } } // namespace 
// static -double const RankingInfo::kMaxDistMeters = 2e7; +double const RankingInfo::kMaxDistMeters = 2e6; // static void RankingInfo::PrintCSVHeader(ostream & os) @@ -34,7 +48,9 @@ void RankingInfo::PrintCSVHeader(ostream & os) << ",Rank" << ",NameScore" << ",NameCoverage" - << ",SearchType"; + << ",SearchType" + << ",MatchByTrueCats" + << ",MatchByFalseCats"; } string DebugPrint(RankingInfo const & info) @@ -45,7 +61,9 @@ string DebugPrint(RankingInfo const & info) os << "m_rank:" << static_cast(info.m_rank) << ","; os << "m_nameScore:" << DebugPrint(info.m_nameScore) << ","; os << "m_nameCoverage:" << info.m_nameCoverage << ","; - os << "m_searchType:" << DebugPrint(info.m_searchType); + os << "m_searchType:" << DebugPrint(info.m_searchType) << ","; + os << "m_matchByTrueCats:" << info.m_matchByTrueCats << ","; + os << "m_matchByFalseCats:" << info.m_matchByFalseCats; os << "]"; return os.str(); } @@ -54,7 +72,8 @@ void RankingInfo::ToCSV(ostream & os) const { os << fixed; os << m_distanceToPivot << "," << static_cast(m_rank) << "," << DebugPrint(m_nameScore) - << "," << m_nameCoverage << "," << DebugPrint(m_searchType); + << "," << m_nameCoverage << "," << DebugPrint(m_searchType) << "," << m_matchByTrueCats << "," + << m_matchByFalseCats; } double RankingInfo::GetLinearModelRank() const @@ -65,24 +84,17 @@ double RankingInfo::GetLinearModelRank() const // integrated in the build system. 
double const distanceToPivot = TransformDistance(m_distanceToPivot); double const rank = static_cast(m_rank) / numeric_limits::max(); - double const nameScore = static_cast(m_nameScore) / NAME_SCORE_FULL_MATCH; - double const nameCoverage = m_nameCoverage; - double searchType; - switch (m_searchType) + auto nameScore = m_nameScore; + auto nameCoverage = m_nameCoverage; + if (m_matchByTrueCats || m_matchByFalseCats) { - case SearchModel::SEARCH_TYPE_POI: - case SearchModel::SEARCH_TYPE_BUILDING: - searchType = 0; - break; - default: - searchType = m_searchType - 1; - break; + nameScore = NAME_SCORE_ZERO; + nameCoverage = 0.0; } - searchType = searchType / (SearchModel::SEARCH_TYPE_COUNTRY - 1); - return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore * nameScore + - kNameCoverage * nameCoverage + kSearchType * searchType; + return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] + + kSearchType[m_searchType]; } } // namespace v2 } // namespace search diff --git a/search/v2/ranking_info.hpp b/search/v2/ranking_info.hpp index 7083fd193f..23d5b000c9 100644 --- a/search/v2/ranking_info.hpp +++ b/search/v2/ranking_info.hpp @@ -30,6 +30,14 @@ struct RankingInfo // Search type for the feature. SearchModel::SearchType m_searchType = SearchModel::SEARCH_TYPE_COUNT; + // True if the feature was matched only by tokens corresponding to + // its categories. + bool m_matchByTrueCats = false; + + // True if the feature was matched only by tokens not + // corresponding to its categories. 
+ bool m_matchByFalseCats = false; + static void PrintCSVHeader(ostream & os); void ToCSV(ostream & os) const; diff --git a/search/v2/search_model.cpp b/search/v2/search_model.cpp index 06553fff72..d76895fca4 100644 --- a/search/v2/search_model.cpp +++ b/search/v2/search_model.cpp @@ -130,14 +130,14 @@ string DebugPrint(SearchModel::SearchType type) switch (type) { case SearchModel::SEARCH_TYPE_POI: return "POI"; - case SearchModel::SEARCH_TYPE_BUILDING: return "BUILDING"; - case SearchModel::SEARCH_TYPE_STREET: return "STREET"; - case SearchModel::SEARCH_TYPE_CITY: return "CITY"; - case SearchModel::SEARCH_TYPE_VILLAGE: return "VILLAGE"; - case SearchModel::SEARCH_TYPE_STATE: return "STATE"; - case SearchModel::SEARCH_TYPE_COUNTRY: return "COUNTRY"; - case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "UNCLASSIFIED"; - case SearchModel::SEARCH_TYPE_COUNT: return "COUNT"; + case SearchModel::SEARCH_TYPE_BUILDING: return "Building"; + case SearchModel::SEARCH_TYPE_STREET: return "Street"; + case SearchModel::SEARCH_TYPE_CITY: return "City"; + case SearchModel::SEARCH_TYPE_VILLAGE: return "Village"; + case SearchModel::SEARCH_TYPE_STATE: return "State"; + case SearchModel::SEARCH_TYPE_COUNTRY: return "Country"; + case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "Unclassified"; + case SearchModel::SEARCH_TYPE_COUNT: return "Count"; } ASSERT(false, ("Unknown search type:", static_cast(type))); return string(); diff --git a/search/v2/token_slice.hpp b/search/v2/token_slice.hpp index c8cc856d8c..9ef8c12659 100644 --- a/search/v2/token_slice.hpp +++ b/search/v2/token_slice.hpp @@ -67,6 +67,63 @@ private: vector m_indexes; }; +class QuerySlice +{ +public: + using TString = SearchQueryParams::TString; + + virtual ~QuerySlice() = default; + + virtual TString const & Get(size_t i) const = 0; + virtual size_t Size() const = 0; + virtual bool IsPrefix(size_t i) const = 0; + + bool Empty() const { return Size() == 0; } +}; + +class QuerySliceOnTokens : public QuerySlice +{ 
+public: + QuerySliceOnTokens(TokenSlice const & slice) : m_slice(slice) {} + + // QuerySlice overrides: + SearchQueryParams::TString const & Get(size_t i) const override { return m_slice.Get(i).front(); } + size_t Size() const override { return m_slice.Size(); } + bool IsPrefix(size_t i) const override { return m_slice.IsPrefix(i); } + +private: + TokenSlice const m_slice; +}; + +template +class QuerySliceOnRawStrings : public QuerySlice +{ +public: + QuerySliceOnRawStrings(TCont const & tokens, TString const & prefix) + : m_tokens(tokens), m_prefix(prefix) + { + } + + // QuerySlice overrides: + SearchQueryParams::TString const & Get(size_t i) const override + { + ASSERT_LESS(i, Size(), ()); + return i == m_tokens.size() ? m_prefix : m_tokens[i]; + } + + size_t Size() const override { return m_tokens.size() + (m_prefix.empty() ? 0 : 1); } + + bool IsPrefix(size_t i) const override + { + ASSERT_LESS(i, Size(), ()); + return i == m_tokens.size(); + } + + private: + TCont const & m_tokens; + TString const & m_prefix; +}; + string DebugPrint(TokenSlice const & slice); string DebugPrint(TokenSliceNoCategories const & slice);