diff --git a/search/search_integration_tests/search_query_v2_test.cpp b/search/search_integration_tests/search_query_v2_test.cpp
index f010e2135a..3d2b498057 100644
--- a/search/search_integration_tests/search_query_v2_test.cpp
+++ b/search/search_integration_tests/search_query_v2_test.cpp
@@ -30,16 +30,27 @@ namespace search
 {
 namespace
 {
-void MakeDefaultTestParams(string const & query, SearchParams & params)
-{
-  params.m_query = query;
-  params.m_inputLocale = "en";
-  params.SetMode(Mode::Everywhere);
-  params.SetSuggestsEnabled(false);
-}
-
 class SearchQueryV2Test : public SearchTest
 {
+public:
+  unique_ptr<TestSearchRequest> MakeRequest(string const & query)
+  {
+    SearchParams params;
+    params.m_query = query;
+    params.m_inputLocale = "en";
+    params.SetMode(Mode::Everywhere);
+    params.SetSuggestsEnabled(false);
+
+    auto request = make_unique<TestSearchRequest>(m_engine, params, m_viewport);
+    request->Wait();
+    return request;
+  }
+
+  bool MatchResults(vector<shared_ptr<MatchingRule>> rules,
+                    vector<search::Result> const & actual) const
+  {
+    return ::MatchResults(m_engine, rules, actual);
+  }
 };
 
 UNIT_CLASS_TEST(SearchQueryV2Test, Smoke)
@@ -271,7 +282,7 @@ UNIT_CLASS_TEST(SearchQueryV2Test, DisableSuggests)
     request.Wait();
 
     TRules rules = {ExactMatch(worldId, london1), ExactMatch(worldId, london2)};
-    TEST(MatchResults(m_engine, rules, request.Results()), ());
+    TEST(MatchResults(rules, request.Results()), ());
   }
 }
 
@@ -321,41 +332,33 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo)
   SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
 
   {
-    SearchParams params;
-    MakeDefaultTestParams("golden gate bridge ", params);
-
-    TestSearchRequest request(m_engine, params, m_viewport);
-    request.Wait();
+    auto request = MakeRequest("golden gate bridge ");
 
     TRules rules = {ExactMatch(wonderlandId, goldenGateBridge),
                     ExactMatch(wonderlandId, goldenGateStreet)};
 
-    TEST(MatchResults(m_engine, rules, request.Results()), ());
-    for (auto const & result : request.Results())
+    TEST(MatchResults(rules, request->Results()), ());
+    for (auto const & result : request->Results())
     {
       auto const & info = result.GetRankingInfo();
       TEST_EQUAL(NAME_SCORE_FULL_MATCH, info.m_nameScore, (result));
-      TEST(my::AlmostEqualAbs(1.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
+      TEST(!info.m_pureCats, (result));
+      TEST(!info.m_falseCats, (result));
    }
  }
 
   // This test is quite important and must always pass.
   {
-    SearchParams params;
-    MakeDefaultTestParams("cafe лермонтов", params);
-
-    TestSearchRequest request(m_engine, params, m_viewport);
-    request.Wait();
-
-    auto const & results = request.Results();
+    auto request = MakeRequest("cafe лермонтов");
+    auto const & results = request->Results();
 
     TRules rules{ExactMatch(wonderlandId, cafe1), ExactMatch(wonderlandId, cafe2),
                  ExactMatch(wonderlandId, lermontov)};
-    TEST(MatchResults(m_engine, rules, results), ());
+    TEST(MatchResults(rules, results), ());
     TEST_EQUAL(3, results.size(), ("Unexpected number of retrieved cafes."));
 
     auto const & top = results.front();
-    TEST(MatchResults(m_engine, {ExactMatch(wonderlandId, lermontov)}, {top}), ());
+    TEST(MatchResults({ExactMatch(wonderlandId, lermontov)}, {top}), ());
  }
 
   {
@@ -471,6 +474,9 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
 
   TestPOI named(m2::PointD(0.0001, 0.0001), "ATM", "en");
   named.SetTypes({{"amenity", "atm"}});
 
+  TestPOI busStop(m2::PointD(0.00005, 0.0005), "ATM Bus Stop", "en");
+  busStop.SetTypes({{"highway", "bus_stop"}});
+
   BuildWorld([&](TestMwmBuilder & builder)
              {
               builder.Add(sanFrancisco);
@@ -479,31 +485,51 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
                    {
                      builder.Add(named);
                      builder.Add(noname);
+                     builder.Add(busStop);
                    });
 
   SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
 
-  TRules rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};
-
-  TEST(ResultsMatch("atm", rules), ());
   {
-    SearchParams params;
-    MakeDefaultTestParams("#atm", params);
+    TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named),
+                          ExactMatch(wonderlandId, busStop)};
 
-    TestSearchRequest request(m_engine, params, m_viewport);
-    request.Wait();
+    auto request = MakeRequest("atm");
+    TEST(MatchResults(rules, request->Results()), ());
+    for (auto const & result : request->Results())
+    {
+      Index::FeaturesLoaderGuard loader(m_engine, wonderlandId);
+      FeatureType ft;
+      loader.GetFeatureByIndex(result.GetFeatureID().m_index, ft);
 
-    TEST(MatchResults(m_engine, rules, request.Results()), ());
-    for (auto const & result : request.Results())
+      auto const & info = result.GetRankingInfo();
+
+      if (busStop.Matches(ft))
+      {
+        TEST(!info.m_pureCats, (result));
+        TEST(info.m_falseCats, (result));
+      }
+      else
+      {
+        TEST(info.m_pureCats, (result));
+        TEST(!info.m_falseCats, (result));
+      }
+    }
+  }
+
+  {
+    TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};
+
+    auto request = MakeRequest("#atm");
+
+    TEST(MatchResults(rules, request->Results()), ());
+    for (auto const & result : request->Results())
     {
       auto const & info = result.GetRankingInfo();
 
       // Token with a hashtag should not participate in name-score
       // calculations.
       TEST_EQUAL(NAME_SCORE_ZERO, info.m_nameScore, (result));
-
-      // TODO (@y): fix this. Name coverage calculations are flawed.
-      // TEST(my::AlmostEqualAbs(0.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
    }
  }
diff --git a/search/search_quality/download-maps.sh b/search/search_quality/download-maps.sh
new file mode 100755
index 0000000000..c41ac333eb
--- /dev/null
+++ b/search/search_quality/download-maps.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+# Downloads all maps necessary for learning to rank to the current
+# directory.
+
+ALL=
+VERSION=
+BASE="http://direct.mapswithme.com/direct"
+
+display_usage() {
+  echo "Usage: $0 -v [version] -a -h"
+  echo "  -v  version of maps to download"
+  echo "  -a  download all maps of the specified version"
+  echo "  -h  display this message"
+}
+
+while getopts ":av:h" opt
+do
+  case "$opt" in
+    a) ALL=1
+       ;;
+    v) VERSION="$OPTARG"
+       ;;
+    h) display_usage
+       exit -1
+       ;;
+    \?) echo "Invalid option: -$OPTARG" 1>&2
+        ;;
+    :) echo "Option -$OPTARG requires an argument" 1>&2
+       ;;
+  esac
+done
+
+if [ -z "$VERSION" ]
+then
+  echo "Version of maps is not specified." 1>&2
+  exit -1
+fi
+
+if ! curl "$BASE/" 2>/dev/null |
+    sed -n 's/^.*href="\(.*\)\/".*$/\1/p' |
+    grep -v "^../$" | grep -q "$VERSION"
+then
+  echo "Invalid version: $VERSION" 1>&2
+  exit -1
+fi
+
+NAMES=("Australia_Brisbane.mwm"
+       "Belarus_Minsk*.mwm"
+       "Germany_*.mwm"
+       "Russia_*.mwm"
+       "UK_England_*.mwm"
+       "US_California_*.mwm"
+       "US_Maryland_*.mwm")
+
+DIR="$BASE/$VERSION"
+
+if [ "$ALL" ]
+then
+  echo "Downloading all maps..."
+
+  files=$(curl "$DIR/" 2>/dev/null | sed -n 's/^.*href="\(.*\.mwm\)".*$/\1/p')
+
+  set -e
+  set -x
+  for file in $files
+  do
+    wget -np -nd "$DIR/$file"
+  done
+else
+  echo "Downloading maps..."
+
+  set -e
+  set -x
+  for name in ${NAMES[@]}
+  do
+    wget -r -np -nd -A "$name" "$DIR/"
+  done
+fi
diff --git a/search/search_quality/features_collector_tool/features_collector_tool.cpp b/search/search_quality/features_collector_tool/features_collector_tool.cpp
index d844d94ebd..bf33021ab6 100644
--- a/search/search_quality/features_collector_tool/features_collector_tool.cpp
+++ b/search/search_quality/features_collector_tool/features_collector_tool.cpp
@@ -141,15 +141,26 @@ void DisplayStats(ostream & os, vector const & samples, vector co
 {
   auto const n = samples.size();
   ASSERT_EQUAL(stats.size(), n, ());
+
+  size_t numWarnings = 0;
+  for (auto const & stat : stats)
+  {
+    if (!stat.m_notFound.empty())
+      ++numWarnings;
+  }
+
+  if (numWarnings == 0)
+  {
+    os << "All " << stats.size() << " queries are OK." << endl;
+    return;
+  }
+
+  os << numWarnings << " warnings." << endl;
   for (size_t i = 0; i < n; ++i)
   {
-    os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\"";
     if (stats[i].m_notFound.empty())
-    {
-      os << ": OK" << endl;
       continue;
-    }
-    os << ": WARNING" << endl;
+    os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\":" << endl;
     for (auto const & j : stats[i].m_notFound)
       os << "Not found: " << DebugPrint(samples[i].m_results[j]) << endl;
  }
diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py
index b22b241e3b..2409e3d715 100755
--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@@ -1,41 +1,67 @@
 #!/usr/bin/env python3
 
 from math import exp, log
+from scipy.stats import pearsonr
 from sklearn import cross_validation, grid_search, svm
 import argparse
 import collections
 import itertools
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import random
 import sys
 
-FEATURES = ['DistanceToPivot', 'Rank', 'NameScore', 'NameCoverage', 'SearchType']
-MAX_DISTANCE_METERS = 2e7
+MAX_DISTANCE_METERS = 2e6
 MAX_RANK = 255
 RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
 NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match']
-SEARCH_TYPES = {'POI': 0,
-                'BUILDING': 0,
-                'STREET': 1,
-                'UNCLASSIFIED': 2,
-                'VILLAGE': 3,
-                'CITY': 4,
-                'STATE': 5,
-                'COUNTRY': 6}
+SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
+
+FEATURES = ['DistanceToPivot', 'Rank'] + NAME_SCORES + SEARCH_TYPES
+
+
+def transform_name_score(value, categories_match):
+    if categories_match == 1:
+        return 'Zero'
+    elif value == 'Full Match Prefix':
+        return 'Full Match'
+    else:
+        return value
 
 
 def normalize_data(data):
-    transform_distance = lambda d: 1 - min(d, MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS
-
-    max_name_score = len(NAME_SCORES) - 1
-    max_search_type = SEARCH_TYPES['COUNTRY']
+    transform_distance = lambda v: min(v, MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS
 
     data['DistanceToPivot'] = data['DistanceToPivot'].apply(transform_distance)
-    data['Rank'] = data['Rank'].apply(lambda rank: rank / MAX_RANK)
-    data['NameScore'] = data['NameScore'].apply(lambda s: NAME_SCORES.index(s) / max_name_score)
-    data['SearchType'] = data['SearchType'].apply(lambda t: SEARCH_TYPES[t] / max_search_type)
-    data['Relevance'] = data['Relevance'].apply(lambda r: RELEVANCES[r])
+    data['Rank'] = data['Rank'].apply(lambda v: v / MAX_RANK)
+    data['Relevance'] = data['Relevance'].apply(lambda v: RELEVANCES[v])
+
+    cats = data['PureCats'].combine(data['FalseCats'], max)
+
+    # Full prefix match is unified with a full match as these features
+    # are collinear. But we need both of them as they're also used in
+    # locality sorting.
+    #
+    # TODO (@y, @m): do forward/backward/subset selection of features
+    # instead of this merging. It would be great to conduct PCA on
+    # the features too.
+    data['NameScore'] = data['NameScore'].combine(cats, transform_name_score)
+
+    # Adds dummy variables to data for NAME_SCORES.
+    for ns in NAME_SCORES:
+        data[ns] = data['NameScore'].apply(lambda v: int(ns == v))
+
+    # Adds dummy variables to data for SEARCH_TYPES.
+
+    # We unify BUILDING with POI here, as we don't have enough
+    # training data to distinguish between them. Remove the following
+    # line as soon as the model is changed or we have enough
+    # training data.
+    data['SearchType'] = data['SearchType'].apply(lambda v: v if v != 'Building' else 'POI')
+    for st in SEARCH_TYPES:
+        data[st] = data['SearchType'].apply(lambda v: int(st == v))
 
 
 def compute_ndcg(relevances):
@@ -44,25 +70,12 @@ def compute_ndcg(relevances):
     array of scores.
     """
 
-    relevances_summary = collections.defaultdict(int)
-
-    dcg = 0
-    for i, relevance in enumerate(relevances):
-        dcg += relevance / log(2 + i, 2)
-        relevances_summary[relevance] += 1
-
-    dcg_norm, i = 0, 0
-    for relevance in sorted(relevances_summary.keys(), reverse=True):
-        for _ in range(relevances_summary[relevance]):
-            dcg_norm += relevance / log(2 + i, 2)
-            i += 1
-
-    if dcg_norm == 0:
-        return 0
-    return dcg / dcg_norm
+    dcg = sum(r / log(2 + i, 2) for i, r in enumerate(relevances))
+    dcg_norm = sum(r / log(2 + i, 2) for i, r in enumerate(sorted(relevances, reverse=True)))
+    return dcg / dcg_norm if dcg_norm != 0 else 0
 
 
-def compute_ndcg_without_w(data):
+def compute_ndcgs_without_ws(data):
     """
     Computes NDCG (Normalized Discounted Cumulative Gain) for a given
     data. Returns an array of ndcg scores in the shape [num groups of
@@ -77,17 +90,17 @@ def compute_ndcg_without_w(data):
         relevances = np.array(data.ix[indices]['Relevance'])
         ndcgs.append(compute_ndcg(relevances))
 
-    return np.array(ndcgs)
+    return ndcgs
 
 
-def compute_ndcg_for_w(data, w):
+def compute_ndcgs_for_ws(data, ws):
     """
     Computes NDCG (Normalized Discounted Cumulative Gain) for a given
     data and an array of coeffs in a linear model. Returns an array of
     ndcg scores in the shape [num groups of features].
     """
 
-    data_scores = np.array([np.dot(data.ix[i][FEATURES], w) for i in data.index])
+    data_scores = np.array([np.dot(data.ix[i][FEATURES], ws) for i in data.index])
     grouped = data.groupby(data['SampleId'], sort=False).groups
 
     ndcgs = []
@@ -101,7 +114,7 @@ def compute_ndcg_for_w(data, w):
         relevances = relevances[scores.argsort()[::-1]]
         ndcgs.append(compute_ndcg(relevances))
 
-    return np.array(ndcgs)
+    return ndcgs
 
 
 def transform_data(data):
@@ -150,36 +163,144 @@ def transform_data(data):
     return xs, ys
 
 
+def plot_diagrams(xs, ys, features):
+    """
+    For each feature, plots histograms of x * sign(y), where x is a
+    slice on the feature of a list of pairwise differences between
+    input feature-vectors and y is a list of pairwise differences
+    between relevances of the input feature-vectors. Strong bias
+    toward positive or negative values in a histogram indicates that
+    the feature is important for ranking, as there is a
+    correlation between the difference in feature values and
+    relevancy.
+    """
+    for i, f in enumerate(features):
+        x = [x[i] * np.sign(y) for x, y in zip(xs, ys)]
+
+        l, r = min(x), max(x)
+        d = max(abs(l), abs(r))
+
+        plt.subplot(4, 4, i + 1)
+        plt.hist(x, bins=8, range=(-d, d))
+        plt.title(f)
+    plt.show()
+
+
+def show_pearson_statistics(xs, ys, features):
+    """
+    Shows info about the Pearson coefficient between features and
+    relevancy.
+    """
+
+    print('***** Correlation table *****')
+    print('H0 - feature is not correlated with relevancy')
+    print('H1 - feature is correlated with relevancy')
+    print()
+
+    cs, ncs = [], []
+    for i, f in enumerate(features):
+        zs = [x[i] for x in xs]
+        (c, p) = pearsonr(zs, ys)
+
+        correlated = p < 0.05
+        print('{}: pearson={:.3f}, P(H1)={}'.format(f, c, 1 - p))
+        if correlated:
+            cs.append(f)
+        else:
+            ncs.append(f)
+
+    print()
+    print('Correlated:', cs)
+    print('Non-correlated:', ncs)
+
+
+def raw_output(features, ws):
+    """
+    Prints feature-coeff pairs to the standard output.
+    """
+
+    for f, w in zip(features, ws):
+        print('{}: {}'.format(f, w))
+
+
+def print_const(name, value):
+    print('double const k{} = {:.7f};'.format(name, value))
+
+
+def print_array(name, size, values):
+    print('double const {}[{}] = {{'.format(name, size))
+    print(',\n'.join('  {:.7f} /* {} */'.format(w, f) for (f, w) in values))
+    print('};')
+
+def cpp_output(features, ws):
+    """
+    Prints feature-coeff pairs in the C++-compatible format.
+    """
+
+    ns, st = [], []
+
+    for f, w in zip(features, ws):
+        if f in NAME_SCORES:
+            ns.append((f, w))
+        elif f in SEARCH_TYPES:
+            st.append((f, w))
+        else:
+            print_const(f, w)
+    print_array('kNameScore', 'NameScore::NAME_SCORE_COUNT', ns)
+    print_array('kSearchType', 'SearchModel::SEARCH_TYPE_COUNT', st)
+
+
 def main(args):
     data = pd.read_csv(sys.stdin)
     normalize_data(data)
 
-    ndcg = compute_ndcg_without_w(data);
-    print('Current NDCG: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg)))
+    ndcgs = compute_ndcgs_without_ws(data);
+    print('Current NDCG: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
     print()
 
-    x, y = transform_data(data)
+    xs, ys = transform_data(data)
+
+    if args.plot:
+        plot_diagrams(xs, ys, FEATURES)
 
     clf = svm.LinearSVC(random_state=args.seed)
-    cv = cross_validation.KFold(len(y), n_folds=5, shuffle=True, random_state=args.seed)
+    cv = cross_validation.KFold(len(ys), n_folds=5, shuffle=True, random_state=args.seed)
 
     # "C" stands for the regularizer constant.
     grid = {'C': np.power(10.0, np.arange(-5, 6))}
     gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
-    gs.fit(x, y)
+    gs.fit(xs, ys)
 
-    w = gs.best_estimator_.coef_[0]
-    ndcg = compute_ndcg_for_w(data, w)
+    ws = gs.best_estimator_.coef_[0]
+    max_w = max(abs(w) for w in ws)
+    ws = np.divide(ws, max_w)
+
+    # The following code restores coeffs for merged features.
+    ws[FEATURES.index('Building')] = ws[FEATURES.index('POI')]
+    ws[FEATURES.index('Full Match Prefix')] = ws[FEATURES.index('Full Match')]
+
+    ndcgs = compute_ndcgs_for_ws(data, ws)
+
+    print('NDCG mean: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
+    print('Accuracy: {}'.format(gs.best_score_))
+
+    if args.pearson:
+        print()
+        show_pearson_statistics(xs, ys, FEATURES)
 
-    print('NDCG mean: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg)))
     print()
-    print('Linear model weights:')
-    for f, c in zip(FEATURES, w):
-        print('{}: {}'.format(f, c))
+    print('***** Linear model weights *****')
+    if args.cpp:
+        cpp_output(FEATURES, ws)
+    else:
+        raw_output(FEATURES, ws)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--seed', help='random seed', type=int)
+    parser.add_argument('--plot', help='plot diagrams', action='store_true')
+    parser.add_argument('--pearson', help='show pearson statistics', action='store_true')
+    parser.add_argument('--cpp', help='generate output in the C++ format', action='store_true')
     args = parser.parse_args()
     main(args)
diff --git a/search/search_query.cpp b/search/search_query.cpp
index 7ebce8aee8..0f78a1c64f 100644
--- a/search/search_query.cpp
+++ b/search/search_query.cpp
@@ -12,7 +12,6 @@
 #include "search/v2/pre_ranking_info.hpp"
 #include "search/v2/ranking_info.hpp"
 #include "search/v2/ranking_utils.hpp"
-#include "search/v2/token_slice.hpp"
 
 #include "storage/country_info_getter.hpp"
 #include "storage/index.hpp"
@@ -191,20 +190,11 @@ void UpdateNameScore(string const & name, TSlice const & slice, v2::NameScore &
 
 template <typename TSlice>
 void UpdateNameScore(vector<strings::UniString> const & tokens, TSlice const & slice,
-                     v2::NameScore & bestScore, double & bestCoverage)
+                     v2::NameScore & bestScore)
 {
   auto const score = v2::GetNameScore(tokens, slice);
-  auto const coverage =
-      tokens.empty() ? 0 : static_cast<double>(slice.Size()) / static_cast<double>(tokens.size());
   if (score > bestScore)
-  {
     bestScore = score;
-    bestCoverage = coverage;
-  }
-  else if (score == bestScore && coverage > bestCoverage)
-  {
-    bestCoverage = coverage;
-  }
 }
 
 inline bool IsHashtagged(strings::UniString const & s) { return !s.empty() && s[0] == '#'; }
 
@@ -422,29 +412,18 @@ int Query::GetCategoryLocales(int8_t (&arr) [3]) const
 }
 
 template <typename ToDo>
-void Query::ForEachCategoryTypes(ToDo toDo) const
+void Query::ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const
 {
   int8_t arrLocales[3];
   int const localesCount = GetCategoryLocales(arrLocales);
 
-  size_t const tokensCount = m_tokens.size();
-  for (size_t i = 0; i < tokensCount; ++i)
+  for (size_t i = 0; i < slice.Size(); ++i)
   {
-    auto token = RemoveHashtag(m_tokens[i]);
-
+    auto token = RemoveHashtag(slice.Get(i));
     for (int j = 0; j < localesCount; ++j)
       m_categories.ForEachTypeByName(arrLocales[j], token, bind(ref(toDo), i, _1));
     ProcessEmojiIfNeeded(token, i, toDo);
   }
-
-  if (!m_prefix.empty())
-  {
-    auto prefix = RemoveHashtag(m_prefix);
-
-    for (int j = 0; j < localesCount; ++j)
-      m_categories.ForEachTypeByName(arrLocales[j], prefix, bind(ref(toDo), tokensCount, _1));
-    ProcessEmojiIfNeeded(prefix, tokensCount, toDo);
-  }
 }
 
 template <typename ToDo>
@@ -522,10 +501,11 @@ void Query::SetQuery(string const & query)
 
   // get preffered types to show in results
   m_prefferedTypes.clear();
-  ForEachCategoryTypes([&] (size_t, uint32_t t)
-  {
-    m_prefferedTypes.insert(t);
-  });
+  ForEachCategoryTypes(v2::QuerySliceOnRawStrings(m_tokens, m_prefix),
+                       [&](size_t, uint32_t t)
+                       {
+                         m_prefferedTypes.insert(t);
+                       });
 }
 
 void Query::FlushViewportResults(v2::Geocoder::Params const & params, Results & res,
@@ -660,7 +640,6 @@ class PreResult2Maker
     info.m_distanceToPivot = MercatorBounds::DistanceOnEarth(center, pivot);
     info.m_rank = preInfo.m_rank;
     info.m_searchType = preInfo.m_searchType;
-    info.m_nameScore = v2::NAME_SCORE_ZERO;
 
     v2::TokenSlice slice(m_params, preInfo.m_startToken, preInfo.m_endToken);
 
@@ -675,12 +654,30 @@ class PreResult2Maker
 
      vector<strings::UniString> tokens;
      SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
-      UpdateNameScore(tokens, slice, info.m_nameScore, info.m_nameCoverage);
-      UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore, info.m_nameCoverage);
+      UpdateNameScore(tokens, slice, info.m_nameScore);
+      UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore);
    }
 
    if (info.m_searchType == v2::SearchModel::SEARCH_TYPE_BUILDING)
      UpdateNameScore(ft.GetHouseNumber(), sliceNoCategories, info.m_nameScore);
+
+    feature::TypesHolder holder(ft);
+    vector<pair<size_t, size_t>> matched(slice.Size());
+    m_query.ForEachCategoryTypes(v2::QuerySliceOnTokens(slice), [&](size_t i, uint32_t t)
+                                 {
+                                   ++matched[i].second;
+                                   if (holder.Has(t))
+                                     ++matched[i].first;
+                                 });
+
+    info.m_pureCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
+                             {
+                               return m.first != 0;
+                             });
+    info.m_falseCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
+                              {
+                                return m.first == 0 && m.second != 0;
+                              });
  }
 
  uint8_t NormalizeRank(uint8_t rank, v2::SearchModel::SearchType type, m2::PointD const & center,
@@ -1259,7 +1256,8 @@ void Query::InitParams(bool localitySearch, SearchQueryParams & params)
      }
    }
  };
-  ForEachCategoryTypes(addSyms);
+  ForEachCategoryTypes(v2::QuerySliceOnRawStrings(m_tokens, m_prefix),
+                       addSyms);
 }
 
 for (auto & tokens : params.m_tokens)
diff --git a/search/search_query.hpp b/search/search_query.hpp
index 8aca5e591d..a35732d43f 100644
--- a/search/search_query.hpp
+++ b/search/search_query.hpp
@@ -7,6 +7,7 @@
 #include "search/suggest.hpp"
 #include "search/v2/geocoder.hpp"
 #include "search/v2/rank_table_cache.hpp"
+#include "search/v2/token_slice.hpp"
 
 #include "indexer/ftypes_matcher.hpp"
 #include "indexer/index.hpp"
@@ -145,7 +146,8 @@ protected:
  void ClearResults();
 
  int GetCategoryLocales(int8_t (&arr) [3]) const;
-  template <typename ToDo> void ForEachCategoryTypes(ToDo toDo) const;
+  template <typename ToDo>
+  void ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const;
 
  template <typename ToDo> void ProcessEmojiIfNeeded(
      strings::UniString const & token, size_t ind, ToDo & toDo) const;
diff --git a/search/search_tests/ranking_tests.cpp b/search/search_tests/ranking_tests.cpp
index cb82359f11..61f966848c 100644
--- a/search/search_tests/ranking_tests.cpp
+++ b/search/search_tests/ranking_tests.cpp
@@ -44,5 +44,6 @@ UNIT_TEST(NameTest_Smoke)
  TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", 2, 3), NAME_SCORE_FULL_MATCH, ());
  TEST_EQUAL(GetScore("San Francisco", "Fran", 0, 1), NAME_SCORE_SUBSTRING_PREFIX, ());
  TEST_EQUAL(GetScore("San Francisco", "Fran ", 0, 1), NAME_SCORE_ZERO, ());
+  TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", 0, 1), NAME_SCORE_FULL_MATCH_PREFIX, ());
 }
 }  // namespace
diff --git a/search/v2/geocoder.cpp b/search/v2/geocoder.cpp
index 59255d794f..b39acdfb82 100644
--- a/search/v2/geocoder.cpp
+++ b/search/v2/geocoder.cpp
@@ -73,12 +73,6 @@ size_t constexpr kLocalityRectsCacheSize = 10;
 
 strings::UniString const kUniSpace(strings::MakeUniString(" "));
 
-template <typename T>
-struct Id
-{
-  T const & operator()(T const & t) const { return t; }
-};
-
 struct ScopedMarkTokens
 {
   ScopedMarkTokens(vector<bool> & usedTokens, size_t from, size_t to)
@@ -1563,12 +1557,12 @@ SearchModel::SearchType Geocoder::GetSearchTypeInGeocoding(uint32_t featureId)
 
 bool Geocoder::AllTokensUsed() const
 {
-  return all_of(m_usedTokens.begin(), m_usedTokens.end(), Id<bool>());
+  return all_of(m_usedTokens.begin(), m_usedTokens.end(), IdFunctor());
 }
 
 bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const
 {
-  return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, Id<bool>());
+  return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, IdFunctor());
 }
 
 size_t Geocoder::NumUnusedTokensGroups() const
diff --git a/search/v2/ranking_info.cpp b/search/v2/ranking_info.cpp
index 6c1765b766..1e4dbbaf32 100644
--- a/search/v2/ranking_info.cpp
+++ b/search/v2/ranking_info.cpp
@@ -12,20 +12,34 @@ namespace
 {
 // See search/search_quality/scoring_model.py for details. In short,
 // these coeffs correspond to coeffs in a linear model.
-double const kDistanceToPivot = 0.19933969103335503;
-double const kRank = 3.528698483480807;
-double const kNameScore = 1.0050524496846687;
-double const kNameCoverage = 0.33989660511789926;
-double const kSearchType = 1.1949307125113533;
+double const kDistanceToPivot = -1.0000000;
+double const kRank = 0.5430747;
+double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
+  -0.3686323 /* Zero */,
+  0.0977193 /* Substring Prefix */,
+  0.1340500 /* Substring */,
+  0.1368631 /* Full Match Prefix */,
+  0.1368631 /* Full Match */
+};
+double const kSearchType[SearchModel::SEARCH_TYPE_COUNT] = {
+  -0.9195533 /* POI */,
+  -0.9195533 /* Building */,
+  -0.1470504 /* Street */,
+  -0.6392620 /* Unclassified */,
+  -0.0900970 /* Village */,
+  0.4383605 /* City */,
+  0.6296097 /* State */,
+  0.7279924 /* Country */
+};
 
 double TransformDistance(double distance)
 {
-  return 1.0 - min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters;
+  return min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters;
 }
 }  // namespace
 
 // static
-double const RankingInfo::kMaxDistMeters = 2e7;
+double const RankingInfo::kMaxDistMeters = 2e6;
 
 // static
 void RankingInfo::PrintCSVHeader(ostream & os)
@@ -33,8 +47,9 @@ void RankingInfo::PrintCSVHeader(ostream & os)
   os << "DistanceToPivot"
      << ",Rank"
      << ",NameScore"
-     << ",NameCoverage"
-     << ",SearchType";
+     << ",SearchType"
+     << ",PureCats"
+     << ",FalseCats";
 }
 
 string DebugPrint(RankingInfo const & info)
@@ -44,8 +59,9 @@ string DebugPrint(RankingInfo const & info)
   os << "m_distanceToPivot:" << info.m_distanceToPivot << ",";
   os << "m_rank:" << static_cast<int>(info.m_rank) << ",";
   os << "m_nameScore:" << DebugPrint(info.m_nameScore) << ",";
-  os << "m_nameCoverage:" << info.m_nameCoverage << ",";
-  os << "m_searchType:" << DebugPrint(info.m_searchType);
+  os << "m_searchType:" << DebugPrint(info.m_searchType) << ",";
+  os << "m_pureCats:" << info.m_pureCats << ",";
+  os << "m_falseCats:" << info.m_falseCats;
   os << "]";
   return os.str();
 }
@@ -54,7 +70,7 @@ void RankingInfo::ToCSV(ostream & os) const
 {
   os << fixed;
   os << m_distanceToPivot << "," << static_cast<int>(m_rank) << "," << DebugPrint(m_nameScore)
-     << "," << m_nameCoverage << "," << DebugPrint(m_searchType);
+     << "," << DebugPrint(m_searchType) << "," << m_pureCats << "," << m_falseCats;
 }
 
 double RankingInfo::GetLinearModelRank() const
@@ -65,24 +81,21 @@ double RankingInfo::GetLinearModelRank() const
   // integrated in the build system.
   double const distanceToPivot = TransformDistance(m_distanceToPivot);
   double const rank = static_cast<double>(m_rank) / numeric_limits<uint8_t>::max();
-  double const nameScore = static_cast<double>(m_nameScore) / NAME_SCORE_FULL_MATCH;
-  double const nameCoverage = m_nameCoverage;
 
-  double searchType;
-  switch (m_searchType)
+  auto nameScore = m_nameScore;
+  if (m_pureCats || m_falseCats)
   {
-  case SearchModel::SEARCH_TYPE_POI:
-  case SearchModel::SEARCH_TYPE_BUILDING:
-    searchType = 0;
-    break;
-  default:
-    searchType = m_searchType - 1;
-    break;
+    // If the feature was matched only by categorical tokens, it's
+    // better for ranking to set the name score to zero. For example,
+    // when we're looking for a "cafe", the cafes "Cafe Pushkin" and
+    // "Lermontov" both match the request, but they must be ranked by
+    // their distance to the user position or viewport, even though
+    // "Cafe Pushkin" has a non-zero name score.
+    nameScore = NAME_SCORE_ZERO;
   }
-  searchType = searchType / (SearchModel::SEARCH_TYPE_COUNTRY - 1);
 
-  return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore * nameScore +
-         kNameCoverage * nameCoverage + kSearchType * searchType;
+  return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] +
+         kSearchType[m_searchType];
 }
 }  // namespace v2
 }  // namespace search
diff --git a/search/v2/ranking_info.hpp b/search/v2/ranking_info.hpp
index 7083fd193f..e86811a3ee 100644
--- a/search/v2/ranking_info.hpp
+++ b/search/v2/ranking_info.hpp
@@ -24,12 +24,18 @@ struct RankingInfo
   // Score for the feature's name.
   NameScore m_nameScore = NAME_SCORE_ZERO;
 
-  // Fraction of tokens from the query matched to a feature name.
-  double m_nameCoverage = 0;
-
   // Search type for the feature.
   SearchModel::SearchType m_searchType = SearchModel::SEARCH_TYPE_COUNT;
 
+  // True if all of the tokens that the feature was matched by
+  // correspond to this feature's categories.
+  bool m_pureCats = false;
+
+  // True if none of the tokens that the feature was matched by
+  // correspond to this feature's categories, although all of the
+  // tokens are categorical ones.
+  bool m_falseCats = false;
+
   static void PrintCSVHeader(ostream & os);
   void ToCSV(ostream & os) const;
 
diff --git a/search/v2/search_model.cpp b/search/v2/search_model.cpp
index 06553fff72..d76895fca4 100644
--- a/search/v2/search_model.cpp
+++ b/search/v2/search_model.cpp
@@ -130,14 +130,14 @@ string DebugPrint(SearchModel::SearchType type)
   switch (type)
   {
   case SearchModel::SEARCH_TYPE_POI: return "POI";
-  case SearchModel::SEARCH_TYPE_BUILDING: return "BUILDING";
-  case SearchModel::SEARCH_TYPE_STREET: return "STREET";
-  case SearchModel::SEARCH_TYPE_CITY: return "CITY";
-  case SearchModel::SEARCH_TYPE_VILLAGE: return "VILLAGE";
-  case SearchModel::SEARCH_TYPE_STATE: return "STATE";
-  case SearchModel::SEARCH_TYPE_COUNTRY: return "COUNTRY";
-  case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "UNCLASSIFIED";
-  case SearchModel::SEARCH_TYPE_COUNT: return "COUNT";
+  case SearchModel::SEARCH_TYPE_BUILDING: return "Building";
+  case SearchModel::SEARCH_TYPE_STREET: return "Street";
+  case SearchModel::SEARCH_TYPE_CITY: return "City";
+  case SearchModel::SEARCH_TYPE_VILLAGE: return "Village";
+  case SearchModel::SEARCH_TYPE_STATE: return "State";
+  case SearchModel::SEARCH_TYPE_COUNTRY: return "Country";
+  case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "Unclassified";
+  case SearchModel::SEARCH_TYPE_COUNT: return "Count";
   }
   ASSERT(false, ("Unknown search type:", static_cast<int>(type)));
   return string();
diff --git a/search/v2/token_slice.hpp b/search/v2/token_slice.hpp
index c8cc856d8c..9ef8c12659 100644
--- a/search/v2/token_slice.hpp
+++ b/search/v2/token_slice.hpp
@@ -67,6 +67,63 @@ private:
   vector<size_t> m_indexes;
 };
 
+class QuerySlice
+{
+public:
+  using TString = SearchQueryParams::TString;
+
+  virtual ~QuerySlice() = default;
+
+  virtual TString const & Get(size_t i) const = 0;
+  virtual size_t Size() const = 0;
+  virtual bool IsPrefix(size_t i) const = 0;
+
+  bool Empty() const { return Size() == 0; }
+};
+
+class QuerySliceOnTokens : public QuerySlice
+{
+public:
+  QuerySliceOnTokens(TokenSlice const & slice) : m_slice(slice) {}
+
+  // QuerySlice overrides:
+  SearchQueryParams::TString const & Get(size_t i) const override { return m_slice.Get(i).front(); }
+  size_t Size() const override { return m_slice.Size(); }
+  bool IsPrefix(size_t i) const override { return m_slice.IsPrefix(i); }
+
+private:
+  TokenSlice const m_slice;
+};
+
+template <typename TCont>
+class QuerySliceOnRawStrings : public QuerySlice
+{
+public:
+  QuerySliceOnRawStrings(TCont const & tokens, TString const & prefix)
+    : m_tokens(tokens), m_prefix(prefix)
+  {
+  }
+
+  // QuerySlice overrides:
+  SearchQueryParams::TString const & Get(size_t i) const override
+  {
+    ASSERT_LESS(i, Size(), ());
+    return i == m_tokens.size() ? m_prefix : m_tokens[i];
+  }
+
+  size_t Size() const override { return m_tokens.size() + (m_prefix.empty() ? 0 : 1); }
+
+  bool IsPrefix(size_t i) const override
+  {
+    ASSERT_LESS(i, Size(), ());
+    return i == m_tokens.size();
+  }
+
+ private:
+  TCont const & m_tokens;
+  TString const & m_prefix;
+};
+
 string DebugPrint(TokenSlice const & slice);
 
 string DebugPrint(TokenSliceNoCategories const & slice);
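
The simplified NDCG computation that this patch introduces in search/search_quality/scoring_model.py can be exercised on its own. The sketch below mirrors compute_ndcg from the patch; the relevance lists passed to it are illustrative values only (0 = Irrelevant, 1 = Relevant, 3 = Vital, matching RELEVANCES).

#!/usr/bin/env python3
# Standalone sketch of the simplified NDCG helper from scoring_model.py.
# The sample relevance lists below are illustrative only.

from math import log


def compute_ndcg(relevances):
    # DCG of the list as ranked, divided by the DCG of the ideal
    # (descending) ordering of the same relevances.
    dcg = sum(r / log(2 + i, 2) for i, r in enumerate(relevances))
    dcg_norm = sum(r / log(2 + i, 2)
                   for i, r in enumerate(sorted(relevances, reverse=True)))
    return dcg / dcg_norm if dcg_norm != 0 else 0


if __name__ == '__main__':
    print(compute_ndcg([3, 1, 0, 1]))  # well-ranked list, close to 1.0
    print(compute_ndcg([0, 1, 1, 3]))  # Vital result last, noticeably lower

A list that is already ordered by relevance scores close to 1.0, while a list with the Vital result at the bottom scores markedly lower; this is the quantity that the "NDCG mean" statistic printed by the script summarizes over all sample groups.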