Merge pull request #3259 from ygorshenin/fix-ranking-model

[search] Fixed ranking model.
mpimenov committed 2016-05-20 19:09:31 +04:00
commit 50187d1bd3
12 changed files with 480 additions and 172 deletions


@@ -30,16 +30,27 @@ namespace search
{
namespace
{
void MakeDefaultTestParams(string const & query, SearchParams & params)
{
params.m_query = query;
params.m_inputLocale = "en";
params.SetMode(Mode::Everywhere);
params.SetSuggestsEnabled(false);
}
class SearchQueryV2Test : public SearchTest
{
public:
unique_ptr<TestSearchRequest> MakeRequest(string const & query)
{
SearchParams params;
params.m_query = query;
params.m_inputLocale = "en";
params.SetMode(Mode::Everywhere);
params.SetSuggestsEnabled(false);
auto request = make_unique<TestSearchRequest>(m_engine, params, m_viewport);
request->Wait();
return request;
}
bool MatchResults(vector<shared_ptr<MatchingRule>> rules,
vector<search::Result> const & actual) const
{
return ::MatchResults(m_engine, rules, actual);
}
};
UNIT_CLASS_TEST(SearchQueryV2Test, Smoke)
@@ -271,7 +282,7 @@ UNIT_CLASS_TEST(SearchQueryV2Test, DisableSuggests)
request.Wait();
TRules rules = {ExactMatch(worldId, london1), ExactMatch(worldId, london2)};
TEST(MatchResults(m_engine, rules, request.Results()), ());
TEST(MatchResults(rules, request.Results()), ());
}
}
@@ -321,41 +332,33 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo)
SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
{
SearchParams params;
MakeDefaultTestParams("golden gate bridge ", params);
TestSearchRequest request(m_engine, params, m_viewport);
request.Wait();
auto request = MakeRequest("golden gate bridge ");
TRules rules = {ExactMatch(wonderlandId, goldenGateBridge),
ExactMatch(wonderlandId, goldenGateStreet)};
TEST(MatchResults(m_engine, rules, request.Results()), ());
for (auto const & result : request.Results())
TEST(MatchResults(rules, request->Results()), ());
for (auto const & result : request->Results())
{
auto const & info = result.GetRankingInfo();
TEST_EQUAL(NAME_SCORE_FULL_MATCH, info.m_nameScore, (result));
TEST(my::AlmostEqualAbs(1.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
TEST(!info.m_pureCats, (result));
TEST(!info.m_falseCats, (result));
}
}
// This test is quite important and must always pass.
{
SearchParams params;
MakeDefaultTestParams("cafe лермонтов", params);
TestSearchRequest request(m_engine, params, m_viewport);
request.Wait();
auto const & results = request.Results();
auto request = MakeRequest("cafe лермонтов");
auto const & results = request->Results();
TRules rules{ExactMatch(wonderlandId, cafe1), ExactMatch(wonderlandId, cafe2),
ExactMatch(wonderlandId, lermontov)};
TEST(MatchResults(m_engine, rules, results), ());
TEST(MatchResults(rules, results), ());
TEST_EQUAL(3, results.size(), ("Unexpected number of retrieved cafes."));
auto const & top = results.front();
TEST(MatchResults(m_engine, {ExactMatch(wonderlandId, lermontov)}, {top}), ());
TEST(MatchResults({ExactMatch(wonderlandId, lermontov)}, {top}), ());
}
{
@@ -471,6 +474,9 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
TestPOI named(m2::PointD(0.0001, 0.0001), "ATM", "en");
named.SetTypes({{"amenity", "atm"}});
TestPOI busStop(m2::PointD(0.00005, 0.0005), "ATM Bus Stop", "en");
busStop.SetTypes({{"highway", "bus_stop"}});
BuildWorld([&](TestMwmBuilder & builder)
{
builder.Add(sanFrancisco);
@@ -479,31 +485,51 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
{
builder.Add(named);
builder.Add(noname);
builder.Add(busStop);
});
SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
TRules rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};
TEST(ResultsMatch("atm", rules), ());
{
SearchParams params;
MakeDefaultTestParams("#atm", params);
TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named),
ExactMatch(wonderlandId, busStop)};
TestSearchRequest request(m_engine, params, m_viewport);
request.Wait();
auto request = MakeRequest("atm");
TEST(MatchResults(rules, request->Results()), ());
for (auto const & result : request->Results())
{
Index::FeaturesLoaderGuard loader(m_engine, wonderlandId);
FeatureType ft;
loader.GetFeatureByIndex(result.GetFeatureID().m_index, ft);
TEST(MatchResults(m_engine, rules, request.Results()), ());
for (auto const & result : request.Results())
auto const & info = result.GetRankingInfo();
if (busStop.Matches(ft))
{
TEST(!info.m_pureCats, (result));
TEST(info.m_falseCats, (result));
}
else
{
TEST(info.m_pureCats, (result));
TEST(!info.m_falseCats, (result));
}
}
}
{
TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};
auto request = MakeRequest("#atm");
TEST(MatchResults(rules, request->Results()), ());
for (auto const & result : request->Results())
{
auto const & info = result.GetRankingInfo();
// Token with a hashtag should not participate in name-score
// calculations.
TEST_EQUAL(NAME_SCORE_ZERO, info.m_nameScore, (result));
// TODO (@y): fix this. Name coverage calculations are flawed.
// TEST(my::AlmostEqualAbs(0.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
}
}


@@ -0,0 +1,79 @@
#!/bin/bash
# Downloads to the current directory all the maps necessary for
# learning to rank.
ALL=
VERSION=
BASE="http://direct.mapswithme.com/direct"
display_usage() {
echo "Usage: $0 -v [version] -a -h"
echo " -v version of maps to download"
echo " -a download all maps of the specified version"
echo " -h display this message"
}
while getopts ":av:h" opt
do
case "$opt" in
a) ALL=1
;;
v) VERSION="$OPTARG"
;;
h) display_usage
exit 1
;;
\?) echo "Invalid option: -$OPTARG" 1>&2
;;
:) echo "Option -$OPTARG requires an argument" 1>&2
;;
esac
done
if [ -z "$VERSION" ]
then
echo "Version of maps is not specified." 1>&2
exit -1
fi
if ! curl "$BASE/" 2>/dev/null |
sed -n 's/^.*href="\(.*\)\/".*$/\1/p' |
grep -v "^../$" | grep -q "$VERSION"
then
echo "Invalid version: $VERSION" 1>&2
exit 1
fi
NAMES=("Australia_Brisbane.mwm"
"Belarus_Minsk*.mwm"
"Germany_*.mwm"
"Russia_*.mwm"
"UK_England_*.mwm"
"US_California_*.mwm"
"US_Maryland_*.mwm")
DIR="$BASE/$VERSION"
if [ "$ALL" ]
then
echo "Downloading all maps..."
files=$(curl "$DIR/" 2>/dev/null | sed -n 's/^.*href="\(.*\.mwm\)".*$/\1/p')
set -e
set -x
for file in $files
do
wget -np -nd "$DIR/$file"
done
else
echo "Downloading maps..."
set -e
set -x
for name in "${NAMES[@]}"
do
wget -r -np -nd -A "$name" "$DIR/"
done
fi
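
A usage sketch (the script's file name and the version are illustrative, as neither appears in this diff): if the script is saved as download-maps.sh, then ./download-maps.sh -v 160524 downloads the learning-to-rank subset listed in NAMES, while ./download-maps.sh -v 160524 -a downloads every map of version 160524.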


@@ -141,15 +141,26 @@ void DisplayStats(ostream & os, vector<Sample> const & samples, vector<Stats> co
{
auto const n = samples.size();
ASSERT_EQUAL(stats.size(), n, ());
size_t numWarnings = 0;
for (auto const & stat : stats)
{
if (!stat.m_notFound.empty())
++numWarnings;
}
if (numWarnings == 0)
{
os << "All " << stats.size() << " queries are OK." << endl;
return;
}
os << numWarnings << " warnings." << endl;
for (size_t i = 0; i < n; ++i)
{
os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\"";
if (stats[i].m_notFound.empty())
{
os << ": OK" << endl;
continue;
}
os << ": WARNING" << endl;
os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\":" << endl;
for (auto const & j : stats[i].m_notFound)
os << "Not found: " << DebugPrint(samples[i].m_results[j]) << endl;
}


@@ -1,41 +1,67 @@
#!/usr/bin/env python3
from math import exp, log
from scipy.stats import pearsonr
from sklearn import cross_validation, grid_search, svm
import argparse
import collections
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import sys
FEATURES = ['DistanceToPivot', 'Rank', 'NameScore', 'NameCoverage', 'SearchType']
MAX_DISTANCE_METERS = 2e7
MAX_DISTANCE_METERS = 2e6
MAX_RANK = 255
RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match']
SEARCH_TYPES = {'POI': 0,
'BUILDING': 0,
'STREET': 1,
'UNCLASSIFIED': 2,
'VILLAGE': 3,
'CITY': 4,
'STATE': 5,
'COUNTRY': 6}
SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
FEATURES = ['DistanceToPivot', 'Rank'] + NAME_SCORES + SEARCH_TYPES
def transform_name_score(value, categories_match):
if categories_match == 1:
return 'Zero'
elif value == 'Full Match Prefix':
return 'Full Match'
else:
return value
def normalize_data(data):
transform_distance = lambda d: 1 - min(d, MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS
max_name_score = len(NAME_SCORES) - 1
max_search_type = SEARCH_TYPES['COUNTRY']
transform_distance = lambda v: min(v, MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS
data['DistanceToPivot'] = data['DistanceToPivot'].apply(transform_distance)
data['Rank'] = data['Rank'].apply(lambda rank: rank / MAX_RANK)
data['NameScore'] = data['NameScore'].apply(lambda s: NAME_SCORES.index(s) / max_name_score)
data['SearchType'] = data['SearchType'].apply(lambda t: SEARCH_TYPES[t] / max_search_type)
data['Relevance'] = data['Relevance'].apply(lambda r: RELEVANCES[r])
data['Rank'] = data['Rank'].apply(lambda v: v / MAX_RANK)
data['Relevance'] = data['Relevance'].apply(lambda v: RELEVANCES[v])
cats = data['PureCats'].combine(data['FalseCats'], max)
# Full prefix match is unified with a full match as these features
# are collinear. But we need both of them as they're also used in
# locality sorting.
#
# TODO (@y, @m): do forward/backward/subset selection of features
# instead of this merging. It would be great to conduct PCA on
# the features too.
data['NameScore'] = data['NameScore'].combine(cats, transform_name_score)
# Adds dummy variables to data for NAME_SCORES.
for ns in NAME_SCORES:
data[ns] = data['NameScore'].apply(lambda v: int(ns == v))
# Adds dummy variables to data for SEARCH_TYPES.
# We unify BUILDING with POI here, as we don't have enough
# training data to distinguish between them. Remove the following
# line as soon as the model is changed or we have enough training
# data.
data['SearchType'] = data['SearchType'].apply(lambda v: v if v != 'Building' else 'POI')
for st in SEARCH_TYPES:
data[st] = data['SearchType'].apply(lambda v: int(st == v))
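To make the dummy-variable expansion concrete, here is a minimal, self-contained sketch on a toy frame (illustrative data only; the real frame comes from the assessment CSV read in main):

import pandas as pd

# Toy frame with just the categorical column.
data = pd.DataFrame({'SearchType': ['POI', 'City', 'Building']})
# Buildings are folded into POI, then each type becomes a 0/1 column.
data['SearchType'] = data['SearchType'].apply(lambda v: v if v != 'Building' else 'POI')
for st in ['POI', 'City']:
    data[st] = data['SearchType'].apply(lambda v: int(st == v))
print(data)
#   SearchType  POI  City
# 0        POI    1     0
# 1       City    0     1
# 2        POI    1     0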
def compute_ndcg(relevances):
@@ -44,25 +70,12 @@ def compute_ndcg(relevances):
array of scores.
"""
relevances_summary = collections.defaultdict(int)
dcg = 0
for i, relevance in enumerate(relevances):
dcg += relevance / log(2 + i, 2)
relevances_summary[relevance] += 1
dcg_norm, i = 0, 0
for relevance in sorted(relevances_summary.keys(), reverse=True):
for _ in range(relevances_summary[relevance]):
dcg_norm += relevance / log(2 + i, 2)
i += 1
if dcg_norm == 0:
return 0
return dcg / dcg_norm
dcg = sum(r / log(2 + i, 2) for i, r in enumerate(relevances))
dcg_norm = sum(r / log(2 + i, 2) for i, r in enumerate(sorted(relevances, reverse=True)))
return dcg / dcg_norm if dcg_norm != 0 else 0
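
In formula form, for relevances r_0, ..., r_{n-1} taken in ranked order, the code computes

\mathrm{DCG} = \sum_{i=0}^{n-1} \frac{r_i}{\log_2(i + 2)}, \qquad \mathrm{NDCG} = \frac{\mathrm{DCG}}{\mathrm{IDCG}},

where IDCG is the same sum over the relevances sorted in decreasing order, and NDCG is defined as 0 when IDCG = 0.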
def compute_ndcg_without_w(data):
def compute_ndcgs_without_ws(data):
"""
Computes NDCG (Normalized Discounted Cumulative Gain) for a given
data. Returns an array of ndcg scores in the shape [num groups of
@@ -77,17 +90,17 @@ def compute_ndcg_without_w(data):
relevances = np.array(data.ix[indices]['Relevance'])
ndcgs.append(compute_ndcg(relevances))
return np.array(ndcgs)
return ndcgs
def compute_ndcg_for_w(data, w):
def compute_ndcgs_for_ws(data, ws):
"""
Computes NDCG (Normalized Discounted Cumulative Gain) for a given
data and an array of coeffs in a linear model. Returns an array of
ndcg scores in the shape [num groups of features].
"""
data_scores = np.array([np.dot(data.ix[i][FEATURES], w) for i in data.index])
data_scores = np.array([np.dot(data.ix[i][FEATURES], ws) for i in data.index])
grouped = data.groupby(data['SampleId'], sort=False).groups
ndcgs = []
@@ -101,7 +114,7 @@ def compute_ndcg_for_w(data, w):
relevances = relevances[scores.argsort()[::-1]]
ndcgs.append(compute_ndcg(relevances))
return np.array(ndcgs)
return ndcgs
def transform_data(data):
@@ -150,36 +163,144 @@ def transform_data(data):
return xs, ys
def plot_diagrams(xs, ys, features):
"""
For each feature, plots histograms of x * sign(y), where x is the
slice along that feature of the list of pairwise differences
between input feature vectors, and y is the list of pairwise
differences between the relevances of those feature vectors. A
strong bias toward positive or negative values in a histogram
indicates that the feature is important for ranking, as the
difference in feature values correlates with the difference in
relevance.
"""
for i, f in enumerate(features):
x = [x[i] * np.sign(y) for x, y in zip(xs, ys)]
l, r = min(x), max(x)
d = max(abs(l), abs(r))
plt.subplot(4, 4, i + 1)
plt.hist(x, bins=8, range=(-d, d))
plt.title(f)
plt.show()
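
transform_data is not shown in this hunk; the description above presumes it emits RankSVM-style pairwise differences. A minimal sketch of such a transform under that assumption (pairwise_differences is a hypothetical helper; per-sample grouping is omitted for brevity):

import itertools
import numpy as np

def pairwise_differences(features, relevances):
    # For each pair of results with different relevances, emit the
    # difference of their feature vectors and the sign of the
    # difference of their relevances; a linear classifier trained on
    # these pairs learns ranking weights (the RankSVM reduction).
    xs, ys = [], []
    for (f1, r1), (f2, r2) in itertools.combinations(zip(features, relevances), 2):
        if r1 == r2:
            continue  # equal relevances carry no ordering signal
        xs.append(np.subtract(f1, f2))
        ys.append(np.sign(r1 - r2))
    return xs, ys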
def show_pearson_statistics(xs, ys, features):
"""
Shows info about Pearson coefficient between features and
relevancy.
"""
print('***** Correlation table *****')
print('H0 - feature is not correlated with relevancy')
print('H1 - feature is correlated with relevancy')
print()
cs, ncs = [], []
for i, f in enumerate(features):
zs = [x[i] for x in xs]
(c, p) = pearsonr(zs, ys)
correlated = p < 0.05
print('{}: pearson={:.3f}, p-value={:.3f}'.format(f, c, p))
if correlated:
cs.append(f)
else:
ncs.append(f)
print()
print('Correlated:', cs)
print('Non-correlated:', ncs)
def raw_output(features, ws):
"""
Prints feature-coeff pairs to the standard output.
"""
for f, w in zip(features, ws):
print('{}: {}'.format(f, w))
def print_const(name, value):
print('double const k{} = {:.7f};'.format(name, value))
def print_array(name, size, values):
print('double const {}[{}] = {{'.format(name, size))
print(',\n'.join(' {:.7f} /* {} */'.format(w, f) for (f, w) in values))
print('};')
def cpp_output(features, ws):
"""
Prints feature-coeff pairs in the C++-compatible format.
"""
ns, st = [], []
for f, w in zip(features, ws):
if f in NAME_SCORES:
ns.append((f, w))
elif f in SEARCH_TYPES:
st.append((f, w))
else:
print_const(f, w)
print_array('kNameScore', 'NameScore::NAME_SCORE_COUNT', ns)
print_array('kSearchType', 'SearchModel::SEARCH_TYPE_COUNT', st)
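
These helpers emit constant tables of the kind that appear in ranking_info.cpp later in this diff. A toy call (weights truncated to two entries for brevity):

print_array('kNameScore', 'NameScore::NAME_SCORE_COUNT',
            [('Zero', -0.3686323), ('Full Match', 0.1368631)])
# Prints:
# double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
#  -0.3686323 /* Zero */,
#  0.1368631 /* Full Match */
# };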
def main(args):
data = pd.read_csv(sys.stdin)
normalize_data(data)
ndcg = compute_ndcg_without_w(data);
print('Current NDCG: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg)))
ndcgs = compute_ndcgs_without_ws(data)
print('Current NDCG: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
print()
x, y = transform_data(data)
xs, ys = transform_data(data)
if args.plot:
plot_diagrams(xs, ys, FEATURES)
clf = svm.LinearSVC(random_state=args.seed)
cv = cross_validation.KFold(len(y), n_folds=5, shuffle=True, random_state=args.seed)
cv = cross_validation.KFold(len(ys), n_folds=5, shuffle=True, random_state=args.seed)
# "C" stands for the regularizer constant.
grid = {'C': np.power(10.0, np.arange(-5, 6))}
gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(x, y)
gs.fit(xs, ys)
w = gs.best_estimator_.coef_[0]
ndcg = compute_ndcg_for_w(data, w)
ws = gs.best_estimator_.coef_[0]
max_w = max(abs(w) for w in ws)
ws = np.divide(ws, max_w)
# The following code restores the coefficients of merged features.
ws[FEATURES.index('Building')] = ws[FEATURES.index('POI')]
ws[FEATURES.index('Full Match Prefix')] = ws[FEATURES.index('Full Match')]
ndcgs = compute_ndcgs_for_ws(data, ws)
print('NDCG mean: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
print('Accuracy: {}'.format(gs.best_score_))
if args.pearson:
print()
show_pearson_statistics(xs, ys, FEATURES)
print('NDCG mean: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg)))
print()
print('Linear model weights:')
for f, c in zip(FEATURES, w):
print('{}: {}'.format(f, c))
print('***** Linear model weights *****')
if args.cpp:
cpp_output(FEATURES, ws)
else:
raw_output(FEATURES, ws)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--seed', help='random seed', type=int)
parser.add_argument('--plot', help='plot diagrams', action='store_true')
parser.add_argument('--pearson', help='show pearson statistics', action='store_true')
parser.add_argument('--cpp', help='generate output in the C++ format', action='store_true')
args = parser.parse_args()
main(args)
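
A usage sketch: the script reads the assessment data as CSV from stdin, so assuming the samples live in features.csv (an illustrative name), an invocation like python3 scoring_model.py --cpp --seed 0 < features.csv fits the model and prints the kDistanceToPivot/kRank/kNameScore/kSearchType constants in the C++ form consumed by ranking_info.cpp.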


@@ -12,7 +12,6 @@
#include "search/v2/pre_ranking_info.hpp"
#include "search/v2/ranking_info.hpp"
#include "search/v2/ranking_utils.hpp"
#include "search/v2/token_slice.hpp"
#include "storage/country_info_getter.hpp"
#include "storage/index.hpp"
@@ -191,20 +190,11 @@ void UpdateNameScore(string const & name, TSlice const & slice, v2::NameScore &
template <typename TSlice>
void UpdateNameScore(vector<strings::UniString> const & tokens, TSlice const & slice,
v2::NameScore & bestScore, double & bestCoverage)
v2::NameScore & bestScore)
{
auto const score = v2::GetNameScore(tokens, slice);
auto const coverage =
tokens.empty() ? 0 : static_cast<double>(slice.Size()) / static_cast<double>(tokens.size());
if (score > bestScore)
{
bestScore = score;
bestCoverage = coverage;
}
else if (score == bestScore && coverage > bestCoverage)
{
bestCoverage = coverage;
}
}
inline bool IsHashtagged(strings::UniString const & s) { return !s.empty() && s[0] == '#'; }
@@ -422,29 +412,18 @@ int Query::GetCategoryLocales(int8_t (&arr) [3]) const
}
template <class ToDo>
void Query::ForEachCategoryTypes(ToDo toDo) const
void Query::ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const
{
int8_t arrLocales[3];
int const localesCount = GetCategoryLocales(arrLocales);
size_t const tokensCount = m_tokens.size();
for (size_t i = 0; i < tokensCount; ++i)
for (size_t i = 0; i < slice.Size(); ++i)
{
auto token = RemoveHashtag(m_tokens[i]);
auto token = RemoveHashtag(slice.Get(i));
for (int j = 0; j < localesCount; ++j)
m_categories.ForEachTypeByName(arrLocales[j], token, bind<void>(ref(toDo), i, _1));
ProcessEmojiIfNeeded(token, i, toDo);
}
if (!m_prefix.empty())
{
auto prefix = RemoveHashtag(m_prefix);
for (int j = 0; j < localesCount; ++j)
m_categories.ForEachTypeByName(arrLocales[j], prefix, bind<void>(ref(toDo), tokensCount, _1));
ProcessEmojiIfNeeded(prefix, tokensCount, toDo);
}
}
template <class ToDo>
@@ -522,10 +501,11 @@ void Query::SetQuery(string const & query)
// get preferred types to show in results
m_prefferedTypes.clear();
ForEachCategoryTypes([&] (size_t, uint32_t t)
{
m_prefferedTypes.insert(t);
});
ForEachCategoryTypes(v2::QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix),
[&](size_t, uint32_t t)
{
m_prefferedTypes.insert(t);
});
}
void Query::FlushViewportResults(v2::Geocoder::Params const & params, Results & res,
@@ -660,7 +640,6 @@ class PreResult2Maker
info.m_distanceToPivot = MercatorBounds::DistanceOnEarth(center, pivot);
info.m_rank = preInfo.m_rank;
info.m_searchType = preInfo.m_searchType;
info.m_nameScore = v2::NAME_SCORE_ZERO;
v2::TokenSlice slice(m_params, preInfo.m_startToken, preInfo.m_endToken);
@@ -675,12 +654,30 @@ class PreResult2Maker
vector<strings::UniString> tokens;
SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
UpdateNameScore(tokens, slice, info.m_nameScore, info.m_nameCoverage);
UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore, info.m_nameCoverage);
UpdateNameScore(tokens, slice, info.m_nameScore);
UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore);
}
if (info.m_searchType == v2::SearchModel::SEARCH_TYPE_BUILDING)
UpdateNameScore(ft.GetHouseNumber(), sliceNoCategories, info.m_nameScore);
feature::TypesHolder holder(ft);
vector<pair<size_t, size_t>> matched(slice.Size());
m_query.ForEachCategoryTypes(v2::QuerySliceOnTokens(slice), [&](size_t i, uint32_t t)
{
++matched[i].second;
if (holder.Has(t))
++matched[i].first;
});
info.m_pureCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
{
return m.first != 0;
});
info.m_falseCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
{
return m.first == 0 && m.second != 0;
});
}
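
A worked illustration of the two flags, as Python pseudocode mirroring the matched pairs above (the counts are hypothetical):

# matched[i] = (feature's types matched via token i's categories,
#               total category types mapped from token i)
matched = [(1, 2)]  # query "atm" against a real ATM feature
pure_cats = all(m[0] != 0 for m in matched)                 # True
false_cats = all(m[0] == 0 and m[1] != 0 for m in matched)  # False

matched = [(0, 2)]  # query "atm" against the "ATM Bus Stop" feature
pure_cats = all(m[0] != 0 for m in matched)                 # False
false_cats = all(m[0] == 0 and m[1] != 0 for m in matched)  # True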
uint8_t NormalizeRank(uint8_t rank, v2::SearchModel::SearchType type, m2::PointD const & center,
@@ -1259,7 +1256,8 @@ void Query::InitParams(bool localitySearch, SearchQueryParams & params)
}
}
};
ForEachCategoryTypes(addSyms);
ForEachCategoryTypes(v2::QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix),
addSyms);
}
for (auto & tokens : params.m_tokens)


@@ -7,6 +7,7 @@
#include "search/suggest.hpp"
#include "search/v2/geocoder.hpp"
#include "search/v2/rank_table_cache.hpp"
#include "search/v2/token_slice.hpp"
#include "indexer/ftypes_matcher.hpp"
#include "indexer/index.hpp"
@@ -145,7 +146,8 @@ protected:
void ClearResults();
int GetCategoryLocales(int8_t (&arr) [3]) const;
template <class ToDo> void ForEachCategoryTypes(ToDo toDo) const;
template <class ToDo>
void ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const;
template <class ToDo> void ProcessEmojiIfNeeded(
strings::UniString const & token, size_t ind, ToDo & toDo) const;


@@ -44,5 +44,6 @@ UNIT_TEST(NameTest_Smoke)
TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", 2, 3), NAME_SCORE_FULL_MATCH, ());
TEST_EQUAL(GetScore("San Francisco", "Fran", 0, 1), NAME_SCORE_SUBSTRING_PREFIX, ());
TEST_EQUAL(GetScore("San Francisco", "Fran ", 0, 1), NAME_SCORE_ZERO, ());
TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", 0, 1), NAME_SCORE_FULL_MATCH_PREFIX, ());
}
} // namespace


@@ -73,12 +73,6 @@ size_t constexpr kLocalityRectsCacheSize = 10;
strings::UniString const kUniSpace(strings::MakeUniString(" "));
template <typename T>
struct Id
{
T const & operator()(T const & t) const { return t; }
};
struct ScopedMarkTokens
{
ScopedMarkTokens(vector<bool> & usedTokens, size_t from, size_t to)
@@ -1563,12 +1557,12 @@ SearchModel::SearchType Geocoder::GetSearchTypeInGeocoding(uint32_t featureId)
bool Geocoder::AllTokensUsed() const
{
return all_of(m_usedTokens.begin(), m_usedTokens.end(), Id<bool>());
return all_of(m_usedTokens.begin(), m_usedTokens.end(), IdFunctor());
}
bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const
{
return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, Id<bool>());
return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, IdFunctor());
}
size_t Geocoder::NumUnusedTokensGroups() const


@@ -12,20 +12,34 @@ namespace
{
// See search/search_quality/scoring_model.py for details. In short,
// these are the coefficients of the linear model.
double const kDistanceToPivot = 0.19933969103335503;
double const kRank = 3.528698483480807;
double const kNameScore = 1.0050524496846687;
double const kNameCoverage = 0.33989660511789926;
double const kSearchType = 1.1949307125113533;
double const kDistanceToPivot = -1.0000000;
double const kRank = 0.5430747;
double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
-0.3686323 /* Zero */,
0.0977193 /* Substring Prefix */,
0.1340500 /* Substring */,
0.1368631 /* Full Match Prefix */,
0.1368631 /* Full Match */
};
double const kSearchType[SearchModel::SEARCH_TYPE_COUNT] = {
-0.9195533 /* POI */,
-0.9195533 /* Building */,
-0.1470504 /* Street */,
-0.6392620 /* Unclassified */,
-0.0900970 /* Village */,
0.4383605 /* City */,
0.6296097 /* State */,
0.7279924 /* Country */
};
double TransformDistance(double distance)
{
return 1.0 - min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters;
return min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters;
}
} // namespace
// static
double const RankingInfo::kMaxDistMeters = 2e7;
double const RankingInfo::kMaxDistMeters = 2e6;
// static
void RankingInfo::PrintCSVHeader(ostream & os)
@@ -33,8 +47,9 @@ void RankingInfo::PrintCSVHeader(ostream & os)
os << "DistanceToPivot"
<< ",Rank"
<< ",NameScore"
<< ",NameCoverage"
<< ",SearchType";
<< ",SearchType"
<< ",PureCats"
<< ",FalseCats";
}
string DebugPrint(RankingInfo const & info)
@@ -44,8 +59,9 @@ string DebugPrint(RankingInfo const & info)
os << "m_distanceToPivot:" << info.m_distanceToPivot << ",";
os << "m_rank:" << static_cast<int>(info.m_rank) << ",";
os << "m_nameScore:" << DebugPrint(info.m_nameScore) << ",";
os << "m_nameCoverage:" << info.m_nameCoverage << ",";
os << "m_searchType:" << DebugPrint(info.m_searchType);
os << "m_searchType:" << DebugPrint(info.m_searchType) << ",";
os << "m_pureCats:" << info.m_pureCats << ",";
os << "m_falseCats:" << info.m_falseCats;
os << "]";
return os.str();
}
@@ -54,7 +70,7 @@ void RankingInfo::ToCSV(ostream & os) const
{
os << fixed;
os << m_distanceToPivot << "," << static_cast<int>(m_rank) << "," << DebugPrint(m_nameScore)
<< "," << m_nameCoverage << "," << DebugPrint(m_searchType);
<< "," << DebugPrint(m_searchType) << "," << m_pureCats << "," << m_falseCats;
}
double RankingInfo::GetLinearModelRank() const
@@ -65,24 +81,21 @@ double RankingInfo::GetLinearModelRank() const
// integrated in the build system.
double const distanceToPivot = TransformDistance(m_distanceToPivot);
double const rank = static_cast<double>(m_rank) / numeric_limits<uint8_t>::max();
double const nameScore = static_cast<double>(m_nameScore) / NAME_SCORE_FULL_MATCH;
double const nameCoverage = m_nameCoverage;
double searchType;
switch (m_searchType)
auto nameScore = m_nameScore;
if (m_pureCats || m_falseCats)
{
case SearchModel::SEARCH_TYPE_POI:
case SearchModel::SEARCH_TYPE_BUILDING:
searchType = 0;
break;
default:
searchType = m_searchType - 1;
break;
// If the feature was matched only by categorial tokens, it's
// better for ranking to set the name score to zero. For example,
// when we're looking for a "cafe", the cafes "Cafe Pushkin" and
// "Lermontov" both match the request, but must be ranked according
// to their distance to the user position or viewport, even though
// "Cafe Pushkin" has a non-zero name score.
nameScore = NAME_SCORE_ZERO;
}
searchType = searchType / (SearchModel::SEARCH_TYPE_COUNTRY - 1);
return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore * nameScore +
kNameCoverage * nameCoverage + kSearchType * searchType;
return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] +
kSearchType[m_searchType];
}
} // namespace v2
} // namespace search
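
Written out, the value returned above is the linear model

\mathrm{rank} = k_{\mathrm{distanceToPivot}} \cdot \frac{\min(d, d_{\max})}{d_{\max}} + k_{\mathrm{rank}} \cdot \frac{r}{255} + k_{\mathrm{nameScore}}[ns] + k_{\mathrm{searchType}}[st],

where d is the distance to the pivot, d_max = 2e6 meters, r is the stored rank byte, ns is the (possibly zeroed) name score, st is the search type, and the coefficients come from the tables at the top of this file.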


@@ -24,12 +24,18 @@ struct RankingInfo
// Score for the feature's name.
NameScore m_nameScore = NAME_SCORE_ZERO;
// Fraction of tokens from the query matched to a feature name.
double m_nameCoverage = 0;
// Search type for the feature.
SearchModel::SearchType m_searchType = SearchModel::SEARCH_TYPE_COUNT;
// True if all tokens the feature was matched by correspond to
// this feature's categories.
bool m_pureCats = false;
// True if none of the tokens the feature was matched by
// corresponds to this feature's categories, although all of
// them are categorial tokens.
bool m_falseCats = false;
static void PrintCSVHeader(ostream & os);
void ToCSV(ostream & os) const;


@@ -130,14 +130,14 @@ string DebugPrint(SearchModel::SearchType type)
switch (type)
{
case SearchModel::SEARCH_TYPE_POI: return "POI";
case SearchModel::SEARCH_TYPE_BUILDING: return "BUILDING";
case SearchModel::SEARCH_TYPE_STREET: return "STREET";
case SearchModel::SEARCH_TYPE_CITY: return "CITY";
case SearchModel::SEARCH_TYPE_VILLAGE: return "VILLAGE";
case SearchModel::SEARCH_TYPE_STATE: return "STATE";
case SearchModel::SEARCH_TYPE_COUNTRY: return "COUNTRY";
case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "UNCLASSIFIED";
case SearchModel::SEARCH_TYPE_COUNT: return "COUNT";
case SearchModel::SEARCH_TYPE_BUILDING: return "Building";
case SearchModel::SEARCH_TYPE_STREET: return "Street";
case SearchModel::SEARCH_TYPE_CITY: return "City";
case SearchModel::SEARCH_TYPE_VILLAGE: return "Village";
case SearchModel::SEARCH_TYPE_STATE: return "State";
case SearchModel::SEARCH_TYPE_COUNTRY: return "Country";
case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "Unclassified";
case SearchModel::SEARCH_TYPE_COUNT: return "Count";
}
ASSERT(false, ("Unknown search type:", static_cast<int>(type)));
return string();


@@ -67,6 +67,63 @@ private:
vector<size_t> m_indexes;
};
class QuerySlice
{
public:
using TString = SearchQueryParams::TString;
virtual ~QuerySlice() = default;
virtual TString const & Get(size_t i) const = 0;
virtual size_t Size() const = 0;
virtual bool IsPrefix(size_t i) const = 0;
bool Empty() const { return Size() == 0; }
};
class QuerySliceOnTokens : public QuerySlice
{
public:
QuerySliceOnTokens(TokenSlice const & slice) : m_slice(slice) {}
// QuerySlice overrides:
SearchQueryParams::TString const & Get(size_t i) const override { return m_slice.Get(i).front(); }
size_t Size() const override { return m_slice.Size(); }
bool IsPrefix(size_t i) const override { return m_slice.IsPrefix(i); }
private:
TokenSlice const m_slice;
};
template <typename TCont>
class QuerySliceOnRawStrings : public QuerySlice
{
public:
QuerySliceOnRawStrings(TCont const & tokens, TString const & prefix)
: m_tokens(tokens), m_prefix(prefix)
{
}
// QuerySlice overrides:
SearchQueryParams::TString const & Get(size_t i) const override
{
ASSERT_LESS(i, Size(), ());
return i == m_tokens.size() ? m_prefix : m_tokens[i];
}
size_t Size() const override { return m_tokens.size() + (m_prefix.empty() ? 0 : 1); }
bool IsPrefix(size_t i) const override
{
ASSERT_LESS(i, Size(), ());
return i == m_tokens.size();
}
private:
TCont const & m_tokens;
TString const & m_prefix;
};
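
As used elsewhere in this diff, QuerySliceOnRawStrings adapts the raw query tokens plus the optional prefix (see the ForEachCategoryTypes calls in the Query code above), while QuerySliceOnTokens adapts an already-built TokenSlice (the category-matching loop in PreResult2Maker).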
string DebugPrint(TokenSlice const & slice);
string DebugPrint(TokenSliceNoCategories const & slice);