forked from organicmaps/organicmaps
Merge pull request #3259 from ygorshenin/fix-ranking-model
[search] Fixed ranking model.
This commit is contained in:
commit
50187d1bd3
12 changed files with 480 additions and 172 deletions
|
@ -30,16 +30,27 @@ namespace search
|
|||
{
|
||||
namespace
|
||||
{
|
||||
// Fills |params| with the settings shared by the tests in this file:
// the given |query|, the "en" input locale, the Everywhere mode, and
// suggests switched off.
void MakeDefaultTestParams(string const & query, SearchParams & params)
{
  params.m_inputLocale = "en";
  params.m_query = query;
  params.SetMode(Mode::Everywhere);
  params.SetSuggestsEnabled(false);
}
|
||||
|
||||
class SearchQueryV2Test : public SearchTest
|
||||
{
|
||||
public:
|
||||
unique_ptr<TestSearchRequest> MakeRequest(string const & query)
|
||||
{
|
||||
SearchParams params;
|
||||
params.m_query = query;
|
||||
params.m_inputLocale = "en";
|
||||
params.SetMode(Mode::Everywhere);
|
||||
params.SetSuggestsEnabled(false);
|
||||
|
||||
auto request = make_unique<TestSearchRequest>(m_engine, params, m_viewport);
|
||||
request->Wait();
|
||||
return request;
|
||||
}
|
||||
|
||||
bool MatchResults(vector<shared_ptr<MatchingRule>> rules,
|
||||
vector<search::Result> const & actual) const
|
||||
{
|
||||
return ::MatchResults(m_engine, rules, actual);
|
||||
}
|
||||
};
|
||||
|
||||
UNIT_CLASS_TEST(SearchQueryV2Test, Smoke)
|
||||
|
@ -271,7 +282,7 @@ UNIT_CLASS_TEST(SearchQueryV2Test, DisableSuggests)
|
|||
request.Wait();
|
||||
TRules rules = {ExactMatch(worldId, london1), ExactMatch(worldId, london2)};
|
||||
|
||||
TEST(MatchResults(m_engine, rules, request.Results()), ());
|
||||
TEST(MatchResults(rules, request.Results()), ());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -321,41 +332,33 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo)
|
|||
|
||||
SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
|
||||
{
|
||||
SearchParams params;
|
||||
MakeDefaultTestParams("golden gate bridge ", params);
|
||||
|
||||
TestSearchRequest request(m_engine, params, m_viewport);
|
||||
request.Wait();
|
||||
auto request = MakeRequest("golden gate bridge ");
|
||||
|
||||
TRules rules = {ExactMatch(wonderlandId, goldenGateBridge),
|
||||
ExactMatch(wonderlandId, goldenGateStreet)};
|
||||
|
||||
TEST(MatchResults(m_engine, rules, request.Results()), ());
|
||||
for (auto const & result : request.Results())
|
||||
TEST(MatchResults(rules, request->Results()), ());
|
||||
for (auto const & result : request->Results())
|
||||
{
|
||||
auto const & info = result.GetRankingInfo();
|
||||
TEST_EQUAL(NAME_SCORE_FULL_MATCH, info.m_nameScore, (result));
|
||||
TEST(my::AlmostEqualAbs(1.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
|
||||
TEST(!info.m_pureCats, (result));
|
||||
TEST(!info.m_falseCats, (result));
|
||||
}
|
||||
}
|
||||
|
||||
// This test is quite important and must always pass.
|
||||
{
|
||||
SearchParams params;
|
||||
MakeDefaultTestParams("cafe лермонтов", params);
|
||||
|
||||
TestSearchRequest request(m_engine, params, m_viewport);
|
||||
request.Wait();
|
||||
|
||||
auto const & results = request.Results();
|
||||
auto request = MakeRequest("cafe лермонтов");
|
||||
auto const & results = request->Results();
|
||||
|
||||
TRules rules{ExactMatch(wonderlandId, cafe1), ExactMatch(wonderlandId, cafe2),
|
||||
ExactMatch(wonderlandId, lermontov)};
|
||||
TEST(MatchResults(m_engine, rules, results), ());
|
||||
TEST(MatchResults(rules, results), ());
|
||||
|
||||
TEST_EQUAL(3, results.size(), ("Unexpected number of retrieved cafes."));
|
||||
auto const & top = results.front();
|
||||
TEST(MatchResults(m_engine, {ExactMatch(wonderlandId, lermontov)}, {top}), ());
|
||||
TEST(MatchResults({ExactMatch(wonderlandId, lermontov)}, {top}), ());
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -471,6 +474,9 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
|
|||
TestPOI named(m2::PointD(0.0001, 0.0001), "ATM", "en");
|
||||
named.SetTypes({{"amenity", "atm"}});
|
||||
|
||||
TestPOI busStop(m2::PointD(0.00005, 0.0005), "ATM Bus Stop", "en");
|
||||
busStop.SetTypes({{"highway", "bus_stop"}});
|
||||
|
||||
BuildWorld([&](TestMwmBuilder & builder)
|
||||
{
|
||||
builder.Add(sanFrancisco);
|
||||
|
@ -479,31 +485,51 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
|
|||
{
|
||||
builder.Add(named);
|
||||
builder.Add(noname);
|
||||
builder.Add(busStop);
|
||||
});
|
||||
|
||||
SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
|
||||
TRules rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};
|
||||
|
||||
TEST(ResultsMatch("atm", rules), ());
|
||||
|
||||
{
|
||||
SearchParams params;
|
||||
MakeDefaultTestParams("#atm", params);
|
||||
TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named),
|
||||
ExactMatch(wonderlandId, busStop)};
|
||||
|
||||
TestSearchRequest request(m_engine, params, m_viewport);
|
||||
request.Wait();
|
||||
auto request = MakeRequest("atm");
|
||||
TEST(MatchResults(rules, request->Results()), ());
|
||||
for (auto const & result : request->Results())
|
||||
{
|
||||
Index::FeaturesLoaderGuard loader(m_engine, wonderlandId);
|
||||
FeatureType ft;
|
||||
loader.GetFeatureByIndex(result.GetFeatureID().m_index, ft);
|
||||
|
||||
TEST(MatchResults(m_engine, rules, request.Results()), ());
|
||||
for (auto const & result : request.Results())
|
||||
auto const & info = result.GetRankingInfo();
|
||||
|
||||
if (busStop.Matches(ft))
|
||||
{
|
||||
TEST(!info.m_pureCats, (result));
|
||||
TEST(info.m_falseCats, (result));
|
||||
}
|
||||
else
|
||||
{
|
||||
TEST(info.m_pureCats, (result));
|
||||
TEST(!info.m_falseCats, (result));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};
|
||||
|
||||
auto request = MakeRequest("#atm");
|
||||
|
||||
TEST(MatchResults(rules, request->Results()), ());
|
||||
for (auto const & result : request->Results())
|
||||
{
|
||||
auto const & info = result.GetRankingInfo();
|
||||
|
||||
// Token with a hashtag should not participate in name-score
|
||||
// calculations.
|
||||
TEST_EQUAL(NAME_SCORE_ZERO, info.m_nameScore, (result));
|
||||
|
||||
// TODO (@y): fix this. Name coverage calculations are flawed.
|
||||
// TEST(my::AlmostEqualAbs(0.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
79
search/search_quality/download-maps.sh
Executable file
79
search/search_quality/download-maps.sh
Executable file
|
@ -0,0 +1,79 @@
|
|||
#!/bin/bash

# Downloads all maps necessary for learning to rank to the current
# directory.
#
# Options:
#   -v VERSION  version of maps to download (required)
#   -a          download every map of that version instead of the
#               built-in subset listed in NAMES
#   -h          print usage and exit

ALL=
VERSION=
BASE="http://direct.mapswithme.com/direct"

# Prints the command-line help to the standard output.
display_usage() {
  echo "Usage: $0 -v [version] -a -h"
  echo " -v version of maps to download"
  echo " -a download all maps of the specified version"
  echo " -h display this message"
}

while getopts ":av:h" opt
do
  case "$opt" in
    a) ALL=1
       ;;
    v) VERSION="$OPTARG"
       ;;
    h) display_usage
       # Asking for help is not a failure.
       exit 0
       ;;
    \?) echo "Invalid option: -$OPTARG" 1>&2
        ;;
    :) echo "Option -$OPTARG requires an argument" 1>&2
       ;;
  esac
done

if [ -z "$VERSION" ]
then
  echo "Version of maps is not specified." 1>&2
  # Exit statuses must lie in 0..255, so use 1 rather than -1.
  exit 1
fi

# Checks that the requested version shows up in the server's directory
# listing. sed strips the trailing slash from every href, so the parent
# directory entry is ".." here. The version is matched as a fixed
# string (-F) so that characters such as "." are not treated as regex
# metacharacters.
if ! curl "$BASE/" 2>/dev/null |
    sed -n 's/^.*href="\(.*\)\/".*$/\1/p' |
    grep -v '^\.\.$' | grep -qF -- "$VERSION"
then
  echo "Invalid version: $VERSION" 1>&2
  exit 1
fi

# wget accept-patterns of the maps used for learning to rank.
NAMES=("Australia_Brisbane.mwm"
       "Belarus_Minsk*.mwm"
       "Germany_*.mwm"
       "Russia_*.mwm"
       "UK_England_*.mwm"
       "US_California_*.mwm"
       "US_Maryland_*.mwm")

DIR="$BASE/$VERSION"

if [ "$ALL" ]
then
  echo "Downloading all maps..."

  files=$(curl "$DIR/" 2>/dev/null | sed -n 's/^.*href="\(.*\.mwm\)".*$/\1/p')

  set -e
  set -x
  # $files is left unquoted on purpose: it is a whitespace-separated
  # list of file names that has to be split into words.
  for file in $files
  do
    wget -np -nd "$DIR/$file"
  done
else
  echo "Downloading maps..."

  set -e
  set -x
  # The array expansion is quoted so that the wildcard patterns reach
  # wget's -A option verbatim instead of being expanded by the shell
  # against .mwm files already present in the current directory.
  for name in "${NAMES[@]}"
  do
    wget -r -np -nd -A "$name" "$DIR/"
  done
fi
|
|
@ -141,15 +141,26 @@ void DisplayStats(ostream & os, vector<Sample> const & samples, vector<Stats> co
|
|||
{
|
||||
auto const n = samples.size();
|
||||
ASSERT_EQUAL(stats.size(), n, ());
|
||||
|
||||
size_t numWarnings = 0;
|
||||
for (auto const & stat : stats)
|
||||
{
|
||||
if (!stat.m_notFound.empty())
|
||||
++numWarnings;
|
||||
}
|
||||
|
||||
if (numWarnings == 0)
|
||||
{
|
||||
os << "All " << stats.size() << " queries are OK." << endl;
|
||||
return;
|
||||
}
|
||||
|
||||
os << numWarnings << " warnings." << endl;
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\"";
|
||||
if (stats[i].m_notFound.empty())
|
||||
{
|
||||
os << ": OK" << endl;
|
||||
continue;
|
||||
}
|
||||
os << ": WARNING" << endl;
|
||||
os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\":" << endl;
|
||||
for (auto const & j : stats[i].m_notFound)
|
||||
os << "Not found: " << DebugPrint(samples[i].m_results[j]) << endl;
|
||||
}
|
||||
|
|
|
@ -1,41 +1,67 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from math import exp, log
|
||||
from scipy.stats import pearsonr
|
||||
from sklearn import cross_validation, grid_search, svm
|
||||
import argparse
|
||||
import collections
|
||||
import itertools
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import random
|
||||
import sys
|
||||
|
||||
FEATURES = ['DistanceToPivot', 'Rank', 'NameScore', 'NameCoverage', 'SearchType']
|
||||
|
||||
MAX_DISTANCE_METERS = 2e7
|
||||
MAX_DISTANCE_METERS = 2e6
|
||||
MAX_RANK = 255
|
||||
RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
|
||||
NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match']
|
||||
SEARCH_TYPES = {'POI': 0,
|
||||
'BUILDING': 0,
|
||||
'STREET': 1,
|
||||
'UNCLASSIFIED': 2,
|
||||
'VILLAGE': 3,
|
||||
'CITY': 4,
|
||||
'STATE': 5,
|
||||
'COUNTRY': 6}
|
||||
SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
|
||||
|
||||
FEATURES = ['DistanceToPivot', 'Rank'] + NAME_SCORES + SEARCH_TYPES
|
||||
|
||||
|
||||
def transform_name_score(value, categories_match):
    """
    Returns 'Zero' when categories_match equals 1. Otherwise returns
    'Full Match' in place of 'Full Match Prefix' and any other value
    unchanged.
    """
    if categories_match == 1:
        return 'Zero'
    return 'Full Match' if value == 'Full Match Prefix' else value
|
||||
|
||||
|
||||
def normalize_data(data):
|
||||
transform_distance = lambda d: 1 - min(d, MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS
|
||||
|
||||
max_name_score = len(NAME_SCORES) - 1
|
||||
max_search_type = SEARCH_TYPES['COUNTRY']
|
||||
transform_distance = lambda v: min(v, MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS
|
||||
|
||||
data['DistanceToPivot'] = data['DistanceToPivot'].apply(transform_distance)
|
||||
data['Rank'] = data['Rank'].apply(lambda rank: rank / MAX_RANK)
|
||||
data['NameScore'] = data['NameScore'].apply(lambda s: NAME_SCORES.index(s) / max_name_score)
|
||||
data['SearchType'] = data['SearchType'].apply(lambda t: SEARCH_TYPES[t] / max_search_type)
|
||||
data['Relevance'] = data['Relevance'].apply(lambda r: RELEVANCES[r])
|
||||
data['Rank'] = data['Rank'].apply(lambda v: v / MAX_RANK)
|
||||
data['Relevance'] = data['Relevance'].apply(lambda v: RELEVANCES[v])
|
||||
|
||||
cats = data['PureCats'].combine(data['FalseCats'], max)
|
||||
|
||||
# Full prefix match is unified with a full match as these features
|
||||
# are collinear. But we need both of them as they're also used in
|
||||
# locality sorting.
|
||||
#
|
||||
# TODO (@y, @m): do forward/backward/subset selection of features
|
||||
# instead of this merging. It would be great to conduct PCA on
|
||||
# the features too.
|
||||
data['NameScore'] = data['NameScore'].combine(cats, transform_name_score)
|
||||
|
||||
# Adds dummy variables to data for NAME_SCORES.
|
||||
for ns in NAME_SCORES:
|
||||
data[ns] = data['NameScore'].apply(lambda v: int(ns == v))
|
||||
|
||||
# Adds dummy variables to data for SEARCH_TYPES.
|
||||
|
||||
# We unify BUILDING with POI here, as we don't have enough
|
||||
# training data to distinguish between them. Remove following
|
||||
# line as soon as the model is changed or we have enough
|
||||
# training data.
|
||||
data['SearchType'] = data['SearchType'].apply(lambda v: v if v != 'Building' else 'POI')
|
||||
for st in SEARCH_TYPES:
|
||||
data[st] = data['SearchType'].apply(lambda v: int(st == v))
|
||||
|
||||
|
||||
def compute_ndcg(relevances):
|
||||
|
@ -44,25 +70,12 @@ def compute_ndcg(relevances):
|
|||
array of scores.
|
||||
"""
|
||||
|
||||
relevances_summary = collections.defaultdict(int)
|
||||
|
||||
dcg = 0
|
||||
for i, relevance in enumerate(relevances):
|
||||
dcg += relevance / log(2 + i, 2)
|
||||
relevances_summary[relevance] += 1
|
||||
|
||||
dcg_norm, i = 0, 0
|
||||
for relevance in sorted(relevances_summary.keys(), reverse=True):
|
||||
for _ in range(relevances_summary[relevance]):
|
||||
dcg_norm += relevance / log(2 + i, 2)
|
||||
i += 1
|
||||
|
||||
if dcg_norm == 0:
|
||||
return 0
|
||||
return dcg / dcg_norm
|
||||
dcg = sum(r / log(2 + i, 2) for i, r in enumerate(relevances))
|
||||
dcg_norm = sum(r / log(2 + i, 2) for i, r in enumerate(sorted(relevances, reverse=True)))
|
||||
return dcg / dcg_norm if dcg_norm != 0 else 0
|
||||
|
||||
|
||||
def compute_ndcg_without_w(data):
|
||||
def compute_ndcgs_without_ws(data):
|
||||
"""
|
||||
Computes NDCG (Normalized Discounted Cumulative Gain) for a given
|
||||
data. Returns an array of ndcg scores in the shape [num groups of
|
||||
|
@ -77,17 +90,17 @@ def compute_ndcg_without_w(data):
|
|||
relevances = np.array(data.ix[indices]['Relevance'])
|
||||
ndcgs.append(compute_ndcg(relevances))
|
||||
|
||||
return np.array(ndcgs)
|
||||
return ndcgs
|
||||
|
||||
|
||||
def compute_ndcg_for_w(data, w):
|
||||
def compute_ndcgs_for_ws(data, ws):
|
||||
"""
|
||||
Computes NDCG (Normalized Discounted Cumulative Gain) for a given
|
||||
data and an array of coeffs in a linear model. Returns an array of
|
||||
ndcg scores in the shape [num groups of features].
|
||||
"""
|
||||
|
||||
data_scores = np.array([np.dot(data.ix[i][FEATURES], w) for i in data.index])
|
||||
data_scores = np.array([np.dot(data.ix[i][FEATURES], ws) for i in data.index])
|
||||
grouped = data.groupby(data['SampleId'], sort=False).groups
|
||||
|
||||
ndcgs = []
|
||||
|
@ -101,7 +114,7 @@ def compute_ndcg_for_w(data, w):
|
|||
relevances = relevances[scores.argsort()[::-1]]
|
||||
ndcgs.append(compute_ndcg(relevances))
|
||||
|
||||
return np.array(ndcgs)
|
||||
return ndcgs
|
||||
|
||||
|
||||
def transform_data(data):
|
||||
|
@ -150,36 +163,144 @@ def transform_data(data):
|
|||
return xs, ys
|
||||
|
||||
|
||||
def plot_diagrams(xs, ys, features):
    """
    For each feature, plots histograms of x * sign(y), where x is a
    slice on the feature of a list of pairwise differences between
    input feature-vectors and y is a list of pairwise differences
    between relevances of the input feature-vectors. Strong bias
    toward positive or negative values in histograms indicates that
    the current feature is important for ranking, as there is a
    correlation between difference between features values and
    relevancy.

    xs: sequence of feature vectors, indexable by feature position.
    ys: sequence of values paired element-wise with xs.
    features: feature names, in the column order of xs; used as titles.
    """
    # Keep the 4x4 layout for up to 16 features and add rows beyond
    # that, since a subplot index larger than rows * cols is rejected.
    cols = 4
    rows = max(4, (len(features) + cols - 1) // cols)

    for i, f in enumerate(features):
        values = [x[i] * np.sign(y) for x, y in zip(xs, ys)]
        if not values:
            # min()/max() below would raise on an empty sample.
            continue

        # Symmetric range around zero, so that a bias toward either
        # sign is visible at a glance.
        l, r = min(values), max(values)
        d = max(abs(l), abs(r))

        plt.subplot(rows, cols, i + 1)
        plt.hist(values, bins=8, range=(-d, d))
        plt.title(f)
    plt.show()
|
||||
|
||||
|
||||
def show_pearson_statistics(xs, ys, features, alpha=0.05):
    """
    Shows info about Pearson coefficient between features and
    relevancy.

    For every feature prints the Pearson correlation coefficient
    between that feature's column of xs and ys, together with the
    p-value returned by pearsonr, then prints the lists of features
    whose p-value is below / not below alpha.

    xs: sequence of feature vectors, indexable by feature position.
    ys: sequence of values paired element-wise with xs.
    features: feature names, in the column order of xs.
    alpha: significance level used to reject H0 (defaults to 0.05).
    """

    print('***** Correlation table *****')
    print('H0 - feature is not correlated with relevancy')
    print('H1 - feature is correlated with relevancy')
    print()

    cs, ncs = [], []
    for i, f in enumerate(features):
        zs = [x[i] for x in xs]
        (c, p) = pearsonr(zs, ys)

        # The p-value is reported as is: 1 - p is not the probability
        # of H1, so it must not be labelled as such.
        print('{}: pearson={:.3f}, p-value={:.3g}'.format(f, c, p))
        if p < alpha:
            cs.append(f)
        else:
            ncs.append(f)

    print()
    print('Correlated:', cs)
    print('Non-correlated:', ncs)
|
||||
|
||||
|
||||
def raw_output(features, ws):
    """
    Writes one "feature: coeff" line per (feature, coeff) pair to the
    standard output.
    """
    for pair in zip(features, ws):
        print('{}: {}'.format(*pair))
|
||||
|
||||
|
||||
def print_const(name, value):
    """
    Prints a C++ definition of a double constant called k<name>, with
    value formatted to seven digits after the decimal point.
    """
    declaration = 'double const k{} = {:.7f};'.format(name, value)
    print(declaration)
|
||||
|
||||
|
||||
def print_array(name, size, values):
    """
    Prints a C++ definition of a double array called name with the
    extent size. values is an iterable of (comment, number) pairs;
    each pair becomes one element followed by its comment.
    """
    print('double const {}[{}] = {{'.format(name, size))

    rows = []
    for comment, number in values:
        rows.append(' {:.7f} /* {} */'.format(number, comment))
    print(',\n'.join(rows))

    print('};')
|
||||
|
||||
def cpp_output(features, ws):
    """
    Prints feature-coeff pairs in the C++-compatible format.

    Coeffs of features listed in NAME_SCORES and SEARCH_TYPES are
    collected and printed as the kNameScore and kSearchType arrays
    after the loop; every other coeff is printed immediately as a
    standalone constant.
    """
    name_score_coeffs = []
    search_type_coeffs = []

    for pair in zip(features, ws):
        feature, weight = pair
        if feature in NAME_SCORES:
            name_score_coeffs.append(pair)
            continue
        if feature in SEARCH_TYPES:
            search_type_coeffs.append(pair)
            continue
        print_const(feature, weight)

    print_array('kNameScore', 'NameScore::NAME_SCORE_COUNT', name_score_coeffs)
    print_array('kSearchType', 'SearchModel::SEARCH_TYPE_COUNT', search_type_coeffs)
|
||||
|
||||
|
||||
def main(args):
|
||||
data = pd.read_csv(sys.stdin)
|
||||
normalize_data(data)
|
||||
|
||||
ndcg = compute_ndcg_without_w(data);
|
||||
print('Current NDCG: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg)))
|
||||
ndcgs = compute_ndcgs_without_ws(data);
|
||||
print('Current NDCG: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
|
||||
print()
|
||||
|
||||
x, y = transform_data(data)
|
||||
xs, ys = transform_data(data)
|
||||
|
||||
if args.plot:
|
||||
plot_diagrams(xs, ys, FEATURES)
|
||||
|
||||
clf = svm.LinearSVC(random_state=args.seed)
|
||||
cv = cross_validation.KFold(len(y), n_folds=5, shuffle=True, random_state=args.seed)
|
||||
cv = cross_validation.KFold(len(ys), n_folds=5, shuffle=True, random_state=args.seed)
|
||||
|
||||
# "C" stands for the regularizer constant.
|
||||
grid = {'C': np.power(10.0, np.arange(-5, 6))}
|
||||
gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
|
||||
gs.fit(x, y)
|
||||
gs.fit(xs, ys)
|
||||
|
||||
w = gs.best_estimator_.coef_[0]
|
||||
ndcg = compute_ndcg_for_w(data, w)
|
||||
ws = gs.best_estimator_.coef_[0]
|
||||
max_w = max(abs(w) for w in ws)
|
||||
ws = np.divide(ws, max_w)
|
||||
|
||||
# Following code restores coeffs for merged features.
|
||||
ws[FEATURES.index('Building')] = ws[FEATURES.index('POI')]
|
||||
ws[FEATURES.index('Full Match Prefix')] = ws[FEATURES.index('Full Match')]
|
||||
|
||||
ndcgs = compute_ndcgs_for_ws(data, ws)
|
||||
|
||||
print('NDCG mean: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
|
||||
print('Accuracy: {}'.format(gs.best_score_))
|
||||
|
||||
if args.pearson:
|
||||
print()
|
||||
show_pearson_statistics(xs, ys, FEATURES)
|
||||
|
||||
print('NDCG mean: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg)))
|
||||
print()
|
||||
print('Linear model weights:')
|
||||
for f, c in zip(FEATURES, w):
|
||||
print('{}: {}'.format(f, c))
|
||||
print('***** Linear model weights *****')
|
||||
if args.cpp:
|
||||
cpp_output(FEATURES, ws)
|
||||
else:
|
||||
raw_output(FEATURES, ws)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: an optional integer seed plus three
    # boolean switches, all handed to main() as a single namespace.
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', help='random seed', type=int)
    for flag, description in (('--plot', 'plot diagrams'),
                              ('--pearson', 'show pearson statistics'),
                              ('--cpp', 'generate output in the C++ format')):
        parser.add_argument(flag, help=description, action='store_true')
    args = parser.parse_args()
    main(args)
|
||||
|
|
|
@ -12,7 +12,6 @@
|
|||
#include "search/v2/pre_ranking_info.hpp"
|
||||
#include "search/v2/ranking_info.hpp"
|
||||
#include "search/v2/ranking_utils.hpp"
|
||||
#include "search/v2/token_slice.hpp"
|
||||
|
||||
#include "storage/country_info_getter.hpp"
|
||||
#include "storage/index.hpp"
|
||||
|
@ -191,20 +190,11 @@ void UpdateNameScore(string const & name, TSlice const & slice, v2::NameScore &
|
|||
|
||||
template <typename TSlice>
|
||||
void UpdateNameScore(vector<strings::UniString> const & tokens, TSlice const & slice,
|
||||
v2::NameScore & bestScore, double & bestCoverage)
|
||||
v2::NameScore & bestScore)
|
||||
{
|
||||
auto const score = v2::GetNameScore(tokens, slice);
|
||||
auto const coverage =
|
||||
tokens.empty() ? 0 : static_cast<double>(slice.Size()) / static_cast<double>(tokens.size());
|
||||
if (score > bestScore)
|
||||
{
|
||||
bestScore = score;
|
||||
bestCoverage = coverage;
|
||||
}
|
||||
else if (score == bestScore && coverage > bestCoverage)
|
||||
{
|
||||
bestCoverage = coverage;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true when |s| is non-empty and its first symbol is '#'.
inline bool IsHashtagged(strings::UniString const & s)
{
  if (s.empty())
    return false;
  return s[0] == '#';
}
|
||||
|
@ -422,29 +412,18 @@ int Query::GetCategoryLocales(int8_t (&arr) [3]) const
|
|||
}
|
||||
|
||||
template <class ToDo>
|
||||
void Query::ForEachCategoryTypes(ToDo toDo) const
|
||||
void Query::ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const
|
||||
{
|
||||
int8_t arrLocales[3];
|
||||
int const localesCount = GetCategoryLocales(arrLocales);
|
||||
size_t const tokensCount = m_tokens.size();
|
||||
|
||||
for (size_t i = 0; i < tokensCount; ++i)
|
||||
for (size_t i = 0; i < slice.Size(); ++i)
|
||||
{
|
||||
auto token = RemoveHashtag(m_tokens[i]);
|
||||
|
||||
auto token = RemoveHashtag(slice.Get(i));
|
||||
for (int j = 0; j < localesCount; ++j)
|
||||
m_categories.ForEachTypeByName(arrLocales[j], token, bind<void>(ref(toDo), i, _1));
|
||||
ProcessEmojiIfNeeded(token, i, toDo);
|
||||
}
|
||||
|
||||
if (!m_prefix.empty())
|
||||
{
|
||||
auto prefix = RemoveHashtag(m_prefix);
|
||||
|
||||
for (int j = 0; j < localesCount; ++j)
|
||||
m_categories.ForEachTypeByName(arrLocales[j], prefix, bind<void>(ref(toDo), tokensCount, _1));
|
||||
ProcessEmojiIfNeeded(prefix, tokensCount, toDo);
|
||||
}
|
||||
}
|
||||
|
||||
template <class ToDo>
|
||||
|
@ -522,10 +501,11 @@ void Query::SetQuery(string const & query)
|
|||
|
||||
// Get preferred types to show in results.
|
||||
m_prefferedTypes.clear();
|
||||
ForEachCategoryTypes([&] (size_t, uint32_t t)
|
||||
{
|
||||
m_prefferedTypes.insert(t);
|
||||
});
|
||||
ForEachCategoryTypes(v2::QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix),
|
||||
[&](size_t, uint32_t t)
|
||||
{
|
||||
m_prefferedTypes.insert(t);
|
||||
});
|
||||
}
|
||||
|
||||
void Query::FlushViewportResults(v2::Geocoder::Params const & params, Results & res,
|
||||
|
@ -660,7 +640,6 @@ class PreResult2Maker
|
|||
info.m_distanceToPivot = MercatorBounds::DistanceOnEarth(center, pivot);
|
||||
info.m_rank = preInfo.m_rank;
|
||||
info.m_searchType = preInfo.m_searchType;
|
||||
|
||||
info.m_nameScore = v2::NAME_SCORE_ZERO;
|
||||
|
||||
v2::TokenSlice slice(m_params, preInfo.m_startToken, preInfo.m_endToken);
|
||||
|
@ -675,12 +654,30 @@ class PreResult2Maker
|
|||
vector<strings::UniString> tokens;
|
||||
SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
|
||||
|
||||
UpdateNameScore(tokens, slice, info.m_nameScore, info.m_nameCoverage);
|
||||
UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore, info.m_nameCoverage);
|
||||
UpdateNameScore(tokens, slice, info.m_nameScore);
|
||||
UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore);
|
||||
}
|
||||
|
||||
if (info.m_searchType == v2::SearchModel::SEARCH_TYPE_BUILDING)
|
||||
UpdateNameScore(ft.GetHouseNumber(), sliceNoCategories, info.m_nameScore);
|
||||
|
||||
feature::TypesHolder holder(ft);
|
||||
vector<pair<size_t, size_t>> matched(slice.Size());
|
||||
m_query.ForEachCategoryTypes(v2::QuerySliceOnTokens(slice), [&](size_t i, uint32_t t)
|
||||
{
|
||||
++matched[i].second;
|
||||
if (holder.Has(t))
|
||||
++matched[i].first;
|
||||
});
|
||||
|
||||
info.m_pureCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
|
||||
{
|
||||
return m.first != 0;
|
||||
});
|
||||
info.m_falseCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
|
||||
{
|
||||
return m.first == 0 && m.second != 0;
|
||||
});
|
||||
}
|
||||
|
||||
uint8_t NormalizeRank(uint8_t rank, v2::SearchModel::SearchType type, m2::PointD const & center,
|
||||
|
@ -1259,7 +1256,8 @@ void Query::InitParams(bool localitySearch, SearchQueryParams & params)
|
|||
}
|
||||
}
|
||||
};
|
||||
ForEachCategoryTypes(addSyms);
|
||||
ForEachCategoryTypes(v2::QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix),
|
||||
addSyms);
|
||||
}
|
||||
|
||||
for (auto & tokens : params.m_tokens)
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include "search/suggest.hpp"
|
||||
#include "search/v2/geocoder.hpp"
|
||||
#include "search/v2/rank_table_cache.hpp"
|
||||
#include "search/v2/token_slice.hpp"
|
||||
|
||||
#include "indexer/ftypes_matcher.hpp"
|
||||
#include "indexer/index.hpp"
|
||||
|
@ -145,7 +146,8 @@ protected:
|
|||
void ClearResults();
|
||||
|
||||
int GetCategoryLocales(int8_t (&arr) [3]) const;
|
||||
template <class ToDo> void ForEachCategoryTypes(ToDo toDo) const;
|
||||
template <class ToDo>
|
||||
void ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const;
|
||||
template <class ToDo> void ProcessEmojiIfNeeded(
|
||||
strings::UniString const & token, size_t ind, ToDo & toDo) const;
|
||||
|
||||
|
|
|
@ -44,5 +44,6 @@ UNIT_TEST(NameTest_Smoke)
|
|||
TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", 2, 3), NAME_SCORE_FULL_MATCH, ());
|
||||
TEST_EQUAL(GetScore("San Francisco", "Fran", 0, 1), NAME_SCORE_SUBSTRING_PREFIX, ());
|
||||
TEST_EQUAL(GetScore("San Francisco", "Fran ", 0, 1), NAME_SCORE_ZERO, ());
|
||||
TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", 0, 1), NAME_SCORE_FULL_MATCH_PREFIX, ());
|
||||
}
|
||||
} // namespace
|
||||
|
|
|
@ -73,12 +73,6 @@ size_t constexpr kLocalityRectsCacheSize = 10;
|
|||
|
||||
strings::UniString const kUniSpace(strings::MakeUniString(" "));
|
||||
|
||||
// Identity functor: returns a reference to its argument unchanged.
template <typename T>
struct Id
{
  T const & operator()(T const & value) const
  {
    return value;
  }
};
|
||||
|
||||
struct ScopedMarkTokens
|
||||
{
|
||||
ScopedMarkTokens(vector<bool> & usedTokens, size_t from, size_t to)
|
||||
|
@ -1563,12 +1557,12 @@ SearchModel::SearchType Geocoder::GetSearchTypeInGeocoding(uint32_t featureId)
|
|||
|
||||
bool Geocoder::AllTokensUsed() const
|
||||
{
|
||||
return all_of(m_usedTokens.begin(), m_usedTokens.end(), Id<bool>());
|
||||
return all_of(m_usedTokens.begin(), m_usedTokens.end(), IdFunctor());
|
||||
}
|
||||
|
||||
bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const
|
||||
{
|
||||
return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, Id<bool>());
|
||||
return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, IdFunctor());
|
||||
}
|
||||
|
||||
size_t Geocoder::NumUnusedTokensGroups() const
|
||||
|
|
|
@ -12,20 +12,34 @@ namespace
|
|||
{
|
||||
// See search/search_quality/scoring_model.py for details. In short,
|
||||
// these coeffs correspond to coeffs in a linear model.
|
||||
double const kDistanceToPivot = 0.19933969103335503;
|
||||
double const kRank = 3.528698483480807;
|
||||
double const kNameScore = 1.0050524496846687;
|
||||
double const kNameCoverage = 0.33989660511789926;
|
||||
double const kSearchType = 1.1949307125113533;
|
||||
double const kDistanceToPivot = -1.0000000;
|
||||
double const kRank = 0.5430747;
|
||||
double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
|
||||
-0.3686323 /* Zero */,
|
||||
0.0977193 /* Substring Prefix */,
|
||||
0.1340500 /* Substring */,
|
||||
0.1368631 /* Full Match Prefix */,
|
||||
0.1368631 /* Full Match */
|
||||
};
|
||||
double const kSearchType[SearchModel::SEARCH_TYPE_COUNT] = {
|
||||
-0.9195533 /* POI */,
|
||||
-0.9195533 /* Building */,
|
||||
-0.1470504 /* Street */,
|
||||
-0.6392620 /* Unclassified */,
|
||||
-0.0900970 /* Village */,
|
||||
0.4383605 /* City */,
|
||||
0.6296097 /* State */,
|
||||
0.7279924 /* Country */
|
||||
};
|
||||
|
||||
double TransformDistance(double distance)
|
||||
{
|
||||
return 1.0 - min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters;
|
||||
return min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// static
|
||||
double const RankingInfo::kMaxDistMeters = 2e7;
|
||||
double const RankingInfo::kMaxDistMeters = 2e6;
|
||||
|
||||
// static
|
||||
void RankingInfo::PrintCSVHeader(ostream & os)
|
||||
|
@ -33,8 +47,9 @@ void RankingInfo::PrintCSVHeader(ostream & os)
|
|||
os << "DistanceToPivot"
|
||||
<< ",Rank"
|
||||
<< ",NameScore"
|
||||
<< ",NameCoverage"
|
||||
<< ",SearchType";
|
||||
<< ",SearchType"
|
||||
<< ",PureCats"
|
||||
<< ",FalseCats";
|
||||
}
|
||||
|
||||
string DebugPrint(RankingInfo const & info)
|
||||
|
@ -44,8 +59,9 @@ string DebugPrint(RankingInfo const & info)
|
|||
os << "m_distanceToPivot:" << info.m_distanceToPivot << ",";
|
||||
os << "m_rank:" << static_cast<int>(info.m_rank) << ",";
|
||||
os << "m_nameScore:" << DebugPrint(info.m_nameScore) << ",";
|
||||
os << "m_nameCoverage:" << info.m_nameCoverage << ",";
|
||||
os << "m_searchType:" << DebugPrint(info.m_searchType);
|
||||
os << "m_searchType:" << DebugPrint(info.m_searchType) << ",";
|
||||
os << "m_pureCats:" << info.m_pureCats << ",";
|
||||
os << "m_falseCats:" << info.m_falseCats;
|
||||
os << "]";
|
||||
return os.str();
|
||||
}
|
||||
|
@ -54,7 +70,7 @@ void RankingInfo::ToCSV(ostream & os) const
|
|||
{
|
||||
os << fixed;
|
||||
os << m_distanceToPivot << "," << static_cast<int>(m_rank) << "," << DebugPrint(m_nameScore)
|
||||
<< "," << m_nameCoverage << "," << DebugPrint(m_searchType);
|
||||
<< "," << DebugPrint(m_searchType) << "," << m_pureCats << "," << m_falseCats;
|
||||
}
|
||||
|
||||
double RankingInfo::GetLinearModelRank() const
|
||||
|
@ -65,24 +81,21 @@ double RankingInfo::GetLinearModelRank() const
|
|||
// integrated in the build system.
|
||||
double const distanceToPivot = TransformDistance(m_distanceToPivot);
|
||||
double const rank = static_cast<double>(m_rank) / numeric_limits<uint8_t>::max();
|
||||
double const nameScore = static_cast<double>(m_nameScore) / NAME_SCORE_FULL_MATCH;
|
||||
double const nameCoverage = m_nameCoverage;
|
||||
|
||||
double searchType;
|
||||
switch (m_searchType)
|
||||
auto nameScore = m_nameScore;
|
||||
if (m_pureCats || m_falseCats)
|
||||
{
|
||||
case SearchModel::SEARCH_TYPE_POI:
|
||||
case SearchModel::SEARCH_TYPE_BUILDING:
|
||||
searchType = 0;
|
||||
break;
|
||||
default:
|
||||
searchType = m_searchType - 1;
|
||||
break;
|
||||
// If the feature was matched only by categorial tokens, it's
|
||||
// better for ranking to set name score to zero. For example,
|
||||
// when we're looking for a "cafe", cafes "Cafe Pushkin" and
|
||||
// "Lermontov" both match to the request, but must be ranked in
|
||||
// accordance to their distances to the user position or viewport,
|
||||
// even though "Cafe Pushkin" has a non-zero name score.
|
||||
nameScore = NAME_SCORE_ZERO;
|
||||
}
|
||||
searchType = searchType / (SearchModel::SEARCH_TYPE_COUNTRY - 1);
|
||||
|
||||
return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore * nameScore +
|
||||
kNameCoverage * nameCoverage + kSearchType * searchType;
|
||||
return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] +
|
||||
kSearchType[m_searchType];
|
||||
}
|
||||
} // namespace v2
|
||||
} // namespace search
|
||||
|
|
|
@ -24,12 +24,18 @@ struct RankingInfo
|
|||
// Score for the feature's name.
|
||||
NameScore m_nameScore = NAME_SCORE_ZERO;
|
||||
|
||||
// Fraction of tokens from the query matched to a feature name.
|
||||
double m_nameCoverage = 0;
|
||||
|
||||
// Search type for the feature.
|
||||
SearchModel::SearchType m_searchType = SearchModel::SEARCH_TYPE_COUNT;
|
||||
|
||||
// True if all of the tokens that the feature was matched by
|
||||
// correspond to this feature's categories.
|
||||
bool m_pureCats = false;
|
||||
|
||||
// True if none of the tokens that the feature was matched by
|
||||
// corresponds to this feature's categories although all of the
|
||||
// tokens are categorial ones.
|
||||
bool m_falseCats = false;
|
||||
|
||||
static void PrintCSVHeader(ostream & os);
|
||||
|
||||
void ToCSV(ostream & os) const;
|
||||
|
|
|
@ -130,14 +130,14 @@ string DebugPrint(SearchModel::SearchType type)
|
|||
switch (type)
|
||||
{
|
||||
case SearchModel::SEARCH_TYPE_POI: return "POI";
|
||||
case SearchModel::SEARCH_TYPE_BUILDING: return "BUILDING";
|
||||
case SearchModel::SEARCH_TYPE_STREET: return "STREET";
|
||||
case SearchModel::SEARCH_TYPE_CITY: return "CITY";
|
||||
case SearchModel::SEARCH_TYPE_VILLAGE: return "VILLAGE";
|
||||
case SearchModel::SEARCH_TYPE_STATE: return "STATE";
|
||||
case SearchModel::SEARCH_TYPE_COUNTRY: return "COUNTRY";
|
||||
case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "UNCLASSIFIED";
|
||||
case SearchModel::SEARCH_TYPE_COUNT: return "COUNT";
|
||||
case SearchModel::SEARCH_TYPE_BUILDING: return "Building";
|
||||
case SearchModel::SEARCH_TYPE_STREET: return "Street";
|
||||
case SearchModel::SEARCH_TYPE_CITY: return "City";
|
||||
case SearchModel::SEARCH_TYPE_VILLAGE: return "Village";
|
||||
case SearchModel::SEARCH_TYPE_STATE: return "State";
|
||||
case SearchModel::SEARCH_TYPE_COUNTRY: return "Country";
|
||||
case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "Unclassified";
|
||||
case SearchModel::SEARCH_TYPE_COUNT: return "Count";
|
||||
}
|
||||
ASSERT(false, ("Unknown search type:", static_cast<int>(type)));
|
||||
return string();
|
||||
|
|
|
@ -67,6 +67,63 @@ private:
|
|||
vector<size_t> m_indexes;
|
||||
};
|
||||
|
||||
class QuerySlice
|
||||
{
|
||||
public:
|
||||
using TString = SearchQueryParams::TString;
|
||||
|
||||
virtual ~QuerySlice() = default;
|
||||
|
||||
virtual TString const & Get(size_t i) const = 0;
|
||||
virtual size_t Size() const = 0;
|
||||
virtual bool IsPrefix(size_t i) const = 0;
|
||||
|
||||
bool Empty() const { return Size() == 0; }
|
||||
};
|
||||
|
||||
class QuerySliceOnTokens : public QuerySlice
|
||||
{
|
||||
public:
|
||||
QuerySliceOnTokens(TokenSlice const & slice) : m_slice(slice) {}
|
||||
|
||||
// QuerySlice overrides:
|
||||
SearchQueryParams::TString const & Get(size_t i) const override { return m_slice.Get(i).front(); }
|
||||
size_t Size() const override { return m_slice.Size(); }
|
||||
bool IsPrefix(size_t i) const override { return m_slice.IsPrefix(i); }
|
||||
|
||||
private:
|
||||
TokenSlice const m_slice;
|
||||
};
|
||||
|
||||
template <typename TCont>
|
||||
class QuerySliceOnRawStrings : public QuerySlice
|
||||
{
|
||||
public:
|
||||
QuerySliceOnRawStrings(TCont const & tokens, TString const & prefix)
|
||||
: m_tokens(tokens), m_prefix(prefix)
|
||||
{
|
||||
}
|
||||
|
||||
// QuerySlice overrides:
|
||||
SearchQueryParams::TString const & Get(size_t i) const override
|
||||
{
|
||||
ASSERT_LESS(i, Size(), ());
|
||||
return i == m_tokens.size() ? m_prefix : m_tokens[i];
|
||||
}
|
||||
|
||||
size_t Size() const override { return m_tokens.size() + (m_prefix.empty() ? 0 : 1); }
|
||||
|
||||
bool IsPrefix(size_t i) const override
|
||||
{
|
||||
ASSERT_LESS(i, Size(), ());
|
||||
return i == m_tokens.size();
|
||||
}
|
||||
|
||||
private:
|
||||
TCont const & m_tokens;
|
||||
TString const & m_prefix;
|
||||
};
|
||||
|
||||
string DebugPrint(TokenSlice const & slice);
|
||||
|
||||
string DebugPrint(TokenSliceNoCategories const & slice);
|
||||
|
|
Loading…
Add table
Reference in a new issue