forked from organicmaps/organicmaps
[search] Implemented gradient descent for a linear model estimation.
This commit is contained in:
parent
b167513276
commit
72dbdb6a18
6 changed files with 173 additions and 45 deletions
|
@ -5,48 +5,27 @@ import numpy as np
|
|||
import pandas as pd
|
||||
import sys
|
||||
|
||||
# Feature columns used by the ranking model.  Order matters: it must match
# the weight vector handed to the scoring / gradient-descent code.
FEATURES = ['MinDistance', 'Rank', 'SearchType', 'NameScore', 'NameCoverage']

# Scale of the exponential distance squashing (units presumably meters -- TODO confirm).
DISTANCE_WINDOW = 1e9
# Maximum raw rank value; used to normalize Rank into [0, 1).
MAX_RANK = 256
# Numeric gain assigned to each human relevance label (used for NDCG).
RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
# Name-match quality labels ordered worst to best; list index is the numeric score.
NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match']
# Search-type labels; list index is the numeric encoding.
SEARCH_TYPES = ['POI', 'BUILDING', 'STREET', 'UNCLASSIFIED', 'VILLAGE', 'CITY', 'STATE', 'COUNTRY']
|
||||
|
||||
def transform_distance(distance):
    """Squash a raw distance into (0, 1]: 1 at zero, decaying exponentially."""
    scaled = -distance / DISTANCE_WINDOW
    return exp(scaled)
|
||||
|
||||
def transform_rank(rank):
    """Normalize a raw rank into [0, 1) using the maximum possible rank."""
    normalized = rank / MAX_RANK
    return normalized
|
||||
|
||||
def transform_relevance(score):
    """Map a relevance label ('Irrelevant'/'Relevant'/'Vital') to its numeric gain."""
    gain = RELEVANCES[score]
    return gain
|
||||
|
||||
def transform_name_score(score):
    """Encode a name-score label as its position in NAME_SCORES, scaled to [0, 1)."""
    # NOTE(review): under Python 2 this is integer division and collapses every
    # non-final label to 0 -- confirm the intended interpreter version.
    position = NAME_SCORES.index(score)
    return position / len(NAME_SCORES)
|
||||
|
||||
def transform_search_type(type):
    """Encode a search-type label as its position in SEARCH_TYPES, scaled to [0, 1)."""
    # NOTE(review): the parameter shadows the builtin `type`; kept to preserve
    # the public signature for keyword callers.
    position = SEARCH_TYPES.index(type)
    return position / len(SEARCH_TYPES)
|
||||
|
||||
# This function may use any fields of row to compute score except
# 'Relevance' and 'SampleId'.
#
# TODO (@y, @m): learn a linear model here or find good coeffs by
# brute-force.
def get_score(row):
    """Return the dot product of the row's feature values with unit weights.

    Uses the shared FEATURES list so the score stays consistent with the
    rest of the pipeline; the original hard-coded a 4-feature subset and
    silently dropped 'NameCoverage'.
    """
    x = row[FEATURES]
    w = np.ones(len(FEATURES))
    return np.dot(x, w)
|
||||
|
||||
def normalize_data(data):
    """Normalize the feature columns of `data` in place and add derived columns.

    - Distance columns are squashed into (0, 1] exponentially.
    - Rank / NameScore / SearchType are scaled into [0, 1).
    - Relevance labels are replaced by their numeric gains.
    - Adds 'MinDistance' (min of the two distances) and 'Score' columns.
    """
    # The original block applied every transform TWICE (once through the
    # module-level helpers, once through equivalent inline lambdas -- a
    # diff-merge leftover) and shadowed transform_distance with a local
    # lambda.  Each column must be transformed exactly once.
    data['DistanceToViewport'] = data['DistanceToViewport'].apply(transform_distance)
    data['DistanceToPosition'] = data['DistanceToPosition'].apply(transform_distance)
    data['Rank'] = data['Rank'].apply(transform_rank)
    data['NameScore'] = data['NameScore'].apply(transform_name_score)
    data['SearchType'] = data['SearchType'].apply(transform_search_type)
    data['Relevance'] = data['Relevance'].apply(transform_relevance)

    # Derived columns.
    data['MinDistance'] = pd.Series(np.minimum(data['DistanceToViewport'], data['DistanceToPosition']))
    # .loc replaces the long-deprecated .ix label indexer.
    data['Score'] = pd.Series([get_score(data.loc[i]) for i in data.index])
|
||||
|
||||
|
||||
def compute_ndcg(scores):
|
||||
scores_summary = collections.defaultdict(int)
|
||||
|
@ -58,7 +37,7 @@ def compute_ndcg(scores):
|
|||
|
||||
dcg_norm, i = 0, 0
|
||||
for score in sorted(scores_summary.keys(), reverse=True):
|
||||
for j in range(scores_summary[score]):
|
||||
for _ in range(scores_summary[score]):
|
||||
dcg_norm += score / log(2 + i, 2)
|
||||
i += 1
|
||||
|
||||
|
@ -66,20 +45,139 @@ def compute_ndcg(scores):
|
|||
return 0
|
||||
return dcg / dcg_norm
|
||||
|
||||
def compute_ndcg_for_w(data, w):
    """Return an array of per-sample NDCG values for weight vector `w`.

    Every row is scored as dot(features, w); then, within each 'SampleId'
    group, NDCG is computed over the relevance labels reordered by
    decreasing score.

    NOTE(review): this span also contained a stale pre-refactoring copy of
    main() interleaved by the diff; it is shadowed by the later main()
    definition, so it has been dropped here.
    """
    data_scores = np.array([np.dot(data.loc[i][FEATURES], w) for i in data.index])
    grouped = data.groupby(data['SampleId'], sort=False).groups

    ndcgs = []
    for sample_id in grouped:
        indices = grouped[sample_id]
        relevances = np.array(data.loc[indices]['Relevance'])
        # assumes data has a default 0..n-1 RangeIndex so labels double as
        # positions into data_scores -- TODO confirm (true after read_csv).
        scores = data_scores[indices]

        # Reorders relevances in accordance with decreasing scores.
        relevances = relevances[scores.argsort()[::-1]]
        ndcgs.append(compute_ndcg(relevances))

    return np.array(ndcgs)
|
||||
|
||||
|
||||
def gradient_descent(w_init, grad, eps=1e-6, lam=1e-3, num_steps=1000):
    """Minimize a function by gradient descent with heavy-ball momentum.

    Args:
        w_init: initial weight vector (not modified; a copy is made).
        grad: callable returning the gradient at a point.
        eps: base learning rate (decayed as eps / step); also reused as the
             convergence threshold on the update norm.
        lam: momentum coefficient applied to the previous update.
        num_steps: maximum number of descent steps.

    Returns:
        The final weight vector.
    """
    n = len(w_init)
    w, dw = np.copy(w_init), np.zeros(n)
    # The original iterated range(1, num_steps), performing only
    # num_steps - 1 steps; run the full num_steps.
    for step in range(1, num_steps + 1):
        wn = w - eps / step * grad(w) + lam * dw
        # dw is measured against the pre-update w (tuple RHS evaluates first).
        w, dw = wn, wn - w
        # Stop once the update is negligible.
        if np.linalg.norm(dw) < eps:
            break
    return w
|
||||
|
||||
|
||||
class NaiveLoss:
    """Finite-difference gradient wrapper around the NDCG objective.

    For an objective f it approximates the forward difference:

        df / dx = (f(x + eps) - f(x)) / eps

    NOTE(review): value() returns the per-sample NDCG *array* from
    compute_ndcg_for_w, yet gradient() assigns its difference into a scalar
    slot -- this only works when the array has length 1; confirm usage.
    """

    def __init__(self, data, eps=1e-6):
        self.data, self.eps = data, eps

    def value(self, w):
        """Objective value at w."""
        return compute_ndcg_for_w(self.data, w)

    def gradient(self, w):
        """Forward-difference gradient of value() at w; w is restored afterwards."""
        dim = len(w)
        result = np.zeros(dim)

        base = self.value(w)
        for i in range(dim):
            # Perturb coordinate i in place, measure, then undo the perturbation.
            w[i] += self.eps
            result[i] = (self.value(w) - base) / self.eps
            w[i] -= self.eps
        return result
|
||||
|
||||
|
||||
class RankingSVMLoss:
    """Loss function, with gradient, for a RankingSVM model.

    The simple version of the loss for one ranked list of features is:

    loss(w) = sum{i, j: max(0, 1 - sign(y[j] - y[i]) * dot(w, x[j] - x[i]))} + lam * dot(w, w)

    This version is slightly modified, as we are dealing with a group of
    ranked lists: the loss is a weighted sum of the per-list loss values,
    each weighted by 1 / list size.
    """

    def sign(self, x):
        """Return -1, 0 or +1 according to the sign of x."""
        if x < 0:
            return -1
        if x > 0:
            return 1
        return 0

    def __init__(self, data, lam=1e-3):
        self.coeffs, self.lam = [], lam

        groups = data.groupby(data['SampleId'], sort=False).groups
        for sample_id in groups:
            rows = groups[sample_id]
            features = data.ix[rows][FEATURES]
            relevances = np.array(data.ix[rows]['Relevance'])
            size = len(rows)
            # Precompute one hinge-term coefficient vector per ordered pair (i, j).
            for i in range(size):
                for j in range(i + 1, size):
                    weight = self.sign(relevances[j] - relevances[i]) / size
                    delta = weight * (np.array(features.iloc[j]) - np.array(features.iloc[i]))
                    self.coeffs.append(delta)

    def value(self, w):
        """Loss at w: the L2 regularizer plus all active hinge terms."""
        total = self.lam * np.dot(w, w)
        for c in self.coeffs:
            margin = 1 - np.dot(c, w)
            if margin > 0:
                total += margin
        return total

    def gradient(self, w):
        """(Sub)gradient of value() at w."""
        g = 2 * self.lam * w
        for c in self.coeffs:
            if 1 - np.dot(c, w) > 0:
                g = g - c
        return g
|
||||
|
||||
|
||||
def main():
    """Read samples from stdin, fit a RankingSVM model by repeated
    randomly-initialized gradient descent, and report the resulting NDCG."""
    data = pd.read_csv(sys.stdin)
    normalize_data(data)

    # Baseline: all-ones weight vector.
    best_w = np.ones(len(FEATURES))
    best_mean = np.mean(compute_ndcg_for_w(data, best_w))

    loss = RankingSVMLoss(data, lam=1e-3)
    grad = loss.gradient

    num_steps = 1000
    for step in range(1, num_steps + 1):
        # Progress indicator at every whole percent.
        if (step * 100) % num_steps == 0:
            print((step * 100) // num_steps, '%')
        w_init = np.random.random(len(FEATURES))
        w = gradient_descent(w_init, grad, eps=0.01)
        mean = np.mean(compute_ndcg_for_w(data, w))
        if mean > best_mean:
            best_mean, best_w = mean, w
            print(best_mean)

    ndcg = compute_ndcg_for_w(data, best_w)
    print(np.mean(ndcg), np.std(ndcg), best_w)


if __name__ == "__main__":
    main()
|
||||
|
|
|
@ -661,9 +661,22 @@ class PreResult2Maker
|
|||
string name;
|
||||
if (!ft.GetName(lang, name))
|
||||
continue;
|
||||
auto score = GetNameScore(name, m_params, preInfo.m_startToken, preInfo.m_endToken);
|
||||
vector<strings::UniString> tokens;
|
||||
SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
|
||||
|
||||
auto score = GetNameScore(tokens, m_params, preInfo.m_startToken, preInfo.m_endToken);
|
||||
auto coverage =
|
||||
tokens.empty() ? 0 : static_cast<double>(preInfo.m_endToken - preInfo.m_startToken) /
|
||||
static_cast<double>(tokens.size());
|
||||
if (score > info.m_nameScore)
|
||||
{
|
||||
info.m_nameScore = score;
|
||||
info.m_nameCoverage = coverage;
|
||||
}
|
||||
else if (score == info.m_nameScore && coverage > info.m_nameCoverage)
|
||||
{
|
||||
info.m_nameCoverage = coverage;
|
||||
}
|
||||
}
|
||||
|
||||
if (info.m_searchType == v2::SearchModel::SEARCH_TYPE_BUILDING)
|
||||
|
|
|
@ -11,6 +11,7 @@ void RankingInfo::PrintCSVHeader(ostream & os)
|
|||
<< ",DistanceToPosition"
|
||||
<< ",Rank"
|
||||
<< ",NameScore"
|
||||
<< ",NameCoverage"
|
||||
<< ",SearchType"
|
||||
<< ",PositionInViewport";
|
||||
}
|
||||
|
@ -23,6 +24,7 @@ string DebugPrint(RankingInfo const & info)
|
|||
os << "m_distanceToPosition:" << info.m_distanceToPosition << ",";
|
||||
os << "m_rank:" << static_cast<int>(info.m_rank) << ",";
|
||||
os << "m_nameScore:" << DebugPrint(info.m_nameScore) << ",";
|
||||
os << "m_nameCoverage:" << info.m_nameCoverage << ",";
|
||||
os << "m_searchType:" << DebugPrint(info.m_searchType) << ",";
|
||||
os << "m_positionInViewport:" << info.m_positionInViewport;
|
||||
os << "]";
|
||||
|
@ -33,8 +35,8 @@ void RankingInfo::ToCSV(ostream & os) const
|
|||
{
|
||||
os << fixed;
|
||||
os << m_distanceToViewport << "," << m_distanceToPosition << "," << static_cast<int>(m_rank)
|
||||
<< "," << DebugPrint(m_nameScore) << "," << DebugPrint(m_searchType) << ","
|
||||
<< m_positionInViewport;
|
||||
<< "," << DebugPrint(m_nameScore) << "," << m_nameCoverage << "," << DebugPrint(m_searchType)
|
||||
<< "," << m_positionInViewport;
|
||||
}
|
||||
} // namespace v2
|
||||
} // namespace search
|
||||
|
|
|
@ -25,6 +25,9 @@ struct RankingInfo
|
|||
// Score for the feature's name.
|
||||
NameScore m_nameScore = NAME_SCORE_ZERO;
|
||||
|
||||
// Query-token coverage of the feature name: (endToken - startToken) divided by the name's token count.
|
||||
double m_nameCoverage = 0;
|
||||
|
||||
// Search type for the feature.
|
||||
SearchModel::SearchType m_searchType = SearchModel::SEARCH_TYPE_COUNT;
|
||||
|
||||
|
|
|
@ -6,10 +6,8 @@
|
|||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "base/stl_add.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
using namespace strings;
|
||||
|
||||
|
@ -43,6 +41,14 @@ NameScore GetNameScore(string const & name, SearchQueryParams const & params, si
|
|||
|
||||
vector<UniString> tokens;
|
||||
SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
|
||||
return GetNameScore(tokens, params, startToken, endToken);
|
||||
}
|
||||
|
||||
NameScore GetNameScore(vector<UniString> const & tokens, SearchQueryParams const & params,
|
||||
size_t startToken, size_t endToken)
|
||||
{
|
||||
if (startToken >= endToken)
|
||||
return NAME_SCORE_ZERO;
|
||||
|
||||
size_t const n = tokens.size();
|
||||
size_t const m = endToken - startToken;
|
||||
|
|
|
@ -3,9 +3,12 @@
|
|||
#include "search/v2/geocoder.hpp"
|
||||
#include "search/v2/search_model.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/cstdint.hpp"
|
||||
#include "std/limits.hpp"
|
||||
#include "std/string.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
namespace search
|
||||
{
|
||||
|
@ -27,6 +30,9 @@ enum NameScore
|
|||
NameScore GetNameScore(string const & name, SearchQueryParams const & params, size_t startToken,
|
||||
size_t endToken);
|
||||
|
||||
NameScore GetNameScore(vector<strings::UniString> const & tokens, SearchQueryParams const & params,
|
||||
size_t startToken, size_t endToken);
|
||||
|
||||
string DebugPrint(NameScore score);
|
||||
} // namespace v2
|
||||
} // namespace search
|
||||
|
|
Loading…
Add table
Reference in a new issue