diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index 336ed56367..65264ac34f 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -5,48 +5,27 @@ import numpy as np import pandas as pd import sys +FEATURES = ['MinDistance', 'Rank', 'SearchType', 'NameScore', 'NameCoverage'] + DISTANCE_WINDOW = 1e9 MAX_RANK = 256 RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3} NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match'] SEARCH_TYPES = ['POI', 'BUILDING', 'STREET', 'UNCLASSIFIED', 'VILLAGE', 'CITY', 'STATE', 'COUNTRY'] -def transform_distance(distance): - return exp(-distance / DISTANCE_WINDOW) - -def transform_rank(rank): - return rank / MAX_RANK - -def transform_relevance(score): - return RELEVANCES[score] - -def transform_name_score(score): - return NAME_SCORES.index(score) / len(NAME_SCORES) - -def transform_search_type(type): - return SEARCH_TYPES.index(type) / len(SEARCH_TYPES) - -# This function may use any fields of row to compute score except -# 'Relevance' and 'SampleId'. -# -# TODO (@y, @m): learn a linear model here or find good coeffs by -# brute-force. -def get_score(row): - x = row[['MinDistance', 'Rank', 'SearchType', 'NameScore']] - w = np.array([1, 1, 1, 1]) - return np.dot(x, w) def normalize_data(data): + transform_distance = lambda d: exp(-d / DISTANCE_WINDOW) + data['DistanceToViewport'] = data['DistanceToViewport'].apply(transform_distance) data['DistanceToPosition'] = data['DistanceToPosition'].apply(transform_distance) - data['Rank'] = data['Rank'].apply(transform_rank) - data['NameScore'] = data['NameScore'].apply(transform_name_score) - data['SearchType'] = data['SearchType'].apply(transform_search_type) - data['Relevance'] = data['Relevance'].apply(transform_relevance) - - # Adds some new columns to the data frame. 
+ data['Rank'] = data['Rank'].apply(lambda rank: rank / MAX_RANK) + data['NameScore'] = data['NameScore'].apply(lambda s: NAME_SCORES.index(s) / len(NAME_SCORES)) + data['SearchType'] = data['SearchType'].apply( + lambda t: SEARCH_TYPES.index(t) / len(SEARCH_TYPES)) + data['Relevance'] = data['Relevance'].apply(lambda r: RELEVANCES[r]) data['MinDistance'] = pd.Series(np.minimum(data['DistanceToViewport'], data['DistanceToPosition'])) - data['Score'] = pd.Series([get_score(data.ix[i]) for i in data.index]) + def compute_ndcg(scores): scores_summary = collections.defaultdict(int) @@ -58,7 +37,7 @@ def compute_ndcg(scores): dcg_norm, i = 0, 0 for score in sorted(scores_summary.keys(), reverse=True): - for j in range(scores_summary[score]): + for _ in range(scores_summary[score]): dcg_norm += score / log(2 + i, 2) i += 1 @@ -66,20 +45,139 @@ return 0 return dcg / dcg_norm -def main(): - data = pd.read_csv(sys.stdin) - normalize_data(data) + +def compute_ndcg_for_w(data, w): + data_scores = np.array([np.dot(data.ix[i][FEATURES], w) for i in data.index]) grouped = data.groupby(data['SampleId'], sort=False).groups ndcgs = [] for id in grouped: indices = grouped[id] - group = data.ix[indices] - sorted_group = group.sort_values('Score', ascending=False) - ndcgs.append(compute_ndcg(sorted_group['Relevance'])) - ndcgs = np.array(ndcgs) - print('NDCG mean: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs))) + relevances = np.array(data.ix[indices]['Relevance']) + scores = data_scores[indices] + + # Reorders relevances in accordance with decreasing scores. 
+ relevances = relevances[scores.argsort()[::-1]] + ndcgs.append(compute_ndcg(relevances)) + + return np.array(ndcgs) + + +def gradient_descent(w_init, grad, eps=1e-6, lam=1e-3, num_steps=1000): + n = len(w_init) + w, dw = np.copy(w_init), np.zeros(n) + for step in range(1, num_steps): + wn = w - eps / step * grad(w) + lam * dw + w, dw = wn, wn - w + if np.linalg.norm(dw) < eps: + break + return w + + +class NaiveLoss: + """ + Represents a gradient implementation for a naive loss function f, + such that: + + df / dx = (f(x + eps) - f(x)) / eps + """ + + def __init__(self, data, eps=1e-6): + self.data, self.eps = data, eps + + def value(self, w): + return compute_ndcg_for_w(self.data, w) + + def gradient(self, w): + n = len(w) + g = np.zeros(n) + + fw = self.value(w) + for i in range(n): + w[i] += self.eps + g[i] = (self.value(w) - fw) / self.eps + w[i] -= self.eps + return g + + +class RankingSVMLoss: + """ + Represents a loss function with a gradient for a RankingSVM model. + Simple version of a loss function for a ranked list of features + has the following form: + + loss(w) = sum{i, j: max(0, 1 - sign(y[j] - y[i]) * dot(w, x[j] - x[i]))} + lam * dot(w, w) + + This version is slightly modified, as we are dealing with a group of + ranked lists, so the loss function is actually a weighted sum of loss + values for each list, where each weight is a 1 / list size. 
+ """ + + def sign(self, x): + if x < 0: + return -1 + elif x > 0: + return 1 + return 0 + + + def __init__(self, data, lam=1e-3): + self.coeffs, self.lam = [], lam + + grouped = data.groupby(data['SampleId'], sort=False).groups + for id in grouped: + indices = grouped[id] + features = data.ix[indices][FEATURES] + relevances = np.array(data.ix[indices]['Relevance']) + n = len(indices) + for i in range(n): + for j in range(i + 1, n): + y = self.sign(relevances[j] - relevances[i]) / n + dx = y * (np.array(features.iloc[j]) - np.array(features.iloc[i])) + self.coeffs.append(dx) + + + def value(self, w): + result = self.lam * np.dot(w, w) + for coeff in self.coeffs: + v = 1 - np.dot(coeff, w) + if v > 0: + result += v + return result + + + def gradient(self, w): + result = 2 * self.lam * w + for coeff in self.coeffs: + if 1 - np.dot(coeff, w) > 0: + result = result - coeff + return result + + +def main(): + data = pd.read_csv(sys.stdin) + normalize_data(data) + + best_w = np.ones(len(FEATURES)) + best_mean = np.mean(compute_ndcg_for_w(data, best_w)) + + loss = RankingSVMLoss(data, lam=1e-3) + grad = lambda w: loss.gradient(w) + + num_steps = 1000 + for i in range(1, num_steps + 1): + if ((i * 100) % num_steps == 0): + print((i * 100) // num_steps, '%') + w_init = np.random.random(len(FEATURES)) + w = gradient_descent(w_init, grad, eps=0.01) + mean = np.mean(compute_ndcg_for_w(data, w)) + if mean > best_mean: + best_mean, best_w = mean, w + print(best_mean) + + ndcg = compute_ndcg_for_w(data, best_w) + print(np.mean(ndcg), np.std(ndcg), best_w) if __name__ == "__main__": main() diff --git a/search/search_query.cpp b/search/search_query.cpp index b11a22ee2e..046fda0c9a 100644 --- a/search/search_query.cpp +++ b/search/search_query.cpp @@ -661,9 +661,22 @@ class PreResult2Maker string name; if (!ft.GetName(lang, name)) continue; - auto score = GetNameScore(name, m_params, preInfo.m_startToken, preInfo.m_endToken); + vector tokens; + 
SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters()); + + auto score = GetNameScore(tokens, m_params, preInfo.m_startToken, preInfo.m_endToken); + auto coverage = + tokens.empty() ? 0 : static_cast(preInfo.m_endToken - preInfo.m_startToken) / + static_cast(tokens.size()); if (score > info.m_nameScore) + { info.m_nameScore = score; + info.m_nameCoverage = coverage; + } + else if (score == info.m_nameScore && coverage > info.m_nameCoverage) + { + info.m_nameCoverage = coverage; + } } if (info.m_searchType == v2::SearchModel::SEARCH_TYPE_BUILDING) diff --git a/search/v2/ranking_info.cpp b/search/v2/ranking_info.cpp index cd834c7c01..35a3fa1d42 100644 --- a/search/v2/ranking_info.cpp +++ b/search/v2/ranking_info.cpp @@ -11,6 +11,7 @@ void RankingInfo::PrintCSVHeader(ostream & os) << ",DistanceToPosition" << ",Rank" << ",NameScore" + << ",NameCoverage" << ",SearchType" << ",PositionInViewport"; } @@ -23,6 +24,7 @@ string DebugPrint(RankingInfo const & info) os << "m_distanceToPosition:" << info.m_distanceToPosition << ","; os << "m_rank:" << static_cast(info.m_rank) << ","; os << "m_nameScore:" << DebugPrint(info.m_nameScore) << ","; + os << "m_nameCoverage:" << info.m_nameCoverage << ","; os << "m_searchType:" << DebugPrint(info.m_searchType) << ","; os << "m_positionInViewport:" << info.m_positionInViewport; os << "]"; @@ -33,8 +35,8 @@ void RankingInfo::ToCSV(ostream & os) const { os << fixed; os << m_distanceToViewport << "," << m_distanceToPosition << "," << static_cast(m_rank) - << "," << DebugPrint(m_nameScore) << "," << DebugPrint(m_searchType) << "," - << m_positionInViewport; + << "," << DebugPrint(m_nameScore) << "," << m_nameCoverage << "," << DebugPrint(m_searchType) + << "," << m_positionInViewport; } } // namespace v2 } // namespace search diff --git a/search/v2/ranking_info.hpp b/search/v2/ranking_info.hpp index 714aa1ce1d..fe80c972f5 100644 --- a/search/v2/ranking_info.hpp +++ b/search/v2/ranking_info.hpp @@ 
-25,6 +25,9 @@ struct RankingInfo // Score for the feature's name. NameScore m_nameScore = NAME_SCORE_ZERO; + // Number of tokens from the query matched to a feature name. + double m_nameCoverage = 0; + // Search type for the feature. SearchModel::SearchType m_searchType = SearchModel::SEARCH_TYPE_COUNT; diff --git a/search/v2/ranking_utils.cpp b/search/v2/ranking_utils.cpp index cb0e741785..ab2fb9fafe 100644 --- a/search/v2/ranking_utils.cpp +++ b/search/v2/ranking_utils.cpp @@ -6,10 +6,8 @@ #include "indexer/search_string_utils.hpp" #include "base/stl_add.hpp" -#include "base/string_utils.hpp" #include "std/algorithm.hpp" -#include "std/vector.hpp" using namespace strings; @@ -43,6 +41,14 @@ NameScore GetNameScore(string const & name, SearchQueryParams const & params, si vector tokens; SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters()); + return GetNameScore(tokens, params, startToken, endToken); +} + +NameScore GetNameScore(vector const & tokens, SearchQueryParams const & params, + size_t startToken, size_t endToken) +{ + if (startToken >= endToken) + return NAME_SCORE_ZERO; size_t const n = tokens.size(); size_t const m = endToken - startToken; diff --git a/search/v2/ranking_utils.hpp b/search/v2/ranking_utils.hpp index 5b6ccf21cb..40514f92fa 100644 --- a/search/v2/ranking_utils.hpp +++ b/search/v2/ranking_utils.hpp @@ -3,9 +3,12 @@ #include "search/v2/geocoder.hpp" #include "search/v2/search_model.hpp" +#include "base/string_utils.hpp" + #include "std/cstdint.hpp" #include "std/limits.hpp" #include "std/string.hpp" +#include "std/vector.hpp" namespace search { @@ -27,6 +30,9 @@ enum NameScore NameScore GetNameScore(string const & name, SearchQueryParams const & params, size_t startToken, size_t endToken); +NameScore GetNameScore(vector const & tokens, SearchQueryParams const & params, + size_t startToken, size_t endToken); + string DebugPrint(NameScore score); } // namespace v2 } // namespace search