From 4212a0e4a154d6d7c06ad6ebcac551d1a439870d Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Fri, 18 Mar 2016 17:18:47 +0300 Subject: [PATCH] [search] Fixed search quality scripts. --- search/search_quality/gen-samples.lisp | 9 +++++- search/search_quality/scoring_model.py | 39 ++++++++++++++++++++------ search/v2/ranking_info.cpp | 8 +++--- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/search/search_quality/gen-samples.lisp b/search/search_quality/gen-samples.lisp index 77098df03f..49ab1a6bce 100755 --- a/search/search_quality/gen-samples.lisp +++ b/search/search_quality/gen-samples.lisp @@ -41,10 +41,17 @@ exec /usr/local/bin/sbcl --noinform --quit --load $0 --end-toplevel-options "$@" (maxx :initarg :maxx) (maxy :initarg :maxy))) +(defun position-x-y (x y) + (assert (and (>= x *minx*) (<= x *maxx*))) + (assert (and (>= y *miny*) (<= y *maxy*))) + (make-instance 'pos :x x :y y)) + (defun position-lat-lon (lat lon) - (make-instance 'pos :x (lon-to-x lon) :y (lat-to-y lat))) + (position-x-y (lon-to-x lon) (lat-to-y lat))) (defun viewport (&key minx miny maxx maxy) + (assert (<= minx maxx)) + (assert (<= miny maxy)) (make-instance 'viewport :minx minx :maxx maxx :miny miny :maxy maxy)) (defclass result () diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index 2decef07eb..f59222235b 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -38,23 +38,23 @@ def normalize_data(data): data['Relevance'] = data['Relevance'].apply(lambda r: RELEVANCES[r]) -def compute_ndcg(scores): +def compute_ndcg(relevances): """ Computes NDCG (Normalized Discounted Cumulative Gain) for a given array of scores. 
""" - scores_summary = collections.defaultdict(int) + relevances_summary = collections.defaultdict(int) dcg = 0 - for i, score in enumerate(scores): - dcg += score / log(2 + i, 2) - scores_summary[score] += 1 + for i, relevance in enumerate(relevances): + dcg += relevance / log(2 + i, 2) + relevances_summary[relevance] += 1 dcg_norm, i = 0, 0 - for score in sorted(scores_summary.keys(), reverse=True): - for _ in range(scores_summary[score]): - dcg_norm += score / log(2 + i, 2) + for relevance in sorted(relevances_summary.keys(), reverse=True): + for _ in range(relevances_summary[relevance]): + dcg_norm += relevance / log(2 + i, 2) i += 1 if dcg_norm == 0: @@ -62,6 +62,24 @@ def compute_ndcg(scores): return dcg / dcg_norm +def compute_ndcg_without_w(data): + """ + Computes NDCG (Normalized Discounted Cumulative Gain) for a given + data. Returns an array of ndcg scores in the shape [num groups of + features]. + """ + + grouped = data.groupby(data['SampleId'], sort=False).groups + + ndcgs = [] + for id in grouped: + indices = grouped[id] + relevances = np.array(data.ix[indices]['Relevance']) + ndcgs.append(compute_ndcg(relevances)) + + return np.array(ndcgs) + + def compute_ndcg_for_w(data, w): """ Computes NDCG (Normalized Discounted Cumulative Gain) for a given @@ -120,6 +138,11 @@ def transform_data(data): def main(args): data = pd.read_csv(sys.stdin) normalize_data(data) + + ndcg = compute_ndcg_without_w(data); + print('Current NDCG: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg))) + print() + x, y = transform_data(data) clf = svm.LinearSVC(random_state=args.seed) diff --git a/search/v2/ranking_info.cpp b/search/v2/ranking_info.cpp index c5a50520cc..28e2dd7086 100644 --- a/search/v2/ranking_info.cpp +++ b/search/v2/ranking_info.cpp @@ -11,11 +11,11 @@ namespace { // See search/search_quality/scoring_model.py for details. In short, // these coeffs correspond to coeffs in a linear model. 
-double const kDistanceToPivot = 13.531;
-double const kRank = 16.295;
-double const kNameScore = 1.0;
+double const kDistanceToPivot = 24.443;
+double const kRank = 11.010;
+double const kNameScore = 1.0;
 double const kNameCoverage = 0.0;
-double const kSearchType = 10.692;
+double const kSearchType = 22.378;
 
 double TransformDistance(double distance) {