[search] Fixed search quality scripts.

Yuri Gorshenin 2016-03-18 17:18:47 +03:00 committed by Sergey Yershov
parent bea4069067
commit 4212a0e4a1
3 changed files with 43 additions and 13 deletions

View file

@@ -41,10 +41,17 @@ exec /usr/local/bin/sbcl --noinform --quit --load $0 --end-toplevel-options "$@"
    (maxx :initarg :maxx)
    (maxy :initarg :maxy)))
 
+(defun position-x-y (x y)
+  (assert (and (>= x *minx*) (<= x *maxx*)))
+  (assert (and (>= y *miny*) (<= y *maxy*)))
+  (make-instance 'pos :x x :y y))
+
 (defun position-lat-lon (lat lon)
-  (make-instance 'pos :x (lon-to-x lon) :y (lat-to-y lat)))
+  (position-x-y (lon-to-x lon) (lat-to-y lat)))
 
 (defun viewport (&key minx miny maxx maxy)
   (assert (<= minx maxx))
   (assert (<= miny maxy))
   (make-instance 'viewport :minx minx :maxx maxx :miny miny :maxy maxy))
 
 (defclass result ()
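
The Lisp change funnels position-lat-lon through the new position-x-y, so every position is validated against the *minx*/*miny*/*maxx*/*maxy* bounds exactly once at construction time. A minimal Python sketch of the same guard-in-one-constructor pattern; the limit values and names here are illustrative assumptions, not taken from the script:

    # Hypothetical mirror of the Lisp refactor above: route every
    # construction through one bounds-checked helper so bad input fails fast.
    MINX, MINY, MAXX, MAXY = -180.0, -90.0, 180.0, 90.0  # assumed limits

    def position_x_y(x, y):
        assert MINX <= x <= MAXX
        assert MINY <= y <= MAXY
        return (x, y)

    def position_lat_lon(lat, lon):
        # Projection (lon-to-x / lat-to-y) omitted; identity for the sketch.
        return position_x_y(lon, lat)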

View file

@@ -38,23 +38,23 @@ def normalize_data(data):
     data['Relevance'] = data['Relevance'].apply(lambda r: RELEVANCES[r])
 
-def compute_ndcg(scores):
+def compute_ndcg(relevances):
     """
     Computes NDCG (Normalized Discounted Cumulative Gain) for a given
     array of scores.
     """
-    scores_summary = collections.defaultdict(int)
+    relevances_summary = collections.defaultdict(int)
 
     dcg = 0
-    for i, score in enumerate(scores):
-        dcg += score / log(2 + i, 2)
-        scores_summary[score] += 1
+    for i, relevance in enumerate(relevances):
+        dcg += relevance / log(2 + i, 2)
+        relevances_summary[relevance] += 1
 
     dcg_norm, i = 0, 0
-    for score in sorted(scores_summary.keys(), reverse=True):
-        for _ in range(scores_summary[score]):
-            dcg_norm += score / log(2 + i, 2)
+    for relevance in sorted(relevances_summary.keys(), reverse=True):
+        for _ in range(relevances_summary[relevance]):
+            dcg_norm += relevance / log(2 + i, 2)
             i += 1
 
     if dcg_norm == 0:
@@ -62,6 +62,24 @@ def compute_ndcg(scores):
     return dcg / dcg_norm
 
+
+def compute_ndcg_without_w(data):
+    """
+    Computes NDCG (Normalized Discounted Cumulative Gain) for a given
+    data. Returns an array of ndcg scores in the shape [num groups of
+    features].
+    """
+    grouped = data.groupby(data['SampleId'], sort=False).groups
+
+    ndcgs = []
+    for id in grouped:
+        indices = grouped[id]
+        relevances = np.array(data.ix[indices]['Relevance'])
+        ndcgs.append(compute_ndcg(relevances))
+
+    return np.array(ndcgs)
+
+
 def compute_ndcg_for_w(data, w):
     """
     Computes NDCG (Normalized Discounted Cumulative Gain) for a given
@@ -120,6 +138,11 @@ def transform_data(data):
 def main(args):
     data = pd.read_csv(sys.stdin)
     normalize_data(data)
+
+    ndcg = compute_ndcg_without_w(data)
+    print('Current NDCG: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg)))
+    print()
+
     x, y = transform_data(data)
 
     clf = svm.LinearSVC(random_state=args.seed)
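
The new compute_ndcg_without_w gives a baseline: NDCG per SampleId group, computed before any weights are learned, which main() now prints ahead of the LinearSVC fit. As a sanity check of the arithmetic, here is a standalone re-derivation of the same formula (DCG with a 1/log2(i + 2) positional discount, normalized by the DCG of the ideal descending order); this is a sketch, not part of scoring_model.py:

    from math import log2

    def ndcg(relevances):
        # DCG: relevance discounted by position, 0-indexed like the script.
        dcg = sum(r / log2(i + 2) for i, r in enumerate(relevances))
        # Ideal DCG: the same relevances in the best (descending) order.
        ideal = sum(r / log2(i + 2)
                    for i, r in enumerate(sorted(relevances, reverse=True)))
        return dcg / ideal if ideal else 0

    print(ndcg([1, 3, 2]))  # ~0.8175; a perfectly ordered list yields 1.0

Since main() reads with pd.read_csv(sys.stdin), the script is presumably fed the samples CSV on standard input.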

View file

@@ -11,11 +11,11 @@ namespace
 {
 // See search/search_quality/scoring_model.py for details. In short,
 // these coeffs correspond to coeffs in a linear model.
-double const kDistanceToPivot = 13.531;
-double const kRank = 16.295;
-double const kNameScore = 1.0;
+double const kDistanceToPivot = 24.443;
+double const kRank = 11.010;
+double const kNameScore = 1.0;
 double const kNameCoverage = 0.0;
-double const kSearchType = 10.692;
+double const kSearchType = 22.378;
 
 double TransformDistance(double distance)
 {
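
Per the comment above, these constants are the weights of the linear model trained in scoring_model.py (the LinearSVC fit), pasted into C++ so ranking can evaluate the model without Python. Conceptually, a result's score is a dot product of the weights with its transformed feature values; a hedged sketch of that step, with the feature preprocessing (such as TransformDistance) elided:

    # Illustrative only: combine the coefficients above with one result's
    # already-transformed feature values into a single linear score.
    COEFFS = {
        'DistanceToPivot': 24.443,
        'Rank': 11.010,
        'NameScore': 1.0,
        'NameCoverage': 0.0,
        'SearchType': 22.378,
    }

    def linear_score(features):
        # features: feature name -> transformed value for one candidate.
        return sum(COEFFS[name] * value for name, value in features.items())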