From b5af3115c412c6d7e5579ee85a28b28a97919229 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Thu, 24 Mar 2016 15:55:08 +0300 Subject: [PATCH 1/2] [search][quality] Implemented set balancing for samples. --- search/search_quality/scoring_model.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index f59222235b..274f704f26 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -113,6 +113,10 @@ def transform_data(data): grouped = data.groupby(data['SampleId'], sort=False).groups xs, ys = [], [] + + # k is used to create a balanced samples set for better linear + # separation. + k = 1 for id in grouped: indices = grouped[id] features = data.ix[indices][FEATURES] @@ -123,10 +127,16 @@ def transform_data(data): y = np.sign(relevances[j] - relevances[i]) if y == 0: continue - x = (np.array(features.iloc[j]) - np.array(features.iloc[i])) + + x = np.array(features.iloc[j]) - np.array(features.iloc[i]) + if y != k: + x = np.negative(x) + y = -y + xs.append(x) ys.append(y) - total = total + 1 + total += 1 + k = -k # Scales this group of features to equalize different search # queries. From 451f82242f81e56cda1e621e071509b7bda4007f Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Thu, 24 Mar 2016 17:37:16 +0300 Subject: [PATCH 2/2] Review fixes. --- search/search_quality/scoring_model.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index 274f704f26..7b22a9bfda 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -110,21 +110,19 @@ def transform_data(data): linear SVM. """ - grouped = data.groupby(data['SampleId'], sort=False).groups + grouped = data.groupby(data['SampleId'], sort=False) xs, ys = [], [] # k is used to create a balanced samples set for better linear # separation. k = 1 - for id in grouped: - indices = grouped[id] - features = data.ix[indices][FEATURES] - relevances = np.array(data.ix[indices]['Relevance']) + for _, group in grouped: + features, relevances = group[FEATURES], group['Relevance'] - n, total = len(indices), 0 + n, total = len(group), 0 for _, (i, j) in enumerate(itertools.combinations(range(n), 2)): - y = np.sign(relevances[j] - relevances[i]) + y = np.sign(relevances.iloc[j] - relevances.iloc[i]) if y == 0: continue