diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index f59222235b..7b22a9bfda 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -110,23 +110,31 @@ def transform_data(data): linear SVM. """ - grouped = data.groupby(data['SampleId'], sort=False).groups + grouped = data.groupby(data['SampleId'], sort=False) xs, ys = [], [] - for id in grouped: - indices = grouped[id] - features = data.ix[indices][FEATURES] - relevances = np.array(data.ix[indices]['Relevance']) - n, total = len(indices), 0 + # k is used to create a balanced samples set for better linear + # separation. + k = 1 + for _, group in grouped: + features, relevances = group[FEATURES], group['Relevance'] + + n, total = len(group), 0 for _, (i, j) in enumerate(itertools.combinations(range(n), 2)): - y = np.sign(relevances[j] - relevances[i]) + y = np.sign(relevances.iloc[j] - relevances.iloc[i]) if y == 0: continue - x = (np.array(features.iloc[j]) - np.array(features.iloc[i])) + + x = np.array(features.iloc[j]) - np.array(features.iloc[i]) + if y != k: + x = np.negative(x) + y = -y + xs.append(x) ys.append(y) - total = total + 1 + total += 1 + k = -k # Scales this group of features to equalize different search # queries.