From b5af3115c412c6d7e5579ee85a28b28a97919229 Mon Sep 17 00:00:00 2001
From: Yuri Gorshenin <y@maps.me>
Date: Thu, 24 Mar 2016 15:55:08 +0300
Subject: [PATCH 1/2] [search][quality] Implemented set balancing for samples.

---
 search/search_quality/scoring_model.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py
index f59222235b..274f704f26 100755
--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@@ -113,6 +113,10 @@ def transform_data(data):
     grouped = data.groupby(data['SampleId'], sort=False).groups
 
     xs, ys = [], []
+
+    # k is used to create a balanced samples set for better linear
+    # separation.
+    k = 1
     for id in grouped:
         indices = grouped[id]
         features = data.ix[indices][FEATURES]
@@ -123,10 +127,16 @@ def transform_data(data):
             y = np.sign(relevances[j] - relevances[i])
             if y == 0:
                 continue
-            x = (np.array(features.iloc[j]) - np.array(features.iloc[i]))
+
+            x = np.array(features.iloc[j]) - np.array(features.iloc[i])
+            if y != k:
+                x = np.negative(x)
+                y = -y
+
             xs.append(x)
             ys.append(y)
-            total = total + 1
+            total += 1
+            k = -k
 
         # Scales this group of features to equalize different search
         # queries.

From 451f82242f81e56cda1e621e071509b7bda4007f Mon Sep 17 00:00:00 2001
From: Yuri Gorshenin <y@maps.me>
Date: Thu, 24 Mar 2016 17:37:16 +0300
Subject: [PATCH 2/2] Review fixes.

---
 search/search_quality/scoring_model.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py
index 274f704f26..7b22a9bfda 100755
--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@@ -110,21 +110,19 @@ def transform_data(data):
     linear SVM.
     """
 
-    grouped = data.groupby(data['SampleId'], sort=False).groups
+    grouped = data.groupby(data['SampleId'], sort=False)
 
     xs, ys = [], []
 
     # k is used to create a balanced samples set for better linear
     # separation.
     k = 1
-    for id in grouped:
-        indices = grouped[id]
-        features = data.ix[indices][FEATURES]
-        relevances = np.array(data.ix[indices]['Relevance'])
+    for _, group in grouped:
+        features, relevances = group[FEATURES], group['Relevance']
 
-        n, total = len(indices), 0
+        n, total = len(group), 0
         for _, (i, j) in enumerate(itertools.combinations(range(n), 2)):
-            y = np.sign(relevances[j] - relevances[i])
+            y = np.sign(relevances.iloc[j] - relevances.iloc[i])
             if y == 0:
                 continue