Merge pull request #3259 from ygorshenin/fix-ranking-model

[search] Fixed ranking model.
2016-05-20 19:09:31 +04:00 · 2016-05-20 19:09:31 +04:00 · 50187d1bd3
commit 50187d1bd3
parent 9653664e91 ddbedfde55
12 changed files with 480 additions and 172 deletions
--- a/search/search_integration_tests/search_query_v2_test.cpp
+++ b/search/search_integration_tests/search_query_v2_test.cpp
@ -30,16 +30,27 @@ namespace search
 {
 namespace
 {
-void MakeDefaultTestParams(string const & query, SearchParams & params)
-{
-  params.m_query = query;
-  params.m_inputLocale = "en";
-  params.SetMode(Mode::Everywhere);
-  params.SetSuggestsEnabled(false);
-}
-
 class SearchQueryV2Test : public SearchTest
 {
+public:
+  unique_ptr<TestSearchRequest> MakeRequest(string const & query)
+  {
+    SearchParams params;
+    params.m_query = query;
+    params.m_inputLocale = "en";
+    params.SetMode(Mode::Everywhere);
+    params.SetSuggestsEnabled(false);
+
+    auto request = make_unique<TestSearchRequest>(m_engine, params, m_viewport);
+    request->Wait();
+    return request;
+  }
+
+  bool MatchResults(vector<shared_ptr<MatchingRule>> rules,
+                    vector<search::Result> const & actual) const
+  {
+    return ::MatchResults(m_engine, rules, actual);
+  }
 };

 UNIT_CLASS_TEST(SearchQueryV2Test, Smoke)
@ -271,7 +282,7 @@ UNIT_CLASS_TEST(SearchQueryV2Test, DisableSuggests)
    request.Wait();
    TRules rules = {ExactMatch(worldId, london1), ExactMatch(worldId, london2)};

-    TEST(MatchResults(m_engine, rules, request.Results()), ());
+    TEST(MatchResults(rules, request.Results()), ());
  }
 }

@ -321,41 +332,33 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo)

  SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
  {
-    SearchParams params;
-    MakeDefaultTestParams("golden gate bridge ", params);
-
-    TestSearchRequest request(m_engine, params, m_viewport);
-    request.Wait();
+    auto request = MakeRequest("golden gate bridge ");

    TRules rules = {ExactMatch(wonderlandId, goldenGateBridge),
                    ExactMatch(wonderlandId, goldenGateStreet)};

-    TEST(MatchResults(m_engine, rules, request.Results()), ());
-    for (auto const & result : request.Results())
+    TEST(MatchResults(rules, request->Results()), ());
+    for (auto const & result : request->Results())
    {
      auto const & info = result.GetRankingInfo();
      TEST_EQUAL(NAME_SCORE_FULL_MATCH, info.m_nameScore, (result));
-      TEST(my::AlmostEqualAbs(1.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
+      TEST(!info.m_pureCats, (result));
+      TEST(!info.m_falseCats, (result));
    }
  }

  // This test is quite important and must always pass.
  {
-    SearchParams params;
-    MakeDefaultTestParams("cafe лермонтов", params);
-
-    TestSearchRequest request(m_engine, params, m_viewport);
-    request.Wait();
-
-    auto const & results = request.Results();
+    auto request = MakeRequest("cafe лермонтов");
+    auto const & results = request->Results();

    TRules rules{ExactMatch(wonderlandId, cafe1), ExactMatch(wonderlandId, cafe2),
                 ExactMatch(wonderlandId, lermontov)};
-    TEST(MatchResults(m_engine, rules, results), ());
+    TEST(MatchResults(rules, results), ());

    TEST_EQUAL(3, results.size(), ("Unexpected number of retrieved cafes."));
    auto const & top = results.front();
-    TEST(MatchResults(m_engine, {ExactMatch(wonderlandId, lermontov)}, {top}), ());
+    TEST(MatchResults({ExactMatch(wonderlandId, lermontov)}, {top}), ());
  }

  {
@ -471,6 +474,9 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
  TestPOI named(m2::PointD(0.0001, 0.0001), "ATM", "en");
  named.SetTypes({{"amenity", "atm"}});

+  TestPOI busStop(m2::PointD(0.00005, 0.0005), "ATM Bus Stop", "en");
+  busStop.SetTypes({{"highway", "bus_stop"}});
+
  BuildWorld([&](TestMwmBuilder & builder)
             {
               builder.Add(sanFrancisco);
@ -479,31 +485,51 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
                                   {
                                     builder.Add(named);
                                     builder.Add(noname);
+                                     builder.Add(busStop);
                                   });

  SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
-  TRules rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};
-
-  TEST(ResultsMatch("atm", rules), ());

  {
-    SearchParams params;
-    MakeDefaultTestParams("#atm", params);
+    TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named),
+                          ExactMatch(wonderlandId, busStop)};

-    TestSearchRequest request(m_engine, params, m_viewport);
-    request.Wait();
+    auto request = MakeRequest("atm");
+    TEST(MatchResults(rules, request->Results()), ());
+    for (auto const & result : request->Results())
+    {
+      Index::FeaturesLoaderGuard loader(m_engine, wonderlandId);
+      FeatureType ft;
+      loader.GetFeatureByIndex(result.GetFeatureID().m_index, ft);

-    TEST(MatchResults(m_engine, rules, request.Results()), ());
-    for (auto const & result : request.Results())
+      auto const & info = result.GetRankingInfo();
+
+      if (busStop.Matches(ft))
+      {
+        TEST(!info.m_pureCats, (result));
+        TEST(info.m_falseCats, (result));
+      }
+      else
+      {
+        TEST(info.m_pureCats, (result));
+        TEST(!info.m_falseCats, (result));
+      }
+    }
+  }
+
+  {
+    TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};
+
+    auto request = MakeRequest("#atm");
+
+    TEST(MatchResults(rules, request->Results()), ());
+    for (auto const & result : request->Results())
    {
      auto const & info = result.GetRankingInfo();

      // Token with a hashtag should not participate in name-score
      // calculations.
      TEST_EQUAL(NAME_SCORE_ZERO, info.m_nameScore, (result));
-
-      // TODO (@y): fix this. Name coverage calculations are flawed.
-      // TEST(my::AlmostEqualAbs(0.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
    }
  }

--- a/search/search_quality/download-maps.sh
+++ b/search/search_quality/download-maps.sh
@ -0,0 +1,79 @@
+#!/bin/bash
+
+# Downloads all maps necessary for learning to rank to the current
+# directory.
+
+ALL=
+VERSION=
+BASE="http://direct.mapswithme.com/direct"
+
+display_usage() {
+    echo "Usage: $0 -v [version] -a -h"
+    echo "    -v  version of maps to download"
+    echo "    -a  download all maps of the specified version"
+    echo "    -h  display this message"
+}
+
+while getopts ":av:h" opt
+do
+    case "$opt" in
+        a) ALL=1
+           ;;
+        v) VERSION="$OPTARG"
+           ;;
+        h) display_usage
+           exit -1
+           ;;
+        \?) echo "Invalid option: -$OPTARG" 1>&2
+            ;;
+        :) echo "Option -$OPTARG requires an argument" 1>&2
+           ;;
+    esac
+done
+
+if [ -z "$VERSION" ]
+then
+    echo "Version of maps is not specified." 1>&2
+    exit -1
+fi
+
+if ! curl "$BASE/" 2>/dev/null |
+        sed -n 's/^.*href="\(.*\)\/".*$/\1/p' |
+        grep -v "^../$" | grep -q "$VERSION"
+then
+    echo "Invalid version: $VERSION" 1>&2
+    exit -1
+fi
+
+NAMES=("Australia_Brisbane.mwm"
+       "Belarus_Minsk*.mwm"
+       "Germany_*.mwm"
+       "Russia_*.mwm"
+       "UK_England_*.mwm"
+       "US_California_*.mwm"
+       "US_Maryland_*.mwm")
+
+DIR="$BASE/$VERSION"
+
+if [ "$ALL" ]
+then
+    echo "Downloading all maps..."
+
+    files=$(curl "$DIR/" 2>/dev/null | sed -n 's/^.*href="\(.*\.mwm\)".*$/\1/p')
+
+    set -e
+    set -x
+    for file in $files
+    do
+        wget -np -nd "$DIR/$file"
+    done
+else
+    echo "Downloading maps..."
+
+    set -e
+    set -x
+    for name in ${NAMES[@]}
+    do
+        wget -r -np -nd -A "$name" "$DIR/"
+    done
+fi
--- a/search/search_quality/features_collector_tool/features_collector_tool.cpp
+++ b/search/search_quality/features_collector_tool/features_collector_tool.cpp
@ -141,15 +141,26 @@ void DisplayStats(ostream & os, vector<Sample> const & samples, vector<Stats> co
 {
  auto const n = samples.size();
  ASSERT_EQUAL(stats.size(), n, ());
+
+  size_t numWarnings = 0;
+  for (auto const & stat : stats)
+  {
+    if (!stat.m_notFound.empty())
+      ++numWarnings;
+  }
+
+  if (numWarnings == 0)
+  {
+    os << "All " << stats.size() << " queries are OK." << endl;
+    return;
+  }
+
+  os << numWarnings << " warnings." << endl;
  for (size_t i = 0; i < n; ++i)
  {
-    os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\"";
    if (stats[i].m_notFound.empty())
-    {
-      os << ": OK" << endl;
      continue;
-    }
-    os << ": WARNING" << endl;
+    os << "Query #" << i << " \"" << strings::ToUtf8(samples[i].m_query) << "\":" << endl;
    for (auto const & j : stats[i].m_notFound)
      os << "Not found: " << DebugPrint(samples[i].m_results[j]) << endl;
  }
--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@ -1,41 +1,67 @@
 #!/usr/bin/env python3

 from math import exp, log
+from scipy.stats import pearsonr
 from sklearn import cross_validation, grid_search, svm
 import argparse
 import collections
 import itertools
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import random
 import sys

-FEATURES = ['DistanceToPivot', 'Rank', 'NameScore', 'NameCoverage', 'SearchType']

-MAX_DISTANCE_METERS = 2e7
+MAX_DISTANCE_METERS = 2e6
 MAX_RANK = 255
 RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
 NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match']
-SEARCH_TYPES = {'POI': 0,
-                'BUILDING': 0,
-                'STREET': 1,
-                'UNCLASSIFIED': 2,
-                'VILLAGE': 3,
-                'CITY': 4,
-                'STATE': 5,
-                'COUNTRY': 6}
+SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
+
+FEATURES = ['DistanceToPivot', 'Rank'] + NAME_SCORES + SEARCH_TYPES
+
+
+def transform_name_score(value, categories_match):
+    if categories_match == 1:
+        return 'Zero'
+    elif value == 'Full Match Prefix':
+        return 'Full Match'
+    else:
+        return value


 def normalize_data(data):
-    transform_distance = lambda d: 1 - min(d, MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS
-
-    max_name_score = len(NAME_SCORES) - 1
-    max_search_type = SEARCH_TYPES['COUNTRY']
+    transform_distance = lambda v: min(v, MAX_DISTANCE_METERS) / MAX_DISTANCE_METERS

    data['DistanceToPivot'] = data['DistanceToPivot'].apply(transform_distance)
-    data['Rank'] = data['Rank'].apply(lambda rank: rank / MAX_RANK)
-    data['NameScore'] = data['NameScore'].apply(lambda s: NAME_SCORES.index(s) / max_name_score)
-    data['SearchType'] = data['SearchType'].apply(lambda t: SEARCH_TYPES[t] / max_search_type)
-    data['Relevance'] = data['Relevance'].apply(lambda r: RELEVANCES[r])
+    data['Rank'] = data['Rank'].apply(lambda v: v / MAX_RANK)
+    data['Relevance'] = data['Relevance'].apply(lambda v: RELEVANCES[v])
+
+    cats = data['PureCats'].combine(data['FalseCats'], max)
+
+    # Full prefix match is unified with a full match as these features
+    # are collinear. But we need both of them as they're also used in
+    # locality sorting.
+    #
+    # TODO (@y, @m): do forward/backward/subset selection of features
+    # instead of this merging.  It would be great to conduct PCA on
+    # the features too.
+    data['NameScore'] = data['NameScore'].combine(cats, transform_name_score)
+
+    # Adds dummy variables to data for NAME_SCORES.
+    for ns in NAME_SCORES:
+        data[ns] = data['NameScore'].apply(lambda v: int(ns == v))
+
+    # Adds dummy variables to data for SEARCH_TYPES.
+
+    # We unify BUILDING with POI here, as we don't have enough
+    # training data to distinguish between them.  Remove the following
+    # line as soon as the model is changed or we have enough
+    # training data.
+    data['SearchType'] = data['SearchType'].apply(lambda v: v if v != 'Building' else 'POI')
+    for st in SEARCH_TYPES:
+        data[st] = data['SearchType'].apply(lambda v: int(st == v))


 def compute_ndcg(relevances):
@ -44,25 +70,12 @@ def compute_ndcg(relevances):
    array of scores.
    """

-    relevances_summary = collections.defaultdict(int)
-
-    dcg = 0
-    for i, relevance in enumerate(relevances):
-        dcg += relevance / log(2 + i, 2)
-        relevances_summary[relevance] += 1
-
-    dcg_norm, i = 0, 0
-    for relevance in sorted(relevances_summary.keys(), reverse=True):
-        for _ in range(relevances_summary[relevance]):
-            dcg_norm += relevance / log(2 + i, 2)
-            i += 1
-
-    if dcg_norm == 0:
-        return 0
-    return dcg / dcg_norm
+    dcg = sum(r / log(2 + i, 2) for i, r in enumerate(relevances))
+    dcg_norm = sum(r / log(2 + i, 2) for i, r in enumerate(sorted(relevances, reverse=True)))
+    return dcg / dcg_norm if dcg_norm != 0 else 0


-def compute_ndcg_without_w(data):
+def compute_ndcgs_without_ws(data):
    """
    Computes NDCG (Normalized Discounted Cumulative Gain) for a given
    data. Returns an array of ndcg scores in the shape [num groups of
@ -77,17 +90,17 @@ def compute_ndcg_without_w(data):
        relevances = np.array(data.ix[indices]['Relevance'])
        ndcgs.append(compute_ndcg(relevances))

-    return np.array(ndcgs)
+    return ndcgs


-def compute_ndcg_for_w(data, w):
+def compute_ndcgs_for_ws(data, ws):
    """
    Computes NDCG (Normalized Discounted Cumulative Gain) for a given
    data and an array of coeffs in a linear model. Returns an array of
    ndcg scores in the shape [num groups of features].
    """

-    data_scores = np.array([np.dot(data.ix[i][FEATURES], w) for i in data.index])
+    data_scores = np.array([np.dot(data.ix[i][FEATURES], ws) for i in data.index])
    grouped = data.groupby(data['SampleId'], sort=False).groups

    ndcgs = []
@ -101,7 +114,7 @@ def compute_ndcg_for_w(data, w):
        relevances = relevances[scores.argsort()[::-1]]
        ndcgs.append(compute_ndcg(relevances))

-    return np.array(ndcgs)
+    return ndcgs


 def transform_data(data):
@ -150,36 +163,144 @@ def transform_data(data):
    return xs, ys


+def plot_diagrams(xs, ys, features):
+    """
+    For each feature, plots histograms of x * sign(y), where x is a
+    slice on the feature of a list of pairwise differences between
+    input feature-vectors and y is a list of pairwise differences
+    between relevances of the input feature-vectors.  Strong bias
+    toward positive or negative values in histograms indicates that
+    the current feature is important for ranking, as there is a
+    correlation between the difference in feature values and the
+    difference in relevancy.
+    """
+    for i, f in enumerate(features):
+        x = [x[i] * np.sign(y) for x, y in zip(xs, ys)]
+
+        l, r = min(x), max(x)
+        d = max(abs(l), abs(r))
+
+        plt.subplot(4, 4, i + 1)
+        plt.hist(x, bins=8, range=(-d, d))
+        plt.title(f)
+    plt.show()
+
+
+def show_pearson_statistics(xs, ys, features):
+    """
+    Shows info about Pearson coefficient between features and
+    relevancy.
+    """
+
+    print('***** Correlation table *****')
+    print('H0 - feature not is correlated with relevancy')
+    print('H1 - feature is correlated with relevancy')
+    print()
+
+    cs, ncs = [], []
+    for i, f in enumerate(features):
+        zs = [x[i] for x in xs]
+        (c, p) = pearsonr(zs, ys)
+
+        correlated = p < 0.05
+        print('{}: pearson={:.3f}, P(H1)={}'.format(f, c, 1 - p))
+        if correlated:
+            cs.append(f)
+        else:
+            ncs.append(f)
+
+    print()
+    print('Correlated:', cs)
+    print('Non-correlated:', ncs)
+
+
+def raw_output(features, ws):
+    """
+    Prints feature-coeff pairs to the standard output.
+    """
+
+    for f, w in zip(features, ws):
+        print('{}: {}'.format(f, w))
+
+
+def print_const(name, value):
+    print('double const k{} = {:.7f};'.format(name, value))
+
+
+def print_array(name, size, values):
+    print('double const {}[{}] = {{'.format(name, size))
+    print(',\n'.join('  {:.7f} /* {} */'.format(w, f) for (f, w) in values))
+    print('};')
+
+def cpp_output(features, ws):
+    """
+    Prints feature-coeff pairs in the C++-compatible format.
+    """
+
+    ns, st = [], []
+
+    for f, w in zip(features, ws):
+        if f in NAME_SCORES:
+            ns.append((f, w))
+        elif f in SEARCH_TYPES:
+            st.append((f, w))
+        else:
+            print_const(f, w)
+    print_array('kNameScore', 'NameScore::NAME_SCORE_COUNT', ns)
+    print_array('kSearchType', 'SearchModel::SEARCH_TYPE_COUNT', st)
+
+
 def main(args):
    data = pd.read_csv(sys.stdin)
    normalize_data(data)

-    ndcg = compute_ndcg_without_w(data);
-    print('Current NDCG: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg)))
+    ndcgs = compute_ndcgs_without_ws(data);
+    print('Current NDCG: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
    print()

-    x, y = transform_data(data)
+    xs, ys = transform_data(data)
+
+    if args.plot:
+        plot_diagrams(xs, ys, FEATURES)

    clf = svm.LinearSVC(random_state=args.seed)
-    cv = cross_validation.KFold(len(y), n_folds=5, shuffle=True, random_state=args.seed)
+    cv = cross_validation.KFold(len(ys), n_folds=5, shuffle=True, random_state=args.seed)

    # "C" stands for the regularizer constant.
    grid = {'C': np.power(10.0, np.arange(-5, 6))}
    gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
-    gs.fit(x, y)
+    gs.fit(xs, ys)

-    w = gs.best_estimator_.coef_[0]
-    ndcg = compute_ndcg_for_w(data, w)
+    ws = gs.best_estimator_.coef_[0]
+    max_w = max(abs(w) for w in ws)
+    ws = np.divide(ws, max_w)
+
+    # The following code restores coeffs for merged features.
+    ws[FEATURES.index('Building')] = ws[FEATURES.index('POI')]
+    ws[FEATURES.index('Full Match Prefix')] = ws[FEATURES.index('Full Match')]
+
+    ndcgs = compute_ndcgs_for_ws(data, ws)
+
+    print('NDCG mean: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
+    print('Accuracy: {}'.format(gs.best_score_))
+
+    if args.pearson:
+        print()
+        show_pearson_statistics(xs, ys, FEATURES)

-    print('NDCG mean: {}, std: {}'.format(np.mean(ndcg), np.std(ndcg)))
    print()
-    print('Linear model weights:')
-    for f, c in zip(FEATURES, w):
-        print('{}: {}'.format(f, c))
+    print('***** Linear model weights *****')
+    if args.cpp:
+        cpp_output(FEATURES, ws)
+    else:
+        raw_output(FEATURES, ws)


 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', help='random seed', type=int)
+    parser.add_argument('--plot', help='plot diagrams', action='store_true')
+    parser.add_argument('--pearson', help='show pearson statistics', action='store_true')
+    parser.add_argument('--cpp', help='generate output in the C++ format', action='store_true')
    args = parser.parse_args()
    main(args)
--- a/search/search_query.cpp
+++ b/search/search_query.cpp
@ -12,7 +12,6 @@
 #include "search/v2/pre_ranking_info.hpp"
 #include "search/v2/ranking_info.hpp"
 #include "search/v2/ranking_utils.hpp"
-#include "search/v2/token_slice.hpp"

 #include "storage/country_info_getter.hpp"
 #include "storage/index.hpp"
@ -191,20 +190,11 @@ void UpdateNameScore(string const & name, TSlice const & slice, v2::NameScore &

 template <typename TSlice>
 void UpdateNameScore(vector<strings::UniString> const & tokens, TSlice const & slice,
-                     v2::NameScore & bestScore, double & bestCoverage)
+                     v2::NameScore & bestScore)
 {
  auto const score = v2::GetNameScore(tokens, slice);
-  auto const coverage =
-      tokens.empty() ? 0 : static_cast<double>(slice.Size()) / static_cast<double>(tokens.size());
  if (score > bestScore)
-  {
    bestScore = score;
-    bestCoverage = coverage;
-  }
-  else if (score == bestScore && coverage > bestCoverage)
-  {
-    bestCoverage = coverage;
-  }
 }

 inline bool IsHashtagged(strings::UniString const & s) { return !s.empty() && s[0] == '#'; }
@ -422,29 +412,18 @@ int Query::GetCategoryLocales(int8_t (&arr) [3]) const
 }

 template <class ToDo>
-void Query::ForEachCategoryTypes(ToDo toDo) const
+void Query::ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const
 {
  int8_t arrLocales[3];
  int const localesCount = GetCategoryLocales(arrLocales);
-  size_t const tokensCount = m_tokens.size();

-  for (size_t i = 0; i < tokensCount; ++i)
+  for (size_t i = 0; i < slice.Size(); ++i)
  {
-    auto token = RemoveHashtag(m_tokens[i]);
-
+    auto token = RemoveHashtag(slice.Get(i));
    for (int j = 0; j < localesCount; ++j)
      m_categories.ForEachTypeByName(arrLocales[j], token, bind<void>(ref(toDo), i, _1));
    ProcessEmojiIfNeeded(token, i, toDo);
  }
-
-  if (!m_prefix.empty())
-  {
-    auto prefix = RemoveHashtag(m_prefix);
-
-    for (int j = 0; j < localesCount; ++j)
-      m_categories.ForEachTypeByName(arrLocales[j], prefix, bind<void>(ref(toDo), tokensCount, _1));
-    ProcessEmojiIfNeeded(prefix, tokensCount, toDo);
-  }
 }

 template <class ToDo>
@ -522,10 +501,11 @@ void Query::SetQuery(string const & query)

  // get preferred types to show in results
  m_prefferedTypes.clear();
-  ForEachCategoryTypes([&] (size_t, uint32_t t)
-  {
-    m_prefferedTypes.insert(t);
-  });
+  ForEachCategoryTypes(v2::QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix),
+                       [&](size_t, uint32_t t)
+                       {
+                         m_prefferedTypes.insert(t);
+                       });
 }

 void Query::FlushViewportResults(v2::Geocoder::Params const & params, Results & res,
@ -660,7 +640,6 @@ class PreResult2Maker
    info.m_distanceToPivot = MercatorBounds::DistanceOnEarth(center, pivot);
    info.m_rank = preInfo.m_rank;
    info.m_searchType = preInfo.m_searchType;
-
    info.m_nameScore = v2::NAME_SCORE_ZERO;

    v2::TokenSlice slice(m_params, preInfo.m_startToken, preInfo.m_endToken);
@ -675,12 +654,30 @@ class PreResult2Maker
      vector<strings::UniString> tokens;
      SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());

-      UpdateNameScore(tokens, slice, info.m_nameScore, info.m_nameCoverage);
-      UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore, info.m_nameCoverage);
+      UpdateNameScore(tokens, slice, info.m_nameScore);
+      UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore);
    }

    if (info.m_searchType == v2::SearchModel::SEARCH_TYPE_BUILDING)
      UpdateNameScore(ft.GetHouseNumber(), sliceNoCategories, info.m_nameScore);
+
+    feature::TypesHolder holder(ft);
+    vector<pair<size_t, size_t>> matched(slice.Size());
+    m_query.ForEachCategoryTypes(v2::QuerySliceOnTokens(slice), [&](size_t i, uint32_t t)
+    {
+      ++matched[i].second;
+      if (holder.Has(t))
+        ++matched[i].first;
+    });
+
+    info.m_pureCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
+                             {
+                               return m.first != 0;
+                             });
+    info.m_falseCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
+                              {
+                                return m.first == 0 && m.second != 0;
+                              });
  }

  uint8_t NormalizeRank(uint8_t rank, v2::SearchModel::SearchType type, m2::PointD const & center,
@ -1259,7 +1256,8 @@ void Query::InitParams(bool localitySearch, SearchQueryParams & params)
        }
      }
    };
-    ForEachCategoryTypes(addSyms);
+    ForEachCategoryTypes(v2::QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix),
+                         addSyms);
  }

  for (auto & tokens : params.m_tokens)
--- a/search/search_query.hpp
+++ b/search/search_query.hpp
@ -7,6 +7,7 @@
 #include "search/suggest.hpp"
 #include "search/v2/geocoder.hpp"
 #include "search/v2/rank_table_cache.hpp"
+#include "search/v2/token_slice.hpp"

 #include "indexer/ftypes_matcher.hpp"
 #include "indexer/index.hpp"
@ -145,7 +146,8 @@ protected:
  void ClearResults();

  int GetCategoryLocales(int8_t (&arr) [3]) const;
-  template <class ToDo> void ForEachCategoryTypes(ToDo toDo) const;
+  template <class ToDo>
+  void ForEachCategoryTypes(v2::QuerySlice const & slice, ToDo toDo) const;
  template <class ToDo> void ProcessEmojiIfNeeded(
      strings::UniString const & token, size_t ind, ToDo & toDo) const;

--- a/search/search_tests/ranking_tests.cpp
+++ b/search/search_tests/ranking_tests.cpp
@ -44,5 +44,6 @@ UNIT_TEST(NameTest_Smoke)
  TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", 2, 3), NAME_SCORE_FULL_MATCH, ());
  TEST_EQUAL(GetScore("San Francisco", "Fran", 0, 1), NAME_SCORE_SUBSTRING_PREFIX, ());
  TEST_EQUAL(GetScore("San Francisco", "Fran ", 0, 1), NAME_SCORE_ZERO, ());
+  TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", 0, 1), NAME_SCORE_FULL_MATCH_PREFIX, ());
 }
 }  // namespace
--- a/search/v2/geocoder.cpp
+++ b/search/v2/geocoder.cpp
@ -73,12 +73,6 @@ size_t constexpr kLocalityRectsCacheSize = 10;

 strings::UniString const kUniSpace(strings::MakeUniString(" "));

-template <typename T>
-struct Id
-{
-  T const & operator()(T const & t) const { return t; }
-};
-
 struct ScopedMarkTokens
 {
  ScopedMarkTokens(vector<bool> & usedTokens, size_t from, size_t to)
@ -1563,12 +1557,12 @@ SearchModel::SearchType Geocoder::GetSearchTypeInGeocoding(uint32_t featureId)

 bool Geocoder::AllTokensUsed() const
 {
-  return all_of(m_usedTokens.begin(), m_usedTokens.end(), Id<bool>());
+  return all_of(m_usedTokens.begin(), m_usedTokens.end(), IdFunctor());
 }

 bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const
 {
-  return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, Id<bool>());
+  return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, IdFunctor());
 }

 size_t Geocoder::NumUnusedTokensGroups() const
--- a/search/v2/ranking_info.cpp
+++ b/search/v2/ranking_info.cpp
@ -12,20 +12,34 @@ namespace
 {
 // See search/search_quality/scoring_model.py for details.  In short,
 // these coeffs correspond to coeffs in a linear model.
-double const kDistanceToPivot = 0.19933969103335503;
-double const kRank = 3.528698483480807;
-double const kNameScore = 1.0050524496846687;
-double const kNameCoverage = 0.33989660511789926;
-double const kSearchType = 1.1949307125113533;
+double const kDistanceToPivot = -1.0000000;
+double const kRank = 0.5430747;
+double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
+  -0.3686323 /* Zero */,
+  0.0977193 /* Substring Prefix */,
+  0.1340500 /* Substring */,
+  0.1368631 /* Full Match Prefix */,
+  0.1368631 /* Full Match */
+};
+double const kSearchType[SearchModel::SEARCH_TYPE_COUNT] = {
+  -0.9195533 /* POI */,
+  -0.9195533 /* Building */,
+  -0.1470504 /* Street */,
+  -0.6392620 /* Unclassified */,
+  -0.0900970 /* Village */,
+  0.4383605 /* City */,
+  0.6296097 /* State */,
+  0.7279924 /* Country */
+};

 double TransformDistance(double distance)
 {
-  return 1.0 - min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters;
+  return min(distance, RankingInfo::kMaxDistMeters) / RankingInfo::kMaxDistMeters;
 }
 }  // namespace

 // static
-double const RankingInfo::kMaxDistMeters = 2e7;
+double const RankingInfo::kMaxDistMeters = 2e6;

 // static
 void RankingInfo::PrintCSVHeader(ostream & os)
@ -33,8 +47,9 @@ void RankingInfo::PrintCSVHeader(ostream & os)
  os << "DistanceToPivot"
     << ",Rank"
     << ",NameScore"
-     << ",NameCoverage"
-     << ",SearchType";
+     << ",SearchType"
+     << ",PureCats"
+     << ",FalseCats";
 }

 string DebugPrint(RankingInfo const & info)
@ -44,8 +59,9 @@ string DebugPrint(RankingInfo const & info)
  os << "m_distanceToPivot:" << info.m_distanceToPivot << ",";
  os << "m_rank:" << static_cast<int>(info.m_rank) << ",";
  os << "m_nameScore:" << DebugPrint(info.m_nameScore) << ",";
-  os << "m_nameCoverage:" << info.m_nameCoverage << ",";
-  os << "m_searchType:" << DebugPrint(info.m_searchType);
+  os << "m_searchType:" << DebugPrint(info.m_searchType) << ",";
+  os << "m_pureCats:" << info.m_pureCats << ",";
+  os << "m_falseCats:" << info.m_falseCats;
  os << "]";
  return os.str();
 }
@ -54,7 +70,7 @@ void RankingInfo::ToCSV(ostream & os) const
 {
  os << fixed;
  os << m_distanceToPivot << "," << static_cast<int>(m_rank) << "," << DebugPrint(m_nameScore)
-     << "," << m_nameCoverage << "," << DebugPrint(m_searchType);
+     << "," << DebugPrint(m_searchType) << "," << m_pureCats << "," << m_falseCats;
 }

 double RankingInfo::GetLinearModelRank() const
@ -65,24 +81,21 @@ double RankingInfo::GetLinearModelRank() const
  // integrated in the build system.
  double const distanceToPivot = TransformDistance(m_distanceToPivot);
  double const rank = static_cast<double>(m_rank) / numeric_limits<uint8_t>::max();
-  double const nameScore = static_cast<double>(m_nameScore) / NAME_SCORE_FULL_MATCH;
-  double const nameCoverage = m_nameCoverage;

-  double searchType;
-  switch (m_searchType)
+  auto nameScore = m_nameScore;
+  if (m_pureCats || m_falseCats)
  {
-    case SearchModel::SEARCH_TYPE_POI:
-    case SearchModel::SEARCH_TYPE_BUILDING:
-      searchType = 0;
-      break;
-    default:
-      searchType = m_searchType - 1;
-      break;
+    // If the feature was matched only by categorial tokens, it's
+    // better for ranking to set the name score to zero.  For example,
+    // when we're looking for a "cafe", cafes "Cafe Pushkin" and
+    // "Lermontov" both match the request, but must be ranked
+    // according to their distances to the user position or viewport,
+    // even though "Cafe Pushkin" has a non-zero name score.
+    nameScore = NAME_SCORE_ZERO;
  }
-  searchType = searchType / (SearchModel::SEARCH_TYPE_COUNTRY - 1);

-  return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore * nameScore +
-         kNameCoverage * nameCoverage + kSearchType * searchType;
+  return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] +
+         kSearchType[m_searchType];
 }
 }  // namespace v2
 }  // namespace search
--- a/search/v2/ranking_info.hpp
+++ b/search/v2/ranking_info.hpp
@ -24,12 +24,18 @@ struct RankingInfo
  // Score for the feature's name.
  NameScore m_nameScore = NAME_SCORE_ZERO;

-  // Fraction of tokens from the query matched to a feature name.
-  double m_nameCoverage = 0;
-
  // Search type for the feature.
  SearchModel::SearchType m_searchType = SearchModel::SEARCH_TYPE_COUNT;

+  // True if all of the tokens that the feature was matched by
+  // correspond to this feature's categories.
+  bool m_pureCats = false;
+
+  // True if none of the tokens that the feature was matched by
+  // corresponds to this feature's categories although all of the
+  // tokens are categorial ones.
+  bool m_falseCats = false;
+
  static void PrintCSVHeader(ostream & os);

  void ToCSV(ostream & os) const;
--- a/search/v2/search_model.cpp
+++ b/search/v2/search_model.cpp
@ -130,14 +130,14 @@ string DebugPrint(SearchModel::SearchType type)
  switch (type)
  {
  case SearchModel::SEARCH_TYPE_POI: return "POI";
-  case SearchModel::SEARCH_TYPE_BUILDING: return "BUILDING";
-  case SearchModel::SEARCH_TYPE_STREET: return "STREET";
-  case SearchModel::SEARCH_TYPE_CITY: return "CITY";
-  case SearchModel::SEARCH_TYPE_VILLAGE: return "VILLAGE";
-  case SearchModel::SEARCH_TYPE_STATE: return "STATE";
-  case SearchModel::SEARCH_TYPE_COUNTRY: return "COUNTRY";
-  case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "UNCLASSIFIED";
-  case SearchModel::SEARCH_TYPE_COUNT: return "COUNT";
+  case SearchModel::SEARCH_TYPE_BUILDING: return "Building";
+  case SearchModel::SEARCH_TYPE_STREET: return "Street";
+  case SearchModel::SEARCH_TYPE_CITY: return "City";
+  case SearchModel::SEARCH_TYPE_VILLAGE: return "Village";
+  case SearchModel::SEARCH_TYPE_STATE: return "State";
+  case SearchModel::SEARCH_TYPE_COUNTRY: return "Country";
+  case SearchModel::SEARCH_TYPE_UNCLASSIFIED: return "Unclassified";
+  case SearchModel::SEARCH_TYPE_COUNT: return "Count";
  }
  ASSERT(false, ("Unknown search type:", static_cast<int>(type)));
  return string();
--- a/search/v2/token_slice.hpp
+++ b/search/v2/token_slice.hpp
@ -67,6 +67,63 @@ private:
  vector<size_t> m_indexes;
 };

+class QuerySlice
+{
+public:
+  using TString = SearchQueryParams::TString;
+
+  virtual ~QuerySlice() = default;
+
+  virtual TString const & Get(size_t i) const = 0;
+  virtual size_t Size() const = 0;
+  virtual bool IsPrefix(size_t i) const = 0;
+
+  bool Empty() const { return Size() == 0; }
+};
+
+class QuerySliceOnTokens : public QuerySlice
+{
+public:
+  QuerySliceOnTokens(TokenSlice const & slice) : m_slice(slice) {}
+
+  // QuerySlice overrides:
+  SearchQueryParams::TString const & Get(size_t i) const override { return m_slice.Get(i).front(); }
+  size_t Size() const override { return m_slice.Size(); }
+  bool IsPrefix(size_t i) const override { return m_slice.IsPrefix(i); }
+
+private:
+  TokenSlice const m_slice;
+};
+
+template <typename TCont>
+class QuerySliceOnRawStrings : public QuerySlice
+{
+public:
+  QuerySliceOnRawStrings(TCont const & tokens, TString const & prefix)
+    : m_tokens(tokens), m_prefix(prefix)
+  {
+  }
+
+  // QuerySlice overrides:
+  SearchQueryParams::TString const & Get(size_t i) const override
+  {
+    ASSERT_LESS(i, Size(), ());
+    return i == m_tokens.size() ? m_prefix : m_tokens[i];
+  }
+
+  size_t Size() const override { return m_tokens.size() + (m_prefix.empty() ? 0 : 1); }
+
+  bool IsPrefix(size_t i) const override
+  {
+    ASSERT_LESS(i, Size(), ());
+    return i == m_tokens.size();
+  }
+
+ private:
+  TCont const & m_tokens;
+  TString const & m_prefix;
+};
+
 string DebugPrint(TokenSlice const & slice);

 string DebugPrint(TokenSliceNoCategories const & slice);