Review fixes.

2016-05-19 14:26:44 +03:00 · 2016-05-19 14:26:44 +03:00 · ddbedfde55
commit ddbedfde55
parent 04e5c0eb18
9 changed files with 134 additions and 84 deletions
--- a/base/stl_helpers.hpp
+++ b/base/stl_helpers.hpp
@@ -73,10 +73,4 @@ impl::Comparer<false, T, C> CompareBy(T (C::*p)() const)
 {
  return impl::Comparer<false, T, C>(p);
 }
-
-template <typename T>
-struct Id
-{
-  T const & operator()(T const & t) const { return t; }
-};
 }  // namespace my
--- a/search/search_integration_tests/search_query_v2_test.cpp
+++ b/search/search_integration_tests/search_query_v2_test.cpp
@@ -33,7 +33,7 @@ namespace
 class SearchQueryV2Test : public SearchTest
 {
 public:
-  unique_ptr<TestSearchRequest> DoRequest(string const & query)
+  unique_ptr<TestSearchRequest> MakeRequest(string const & query)
  {
    SearchParams params;
    params.m_query = query;
@@ -332,7 +332,7 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo)

  SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
  {
-    auto request = DoRequest("golden gate bridge ");
+    auto request = MakeRequest("golden gate bridge ");

    TRules rules = {ExactMatch(wonderlandId, goldenGateBridge),
                    ExactMatch(wonderlandId, goldenGateStreet)};
@@ -342,15 +342,14 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo)
    {
      auto const & info = result.GetRankingInfo();
      TEST_EQUAL(NAME_SCORE_FULL_MATCH, info.m_nameScore, (result));
-      TEST(!info.m_matchByTrueCats, (result));
-      TEST(!info.m_matchByFalseCats, (result));
-      TEST(my::AlmostEqualAbs(1.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
+      TEST(!info.m_pureCats, (result));
+      TEST(!info.m_falseCats, (result));
    }
  }

  // This test is quite important and must always pass.
  {
-    auto request = DoRequest("cafe лермонтов");
+    auto request = MakeRequest("cafe лермонтов");
    auto const & results = request->Results();

    TRules rules{ExactMatch(wonderlandId, cafe1), ExactMatch(wonderlandId, cafe2),
@@ -475,6 +474,9 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
  TestPOI named(m2::PointD(0.0001, 0.0001), "ATM", "en");
  named.SetTypes({{"amenity", "atm"}});

+  TestPOI busStop(m2::PointD(0.00005, 0.0005), "ATM Bus Stop", "en");
+  busStop.SetTypes({{"highway", "bus_stop"}});
+
  BuildWorld([&](TestMwmBuilder & builder)
             {
               builder.Add(sanFrancisco);
@@ -483,24 +485,42 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
                                   {
                                     builder.Add(named);
                                     builder.Add(noname);
+                                     builder.Add(busStop);
                                   });

  SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
-  TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};

  {
-    auto request = DoRequest("atm");
+    TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named),
+                          ExactMatch(wonderlandId, busStop)};
+
+    auto request = MakeRequest("atm");
    TEST(MatchResults(rules, request->Results()), ());
    for (auto const & result : request->Results())
    {
+      Index::FeaturesLoaderGuard loader(m_engine, wonderlandId);
+      FeatureType ft;
+      loader.GetFeatureByIndex(result.GetFeatureID().m_index, ft);
+
      auto const & info = result.GetRankingInfo();
-      TEST(info.m_matchByTrueCats, (result));
-      TEST(!info.m_matchByFalseCats, (result));
+
+      if (busStop.Matches(ft))
+      {
+        TEST(!info.m_pureCats, (result));
+        TEST(info.m_falseCats, (result));
+      }
+      else
+      {
+        TEST(info.m_pureCats, (result));
+        TEST(!info.m_falseCats, (result));
+      }
    }
  }

  {
-    auto request = DoRequest("#atm");
+    TRules const rules = {ExactMatch(wonderlandId, noname), ExactMatch(wonderlandId, named)};
+
+    auto request = MakeRequest("#atm");

    TEST(MatchResults(rules, request->Results()), ());
    for (auto const & result : request->Results())
@@ -510,9 +530,6 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestCategories)
      // Token with a hashtag should not participate in name-score
      // calculations.
      TEST_EQUAL(NAME_SCORE_ZERO, info.m_nameScore, (result));
-
-      // TODO (@y): fix this. Name coverage calculations are flawed.
-      // TEST(my::AlmostEqualAbs(0.0, info.m_nameCoverage, 1e-6), (info.m_nameCoverage));
    }
  }

--- a/search/search_quality/download-maps.sh
+++ b/search/search_quality/download-maps.sh
@@ -3,25 +3,77 @@
 # Downloads all maps necessary for learning to rank to the current
 # directory.

-case $# in
-    1) VERSION="$1"
-       ;;
-    *) echo "Usage: $0 version" 2>&1
-       exit -1
-       ;;
-esac
+ALL=
+VERSION=
+BASE="http://direct.mapswithme.com/direct"
+
+display_usage() {
+    echo "Usage: $0 -v [version] -a -h"
+    echo "    -v  version of maps to download"
+    echo "    -a  download all maps of the specified version"
+    echo "    -h  display this message"
+}
+
+while getopts ":av:h" opt
+do
+    case "$opt" in
+        a) ALL=1
+           ;;
+        v) VERSION="$OPTARG"
+           ;;
+        h) display_usage
+           exit -1
+           ;;
+        \?) echo "Invalid option: -$OPTARG" 1>&2
+            ;;
+        :) echo "Option -$OPTARG requires an argument" 1>&2
+           ;;
+    esac
+done
+
+if [ -z "$VERSION" ]
+then
+    echo "Version of maps is not specified." 1>&2
+    exit -1
+fi
+
+if ! curl "$BASE/" 2>/dev/null |
+        sed -n 's/^.*href="\(.*\)\/".*$/\1/p' |
+        grep -v "^../$" | grep -q "$VERSION"
+then
+    echo "Invalid version: $VERSION" 1>&2
+    exit -1
+fi

-BASE="http://direct.mapswithme.com/direct/$VERSION/"
 NAMES=("Australia_Brisbane.mwm"
       "Belarus_Minsk*.mwm"
       "Germany_*.mwm"
       "Russia_*.mwm"
       "UK_England_*.mwm"
-       "US_California_*.mwm" "US_Maryland_*.mwm")
+       "US_California_*.mwm"
+       "US_Maryland_*.mwm")

-set -e
-set -x
-for name in ${NAMES[@]}
-do
-    wget -r -np -nd -A "$name" "$BASE"
-done
+DIR="$BASE/$VERSION"
+
+if [ "$ALL" ]
+then
+    echo "Downloading all maps..."
+
+    files=$(curl "$DIR/" 2>/dev/null | sed -n 's/^.*href="\(.*\.mwm\)".*$/\1/p')
+
+    set -e
+    set -x
+    for file in $files
+    do
+        wget -np -nd "$DIR/$file"
+    done
+else
+    echo "Downloading maps..."
+
+    set -e
+    set -x
+    for name in ${NAMES[@]}
+    do
+        wget -r -np -nd -A "$name" "$DIR/"
+    done
+fi
--- a/search/search_quality/features_collector_tool/features_collector_tool.cpp
+++ b/search/search_quality/features_collector_tool/features_collector_tool.cpp
@@ -143,14 +143,15 @@ void DisplayStats(ostream & os, vector<Sample> const & samples, vector<Stats> co
  ASSERT_EQUAL(stats.size(), n, ());

  size_t numWarnings = 0;
-  for (auto const & stat : stats) {
+  for (auto const & stat : stats)
+  {
    if (!stat.m_notFound.empty())
      ++numWarnings;
  }

  if (numWarnings == 0)
  {
-    os << "All " << stats.size() << " queries OK." << endl;
+    os << "All " << stats.size() << " queries are OK." << endl;
    return;
  }

--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@@ -38,7 +38,7 @@ def normalize_data(data):
    data['Rank'] = data['Rank'].apply(lambda v: v / MAX_RANK)
    data['Relevance'] = data['Relevance'].apply(lambda v: RELEVANCES[v])

-    cats = data['MatchByTrueCats'].combine(data['MatchByFalseCats'], max)
+    cats = data['PureCats'].combine(data['FalseCats'], max)

    # Full prefix match is unified with a full match as these features
    # are collinear. But we need both of them as they're also used in
@@ -49,8 +49,6 @@ def normalize_data(data):
    # the features too.
    data['NameScore'] = data['NameScore'].combine(cats, transform_name_score)

-    data['NameCoverage'] = data['NameCoverage'].combine(cats, lambda v, c: v if c == 0 else 0.0)
-
    # Adds dummy variables to data for NAME_SCORES.
    for ns in NAME_SCORES:
        data[ns] = data['NameScore'].apply(lambda v: int(ns == v))
--- a/search/search_query.cpp
+++ b/search/search_query.cpp
@@ -190,20 +190,11 @@ void UpdateNameScore(string const & name, TSlice const & slice, v2::NameScore &

 template <typename TSlice>
 void UpdateNameScore(vector<strings::UniString> const & tokens, TSlice const & slice,
-                     v2::NameScore & bestScore, double & bestCoverage)
+                     v2::NameScore & bestScore)
 {
  auto const score = v2::GetNameScore(tokens, slice);
-  auto const coverage =
-      tokens.empty() ? 0 : static_cast<double>(slice.Size()) / static_cast<double>(tokens.size());
  if (score > bestScore)
-  {
    bestScore = score;
-    bestCoverage = coverage;
-  }
-  else if (score == bestScore && coverage > bestCoverage)
-  {
-    bestCoverage = coverage;
-  }
 }

 inline bool IsHashtagged(strings::UniString const & s) { return !s.empty() && s[0] == '#'; }
@@ -663,8 +654,8 @@ class PreResult2Maker
      vector<strings::UniString> tokens;
      SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());

-      UpdateNameScore(tokens, slice, info.m_nameScore, info.m_nameCoverage);
-      UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore, info.m_nameCoverage);
+      UpdateNameScore(tokens, slice, info.m_nameScore);
+      UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore);
    }

    if (info.m_searchType == v2::SearchModel::SEARCH_TYPE_BUILDING)
@@ -679,16 +670,14 @@ class PreResult2Maker
        ++matched[i].first;
    });

-    info.m_matchByTrueCats =
-        all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
-               {
-                 return m.first != 0;
-               });
-    info.m_matchByFalseCats =
-        all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
-               {
-                 return m.first == 0 && m.second != 0;
-               });
+    info.m_pureCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
+                             {
+                               return m.first != 0;
+                             });
+    info.m_falseCats = all_of(matched.begin(), matched.end(), [](pair<size_t, size_t> const & m)
+                              {
+                                return m.first == 0 && m.second != 0;
+                              });
  }

  uint8_t NormalizeRank(uint8_t rank, v2::SearchModel::SearchType type, m2::PointD const & center,
--- a/search/v2/geocoder.cpp
+++ b/search/v2/geocoder.cpp
@@ -1557,12 +1557,12 @@ SearchModel::SearchType Geocoder::GetSearchTypeInGeocoding(uint32_t featureId)

 bool Geocoder::AllTokensUsed() const
 {
-  return all_of(m_usedTokens.begin(), m_usedTokens.end(), my::Id<bool>());
+  return all_of(m_usedTokens.begin(), m_usedTokens.end(), IdFunctor());
 }

 bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const
 {
-  return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, my::Id<bool>());
+  return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, IdFunctor());
 }

 size_t Geocoder::NumUnusedTokensGroups() const
--- a/search/v2/ranking_info.cpp
+++ b/search/v2/ranking_info.cpp
@@ -47,10 +47,9 @@ void RankingInfo::PrintCSVHeader(ostream & os)
  os << "DistanceToPivot"
     << ",Rank"
     << ",NameScore"
-     << ",NameCoverage"
     << ",SearchType"
-     << ",MatchByTrueCats"
-     << ",MatchByFalseCats";
+     << ",PureCats"
+     << ",FalseCats";
 }

 string DebugPrint(RankingInfo const & info)
@@ -60,10 +59,9 @@ string DebugPrint(RankingInfo const & info)
  os << "m_distanceToPivot:" << info.m_distanceToPivot << ",";
  os << "m_rank:" << static_cast<int>(info.m_rank) << ",";
  os << "m_nameScore:" << DebugPrint(info.m_nameScore) << ",";
-  os << "m_nameCoverage:" << info.m_nameCoverage << ",";
  os << "m_searchType:" << DebugPrint(info.m_searchType) << ",";
-  os << "m_matchByTrueCats:" << info.m_matchByTrueCats << ",";
-  os << "m_matchByFalseCats:" << info.m_matchByFalseCats;
+  os << "m_pureCats:" << info.m_pureCats << ",";
+  os << "m_falseCats:" << info.m_falseCats;
  os << "]";
  return os.str();
 }
@@ -72,8 +70,7 @@ void RankingInfo::ToCSV(ostream & os) const
 {
  os << fixed;
  os << m_distanceToPivot << "," << static_cast<int>(m_rank) << "," << DebugPrint(m_nameScore)
-     << "," << m_nameCoverage << "," << DebugPrint(m_searchType) << "," << m_matchByTrueCats << ","
-     << m_matchByFalseCats;
+     << "," << DebugPrint(m_searchType) << "," << m_pureCats << "," << m_falseCats;
 }

 double RankingInfo::GetLinearModelRank() const
@@ -86,11 +83,15 @@ double RankingInfo::GetLinearModelRank() const
  double const rank = static_cast<double>(m_rank) / numeric_limits<uint8_t>::max();

  auto nameScore = m_nameScore;
-  auto nameCoverage = m_nameCoverage;
-  if (m_matchByTrueCats || m_matchByFalseCats)
+  if (m_pureCats || m_falseCats)
  {
+    // If the feature was matched only by categorial tokens, it's
+    // better for ranking to set name score to zero.  For example,
+    // when we're looking for a "cafe", cafes "Cafe Pushkin" and
+    // "Lermontov" both match the request, but must be ranked
+    // according to their distances to the user position or viewport,
+    // even though "Cafe Pushkin" has a non-zero name score.
    nameScore = NAME_SCORE_ZERO;
-    nameCoverage = 0.0;
  }

  return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] +
--- a/search/v2/ranking_info.hpp
+++ b/search/v2/ranking_info.hpp
@@ -24,19 +24,17 @@ struct RankingInfo
  // Score for the feature's name.
  NameScore m_nameScore = NAME_SCORE_ZERO;

-  // Fraction of tokens from the query matched to a feature name.
-  double m_nameCoverage = 0;
-
  // Search type for the feature.
  SearchModel::SearchType m_searchType = SearchModel::SEARCH_TYPE_COUNT;

-  // True if the feature was matched only by tokens corresponding to
-  // it's categories.
-  bool m_matchByTrueCats = false;
+  // True if all of the tokens that the feature was matched by
+  // correspond to this feature's categories.
+  bool m_pureCats = false;

-  // True if the feature was matched only by tokens don't
-  // corresponding to it's categories.
-  bool m_matchByFalseCats = false;
+  // True if none of the tokens that the feature was matched by
+  // corresponds to this feature's categories although all of the
+  // tokens are categorial ones.
+  bool m_falseCats = false;

  static void PrintCSVHeader(ostream & os);