From 8ef7c2f7675e73d99a3683a0c1ee23f0b97abbaf Mon Sep 17 00:00:00 2001
From: tatiana-yan
Date: Fri, 22 May 2020 15:39:20 +0300
Subject: [PATCH] [search] Introduce ResultType feature for ranker.

---
Review notes (text between the three-dash line and the first diff is not part of the commit):

What the patch does:
  * Adds enum ResultType (TransportMajor ... General, Count) and RankingInfo::m_resultType.
  * The ranker fills m_resultType via GetResultType() only when the search type is TYPE_POI.
  * GetLinearModelRank() adds one kResultType coefficient for POI results.
  * A "ResultType" column is added to the CSV output, and scoring_model.py builds one dummy
    variable per RESULT_TYPES entry and prints the kResultType array.

Amendments made during review:
  * GetLinearModelRank(): m_resultType defaults to ResultType::Count while kResultType has
    exactly Underlying(ResultType::Count) entries, so an unclassified POI would read past the
    end of the array. A CHECK_NOT_EQUAL now guards the lookup.
  * IsServiceTypeChecker: the range-for loops copied every string / vector of strings; they now
    iterate by const reference.
  * Comments added on IsServiceTypeChecker and GetResultType().
  * Hunk ranges and the diffstat below are updated for the added lines. The "index" blob ids
    are NOT recomputed.

NOTE(review): the copy of this patch under review had lost its line breaks and everything
inside angle brackets. Template arguments were restored where usage determines them
(vector<string>, vector<vector<string>>, vector<uint32_t>, static_cast<double>). The system
header names on the bare "#include" context lines, the author e-mail and the truncated
"array const" hunk heading cannot be recovered from the patch and are left as found.
Indentation and the position of blank context lines were reconstructed from the hunk line
counts. TODO: confirm all of these against the repository before applying.

 indexer/feature_data.hpp               |   2 +
 search/ranker.cpp                      |   3 +
 search/ranking_info.cpp                | 107 +++++++++++++++++++++++++
 search/ranking_info.hpp                |  29 +++++++
 search/search_quality/scoring_model.py |  12 ++-
 5 files changed, 151 insertions(+), 2 deletions(-)

diff --git a/indexer/feature_data.hpp b/indexer/feature_data.hpp
index 79b9bb6633..e1098e1d09 100644
--- a/indexer/feature_data.hpp
+++ b/indexer/feature_data.hpp
@@ -82,6 +82,8 @@ namespace feature
     size_t Size() const { return m_size; }
     bool Empty() const { return (m_size == 0); }
 
+    Types::const_iterator cbegin() const { return m_types.cbegin(); }
+    Types::const_iterator cend() const { return m_types.cbegin() + m_size; }
     Types::const_iterator begin() const { return m_types.cbegin(); }
     Types::const_iterator end() const { return m_types.cbegin() + m_size; }
     Types::iterator begin() { return m_types.begin(); }
diff --git a/search/ranker.cpp b/search/ranker.cpp
index 9f90942fa8..5503eea6e4 100644
--- a/search/ranker.cpp
+++ b/search/ranker.cpp
@@ -14,6 +14,7 @@
 #include "indexer/brands_holder.hpp"
 #include "indexer/data_source.hpp"
 #include "indexer/feature_algo.hpp"
+#include "indexer/feature_data.hpp"
 #include "indexer/feature_utils.hpp"
 #include "indexer/ftypes_matcher.hpp"
 #include "indexer/search_string_utils.hpp"
@@ -353,6 +354,8 @@ class RankerResultMaker
     info.m_popularity = preInfo.m_popularity;
     info.m_rating = preInfo.m_rating;
     info.m_type = preInfo.m_type;
+    if (info.m_type == Model::TYPE_POI)
+      info.m_resultType = GetResultType(feature::TypesHolder(ft));
     info.m_allTokensUsed = preInfo.m_allTokensUsed;
     info.m_numTokens = m_params.GetNumTokens();
     info.m_exactMatch = preInfo.m_exactMatch;
diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp
index d41502aee8..a100398504 100644
--- a/search/ranking_info.cpp
+++ b/search/ranking_info.cpp
@@ -1,10 +1,14 @@
 #include "search/ranking_info.hpp"
 
+#include "search/utils.hpp"
+
 #include "ugc/types.hpp"
 
+#include "indexer/classificator.hpp"
 #include "indexer/search_string_utils.hpp"
 
 #include "base/assert.hpp"
+#include "base/stl_helpers.hpp"
 
 #include
 #include
@@ -50,6 +54,15 @@ double constexpr kType[Model::TYPE_COUNT] = {
   0.0233254 /* State */,
   0.1679389 /* Country */
 };
+double constexpr kResultType[base::Underlying(ResultType::Count)] = {
+  0.0338794 /* TransportMajor */,
+  0.0216298 /* TransportLocal */,
+  0.0064977 /* Eat */,
+  -0.0275763 /* Hotel */,
+  0.0358858 /* Attraction */,
+  -0.0195234 /* Service */,
+  -0.0128952 /* General */
+};
 
 // Coeffs sanity checks.
 static_assert(kHasName >= 0, "");
@@ -102,6 +115,44 @@ void PrintParse(ostringstream & oss, array const
   }
   oss << "]";
 }
+
+// Functor: true when any type of the feature, after ftype::TruncValue() to the list's level,
+// equals one of the service types listed in the constructor.
+class IsServiceTypeChecker
+{
+public:
+  IsServiceTypeChecker()
+  {
+    vector<string> const oneLevelTypes = {
+      "barrier",
+      "power",
+      "traffic_calming"
+    };
+
+    vector<vector<string>> const twoLevelTypes = {};
+
+    for (auto const & t : oneLevelTypes)
+      m_oneLevelTypes.push_back(classif().GetTypeByPath({t}));
+    for (auto const & t : twoLevelTypes)
+      m_twoLevelTypes.push_back(classif().GetTypeByPath(t));
+  }
+
+  bool operator()(feature::TypesHolder const & th) const
+  {
+    auto findType = [](vector<uint32_t> const & v, uint32_t t, uint8_t level) {
+      ftype::TruncValue(t, level);
+      return find(v.begin(), v.end(), t) != v.end();
+    };
+
+    return base::AnyOf(th, [&](auto t) {
+      return findType(m_oneLevelTypes, t, 1) || findType(m_twoLevelTypes, t, 2);
+    });
+  }
+
+private:
+  vector<uint32_t> m_oneLevelTypes;
+  vector<uint32_t> m_twoLevelTypes;
+};
 }  // namespace
 
 // static
@@ -118,6 +169,7 @@ void RankingInfo::PrintCSVHeader(ostream & os)
      << ",ErrorsMade"
      << ",MatchedFraction"
      << ",SearchType"
+     << ",ResultType"
      << ",PureCats"
      << ",FalseCats"
      << ",AllTokensUsed"
@@ -142,6 +194,7 @@ string DebugPrint(RankingInfo const & info)
   os << ", m_numTokens:" << info.m_numTokens;
   os << ", m_matchedFraction:" << info.m_matchedFraction;
   os << ", m_type:" << DebugPrint(info.m_type);
+  os << ", m_resultType:" << DebugPrint(info.m_resultType);
   os << ", m_pureCats:" << info.m_pureCats;
   os << ", m_falseCats:" << info.m_falseCats;
   os << ", m_allTokensUsed:" << info.m_allTokensUsed;
@@ -163,6 +216,7 @@ void RankingInfo::ToCSV(ostream & os) const
   os << GetErrorsMadePerToken() << ",";
   os << m_matchedFraction << ",";
   os << DebugPrint(m_type) << ",";
+  os << DebugPrint(m_resultType) << ",";
   os << m_pureCats << ",";
   os << m_falseCats << ",";
   os << (m_allTokensUsed ? 1 : 0) << ",";
@@ -203,6 +257,12 @@ double RankingInfo::GetLinearModelRank() const
   result += kRating * rating;
   result += m_falseCats * kFalseCats;
   result += kType[m_type];
+  if (m_type == Model::TYPE_POI)
+  {
+    // |m_resultType| defaults to ResultType::Count, which is one past the last kResultType entry.
+    CHECK_NOT_EQUAL(m_resultType, ResultType::Count, ());
+    result += kResultType[base::Underlying(m_resultType)];
+  }
   result += kNameScore[nameScore];
   result += kErrorsMade * GetErrorsMadePerToken();
   result += kMatchedFraction * m_matchedFraction;
@@ -236,4 +296,51 @@ double RankingInfo::GetErrorsMadePerToken() const
   CHECK_GREATER(m_numTokens, 0, ());
   return static_cast<double>(m_errorsMade.m_errorsMade) / static_cast<double>(m_numTokens);
 }
+
+ResultType GetResultType(feature::TypesHolder const & th)
+{
+  if (ftypes::IsEatChecker::Instance()(th))
+    return ResultType::Eat;
+  if (ftypes::IsHotelChecker::Instance()(th))
+    return ResultType::Hotel;
+  if (ftypes::IsRailwayStationChecker::Instance()(th) ||
+      ftypes::IsSubwayStationChecker::Instance()(th) || ftypes::IsAirportChecker::Instance()(th))
+  {
+    return ResultType::TransportMajor;
+  }
+  if (ftypes::IsPublicTransportStopChecker::Instance()(th))
+    return ResultType::TransportLocal;
+
+  // We have several lists for attractions: short list in search categories for @tourism and long
+  // list in ftypes::AttractionsChecker. We have highway-pedestrian, place-square, historic-tomb,
+  // landuse-cemetery, amenity-townhall etc in long list and logic of long list is "if this object
+  // has high popularity and/or wiki description probably it is attraction". It's better to use
+  // short list here.
+  auto static const attractionTypes =
+      search::GetCategoryTypes("sights", "en", GetDefaultCategories());
+  if (base::AnyOf(attractionTypes, [&th](auto t) { return th.Has(t); }))
+    return ResultType::Attraction;
+
+  static const IsServiceTypeChecker isServiceTypeChecker;
+  if (isServiceTypeChecker(th))
+    return ResultType::Service;
+
+  return ResultType::General;
+}
+
+string DebugPrint(ResultType type)
+{
+  switch (type)
+  {
+  case ResultType::TransportMajor: return "TransportMajor";
+  case ResultType::TransportLocal: return "TransportLocal";
+  case ResultType::Eat: return "Eat";
+  case ResultType::Hotel: return "Hotel";
+  case ResultType::Attraction: return "Attraction";
+  case ResultType::Service: return "Service";
+  case ResultType::General: return "General";
+  case ResultType::Count: return "Count";
+  }
+  UNREACHABLE();
+}
 }  // namespace search
diff --git a/search/ranking_info.hpp b/search/ranking_info.hpp
index f704b4ff99..ae0b76e73d 100644
--- a/search/ranking_info.hpp
+++ b/search/ranking_info.hpp
@@ -4,6 +4,8 @@
 #include "search/pre_ranking_info.hpp"
 #include "search/ranking_utils.hpp"
 
+#include "indexer/feature_data.hpp"
+
 #include
 #include
 #include
@@ -15,6 +17,25 @@ class FeatureType;
 
 namespace search
 {
+enum class ResultType : uint8_t
+{
+  // Railway/subway stations, airports
+  TransportMajor,
+  // Bus/tram stops
+  TransportLocal,
+  // Cafes, restaurants, bars
+  Eat,
+  // Hotels
+  Hotel,
+  // Attractions
+  Attraction,
+  // Service types: power lines and substations, barrier-fence, etc.
+  Service,
+  // All other POIs
+  General,
+  Count
+};
+
 struct RankingInfo
 {
   static double const kMaxDistMeters;
@@ -72,6 +93,9 @@ struct RankingInfo
   // Search type for the feature.
   Model::Type m_type = Model::TYPE_COUNT;
 
+  // Type (food/transport/attraction/etc) for POI results for non-categorial requests.
+  ResultType m_resultType = ResultType::Count;
+
   // True if all of the tokens that the feature was matched by
   // correspond to this feature's categories.
   bool m_pureCats = false;
@@ -88,5 +112,10 @@ struct RankingInfo
   bool m_hasName = false;
 };
 
+// Returns the ResultType of a feature by its types. Checks run in a fixed order (Eat, Hotel,
+// TransportMajor, TransportLocal, Attraction, Service); the first match wins, otherwise General.
+ResultType GetResultType(feature::TypesHolder const & th);
+
 std::string DebugPrint(RankingInfo const & info);
+std::string DebugPrint(ResultType type);
 }  // namespace search
diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py
index f74242fb7a..32342ba414 100755
--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@@ -20,8 +20,9 @@ MAX_POPULARITY = 255.0
 RELEVANCES = {'Harmful': -3, 'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
 NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match']
 SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
+RESULT_TYPES = ['TransportMajor', 'TransportLocal', 'Eat', 'Hotel', 'Attraction', 'Service', 'General']
 FEATURES = ['DistanceToPivot', 'Rank', 'Popularity', 'Rating', 'FalseCats', 'ErrorsMade', 'MatchedFraction',
-            'AllTokensUsed', 'ExactCountryOrCapital'] + NAME_SCORES + SEARCH_TYPES
+            'AllTokensUsed', 'ExactCountryOrCapital'] + NAME_SCORES + SEARCH_TYPES + RESULT_TYPES
 
 BOOTSTRAP_ITERATIONS = 10000
 
@@ -62,6 +63,10 @@ def normalize_data(data):
     for st in SEARCH_TYPES:
         data[st] = data['SearchType'].apply(lambda v: int(st == v))
 
+    # Adds dummy variables to data for RESULT_TYPES.
+    for rt in RESULT_TYPES:
+        data[rt] = data['ResultType'].apply(lambda v: int(rt == v))
+
 
 def compute_ndcg(relevances):
     """
@@ -215,17 +220,20 @@ def cpp_output(features, ws):
     Prints feature-coeff pairs in the C++-compatible format.
     """
 
-    ns, st = [], []
+    ns, st, rt = [], [], []
 
     for f, w in zip(features, ws):
         if f in NAME_SCORES:
             ns.append((f, w))
         elif f in SEARCH_TYPES:
             st.append((f, w))
+        elif f in RESULT_TYPES:
+            rt.append((f, w))
         else:
             print_const(f, w)
     print_array('kNameScore', 'NameScore::NAME_SCORE_COUNT', ns)
     print_array('kType', 'Model::TYPE_COUNT', st)
+    print_array('kResultType', 'base::Underlying(ResultType::Count)', rt)
 
 
 def show_bootstrap_statistics(clf, X, y, features):