diff --git a/search/geocoder.cpp b/search/geocoder.cpp index 3812b0fcf6..8f4edc66ed 100644 --- a/search/geocoder.cpp +++ b/search/geocoder.cpp @@ -315,7 +315,8 @@ CBV DecimateCianResults(CBV const & cbv) // to worsen the percieved result. size_t const kMaxCianResults = 10000; minstd_rand rng(0); - auto survivedIds = base::RandomSample(cbv.PopCount(), kMaxCianResults, rng); + auto survivedIds = + base::RandomSample(base::checked_cast(cbv.PopCount()), kMaxCianResults, rng); sort(survivedIds.begin(), survivedIds.end()); auto it = survivedIds.begin(); vector setBits; @@ -767,7 +768,7 @@ void Geocoder::ForEachCountry(vector> const & infos, TFn && } } -void Geocoder::MatchCategories(BaseContext & ctx, bool aroundPivot) +size_t Geocoder::MatchCategories(BaseContext & ctx, bool aroundPivot) { auto features = ctx.m_features[0]; @@ -778,13 +779,16 @@ void Geocoder::MatchCategories(BaseContext & ctx, bool aroundPivot) features = filter.Filter(features); } + size_t numEmitted = 0; auto emit = [&](uint64_t bit) { auto const featureId = base::asserted_cast(bit); Model::Type type; if (!GetTypeInGeocoding(ctx, featureId, type)) return; - EmitResult(ctx, m_context->GetId(), featureId, type, TokenRange(0, 1), nullptr /* geoParts */); + EmitResult(ctx, m_context->GetId(), featureId, type, TokenRange(0, 1), nullptr /* geoParts */, + true /* allTokensUsed */); + ++numEmitted; }; // By now there's only one token and zero prefix tokens. @@ -792,23 +796,27 @@ void Geocoder::MatchCategories(BaseContext & ctx, bool aroundPivot) // using the exact (non-fuzzy) matching and intersected // with viewport, if needed. Every such feature is relevant. features.ForEach(emit); + + return numEmitted; } -void Geocoder::MatchRegions(BaseContext & ctx, Region::Type type) +size_t Geocoder::MatchRegions(BaseContext & ctx, Region::Type type) { + size_t numEmitted = 0; + switch (type) { case Region::TYPE_STATE: // Tries to skip state matching and go to cities matching. // Then, performs states matching. - MatchCities(ctx); + numEmitted += MatchCities(ctx); break; case Region::TYPE_COUNTRY: // Tries to skip country matching and go to states matching. // Then, performs countries matching. - MatchRegions(ctx, Region::TYPE_STATE); + numEmitted += MatchRegions(ctx, Region::TYPE_STATE); break; - case Region::TYPE_COUNT: ASSERT(false, ("Invalid region type.")); return; + case Region::TYPE_COUNT: ASSERT(false, ("Invalid region type.")); return numEmitted; } auto const & regions = m_regions[type]; @@ -850,27 +858,32 @@ void Geocoder::MatchRegions(BaseContext & ctx, Region::Type type) MY_SCOPE_GUARD(cleanup, [&ctx]() { ctx.m_regions.pop_back(); }); ScopedMarkTokens mark(ctx.m_tokens, BaseContext::FromRegionType(type), tokenRange); + if (ctx.AllTokensUsed()) { // Region matches to search query, we need to emit it as is. - EmitResult(ctx, region, tokenRange); + EmitResult(ctx, region, tokenRange, true /* allTokensUsed */); continue; } switch (type) { - case Region::TYPE_STATE: MatchCities(ctx); break; - case Region::TYPE_COUNTRY: MatchRegions(ctx, Region::TYPE_STATE); break; + case Region::TYPE_STATE: numEmitted += MatchCities(ctx); break; + case Region::TYPE_COUNTRY: numEmitted += MatchRegions(ctx, Region::TYPE_STATE); break; case Region::TYPE_COUNT: ASSERT(false, ("Invalid region type.")); break; } } } + + return numEmitted; } -void Geocoder::MatchCities(BaseContext & ctx) +size_t Geocoder::MatchCities(BaseContext & ctx) { ASSERT(!ctx.m_city, ()); + size_t numEmitted = 0; + // Localities are ordered my (m_startToken, m_endToken) pairs. for (auto const & p : m_cities) { @@ -895,7 +908,8 @@ void Geocoder::MatchCities(BaseContext & ctx) if (ctx.AllTokensUsed()) { // City matches to search query, we need to emit it as is. - EmitResult(ctx, city, tokenRange); + EmitResult(ctx, city, tokenRange, true /* allTokensUsed */); + ++numEmitted; continue; } @@ -909,19 +923,21 @@ void Geocoder::MatchCities(BaseContext & ctx) continue; LocalityFilter filter(cityFeatures); - LimitedSearch(ctx, filter); + numEmitted += LimitedSearch(ctx, filter); } } + + return numEmitted; } -void Geocoder::MatchAroundPivot(BaseContext & ctx) +size_t Geocoder::MatchAroundPivot(BaseContext & ctx) { auto const features = RetrieveGeometryFeatures(*m_context, m_params.m_pivot, RECT_ID_PIVOT); ViewportFilter filter(features, m_preRanker.Limit() /* threshold */); - LimitedSearch(ctx, filter); + return LimitedSearch(ctx, filter); } -void Geocoder::LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter) +size_t Geocoder::LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter) { m_filter = &filter; MY_SCOPE_GUARD(resetFilter, [&]() @@ -932,16 +948,19 @@ void Geocoder::LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter) if (!ctx.m_streets) ctx.m_streets = m_streetsCache.Get(*m_context); - MatchUnclassified(ctx, 0 /* curToken */); + size_t numEmitted = 0; - auto const search = [this, &ctx]() - { - GreedilyMatchStreets(ctx); - MatchPOIsAndBuildings(ctx, 0 /* curToken */); + numEmitted += MatchUnclassified(ctx, 0 /* curToken */); + + auto const search = [this, &ctx, &numEmitted]() { + numEmitted += GreedilyMatchStreets(ctx); + numEmitted += MatchPOIsAndBuildings(ctx, 0 /* curToken */); }; WithPostcodes(ctx, search); search(); + + return numEmitted; } template @@ -983,17 +1002,20 @@ void Geocoder::WithPostcodes(BaseContext & ctx, TFn && fn) } } -void Geocoder::GreedilyMatchStreets(BaseContext & ctx) +size_t Geocoder::GreedilyMatchStreets(BaseContext & ctx) { vector predictions; StreetsMatcher::Go(ctx, *m_filter, m_params, predictions); + size_t numEmitted = 0; for (auto const & prediction : predictions) - CreateStreetsLayerAndMatchLowerLayers(ctx, prediction); + numEmitted += CreateStreetsLayerAndMatchLowerLayers(ctx, prediction); + + return numEmitted; } -void Geocoder::CreateStreetsLayerAndMatchLowerLayers(BaseContext & ctx, - StreetsMatcher::Prediction const & prediction) +size_t Geocoder::CreateStreetsLayerAndMatchLowerLayers( + BaseContext & ctx, StreetsMatcher::Prediction const & prediction) { auto & layers = ctx.m_layers; ASSERT(layers.empty(), ()); @@ -1005,20 +1027,28 @@ void Geocoder::CreateStreetsLayerAndMatchLowerLayers(BaseContext & ctx, InitLayer(Model::TYPE_STREET, prediction.m_tokenRange, layer); vector sortedFeatures; - sortedFeatures.reserve(prediction.m_features.PopCount()); + sortedFeatures.reserve(base::checked_cast(prediction.m_features.PopCount())); prediction.m_features.ForEach([&sortedFeatures](uint64_t bit) { sortedFeatures.push_back(base::asserted_cast(bit)); }); layer.m_sortedFeatures = &sortedFeatures; ScopedMarkTokens mark(ctx.m_tokens, BaseContext::TOKEN_TYPE_STREET, prediction.m_tokenRange); - MatchPOIsAndBuildings(ctx, 0 /* curToken */); + size_t numEmitted = MatchPOIsAndBuildings(ctx, 0 /* curToken */); + + // A relaxed best effort parse: at least show the street if we can find one. + if (numEmitted == 0) + numEmitted += FindPaths(ctx); + + return numEmitted; } -void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken) +size_t Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken) { BailIfCancelled(); + size_t numEmitted = 0; + auto & layers = ctx.m_layers; curToken = ctx.SkipUsedTokens(curToken); @@ -1027,7 +1057,10 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken) // All tokens were consumed, find paths through layers, emit // features. if (m_postcodes.m_features.IsEmpty()) - return FindPaths(ctx); + { + numEmitted += FindPaths(ctx); + return numEmitted; + } // When there are no layers but user entered a postcode, we have // to emit all features matching to the postcode. @@ -1042,14 +1075,18 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken) if (GetTypeInGeocoding(ctx, featureId, type)) { EmitResult(ctx, m_context->GetId(), featureId, type, m_postcodes.m_tokenRange, - nullptr /* geoParts */); + nullptr /* geoParts */, true /* allTokensUsed */); + ++numEmitted; } }); - return; + return numEmitted; } if (!(layers.size() == 1 && layers[0].m_type == Model::TYPE_STREET)) - return FindPaths(ctx); + { + numEmitted += FindPaths(ctx); + return numEmitted; + } // If there're only one street layer but user also entered a // postcode, we need to emit all features matching to postcode on @@ -1064,7 +1101,8 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken) if (!m_postcodes.m_features.HasBit(id)) continue; EmitResult(ctx, m_context->GetId(), id, Model::TYPE_STREET, layers.back().m_tokenRange, - nullptr /* geoParts */); + nullptr /* geoParts */, true /* allTokensUsed */); + ++numEmitted; } } @@ -1081,7 +1119,8 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken) features.push_back(base::asserted_cast(bit)); }); layer.m_sortedFeatures = &features; - return FindPaths(ctx); + numEmitted += FindPaths(ctx); + return numEmitted; } layers.emplace_back(); @@ -1135,7 +1174,6 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken) bool const looksLikeHouseNumber = house_numbers::LooksLikeHouseNumber( layers.back().m_subQuery, layers.back().m_lastTokenIsPrefix); - if (filtered.IsEmpty() && !looksLikeHouseNumber) break; @@ -1197,9 +1235,11 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken) ScopedMarkTokens mark(ctx.m_tokens, BaseContext::FromModelType(layer.m_type), TokenRange(curToken, curToken + n)); if (IsLayerSequenceSane(layers)) - MatchPOIsAndBuildings(ctx, curToken + n); + numEmitted += MatchPOIsAndBuildings(ctx, curToken + n); } } + + return numEmitted; } bool Geocoder::IsLayerSequenceSane(vector const & layers) const @@ -1244,12 +1284,14 @@ bool Geocoder::IsLayerSequenceSane(vector const & layers) const return true; } -void Geocoder::FindPaths(BaseContext const & ctx) +size_t Geocoder::FindPaths(BaseContext const & ctx) { auto const & layers = ctx.m_layers; + size_t numEmitted = 0; + if (layers.empty()) - return; + return numEmitted; // Layers ordered by search type. vector sortedLayers; @@ -1264,13 +1306,17 @@ void Geocoder::FindPaths(BaseContext const & ctx) m_matcher->SetPostcodes(&m_postcodes.m_features); else m_matcher->SetPostcodes(nullptr); + m_finder.ForEachReachableVertex( - *m_matcher, sortedLayers, [this, &ctx, &innermostLayer](IntersectionResult const & result) - { + *m_matcher, sortedLayers, + [this, &ctx, &innermostLayer, &numEmitted](IntersectionResult const & result) { ASSERT(result.IsValid(), ()); EmitResult(ctx, m_context->GetId(), result.InnermostResult(), innermostLayer.m_type, - innermostLayer.m_tokenRange, &result); + innermostLayer.m_tokenRange, &result, ctx.AllTokensUsed()); + ++numEmitted; }); + + return numEmitted; } void Geocoder::TraceResult(Tracer & tracer, BaseContext const & ctx, MwmSet::MwmId const & mwmId, @@ -1298,7 +1344,7 @@ void Geocoder::TraceResult(Tracer & tracer, BaseContext const & ctx, MwmSet::Mwm void Geocoder::EmitResult(BaseContext const & ctx, MwmSet::MwmId const & mwmId, uint32_t ftId, Model::Type type, TokenRange const & tokenRange, - IntersectionResult const * geoParts) + IntersectionResult const * geoParts, bool allTokensUsed) { FeatureID id(mwmId, ftId); @@ -1333,26 +1379,32 @@ void Geocoder::EmitResult(BaseContext const & ctx, MwmSet::MwmId const & mwmId, if (geoParts) info.m_geoParts = *geoParts; + info.m_allTokensUsed = allTokensUsed; + m_preRanker.Emplace(id, info); } void Geocoder::EmitResult(BaseContext const & ctx, Region const & region, - TokenRange const & tokenRange) + TokenRange const & tokenRange, bool allTokensUsed) { auto const type = Region::ToModelType(region.m_type); - EmitResult(ctx, region.m_countryId, region.m_featureId, type, tokenRange, nullptr /* geoParts */); + EmitResult(ctx, region.m_countryId, region.m_featureId, type, tokenRange, nullptr /* geoParts */, + allTokensUsed); } -void Geocoder::EmitResult(BaseContext const & ctx, City const & city, TokenRange const & tokenRange) +void Geocoder::EmitResult(BaseContext const & ctx, City const & city, TokenRange const & tokenRange, + bool allTokensUsed) { EmitResult(ctx, city.m_countryId, city.m_featureId, city.m_type, tokenRange, - nullptr /* geoParts */); + nullptr /* geoParts */, allTokensUsed); } -void Geocoder::MatchUnclassified(BaseContext & ctx, size_t curToken) +size_t Geocoder::MatchUnclassified(BaseContext & ctx, size_t curToken) { + size_t numEmitted = 0; + if (m_params.m_cianMode) - return; + return numEmitted; ASSERT(ctx.m_layers.empty(), ()); @@ -1364,7 +1416,7 @@ void Geocoder::MatchUnclassified(BaseContext & ctx, size_t curToken) // ok to match something to "Park London Hyde", because tokens // "Park" and "Hyde" are not adjacent. if (ctx.NumUnusedTokenGroups() != 1) - return; + return numEmitted; CBV allFeatures; allFeatures.SetFull(); @@ -1388,10 +1440,12 @@ void Geocoder::MatchUnclassified(BaseContext & ctx, size_t curToken) if (type == Model::TYPE_UNCLASSIFIED) { EmitResult(ctx, m_context->GetId(), featureId, type, TokenRange(startToken, curToken), - nullptr /* geoParts */); + nullptr /* geoParts */, true /* allTokensUsed */); + ++numEmitted; } }; allFeatures.ForEach(emitUnclassified); + return numEmitted; } CBV Geocoder::RetrievePostcodeFeatures(MwmContext const & context, TokenSlice const & slice) diff --git a/search/geocoder.hpp b/search/geocoder.hpp index dbb77dcd44..f6719ad98c 100644 --- a/search/geocoder.hpp +++ b/search/geocoder.hpp @@ -167,41 +167,49 @@ private: inline void BailIfCancelled() { ::search::BailIfCancelled(m_cancellable); } // A fast-path branch for categorial requests. - void MatchCategories(BaseContext & ctx, bool aroundPivot); + // Returns the number of emitted results. + size_t MatchCategories(BaseContext & ctx, bool aroundPivot); // Tries to find all countries and states in a search query and then // performs matching of cities in found maps. - void MatchRegions(BaseContext & ctx, Region::Type type); + // Returns the number of emitted results. + size_t MatchRegions(BaseContext & ctx, Region::Type type); // Tries to find all cities in a search query and then performs // matching of streets in found cities. - void MatchCities(BaseContext & ctx); + // Returns the number of emitted results. + size_t MatchCities(BaseContext & ctx); // Tries to do geocoding without localities, ie. find POIs, // BUILDINGs and STREETs without knowledge about country, state, // city or village. If during the geocoding too many features are // retrieved, viewport is used to throw away excess features. - void MatchAroundPivot(BaseContext & ctx); + // Returns the number of emitted results. + size_t MatchAroundPivot(BaseContext & ctx); // Tries to do geocoding in a limited scope, assuming that knowledge // about high-level features, like cities or countries, is // incorporated into |filter|. - void LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter); + // Returns the number of emitted results. + size_t LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter); template void WithPostcodes(BaseContext & ctx, TFn && fn); // Tries to match some adjacent tokens in the query as streets and // then performs geocoding in street vicinities. - void GreedilyMatchStreets(BaseContext & ctx); + // Returns the number of emitted results. + size_t GreedilyMatchStreets(BaseContext & ctx); - void CreateStreetsLayerAndMatchLowerLayers(BaseContext & ctx, - StreetsMatcher::Prediction const & prediction); + // Returns the number of emitted results. + size_t CreateStreetsLayerAndMatchLowerLayers(BaseContext & ctx, + StreetsMatcher::Prediction const & prediction); // Tries to find all paths in a search tree, where each edge is // marked with some substring of the query tokens. These paths are // called "layer sequence" and current path is stored in |m_layers|. - void MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken); + // Returns the number of emitted results. + size_t MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken); // Returns true if current path in the search tree (see comment for // MatchPOIsAndBuildings()) looks sane. This method is used as a fast @@ -210,7 +218,8 @@ private: // Finds all paths through layers and emits reachable features from // the lowest layer. - void FindPaths(BaseContext const & ctx); + // Returns the number of emitted results. + size_t FindPaths(BaseContext const & ctx); void TraceResult(Tracer & tracer, BaseContext const & ctx, MwmSet::MwmId const & mwmId, uint32_t ftId, Model::Type type, TokenRange const & tokenRange); @@ -218,14 +227,17 @@ private: // Forms result and feeds it to |m_preRanker|. void EmitResult(BaseContext const & ctx, MwmSet::MwmId const & mwmId, uint32_t ftId, Model::Type type, TokenRange const & tokenRange, - IntersectionResult const * geoParts); - void EmitResult(BaseContext const & ctx, Region const & region, TokenRange const & tokenRange); - void EmitResult(BaseContext const & ctx, City const & city, TokenRange const & tokenRange); + IntersectionResult const * geoParts, bool allTokensUsed); + void EmitResult(BaseContext const & ctx, Region const & region, TokenRange const & tokenRange, + bool allTokensUsed); + void EmitResult(BaseContext const & ctx, City const & city, TokenRange const & tokenRange, + bool allTokensUsed); // Tries to match unclassified objects from lower layers, like // parks, forests, lakes, rivers, etc. This method finds all // UNCLASSIFIED objects that match to all currently unused tokens. - void MatchUnclassified(BaseContext & ctx, size_t curToken); + // Returns the number of emitted results. + size_t MatchUnclassified(BaseContext & ctx, size_t curToken); // A wrapper around RetrievePostcodeFeatures. CBV RetrievePostcodeFeatures(MwmContext const & context, TokenSlice const & slice); diff --git a/search/pre_ranking_info.hpp b/search/pre_ranking_info.hpp index 6240c719c7..8408931c34 100644 --- a/search/pre_ranking_info.hpp +++ b/search/pre_ranking_info.hpp @@ -22,13 +22,13 @@ struct PreRankingInfo m_tokenRange[m_type] = range; } - inline TokenRange const & InnermostTokenRange() const + TokenRange const & InnermostTokenRange() const { ASSERT_LESS(m_type, Model::TYPE_COUNT, ()); return m_tokenRange[m_type]; } - inline size_t GetNumTokens() const { return InnermostTokenRange().Size(); } + size_t GetNumTokens() const { return InnermostTokenRange().Size(); } // An abstract distance from the feature to the pivot. Measurement // units do not matter here. @@ -44,6 +44,10 @@ struct PreRankingInfo // building and street ids are in |m_geoParts|. IntersectionResult m_geoParts; + // True iff all tokens that are not stop-words + // were used when retrieving the feature. + bool m_allTokensUsed = true; + // Rank of the feature. uint8_t m_rank = 0; diff --git a/search/ranker.cpp b/search/ranker.cpp index 65f313309d..f38fa2694b 100644 --- a/search/ranker.cpp +++ b/search/ranker.cpp @@ -3,6 +3,8 @@ #include "search/emitter.hpp" #include "search/geometry_utils.hpp" #include "search/highlighting.hpp" +#include "search/model.hpp" +#include "search/pre_ranking_info.hpp" #include "search/token_slice.hpp" #include "search/utils.hpp" @@ -231,6 +233,7 @@ class RankerResultMaker info.m_distanceToPivot = MercatorBounds::DistanceOnEarth(center, pivot); info.m_rank = preInfo.m_rank; info.m_type = preInfo.m_type; + info.m_allTokensUsed = preInfo.m_allTokensUsed; auto const nameScores = GetNameScores(ft, m_params, preInfo.InnermostTokenRange(), info.m_type); diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp index 704adb71dc..8932a6df7f 100644 --- a/search/ranking_info.cpp +++ b/search/ranking_info.cpp @@ -12,25 +12,26 @@ namespace { // See search/search_quality/scoring_model.py for details. In short, // these coeffs correspond to coeffs in a linear model. -double const kDistanceToPivot = -1.0000000; -double const kRank = 0.5238890; -double const kFalseCats = -0.7319971; -double const kErrorsMade = -0.0238639; +double const kDistanceToPivot = -0.3359819; +double const kRank = 0.3886029; +double const kFalseCats = 0.0000000; +double const kErrorsMade = 0.0201364; +double const kAllTokensUsed = 1.0000000; double const kNameScore[NameScore::NAME_SCORE_COUNT] = { - -0.1683931 /* Zero */, - 0.0268117 /* Substring */, - 0.0599575 /* Prefix */, - 0.0816240 /* Full Match */ + -0.6731264 /* Zero */, + 0.2244507 /* Substring */, + 0.2141080 /* Prefix */, + 0.2345677 /* Full Match */ }; double const kType[Model::TYPE_COUNT] = { - -0.4322325 /* POI */, - -0.4322325 /* Building */, - -0.3823704 /* Street */, - -0.3747346 /* Unclassified */, - -0.4453585 /* Village */, - 0.3900264 /* City */, - 0.5397572 /* State */, - 0.7049124 /* Country */ + -0.1749965 /* POI */, + -0.1749965 /* Building */, + -0.0777042 /* Street */, + -0.0695158 /* Unclassified */, + -0.1233553 /* Village */, + 0.0391744 /* City */, + 0.1592614 /* State */, + 0.2471361 /* Country */ }; double TransformDistance(double distance) @@ -51,7 +52,8 @@ void RankingInfo::PrintCSVHeader(ostream & os) << ",ErrorsMade" << ",SearchType" << ",PureCats" - << ",FalseCats"; + << ",FalseCats" + << ",AllTokensUsed"; } string DebugPrint(RankingInfo const & info) @@ -64,7 +66,8 @@ string DebugPrint(RankingInfo const & info) os << "m_errorsMade:" << DebugPrint(info.m_errorsMade) << ","; os << "m_type:" << DebugPrint(info.m_type) << ","; os << "m_pureCats:" << info.m_pureCats << ","; - os << "m_falseCats:" << info.m_falseCats; + os << "m_falseCats:" << info.m_falseCats << ","; + os << "m_allTokensUsed:" << boolalpha << info.m_allTokensUsed; os << "]"; return os.str(); } @@ -78,7 +81,8 @@ void RankingInfo::ToCSV(ostream & os) const os << GetErrorsMade() << ","; os << DebugPrint(m_type) << ","; os << m_pureCats << ","; - os << m_falseCats; + os << m_falseCats << ","; + os << (m_allTokensUsed ? 1 : 0); } double RankingInfo::GetLinearModelRank() const @@ -102,8 +106,15 @@ double RankingInfo::GetLinearModelRank() const nameScore = NAME_SCORE_ZERO; } - return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] + - kErrorsMade * GetErrorsMade() + kType[m_type] + m_falseCats * kFalseCats; + double result = 0.0; + result += kDistanceToPivot * distanceToPivot; + result += kRank * rank; + result += kNameScore[nameScore]; + result += kErrorsMade * GetErrorsMade(); + result += kType[m_type]; + result += m_falseCats * kFalseCats; + result += (m_allTokensUsed ? +1.0 : -1.0) * kAllTokensUsed; + return result; } size_t RankingInfo::GetErrorsMade() const diff --git a/search/ranking_info.hpp b/search/ranking_info.hpp index fb9b261024..efe7da45b7 100644 --- a/search/ranking_info.hpp +++ b/search/ranking_info.hpp @@ -29,6 +29,10 @@ struct RankingInfo // Number of typos. ErrorsMade m_errorsMade; + // True iff all tokens that are not stop-words + // were used when retrieving the feature. + bool m_allTokensUsed = true; + // Search type for the feature. Model::Type m_type = Model::TYPE_COUNT; diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index 943d02d1bb..a636503a1a 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -192,23 +192,25 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke) TEST(ResultsMatch("feynman street 3", rules), ()); } { - TRules rules = {ExactMatch(wonderlandId, feynmanHouse), ExactMatch(wonderlandId, lantern1)}; + TRules rules = {ExactMatch(wonderlandId, feynmanHouse), ExactMatch(wonderlandId, lantern1), + ExactMatch(wonderlandId, firstAprilStreet)}; TEST(ResultsMatch("feynman street 1", rules), ()); } { TRules rules = {ExactMatch(wonderlandId, bohrHouse), ExactMatch(wonderlandId, hilbertHouse), - ExactMatch(wonderlandId, lantern1)}; + ExactMatch(wonderlandId, lantern1), ExactMatch(wonderlandId, firstAprilStreet)}; TEST(ResultsMatch("bohr street 1", rules), ()); } { - TEST(ResultsMatch("bohr street 1 unit 3", TRules()), ()); + TEST(ResultsMatch("bohr street 1 unit 3", {ExactMatch(wonderlandId, bohrStreet1)}), ()); } { TRules rules = {ExactMatch(wonderlandId, lantern1), ExactMatch(wonderlandId, lantern2)}; TEST(ResultsMatch("bohr street 1 lantern ", rules), ()); } { - TRules rules = {ExactMatch(wonderlandId, feynmanHouse)}; + TRules rules = {ExactMatch(wonderlandId, feynmanHouse), + ExactMatch(wonderlandId, feynmanStreet)}; TEST(ResultsMatch("wonderland los alamos feynman 1 unit 1 ", rules), ()); } { @@ -224,12 +226,13 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke) TEST(ResultsMatch("Los Alamos 2", rules), ()); } { - TRules rules = {ExactMatch(wonderlandId, bornHouse)}; - TEST(ResultsMatch("long pond 1st april street 8", rules), ()); + TRules rules = {ExactMatch(wonderlandId, bornHouse), + ExactMatch(wonderlandId, firstAprilStreet)}; + TEST(ResultsMatch("long pond 1st april street 8 ", rules), ()); } { - TRules rules = {ExactMatch(wonderlandId, terranceHouse)}; + TRules rules = {ExactMatch(wonderlandId, terranceHouse), ExactMatch(wonderlandId, stradaDrive)}; TEST(ResultsMatch("Toronto strada drive 155", rules), ()); } } @@ -483,27 +486,27 @@ UNIT_CLASS_TEST(ProcessorTest, TestHouseNumbers) }); { - TRules rules{ExactMatch(countryId, building0)}; - TEST(ResultsMatch("Зеленоград генералова к100", "ru", rules), ()); + TRules rules{ExactMatch(countryId, building0), ExactMatch(countryId, street)}; + TEST(ResultsMatch("Зеленоград генералова к100 ", "ru", rules), ()); } { - TRules rules{ExactMatch(countryId, building1)}; - TEST(ResultsMatch("Зеленоград генералова к200", "ru", rules), ()); + TRules rules{ExactMatch(countryId, building1), ExactMatch(countryId, street)}; + TEST(ResultsMatch("Зеленоград генералова к200 ", "ru", rules), ()); } { - TRules rules{ExactMatch(countryId, building1)}; - TEST(ResultsMatch("Зеленоград к200 генералова", "ru", rules), ()); + TRules rules{ExactMatch(countryId, building1), ExactMatch(countryId, street)}; + TEST(ResultsMatch("Зеленоград к200 генералова ", "ru", rules), ()); } { - TRules rules{ExactMatch(countryId, building2)}; - TEST(ResultsMatch("Зеленоград 300 строение 400 генералова", "ru", rules), ()); + TRules rules{ExactMatch(countryId, building2), ExactMatch(countryId, street)}; + TEST(ResultsMatch("Зеленоград 300 строение 400 генералова ", "ru", rules), ()); } { - TRules rules{}; + TRules rules{ExactMatch(countryId, street)}; TEST(ResultsMatch("Зеленоград генералова строе 300", "ru", rules), ()); } { - TRules rules{ExactMatch(countryId, building2)}; + TRules rules{ExactMatch(countryId, building2), ExactMatch(countryId, street)}; TEST(ResultsMatch("Зеленоград генералова 300 строе", "ru", rules), ()); } } @@ -582,11 +585,11 @@ UNIT_CLASS_TEST(ProcessorTest, TestPostcodes) } { - TRules rules{ExactMatch(countryId, building28)}; - TEST(ResultsMatch("Долгопрудный первомайская 28а", "ru", rules), ()); + TRules rules{ExactMatch(countryId, building28), ExactMatch(countryId, street)}; + TEST(ResultsMatch("Долгопрудный первомайская 28а ", "ru", rules), ()); } { - TRules rules{ExactMatch(countryId, building28)}; + TRules rules{ExactMatch(countryId, building28), ExactMatch(countryId, street)}; TEST(ResultsMatch("Долгопрудный первомайская 28а, 141701", "ru", rules), ()); } { @@ -595,7 +598,7 @@ UNIT_CLASS_TEST(ProcessorTest, TestPostcodes) TEST(ResultsMatch("Долгопрудный первомайская 141701", "ru", rules), ()); } { - TRules rules{ExactMatch(countryId, building31)}; + TRules rules{ExactMatch(countryId, building31), ExactMatch(countryId, street)}; TEST(ResultsMatch("Долгопрудный первомайская 141702", "ru", rules), ()); } { @@ -929,8 +932,9 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch) SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0))); { - TRules rules = {ExactMatch(id, bar)}; - TEST(ResultsMatch("москва черчилль", "ru", rules), ()); + TRules rulesWithoutStreet = {ExactMatch(id, bar)}; + TRules rules = {ExactMatch(id, bar), ExactMatch(id, street)}; + TEST(ResultsMatch("москва черчилль", "ru", rulesWithoutStreet), ()); TEST(ResultsMatch("москва ленинградский черчилль", "ru", rules), ()); TEST(ResultsMatch("москва ленинградский паб черчилль", "ru", rules), ()); @@ -938,12 +942,12 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch) TEST(ResultsMatch("масква ленинргадский черчиль", "ru", rules), ()); // Too many errors, can't do anything. - TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ()); + TEST(ResultsMatch("масква лениноргадсский чирчиль", "ru", TRules{}), ()); TEST(ResultsMatch("моксва ленинргадский черчиль", "ru", rules), ()); - TEST(ResultsMatch("food", "ru", rules), ()); - TEST(ResultsMatch("foood", "ru", rules), ()); + TEST(ResultsMatch("food", "ru", rulesWithoutStreet), ()); + TEST(ResultsMatch("foood", "ru", rulesWithoutStreet), ()); TEST(ResultsMatch("fod", "ru", TRules{}), ()); TRules rulesMetro = {ExactMatch(id, metro)}; @@ -1058,11 +1062,13 @@ UNIT_CLASS_TEST(ProcessorTest, TestWeirdTypes) TEST(ResultsMatch("除細動器", "ja", rules), ()); TRules onlyFirst{ExactMatch(countryId, defibrillator1)}; + TRules firstWithStreet{ExactMatch(countryId, defibrillator1), ExactMatch(countryId, street)}; + // City + category. Only the first defibrillator is inside. - TEST(ResultsMatch("東京 除細動器", "ja", onlyFirst), ()); + TEST(ResultsMatch("東京 除細動器 ", "ja", onlyFirst), ()); // City + street + category. - TEST(ResultsMatch("東京 竹下通り 除細動器", "ja", onlyFirst), ()); + TEST(ResultsMatch("東京 竹下通り 除細動器 ", "ja", firstWithStreet), ()); } { @@ -1191,5 +1197,94 @@ UNIT_CLASS_TEST(ProcessorTest, CityBoundarySmoke) } } } + +// Tests for the non-strict aspects of retrieval. +// Currently, the only possible non-strictness is that +// some tokens in the query may be ignored, +// which results in a pruned parse tree for the query. +UNIT_CLASS_TEST(ProcessorTest, RelaxedRetrieval) +{ + string const countryName = "Wonderland"; + TestCountry country(m2::PointD(10.0, 10.0), countryName, "en"); + + TestCity city({{-10.0, -10.0}, {10.0, -10.0}, {10.0, 10.0}, {-10.0, 10.0}} /* boundary */, + "Sick City", "en", 255 /* rank */); + + TestStreet street(vector{m2::PointD(-1.0, 0.0), m2::PointD(1.0, 0.0)}, "Queer Street", + "en"); + TestBuilding building0(m2::PointD(-1.0, 0.0), "" /* name */, "0", street, "en"); + TestBuilding building1(m2::PointD(1.0, 0.0), "", "1", street, "en"); + TestBuilding building2(m2::PointD(2.0, 0.0), "named building", "" /* house number */, "en"); + TestBuilding building3(m2::PointD(3.0, 0.0), "named building", "", "en"); + + TestPOI poi0(m2::PointD(-1.0, 0.0), "Farmacia de guardia", "en"); + poi0.SetTypes({{"amenity", "pharmacy"}}); + + // A poi inside building2. + TestPOI poi2(m2::PointD(2.0, 0.0), "Post box", "en"); + poi2.SetTypes({{"amenity", "post_box"}}); + + auto countryId = BuildCountry(countryName, [&](TestMwmBuilder & builder) { + builder.Add(street); + builder.Add(building0); + builder.Add(building1); + builder.Add(poi0); + }); + RegisterCountry(countryName, m2::RectD(m2::PointD(-10.0, -10.0), m2::PointD(10.0, 10.0))); + + auto worldId = BuildWorld([&](TestMwmBuilder & builder) { + builder.Add(country); + builder.Add(city); + }); + + { + TRules rulesStrict = {ExactMatch(countryId, building0)}; + TRules rulesRelaxed = {ExactMatch(countryId, street)}; + + // "street" instead of "street-building" + TEST(ResultsMatch("queer street 0 ", rulesStrict), ()); + TEST(ResultsMatch("queer street ", rulesRelaxed), ()); + TEST(ResultsMatch("queer street 2 ", rulesRelaxed), ()); + } + + { + TRules rulesStrict = {ExactMatch(countryId, poi0), ExactMatch(countryId, street)}; + TRules rulesRelaxed = {ExactMatch(countryId, street)}; + + // "country-city-street" instead of "country-city-street-poi" + TEST(ResultsMatch("wonderland sick city queer street pharmacy ", rulesStrict), ()); + TEST(ResultsMatch("wonderland sick city queer street school ", rulesRelaxed), ()); + } + + { + TRules rulesStrict = {ExactMatch(countryId, street)}; + TRules rulesRelaxed = {}; + + // Cities and larger toponyms should not be relaxed. + // "city" instead of "city-street" + TEST(ResultsMatch("sick city queer street ", rulesStrict), ()); + TEST(ResultsMatch("sick city sick street ", rulesRelaxed), ()); + } + + { + TRules rulesStrict = {ExactMatch(countryId, street)}; + TRules rulesRelaxed = {}; + + // Should not be relaxed. + // "country-city" instead of "country-city-street" + TEST(ResultsMatch("wonderland sick city queer street ", rulesStrict), ()); + TEST(ResultsMatch("wonderland sick city other street ", rulesRelaxed), ()); + } + + { + TRules rulesStrict = {ExactMatch(countryId, poi0)}; + TRules rulesRelaxed = {}; + + // Should not be relaxed. + // "city" instead of "city-poi" + TEST(ResultsMatch("sick city pharmacy ", rulesStrict), ()); + TEST(ResultsMatch("sick city library ", rulesRelaxed), ()); + } +} } // namespace } // namespace search diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py index 1f26328d46..6c20a2d257 100755 --- a/search/search_quality/scoring_model.py +++ b/search/search_quality/scoring_model.py @@ -19,7 +19,7 @@ MAX_RANK = 255 RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3} NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match'] SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country'] -FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats', 'ErrorsMade'] + NAME_SCORES + SEARCH_TYPES +FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats', 'ErrorsMade', 'AllTokensUsed'] + NAME_SCORES + SEARCH_TYPES BOOTSTRAP_ITERATIONS = 10000 @@ -37,6 +37,7 @@ def normalize_data(data): data['DistanceToPivot'] = data['DistanceToPivot'].apply(transform_distance) data['Rank'] = data['Rank'].apply(lambda v: v / MAX_RANK) data['Relevance'] = data['Relevance'].apply(lambda v: RELEVANCES[v]) + data['AllTokensUsed'] = data['AllTokensUsed'].apply(lambda v : +1 if bool(v) else -1) cats = data['PureCats'].combine(data['FalseCats'], max)