[search] Relaxed parsing in Geocoder.

This commit is contained in:
Maxim Pimenov 2017-11-24 20:35:12 +03:00 committed by Yuri Gorshenin
parent c7f14933ec
commit 79dc8f45c5
8 changed files with 300 additions and 116 deletions

View file

@ -315,7 +315,8 @@ CBV DecimateCianResults(CBV const & cbv)
// to worsen the perceived result.
size_t const kMaxCianResults = 10000;
minstd_rand rng(0);
auto survivedIds = base::RandomSample(cbv.PopCount(), kMaxCianResults, rng);
auto survivedIds =
base::RandomSample(base::checked_cast<size_t>(cbv.PopCount()), kMaxCianResults, rng);
sort(survivedIds.begin(), survivedIds.end());
auto it = survivedIds.begin();
vector<uint64_t> setBits;
@ -767,7 +768,7 @@ void Geocoder::ForEachCountry(vector<shared_ptr<MwmInfo>> const & infos, TFn &&
}
}
void Geocoder::MatchCategories(BaseContext & ctx, bool aroundPivot)
size_t Geocoder::MatchCategories(BaseContext & ctx, bool aroundPivot)
{
auto features = ctx.m_features[0];
@ -778,13 +779,16 @@ void Geocoder::MatchCategories(BaseContext & ctx, bool aroundPivot)
features = filter.Filter(features);
}
size_t numEmitted = 0;
auto emit = [&](uint64_t bit) {
auto const featureId = base::asserted_cast<uint32_t>(bit);
Model::Type type;
if (!GetTypeInGeocoding(ctx, featureId, type))
return;
EmitResult(ctx, m_context->GetId(), featureId, type, TokenRange(0, 1), nullptr /* geoParts */);
EmitResult(ctx, m_context->GetId(), featureId, type, TokenRange(0, 1), nullptr /* geoParts */,
true /* allTokensUsed */);
++numEmitted;
};
// By now there's only one token and zero prefix tokens.
@ -792,23 +796,27 @@ void Geocoder::MatchCategories(BaseContext & ctx, bool aroundPivot)
// using the exact (non-fuzzy) matching and intersected
// with viewport, if needed. Every such feature is relevant.
features.ForEach(emit);
return numEmitted;
}
void Geocoder::MatchRegions(BaseContext & ctx, Region::Type type)
size_t Geocoder::MatchRegions(BaseContext & ctx, Region::Type type)
{
size_t numEmitted = 0;
switch (type)
{
case Region::TYPE_STATE:
// Tries to skip state matching and go to cities matching.
// Then, performs states matching.
MatchCities(ctx);
numEmitted += MatchCities(ctx);
break;
case Region::TYPE_COUNTRY:
// Tries to skip country matching and go to states matching.
// Then, performs countries matching.
MatchRegions(ctx, Region::TYPE_STATE);
numEmitted += MatchRegions(ctx, Region::TYPE_STATE);
break;
case Region::TYPE_COUNT: ASSERT(false, ("Invalid region type.")); return;
case Region::TYPE_COUNT: ASSERT(false, ("Invalid region type.")); return numEmitted;
}
auto const & regions = m_regions[type];
@ -850,27 +858,32 @@ void Geocoder::MatchRegions(BaseContext & ctx, Region::Type type)
MY_SCOPE_GUARD(cleanup, [&ctx]() { ctx.m_regions.pop_back(); });
ScopedMarkTokens mark(ctx.m_tokens, BaseContext::FromRegionType(type), tokenRange);
if (ctx.AllTokensUsed())
{
// Region matches the search query, so we need to emit it as is.
EmitResult(ctx, region, tokenRange);
EmitResult(ctx, region, tokenRange, true /* allTokensUsed */);
continue;
}
switch (type)
{
case Region::TYPE_STATE: MatchCities(ctx); break;
case Region::TYPE_COUNTRY: MatchRegions(ctx, Region::TYPE_STATE); break;
case Region::TYPE_STATE: numEmitted += MatchCities(ctx); break;
case Region::TYPE_COUNTRY: numEmitted += MatchRegions(ctx, Region::TYPE_STATE); break;
case Region::TYPE_COUNT: ASSERT(false, ("Invalid region type.")); break;
}
}
}
return numEmitted;
}
void Geocoder::MatchCities(BaseContext & ctx)
size_t Geocoder::MatchCities(BaseContext & ctx)
{
ASSERT(!ctx.m_city, ());
size_t numEmitted = 0;
// Localities are ordered by (m_startToken, m_endToken) pairs.
for (auto const & p : m_cities)
{
@ -895,7 +908,8 @@ void Geocoder::MatchCities(BaseContext & ctx)
if (ctx.AllTokensUsed())
{
// City matches the search query, so we need to emit it as is.
EmitResult(ctx, city, tokenRange);
EmitResult(ctx, city, tokenRange, true /* allTokensUsed */);
++numEmitted;
continue;
}
@ -909,19 +923,21 @@ void Geocoder::MatchCities(BaseContext & ctx)
continue;
LocalityFilter filter(cityFeatures);
LimitedSearch(ctx, filter);
numEmitted += LimitedSearch(ctx, filter);
}
}
return numEmitted;
}
void Geocoder::MatchAroundPivot(BaseContext & ctx)
size_t Geocoder::MatchAroundPivot(BaseContext & ctx)
{
auto const features = RetrieveGeometryFeatures(*m_context, m_params.m_pivot, RECT_ID_PIVOT);
ViewportFilter filter(features, m_preRanker.Limit() /* threshold */);
LimitedSearch(ctx, filter);
return LimitedSearch(ctx, filter);
}
void Geocoder::LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter)
size_t Geocoder::LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter)
{
m_filter = &filter;
MY_SCOPE_GUARD(resetFilter, [&]()
@ -932,16 +948,19 @@ void Geocoder::LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter)
if (!ctx.m_streets)
ctx.m_streets = m_streetsCache.Get(*m_context);
MatchUnclassified(ctx, 0 /* curToken */);
size_t numEmitted = 0;
auto const search = [this, &ctx]()
{
GreedilyMatchStreets(ctx);
MatchPOIsAndBuildings(ctx, 0 /* curToken */);
numEmitted += MatchUnclassified(ctx, 0 /* curToken */);
auto const search = [this, &ctx, &numEmitted]() {
numEmitted += GreedilyMatchStreets(ctx);
numEmitted += MatchPOIsAndBuildings(ctx, 0 /* curToken */);
};
WithPostcodes(ctx, search);
search();
return numEmitted;
}
template <typename TFn>
@ -983,17 +1002,20 @@ void Geocoder::WithPostcodes(BaseContext & ctx, TFn && fn)
}
}
void Geocoder::GreedilyMatchStreets(BaseContext & ctx)
size_t Geocoder::GreedilyMatchStreets(BaseContext & ctx)
{
vector<StreetsMatcher::Prediction> predictions;
StreetsMatcher::Go(ctx, *m_filter, m_params, predictions);
size_t numEmitted = 0;
for (auto const & prediction : predictions)
CreateStreetsLayerAndMatchLowerLayers(ctx, prediction);
numEmitted += CreateStreetsLayerAndMatchLowerLayers(ctx, prediction);
return numEmitted;
}
void Geocoder::CreateStreetsLayerAndMatchLowerLayers(BaseContext & ctx,
StreetsMatcher::Prediction const & prediction)
size_t Geocoder::CreateStreetsLayerAndMatchLowerLayers(
BaseContext & ctx, StreetsMatcher::Prediction const & prediction)
{
auto & layers = ctx.m_layers;
ASSERT(layers.empty(), ());
@ -1005,20 +1027,28 @@ void Geocoder::CreateStreetsLayerAndMatchLowerLayers(BaseContext & ctx,
InitLayer(Model::TYPE_STREET, prediction.m_tokenRange, layer);
vector<uint32_t> sortedFeatures;
sortedFeatures.reserve(prediction.m_features.PopCount());
sortedFeatures.reserve(base::checked_cast<size_t>(prediction.m_features.PopCount()));
prediction.m_features.ForEach([&sortedFeatures](uint64_t bit) {
sortedFeatures.push_back(base::asserted_cast<uint32_t>(bit));
});
layer.m_sortedFeatures = &sortedFeatures;
ScopedMarkTokens mark(ctx.m_tokens, BaseContext::TOKEN_TYPE_STREET, prediction.m_tokenRange);
MatchPOIsAndBuildings(ctx, 0 /* curToken */);
size_t numEmitted = MatchPOIsAndBuildings(ctx, 0 /* curToken */);
// A relaxed best effort parse: at least show the street if we can find one.
if (numEmitted == 0)
numEmitted += FindPaths(ctx);
return numEmitted;
}
void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken)
size_t Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken)
{
BailIfCancelled();
size_t numEmitted = 0;
auto & layers = ctx.m_layers;
curToken = ctx.SkipUsedTokens(curToken);
@ -1027,7 +1057,10 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken)
// All tokens were consumed, find paths through layers, emit
// features.
if (m_postcodes.m_features.IsEmpty())
return FindPaths(ctx);
{
numEmitted += FindPaths(ctx);
return numEmitted;
}
// When there are no layers but user entered a postcode, we have
// to emit all features matching to the postcode.
@ -1042,14 +1075,18 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken)
if (GetTypeInGeocoding(ctx, featureId, type))
{
EmitResult(ctx, m_context->GetId(), featureId, type, m_postcodes.m_tokenRange,
nullptr /* geoParts */);
nullptr /* geoParts */, true /* allTokensUsed */);
++numEmitted;
}
});
return;
return numEmitted;
}
if (!(layers.size() == 1 && layers[0].m_type == Model::TYPE_STREET))
return FindPaths(ctx);
{
numEmitted += FindPaths(ctx);
return numEmitted;
}
// If there's only one street layer but the user also entered a
// postcode, we need to emit all features matching to postcode on
@ -1064,7 +1101,8 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken)
if (!m_postcodes.m_features.HasBit(id))
continue;
EmitResult(ctx, m_context->GetId(), id, Model::TYPE_STREET, layers.back().m_tokenRange,
nullptr /* geoParts */);
nullptr /* geoParts */, true /* allTokensUsed */);
++numEmitted;
}
}
@ -1081,7 +1119,8 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken)
features.push_back(base::asserted_cast<uint32_t>(bit));
});
layer.m_sortedFeatures = &features;
return FindPaths(ctx);
numEmitted += FindPaths(ctx);
return numEmitted;
}
layers.emplace_back();
@ -1135,7 +1174,6 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken)
bool const looksLikeHouseNumber = house_numbers::LooksLikeHouseNumber(
layers.back().m_subQuery, layers.back().m_lastTokenIsPrefix);
if (filtered.IsEmpty() && !looksLikeHouseNumber)
break;
@ -1197,9 +1235,11 @@ void Geocoder::MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken)
ScopedMarkTokens mark(ctx.m_tokens, BaseContext::FromModelType(layer.m_type),
TokenRange(curToken, curToken + n));
if (IsLayerSequenceSane(layers))
MatchPOIsAndBuildings(ctx, curToken + n);
numEmitted += MatchPOIsAndBuildings(ctx, curToken + n);
}
}
return numEmitted;
}
bool Geocoder::IsLayerSequenceSane(vector<FeaturesLayer> const & layers) const
@ -1244,12 +1284,14 @@ bool Geocoder::IsLayerSequenceSane(vector<FeaturesLayer> const & layers) const
return true;
}
void Geocoder::FindPaths(BaseContext const & ctx)
size_t Geocoder::FindPaths(BaseContext const & ctx)
{
auto const & layers = ctx.m_layers;
size_t numEmitted = 0;
if (layers.empty())
return;
return numEmitted;
// Layers ordered by search type.
vector<FeaturesLayer const *> sortedLayers;
@ -1264,13 +1306,17 @@ void Geocoder::FindPaths(BaseContext const & ctx)
m_matcher->SetPostcodes(&m_postcodes.m_features);
else
m_matcher->SetPostcodes(nullptr);
m_finder.ForEachReachableVertex(
*m_matcher, sortedLayers, [this, &ctx, &innermostLayer](IntersectionResult const & result)
{
*m_matcher, sortedLayers,
[this, &ctx, &innermostLayer, &numEmitted](IntersectionResult const & result) {
ASSERT(result.IsValid(), ());
EmitResult(ctx, m_context->GetId(), result.InnermostResult(), innermostLayer.m_type,
innermostLayer.m_tokenRange, &result);
innermostLayer.m_tokenRange, &result, ctx.AllTokensUsed());
++numEmitted;
});
return numEmitted;
}
void Geocoder::TraceResult(Tracer & tracer, BaseContext const & ctx, MwmSet::MwmId const & mwmId,
@ -1298,7 +1344,7 @@ void Geocoder::TraceResult(Tracer & tracer, BaseContext const & ctx, MwmSet::Mwm
void Geocoder::EmitResult(BaseContext const & ctx, MwmSet::MwmId const & mwmId, uint32_t ftId,
Model::Type type, TokenRange const & tokenRange,
IntersectionResult const * geoParts)
IntersectionResult const * geoParts, bool allTokensUsed)
{
FeatureID id(mwmId, ftId);
@ -1333,26 +1379,32 @@ void Geocoder::EmitResult(BaseContext const & ctx, MwmSet::MwmId const & mwmId,
if (geoParts)
info.m_geoParts = *geoParts;
info.m_allTokensUsed = allTokensUsed;
m_preRanker.Emplace(id, info);
}
void Geocoder::EmitResult(BaseContext const & ctx, Region const & region,
TokenRange const & tokenRange)
TokenRange const & tokenRange, bool allTokensUsed)
{
auto const type = Region::ToModelType(region.m_type);
EmitResult(ctx, region.m_countryId, region.m_featureId, type, tokenRange, nullptr /* geoParts */);
EmitResult(ctx, region.m_countryId, region.m_featureId, type, tokenRange, nullptr /* geoParts */,
allTokensUsed);
}
void Geocoder::EmitResult(BaseContext const & ctx, City const & city, TokenRange const & tokenRange)
void Geocoder::EmitResult(BaseContext const & ctx, City const & city, TokenRange const & tokenRange,
bool allTokensUsed)
{
EmitResult(ctx, city.m_countryId, city.m_featureId, city.m_type, tokenRange,
nullptr /* geoParts */);
nullptr /* geoParts */, allTokensUsed);
}
void Geocoder::MatchUnclassified(BaseContext & ctx, size_t curToken)
size_t Geocoder::MatchUnclassified(BaseContext & ctx, size_t curToken)
{
size_t numEmitted = 0;
if (m_params.m_cianMode)
return;
return numEmitted;
ASSERT(ctx.m_layers.empty(), ());
@ -1364,7 +1416,7 @@ void Geocoder::MatchUnclassified(BaseContext & ctx, size_t curToken)
// ok to match something to "Park London Hyde", because tokens
// "Park" and "Hyde" are not adjacent.
if (ctx.NumUnusedTokenGroups() != 1)
return;
return numEmitted;
CBV allFeatures;
allFeatures.SetFull();
@ -1388,10 +1440,12 @@ void Geocoder::MatchUnclassified(BaseContext & ctx, size_t curToken)
if (type == Model::TYPE_UNCLASSIFIED)
{
EmitResult(ctx, m_context->GetId(), featureId, type, TokenRange(startToken, curToken),
nullptr /* geoParts */);
nullptr /* geoParts */, true /* allTokensUsed */);
++numEmitted;
}
};
allFeatures.ForEach(emitUnclassified);
return numEmitted;
}
CBV Geocoder::RetrievePostcodeFeatures(MwmContext const & context, TokenSlice const & slice)

View file

@ -167,41 +167,49 @@ private:
inline void BailIfCancelled() { ::search::BailIfCancelled(m_cancellable); }
// A fast-path branch for categorial requests.
void MatchCategories(BaseContext & ctx, bool aroundPivot);
// Returns the number of emitted results.
size_t MatchCategories(BaseContext & ctx, bool aroundPivot);
// Tries to find all countries and states in a search query and then
// performs matching of cities in found maps.
void MatchRegions(BaseContext & ctx, Region::Type type);
// Returns the number of emitted results.
size_t MatchRegions(BaseContext & ctx, Region::Type type);
// Tries to find all cities in a search query and then performs
// matching of streets in found cities.
void MatchCities(BaseContext & ctx);
// Returns the number of emitted results.
size_t MatchCities(BaseContext & ctx);
// Tries to do geocoding without localities, i.e. find POIs,
// BUILDINGs and STREETs without knowledge about country, state,
// city or village. If during the geocoding too many features are
// retrieved, viewport is used to throw away excess features.
void MatchAroundPivot(BaseContext & ctx);
// Returns the number of emitted results.
size_t MatchAroundPivot(BaseContext & ctx);
// Tries to do geocoding in a limited scope, assuming that knowledge
// about high-level features, like cities or countries, is
// incorporated into |filter|.
void LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter);
// Returns the number of emitted results.
size_t LimitedSearch(BaseContext & ctx, FeaturesFilter const & filter);
template <typename TFn>
void WithPostcodes(BaseContext & ctx, TFn && fn);
// Tries to match some adjacent tokens in the query as streets and
// then performs geocoding in street vicinities.
void GreedilyMatchStreets(BaseContext & ctx);
// Returns the number of emitted results.
size_t GreedilyMatchStreets(BaseContext & ctx);
void CreateStreetsLayerAndMatchLowerLayers(BaseContext & ctx,
StreetsMatcher::Prediction const & prediction);
// Returns the number of emitted results.
size_t CreateStreetsLayerAndMatchLowerLayers(BaseContext & ctx,
StreetsMatcher::Prediction const & prediction);
// Tries to find all paths in a search tree, where each edge is
// marked with some substring of the query tokens. These paths are
// called "layer sequence" and current path is stored in |m_layers|.
void MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken);
// Returns the number of emitted results.
size_t MatchPOIsAndBuildings(BaseContext & ctx, size_t curToken);
// Returns true if current path in the search tree (see comment for
// MatchPOIsAndBuildings()) looks sane. This method is used as a fast
@ -210,7 +218,8 @@ private:
// Finds all paths through layers and emits reachable features from
// the lowest layer.
void FindPaths(BaseContext const & ctx);
// Returns the number of emitted results.
size_t FindPaths(BaseContext const & ctx);
void TraceResult(Tracer & tracer, BaseContext const & ctx, MwmSet::MwmId const & mwmId,
uint32_t ftId, Model::Type type, TokenRange const & tokenRange);
@ -218,14 +227,17 @@ private:
// Forms result and feeds it to |m_preRanker|.
void EmitResult(BaseContext const & ctx, MwmSet::MwmId const & mwmId, uint32_t ftId,
Model::Type type, TokenRange const & tokenRange,
IntersectionResult const * geoParts);
void EmitResult(BaseContext const & ctx, Region const & region, TokenRange const & tokenRange);
void EmitResult(BaseContext const & ctx, City const & city, TokenRange const & tokenRange);
IntersectionResult const * geoParts, bool allTokensUsed);
void EmitResult(BaseContext const & ctx, Region const & region, TokenRange const & tokenRange,
bool allTokensUsed);
void EmitResult(BaseContext const & ctx, City const & city, TokenRange const & tokenRange,
bool allTokensUsed);
// Tries to match unclassified objects from lower layers, like
// parks, forests, lakes, rivers, etc. This method finds all
// UNCLASSIFIED objects that match to all currently unused tokens.
void MatchUnclassified(BaseContext & ctx, size_t curToken);
// Returns the number of emitted results.
size_t MatchUnclassified(BaseContext & ctx, size_t curToken);
// A wrapper around RetrievePostcodeFeatures.
CBV RetrievePostcodeFeatures(MwmContext const & context, TokenSlice const & slice);

View file

@ -22,13 +22,13 @@ struct PreRankingInfo
m_tokenRange[m_type] = range;
}
inline TokenRange const & InnermostTokenRange() const
TokenRange const & InnermostTokenRange() const
{
ASSERT_LESS(m_type, Model::TYPE_COUNT, ());
return m_tokenRange[m_type];
}
inline size_t GetNumTokens() const { return InnermostTokenRange().Size(); }
size_t GetNumTokens() const { return InnermostTokenRange().Size(); }
// An abstract distance from the feature to the pivot. Measurement
// units do not matter here.
@ -44,6 +44,10 @@ struct PreRankingInfo
// building and street ids are in |m_geoParts|.
IntersectionResult m_geoParts;
// True iff all tokens that are not stop-words
// were used when retrieving the feature.
bool m_allTokensUsed = true;
// Rank of the feature.
uint8_t m_rank = 0;

View file

@ -3,6 +3,8 @@
#include "search/emitter.hpp"
#include "search/geometry_utils.hpp"
#include "search/highlighting.hpp"
#include "search/model.hpp"
#include "search/pre_ranking_info.hpp"
#include "search/token_slice.hpp"
#include "search/utils.hpp"
@ -231,6 +233,7 @@ class RankerResultMaker
info.m_distanceToPivot = MercatorBounds::DistanceOnEarth(center, pivot);
info.m_rank = preInfo.m_rank;
info.m_type = preInfo.m_type;
info.m_allTokensUsed = preInfo.m_allTokensUsed;
auto const nameScores = GetNameScores(ft, m_params, preInfo.InnermostTokenRange(), info.m_type);

View file

@ -12,25 +12,26 @@ namespace
{
// See search/search_quality/scoring_model.py for details. In short,
// these coeffs correspond to coeffs in a linear model.
double const kDistanceToPivot = -1.0000000;
double const kRank = 0.5238890;
double const kFalseCats = -0.7319971;
double const kErrorsMade = -0.0238639;
double const kDistanceToPivot = -0.3359819;
double const kRank = 0.3886029;
double const kFalseCats = 0.0000000;
double const kErrorsMade = 0.0201364;
double const kAllTokensUsed = 1.0000000;
double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
-0.1683931 /* Zero */,
0.0268117 /* Substring */,
0.0599575 /* Prefix */,
0.0816240 /* Full Match */
-0.6731264 /* Zero */,
0.2244507 /* Substring */,
0.2141080 /* Prefix */,
0.2345677 /* Full Match */
};
double const kType[Model::TYPE_COUNT] = {
-0.4322325 /* POI */,
-0.4322325 /* Building */,
-0.3823704 /* Street */,
-0.3747346 /* Unclassified */,
-0.4453585 /* Village */,
0.3900264 /* City */,
0.5397572 /* State */,
0.7049124 /* Country */
-0.1749965 /* POI */,
-0.1749965 /* Building */,
-0.0777042 /* Street */,
-0.0695158 /* Unclassified */,
-0.1233553 /* Village */,
0.0391744 /* City */,
0.1592614 /* State */,
0.2471361 /* Country */
};
double TransformDistance(double distance)
@ -51,7 +52,8 @@ void RankingInfo::PrintCSVHeader(ostream & os)
<< ",ErrorsMade"
<< ",SearchType"
<< ",PureCats"
<< ",FalseCats";
<< ",FalseCats"
<< ",AllTokensUsed";
}
string DebugPrint(RankingInfo const & info)
@ -64,7 +66,8 @@ string DebugPrint(RankingInfo const & info)
os << "m_errorsMade:" << DebugPrint(info.m_errorsMade) << ",";
os << "m_type:" << DebugPrint(info.m_type) << ",";
os << "m_pureCats:" << info.m_pureCats << ",";
os << "m_falseCats:" << info.m_falseCats;
os << "m_falseCats:" << info.m_falseCats << ",";
os << "m_allTokensUsed:" << boolalpha << info.m_allTokensUsed;
os << "]";
return os.str();
}
@ -78,7 +81,8 @@ void RankingInfo::ToCSV(ostream & os) const
os << GetErrorsMade() << ",";
os << DebugPrint(m_type) << ",";
os << m_pureCats << ",";
os << m_falseCats;
os << m_falseCats << ",";
os << (m_allTokensUsed ? 1 : 0);
}
double RankingInfo::GetLinearModelRank() const
@ -102,8 +106,15 @@ double RankingInfo::GetLinearModelRank() const
nameScore = NAME_SCORE_ZERO;
}
return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] +
kErrorsMade * GetErrorsMade() + kType[m_type] + m_falseCats * kFalseCats;
double result = 0.0;
result += kDistanceToPivot * distanceToPivot;
result += kRank * rank;
result += kNameScore[nameScore];
result += kErrorsMade * GetErrorsMade();
result += kType[m_type];
result += m_falseCats * kFalseCats;
result += (m_allTokensUsed ? +1.0 : -1.0) * kAllTokensUsed;
return result;
}
size_t RankingInfo::GetErrorsMade() const

View file

@ -29,6 +29,10 @@ struct RankingInfo
// Number of typos.
ErrorsMade m_errorsMade;
// True iff all tokens that are not stop-words
// were used when retrieving the feature.
bool m_allTokensUsed = true;
// Search type for the feature.
Model::Type m_type = Model::TYPE_COUNT;

View file

@ -192,23 +192,25 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke)
TEST(ResultsMatch("feynman street 3", rules), ());
}
{
TRules rules = {ExactMatch(wonderlandId, feynmanHouse), ExactMatch(wonderlandId, lantern1)};
TRules rules = {ExactMatch(wonderlandId, feynmanHouse), ExactMatch(wonderlandId, lantern1),
ExactMatch(wonderlandId, firstAprilStreet)};
TEST(ResultsMatch("feynman street 1", rules), ());
}
{
TRules rules = {ExactMatch(wonderlandId, bohrHouse), ExactMatch(wonderlandId, hilbertHouse),
ExactMatch(wonderlandId, lantern1)};
ExactMatch(wonderlandId, lantern1), ExactMatch(wonderlandId, firstAprilStreet)};
TEST(ResultsMatch("bohr street 1", rules), ());
}
{
TEST(ResultsMatch("bohr street 1 unit 3", TRules()), ());
TEST(ResultsMatch("bohr street 1 unit 3", {ExactMatch(wonderlandId, bohrStreet1)}), ());
}
{
TRules rules = {ExactMatch(wonderlandId, lantern1), ExactMatch(wonderlandId, lantern2)};
TEST(ResultsMatch("bohr street 1 lantern ", rules), ());
}
{
TRules rules = {ExactMatch(wonderlandId, feynmanHouse)};
TRules rules = {ExactMatch(wonderlandId, feynmanHouse),
ExactMatch(wonderlandId, feynmanStreet)};
TEST(ResultsMatch("wonderland los alamos feynman 1 unit 1 ", rules), ());
}
{
@ -224,12 +226,13 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke)
TEST(ResultsMatch("Los Alamos 2", rules), ());
}
{
TRules rules = {ExactMatch(wonderlandId, bornHouse)};
TEST(ResultsMatch("long pond 1st april street 8", rules), ());
TRules rules = {ExactMatch(wonderlandId, bornHouse),
ExactMatch(wonderlandId, firstAprilStreet)};
TEST(ResultsMatch("long pond 1st april street 8 ", rules), ());
}
{
TRules rules = {ExactMatch(wonderlandId, terranceHouse)};
TRules rules = {ExactMatch(wonderlandId, terranceHouse), ExactMatch(wonderlandId, stradaDrive)};
TEST(ResultsMatch("Toronto strada drive 155", rules), ());
}
}
@ -483,27 +486,27 @@ UNIT_CLASS_TEST(ProcessorTest, TestHouseNumbers)
});
{
TRules rules{ExactMatch(countryId, building0)};
TEST(ResultsMatch("Зеленоград генералова к100", "ru", rules), ());
TRules rules{ExactMatch(countryId, building0), ExactMatch(countryId, street)};
TEST(ResultsMatch("Зеленоград генералова к100 ", "ru", rules), ());
}
{
TRules rules{ExactMatch(countryId, building1)};
TEST(ResultsMatch("Зеленоград генералова к200", "ru", rules), ());
TRules rules{ExactMatch(countryId, building1), ExactMatch(countryId, street)};
TEST(ResultsMatch("Зеленоград генералова к200 ", "ru", rules), ());
}
{
TRules rules{ExactMatch(countryId, building1)};
TEST(ResultsMatch("Зеленоград к200 генералова", "ru", rules), ());
TRules rules{ExactMatch(countryId, building1), ExactMatch(countryId, street)};
TEST(ResultsMatch("Зеленоград к200 генералова ", "ru", rules), ());
}
{
TRules rules{ExactMatch(countryId, building2)};
TEST(ResultsMatch("Зеленоград 300 строение 400 генералова", "ru", rules), ());
TRules rules{ExactMatch(countryId, building2), ExactMatch(countryId, street)};
TEST(ResultsMatch("Зеленоград 300 строение 400 генералова ", "ru", rules), ());
}
{
TRules rules{};
TRules rules{ExactMatch(countryId, street)};
TEST(ResultsMatch("Зеленоград генералова строе 300", "ru", rules), ());
}
{
TRules rules{ExactMatch(countryId, building2)};
TRules rules{ExactMatch(countryId, building2), ExactMatch(countryId, street)};
TEST(ResultsMatch("Зеленоград генералова 300 строе", "ru", rules), ());
}
}
@ -582,11 +585,11 @@ UNIT_CLASS_TEST(ProcessorTest, TestPostcodes)
}
{
TRules rules{ExactMatch(countryId, building28)};
TEST(ResultsMatch("Долгопрудный первомайская 28а", "ru", rules), ());
TRules rules{ExactMatch(countryId, building28), ExactMatch(countryId, street)};
TEST(ResultsMatch("Долгопрудный первомайская 28а ", "ru", rules), ());
}
{
TRules rules{ExactMatch(countryId, building28)};
TRules rules{ExactMatch(countryId, building28), ExactMatch(countryId, street)};
TEST(ResultsMatch("Долгопрудный первомайская 28а, 141701", "ru", rules), ());
}
{
@ -595,7 +598,7 @@ UNIT_CLASS_TEST(ProcessorTest, TestPostcodes)
TEST(ResultsMatch("Долгопрудный первомайская 141701", "ru", rules), ());
}
{
TRules rules{ExactMatch(countryId, building31)};
TRules rules{ExactMatch(countryId, building31), ExactMatch(countryId, street)};
TEST(ResultsMatch("Долгопрудный первомайская 141702", "ru", rules), ());
}
{
@ -929,8 +932,9 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
{
TRules rules = {ExactMatch(id, bar)};
TEST(ResultsMatch("москва черчилль", "ru", rules), ());
TRules rulesWithoutStreet = {ExactMatch(id, bar)};
TRules rules = {ExactMatch(id, bar), ExactMatch(id, street)};
TEST(ResultsMatch("москва черчилль", "ru", rulesWithoutStreet), ());
TEST(ResultsMatch("москва ленинградский черчилль", "ru", rules), ());
TEST(ResultsMatch("москва ленинградский паб черчилль", "ru", rules), ());
@ -938,12 +942,12 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
TEST(ResultsMatch("масква ленинргадский черчиль", "ru", rules), ());
// Too many errors, can't do anything.
TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ());
TEST(ResultsMatch("масква лениноргадсский чирчиль", "ru", TRules{}), ());
TEST(ResultsMatch("моксва ленинргадский черчиль", "ru", rules), ());
TEST(ResultsMatch("food", "ru", rules), ());
TEST(ResultsMatch("foood", "ru", rules), ());
TEST(ResultsMatch("food", "ru", rulesWithoutStreet), ());
TEST(ResultsMatch("foood", "ru", rulesWithoutStreet), ());
TEST(ResultsMatch("fod", "ru", TRules{}), ());
TRules rulesMetro = {ExactMatch(id, metro)};
@ -1058,11 +1062,13 @@ UNIT_CLASS_TEST(ProcessorTest, TestWeirdTypes)
TEST(ResultsMatch("除細動器", "ja", rules), ());
TRules onlyFirst{ExactMatch(countryId, defibrillator1)};
TRules firstWithStreet{ExactMatch(countryId, defibrillator1), ExactMatch(countryId, street)};
// City + category. Only the first defibrillator is inside.
TEST(ResultsMatch("東京 除細動器", "ja", onlyFirst), ());
TEST(ResultsMatch("東京 除細動器 ", "ja", onlyFirst), ());
// City + street + category.
TEST(ResultsMatch("東京 竹下通り 除細動器", "ja", onlyFirst), ());
TEST(ResultsMatch("東京 竹下通り 除細動器 ", "ja", firstWithStreet), ());
}
{
@ -1191,5 +1197,94 @@ UNIT_CLASS_TEST(ProcessorTest, CityBoundarySmoke)
}
}
}
// Tests for the non-strict aspects of retrieval.
// Currently, the only possible non-strictness is that
// some tokens in the query may be ignored,
// which results in a pruned parse tree for the query.
// Checks relaxed retrieval: each scope below pairs a query that is expected
// to be matched strictly with a similar query that is expected to yield either
// a result for a pruned parse (some tokens ignored) or no results at all.
UNIT_CLASS_TEST(ProcessorTest, RelaxedRetrieval)
{
string const countryName = "Wonderland";
TestCountry country(m2::PointD(10.0, 10.0), countryName, "en");
TestCity city({{-10.0, -10.0}, {10.0, -10.0}, {10.0, 10.0}, {-10.0, 10.0}} /* boundary */,
"Sick City", "en", 255 /* rank */);
TestStreet street(vector<m2::PointD>{m2::PointD(-1.0, 0.0), m2::PointD(1.0, 0.0)}, "Queer Street",
"en");
// Two unnamed buildings with house numbers "0" and "1" attached to |street|.
TestBuilding building0(m2::PointD(-1.0, 0.0), "" /* name */, "0", street, "en");
TestBuilding building1(m2::PointD(1.0, 0.0), "", "1", street, "en");
// NOTE(review): building2, building3 and poi2 (below) are constructed but are
// never passed to builder.Add(), so they do not appear to be part of the
// generated map and no query in this test refers to them. Confirm whether
// they are placeholders for future checks or were meant to be added.
TestBuilding building2(m2::PointD(2.0, 0.0), "named building", "" /* house number */, "en");
TestBuilding building3(m2::PointD(3.0, 0.0), "named building", "", "en");
// A pharmacy placed at the same point as building0.
TestPOI poi0(m2::PointD(-1.0, 0.0), "Farmacia de guardia", "en");
poi0.SetTypes({{"amenity", "pharmacy"}});
// A poi inside building2.
TestPOI poi2(m2::PointD(2.0, 0.0), "Post box", "en");
poi2.SetTypes({{"amenity", "post_box"}});
// The country map contains only the street, the two numbered buildings and
// the pharmacy.
auto countryId = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
builder.Add(street);
builder.Add(building0);
builder.Add(building1);
builder.Add(poi0);
});
RegisterCountry(countryName, m2::RectD(m2::PointD(-10.0, -10.0), m2::PointD(10.0, 10.0)));
// The world map contains the country and the city. |worldId| is not
// referenced by any rule below.
auto worldId = BuildWorld([&](TestMwmBuilder & builder) {
builder.Add(country);
builder.Add(city);
});
// NOTE(review): every query below ends with a trailing space, presumably so
// that the last token is not treated as a prefix. Verify against the
// query tokenization rules.
{
TRules rulesStrict = {ExactMatch(countryId, building0)};
TRules rulesRelaxed = {ExactMatch(countryId, street)};
// "street" instead of "street-building"
// No building with house number "2" was added to the map, so the last query
// is expected to fall back to the street alone.
TEST(ResultsMatch("queer street 0 ", rulesStrict), ());
TEST(ResultsMatch("queer street ", rulesRelaxed), ());
TEST(ResultsMatch("queer street 2 ", rulesRelaxed), ());
}
{
TRules rulesStrict = {ExactMatch(countryId, poi0), ExactMatch(countryId, street)};
TRules rulesRelaxed = {ExactMatch(countryId, street)};
// "country-city-street" instead of "country-city-street-poi"
// The strict query is expected to return both the pharmacy and the street;
// the "school" query is expected to return the street only.
TEST(ResultsMatch("wonderland sick city queer street pharmacy ", rulesStrict), ());
TEST(ResultsMatch("wonderland sick city queer street school ", rulesRelaxed), ());
}
{
TRules rulesStrict = {ExactMatch(countryId, street)};
TRules rulesRelaxed = {};
// Cities and larger toponyms should not be relaxed.
// "city" instead of "city-street"
// Here |rulesRelaxed| is empty: the second query is expected to return
// nothing rather than the city alone.
TEST(ResultsMatch("sick city queer street ", rulesStrict), ());
TEST(ResultsMatch("sick city sick street ", rulesRelaxed), ());
}
{
TRules rulesStrict = {ExactMatch(countryId, street)};
TRules rulesRelaxed = {};
// Should not be relaxed.
// "country-city" instead of "country-city-street"
TEST(ResultsMatch("wonderland sick city queer street ", rulesStrict), ());
TEST(ResultsMatch("wonderland sick city other street ", rulesRelaxed), ());
}
{
TRules rulesStrict = {ExactMatch(countryId, poi0)};
TRules rulesRelaxed = {};
// Should not be relaxed.
// "city" instead of "city-poi"
TEST(ResultsMatch("sick city pharmacy ", rulesStrict), ());
TEST(ResultsMatch("sick city library ", rulesRelaxed), ());
}
}
} // namespace
} // namespace search

View file

@ -19,7 +19,7 @@ MAX_RANK = 255
RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match']
SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats', 'ErrorsMade'] + NAME_SCORES + SEARCH_TYPES
FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats', 'ErrorsMade', 'AllTokensUsed'] + NAME_SCORES + SEARCH_TYPES
BOOTSTRAP_ITERATIONS = 10000
@ -37,6 +37,7 @@ def normalize_data(data):
data['DistanceToPivot'] = data['DistanceToPivot'].apply(transform_distance)
data['Rank'] = data['Rank'].apply(lambda v: v / MAX_RANK)
data['Relevance'] = data['Relevance'].apply(lambda v: RELEVANCES[v])
data['AllTokensUsed'] = data['AllTokensUsed'].apply(lambda v : +1 if bool(v) else -1)
cats = data['PureCats'].combine(data['FalseCats'], max)