[search] A probabilistic tweak to the locality scorer.

This commit is contained in:
Maxim Pimenov 2016-05-31 13:26:03 +03:00
parent eeb41dadb8
commit 90fe2ee57b
5 changed files with 33 additions and 9 deletions

View file

@ -694,9 +694,14 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
for (size_t startToken = 0; startToken < m_numTokens; ++startToken)
{
CBVPtr intersection;
CBVPtr unfilteredIntersection;
intersection.SetFull();
unfilteredIntersection.SetFull();
if (filter)
{
intersection.Intersect(filter);
unfilteredIntersection.Intersect(m_addressFeatures[startToken].get());
}
intersection.Intersect(m_addressFeatures[startToken].get());
if (intersection.IsEmpty())
continue;
@ -713,6 +718,11 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
l.m_featureId = featureId;
l.m_startToken = startToken;
l.m_endToken = endToken;
if (filter)
{
l.m_prob = static_cast<double>(intersection->PopCount()) /
static_cast<double>(unfilteredIntersection->PopCount());
}
preLocalities.push_back(l);
});
}
@ -720,6 +730,8 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
if (endToken < m_numTokens)
{
intersection.Intersect(m_addressFeatures[endToken].get());
if (filter)
unfilteredIntersection.Intersect(m_addressFeatures[endToken].get());
if (intersection.IsEmpty())
break;
}
@ -829,7 +841,7 @@ void Geocoder::FillVillageLocalities()
#if defined(DEBUG)
ft.GetName(StringUtf8Multilang::kDefaultCode, village.m_defaultName);
LOG(LDEBUG, ("Village =", village.m_defaultName));
LOG(LDEBUG, ("Village =", village.m_defaultName, "prob =", village.m_prob));
#endif
m_cities[{l.m_startToken, l.m_endToken}].push_back(village);

View file

@ -99,10 +99,10 @@ public:
struct Locality
{
Locality() : m_featureId(0), m_startToken(0), m_endToken(0) {}
Locality() : m_featureId(0), m_startToken(0), m_endToken(0), m_prob(0.0) {}
Locality(uint32_t featureId, size_t startToken, size_t endToken)
: m_featureId(featureId), m_startToken(startToken), m_endToken(endToken)
: m_featureId(featureId), m_startToken(startToken), m_endToken(endToken), m_prob(0.0)
{
}
@ -110,6 +110,12 @@ public:
uint32_t m_featureId;
size_t m_startToken;
size_t m_endToken;
// Measures our belief that the tokens in the range [m_startToken, m_endToken)
// indeed specify a locality. Currently it is set only for villages.
double m_prob;
string m_name;
};
// This struct represents a country or US- or Canadian- state. It

View file

@ -44,8 +44,8 @@ void LocalityScorer::GetTopLocalities(size_t limit, vector<Geocoder::Locality> &
ls.emplace_back(locality);
RemoveDuplicates(ls);
LeaveTopByRank(std::max(limit, kDefaultReadLimit), ls);
SortByName(ls);
LeaveTopByRankAndProb(std::max(limit, kDefaultReadLimit), ls);
SortByNameAndProb(ls);
if (ls.size() > limit)
ls.resize(limit);
@ -71,7 +71,7 @@ void LocalityScorer::RemoveDuplicates(vector<ExLocality> & ls) const
ls.end());
}
void LocalityScorer::LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const
void LocalityScorer::LeaveTopByRankAndProb(size_t limit, vector<ExLocality> & ls) const
{
if (ls.size() <= limit)
return;
@ -81,6 +81,8 @@ void LocalityScorer::LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const
sort(ls.begin(), ls.end(), [](ExLocality const & lhs, ExLocality const & rhs)
{
if (lhs.m_locality.m_prob != rhs.m_locality.m_prob)
return lhs.m_locality.m_prob > rhs.m_locality.m_prob;
if (lhs.m_rank != rhs.m_rank)
return lhs.m_rank > rhs.m_rank;
return lhs.m_numTokens > rhs.m_numTokens;
@ -88,7 +90,7 @@ void LocalityScorer::LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const
ls.resize(limit);
}
void LocalityScorer::SortByName(vector<ExLocality> & ls) const
void LocalityScorer::SortByNameAndProb(vector<ExLocality> & ls) const
{
vector<string> names;
for (auto & l : ls)
@ -107,6 +109,9 @@ void LocalityScorer::SortByName(vector<ExLocality> & ls) const
sort(ls.begin(), ls.end(), [](ExLocality const & lhs, ExLocality const & rhs)
{
// Probabilities form a stronger signal than name scores do.
if (lhs.m_locality.m_prob != rhs.m_locality.m_prob)
return lhs.m_locality.m_prob > rhs.m_locality.m_prob;
if (IsAlmostFullMatch(lhs.m_nameScore) && IsAlmostFullMatch(rhs.m_nameScore))
{
// When both localities match well, e.g. full or full prefix

View file

@ -45,8 +45,8 @@ private:
};
void RemoveDuplicates(vector<ExLocality> & ls) const;
void LeaveTopByRank(size_t limit, vector<ExLocality> & ls) const;
void SortByName(vector<ExLocality> & ls) const;
void LeaveTopByRankAndProb(size_t limit, vector<ExLocality> & ls) const;
void SortByNameAndProb(vector<ExLocality> & ls) const;
QueryParams const & m_params;
Delegate const & m_delegate;

View file

@ -51,6 +51,7 @@ fi
NAMES=("Australia_Brisbane.mwm"
"Belarus_Minsk*.mwm"
"Canada_Quebek_Montreal.mwm"
"Germany_*.mwm"
"Russia_*.mwm"
"UK_England_*.mwm"