Review fixes.

This commit is contained in:
Yuri Gorshenin 2017-12-15 17:12:12 +03:00 committed by mpimenov
parent 0e63410a65
commit 78cbdf1daf
11 changed files with 382 additions and 303 deletions

View file

@ -20,6 +20,7 @@ set(
common.hpp
displayed_categories.cpp
displayed_categories.hpp
doc_vec.cpp
doc_vec.hpp
downloader_search_callback.cpp
downloader_search_callback.hpp

234
search/doc_vec.cpp Normal file
View file

@ -0,0 +1,234 @@
#include "search/doc_vec.hpp"
#include "base/logging.hpp"
#include <limits>
#include <utility>
using namespace std;
namespace search
{
namespace
{
// Collapses |tokens| into (token, frequency) pairs appended to |tfs|.
//
// |tokens| is taken by value on purpose: the local copy is sorted
// and then consumed, so the caller's vector is left untouched.
// |tfs| must be empty on entry. On return it is sorted by token and
// contains every distinct token exactly once, with |m_frequency|
// equal to the number of occurrences of that token in |tokens|.
void SortAndMerge(vector<strings::UniString> tokens, vector<TokenFrequencyPair> & tfs)
{
  ASSERT(tfs.empty(), ());

  // After sorting, equal tokens are adjacent, so comparing with the
  // last emitted pair is enough to detect duplicates.
  sort(tokens.begin(), tokens.end());

  for (auto & token : tokens)
  {
    if (tfs.empty() || tfs.back().m_token != token)
    {
      // |token| belongs to our private copy and is never read again
      // after this point, so its storage can be moved instead of
      // being copied a second time.
      tfs.emplace_back(std::move(token), 1 /* frequency */);
    }
    else
    {
      ++tfs.back().m_frequency;
    }
  }
}
// Combines term frequency |tf| and inverse document frequency |idf|
// into a single tf-idf weight, which is simply their product.
double GetTfIdf(double tf, double idf)
{
  auto const weight = tf * idf;
  return weight;
}
// Returns tf-idf weight of |tf|. The idf of the token is requested
// from |idfs|; |isPrefix| is forwarded to the lookup and selects
// whether the token is treated as a prefix or as a full token.
double GetWeightImpl(IdfMap & idfs, TokenFrequencyPair const & tf, bool isPrefix)
{
  auto const idf = idfs.Get(tf.m_token, isPrefix);
  return GetTfIdf(tf.m_frequency, idf);
}
// Returns squared tf-idf weight of |tf|, i.e. the contribution of
// this token to the squared L2 norm of a vector.
double GetSqrWeightImpl(IdfMap & idfs, TokenFrequencyPair const & tf, bool isPrefix)
{
  double const weight = GetWeightImpl(idfs, tf, isPrefix);
  return weight * weight;
}
// Computes squared L2 norm of the tf-idf vector built from |tfs|.
// All tokens are treated as full (non-prefix) tokens. No square
// root is taken here.
double SqrL2(IdfMap & idfs, vector<TokenFrequencyPair> const & tfs)
{
  double sqrNorm = 0;
  for (size_t i = 0; i < tfs.size(); ++i)
    sqrNorm += GetSqrWeightImpl(idfs, tfs[i], false /* isPrefix */);
  return sqrNorm;
}
// Computes squared L2 norm of the tf-idf vector built from full
// tokens |tfs| plus, when it is set, the |prefix| token. The prefix
// token is counted with frequency 1 and its idf is looked up as a
// prefix idf.
double SqrL2(IdfMap & idfs, vector<TokenFrequencyPair> const & tfs,
             boost::optional<strings::UniString> const & prefix)
{
  auto const fullTokensSqrNorm = SqrL2(idfs, tfs);
  if (!prefix)
    return fullTokensSqrNorm;

  TokenFrequencyPair const prefixTf(*prefix, 1 /* frequency */);
  return fullTokensSqrNorm + GetSqrWeightImpl(idfs, prefixTf, true /* isPrefix */);
}
} // namespace
// TokenFrequencyPair ------------------------------------------------------------------------------
// Lexicographical order: pairs are compared by token first, and by
// frequency only when the tokens are equal.
bool TokenFrequencyPair::operator<(TokenFrequencyPair const & rhs) const
{
  bool const tokensDiffer = m_token != rhs.m_token;
  return tokensDiffer ? m_token < rhs.m_token : m_frequency < rhs.m_frequency;
}
// Exchanges both the token and the frequency with |rhs|.
void TokenFrequencyPair::Swap(TokenFrequencyPair & rhs)
{
  swap(m_frequency, rhs.m_frequency);
  m_token.swap(rhs.m_token);
}
// Human-readable representation of |tf| for logs and assertion
// messages: "TokenFrequencyPair [<token>, <frequency>]".
string DebugPrint(TokenFrequencyPair const & tf)
{
  ostringstream out;
  out << "TokenFrequencyPair [";
  out << DebugPrint(tf.m_token);
  out << ", " << tf.m_frequency << "]";
  return out.str();
}
// DocVec ------------------------------------------------------------------------------------------
// Builds a document vector from the tokens collected by |builder|.
// Only the address of |idfs| is stored, so |idfs| must outlive the
// constructed object. Tokens are merged into sorted
// (token, frequency) pairs.
DocVec::DocVec(IdfMap & idfs, Builder const & builder)
  : m_idfs(&idfs)
{
  SortAndMerge(builder.m_tokens, m_tfs);
}
double DocVec::Norm() { return SqrL2(*m_idfs, m_tfs); }
// Returns the |i|-th token of the document. |i| must be less than
// GetNumTokens().
strings::UniString const & DocVec::GetToken(size_t i) const
{
  ASSERT_LESS(i, m_tfs.size(), ());
  auto const & tf = m_tfs[i];
  return tf.m_token;
}
// Returns idf of the |i|-th token of the document, looked up as a
// full (non-prefix) token. |i| must be less than GetNumTokens().
double DocVec::GetIdf(size_t i)
{
  ASSERT_LESS(i, m_tfs.size(), ());
  auto const & token = m_tfs[i].m_token;
  return m_idfs->Get(token, false /* isPrefix */);
}
// Returns tf-idf weight of the |i|-th token of the document. |i|
// must be less than GetNumTokens().
double DocVec::GetWeight(size_t i)
{
  ASSERT_LESS(i, m_tfs.size(), ());
  auto const & tf = m_tfs[i];
  return GetWeightImpl(*m_idfs, tf, false /* isPrefix */);
}
// QueryVec ----------------------------------------------------------------------------------------
// Builds a query vector from the tokens collected by |builder|.
// Only the address of |idfs| is stored, so |idfs| must outlive the
// constructed object. Full tokens are merged into sorted
// (token, frequency) pairs; the optional prefix token is copied as
// is.
QueryVec::QueryVec(IdfMap & idfs, Builder const & builder)
  : m_idfs(&idfs)
  , m_prefix(builder.m_prefix)
{
  SortAndMerge(builder.m_tokens, m_tfs);
}
// Computes cosine similarity between the query |*this| and the
// document |rhs|, both represented as tf-idf vectors.
//
// Special cases: two empty vectors are considered identical (1.0);
// when exactly one of them is empty the similarity is 0.0.
//
// When the query has a prefix token, the result is the maximum of:
// * the similarity computed as if the prefix matched nothing in the
//   document;
// * the best similarity over all document tokens that start with
//   the prefix, where the prefix is replaced by that document token.
//
// The function is not const because idf lookups go through
// non-const accessors of the query and of |rhs|.
double QueryVec::Similarity(DocVec & rhs)
{
  size_t const kInvalidIndex = numeric_limits<size_t>::max();

  if (Empty() && rhs.Empty())
    return 1.0;

  if (Empty() || rhs.Empty())
    return 0.0;

  size_t const numDocTokens = rhs.GetNumTokens();

  // rsMatchTo[j] is the index of the full query token equal to the
  // j-th document token, or kInvalidIndex when there is no such
  // token.
  vector<size_t> rsMatchTo(numDocTokens, kInvalidIndex);

  // Dot product over full tokens only. Both token lists are sorted
  // (see SortAndMerge()), so a single merge-like pass is enough.
  double dot = 0;
  {
    size_t i = 0, j = 0;
    while (i < m_tfs.size() && j < numDocTokens)
    {
      auto const & lt = m_tfs[i].m_token;
      auto const & rt = rhs.GetToken(j);

      if (lt < rt)
      {
        ++i;
      }
      else if (lt > rt)
      {
        ++j;
      }
      else
      {
        dot += GetFullTokenWeight(i) * rhs.GetWeight(j);
        rsMatchTo[j] = i;
        ++i;
        ++j;
      }
    }
  }

  // NOTE: both Norm() calls return *squared* L2 norms, that is why
  // square roots are taken below. |ln| includes the prefix token
  // when it is present.
  auto const ln = Norm();
  auto const rn = rhs.Norm();

  // This similarity metric assumes that prefix is not matched in the document.
  double const similarityNoPrefix = ln > 0 && rn > 0 ? dot / sqrt(ln) / sqrt(rn) : 0;

  if (!m_prefix)
    return similarityNoPrefix;

  double similarityWithPrefix = 0;

  auto const & prefix = *m_prefix;

  // These values do not depend on the document token being examined,
  // so they are computed once instead of on every iteration.
  auto const prefixW = GetPrefixTokenWeight();
  auto const sqrtRn = sqrt(rn);

  // Let's try to match prefix token with all tokens in the
  // document, and compute the best cosine similarity.
  for (size_t j = 0; j < numDocTokens; ++j)
  {
    auto const & t = rhs.GetToken(j);
    if (!strings::StartsWith(t.begin(), t.end(), prefix.begin(), prefix.end()))
      continue;

    auto const i = rsMatchTo[j];

    double num = 0;
    double denom = 0;
    if (i == kInvalidIndex)
    {
      // If this document token is not matched with full tokens in a
      // query, the prefix is treated as a new full query token equal
      // to the document token, with frequency 1. So we need to
      // replace the prefix weight with the new weight in the query
      // norm, and to add the new term to the dot product.
      auto const newW = GetTfIdf(1 /* frequency */, rhs.GetIdf(j));
      auto const l = max(0.0, ln - prefixW * prefixW + newW * newW);

      num = dot + newW * rhs.GetWeight(j);
      denom = sqrt(l) * sqrtRn;
    }
    else
    {
      // If this document token is already matched with |i|-th full
      // token in a query - we know that completion of the prefix
      // token is the |i|-th query token. So the frequency of the
      // |i|-th query token grows by one, the prefix disappears, and
      // we need to update correspondingly dot product and vector
      // norm of the query.
      auto const oldFW = GetFullTokenWeight(i);
      auto const tf = m_tfs[i].m_frequency + 1;
      auto const idf = m_idfs->Get(m_tfs[i].m_token, false /* isPrefix */);
      auto const newW = GetTfIdf(tf, idf);

      // Clamped in the same way as in the branch above, to protect
      // sqrt() from a tiny negative value caused by rounding.
      auto const l = max(0.0, ln - oldFW * oldFW - prefixW * prefixW + newW * newW);

      num = dot + (newW - oldFW) * rhs.GetWeight(j);
      denom = sqrt(l) * sqrtRn;
    }

    if (denom > 0)
      similarityWithPrefix = max(similarityWithPrefix, num / denom);
  }

  return max(similarityWithPrefix, similarityNoPrefix);
}
double QueryVec::Norm() { return SqrL2(*m_idfs, m_tfs, m_prefix); }
// Returns tf-idf weight of the |i|-th full (non-prefix) token of
// the query. |i| must be a valid index in the list of full tokens.
double QueryVec::GetFullTokenWeight(size_t i)
{
  ASSERT_LESS(i, m_tfs.size(), ());
  auto const & tf = m_tfs[i];
  return GetWeightImpl(*m_idfs, tf, false /* isPrefix */);
}
double QueryVec::GetPrefixTokenWeight()
{
ASSERT(m_prefix, ());
return GetWeightImpl(*m_idfs, TokenFrequencyPair(*m_prefix, 1 /* frequency */),
true /* isPrefix */);
}
} // namespace search

View file

@ -1,11 +1,14 @@
#pragma once
#include "search/idf_map.hpp"
#include "base/assert.hpp"
#include "base/string_utils.hpp"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <sstream>
#include <string>
#include <utility>
@ -14,309 +17,125 @@
namespace search
{
template <typename Token>
struct TokenWeightPair
class IdfMap;
struct TokenFrequencyPair
{
TokenWeightPair() = default;
TokenFrequencyPair() = default;
template <typename T>
TokenWeightPair(T && token, double weight) : m_token(std::forward<T>(token)), m_weight(weight)
template <typename Token>
TokenFrequencyPair(Token && token, uint64_t frequency)
: m_token(std::forward<Token>(token)), m_frequency(frequency)
{
}
bool operator<(TokenWeightPair const & rhs) const
{
if (m_token != rhs.m_token)
return m_token < rhs.m_token;
return m_weight < rhs.m_weight;
}
bool operator<(TokenFrequencyPair const & rhs) const;
void Swap(TokenWeightPair & rhs)
{
m_token.swap(rhs.m_token);
std::swap(m_weight, rhs.m_weight);
}
void Swap(TokenFrequencyPair & rhs);
// Returns squared weight of the token-weight pair.
double SqrWeight() const { return m_weight * m_weight; }
Token m_token;
double m_weight = 0;
strings::UniString m_token;
uint64_t m_frequency = 0;
};
template <typename Token>
std::string DebugPrint(TokenWeightPair<Token> const & tw)
{
std::ostringstream os;
os << "TokenWeightPair [ " << DebugPrint(tw.m_token) << ", " << tw.m_weight << " ]";
return os.str();
}
namespace impl
{
// Accumulates weights of equal tokens in |tws|. Result is sorted by
// tokens. Also, maximum weight from a group of equal tokens will be
// stored in the corresponding |maxWeight| elem.
template <typename Token>
void SortAndMerge(std::vector<TokenWeightPair<Token>> & tws, std::vector<double> & maxWeights)
{
std::sort(tws.begin(), tws.end());
size_t n = 0;
maxWeights.clear();
for (size_t i = 0; i < tws.size(); ++i)
{
ASSERT_LESS_OR_EQUAL(n, i, ());
ASSERT_EQUAL(n, maxWeights.size(), ());
if (n == 0 || tws[n - 1].m_token != tws[i].m_token)
{
tws[n].Swap(tws[i]);
maxWeights.push_back(tws[n].m_weight);
++n;
}
else
{
tws[n - 1].m_weight += tws[i].m_weight;
maxWeights[n - 1] = std::max(maxWeights[n - 1], tws[i].m_weight);
}
}
ASSERT_LESS_OR_EQUAL(n, tws.size(), ());
tws.erase(tws.begin() + n, tws.end());
}
// Computes squared L2 norm of vector of tokens.
template <typename Token>
double SqrL2(std::vector<TokenWeightPair<Token>> const & tws)
{
double sum = 0;
for (auto const & tw : tws)
sum += tw.SqrWeight();
return sum;
}
// Computes squared L2 norm of vector of tokens + prefix token.
template <typename Token>
double SqrL2(std::vector<TokenWeightPair<Token>> const & tws,
boost::optional<TokenWeightPair<Token>> const & prefix)
{
double result = SqrL2(tws);
return result + (prefix ? prefix->SqrWeight() : 0);
}
} // namespace impl
std::string DebugPrint(TokenFrequencyPair const & tf);
// This class represents a document in a vector space of tokens.
template <typename Token>
class DocVec
{
public:
using TokenWeightPair = TokenWeightPair<Token>;
using TokenWeightPairs = std::vector<TokenWeightPair>;
class Builder
{
public:
template <typename T>
void Add(T && token, double weight)
template <typename Token>
void Add(Token && token)
{
m_tws.emplace_back(std::forward<T>(token), weight);
m_tokens.emplace_back(std::forward<Token>(token));
}
private:
friend class DocVec;
TokenWeightPairs m_tws;
std::vector<strings::UniString> m_tokens;
};
DocVec() = default;
explicit DocVec(Builder && builder) : m_tws(std::move(builder.m_tws)) { Init(); }
explicit DocVec(Builder const & builder) : m_tws(builder.m_tws) { Init(); }
explicit DocVec(IdfMap & idfs) : m_idfs(&idfs) {}
TokenWeightPairs const & GetTokenWeightPairs() const { return m_tws; }
std::vector<double> const & GetMaxWeights() const { return m_maxWeights; }
DocVec(IdfMap & idfs, Builder const & builder);
bool Empty() const { return m_tws.empty(); }
// Computes vector norm of the doc.
double Norm();
size_t GetNumTokens() const { return m_tfs.size(); }
strings::UniString const & GetToken(size_t i) const;
double GetIdf(size_t i);
double GetWeight(size_t i);
bool Empty() const { return m_tfs.empty(); }
private:
template <typename T>
friend std::string DebugPrint(DocVec<T> const & dv)
friend std::string DebugPrint(DocVec const & dv)
{
return "DocVec " + DebugPrint(dv.m_tws);
return "DocVec " + ::DebugPrint(dv.m_tfs);
}
void Init() { impl::SortAndMerge(m_tws, m_maxWeights); }
TokenWeightPairs m_tws;
std::vector<double> m_maxWeights;
IdfMap * m_idfs;
std::vector<TokenFrequencyPair> m_tfs;
};
// This class represents a search query in a vector space of tokens.
template <typename Token>
class QueryVec
{
public:
using TokenWeightPair = TokenWeightPair<Token>;
using TokenWeightPairs = std::vector<TokenWeightPair>;
class Builder
{
public:
template <typename T>
void AddFull(T && token, double weight)
template <typename Token>
void AddFull(Token && token)
{
m_tws.emplace_back(std::forward<T>(token), weight);
m_tokens.emplace_back(std::forward<Token>(token));
}
template <typename T>
void SetPrefix(T && token, double weight)
template <typename Token>
void SetPrefix(Token && token)
{
m_prefix = TokenWeightPair(std::forward<T>(token), weight);
m_prefix = std::forward<Token>(token);
}
private:
friend class QueryVec;
TokenWeightPairs m_tws;
boost::optional<TokenWeightPair> m_prefix;
std::vector<strings::UniString> m_tokens;
boost::optional<strings::UniString> m_prefix;
};
QueryVec() = default;
explicit QueryVec(IdfMap & idfs) : m_idfs(&idfs) {}
explicit QueryVec(Builder && builder)
: m_tws(std::move(builder.m_tws)), m_prefix(std::move(builder.m_prefix))
{
Init();
}
QueryVec(IdfMap & idfs, Builder const & builder);
explicit QueryVec(Builder const & builder) : m_tws(builder.m_tws), m_prefix(builder.m_prefix)
{
Init();
}
// Computes cosine similarity between |*this| and |rhs|.
double Similarity(DocVec & rhs);
// Computes cosine distance between |*this| and |rhs|.
double Similarity(DocVec<Token> const & rhs) const
{
size_t kInvalidIndex = std::numeric_limits<size_t>::max();
// Computes vector norm of the query.
double Norm();
if (Empty() && rhs.Empty())
return 1.0;
if (Empty() || rhs.Empty())
return 0.0;
auto const & ls = m_tws;
auto const & rs = rhs.GetTokenWeightPairs();
auto const & maxWeights = rhs.GetMaxWeights();
ASSERT(std::is_sorted(ls.begin(), ls.end()), ());
ASSERT(std::is_sorted(rs.begin(), rs.end()), ());
std::vector<size_t> rsMatchTo(rs.size(), kInvalidIndex);
size_t i = 0, j = 0;
double dot = 0;
while (i < ls.size() && j < rs.size())
{
if (ls[i].m_token < rs[j].m_token)
{
++i;
}
else if (ls[i].m_token > rs[j].m_token)
{
++j;
}
else
{
dot += ls[i].m_weight * rs[j].m_weight;
rsMatchTo[j] = i;
++i;
++j;
}
}
auto const ln = impl::SqrL2(ls, m_prefix);
auto const rn = impl::SqrL2(rs);
// This similarity metric assumes that prefix is not matched in the document.
double const similarityNoPrefix = ln > 0 && rn > 0 ? dot / sqrt(ln) / sqrt(rn) : 0;
if (!m_prefix)
return similarityNoPrefix;
double similarityWithPrefix = 0;
auto const & prefix = *m_prefix;
// Let's try to match prefix token with all tokens in the
// document, and compute the best cosine distance.
for (size_t j = 0; j < rs.size(); ++j)
{
auto const & tw = rs[j];
if (!strings::StartsWith(tw.m_token.begin(), tw.m_token.end(), prefix.m_token.begin(),
prefix.m_token.end()))
{
continue;
}
auto const i = rsMatchTo[j];
double nom = 0;
double denom = 0;
if (i == kInvalidIndex)
{
// If this document token is not matched with full tokens in a
// query, we need to update it's weight in the cosine distance
// - so we need to update correspondingly dot product and
// vector norms of query and doc.
auto const w = maxWeights[j];
auto const l = std::max(0.0, ln - prefix.SqrWeight() + w * w);
nom = dot + w * tw.m_weight;
denom = sqrt(l) * sqrt(rn);
}
else
{
// If this document token is already matched with |i|-th full
// token in a query - we know that completion of the prefix
// token is the |i|-th query token. So we need to update
// correspondingly dot product and vector norm of the query.
auto const w = ls[i].m_weight + m_maxWeights[i];
auto const l = ln - ls[i].SqrWeight() - prefix.SqrWeight() + w * w;
nom = dot + (w - ls[i].m_weight) * tw.m_weight;
denom = sqrt(l) * sqrt(rn);
}
if (denom > 0)
similarityWithPrefix = std::max(similarityWithPrefix, nom / denom);
}
return std::max(similarityWithPrefix, similarityNoPrefix);
}
double Norm() const
{
double n = 0;
for (auto const & tw : m_tws)
n += tw.m_weight * tw.m_weight;
if (m_prefix)
n += m_prefix->m_weight * m_prefix->m_weight;
return sqrt(n);
}
bool Empty() const { return m_tws.empty() && !m_prefix; }
bool Empty() const { return m_tfs.empty() && !m_prefix; }
private:
template <typename T>
friend std::string DebugPrint(QueryVec<T> const & qv)
double GetFullTokenWeight(size_t i);
double GetPrefixTokenWeight();
friend std::string DebugPrint(QueryVec const & qv)
{
return "QueryVec " + DebugPrint(qv.m_tws);
std::ostringstream os;
os << "QueryVec " + ::DebugPrint(qv.m_tfs);
if (qv.m_prefix)
os << " " << DebugPrint(*qv.m_prefix);
return os.str();
}
void Init() { impl::SortAndMerge(m_tws, m_maxWeights); }
std::vector<TokenWeightPair> m_tws;
std::vector<double> m_maxWeights;
boost::optional<TokenWeightPair> m_prefix;
IdfMap * m_idfs;
std::vector<TokenFrequencyPair> m_tfs;
boost::optional<strings::UniString> m_prefix;
};
} // namespace search

View file

@ -183,13 +183,22 @@ public:
uint8_t GetRank(uint32_t featureId) const override { return m_ranks.Get(featureId); }
CBV GetMatchedFeatures(strings::UniString const & token) const override
CBV GetMatchedFeatures(strings::UniString const & token, bool isPrefix) const override
{
SearchTrieRequest<strings::UniStringDFA> request;
request.m_names.emplace_back(token);
request.SetLangs(m_params.GetLangs());
return CBV{m_retrieval.RetrieveAddressFeatures(request)};
if (isPrefix)
{
SearchTrieRequest<strings::PrefixDFAModifier<strings::UniStringDFA>> request;
request.m_names.emplace_back(strings::UniStringDFA(token));
request.SetLangs(m_params.GetLangs());
return CBV{m_retrieval.RetrieveAddressFeatures(request)};
}
else
{
SearchTrieRequest<strings::UniStringDFA> request;
request.m_names.emplace_back(token);
request.SetLangs(m_params.GetLangs());
return CBV{m_retrieval.RetrieveAddressFeatures(request)};
}
}
private:

View file

@ -16,19 +16,17 @@
namespace search
{
class IdfMap;
struct Locality
{
using QueryVec = QueryVec<strings::UniString>;
Locality() = default;
Locality(MwmSet::MwmId const & countryId, uint32_t featureId, TokenRange const & tokenRange,
QueryVec const & queryVec)
: m_countryId(countryId), m_featureId(featureId), m_tokenRange(tokenRange), m_queryVec(queryVec)
{
}
double QueryNorm() const { return m_queryVec.Norm(); }
double QueryNorm() { return m_queryVec.Norm(); }
MwmSet::MwmId m_countryId;
uint32_t m_featureId = 0;

View file

@ -1,21 +1,24 @@
#include "search/idf_map.hpp"
#include "base/assert.hpp"
namespace search
{
IdfMap::IdfMap(Delegate & delegate, double unknownIdf)
: m_delegate(delegate), m_unknownIdf(unknownIdf)
{
ASSERT_GREATER(m_unknownIdf, 0.0, ());
}
double IdfMap::Get(strings::UniString const & s)
double IdfMap::GetImpl(Map & idfs, strings::UniString const & s, bool isPrefix)
{
auto const it = m_idfs.find(s);
if (it != m_idfs.cend())
auto const it = idfs.find(s);
if (it != idfs.cend())
return it->second;
auto const df = static_cast<double>(m_delegate.GetNumDocs(s));
auto const df = static_cast<double>(m_delegate.GetNumDocs(s, isPrefix));
auto const idf = df == 0 ? m_unknownIdf : 1.0 / df;
m_idfs[s] = idf;
idfs[s] = idf;
return idf;
}

View file

@ -14,16 +14,29 @@ public:
{
virtual ~Delegate() = default;
virtual uint64_t GetNumDocs(strings::UniString const & token) const = 0;
virtual uint64_t GetNumDocs(strings::UniString const & token, bool isPrefix) const = 0;
};
IdfMap(Delegate & delegate, double unknownIdf);
void Set(strings::UniString const & s, double idf) { m_idfs[s] = idf; }
double Get(strings::UniString const & s);
void Set(strings::UniString const & s, bool isPrefix, double idf)
{
SetImpl(isPrefix ? m_prefixIdfs : m_fullIdfs, s, idf);
}
double Get(strings::UniString const & s, bool isPrefix)
{
return GetImpl(isPrefix ? m_prefixIdfs : m_fullIdfs, s, isPrefix);
}
private:
std::map<strings::UniString, double> m_idfs;
using Map = std::map<strings::UniString, double>;
void SetImpl(Map & idfs, strings::UniString const & s, double idf) { idfs[s] = idf; }
double GetImpl(Map & idfs, strings::UniString const & s, bool isPrefix);
Map m_fullIdfs;
Map m_prefixIdfs;
Delegate & m_delegate;
double m_unknownIdf;

View file

@ -30,9 +30,9 @@ struct IdfMapDelegate : public IdfMap::Delegate
~IdfMapDelegate() override = default;
uint64_t GetNumDocs(strings::UniString const & token) const override
uint64_t GetNumDocs(strings::UniString const & token, bool isPrefix) const override
{
return m_filter.Intersect(m_delegate.GetMatchedFeatures(token)).PopCount();
return m_filter.Intersect(m_delegate.GetMatchedFeatures(token, isPrefix)).PopCount();
}
LocalityScorer::Delegate const & m_delegate;
@ -59,6 +59,8 @@ void LocalityScorer::GetTopLocalities(MwmSet::MwmId const & countryId, BaseConte
CBV const & filter, size_t limit,
vector<Locality> & localities)
{
double const kUnknownIdf = 1.0;
CHECK_EQUAL(ctx.m_numTokens, m_params.GetNumTokens(), ());
localities.clear();
@ -68,24 +70,16 @@ void LocalityScorer::GetTopLocalities(MwmSet::MwmId const & countryId, BaseConte
intersections[i] = filter.Intersect(ctx.m_features[i]);
IdfMapDelegate delegate(m_delegate, filter);
IdfMap idfs(delegate, 1.0 /* unknownIdf */);
double prefixIdf = 1.0;
for (size_t i = 0; i < ctx.m_numTokens; ++i)
IdfMap idfs(delegate, kUnknownIdf);
if (ctx.m_numTokens > 0 && m_params.LastTokenIsPrefix())
{
auto const numDocs = intersections[i].PopCount();
double idf = 1.0;
auto const numDocs = intersections.back().PopCount();
double idf = kUnknownIdf;
if (numDocs > 0)
idf = 1.0 / static_cast<double>(numDocs);
if (m_params.IsPrefixToken(i))
{
prefixIdf = idf;
}
else
{
m_params.GetToken(i).ForEach(
[&idfs, &idf](strings::UniString const & s) { idfs.Set(s, idf); });
}
m_params.GetToken(ctx.m_numTokens - 1).ForEach([&idfs, &idf](strings::UniString const & s) {
idfs.Set(s, true /* isPrefix */, idf);
});
}
for (size_t startToken = 0; startToken < ctx.m_numTokens; ++startToken)
@ -99,9 +93,9 @@ void LocalityScorer::GetTopLocalities(MwmSet::MwmId const & countryId, BaseConte
auto const curToken = endToken - 1;
auto const & token = m_params.GetToken(curToken).m_original;
if (m_params.IsPrefixToken(curToken))
builder.SetPrefix(token, prefixIdf);
builder.SetPrefix(token);
else
builder.AddFull(token, idfs.Get(token));
builder.AddFull(token);
TokenRange const tokenRange(startToken, endToken);
// Skip locality candidates that match only numbers.
@ -109,7 +103,7 @@ void LocalityScorer::GetTopLocalities(MwmSet::MwmId const & countryId, BaseConte
{
intersection.ForEach([&](uint64_t bit) {
auto const featureId = base::asserted_cast<uint32_t>(bit);
localities.emplace_back(countryId, featureId, tokenRange, QueryVec(builder));
localities.emplace_back(countryId, featureId, tokenRange, QueryVec(idfs, builder));
});
}
@ -126,15 +120,15 @@ void LocalityScorer::LeaveTopLocalities(IdfMap & idfs, size_t limit,
{
vector<ExLocality> els;
els.reserve(localities.size());
for (auto const & locality : localities)
for (auto & locality : localities)
{
auto const queryNorm = locality.m_queryVec.Norm();
auto const rank = m_delegate.GetRank(locality.m_featureId);
els.emplace_back(locality, queryNorm, rank);
}
// We don't want to read too much names for localities, to this is
// the best effort - select best features by available params -
// We don't want to read too many names for localities, so this is
// the best effort - select the best features by available params -
// query norm and rank.
LeaveTopByNormAndRank(max(limit, kDefaultReadLimit) /* limitUniqueIds */, els);
@ -209,7 +203,7 @@ void LocalityScorer::LeaveTopBySimilarityAndRank(size_t limit, vector<ExLocality
++n;
}
}
els.resize(n);
els.erase(els.begin() + n, els.end());
}
void LocalityScorer::GetDocVecs(IdfMap & idfs, uint32_t localityId, vector<DocVec> & dvs) const
@ -224,22 +218,22 @@ void LocalityScorer::GetDocVecs(IdfMap & idfs, uint32_t localityId, vector<DocVe
DocVec::Builder builder;
for (auto const & token : tokens)
builder.Add(token, idfs.Get(token) /* weight */);
dvs.emplace_back(move(builder));
builder.Add(token);
dvs.emplace_back(idfs, builder);
}
}
double LocalityScorer::GetSimilarity(QueryVec const & qv, vector<DocVec> const & dvc) const
double LocalityScorer::GetSimilarity(QueryVec & qv, vector<DocVec> & dvc) const
{
double const kScale = 1e6;
double similarity = 0;
for (auto const & dv : dvc)
for (auto & dv : dvc)
similarity = max(similarity, qv.Similarity(dv));
// We need scale here to prevent double artifacts, and to make
// sorting by similarity more robust, as 1e-6 is good enough for our
// purposes.
// We need to scale similarity here to prevent floating-point
// artifacts, and to make sorting by similarity more robust, as 1e-6
// is good enough for our purposes.
return round(similarity * kScale);
}

View file

@ -30,7 +30,7 @@ public:
virtual void GetNames(uint32_t featureId, std::vector<std::string> & names) const = 0;
virtual uint8_t GetRank(uint32_t featureId) const = 0;
virtual CBV GetMatchedFeatures(strings::UniString const & token) const = 0;
virtual CBV GetMatchedFeatures(strings::UniString const & token, bool isPrefix) const = 0;
};
LocalityScorer(QueryParams const & params, Delegate const & delegate);
@ -41,9 +41,6 @@ public:
CBV const & filter, size_t limit, std::vector<Locality> & localities);
private:
using DocVec = DocVec<strings::UniString>;
using QueryVec = Locality::QueryVec;
struct ExLocality
{
ExLocality() = default;
@ -68,12 +65,12 @@ private:
// features in |els|.
void LeaveTopByNormAndRank(size_t limitUniqueIds, std::vector<ExLocality> & els) const;
// Leaves at most |limit| best localities by similarity to the query
// and rank. Result doesn't contain duplicate features.
// Leaves at most |limit| unique best localities by similarity to
// the query and rank.
void LeaveTopBySimilarityAndRank(size_t limit, std::vector<ExLocality> & els) const;
void GetDocVecs(IdfMap & idfs, uint32_t localityId, std::vector<DocVec> & dvs) const;
double GetSimilarity(QueryVec const & qv, std::vector<DocVec> const & dvs) const;
double GetSimilarity(QueryVec & qv, std::vector<DocVec> & dvs) const;
QueryParams const & m_params;
Delegate const & m_delegate;

View file

@ -96,6 +96,7 @@ SOURCES += \
cities_boundaries_table.cpp \
city_finder.cpp \
displayed_categories.cpp \
doc_vec.cpp \
downloader_search_callback.cpp \
dummy_rank_table.cpp \
editor_delegate.cpp \

View file

@ -122,10 +122,20 @@ public:
return it == m_ranks.end() ? 0 : it->second;
}
CBV GetMatchedFeatures(strings::UniString const & token) const override
CBV GetMatchedFeatures(strings::UniString const & token, bool isPrefix) const override
{
vector<uint64_t> ids;
m_searchIndex.ForEachInNode(token, [&ids](uint32_t id) { ids.push_back(id); });
if (isPrefix)
{
m_searchIndex.ForEachInSubtree(token, [&ids](strings::UniString const & /* prefix */,
uint32_t id) { ids.push_back(id); });
}
else
{
m_searchIndex.ForEachInNode(token, [&ids](uint32_t id) { ids.push_back(id); });
}
my::SortUnique(ids);
return CBV{coding::CompressedBitVectorBuilder::FromBitPositions(move(ids))};
}