[search] Fixed locality scoring.

Score is decreased for localities matched by a number.
This commit is contained in:
Yuri Gorshenin 2016-01-29 18:59:38 +03:00 committed by Sergey Yershov
parent 135e369313
commit bfa4d51762
7 changed files with 322 additions and 81 deletions

View file

@ -53,6 +53,7 @@ HEADERS += \
v2/house_numbers_matcher.hpp \
v2/house_to_street_table.hpp \
v2/intersection_result.hpp \
v2/locality_scorer.hpp \
v2/mwm_context.hpp \
v2/rank_table_cache.hpp \
v2/search_model.hpp \
@ -94,6 +95,7 @@ SOURCES += \
v2/house_numbers_matcher.cpp \
v2/house_to_street_table.cpp \
v2/intersection_result.cpp \
v2/locality_scorer.cpp \
v2/mwm_context.cpp \
v2/rank_table_cache.cpp \
v2/search_model.cpp \

View file

@ -0,0 +1,129 @@
#include "testing/testing.hpp"
#include "search/dummy_rank_table.hpp"
#include "search/search_delimiters.hpp"
#include "search/search_string_utils.hpp"
#include "search/v2/locality_scorer.hpp"
#include "base/string_utils.hpp"
#include "std/set.hpp"
#include "std/vector.hpp"
using namespace strings;
using namespace search;
using namespace search::v2;
namespace
{
void InitParams(string const & query, SearchQueryParams & params)
{
params.m_tokens.clear();
params.m_prefixTokens.clear();
vector<UniString> tokens;
Delimiters delims;
SplitUniString(NormalizeAndSimplifyString(query), MakeBackInsertFunctor(tokens), delims);
for (auto const & token : tokens)
params.m_tokens.push_back({token});
}
void AddLocality(string const & name, uint32_t featureId, SearchQueryParams & params,
vector<Geocoder::Locality> & localities)
{
set<UniString> tokens;
Delimiters delims;
SplitUniString(NormalizeAndSimplifyString(name), [&tokens](UniString const & token)
{
tokens.insert(token);
},
delims);
for (size_t startToken = 0; startToken < params.m_tokens.size(); ++startToken)
{
for (size_t endToken = startToken + 1; endToken <= params.m_tokens.size(); ++endToken)
{
bool matches = true;
for (size_t i = startToken; i != endToken && matches; ++i)
{
if (tokens.count(params.m_tokens[i].front()) == 0)
matches = false;
}
if (matches)
localities.emplace_back(featureId, startToken, endToken);
}
}
}
} // namespace
UNIT_TEST(LocalityScorer_Smoke)
{
enum
{
ID_NEW_ORLEANS,
ID_YORK,
ID_NEW_YORK,
};
SearchQueryParams params;
InitParams("New York Time Square", params);
vector<Geocoder::Locality> localities;
AddLocality("New Orleans", ID_NEW_ORLEANS, params, localities);
AddLocality("York", ID_YORK, params, localities);
AddLocality("New York", ID_NEW_YORK, params, localities);
LocalityScorer scorer(DummyRankTable(), params);
scorer.LeaveTopLocalities(100 /* limit */, localities);
TEST_EQUAL(3, localities.size(), ());
TEST_EQUAL(localities[0].m_featureId, ID_NEW_ORLEANS, ());
TEST_EQUAL(localities[1].m_featureId, ID_YORK, ());
TEST_EQUAL(localities[2].m_featureId, ID_NEW_YORK, ());
// New York is the best matching locality
scorer.LeaveTopLocalities(1 /* limit */, localities);
TEST_EQUAL(1, localities.size(), ());
TEST_EQUAL(localities[0].m_featureId, ID_NEW_YORK, ());
}
UNIT_TEST(LocalityScorer_NumbersMatching)
{
enum
{
ID_MARCH,
ID_APRIL,
ID_MAY,
ID_TVER
};
SearchQueryParams params;
InitParams("тверь советская 1", params);
vector<Geocoder::Locality> localities;
AddLocality("поселок 1 марта", ID_MARCH, params, localities);
AddLocality("поселок 1 апреля", ID_APRIL, params, localities);
AddLocality("поселок 1 мая", ID_MAY, params, localities);
AddLocality("тверь", ID_TVER, params, localities);
LocalityScorer scorer(DummyRankTable(), params);
scorer.LeaveTopLocalities(100 /* limit */, localities);
TEST_EQUAL(4, localities.size(), ());
TEST_EQUAL(localities[0].m_featureId, ID_MARCH, ());
TEST_EQUAL(localities[1].m_featureId, ID_APRIL, ());
TEST_EQUAL(localities[2].m_featureId, ID_MAY, ());
TEST_EQUAL(localities[3].m_featureId, ID_TVER, ());
// Tver is the best matching locality, as other localities were
// matched by number.
scorer.LeaveTopLocalities(1 /* limit */, localities);
TEST_EQUAL(1, localities.size(), ());
TEST_EQUAL(localities[0].m_featureId, ID_TVER, ());
}

View file

@ -25,6 +25,7 @@ SOURCES += \
keyword_matcher_test.cpp \
latlon_match_test.cpp \
locality_finder_test.cpp \
locality_scorer_test.cpp \
query_saver_tests.cpp \
search_string_utils_test.cpp \
string_intersection_test.cpp \

View file

@ -6,6 +6,7 @@
#include "search/search_string_utils.hpp"
#include "search/v2/cbv_ptr.hpp"
#include "search/v2/features_layer_matcher.hpp"
#include "search/v2/locality_scorer.hpp"
#include "indexer/classificator.hpp"
#include "indexer/feature_decl.hpp"
@ -33,6 +34,7 @@
#include "std/algorithm.hpp"
#include "std/bind.hpp"
#include "std/iterator.hpp"
#include "std/sstream.hpp"
#include "std/target_os.hpp"
#include "std/transform_iterator.hpp"
@ -95,23 +97,47 @@ struct ScopedMarkTokens
size_t const m_to;
};
struct LazyRankTable
class LazyRankTable : public RankTable
{
public:
LazyRankTable(MwmValue const & value) : m_value(value) {}
uint8_t Get(uint64_t i)
uint8_t Get(uint64_t i) const override
{
if (!m_table)
{
m_table = search::RankTable::Load(m_value.m_cont);
if (!m_table)
m_table = make_unique<search::DummyRankTable>();
}
EnsureTableLoaded();
return m_table->Get(i);
}
uint64_t Size() const override
{
EnsureTableLoaded();
return m_table->Size();
}
RankTable::Version GetVersion() const override
{
EnsureTableLoaded();
return m_table->GetVersion();
}
void Serialize(Writer & writer, bool preserveHostEndiannes) override
{
EnsureTableLoaded();
m_table->Serialize(writer, preserveHostEndiannes);
}
private:
void EnsureTableLoaded() const
{
if (m_table)
return;
m_table = search::RankTable::Load(m_value.m_cont);
if (!m_table)
m_table = make_unique<search::DummyRankTable>();
}
MwmValue const & m_value;
unique_ptr<search::RankTable> m_table;
mutable unique_ptr<search::RankTable> m_table;
};
class StreetCategories
@ -624,7 +650,6 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
l.m_endToken = endToken;
preLocalities.push_back(l);
});
if (endToken < m_numTokens)
{
intersection.Intersect(m_addressFeatures[endToken].get());
@ -634,51 +659,9 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
}
}
auto const tokensCountFn = [&](Locality const & l)
{
// Important! Prefix match costs 1 while token match costs 2 for locality comparison.
size_t d = 2 * (l.m_endToken - l.m_startToken);
ASSERT_GREATER(d, 0, ());
if (l.m_endToken == m_numTokens && !m_params.m_prefixTokens.empty())
d -= 1;
return d;
};
// Unique preLocalities with featureId but leave the longest range if equal.
sort(preLocalities.begin(), preLocalities.end(),
[&](Locality const & l1, Locality const & l2)
{
if (l1.m_featureId != l2.m_featureId)
return l1.m_featureId < l2.m_featureId;
return tokensCountFn(l1) > tokensCountFn(l2);
});
preLocalities.erase(unique(preLocalities.begin(), preLocalities.end(),
[](Locality const & l1, Locality const & l2)
{
return l1.m_featureId == l2.m_featureId;
}),
preLocalities.end());
LazyRankTable rankTable(m_context->m_value);
// Leave the most popular localities.
if (preLocalities.size() > maxNumLocalities)
{
/// @todo Calculate match costs according to the exact locality name
/// (for 'york' query "york city" is better than "new york").
sort(preLocalities.begin(), preLocalities.end(),
[&](Locality const & l1, Locality const & l2)
{
auto const d1 = tokensCountFn(l1);
auto const d2 = tokensCountFn(l2);
if (d1 != d2)
return d1 > d2;
return rankTable.Get(l1.m_featureId) > rankTable.Get(l2.m_featureId);
});
preLocalities.resize(maxNumLocalities);
}
LocalityScorer scorer(rankTable, m_params);
scorer.LeaveTopLocalities(maxNumLocalities, preLocalities);
}
void Geocoder::FillLocalitiesTable()
@ -999,11 +982,8 @@ void Geocoder::GreedilyMatchStreets()
if (IsStreetSynonym(token))
continue;
if (feature::IsHouseNumber(token) &&
!coding::CompressedBitVector::IsEmpty(allFeatures))
{
if (feature::IsHouseNumber(token))
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
}
unique_ptr<coding::CompressedBitVector> buffer;
if (startToken == curToken || coding::CompressedBitVector::IsEmpty(allFeatures))
@ -1017,9 +997,6 @@ void Geocoder::GreedilyMatchStreets()
allFeatures.swap(buffer);
}
if (coding::CompressedBitVector::IsEmpty(allFeatures))
continue;
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
}
}
@ -1328,5 +1305,13 @@ bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const
{
return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, Id<bool>());
}
string DebugPrint(Geocoder::Locality const & locality)
{
ostringstream os;
os << "Locality [" << DebugPrint(locality.m_countryId) << ", " << locality.m_featureId << ", "
<< locality.m_startToken << ", " << locality.m_endToken << "]";
return os.str();
}
} // namespace v2
} // namespace search

View file

@ -75,21 +75,6 @@ public:
size_t m_maxNumResults;
};
Geocoder(Index & index, storage::CountryInfoGetter const & infoGetter);
~Geocoder() override;
// Sets search query params.
void SetParams(Params const & params);
// Starts geocoding, retrieved features will be appended to
// |results|.
void GoEverywhere(vector<FeatureID> & results);
void GoInViewport(vector<FeatureID> & results);
void ClearCaches();
private:
enum RegionType
{
REGION_TYPE_STATE,
@ -97,14 +82,19 @@ private:
REGION_TYPE_COUNT
};
void GoImpl(vector<shared_ptr<MwmInfo>> & infos, bool inViewport);
struct Locality
{
Locality() : m_featureId(0), m_startToken(0), m_endToken(0) {}
Locality(uint32_t featureId, size_t startToken, size_t endToken)
: m_featureId(featureId), m_startToken(startToken), m_endToken(endToken)
{
}
MwmSet::MwmId m_countryId;
uint32_t m_featureId = 0;
size_t m_startToken = 0;
size_t m_endToken = 0;
uint32_t m_featureId;
size_t m_startToken;
size_t m_endToken;
};
// This struct represents a country or US- or Canadian- state. It
@ -131,6 +121,23 @@ private:
m2::RectD m_rect;
};
Geocoder(Index & index, storage::CountryInfoGetter const & infoGetter);
~Geocoder() override;
// Sets search query params.
void SetParams(Params const & params);
// Starts geocoding, retrieved features will be appended to
// |results|.
void GoEverywhere(vector<FeatureID> & results);
void GoInViewport(vector<FeatureID> & results);
void ClearCaches();
private:
void GoImpl(vector<shared_ptr<MwmInfo>> & infos, bool inViewport);
template <typename TLocality>
using TLocalitiesCache = map<pair<size_t, size_t>, vector<TLocality>>;
@ -295,5 +302,7 @@ private:
// Non-owning pointer to a vector of results.
vector<FeatureID> * m_results;
};
string DebugPrint(Geocoder::Locality const & locality);
} // namespace v2
} // namespace search

View file

@ -0,0 +1,87 @@
#include "search/v2/locality_scorer.hpp"
#include "search/dummy_rank_table.hpp"
#include "search/search_query_params.hpp"
#include "search/v2/mwm_context.hpp"
#include "indexer/feature_impl.hpp"
#include "indexer/index.hpp"
#include "indexer/rank_table.hpp"
#include "std/algorithm.hpp"
#include "std/unique_ptr.hpp"
namespace search
{
namespace v2
{
LocalityScorer::LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params)
: m_rankTable(rankTable), m_params(params)
{
}
void LocalityScorer::LeaveTopLocalities(size_t const limit,
vector<Geocoder::Locality> & localities) const
{
// Unique localities by featureId but leave the longest range if equal.
sort(localities.begin(), localities.end(), [&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
{
if (lhs.m_featureId != rhs.m_featureId)
return lhs.m_featureId < rhs.m_featureId;
return GetTokensScore(lhs) > GetTokensScore(rhs);
});
localities.erase(unique(localities.begin(), localities.end(),
[](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
{
return lhs.m_featureId == rhs.m_featureId;
}),
localities.end());
// Leave the most popular localities.
if (localities.size() > limit)
{
/// @todo Calculate match costs according to the exact locality name
/// (for 'york' query "york city" is better than "new york").
sort(localities.begin(), localities.end(),
[&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
{
auto const ls = GetTokensScore(lhs);
auto const rs = GetTokensScore(rhs);
if (ls != rs)
return ls > rs;
return m_rankTable.Get(lhs.m_featureId) > m_rankTable.Get(rhs.m_featureId);
});
localities.resize(limit);
}
}
size_t LocalityScorer::GetTokensScore(Geocoder::Locality const & locality) const
{
// *NOTE*
// * full token match costs 2
// * prefix match costs 1
//
// If locality is matched only by a single integral token or by an
// integral token + a prefix, overall score is reduced by one.
//
// TODO (@y, @m, @vng): consider to loop over all non-prefix
// tokens and decrement overall score by one for each integral
// token.
size_t const numTokens = locality.m_endToken - locality.m_startToken;
bool const prefixMatch = locality.m_endToken == m_params.m_tokens.size() + 1;
size_t score = 2 * numTokens;
if (prefixMatch)
--score;
if ((numTokens == 2 && prefixMatch) || (numTokens == 1 && !prefixMatch))
{
auto const & token = m_params.GetTokens(locality.m_startToken).front();
if (feature::IsNumber(token))
--score;
}
return score;
}
} // namespace v2
} // namespace search

View file

@ -0,0 +1,28 @@
#pragma once
#include "search/v2/geocoder.hpp"
#include "std/vector.hpp"
namespace search
{
class RankTable;
struct SearchQueryParams;
namespace v2
{
class LocalityScorer
{
public:
LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params);
void LeaveTopLocalities(size_t const limit, vector<Geocoder::Locality> & localities) const;
private:
size_t GetTokensScore(Geocoder::Locality const & locality) const;
RankTable const & m_rankTable;
SearchQueryParams const & m_params;
};
} // namespace v2
} // namespace search