forked from organicmaps/organicmaps
[search] Fixed locality scoring.
Score is decreased for localities matched by a number.
This commit is contained in:
parent
135e369313
commit
bfa4d51762
7 changed files with 322 additions and 81 deletions
|
@ -53,6 +53,7 @@ HEADERS += \
|
|||
v2/house_numbers_matcher.hpp \
|
||||
v2/house_to_street_table.hpp \
|
||||
v2/intersection_result.hpp \
|
||||
v2/locality_scorer.hpp \
|
||||
v2/mwm_context.hpp \
|
||||
v2/rank_table_cache.hpp \
|
||||
v2/search_model.hpp \
|
||||
|
@ -94,6 +95,7 @@ SOURCES += \
|
|||
v2/house_numbers_matcher.cpp \
|
||||
v2/house_to_street_table.cpp \
|
||||
v2/intersection_result.cpp \
|
||||
v2/locality_scorer.cpp \
|
||||
v2/mwm_context.cpp \
|
||||
v2/rank_table_cache.cpp \
|
||||
v2/search_model.cpp \
|
||||
|
|
129
search/search_tests/locality_scorer_test.cpp
Normal file
129
search/search_tests/locality_scorer_test.cpp
Normal file
|
@ -0,0 +1,129 @@
|
|||
#include "testing/testing.hpp"
|
||||
|
||||
#include "search/dummy_rank_table.hpp"
|
||||
#include "search/search_delimiters.hpp"
|
||||
#include "search/search_string_utils.hpp"
|
||||
#include "search/v2/locality_scorer.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/set.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
using namespace strings;
|
||||
using namespace search;
|
||||
using namespace search::v2;
|
||||
|
||||
namespace
|
||||
{
|
||||
void InitParams(string const & query, SearchQueryParams & params)
|
||||
{
|
||||
params.m_tokens.clear();
|
||||
params.m_prefixTokens.clear();
|
||||
|
||||
vector<UniString> tokens;
|
||||
|
||||
Delimiters delims;
|
||||
SplitUniString(NormalizeAndSimplifyString(query), MakeBackInsertFunctor(tokens), delims);
|
||||
for (auto const & token : tokens)
|
||||
params.m_tokens.push_back({token});
|
||||
}
|
||||
|
||||
void AddLocality(string const & name, uint32_t featureId, SearchQueryParams & params,
|
||||
vector<Geocoder::Locality> & localities)
|
||||
{
|
||||
set<UniString> tokens;
|
||||
|
||||
Delimiters delims;
|
||||
SplitUniString(NormalizeAndSimplifyString(name), [&tokens](UniString const & token)
|
||||
{
|
||||
tokens.insert(token);
|
||||
},
|
||||
delims);
|
||||
|
||||
for (size_t startToken = 0; startToken < params.m_tokens.size(); ++startToken)
|
||||
{
|
||||
for (size_t endToken = startToken + 1; endToken <= params.m_tokens.size(); ++endToken)
|
||||
{
|
||||
bool matches = true;
|
||||
for (size_t i = startToken; i != endToken && matches; ++i)
|
||||
{
|
||||
if (tokens.count(params.m_tokens[i].front()) == 0)
|
||||
matches = false;
|
||||
}
|
||||
if (matches)
|
||||
localities.emplace_back(featureId, startToken, endToken);
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
UNIT_TEST(LocalityScorer_Smoke)
|
||||
{
|
||||
enum
|
||||
{
|
||||
ID_NEW_ORLEANS,
|
||||
ID_YORK,
|
||||
ID_NEW_YORK,
|
||||
};
|
||||
|
||||
SearchQueryParams params;
|
||||
InitParams("New York Time Square", params);
|
||||
|
||||
vector<Geocoder::Locality> localities;
|
||||
|
||||
AddLocality("New Orleans", ID_NEW_ORLEANS, params, localities);
|
||||
AddLocality("York", ID_YORK, params, localities);
|
||||
AddLocality("New York", ID_NEW_YORK, params, localities);
|
||||
|
||||
LocalityScorer scorer(DummyRankTable(), params);
|
||||
scorer.LeaveTopLocalities(100 /* limit */, localities);
|
||||
|
||||
TEST_EQUAL(3, localities.size(), ());
|
||||
|
||||
TEST_EQUAL(localities[0].m_featureId, ID_NEW_ORLEANS, ());
|
||||
TEST_EQUAL(localities[1].m_featureId, ID_YORK, ());
|
||||
TEST_EQUAL(localities[2].m_featureId, ID_NEW_YORK, ());
|
||||
|
||||
// New York is the best matching locality
|
||||
scorer.LeaveTopLocalities(1 /* limit */, localities);
|
||||
TEST_EQUAL(1, localities.size(), ());
|
||||
TEST_EQUAL(localities[0].m_featureId, ID_NEW_YORK, ());
|
||||
}
|
||||
|
||||
UNIT_TEST(LocalityScorer_NumbersMatching)
|
||||
{
|
||||
enum
|
||||
{
|
||||
ID_MARCH,
|
||||
ID_APRIL,
|
||||
ID_MAY,
|
||||
ID_TVER
|
||||
};
|
||||
|
||||
SearchQueryParams params;
|
||||
InitParams("тверь советская 1", params);
|
||||
|
||||
vector<Geocoder::Locality> localities;
|
||||
|
||||
AddLocality("поселок 1 марта", ID_MARCH, params, localities);
|
||||
AddLocality("поселок 1 апреля", ID_APRIL, params, localities);
|
||||
AddLocality("поселок 1 мая", ID_MAY, params, localities);
|
||||
AddLocality("тверь", ID_TVER, params, localities);
|
||||
|
||||
LocalityScorer scorer(DummyRankTable(), params);
|
||||
scorer.LeaveTopLocalities(100 /* limit */, localities);
|
||||
|
||||
TEST_EQUAL(4, localities.size(), ());
|
||||
|
||||
TEST_EQUAL(localities[0].m_featureId, ID_MARCH, ());
|
||||
TEST_EQUAL(localities[1].m_featureId, ID_APRIL, ());
|
||||
TEST_EQUAL(localities[2].m_featureId, ID_MAY, ());
|
||||
TEST_EQUAL(localities[3].m_featureId, ID_TVER, ());
|
||||
|
||||
// Tver is the best matching locality, as other localities were
|
||||
// matched by number.
|
||||
scorer.LeaveTopLocalities(1 /* limit */, localities);
|
||||
TEST_EQUAL(1, localities.size(), ());
|
||||
TEST_EQUAL(localities[0].m_featureId, ID_TVER, ());
|
||||
}
|
|
@ -25,6 +25,7 @@ SOURCES += \
|
|||
keyword_matcher_test.cpp \
|
||||
latlon_match_test.cpp \
|
||||
locality_finder_test.cpp \
|
||||
locality_scorer_test.cpp \
|
||||
query_saver_tests.cpp \
|
||||
search_string_utils_test.cpp \
|
||||
string_intersection_test.cpp \
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#include "search/search_string_utils.hpp"
|
||||
#include "search/v2/cbv_ptr.hpp"
|
||||
#include "search/v2/features_layer_matcher.hpp"
|
||||
#include "search/v2/locality_scorer.hpp"
|
||||
|
||||
#include "indexer/classificator.hpp"
|
||||
#include "indexer/feature_decl.hpp"
|
||||
|
@ -33,6 +34,7 @@
|
|||
#include "std/algorithm.hpp"
|
||||
#include "std/bind.hpp"
|
||||
#include "std/iterator.hpp"
|
||||
#include "std/sstream.hpp"
|
||||
#include "std/target_os.hpp"
|
||||
#include "std/transform_iterator.hpp"
|
||||
|
||||
|
@ -95,23 +97,47 @@ struct ScopedMarkTokens
|
|||
size_t const m_to;
|
||||
};
|
||||
|
||||
struct LazyRankTable
|
||||
class LazyRankTable : public RankTable
|
||||
{
|
||||
public:
|
||||
LazyRankTable(MwmValue const & value) : m_value(value) {}
|
||||
|
||||
uint8_t Get(uint64_t i)
|
||||
uint8_t Get(uint64_t i) const override
|
||||
{
|
||||
if (!m_table)
|
||||
{
|
||||
m_table = search::RankTable::Load(m_value.m_cont);
|
||||
if (!m_table)
|
||||
m_table = make_unique<search::DummyRankTable>();
|
||||
}
|
||||
EnsureTableLoaded();
|
||||
return m_table->Get(i);
|
||||
}
|
||||
|
||||
uint64_t Size() const override
|
||||
{
|
||||
EnsureTableLoaded();
|
||||
return m_table->Size();
|
||||
}
|
||||
|
||||
RankTable::Version GetVersion() const override
|
||||
{
|
||||
EnsureTableLoaded();
|
||||
return m_table->GetVersion();
|
||||
}
|
||||
|
||||
void Serialize(Writer & writer, bool preserveHostEndiannes) override
|
||||
{
|
||||
EnsureTableLoaded();
|
||||
m_table->Serialize(writer, preserveHostEndiannes);
|
||||
}
|
||||
|
||||
private:
|
||||
void EnsureTableLoaded() const
|
||||
{
|
||||
if (m_table)
|
||||
return;
|
||||
m_table = search::RankTable::Load(m_value.m_cont);
|
||||
if (!m_table)
|
||||
m_table = make_unique<search::DummyRankTable>();
|
||||
}
|
||||
|
||||
MwmValue const & m_value;
|
||||
unique_ptr<search::RankTable> m_table;
|
||||
mutable unique_ptr<search::RankTable> m_table;
|
||||
};
|
||||
|
||||
class StreetCategories
|
||||
|
@ -624,7 +650,6 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
|
|||
l.m_endToken = endToken;
|
||||
preLocalities.push_back(l);
|
||||
});
|
||||
|
||||
if (endToken < m_numTokens)
|
||||
{
|
||||
intersection.Intersect(m_addressFeatures[endToken].get());
|
||||
|
@ -634,51 +659,9 @@ void Geocoder::FillLocalityCandidates(coding::CompressedBitVector const * filter
|
|||
}
|
||||
}
|
||||
|
||||
auto const tokensCountFn = [&](Locality const & l)
|
||||
{
|
||||
// Important! Prefix match costs 1 while token match costs 2 for locality comparison.
|
||||
size_t d = 2 * (l.m_endToken - l.m_startToken);
|
||||
ASSERT_GREATER(d, 0, ());
|
||||
if (l.m_endToken == m_numTokens && !m_params.m_prefixTokens.empty())
|
||||
d -= 1;
|
||||
return d;
|
||||
};
|
||||
|
||||
// Unique preLocalities with featureId but leave the longest range if equal.
|
||||
sort(preLocalities.begin(), preLocalities.end(),
|
||||
[&](Locality const & l1, Locality const & l2)
|
||||
{
|
||||
if (l1.m_featureId != l2.m_featureId)
|
||||
return l1.m_featureId < l2.m_featureId;
|
||||
return tokensCountFn(l1) > tokensCountFn(l2);
|
||||
});
|
||||
|
||||
preLocalities.erase(unique(preLocalities.begin(), preLocalities.end(),
|
||||
[](Locality const & l1, Locality const & l2)
|
||||
{
|
||||
return l1.m_featureId == l2.m_featureId;
|
||||
}),
|
||||
preLocalities.end());
|
||||
|
||||
LazyRankTable rankTable(m_context->m_value);
|
||||
|
||||
// Leave the most popular localities.
|
||||
if (preLocalities.size() > maxNumLocalities)
|
||||
{
|
||||
/// @todo Calculate match costs according to the exact locality name
|
||||
/// (for 'york' query "york city" is better than "new york").
|
||||
|
||||
sort(preLocalities.begin(), preLocalities.end(),
|
||||
[&](Locality const & l1, Locality const & l2)
|
||||
{
|
||||
auto const d1 = tokensCountFn(l1);
|
||||
auto const d2 = tokensCountFn(l2);
|
||||
if (d1 != d2)
|
||||
return d1 > d2;
|
||||
return rankTable.Get(l1.m_featureId) > rankTable.Get(l2.m_featureId);
|
||||
});
|
||||
preLocalities.resize(maxNumLocalities);
|
||||
}
|
||||
LocalityScorer scorer(rankTable, m_params);
|
||||
scorer.LeaveTopLocalities(maxNumLocalities, preLocalities);
|
||||
}
|
||||
|
||||
void Geocoder::FillLocalitiesTable()
|
||||
|
@ -999,11 +982,8 @@ void Geocoder::GreedilyMatchStreets()
|
|||
if (IsStreetSynonym(token))
|
||||
continue;
|
||||
|
||||
if (feature::IsHouseNumber(token) &&
|
||||
!coding::CompressedBitVector::IsEmpty(allFeatures))
|
||||
{
|
||||
if (feature::IsHouseNumber(token))
|
||||
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
|
||||
}
|
||||
|
||||
unique_ptr<coding::CompressedBitVector> buffer;
|
||||
if (startToken == curToken || coding::CompressedBitVector::IsEmpty(allFeatures))
|
||||
|
@ -1017,9 +997,6 @@ void Geocoder::GreedilyMatchStreets()
|
|||
allFeatures.swap(buffer);
|
||||
}
|
||||
|
||||
if (coding::CompressedBitVector::IsEmpty(allFeatures))
|
||||
continue;
|
||||
|
||||
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
|
||||
}
|
||||
}
|
||||
|
@ -1328,5 +1305,13 @@ bool Geocoder::HasUsedTokensInRange(size_t from, size_t to) const
|
|||
{
|
||||
return any_of(m_usedTokens.begin() + from, m_usedTokens.begin() + to, Id<bool>());
|
||||
}
|
||||
|
||||
string DebugPrint(Geocoder::Locality const & locality)
|
||||
{
|
||||
ostringstream os;
|
||||
os << "Locality [" << DebugPrint(locality.m_countryId) << ", " << locality.m_featureId << ", "
|
||||
<< locality.m_startToken << ", " << locality.m_endToken << "]";
|
||||
return os.str();
|
||||
}
|
||||
} // namespace v2
|
||||
} // namespace search
|
||||
|
|
|
@ -75,21 +75,6 @@ public:
|
|||
size_t m_maxNumResults;
|
||||
};
|
||||
|
||||
Geocoder(Index & index, storage::CountryInfoGetter const & infoGetter);
|
||||
|
||||
~Geocoder() override;
|
||||
|
||||
// Sets search query params.
|
||||
void SetParams(Params const & params);
|
||||
|
||||
// Starts geocoding, retrieved features will be appended to
|
||||
// |results|.
|
||||
void GoEverywhere(vector<FeatureID> & results);
|
||||
void GoInViewport(vector<FeatureID> & results);
|
||||
|
||||
void ClearCaches();
|
||||
|
||||
private:
|
||||
enum RegionType
|
||||
{
|
||||
REGION_TYPE_STATE,
|
||||
|
@ -97,14 +82,19 @@ private:
|
|||
REGION_TYPE_COUNT
|
||||
};
|
||||
|
||||
void GoImpl(vector<shared_ptr<MwmInfo>> & infos, bool inViewport);
|
||||
|
||||
struct Locality
|
||||
{
|
||||
Locality() : m_featureId(0), m_startToken(0), m_endToken(0) {}
|
||||
|
||||
Locality(uint32_t featureId, size_t startToken, size_t endToken)
|
||||
: m_featureId(featureId), m_startToken(startToken), m_endToken(endToken)
|
||||
{
|
||||
}
|
||||
|
||||
MwmSet::MwmId m_countryId;
|
||||
uint32_t m_featureId = 0;
|
||||
size_t m_startToken = 0;
|
||||
size_t m_endToken = 0;
|
||||
uint32_t m_featureId;
|
||||
size_t m_startToken;
|
||||
size_t m_endToken;
|
||||
};
|
||||
|
||||
// This struct represents a country or US- or Canadian- state. It
|
||||
|
@ -131,6 +121,23 @@ private:
|
|||
m2::RectD m_rect;
|
||||
};
|
||||
|
||||
Geocoder(Index & index, storage::CountryInfoGetter const & infoGetter);
|
||||
|
||||
~Geocoder() override;
|
||||
|
||||
// Sets search query params.
|
||||
void SetParams(Params const & params);
|
||||
|
||||
// Starts geocoding, retrieved features will be appended to
|
||||
// |results|.
|
||||
void GoEverywhere(vector<FeatureID> & results);
|
||||
void GoInViewport(vector<FeatureID> & results);
|
||||
|
||||
void ClearCaches();
|
||||
|
||||
private:
|
||||
void GoImpl(vector<shared_ptr<MwmInfo>> & infos, bool inViewport);
|
||||
|
||||
template <typename TLocality>
|
||||
using TLocalitiesCache = map<pair<size_t, size_t>, vector<TLocality>>;
|
||||
|
||||
|
@ -295,5 +302,7 @@ private:
|
|||
// Non-owning pointer to a vector of results.
|
||||
vector<FeatureID> * m_results;
|
||||
};
|
||||
|
||||
string DebugPrint(Geocoder::Locality const & locality);
|
||||
} // namespace v2
|
||||
} // namespace search
|
||||
|
|
87
search/v2/locality_scorer.cpp
Normal file
87
search/v2/locality_scorer.cpp
Normal file
|
@ -0,0 +1,87 @@
|
|||
#include "search/v2/locality_scorer.hpp"
|
||||
|
||||
#include "search/dummy_rank_table.hpp"
|
||||
#include "search/search_query_params.hpp"
|
||||
#include "search/v2/mwm_context.hpp"
|
||||
|
||||
#include "indexer/feature_impl.hpp"
|
||||
#include "indexer/index.hpp"
|
||||
#include "indexer/rank_table.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/unique_ptr.hpp"
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace v2
|
||||
{
|
||||
LocalityScorer::LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params)
|
||||
: m_rankTable(rankTable), m_params(params)
|
||||
{
|
||||
}
|
||||
|
||||
void LocalityScorer::LeaveTopLocalities(size_t const limit,
|
||||
vector<Geocoder::Locality> & localities) const
|
||||
{
|
||||
// Unique localities by featureId but leave the longest range if equal.
|
||||
sort(localities.begin(), localities.end(), [&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
|
||||
{
|
||||
if (lhs.m_featureId != rhs.m_featureId)
|
||||
return lhs.m_featureId < rhs.m_featureId;
|
||||
return GetTokensScore(lhs) > GetTokensScore(rhs);
|
||||
});
|
||||
localities.erase(unique(localities.begin(), localities.end(),
|
||||
[](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
|
||||
{
|
||||
return lhs.m_featureId == rhs.m_featureId;
|
||||
}),
|
||||
localities.end());
|
||||
|
||||
// Leave the most popular localities.
|
||||
if (localities.size() > limit)
|
||||
{
|
||||
/// @todo Calculate match costs according to the exact locality name
|
||||
/// (for 'york' query "york city" is better than "new york").
|
||||
sort(localities.begin(), localities.end(),
|
||||
[&](Geocoder::Locality const & lhs, Geocoder::Locality const & rhs)
|
||||
{
|
||||
auto const ls = GetTokensScore(lhs);
|
||||
auto const rs = GetTokensScore(rhs);
|
||||
if (ls != rs)
|
||||
return ls > rs;
|
||||
return m_rankTable.Get(lhs.m_featureId) > m_rankTable.Get(rhs.m_featureId);
|
||||
});
|
||||
localities.resize(limit);
|
||||
}
|
||||
}
|
||||
|
||||
size_t LocalityScorer::GetTokensScore(Geocoder::Locality const & locality) const
|
||||
{
|
||||
// *NOTE*
|
||||
// * full token match costs 2
|
||||
// * prefix match costs 1
|
||||
//
|
||||
// If locality is matched only by a single integral token or by an
|
||||
// integral token + a prefix, overall score is reduced by one.
|
||||
//
|
||||
// TODO (@y, @m, @vng): consider to loop over all non-prefix
|
||||
// tokens and decrement overall score by one for each integral
|
||||
// token.
|
||||
size_t const numTokens = locality.m_endToken - locality.m_startToken;
|
||||
bool const prefixMatch = locality.m_endToken == m_params.m_tokens.size() + 1;
|
||||
|
||||
size_t score = 2 * numTokens;
|
||||
if (prefixMatch)
|
||||
--score;
|
||||
|
||||
if ((numTokens == 2 && prefixMatch) || (numTokens == 1 && !prefixMatch))
|
||||
{
|
||||
auto const & token = m_params.GetTokens(locality.m_startToken).front();
|
||||
if (feature::IsNumber(token))
|
||||
--score;
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
} // namespace v2
|
||||
} // namespace search
|
28
search/v2/locality_scorer.hpp
Normal file
28
search/v2/locality_scorer.hpp
Normal file
|
@ -0,0 +1,28 @@
|
|||
#pragma once
|
||||
|
||||
#include "search/v2/geocoder.hpp"
|
||||
|
||||
#include "std/vector.hpp"
|
||||
|
||||
namespace search
|
||||
{
|
||||
class RankTable;
|
||||
struct SearchQueryParams;
|
||||
|
||||
namespace v2
|
||||
{
|
||||
class LocalityScorer
|
||||
{
|
||||
public:
|
||||
LocalityScorer(RankTable const & rankTable, SearchQueryParams const & params);
|
||||
|
||||
void LeaveTopLocalities(size_t const limit, vector<Geocoder::Locality> & localities) const;
|
||||
|
||||
private:
|
||||
size_t GetTokensScore(Geocoder::Locality const & locality) const;
|
||||
|
||||
RankTable const & m_rankTable;
|
||||
SearchQueryParams const & m_params;
|
||||
};
|
||||
} // namespace v2
|
||||
} // namespace search
|
Loading…
Add table
Reference in a new issue