[search] Fixed discrepancy in tokens between geocoder and ranker.

This commit is contained in:
Yuri Gorshenin 2017-02-06 16:11:28 +03:00
parent d719020c9c
commit e09b14f438
6 changed files with 120 additions and 62 deletions

View file

@ -218,21 +218,6 @@ MwmSet::MwmHandle FindWorld(Index const & index, vector<shared_ptr<MwmInfo>> con
return handle;
}
// Builds a UniString from the characters of the NUL-terminated string |s|,
// i.e. from the range [s, s + strlen(s)).
UniString AsciiToUniString(char const * s)
{
  char const * const last = s + strlen(s);
  return UniString(s, last);
}
// Returns true when |s| is one of the hard-coded stop words.
bool IsStopWord(UniString const & s)
{
  /// @todo Collect all commonly used stop words and move this list
  /// into a shared module, search_string_utils.cpp for example.
  static set<UniString> const kStopWords = []
  {
    char const * const kWords[] = {"a", "de", "da", "la"};
    set<UniString> words;
    for (char const * word : kWords)
      words.insert(AsciiToUniString(word));
    return words;
  }();

  return kStopWords.find(s) != kStopWords.end();
}
// Returns the area of |rect| (SizeX * SizeY), or zero when |rect| is
// not valid.
double Area(m2::RectD const & rect)
{
  if (!rect.IsValid())
    return 0;
  return rect.SizeX() * rect.SizeY();
}
// Computes an average similarity between |rect| and |pivot|. By
@ -375,43 +360,6 @@ void Geocoder::SetParams(Params const & params)
{
m_params = params;
// Filter stop words.
if (m_params.GetNumTokens() > 1)
{
for (size_t i = 0; i < m_params.GetNumTokens();)
{
if (m_params.IsPrefixToken(i))
{
++i;
continue;
}
auto & token = m_params.GetToken(i);
if (IsStopWord(token.m_original))
{
m_params.RemoveToken(i);
}
else
{
my::EraseIf(token.m_synonyms, &IsStopWord);
++i;
}
}
// If all tokens are stop words - give up.
if (m_params.GetNumTokens() == 0)
m_params = params;
}
// Remove all category synonyms for streets, as they're extracted
// individually.
for (size_t i = 0; i < m_params.GetNumTokens(); ++i)
{
auto & token = m_params.GetToken(i);
if (IsStreetSynonym(token.m_original))
m_params.GetTypeIndices(i).clear();
}
m_tokenRequests.clear();
m_prefixTokenRequest.Clear();
for (size_t i = 0; i < m_params.GetNumTokens(); ++i)

View file

@ -129,6 +129,42 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res
alohalytics::LogEvent("searchEmitResultsAndCoords", stats);
GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {});
}
// Removes all full-token stop words from |params|, unless |params|
// consists of all such tokens.
void RemoveStopWordsIfNeeded(QueryParams & params)
{
  // First pass: count the non-prefix tokens whose original form is a
  // stop word.
  size_t stopWordCount = 0;
  for (size_t idx = 0; idx < params.GetNumTokens(); ++idx)
  {
    auto & candidate = params.GetToken(idx);
    if (params.IsPrefixToken(idx))
      continue;
    if (IsStopWord(candidate.m_original))
      ++stopWordCount;
  }

  // Nothing but stop words (or no tokens at all) - keep the query as is.
  if (stopWordCount == params.GetNumTokens())
    return;

  // Second pass: drop stop-word tokens and strip stop-word synonyms
  // from the tokens that stay. The position advances only when the
  // current token is kept, because RemoveToken() is applied at the
  // current position.
  size_t pos = 0;
  while (pos < params.GetNumTokens())
  {
    if (params.IsPrefixToken(pos))
    {
      // The prefix token is never treated as a stop word.
      ++pos;
      continue;
    }

    auto & current = params.GetToken(pos);
    if (IsStopWord(current.m_original))
    {
      params.RemoveToken(pos);
      continue;
    }

    my::EraseIf(current.m_synonyms, &IsStopWord);
    ++pos;
  }
}
} // namespace
// static
@ -642,6 +678,17 @@ void Processor::InitParams(QueryParams & params)
auto & langs = params.GetLangs();
for (int i = 0; i < LANG_COUNT; ++i)
langs.insert(GetLanguage(i));
RemoveStopWordsIfNeeded(params);
// Remove all type indices for streets, as they're considered
// individually.
for (size_t i = 0; i < params.GetNumTokens(); ++i)
{
auto & token = params.GetToken(i);
if (IsStreetSynonym(token.m_original))
params.GetTypeIndices(i).clear();
}
}
void Processor::InitGeocoder(Geocoder::Params & params)

View file

@ -188,7 +188,7 @@ class PreResult2Maker
if (!ft.GetName(lang, name))
continue;
vector<strings::UniString> tokens;
SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
PrepareStringForMatching(name, tokens);
UpdateNameScore(tokens, slice, info.m_nameScore);
UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore);

View file

@ -1,11 +1,18 @@
#include "search/ranking_utils.hpp"
#include "std/algorithm.hpp"
#include "std/transform_iterator.hpp"
#include <algorithm>
using namespace strings;
namespace search
{
namespace
{
// Converts the NUL-terminated string |s| to a UniString built from
// the character range [s, s + strlen(s)).
UniString AsciiToUniString(char const * s)
{
  size_t const length = strlen(s);
  return UniString(s, s + length);
}
} // namespace
namespace impl
{
bool FullMatch(QueryParams::Token const & token, UniString const & text)
@ -30,6 +37,29 @@ bool PrefixMatch(QueryParams::Token const & token, UniString const & text)
}
} // namespace impl
bool IsStopWord(UniString const & s)
{
  /// @todo Collect all commonly used stop words and move this list
  /// into a shared module, search_string_utils.cpp for example.
  static std::set<UniString> const kStopWords = []
  {
    char const * const kWords[] = {"a", "de", "da", "la"};
    std::set<UniString> words;
    for (char const * word : kWords)
      words.insert(AsciiToUniString(word));
    return words;
  }();

  return kStopWords.find(s) != kStopWords.end();
}
void PrepareStringForMatching(std::string const & name, std::vector<strings::UniString> & tokens)
{
auto filter = [&tokens](strings::UniString const & token)
{
if (!IsStopWord(token))
tokens.push_back(token);
};
SplitUniString(NormalizeAndSimplifyString(name), filter, Delimiters());
}
string DebugPrint(NameScore score)
{
switch (score)
@ -43,5 +73,4 @@ string DebugPrint(NameScore score)
}
return "Unknown";
}
} // namespace search

View file

@ -9,10 +9,10 @@
#include "base/stl_add.hpp"
#include "base/string_utils.hpp"
#include "std/cstdint.hpp"
#include "std/limits.hpp"
#include "std/string.hpp"
#include "std/vector.hpp"
#include <cstdint>
#include <limits>
#include <string>
#include <vector>
namespace search
{
@ -38,19 +38,25 @@ enum NameScore
NAME_SCORE_COUNT
};
// Returns true when |s| is a stop-word and may be removed from a query.
bool IsStopWord(strings::UniString const & s);
// Normalizes, simplifies and splits string, removes stop-words.
void PrepareStringForMatching(std::string const & name, std::vector<strings::UniString> & tokens);
// Convenience overload: passes |name| through NormalizeAndSimplifyString,
// splits it with SplitUniString using Delimiters(), and forwards the
// resulting tokens to the token-based GetNameScore overload.
// Returns NAME_SCORE_ZERO right away when |slice| is empty.
// NOTE(review): tokens are collected with MakeBackInsertFunctor, so no
// stop-word filtering happens here, unlike PrepareStringForMatching -
// confirm this is intended.
template <typename TSlice>
NameScore GetNameScore(string const & name, TSlice const & slice)
NameScore GetNameScore(std::string const & name, TSlice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;
vector<strings::UniString> tokens;
std::vector<strings::UniString> tokens;
SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
return GetNameScore(tokens, slice);
}
template <typename TSlice>
NameScore GetNameScore(vector<strings::UniString> const & tokens, TSlice const & slice)
NameScore GetNameScore(std::vector<strings::UniString> const & tokens, TSlice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;

View file

@ -805,5 +805,33 @@ UNIT_CLASS_TEST(ProcessorTest, SpacesInCategories)
TEST(ResultsMatch("Москва ночной клуб", "ru", rules), ());
}
}
// Checks that a query containing stop words ("la", "de") still yields
// the street "Rue de la Paix" as an exact match and that its ranking
// info reports NAME_SCORE_FULL_MATCH.
UNIT_CLASS_TEST(ProcessorTest, StopWords)
{
// Fixtures: a country and a city for the world map, and a street for
// the country map.
TestCountry country(m2::PointD(0, 0), "France", "en");
TestCity city(m2::PointD(0, 0), "Paris", "en", 100 /* rank */);
TestStreet street(
vector<m2::PointD>{m2::PointD(-0.001, -0.001), m2::PointD(0, 0), m2::PointD(0.001, 0.001)},
"Rue de la Paix", "en");
// The world gets the country and the city.
BuildWorld([&](TestMwmBuilder & builder) {
builder.Add(country);
builder.Add(city);
});
// The street goes into the country's own map; |id| is used below to
// build the expected-match rule.
auto id = BuildCountry(country.GetName(), [&](TestMwmBuilder & builder) { builder.Add(street); });
{
// NOTE(review): presumably "à" is simplified to the stop word "a"
// during normalization - verify against NormalizeAndSimplifyString.
auto request = MakeRequest("la France à Paris Rue de la Paix");
TRules rules = {ExactMatch(id, street)};
auto const & results = request->Results();
TEST(MatchResults(rules, results), ());
// results[0] is read only after the match check above.
auto const & info = results[0].GetRankingInfo();
TEST_EQUAL(info.m_nameScore, NAME_SCORE_FULL_MATCH, ());
}
}
} // namespace
} // namespace search