[search] Fixed a bug in the code for stop words.

This commit is contained in:
Maxim Pimenov 2017-11-30 14:11:23 +03:00 committed by Vladimir Byko-Ianko
parent bbb6a7d8a2
commit 4aea738cd1
13 changed files with 112 additions and 89 deletions

View file

@ -403,12 +403,12 @@ public:
insert(where, &value, &value + 1);
}
template <class TFn>
void erase_if(TFn fn)
template <class Fn>
void erase_if(Fn && fn)
{
iterator b = begin();
iterator e = end();
iterator i = std::remove_if(b, e, fn);
iterator i = std::remove_if(b, e, std::forward<Fn>(fn));
if (i != e)
resize(std::distance(b, i));
}

View file

@ -178,7 +178,8 @@ struct FeatureNameInserter
});
}
int const maxTokensCount = search::MAX_TOKENS - 1;
CHECK_GREATER(search::kMaxNumTokens, 0, ());
size_t const maxTokensCount = search::kMaxNumTokens - 1;
if (tokens.size() > maxTokensCount)
{
LOG(LWARNING, ("Name has too many tokens:", name));

View file

@ -6,6 +6,8 @@
#include "base/small_set.hpp"
#include "base/string_utils.hpp"
#include <cstddef>
namespace search
{
// The prefix is stored separately.
@ -17,6 +19,6 @@ using Locales =
::base::SafeSmallSet<static_cast<uint64_t>(CategoriesHolder::kMaxSupportedLocaleIndex) + 1>;
/// Upper bound for max count of tokens for indexing and scoring.
int constexpr MAX_TOKENS = 32;
int constexpr MAX_SUGGESTS_COUNT = 5;
size_t constexpr kMaxNumTokens = 32;
size_t constexpr kMaxNumSuggests = 5;
} // namespace search

View file

@ -39,7 +39,7 @@ KeywordMatcher::Score KeywordMatcher::CalcScore(string const & name) const
KeywordMatcher::Score KeywordMatcher::CalcScore(strings::UniString const & name) const
{
buffer_vector<strings::UniString, MAX_TOKENS> tokens;
buffer_vector<strings::UniString, kMaxNumTokens> tokens;
SplitUniString(name, MakeBackInsertFunctor(tokens), Delimiters());
return CalcScore(tokens.data(), tokens.size());
@ -49,7 +49,7 @@ KeywordMatcher::Score KeywordMatcher::CalcScore(strings::UniString const * token
size_t count) const
{
// Some names can have too many tokens. Trim them.
count = min(count, static_cast<size_t>(MAX_TOKENS));
count = min(count, kMaxNumTokens);
vector<bool> isQueryTokenMatched(m_keywords.size());
vector<bool> isNameTokenMatched(count);
@ -103,7 +103,7 @@ KeywordMatcher::Score KeywordMatcher::CalcScore(strings::UniString const * token
for (size_t i = 0; i < count; ++i)
{
if (isNameTokenMatched[i])
score.m_nameTokensMatched |= (1 << (MAX_TOKENS-1 - i));
score.m_nameTokensMatched |= (1 << (kMaxNumTokens - 1 - i));
score.m_nameTokensLength += tokens[i].size();
}
@ -164,7 +164,7 @@ string DebugPrint(KeywordMatcher::Score const & score)
out << ",nQTM=" << static_cast<int>(score.m_numQueryTokensAndPrefixMatched);
out << ",PM=" << score.m_prefixMatched;
out << ",NTM=";
for (int i = MAX_TOKENS-1; i >= 0; --i)
for (int i = static_cast<int>(kMaxNumTokens) - 1; i >= 0; --i)
out << ((score.m_nameTokensMatched >> i) & 1);
out << ",STMD=" << score.m_sumTokenMatchDistance;
out << ")";

View file

@ -118,40 +118,21 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res
GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {});
}
// Removes all full-token stop words from |params|.
// Does nothing if all tokens in |params| are non-prefix stop words.
void RemoveStopWordsIfNeeded(QueryParams & params)
// Removes all full-token stop words from |tokens|.
// Does nothing if all tokens are stop words and the prefix is empty.
void RemoveStopWordsIfNeeded(QueryTokens & tokens, strings::UniString & prefix)
{
size_t numStopWords = 0;
for (size_t i = 0; i < params.GetNumTokens(); ++i)
for (auto const & token : tokens)
{
auto & token = params.GetToken(i);
if (!params.IsPrefixToken(i) && IsStopWord(token.m_original))
if (IsStopWord(token))
++numStopWords;
}
if (numStopWords == params.GetNumTokens())
if (numStopWords == tokens.size() && prefix.empty())
return;
for (size_t i = 0; i < params.GetNumTokens();)
{
if (params.IsPrefixToken(i))
{
++i;
continue;
}
auto & token = params.GetToken(i);
if (IsStopWord(token.m_original))
{
params.RemoveToken(i);
}
else
{
my::EraseIf(token.m_synonyms, &IsStopWord);
++i;
}
}
tokens.erase_if(&IsStopWord);
}
} // namespace
@ -271,16 +252,23 @@ void Processor::SetQuery(string const & query)
}
}
// Assign prefix with last parsed token.
if (!m_tokens.empty() && !delims(strings::LastUniChar(query)))
size_t const maxTokensCount = kMaxNumTokens - 1;
ASSERT_GREATER(maxTokensCount, 0, ());
if (m_tokens.size() > maxTokensCount)
{
m_prefix.swap(m_tokens.back());
m_tokens.pop_back();
m_tokens.resize(maxTokensCount);
}
else
{
// Assign the last parsed token to prefix.
if (!m_tokens.empty() && !delims(strings::LastUniChar(query)))
{
m_prefix.swap(m_tokens.back());
m_tokens.pop_back();
}
}
int const maxTokensCount = MAX_TOKENS - 1;
if (m_tokens.size() > maxTokensCount)
m_tokens.resize(maxTokensCount);
RemoveStopWordsIfNeeded(m_tokens, m_prefix);
// Assign tokens and prefix to scorer.
m_keywordsScorer.SetKeywords(m_tokens.data(), m_tokens.size(), m_prefix);
@ -440,11 +428,9 @@ void Processor::InitParams(QueryParams & params)
else
params.InitWithPrefix(m_tokens.begin(), m_tokens.end(), m_prefix);
RemoveStopWordsIfNeeded(params);
// Add names of categories (and synonyms).
Classificator const & c = classif();
auto addSynonyms = [&](size_t i, uint32_t t) {
auto addCategorySynonyms = [&](size_t i, uint32_t t) {
uint32_t const index = c.GetIndexForType(t);
params.GetTypeIndices(i).push_back(index);
};
@ -454,12 +440,12 @@ void Processor::InitParams(QueryParams & params)
params.SetCategorialRequest(isCategorialRequest);
if (isCategorialRequest)
{
ForEachCategoryType(tokenSlice, addSynonyms);
ForEachCategoryType(tokenSlice, addCategorySynonyms);
}
else
{
// todo(@m, @y). Shall we match prefix tokens for categories?
ForEachCategoryTypeFuzzy(tokenSlice, addSynonyms);
ForEachCategoryTypeFuzzy(tokenSlice, addCategorySynonyms);
}
// Remove all type indices for streets, as they're considered

View file

@ -1,10 +1,13 @@
#include "search/query_params.hpp"
#include "search/ranking_utils.hpp"
#include "search/token_range.hpp"
#include "indexer/feature_impl.hpp"
#include "std/algorithm.hpp"
#include <algorithm>
using namespace std;
namespace search
{
@ -48,6 +51,27 @@ private:
};
} // namespace
// QueryParams::Token ------------------------------------------------------------------------------
// Converts |s| to a UniString and delegates to the UniString overload,
// which filters out stop words before storing the synonym.
void QueryParams::Token::AddSynonym(std::string const & s)
{
  AddSynonym(strings::MakeUniString(s));
}
// Stores |s| as a synonym of the original token. Stop words are skipped:
// they carry no matching value, so keeping them as synonyms is pointless.
void QueryParams::Token::AddSynonym(String const & s)
{
  if (IsStopWord(s))
    return;
  m_synonyms.push_back(s);
}
// Renders a Token (original string plus its synonyms) for debug logging.
string DebugPrint(QueryParams::Token const & token)
{
  ostringstream oss;
  oss << "Token [ m_original=" << DebugPrint(token.m_original) << ", m_synonyms="
      << DebugPrint(token.m_synonyms) << " ]";
  return oss.str();
}
// QueryParams -------------------------------------------------------------------------------------
void QueryParams::Clear()
{
m_tokens.clear();
@ -138,12 +162,4 @@ string DebugPrint(QueryParams const & params)
<< ", m_langs=" << DebugPrint(params.m_langs) << " ]";
return os.str();
}
string DebugPrint(QueryParams::Token const & token)
{
ostringstream os;
os << "Token [ m_original=" << DebugPrint(token.m_original)
<< ", m_synonyms=" << DebugPrint(token.m_synonyms) << " ]";
return os.str();
}
} // namespace search

View file

@ -8,11 +8,10 @@
#include "base/small_set.hpp"
#include "base/string_utils.hpp"
#include "std/cstdint.hpp"
#include "std/type_traits.hpp"
#include "std/unordered_set.hpp"
#include "std/utility.hpp"
#include "std/vector.hpp"
#include <cstdint>
#include <string>
#include <type_traits>
#include <vector>
namespace search
{
@ -22,7 +21,7 @@ class QueryParams
{
public:
using String = strings::UniString;
using TypeIndices = vector<uint32_t>;
using TypeIndices = std::vector<uint32_t>;
using Langs = ::base::SafeSmallSet<StringUtf8Multilang::kMaxSupportedLanguages>;
struct Token
@ -30,8 +29,8 @@ public:
Token() = default;
Token(String const & original) : m_original(original) {}
void AddSynonym(String const & s) { m_synonyms.push_back(s); }
void AddSynonym(string const & s) { m_synonyms.push_back(strings::MakeUniString(s)); }
void AddSynonym(std::string const & s);
void AddSynonym(String const & s);
// Calls |fn| on the original token and on synonyms.
template <typename Fn>
@ -39,7 +38,7 @@ public:
Fn && fn) const
{
fn(m_original);
for_each(m_synonyms.begin(), m_synonyms.end(), forward<Fn>(fn));
for_each(m_synonyms.begin(), m_synonyms.end(), std::forward<Fn>(fn));
}
// Calls |fn| on the original token and on synonyms until |fn| return false.
@ -63,7 +62,7 @@ public:
}
String m_original;
vector<String> m_synonyms;
std::vector<String> m_synonyms;
};
QueryParams() = default;
@ -82,7 +81,7 @@ public:
{
Clear();
for (; tokenBegin != tokenEnd; ++tokenBegin)
m_tokens.push_back(*tokenBegin);
m_tokens.emplace_back(*tokenBegin);
m_prefixToken.m_original = prefix;
m_hasPrefix = true;
m_typeIndices.resize(GetNumTokens());
@ -121,18 +120,18 @@ public:
inline int GetScale() const { return m_scale; }
private:
friend string DebugPrint(QueryParams const & params);
friend std::string DebugPrint(QueryParams const & params);
vector<Token> m_tokens;
std::vector<Token> m_tokens;
Token m_prefixToken;
bool m_hasPrefix = false;
bool m_isCategorialRequest = false;
vector<TypeIndices> m_typeIndices;
std::vector<TypeIndices> m_typeIndices;
Langs m_langs;
int m_scale = scales::GetUpperScale();
};
string DebugPrint(QueryParams::Token const & token);
std::string DebugPrint(QueryParams::Token const & token);
} // namespace search

View file

@ -550,7 +550,7 @@ void Ranker::ProcessSuggestions(vector<RankerResult> & vec) const
if (m_params.m_prefix.empty() || !m_params.m_suggestsEnabled)
return;
int added = 0;
size_t added = 0;
for (auto i = vec.begin(); i != vec.end();)
{
RankerResult const & r = *i;
@ -560,12 +560,14 @@ void Ranker::ProcessSuggestions(vector<RankerResult> & vec) const
{
string suggestion;
GetSuggestion(r, m_params.m_query, m_params.m_tokens, m_params.m_prefix, suggestion);
if (!suggestion.empty() && added < MAX_SUGGESTS_COUNT)
if (!suggestion.empty() && added < kMaxNumSuggests)
{
// todo(@m) RankingInfo is lost here. Should it be?
if (m_emitter.AddResult(Result(
MakeResult(r, false /* needAddress */, true /* needHighlighting */), suggestion)))
{
++added;
}
i = vec.erase(i);
continue;

View file

@ -114,7 +114,7 @@ bool IsStopWord(UniString const & s)
{
/// @todo Get all commonly used stop words and factor out this array into
/// search_string_utils.cpp module for example.
static char const * arr[] = {"a", "de", "da", "la"};
static char const * arr[] = {"a", "de", "da", "la", "le"};
static set<UniString> const kStopWords(
make_transform_iterator(arr, &AsciiToUniString),

View file

@ -162,7 +162,7 @@ bool Results::AddResult(Result && result)
if (result.IsSuggest())
{
if (distance(m_results.begin(), it) >= MAX_SUGGESTS_COUNT)
if (distance(m_results.begin(), it) >= kMaxNumSuggests)
return false;
for (auto i = m_results.begin(); i != it; ++i)

View file

@ -992,12 +992,18 @@ UNIT_CLASS_TEST(ProcessorTest, StopWords)
vector<m2::PointD>{m2::PointD(-0.001, -0.001), m2::PointD(0, 0), m2::PointD(0.001, 0.001)},
"Rue de la Paix", "en");
TestPOI bakery(m2::PointD(0.0, 0.0), "" /* name */, "en");
bakery.SetTypes({{"shop", "bakery"}});
BuildWorld([&](TestMwmBuilder & builder) {
builder.Add(country);
builder.Add(city);
});
auto id = BuildCountry(country.GetName(), [&](TestMwmBuilder & builder) { builder.Add(street); });
auto id = BuildCountry(country.GetName(), [&](TestMwmBuilder & builder) {
builder.Add(street);
builder.Add(bakery);
});
{
auto request = MakeRequest("la France à Paris Rue de la Paix");
@ -1010,6 +1016,18 @@ UNIT_CLASS_TEST(ProcessorTest, StopWords)
auto const & info = results[0].GetRankingInfo();
TEST_EQUAL(info.m_nameScore, NAME_SCORE_FULL_MATCH, ());
}
{
TRules rules = {ExactMatch(id, bakery)};
TEST(ResultsMatch("la boulangerie ", "fr", rules), ());
}
{
TEST(ResultsMatch("la motviderie ", "fr", TRules{}), ());
TEST(ResultsMatch("la la le la la la ", "fr", {ExactMatch(id, street)}), ());
TEST(ResultsMatch("la la le la la la", "fr", TRules{}), ());
}
}
UNIT_CLASS_TEST(ProcessorTest, Numerals)

View file

@ -16,7 +16,7 @@ namespace
{
using search::KeywordMatcher;
using search::MAX_TOKENS;
using search::kMaxNumTokens;
enum ExpectedMatchResult
{
@ -259,7 +259,7 @@ string GetManyTokens(string tokenPrefix, int tokenCount, bool countForward = tru
UNIT_TEST(KeywordMatcher_QueryTooLong)
{
for (int queryLength = MAX_TOKENS - 2; queryLength <= MAX_TOKENS + 2; ++queryLength)
for (int queryLength = kMaxNumTokens - 2; queryLength <= kMaxNumTokens + 2; ++queryLength)
{
string const query = GetManyTokens("Q", queryLength);
string const queryWithPrefix = query + " Prefix";
@ -292,11 +292,10 @@ UNIT_TEST(KeywordMatcher_QueryTooLong)
UNIT_TEST(KeywordMatcher_NameTooLong)
{
string const name[] =
{
"Aa Bb " + GetManyTokens("T", MAX_TOKENS + 1),
"Aa Bb " + GetManyTokens("T", MAX_TOKENS),
"Aa Bb " + GetManyTokens("T", MAX_TOKENS - 1),
string const name[] = {
"Aa Bb " + GetManyTokens("T", kMaxNumTokens + 1),
"Aa Bb " + GetManyTokens("T", kMaxNumTokens),
"Aa Bb " + GetManyTokens("T", kMaxNumTokens - 1),
};
KeywordMatcherTestCase const testCases[] =
@ -315,9 +314,9 @@ UNIT_TEST(KeywordMatcher_NameTooLong)
UNIT_TEST(KeywordMatcher_ManyTokensInReverseOrder)
{
string const query = GetManyTokens("Q", MAX_TOKENS);
string const name = GetManyTokens("Q", MAX_TOKENS);
string const reversedName = GetManyTokens("Q", MAX_TOKENS, false);
string const query = GetManyTokens("Q", kMaxNumTokens);
string const name = GetManyTokens("Q", kMaxNumTokens);
string const reversedName = GetManyTokens("Q", kMaxNumTokens, false);
KeywordMatcherTestCase const testCases[] =
{

View file

@ -19,8 +19,8 @@ public:
: m_begin(static_cast<uint8_t>(begin)), m_end(static_cast<uint8_t>(end))
{
ASSERT_LESS_OR_EQUAL(begin, MAX_TOKENS, ());
ASSERT_LESS_OR_EQUAL(end, MAX_TOKENS, ());
ASSERT_LESS_OR_EQUAL(begin, kMaxNumTokens, ());
ASSERT_LESS_OR_EQUAL(end, kMaxNumTokens, ());
ASSERT(IsValid(), (*this));
}