forked from organicmaps/organicmaps-tmp
[search] Fixed a bug in the code for stop words.
This commit is contained in:
parent bbb6a7d8a2
commit 4aea738cd1

13 changed files with 112 additions and 89 deletions
@@ -403,12 +403,12 @@ public:
     insert(where, &value, &value + 1);
   }

-  template <class TFn>
-  void erase_if(TFn fn)
+  template <class Fn>
+  void erase_if(Fn && fn)
   {
     iterator b = begin();
     iterator e = end();
-    iterator i = std::remove_if(b, e, fn);
+    iterator i = std::remove_if(b, e, std::forward<Fn>(fn));
     if (i != e)
       resize(std::distance(b, i));
   }

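The new overload takes its predicate by forwarding reference and passes it through std::forward, so stateful or move-only functors work without being copied. A minimal standalone sketch of the same erase-remove pattern, using std::vector<int> in place of the repo's buffer_vector (a stand-in assumed here just to keep the example runnable):

#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

// Sketch of the erase_if pattern above, on std::vector for brevity.
template <class Fn>
void erase_if(std::vector<int> & v, Fn && fn)
{
  auto const b = v.begin();
  auto const e = v.end();
  // remove_if shifts the kept elements to the front and returns the new end.
  auto const i = std::remove_if(b, e, std::forward<Fn>(fn));
  if (i != e)
    v.resize(std::distance(b, i));
}

int main()
{
  std::vector<int> v = {1, 2, 3, 4, 5};
  erase_if(v, [](int x) { return x % 2 == 0; });  // drop even values
  assert((v == std::vector<int>{1, 3, 5}));
  return 0;
}
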
@@ -178,7 +178,8 @@ struct FeatureNameInserter
     });
   }

-  int const maxTokensCount = search::MAX_TOKENS - 1;
+  CHECK_GREATER(search::kMaxNumTokens, 0, ());
+  size_t const maxTokensCount = search::kMaxNumTokens - 1;
   if (tokens.size() > maxTokensCount)
   {
     LOG(LWARNING, ("Name has too many tokens:", name));

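Why the new CHECK_GREATER: maxTokensCount is now a size_t, so if kMaxNumTokens were ever set to 0, kMaxNumTokens - 1 would silently wrap to SIZE_MAX and the "too many tokens" guard below would never fire. A tiny illustration of that unsigned-wrap hazard (a plain assert standing in for the project's CHECK macros):

#include <cassert>
#include <cstddef>
#include <cstdint>

int main()
{
  size_t const zero = 0;
  // Unsigned underflow: 0 - 1 wraps around to the maximum size_t value.
  size_t const wrapped = zero - 1;
  assert(wrapped == SIZE_MAX);
  return 0;
}
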
@@ -6,6 +6,8 @@
 #include "base/small_set.hpp"
 #include "base/string_utils.hpp"

+#include <cstddef>
+
 namespace search
 {
 // The prefix is stored separately.

@@ -17,6 +19,6 @@ using Locales =
     ::base::SafeSmallSet<static_cast<uint64_t>(CategoriesHolder::kMaxSupportedLocaleIndex) + 1>;

 /// Upper bound for max count of tokens for indexing and scoring.
-int constexpr MAX_TOKENS = 32;
-int constexpr MAX_SUGGESTS_COUNT = 5;
+size_t constexpr kMaxNumTokens = 32;
+size_t constexpr kMaxNumSuggests = 5;
 } // namespace search

@@ -39,7 +39,7 @@ KeywordMatcher::Score KeywordMatcher::CalcScore(string const & name) const

 KeywordMatcher::Score KeywordMatcher::CalcScore(strings::UniString const & name) const
 {
-  buffer_vector<strings::UniString, MAX_TOKENS> tokens;
+  buffer_vector<strings::UniString, kMaxNumTokens> tokens;
   SplitUniString(name, MakeBackInsertFunctor(tokens), Delimiters());

   return CalcScore(tokens.data(), tokens.size());

@@ -49,7 +49,7 @@ KeywordMatcher::Score KeywordMatcher::CalcScore(strings::UniString const * token
                                                 size_t count) const
 {
   // Some names can have too many tokens. Trim them.
-  count = min(count, static_cast<size_t>(MAX_TOKENS));
+  count = min(count, kMaxNumTokens);

   vector<bool> isQueryTokenMatched(m_keywords.size());
   vector<bool> isNameTokenMatched(count);

@@ -103,7 +103,7 @@ KeywordMatcher::Score KeywordMatcher::CalcScore(strings::UniString const * token
   for (size_t i = 0; i < count; ++i)
   {
     if (isNameTokenMatched[i])
-      score.m_nameTokensMatched |= (1 << (MAX_TOKENS-1 - i));
+      score.m_nameTokensMatched |= (1 << (kMaxNumTokens - 1 - i));
     score.m_nameTokensLength += tokens[i].size();
   }

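m_nameTokensMatched is used as a fixed-width bitmask here: bit kMaxNumTokens - 1 - i records whether name token i matched, with token 0 in the most significant position. That only works while kMaxNumTokens does not exceed the width of the mask's integer type; a quick sketch of the mapping (a 32-bit mask is assumed, consistent with kMaxNumTokens = 32):

#include <cassert>
#include <cstddef>
#include <cstdint>

size_t constexpr kMaxNumTokens = 32;

int main()
{
  uint32_t mask = 0;
  // Token 0 maps to the most significant bit, token 31 to the least.
  mask |= (uint32_t{1} << (kMaxNumTokens - 1 - 0));
  mask |= (uint32_t{1} << (kMaxNumTokens - 1 - 31));
  assert(mask == 0x80000001u);
  return 0;
}
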
@@ -164,7 +164,7 @@ string DebugPrint(KeywordMatcher::Score const & score)
   out << ",nQTM=" << static_cast<int>(score.m_numQueryTokensAndPrefixMatched);
   out << ",PM=" << score.m_prefixMatched;
   out << ",NTM=";
-  for (int i = MAX_TOKENS-1; i >= 0; --i)
+  for (int i = static_cast<int>(kMaxNumTokens) - 1; i >= 0; --i)
     out << ((score.m_nameTokensMatched >> i) & 1);
   out << ",STMD=" << score.m_sumTokenMatchDistance;
   out << ")";

@@ -118,40 +118,21 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res
   GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {});
 }

-// Removes all full-token stop words from |params|.
-// Does nothing if all tokens in |params| are non-prefix stop words.
-void RemoveStopWordsIfNeeded(QueryParams & params)
+// Removes all full-token stop words from |tokens|.
+// Does nothing if all tokens in |tokens| are stop words and |prefix| is empty.
+void RemoveStopWordsIfNeeded(QueryTokens & tokens, strings::UniString & prefix)
 {
   size_t numStopWords = 0;
-  for (size_t i = 0; i < params.GetNumTokens(); ++i)
+  for (auto const & token : tokens)
   {
-    auto & token = params.GetToken(i);
-    if (!params.IsPrefixToken(i) && IsStopWord(token.m_original))
+    if (IsStopWord(token))
       ++numStopWords;
   }

-  if (numStopWords == params.GetNumTokens())
+  if (numStopWords == tokens.size() && prefix.empty())
     return;

-  for (size_t i = 0; i < params.GetNumTokens();)
-  {
-    if (params.IsPrefixToken(i))
-    {
-      ++i;
-      continue;
-    }
-
-    auto & token = params.GetToken(i);
-    if (IsStopWord(token.m_original))
-    {
-      params.RemoveToken(i);
-    }
-    else
-    {
-      my::EraseIf(token.m_synonyms, &IsStopWord);
-      ++i;
-    }
-  }
+  tokens.erase_if(&IsStopWord);
 }
 } // namespace

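The rewritten helper is the heart of the fix: it now operates on the raw token list rather than on QueryParams, keeps the query intact when every complete token is a stop word and no prefix is pending (so an all-stop-word query still returns something), and otherwise erases stop words in one pass via the new erase_if. A standalone sketch of the same logic, with std::string and std::vector standing in for strings::UniString and QueryTokens (those stand-ins are assumptions to keep the example self-contained):

#include <algorithm>
#include <cassert>
#include <set>
#include <string>
#include <vector>

bool IsStopWord(std::string const & s)
{
  static std::set<std::string> const kStopWords = {"a", "de", "da", "la", "le"};
  return kStopWords.count(s) != 0;
}

// Mirrors the new RemoveStopWordsIfNeeded: if every complete token is a stop
// word and there is no prefix token, leave the query alone (otherwise the
// user would be left with an empty query); else drop the stop words.
void RemoveStopWordsIfNeeded(std::vector<std::string> & tokens, std::string & prefix)
{
  size_t numStopWords = 0;
  for (auto const & token : tokens)
  {
    if (IsStopWord(token))
      ++numStopWords;
  }

  if (numStopWords == tokens.size() && prefix.empty())
    return;

  tokens.erase(std::remove_if(tokens.begin(), tokens.end(), &IsStopWord), tokens.end());
}

int main()
{
  std::vector<std::string> tokens = {"rue", "de", "la", "paix"};
  std::string prefix;
  RemoveStopWordsIfNeeded(tokens, prefix);
  assert((tokens == std::vector<std::string>{"rue", "paix"}));

  // All-stop-word query with no pending prefix: kept as-is.
  std::vector<std::string> all = {"la", "le"};
  RemoveStopWordsIfNeeded(all, prefix);
  assert(all.size() == 2);
  return 0;
}
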
@@ -271,16 +252,23 @@ void Processor::SetQuery(string const & query)
     }
   }

-  // Assign prefix with last parsed token.
-  if (!m_tokens.empty() && !delims(strings::LastUniChar(query)))
+  size_t const maxTokensCount = kMaxNumTokens - 1;
+  ASSERT_GREATER(maxTokensCount, 0, ());
+  if (m_tokens.size() > maxTokensCount)
   {
-    m_prefix.swap(m_tokens.back());
-    m_tokens.pop_back();
+    m_tokens.resize(maxTokensCount);
+  }
+  else
+  {
+    // Assign the last parsed token to prefix.
+    if (!m_tokens.empty() && !delims(strings::LastUniChar(query)))
+    {
+      m_prefix.swap(m_tokens.back());
+      m_tokens.pop_back();
+    }
   }

-  int const maxTokensCount = MAX_TOKENS - 1;
-  if (m_tokens.size() > maxTokensCount)
-    m_tokens.resize(maxTokensCount);
+  RemoveStopWordsIfNeeded(m_tokens, m_prefix);

   // Assign tokens and prefix to scorer.
   m_keywordsScorer.SetKeywords(m_tokens.data(), m_tokens.size(), m_prefix);

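Two behavioral changes ride along here: an overly long query is now truncated before its last token can be promoted to a prefix (previously the prefix was split off first and truncation happened afterwards), and stop-word removal moves from InitParams into SetQuery, so the keyword scorer fed on the last line sees the same cleaned tokens as every later search pass. A compact sketch of the reordered branch (endsWithDelimiter is a hypothetical stand-in for the !delims(strings::LastUniChar(query)) check):

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

size_t constexpr kMaxNumTokens = 32;

// Sketch: when the query overflows, drop the tail and keep no prefix; only an
// untruncated query that does not end in a delimiter donates its last token
// as the prefix.
void SplitPrefix(std::vector<std::string> & tokens, std::string & prefix, bool endsWithDelimiter)
{
  size_t const maxTokensCount = kMaxNumTokens - 1;
  if (tokens.size() > maxTokensCount)
  {
    tokens.resize(maxTokensCount);
  }
  else if (!tokens.empty() && !endsWithDelimiter)
  {
    prefix.swap(tokens.back());
    tokens.pop_back();
  }
}

int main()
{
  std::vector<std::string> tokens = {"rue", "de", "la"};
  std::string prefix;
  SplitPrefix(tokens, prefix, false /* endsWithDelimiter */);
  assert(prefix == "la");
  assert(tokens.size() == 2);
  return 0;
}
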
@@ -440,11 +428,9 @@ void Processor::InitParams(QueryParams & params)
   else
     params.InitWithPrefix(m_tokens.begin(), m_tokens.end(), m_prefix);

-  RemoveStopWordsIfNeeded(params);
-
   // Add names of categories (and synonyms).
   Classificator const & c = classif();
-  auto addSynonyms = [&](size_t i, uint32_t t) {
+  auto addCategorySynonyms = [&](size_t i, uint32_t t) {
     uint32_t const index = c.GetIndexForType(t);
     params.GetTypeIndices(i).push_back(index);
   };

@@ -454,12 +440,12 @@ void Processor::InitParams(QueryParams & params)
   params.SetCategorialRequest(isCategorialRequest);
   if (isCategorialRequest)
   {
-    ForEachCategoryType(tokenSlice, addSynonyms);
+    ForEachCategoryType(tokenSlice, addCategorySynonyms);
   }
   else
   {
     // todo(@m, @y). Shall we match prefix tokens for categories?
-    ForEachCategoryTypeFuzzy(tokenSlice, addSynonyms);
+    ForEachCategoryTypeFuzzy(tokenSlice, addCategorySynonyms);
   }

   // Remove all type indices for streets, as they're considired

@@ -1,10 +1,13 @@
 #include "search/query_params.hpp"

+#include "search/ranking_utils.hpp"
 #include "search/token_range.hpp"

 #include "indexer/feature_impl.hpp"

-#include "std/algorithm.hpp"
+#include <algorithm>

+using namespace std;
+
 namespace search
 {

@@ -48,6 +51,27 @@ private:
 };
 } // namespace

+// QueryParams::Token ------------------------------------------------------------------------------
+void QueryParams::Token::AddSynonym(std::string const & s)
+{
+  AddSynonym(strings::MakeUniString(s));
+}
+
+void QueryParams::Token::AddSynonym(String const & s)
+{
+  if (!IsStopWord(s))
+    m_synonyms.push_back(s);
+}
+
+string DebugPrint(QueryParams::Token const & token)
+{
+  ostringstream os;
+  os << "Token [ m_original=" << DebugPrint(token.m_original)
+     << ", m_synonyms=" << DebugPrint(token.m_synonyms) << " ]";
+  return os.str();
+}
+
+// QueryParams -------------------------------------------------------------------------------------
 void QueryParams::Clear()
 {
   m_tokens.clear();

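Moving AddSynonym out of line also lets it filter stop words at the point of insertion, which is what makes the per-token my::EraseIf pass removed from the processor unnecessary: no caller can put a stop word into m_synonyms any more. A small sketch of that invariant (std::string and a local IsStopWord are stand-ins assumed for a self-contained example):

#include <cassert>
#include <set>
#include <string>
#include <vector>

using String = std::string;  // stand-in for strings::UniString

bool IsStopWord(String const & s)
{
  static std::set<String> const kStopWords = {"a", "de", "da", "la", "le"};
  return kStopWords.count(s) != 0;
}

struct Token
{
  // Synonyms that are stop words are silently dropped, mirroring the new
  // out-of-line QueryParams::Token::AddSynonym.
  void AddSynonym(String const & s)
  {
    if (!IsStopWord(s))
      m_synonyms.push_back(s);
  }

  String m_original;
  std::vector<String> m_synonyms;
};

int main()
{
  Token t;
  t.AddSynonym("bakery");
  t.AddSynonym("la");  // filtered out
  assert(t.m_synonyms.size() == 1);
  return 0;
}
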
@@ -138,12 +162,4 @@ string DebugPrint(QueryParams const & params)
      << ", m_langs=" << DebugPrint(params.m_langs) << " ]";
   return os.str();
 }
-
-string DebugPrint(QueryParams::Token const & token)
-{
-  ostringstream os;
-  os << "Token [ m_original=" << DebugPrint(token.m_original)
-     << ", m_synonyms=" << DebugPrint(token.m_synonyms) << " ]";
-  return os.str();
-}
 } // namespace search

@@ -8,11 +8,10 @@
 #include "base/small_set.hpp"
 #include "base/string_utils.hpp"

-#include "std/cstdint.hpp"
-#include "std/type_traits.hpp"
-#include "std/unordered_set.hpp"
-#include "std/utility.hpp"
-#include "std/vector.hpp"
+#include <cstdint>
+#include <string>
+#include <type_traits>
+#include <vector>

 namespace search
 {

@@ -22,7 +21,7 @@ class QueryParams
 {
 public:
   using String = strings::UniString;
-  using TypeIndices = vector<uint32_t>;
+  using TypeIndices = std::vector<uint32_t>;
   using Langs = ::base::SafeSmallSet<StringUtf8Multilang::kMaxSupportedLanguages>;

   struct Token

@@ -30,8 +29,8 @@ public:
     Token() = default;
     Token(String const & original) : m_original(original) {}

-    void AddSynonym(String const & s) { m_synonyms.push_back(s); }
-    void AddSynonym(string const & s) { m_synonyms.push_back(strings::MakeUniString(s)); }
+    void AddSynonym(std::string const & s);
+    void AddSynonym(String const & s);

     // Calls |fn| on the original token and on synonyms.
     template <typename Fn>

@@ -39,7 +38,7 @@ public:
                              Fn && fn) const
     {
       fn(m_original);
-      for_each(m_synonyms.begin(), m_synonyms.end(), forward<Fn>(fn));
+      for_each(m_synonyms.begin(), m_synonyms.end(), std::forward<Fn>(fn));
     }

     // Calls |fn| on the original token and on synonyms until |fn| return false.

@@ -63,7 +62,7 @@ public:
     }

     String m_original;
-    vector<String> m_synonyms;
+    std::vector<String> m_synonyms;
   };

   QueryParams() = default;

@@ -82,7 +81,7 @@ public:
   {
     Clear();
     for (; tokenBegin != tokenEnd; ++tokenBegin)
-      m_tokens.push_back(*tokenBegin);
+      m_tokens.emplace_back(*tokenBegin);
     m_prefixToken.m_original = prefix;
     m_hasPrefix = true;
     m_typeIndices.resize(GetNumTokens());

@@ -121,18 +120,18 @@ public:
   inline int GetScale() const { return m_scale; }

 private:
-  friend string DebugPrint(QueryParams const & params);
+  friend std::string DebugPrint(QueryParams const & params);

-  vector<Token> m_tokens;
+  std::vector<Token> m_tokens;
   Token m_prefixToken;
   bool m_hasPrefix = false;
   bool m_isCategorialRequest = false;

-  vector<TypeIndices> m_typeIndices;
+  std::vector<TypeIndices> m_typeIndices;

   Langs m_langs;
   int m_scale = scales::GetUpperScale();
 };

-string DebugPrint(QueryParams::Token const & token);
+std::string DebugPrint(QueryParams::Token const & token);
 } // namespace search

@@ -550,7 +550,7 @@ void Ranker::ProcessSuggestions(vector<RankerResult> & vec) const
   if (m_params.m_prefix.empty() || !m_params.m_suggestsEnabled)
     return;

-  int added = 0;
+  size_t added = 0;
   for (auto i = vec.begin(); i != vec.end();)
   {
     RankerResult const & r = *i;

@@ -560,12 +560,14 @@ void Ranker::ProcessSuggestions(vector<RankerResult> & vec) const
     {
       string suggestion;
       GetSuggestion(r, m_params.m_query, m_params.m_tokens, m_params.m_prefix, suggestion);
-      if (!suggestion.empty() && added < MAX_SUGGESTS_COUNT)
+      if (!suggestion.empty() && added < kMaxNumSuggests)
       {
         // todo(@m) RankingInfo is lost here. Should it be?
         if (m_emitter.AddResult(Result(
                 MakeResult(r, false /* needAddress */, true /* needHighlighting */), suggestion)))
+        {
           ++added;
+        }

         i = vec.erase(i);
         continue;

@@ -114,7 +114,7 @@ bool IsStopWord(UniString const & s)
 {
   /// @todo Get all common used stop words and factor out this array into
   /// search_string_utils.cpp module for example.
-  static char const * arr[] = {"a", "de", "da", "la"};
+  static char const * arr[] = {"a", "de", "da", "la", "le"};

   static set<UniString> const kStopWords(
       make_transform_iterator(arr, &AsciiToUniString),

@@ -162,7 +162,7 @@ bool Results::AddResult(Result && result)

   if (result.IsSuggest())
   {
-    if (distance(m_results.begin(), it) >= MAX_SUGGESTS_COUNT)
+    if (distance(m_results.begin(), it) >= kMaxNumSuggests)
       return false;

     for (auto i = m_results.begin(); i != it; ++i)

@@ -992,12 +992,18 @@ UNIT_CLASS_TEST(ProcessorTest, StopWords)
       vector<m2::PointD>{m2::PointD(-0.001, -0.001), m2::PointD(0, 0), m2::PointD(0.001, 0.001)},
       "Rue de la Paix", "en");

+  TestPOI bakery(m2::PointD(0.0, 0.0), "" /* name */, "en");
+  bakery.SetTypes({{"shop", "bakery"}});
+
   BuildWorld([&](TestMwmBuilder & builder) {
     builder.Add(country);
     builder.Add(city);
   });

-  auto id = BuildCountry(country.GetName(), [&](TestMwmBuilder & builder) { builder.Add(street); });
+  auto id = BuildCountry(country.GetName(), [&](TestMwmBuilder & builder) {
+    builder.Add(street);
+    builder.Add(bakery);
+  });

   {
     auto request = MakeRequest("la France à Paris Rue de la Paix");

@@ -1010,6 +1016,18 @@ UNIT_CLASS_TEST(ProcessorTest, StopWords)
     auto const & info = results[0].GetRankingInfo();
     TEST_EQUAL(info.m_nameScore, NAME_SCORE_FULL_MATCH, ());
   }
+
+  {
+    TRules rules = {ExactMatch(id, bakery)};
+
+    TEST(ResultsMatch("la boulangerie ", "fr", rules), ());
+  }
+
+  {
+    TEST(ResultsMatch("la motviderie ", "fr", TRules{}), ());
+    TEST(ResultsMatch("la la le la la la ", "fr", {ExactMatch(id, street)}), ());
+    TEST(ResultsMatch("la la le la la la", "fr", TRules{}), ());
+  }
 }

 UNIT_CLASS_TEST(ProcessorTest, Numerals)

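Read together, the new assertions pin down both sides of the fix: "la boulangerie " must still reach the bakery even though "la" is a stop word, and a query made up entirely of stop words behaves differently depending on whether a prefix is pending. With the trailing space there is no prefix token, so RemoveStopWordsIfNeeded leaves the tokens alone and the street "Rue de la Paix" can match; without the trailing space, "la" is a pending prefix, the stop-word tokens are erased, and nothing matches.
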
@@ -16,7 +16,7 @@ namespace
 {

 using search::KeywordMatcher;
-using search::MAX_TOKENS;
+using search::kMaxNumTokens;

 enum ExpectedMatchResult
 {

@@ -259,7 +259,7 @@ string GetManyTokens(string tokenPrefix, int tokenCount, bool countForward = tru

 UNIT_TEST(KeywordMatcher_QueryTooLong)
 {
-  for (int queryLength = MAX_TOKENS - 2; queryLength <= MAX_TOKENS + 2; ++queryLength)
+  for (int queryLength = kMaxNumTokens - 2; queryLength <= kMaxNumTokens + 2; ++queryLength)
   {
     string const query = GetManyTokens("Q", queryLength);
     string const queryWithPrefix = query + " Prefix";

@@ -292,11 +292,10 @@ UNIT_TEST(KeywordMatcher_QueryTooLong)

 UNIT_TEST(KeywordMatcher_NameTooLong)
 {
-  string const name[] =
-  {
-    "Aa Bb " + GetManyTokens("T", MAX_TOKENS + 1),
-    "Aa Bb " + GetManyTokens("T", MAX_TOKENS),
-    "Aa Bb " + GetManyTokens("T", MAX_TOKENS - 1),
+  string const name[] = {
+      "Aa Bb " + GetManyTokens("T", kMaxNumTokens + 1),
+      "Aa Bb " + GetManyTokens("T", kMaxNumTokens),
+      "Aa Bb " + GetManyTokens("T", kMaxNumTokens - 1),
   };

   KeywordMatcherTestCase const testCases[] =

@@ -315,9 +314,9 @@ UNIT_TEST(KeywordMatcher_NameTooLong)

 UNIT_TEST(KeywordMatcher_ManyTokensInReverseOrder)
 {
-  string const query = GetManyTokens("Q", MAX_TOKENS);
-  string const name = GetManyTokens("Q", MAX_TOKENS);
-  string const reversedName = GetManyTokens("Q", MAX_TOKENS, false);
+  string const query = GetManyTokens("Q", kMaxNumTokens);
+  string const name = GetManyTokens("Q", kMaxNumTokens);
+  string const reversedName = GetManyTokens("Q", kMaxNumTokens, false);

   KeywordMatcherTestCase const testCases[] =
   {

@@ -19,8 +19,8 @@ public:
     : m_begin(static_cast<uint8_t>(begin)), m_end(static_cast<uint8_t>(end))
   {
-    ASSERT_LESS_OR_EQUAL(begin, MAX_TOKENS, ());
-    ASSERT_LESS_OR_EQUAL(end, MAX_TOKENS, ());
+    ASSERT_LESS_OR_EQUAL(begin, kMaxNumTokens, ());
+    ASSERT_LESS_OR_EQUAL(end, kMaxNumTokens, ());
     ASSERT(IsValid(), (*this));
   }
