[search] Keyword Matcher rewrite.

This commit is contained in:
Yury Melnichek 2013-02-07 01:07:56 +01:00 committed by Alex Zolotarev
parent 7fdac1d3bd
commit 056f14ef4e
12 changed files with 613 additions and 219 deletions

View file

@ -23,4 +23,24 @@ void SplitUniString(strings::UniString const & uniS, F f, DelimsT const & delims
strings::UniString FeatureTypeToString(uint32_t type);
/// Tokenizes @a s with SplitUniString, feeding every token to a back-insert functor
/// over @a tokens, and tells whether the string ends in the middle of a token.
/// @return true when @a s is non-empty and its last character is not reported
///         as a delimiter by @a delimiter; false otherwise.
template <class ContainerT, class DelimsT>
bool TokenizeStringAndCheckIfLastTokenIsPrefix(strings::UniString const & s,
                                               ContainerT & tokens,
                                               DelimsT const & delimiter)
{
  SplitUniString(s, MakeBackInsertFunctor(tokens), delimiter);

  // An empty string has no last token at all.
  if (s.empty())
    return false;

  // A trailing delimiter means the last token is complete.
  if (delimiter(s.back()))
    return false;

  return true;
}
/// Convenience overload for plain strings: runs @a s through
/// NormalizeAndSimplifyString and delegates to the UniString overload.
/// @return the result of the UniString overload for the normalized string.
template <class ContainerT, class DelimsT>
bool TokenizeStringAndCheckIfLastTokenIsPrefix(string const & s,
                                               ContainerT & tokens,
                                               DelimsT const & delimiter)
{
  strings::UniString const normalized = NormalizeAndSimplifyString(s);
  return TokenizeStringAndCheckIfLastTokenIsPrefix(normalized, tokens, delimiter);
}
} // namespace search

View file

@ -0,0 +1,90 @@
#include "keyword_lang_matcher.hpp"
#include "../indexer/search_string_utils.hpp"
#include "../indexer/search_delimiters.hpp"
#include "../base/stl_add.hpp"
#include "../std/algorithm.hpp"
namespace search
{
/// Bundles a keyword-match score with a language score.
/// @param score     result of KeywordMatcher scoring, stored as the primary criterion.
/// @param langScore language score, used only to break ties (see operator <).
KeywordLangMatcher::ScoreT::ScoreT(KeywordMatcher::ScoreT const & score, int langScore)
  : m_parentScore(score)
  , m_langScore(langScore)
{
}
/// Lexicographic ordering: the keyword score decides first; the language score is
/// consulted only when neither keyword score is less than the other.
bool KeywordLangMatcher::ScoreT::operator <(KeywordLangMatcher::ScoreT const & score) const
{
  if (m_parentScore < score.m_parentScore)
    return true;

  // Keyword scores are tied only if the reverse comparison fails as well.
  bool const keywordScoresTied = !(score.m_parentScore < m_parentScore);
  return keywordScoresTied && m_langScore < score.m_langScore;
}
// Replaces the whole language priority table with a copy of languagePriorities.
// The outer index is the priority tier, the inner vector lists the languages of that tier.
// Debug builds additionally check that exactly NUM_LANG_PRIORITY_TIERS tiers were
// supplied and that no tier holds more than MAX_LANGS_IN_TIER languages.
void KeywordLangMatcher::SetLanguages(vector<vector<int8_t> > const & languagePriorities)
{
m_languagePriorities = languagePriorities;
#ifdef DEBUG
ASSERT_EQUAL ( static_cast<size_t>(NUM_LANG_PRIORITY_TIERS), m_languagePriorities.size(), () );
for (int i = 0; i < NUM_LANG_PRIORITY_TIERS; ++i)
ASSERT_LESS_OR_EQUAL ( m_languagePriorities[i].size(), static_cast<size_t>(MAX_LANGS_IN_TIER), () );
#endif
}
// Checks that ind = (tier, position inside tier) addresses an existing table entry.
// The checks are expressed as assertions; the function itself always returns true,
// which lets callers wrap the call in ASSERT(...).
bool KeywordLangMatcher::AssertIndex(pair<int, int> const & ind) const
{
// The casts to size_t turn a negative index into a huge value, so it fails the check too.
ASSERT_LESS ( static_cast<size_t>(ind.first), m_languagePriorities.size(), () );
ASSERT_LESS ( static_cast<size_t>(ind.second), m_languagePriorities[ind.first].size(), () );
return true;
}
/// Overwrites one entry of the language priority table.
/// @param ind  (tier, position inside the tier); must address an existing entry.
/// @param lang language code to store there.
void KeywordLangMatcher::SetLanguage(pair<int, int> const & ind, int8_t lang)
{
  ASSERT ( AssertIndex(ind), () );
  vector<int8_t> & tier = m_languagePriorities[ind.first];
  tier[ind.second] = lang;
}
/// Reads one entry of the language priority table.
/// @param ind (tier, position inside the tier); must address an existing entry.
/// @return the language code stored at that entry.
int8_t KeywordLangMatcher::GetLanguage(pair<int, int> const & ind) const
{
  ASSERT ( AssertIndex(ind), () );
  vector<int8_t> const & tier = m_languagePriorities[ind.first];
  return tier[ind.second];
}
/// Maps a language to its priority score (greater is better).
/// @param lang language code to look up in the priority table.
/// @return 0 for a language of the first tier, -1 for the second tier and so on;
///         minus the number of tiers when the language is in no tier.
int KeywordLangMatcher::GetLangScore(int8_t lang) const
{
  // Indices are size_t to match vector::size() and avoid signed/unsigned comparisons;
  // the tier count is computed once and used for both the loop bound and the fallback.
  size_t const tierCount = m_languagePriorities.size();
  for (size_t i = 0; i < tierCount; ++i)
  {
    vector<int8_t> const & tier = m_languagePriorities[i];
    for (size_t j = 0; j < tier.size(); ++j)
    {
      if (tier[j] == lang)
        return -static_cast<int>(i);  // All languages in the same tier are equal.
    }
  }
  return -static_cast<int>(tierCount);
}
/// Scores a raw name written in language @a lang.
/// @return the keyword matcher's score for @a name paired with the language score of @a lang.
KeywordLangMatcher::ScoreT KeywordLangMatcher::Score(int8_t lang, string const & name) const
{
  KeywordMatcher::ScoreT const keywordScore = m_keywordMatcher.Score(name);
  int const langScore = GetLangScore(lang);
  return ScoreT(keywordScore, langScore);
}
/// Scores a name given as a StringT, written in language @a lang.
/// @return the keyword matcher's score for @a name paired with the language score of @a lang.
KeywordLangMatcher::ScoreT KeywordLangMatcher::Score(int8_t lang, StringT const & name) const
{
  KeywordMatcher::ScoreT const keywordScore = m_keywordMatcher.Score(name);
  int const langScore = GetLangScore(lang);
  return ScoreT(keywordScore, langScore);
}
/// Scores an already tokenized name written in language @a lang.
/// @param tokens pointer to the first of @a count name tokens.
/// @return the keyword matcher's score for the tokens paired with the language score of @a lang.
KeywordLangMatcher::ScoreT KeywordLangMatcher::Score(int8_t lang,
                                                     StringT const * tokens, size_t count) const
{
  KeywordMatcher::ScoreT const keywordScore = m_keywordMatcher.Score(tokens, count);
  int const langScore = GetLangScore(lang);
  return ScoreT(keywordScore, langScore);
}
} // namespace search

View file

@ -3,12 +3,28 @@
#include "../std/vector.hpp"
namespace search
{
class LangKeywordsScorer
class KeywordLangMatcher
{
public:
class ScoreT
{
public:
ScoreT() {}
bool operator < (ScoreT const & s) const;
private:
friend class KeywordLangMatcher;
ScoreT(KeywordMatcher::ScoreT const & score, int langScore);
KeywordMatcher::ScoreT m_parentScore;
int m_langScore;
};
private:
enum { NUM_LANG_PRIORITY_TIERS = 4 };
enum { MAX_LANGS_IN_TIER = 2 };
@ -26,12 +42,16 @@ public:
m_keywordMatcher.SetKeywords(keywords, count, prefix);
}
uint32_t Score(int8_t lang, string const & name) const;
uint32_t Score(int8_t lang, StringT const & name) const;
uint32_t Score(int8_t lang, StringT const * tokens, size_t count) const;
/// @return Score of the name (greater is better).
//@{
ScoreT Score(int8_t lang, string const & name) const;
ScoreT Score(int8_t lang, StringT const & name) const;
ScoreT Score(int8_t lang, StringT const * tokens, size_t count) const;
//@}
private:
bool AssertIndex(pair<int, int> const & ind) const;
int GetLangScore(int8_t lang) const;
vector<vector<int8_t> > m_languagePriorities;
KeywordMatcher m_keywordMatcher;

View file

@ -7,84 +7,127 @@
#include "../std/algorithm.hpp"
namespace search
{
/// Creates a matcher with no keywords and no prefix (the same state Clear() produces).
KeywordMatcher::KeywordMatcher()
  : m_keywords(NULL)
  , m_keywordsCount(0)
  , m_prefix(NULL)
{
}
/// Forgets the current query: drops the keyword array pointer, its length and the prefix.
/// Nothing is freed here; the matcher only resets its own pointers.
void KeywordMatcher::Clear()
{
  m_prefix = NULL;
  m_keywordsCount = 0;
  m_keywords = NULL;
}
void KeywordMatcher::SetKeywords(StringT const * keywords, size_t count, StringT const * prefix)
{
ASSERT_LESS ( count, static_cast<size_t>(MAX_TOKENS), () );
m_keywords.resize(count);
for (size_t i = 0; i < count; ++i)
m_keywords[i] = &keywords[i];
m_keywords = keywords;
m_keywordsCount = min(static_cast<size_t>(MAX_TOKENS), count);
m_prefix = prefix;
if (m_prefix && m_prefix->empty())
m_prefix = 0;
m_prefix = NULL;
}
uint32_t KeywordMatcher::Score(string const & name) const
KeywordMatcher::ScoreT KeywordMatcher::Score(string const & name) const
{
return Score(NormalizeAndSimplifyString(name));
}
uint32_t KeywordMatcher::Score(StringT const & name) const
KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const & name) const
{
buffer_vector<StringT, MAX_TOKENS> tokens;
SplitUniString(name, MakeBackInsertFunctor(tokens), Delimiters());
/// @todo Some Arabian names have a lot of tokens.
/// Trim this stuff while generation.
//ASSERT_LESS ( tokens.size(), static_cast<size_t>(MAX_TOKENS), () );
return Score(tokens.data(), min(size_t(MAX_TOKENS-1), tokens.size()));
// Some names can have too many tokens. Trim them.
return Score(tokens.data(), min(size_t(MAX_TOKENS), tokens.size()));
}
uint32_t KeywordMatcher::Score(StringT const * tokens, size_t count) const
KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t count) const
{
ASSERT_LESS ( count, static_cast<size_t>(MAX_TOKENS), () );
vector<bool> isQueryTokenMatched(m_keywordsCount);
vector<bool> isNameTokenMatched(count);
uint32_t numQueryTokensMatched = 0;
uint32_t sumTokenMatchDistance = 0;
uint32_t prevTokenMatchDistance = 0;
bool bPrefixMatched = true;
// boolean array of matched input tokens
unsigned char isTokenMatched[MAX_TOKENS] = { 0 };
// calculate penalty by keywords - add MAX_TOKENS for each unmatched keyword
uint32_t score = 0;
for (size_t i = 0; i < m_keywords.size(); ++i)
{
unsigned char isKeywordMatched = 0;
for (size_t j = 0; j < count; ++j)
if (*m_keywords[i] == tokens[j])
isKeywordMatched = isTokenMatched[j] = 1;
if (!isKeywordMatched)
score += MAX_TOKENS;
}
// calculate penalty for prefix - add MAX_TOKENS for unmatched prefix
if (m_prefix)
{
bool bPrefixMatched = false;
for (size_t i = 0; i < count && !bPrefixMatched; ++i)
if (StartsWith(tokens[i].begin(), tokens[i].end(),
m_prefix->begin(), m_prefix->end()))
for (int i = 0; i < m_keywordsCount; ++i)
for (int j = 0; j < count && !isQueryTokenMatched[i]; ++j)
if (!isNameTokenMatched[j] && m_keywords[i] == tokens[j])
{
bPrefixMatched = true;
isQueryTokenMatched[i] = isNameTokenMatched[j] = true;
uint32_t const tokenMatchDistance = i - j;
sumTokenMatchDistance += abs(tokenMatchDistance - prevTokenMatchDistance);
prevTokenMatchDistance = tokenMatchDistance;
}
if (!bPrefixMatched)
score += MAX_TOKENS;
if (m_prefix)
{
bPrefixMatched = false;
for (int j = 0; j < count && !bPrefixMatched; ++j)
if (!isNameTokenMatched[j] &&
StartsWith(tokens[j].begin(), tokens[j].end(), m_prefix->begin(), m_prefix->end()))
{
isNameTokenMatched[j] = bPrefixMatched = true;
uint32_t const tokenMatchDistance = int(m_keywordsCount) - j;
sumTokenMatchDistance += abs(tokenMatchDistance - prevTokenMatchDistance);
}
}
// add penalty for each unmatched token in input sequence
for (size_t i = 0; i <= count; ++i)
{
// check for token length (skip common tokens such as "de", "la", "a")
if (tokens[i].size() > 2 && !isTokenMatched[i])
++score;
}
for (size_t i = 0; i < isQueryTokenMatched.size(); ++i)
if (isQueryTokenMatched[i])
++numQueryTokensMatched;
ScoreT score = ScoreT();
score.m_bFullQueryMatched = bPrefixMatched && (numQueryTokensMatched == isQueryTokenMatched.size());
score.m_bPrefixMatched = bPrefixMatched;
score.m_numQueryTokensAndPrefixMatched = numQueryTokensMatched + (bPrefixMatched ? 1 : 0);
score.m_nameTokensMatched = 0xFFFFFFFF;
for (uint32_t i = 0; i < min(size_t(32), isNameTokenMatched.size()); ++i)
if (!isNameTokenMatched[i])
score.m_nameTokensMatched &= ~(1 << (31 - i));
score.m_sumTokenMatchDistance = sumTokenMatchDistance;
return score;
}
/// Default score: every counter is zero and both match flags are false.
KeywordMatcher::ScoreT::ScoreT()
  : m_sumTokenMatchDistance(0)
  , m_nameTokensMatched(0)
  , m_numQueryTokensAndPrefixMatched(0)
  , m_bFullQueryMatched(false)
  , m_bPrefixMatched(false)
{
}
/// Orders scores so that a greater score is a better match.
/// Criteria are compared from most to least significant and the first difference decides:
/// full query match, number of matched query tokens (prefix included), prefix match,
/// bit mask of matched name tokens, and finally the summed token match distance.
bool KeywordMatcher::ScoreT::operator < (KeywordMatcher::ScoreT const & s) const
{
  // For two differing flags, "this < s" holds exactly when this flag is the false one.
  if (m_bFullQueryMatched != s.m_bFullQueryMatched)
    return !m_bFullQueryMatched;

  if (m_numQueryTokensAndPrefixMatched != s.m_numQueryTokensAndPrefixMatched)
    return m_numQueryTokensAndPrefixMatched < s.m_numQueryTokensAndPrefixMatched;

  if (m_bPrefixMatched != s.m_bPrefixMatched)
    return !m_bPrefixMatched;

  if (m_nameTokensMatched != s.m_nameTokensMatched)
    return m_nameTokensMatched < s.m_nameTokensMatched;

  // Reversed on purpose: the score with the larger distance sum is the lesser one.
  return s.m_sumTokenMatchDistance < m_sumTokenMatchDistance;
}
/// Renders a score as text for logs and test failure messages.
/// Fields appear in comparison order: FQM (full query matched), nQTM (number of query
/// tokens and prefix matched), PM (prefix matched), NTM (matched-name-token mask printed
/// as 32 binary digits, bit 31 first) and STMD (sum of token match distances).
string DebugPrint(KeywordMatcher::ScoreT const & score)
{
  ostringstream out;
  out << "KeywordMatcher::ScoreT("
      << "FQM=" << score.m_bFullQueryMatched
      << ",nQTM=" << static_cast<int>(score.m_numQueryTokensAndPrefixMatched)
      << ",PM=" << score.m_bPrefixMatched
      << ",NTM=";

  for (int bit = 31; bit >= 0; --bit)
    out << ((score.m_nameTokensMatched >> bit) & 1);

  out << ",STMD=" << score.m_sumTokenMatchDistance << ")";
  return out.str();
}
} // namespace search

View file

@ -4,7 +4,7 @@
#include "../base/string_utils.hpp"
#include "../std/string.hpp"
#include "../std/vector.hpp"
namespace search
{
@ -12,29 +12,46 @@ namespace search
class KeywordMatcher
{
public:
enum { MAX_SCORE = MAX_TOKENS * MAX_TOKENS };
typedef strings::UniString StringT;
KeywordMatcher() : m_prefix(0) {}
inline void Clear()
class ScoreT
{
m_keywords.clear();
m_prefix = 0;
}
public:
ScoreT();
bool operator < (ScoreT const & s) const;
private:
friend class KeywordMatcher;
friend string DebugPrint(ScoreT const & score);
bool IsQueryMatched() const { return m_bFullQueryMatched; }
uint32_t m_sumTokenMatchDistance;
uint32_t m_nameTokensMatched;
uint8_t m_numQueryTokensAndPrefixMatched;
bool m_bFullQueryMatched : 1;
bool m_bPrefixMatched : 1;
};
KeywordMatcher();
void Clear();
/// Store references to keywords from source array of strings.
void SetKeywords(StringT const * keywords, size_t count, StringT const * prefix);
/// @return penalty of string (less is better).
/// @return Score of the name (greater is better).
//@{
uint32_t Score(string const & name) const;
uint32_t Score(StringT const & name) const;
uint32_t Score(StringT const * tokens, size_t count) const;
ScoreT Score(string const & name) const;
ScoreT Score(StringT const & name) const;
ScoreT Score(StringT const * tokens, size_t count) const;
//@}
static bool IsQueryMatched(ScoreT const & score) { return score.IsQueryMatched(); }
private:
buffer_vector<StringT const *, 10> m_keywords;
StringT const * m_keywords;
size_t m_keywordsCount;
StringT const * m_prefix;
};

View file

@ -1,77 +0,0 @@
#include "lang_keywords_scorer.hpp"
#include "../indexer/search_string_utils.hpp"
#include "../indexer/search_delimiters.hpp"
#include "../base/stl_add.hpp"
#include "../std/algorithm.hpp"
namespace search
{
// Replaces the whole language priority table with a copy of languagePriorities.
// The outer index is the priority tier, the inner vector lists the languages of that tier.
// Debug builds additionally check that exactly NUM_LANG_PRIORITY_TIERS tiers were
// supplied and that no tier holds more than MAX_LANGS_IN_TIER languages.
void LangKeywordsScorer::SetLanguages(vector<vector<int8_t> > const & languagePriorities)
{
m_languagePriorities = languagePriorities;
#ifdef DEBUG
ASSERT_EQUAL ( static_cast<size_t>(NUM_LANG_PRIORITY_TIERS), m_languagePriorities.size(), () );
for (int i = 0; i < NUM_LANG_PRIORITY_TIERS; ++i)
ASSERT_LESS_OR_EQUAL ( m_languagePriorities[i].size(), static_cast<size_t>(MAX_LANGS_IN_TIER), () );
#endif
}
// Checks that ind = (tier, position inside tier) addresses an existing table entry.
// The checks are expressed as assertions; the function itself always returns true,
// which lets callers wrap the call in ASSERT(...).
bool LangKeywordsScorer::AssertIndex(pair<int, int> const & ind) const
{
// The casts to size_t turn a negative index into a huge value, so it fails the check too.
ASSERT_LESS ( static_cast<size_t>(ind.first), m_languagePriorities.size(), () );
ASSERT_LESS ( static_cast<size_t>(ind.second), m_languagePriorities[ind.first].size(), () );
return true;
}
/// Overwrites one entry of the language priority table.
/// @param ind  (tier, position inside the tier); must address an existing entry.
/// @param lang language code to store there.
void LangKeywordsScorer::SetLanguage(pair<int, int> const & ind, int8_t lang)
{
  ASSERT ( AssertIndex(ind), () );
  vector<int8_t> & tier = m_languagePriorities[ind.first];
  tier[ind.second] = lang;
}
/// Reads one entry of the language priority table.
/// @param ind (tier, position inside the tier); must address an existing entry.
/// @return the language code stored at that entry.
int8_t LangKeywordsScorer::GetLanguage(pair<int, int> const & ind) const
{
  ASSERT ( AssertIndex(ind), () );
  vector<int8_t> const & tier = m_languagePriorities[ind.first];
  return tier[ind.second];
}
uint32_t LangKeywordsScorer::Score(int8_t lang, string const & name) const
{
return Score(lang, NormalizeAndSimplifyString(name));
}
/// Splits @a name into tokens and scores them with the token-array overload.
/// @return the value produced by the token-array overload.
uint32_t LangKeywordsScorer::Score(int8_t lang, StringT const & name) const
{
  buffer_vector<StringT, MAX_TOKENS> nameTokens;
  SplitUniString(name, MakeBackInsertFunctor(nameTokens), Delimiters());

  /// @todo Some names have a lot of tokens; they should be trimmed at generation time.
  /// Until then no assertion is made on the token count and at most MAX_TOKENS-1
  /// tokens are passed on.
  size_t const maxCount = size_t(MAX_TOKENS-1);
  size_t const usedCount = min(maxCount, nameTokens.size());
  return Score(lang, nameTokens.data(), usedCount);
}
/// Combines the keyword matcher's result for the tokens with the language priority.
/// For a language found at position j of tier i the result is
/// i * (MAX_SCORE * MAX_LANGS_IN_TIER) + keywordScore * MAX_LANGS_IN_TIER + j.
/// @return that combined value, or NUM_LANG_PRIORITY_TIERS * (MAX_SCORE * MAX_LANGS_IN_TIER)
///         when @a lang is in none of the tiers.
uint32_t LangKeywordsScorer::Score(int8_t lang, StringT const * tokens, size_t count) const
{
  uint32_t const keywordScore = m_keywordMatcher.Score(tokens, count);

  // Weight of one tier and the keyword contribution scaled to leave room for the position.
  uint32_t const tierWeight = KeywordMatcher::MAX_SCORE * MAX_LANGS_IN_TIER;
  uint32_t const scaledKeywordScore = keywordScore * MAX_LANGS_IN_TIER;

  for (uint32_t tier = 0; tier < NUM_LANG_PRIORITY_TIERS; ++tier)
  {
    vector<int8_t> const & langs = m_languagePriorities[tier];
    for (uint32_t pos = 0; pos < langs.size(); ++pos)
    {
      if (langs[pos] == lang)
        return (tier * tierWeight + scaledKeywordScore + pos);
    }
  }

  return (NUM_LANG_PRIORITY_TIERS * tierWeight);
}
} // namespace search

View file

@ -19,7 +19,7 @@ HEADERS += \
latlon_match.hpp \
approximate_string_match.hpp \
feature_offset_match.hpp \
lang_keywords_scorer.hpp \
keyword_lang_matcher.hpp \
params.hpp \
SOURCES += \
@ -30,5 +30,5 @@ SOURCES += \
result.cpp \
latlon_match.cpp \
approximate_string_match.cpp \
lang_keywords_scorer.cpp \
keyword_lang_matcher.cpp \
params.cpp \

View file

@ -1,6 +1,5 @@
#include "search_query.hpp"
#include "feature_offset_match.hpp"
#include "lang_keywords_scorer.hpp"
#include "latlon_match.hpp"
#include "search_common.hpp"
@ -441,8 +440,7 @@ namespace impl
m_pFV->GetFeature(id.second, f);
uint32_t penalty;
m_query.GetBestMatchName(f, penalty, name);
m_query.GetBestMatchName(f, name);
// country (region) name is a file name if feature isn't from World.mwm
if (m_pFV->IsWorld())
@ -592,22 +590,21 @@ namespace impl
class BestNameFinder
{
uint32_t & m_penalty;
KeywordLangMatcher::ScoreT m_score;
string & m_name;
LangKeywordsScorer const & m_keywordsScorer;
KeywordLangMatcher const & m_keywordsScorer;
public:
BestNameFinder(uint32_t & penalty, string & name, LangKeywordsScorer const & keywordsScorer)
: m_penalty(penalty), m_name(name), m_keywordsScorer(keywordsScorer)
BestNameFinder(string & name, KeywordLangMatcher const & keywordsScorer)
: m_score(), m_name(name), m_keywordsScorer(keywordsScorer)
{
m_penalty = uint32_t(-1);
}
bool operator()(signed char lang, string const & name) const
bool operator()(signed char lang, string const & name)
{
uint32_t penalty = m_keywordsScorer.Score(lang, name);
if (penalty < m_penalty)
KeywordLangMatcher::ScoreT const score = m_keywordsScorer.Score(lang, name);
if (m_score < score)
{
m_penalty = penalty;
m_score = score;
m_name = name;
}
return true;
@ -616,19 +613,10 @@ public:
} // namespace search::impl
void Query::GetBestMatchName(FeatureType const & f, uint32_t & penalty, string & name) const
void Query::GetBestMatchName(FeatureType const & f, string & name) const
{
impl::BestNameFinder bestNameFinder(penalty, name, m_keywordsScorer);
impl::BestNameFinder bestNameFinder(name, m_keywordsScorer);
(void)f.ForEachNameRef(bestNameFinder);
/*
if (!f.ForEachNameRef(bestNameFinder))
{
feature::TypesHolder types(f);
LOG(LDEBUG, (types));
LOG(LDEBUG, (f.GetLimitRect(FeatureType::BEST_GEOMETRY)));
}
*/
}
Result Query::MakeResult(impl::PreResult2 const & r, set<uint32_t> const * pPrefferedTypes/* = 0*/) const

View file

@ -1,6 +1,6 @@
#pragma once
#include "intermediate_result.hpp"
#include "lang_keywords_scorer.hpp"
#include "keyword_lang_matcher.hpp"
#include "../indexer/search_trie.hpp"
#include "../indexer/index.hpp" // for Index::MwmLock
@ -173,7 +173,7 @@ private:
bool MatchForSuggestionsImpl(strings::UniString const & token, int8_t lang, Results & res);
void MatchForSuggestions(strings::UniString const & token, Results & res);
void GetBestMatchName(FeatureType const & f, uint32_t & penalty, string & name) const;
void GetBestMatchName(FeatureType const & f, string & name) const;
Result MakeResult(impl::PreResult2 const & r, set<uint32_t> const * pPrefferedTypes = 0) const;
@ -209,7 +209,7 @@ private:
void SetLanguage(int id, int8_t lang);
int8_t GetLanguage(int id) const;
LangKeywordsScorer m_keywordsScorer;
KeywordLangMatcher m_keywordsScorer;
OffsetsVectorT m_offsetsInViewport[RECTSCOUNT];

View file

@ -0,0 +1,71 @@
#include "../../testing/testing.hpp"
#include "../keyword_lang_matcher.hpp"
#include "../../indexer/search_delimiters.hpp"
#include "../../indexer/search_string_utils.hpp"
#include "../../base/stl_add.hpp"
#include "../../std/vector.hpp"
namespace
{
using search::KeywordLangMatcher;
typedef search::KeywordLangMatcher::ScoreT ScoreT;
// Language codes used by the tests below. CreateMatcher() puts LANG_HIGH_PRIORITY into
// tier 0 and LANG_SOME / LANG_SOME_OTHER together into tier 2; LANG_UNKNOWN is placed
// in no tier.
enum
{
LANG_UNKNOWN = 1,
LANG_SOME = 2,
LANG_SOME_OTHER = 3,
LANG_HIGH_PRIORITY = 10
};
/// Builds a matcher for @a query with a fixed four-tier language table:
/// tier 0 = LANG_HIGH_PRIORITY, tier 2 = LANG_SOME and LANG_SOME_OTHER, tiers 1 and 3 empty.
/// If the query does not end with a delimiter, its last token is used as the prefix.
///
/// SetKeywords() stores pointers to the keyword array and to the prefix instead of
/// copying them, so that storage has to outlive the returned matcher. It is therefore
/// kept in function-local statics rather than in locals that die on return.
/// Consequence: a matcher obtained here is valid only until the next CreateMatcher() call.
KeywordLangMatcher CreateMatcher(string const & query)
{
  // Backing storage referenced by the returned matcher; reset for every new query.
  static vector<strings::UniString> keywords;
  static strings::UniString prefix;
  keywords.clear();
  prefix = strings::UniString();

  KeywordLangMatcher matcher;

  vector<vector<int8_t> > langPriorities(4, vector<int8_t>());
  langPriorities[0].push_back(LANG_HIGH_PRIORITY);
  // langPriorities[1] is intentionally left empty.
  langPriorities[2].push_back(LANG_SOME);
  langPriorities[2].push_back(LANG_SOME_OTHER);
  // langPriorities[3] is intentionally left empty.
  matcher.SetLanguages(langPriorities);

  if (search::TokenizeStringAndCheckIfLastTokenIsPrefix(query, keywords, search::Delimiters()))
  {
    // The query ends inside a token: treat that token as a prefix, not as a keyword.
    prefix = keywords.back();
    keywords.pop_back();
  }
  matcher.SetKeywords(keywords.data(), keywords.size(), &prefix);
  return matcher;
}
} // unnamed namespace
// NOTE(review): this test has no body yet and therefore checks nothing.
// Judging by its name it is meant to verify that token match quality outranks
// language priority - TODO add the assertions.
UNIT_TEST(KeywordMatcher_TokensMatchHasPriority)
{
}
// Scores the same name against the same query under different languages, so any
// difference between the scores can only come from the language tier.
UNIT_TEST(KeywordMatcher_LanguageMatchIsUsedWhenTokenMatchIsTheSame)
{
char const * query = "test";
char const * name = "test";
KeywordLangMatcher matcher = CreateMatcher(query);
// A language that is in no tier scores below every language that is in some tier.
TEST(matcher.Score(LANG_UNKNOWN, name) < matcher.Score(LANG_SOME, name), ());
TEST(matcher.Score(LANG_UNKNOWN, name) < matcher.Score(LANG_SOME_OTHER, name), ());
TEST(matcher.Score(LANG_UNKNOWN, name) < matcher.Score(LANG_HIGH_PRIORITY, name), ());
// Languages sharing a tier are equivalent: neither score is less than the other.
TEST(!(matcher.Score(LANG_SOME, name) < matcher.Score(LANG_SOME_OTHER, name)), ());
TEST(!(matcher.Score(LANG_SOME_OTHER, name) < matcher.Score(LANG_SOME, name)), ());
// The tier-0 language beats the tier-2 languages.
TEST(matcher.Score(LANG_SOME, name) < matcher.Score(LANG_HIGH_PRIORITY, name), ());
TEST(matcher.Score(LANG_SOME_OTHER, name) < matcher.Score(LANG_HIGH_PRIORITY, name), ());
}

View file

@ -1,79 +1,300 @@
#include "../../testing/testing.hpp"
#include "../keyword_matcher.hpp"
#include "../search_common.hpp"
#include "../../indexer/search_string_utils.hpp"
#include "../../indexer/search_delimiters.hpp"
#include "../../base/buffer_vector.hpp"
#include "../../base/stl_add.hpp"
#include "../../std/scoped_ptr.hpp"
#include "../../std/sstream.hpp"
#include "../../std/vector.hpp"
namespace
{
static const uint32_t MAX_SCORE = search::KeywordMatcher::MAX_SCORE;
class Matcher
using search::KeywordMatcher;
typedef search::KeywordMatcher::ScoreT ScoreT;
using search::MAX_TOKENS;
enum ExpectedMatchResult
{
public:
Matcher(char const * query)
{
strings::UniString const uniQuery = search::NormalizeAndSimplifyString(query);
SplitUniString(uniQuery, MakeBackInsertFunctor(m_keywords), search::Delimiters());
if (!uniQuery.empty() && uniQuery.back() != ' ')
{
m_prefix = m_keywords.back();
m_keywords.pop_back();
}
NOMATCH,
MATCHES,
ANY_RES
};
m_matcher.SetKeywords(m_keywords.data(), m_keywords.size(), &m_prefix);
// Required relation between the score of a test case and the score of the test case
// right before it in the table. Scores are ordered so that a greater score is better.
enum ExpectedScoreComparison
{
DOES_NOT_MATTER, // Score is not compared with the previous one.
PERFECTLY_EQUAL, // Score is equivalent to the previous one (neither is less than the other).
BETTER_OR_EQUAL, // Score is not less than the previous one (score >= previous).
STRONGLY_BETTER // Score is strictly greater than the previous one (previous < score).
};
// One row of a keyword matcher test table. Rows are brace-initialized in the tests,
// so the order of the fields matters.
struct KeywordMatcherTestCase
{
// Whether the name is expected to match the whole query; ANY_RES skips this check.
ExpectedMatchResult m_eMatch;
// Required relation between this row's score and the previous row's score.
ExpectedScoreComparison m_eMatchType;
// Name that is scored against the query.
char const * m_name;
};
template <size_t N>
void TestKeywordMatcher(char const * const query, KeywordMatcherTestCase const (&testCases)[N])
{
vector<strings::UniString> keywords;
strings::UniString prefix;
if (search::TokenizeStringAndCheckIfLastTokenIsPrefix(query, keywords, search::Delimiters()))
{
prefix = keywords.back();
keywords.pop_back();
}
search::KeywordMatcher m_matcher;
private:
buffer_vector<strings::UniString, 10> m_keywords;
strings::UniString m_prefix;
};
KeywordMatcher matcher;
matcher.SetKeywords(keywords.data(), keywords.size(), &prefix);
ScoreT prevScore = ScoreT();
for (size_t i = 0; i < N; ++i)
{
char const * const name = testCases[i].m_name;
char const * const prevName = (i == 0 ? "N/A" : testCases[i-1].m_name);
ScoreT const testScore = matcher.Score(name);
// Test that a newly created matcher returns the same result
{
KeywordMatcher freshMatcher;
freshMatcher.SetKeywords(keywords.data(), keywords.size(), &prefix);
ScoreT const freshScore = freshMatcher.Score(name);
// TEST_EQUAL(testScore, freshScore, (query, name));
TEST(!(testScore < freshScore), (query, name));
TEST(!(freshScore < testScore), (query, name));
}
if (testCases[i].m_eMatch != ANY_RES)
{
TEST_EQUAL(testCases[i].m_eMatch == MATCHES,
KeywordMatcher::IsQueryMatched(testScore),
(query, name, testScore));
}
switch (testCases[i].m_eMatchType)
{
case DOES_NOT_MATTER:
break;
case PERFECTLY_EQUAL:
TEST(!(testScore < prevScore), (query, name, testScore, prevName, prevScore));
TEST(!(prevScore < testScore), (query, name, testScore, prevName, prevScore));
break;
case BETTER_OR_EQUAL:
TEST(!(testScore < prevScore), (query, name, testScore, prevName, prevScore));
break;
case STRONGLY_BETTER:
TEST(prevScore < testScore, (query, name, testScore, prevName, prevScore));
break;
default:
ASSERT(false, ());
}
prevScore = testScore;
}
}
} // unnamed namespace
UNIT_TEST(KeywordMatcher_New)
UNIT_TEST(KeywordMatcher_Prefix)
{
Matcher matcher("new ");
TEST_EQUAL(matcher.m_matcher.Score("new"), 0, ());
TEST_EQUAL(matcher.m_matcher.Score("york"), MAX_SCORE, ());
TEST_EQUAL(matcher.m_matcher.Score("new york"), 0, ());
char const query[] = "new";
KeywordMatcherTestCase const testCases[] =
{
{NOMATCH, DOES_NOT_MATTER, ""},
{NOMATCH, DOES_NOT_MATTER, "zzz"},
{NOMATCH, DOES_NOT_MATTER, "ne"},
{MATCHES, STRONGLY_BETTER, "the newark"},
{MATCHES, BETTER_OR_EQUAL, "york new"},
{MATCHES, STRONGLY_BETTER, "new york gym"},
{MATCHES, BETTER_OR_EQUAL, "new new york"},
{MATCHES, STRONGLY_BETTER, "new york"},
{MATCHES, STRONGLY_BETTER, "newark"},
{MATCHES, BETTER_OR_EQUAL, "new"},
};
TestKeywordMatcher(query, testCases);
}
UNIT_TEST(KeywordMatcher_York)
UNIT_TEST(KeywordMatcher_Keyword)
{
Matcher matcher("york ");
TEST_EQUAL(matcher.m_matcher.Score("new"), MAX_SCORE, ());
TEST_EQUAL(matcher.m_matcher.Score("york"), 0, ());
TEST_EQUAL(matcher.m_matcher.Score("new york"), 1, ());
char const query[] = "new ";
KeywordMatcherTestCase const testCases[] =
{
{NOMATCH, DOES_NOT_MATTER, ""},
{NOMATCH, DOES_NOT_MATTER, "zzz"},
{NOMATCH, DOES_NOT_MATTER, "ne"},
{NOMATCH, DOES_NOT_MATTER, "the netherlands"},
{NOMATCH, STRONGLY_BETTER, "newark"},
{MATCHES, STRONGLY_BETTER, "york new"},
{MATCHES, STRONGLY_BETTER, "new york gym"},
{MATCHES, BETTER_OR_EQUAL, "new new york"},
{MATCHES, STRONGLY_BETTER, "new york"},
};
TestKeywordMatcher(query, testCases);
}
UNIT_TEST(KeywordMatcher_NewYork)
UNIT_TEST(KeywordMatcher_SanSa_ShouldMatch_SanSalvador_BetterThan_San)
{
Matcher matcher1("new york ");
Matcher matcher2("new york");
TEST_EQUAL(matcher1.m_matcher.Score("new"), MAX_SCORE, ());
TEST_EQUAL(matcher2.m_matcher.Score("new"), MAX_SCORE, ());
TEST_EQUAL(matcher1.m_matcher.Score("york"), MAX_SCORE, ());
TEST_EQUAL(matcher2.m_matcher.Score("york"), MAX_SCORE, ());
TEST_EQUAL(matcher1.m_matcher.Score("new york"), 0, ());
TEST_EQUAL(matcher2.m_matcher.Score("new york"), 0, ());
char const query[] = "San Sa";
KeywordMatcherTestCase const testCases[] =
{
{NOMATCH, DOES_NOT_MATTER, "San"},
{MATCHES, STRONGLY_BETTER, "San Salvador"},
};
TestKeywordMatcher(query, testCases);
}
UNIT_TEST(KeywordMatcher_YorkNew)
UNIT_TEST(KeywordMatcher_KeywordAndPrefix)
{
Matcher matcher("new york ");
TEST_EQUAL(matcher.m_matcher.Score("new"), MAX_SCORE, ());
TEST_EQUAL(matcher.m_matcher.Score("york"), MAX_SCORE, ());
TEST_EQUAL(matcher.m_matcher.Score("new york"), 0, ());
char const query[] = "new yo";
KeywordMatcherTestCase const testCases[] =
{
{NOMATCH, DOES_NOT_MATTER, "new"},
{NOMATCH, DOES_NOT_MATTER, "new old"},
{NOMATCH, DOES_NOT_MATTER, "old york"},
{MATCHES, STRONGLY_BETTER, "the york new"},
{MATCHES, STRONGLY_BETTER, "the new york"},
{MATCHES, BETTER_OR_EQUAL, "york new the"},
{MATCHES, STRONGLY_BETTER, "new york pizza"},
{MATCHES, STRONGLY_BETTER, "york new"},
{MATCHES, BETTER_OR_EQUAL, "yo new"},
{MATCHES, STRONGLY_BETTER, "new york"},
{MATCHES, BETTER_OR_EQUAL, "new yo"},
};
TestKeywordMatcher(query, testCases);
}
UNIT_TEST(KeywordMatcher_NewYo)
UNIT_TEST(KeywordMatcher_KeywordAndKeyword)
{
Matcher matcher("new yo");
TEST_EQUAL(matcher.m_matcher.Score("new"), MAX_SCORE, ());
TEST_EQUAL(matcher.m_matcher.Score("york"), MAX_SCORE, ());
TEST_EQUAL(matcher.m_matcher.Score("new york"), 0, ());
char const query[] = "new york ";
KeywordMatcherTestCase const testCases[] =
{
{NOMATCH, DOES_NOT_MATTER, "new"},
{NOMATCH, DOES_NOT_MATTER, "new old"},
{NOMATCH, DOES_NOT_MATTER, "old york"},
{NOMATCH, DOES_NOT_MATTER, "new yorkshire"},
{NOMATCH, DOES_NOT_MATTER, "york newcastle"},
{MATCHES, STRONGLY_BETTER, "the york new"},
{MATCHES, STRONGLY_BETTER, "the new york"},
{MATCHES, BETTER_OR_EQUAL, "york new the"},
{MATCHES, STRONGLY_BETTER, "new york pizza"},
{MATCHES, STRONGLY_BETTER, "york new"},
{MATCHES, STRONGLY_BETTER, "new york"},
};
TestKeywordMatcher(query, testCases);
}
namespace
{
/// Builds a string of @a tokenCount tokens, each one being @a tokenPrefix followed by a
/// number and a single trailing space.
/// @param countForward when true the numbers run 0, 1, ..., tokenCount-1;
///                     when false they run tokenCount-1, ..., 1, 0.
/// @return the concatenated tokens; an empty string when @a tokenCount is not positive.
string GetManyTokens(string tokenPrefix, int tokenCount, bool countForward = true)
{
  ostringstream result;
  if (countForward)
  {
    for (int number = 0; number < tokenCount; ++number)
      result << tokenPrefix << number << " ";
  }
  else
  {
    for (int remaining = tokenCount; remaining > 0; --remaining)
      result << tokenPrefix << (remaining - 1) << " ";
  }
  return result.str();
}
} // unnamed namespace
// Runs the matcher with queries of MAX_TOKENS-2 .. MAX_TOKENS+2 tokens, both with and
// without an extra prefix token appended, i.e. around the token limit.
UNIT_TEST(KeywordMatcher_QueryTooLong)
{
for (int queryLength = MAX_TOKENS - 2; queryLength <= MAX_TOKENS + 2; ++queryLength)
{
string const query = GetManyTokens("Q", queryLength);
string const queryWithPrefix = query + " Prefix";
string const queryWithPrefixAndSomethingElse = query + " PrefixAndSomethingElse";
KeywordMatcherTestCase const testCases[] =
{
{NOMATCH, DOES_NOT_MATTER, ""},
{NOMATCH, DOES_NOT_MATTER, "Q"},
{NOMATCH, DOES_NOT_MATTER, "Q "},
{NOMATCH, DOES_NOT_MATTER, "Q3"},
{NOMATCH, DOES_NOT_MATTER, "Q3 "},
{NOMATCH, DOES_NOT_MATTER, "Q3 Q"},
{NOMATCH, DOES_NOT_MATTER, "Q3 Q4"},
{NOMATCH, DOES_NOT_MATTER, "zzz"},
// Each long name below is preceded by a non-matching "Q" row, so STRONGLY_BETTER
// compares it against that row. ANY_RES: whether the over-long query counts as
// fully matched is not checked.
{NOMATCH, DOES_NOT_MATTER, "Q"},
{ANY_RES, STRONGLY_BETTER, query.c_str()},
{NOMATCH, DOES_NOT_MATTER, "Q"},
{ANY_RES, STRONGLY_BETTER, queryWithPrefix.c_str()},
{NOMATCH, DOES_NOT_MATTER, "Q"},
{ANY_RES, STRONGLY_BETTER, queryWithPrefixAndSomethingElse.c_str()},
};
// The same table is used for the query without and with the prefix token.
TestKeywordMatcher(query.c_str(), testCases);
TestKeywordMatcher(queryWithPrefix.c_str(), testCases);
}
}
// Scores names of MAX_TOKENS+1, MAX_TOKENS+2 and MAX_TOKENS+3 tokens ("Aa Bb" plus many
// "T<n>" tokens) against short queries that hit the first tokens or the "T" tokens.
UNIT_TEST(KeywordMatcher_NameTooLong)
{
// Ordered from the longest name to the shortest one.
string const name[] =
{
"Aa Bb " + GetManyTokens("T", MAX_TOKENS + 1),
"Aa Bb " + GetManyTokens("T", MAX_TOKENS),
"Aa Bb " + GetManyTokens("T", MAX_TOKENS - 1),
};
KeywordMatcherTestCase const testCases[] =
{
{NOMATCH, DOES_NOT_MATTER, "zzz"},
{MATCHES, STRONGLY_BETTER, name[0].c_str()},
// A shorter name must score at least as well as the longer one before it.
{MATCHES, BETTER_OR_EQUAL, name[1].c_str()},
{MATCHES, BETTER_OR_EQUAL, name[2].c_str()},
};
// Queries without a trailing space are treated as a prefix, the ones with it as a keyword.
char const * query[] = { "a", "aa", "aa ", "b", "bb", "bb ", "t" };
for (int i = 0; i < ARRAY_SIZE(query); ++i)
TestKeywordMatcher(query[i], testCases);
}
// A name holding the query's MAX_TOKENS tokens in reverse order must match, and the
// name with the tokens in the query's own order must score strictly better than it.
UNIT_TEST(KeywordMatcher_ManyTokensInReverseOrder)
{
string const query = GetManyTokens("Q", MAX_TOKENS);
string const name = GetManyTokens("Q", MAX_TOKENS);
string const reversedName = GetManyTokens("Q", MAX_TOKENS, false);
KeywordMatcherTestCase const testCases[] =
{
{NOMATCH, DOES_NOT_MATTER, "zzz"},
{MATCHES, STRONGLY_BETTER, reversedName.c_str()},
{MATCHES, STRONGLY_BETTER, name.c_str()},
};
TestKeywordMatcher(query.c_str(), testCases);
}

View file

@ -20,6 +20,7 @@ win32 {
SOURCES += \
../../testing/testingmain.cpp \
keyword_matcher_test.cpp \
keyword_lang_matcher_test.cpp \
latlon_match_test.cpp \
string_match_test.cpp \