diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index 1503b5d4d8..f845a43bf0 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -23,4 +23,24 @@ void SplitUniString(strings::UniString const & uniS, F f, DelimsT const & delims strings::UniString FeatureTypeToString(uint32_t type); +template +bool TokenizeStringAndCheckIfLastTokenIsPrefix(strings::UniString const & s, + ContainerT & tokens, + DelimsT const & delimiter) +{ + SplitUniString(s, MakeBackInsertFunctor(tokens), delimiter); + return !s.empty() && !delimiter(s.back()); +} + + +template +bool TokenizeStringAndCheckIfLastTokenIsPrefix(string const & s, + ContainerT & tokens, + DelimsT const & delimiter) +{ + return TokenizeStringAndCheckIfLastTokenIsPrefix(NormalizeAndSimplifyString(s), + tokens, + delimiter); +} + } // namespace search diff --git a/search/keyword_lang_matcher.cpp b/search/keyword_lang_matcher.cpp new file mode 100644 index 0000000000..d61a1ff05c --- /dev/null +++ b/search/keyword_lang_matcher.cpp @@ -0,0 +1,90 @@ +#include "keyword_lang_matcher.hpp" + +#include "../indexer/search_string_utils.hpp" +#include "../indexer/search_delimiters.hpp" + +#include "../base/stl_add.hpp" + +#include "../std/algorithm.hpp" + + +namespace search +{ + +KeywordLangMatcher::ScoreT::ScoreT(KeywordMatcher::ScoreT const & score, int langScore) + : m_parentScore(score), m_langScore(langScore) +{ +} + +bool KeywordLangMatcher::ScoreT::operator <(KeywordLangMatcher::ScoreT const & score) const +{ + if (m_parentScore < score.m_parentScore) + return true; + if (score.m_parentScore < m_parentScore) + return false; + + if (m_langScore != score.m_langScore) + return m_langScore < score.m_langScore; + + return false; +} + +void KeywordLangMatcher::SetLanguages(vector > const & languagePriorities) +{ + m_languagePriorities = languagePriorities; + +#ifdef DEBUG + ASSERT_EQUAL ( static_cast(NUM_LANG_PRIORITY_TIERS), m_languagePriorities.size(), () ); + for (int 
i = 0; i < NUM_LANG_PRIORITY_TIERS; ++i) + ASSERT_LESS_OR_EQUAL ( m_languagePriorities[i].size(), static_cast(MAX_LANGS_IN_TIER), () ); +#endif +} + +bool KeywordLangMatcher::AssertIndex(pair const & ind) const +{ + ASSERT_LESS ( static_cast(ind.first), m_languagePriorities.size(), () ); + ASSERT_LESS ( static_cast(ind.second), m_languagePriorities[ind.first].size(), () ); + return true; +} + +void KeywordLangMatcher::SetLanguage(pair const & ind, int8_t lang) +{ + ASSERT ( AssertIndex(ind), () ); + m_languagePriorities[ind.first][ind.second] = lang; +} + +int8_t KeywordLangMatcher::GetLanguage(pair const & ind) const +{ + ASSERT ( AssertIndex(ind), () ); + return m_languagePriorities[ind.first][ind.second]; +} + +int KeywordLangMatcher::GetLangScore(int8_t lang) const +{ + int const LANG_TIER_COUNT = static_cast(m_languagePriorities.size()); + + for (int i = 0; i < m_languagePriorities.size(); ++i) + for (int j = 0; j < m_languagePriorities[i].size(); ++j) + if (m_languagePriorities[i][j] == lang) + return -i; // All languages in the same tier are equal. 
+ + return -LANG_TIER_COUNT; +} + +KeywordLangMatcher::ScoreT KeywordLangMatcher::Score(int8_t lang, string const & name) const +{ + return ScoreT(m_keywordMatcher.Score(name), GetLangScore(lang)); +} + +KeywordLangMatcher::ScoreT KeywordLangMatcher::Score(int8_t lang, StringT const & name) const +{ + return ScoreT(m_keywordMatcher.Score(name), GetLangScore(lang)); +} + +KeywordLangMatcher::ScoreT KeywordLangMatcher::Score(int8_t lang, + StringT const * tokens, size_t count) const +{ + return ScoreT(m_keywordMatcher.Score(tokens, count), GetLangScore(lang)); +} + +} // namespace search diff --git a/search/lang_keywords_scorer.hpp b/search/keyword_lang_matcher.hpp similarity index 59% rename from search/lang_keywords_scorer.hpp rename to search/keyword_lang_matcher.hpp index 166375132b..ec273888e5 100644 --- a/search/lang_keywords_scorer.hpp +++ b/search/keyword_lang_matcher.hpp @@ -3,12 +3,28 @@ #include "../std/vector.hpp" - namespace search { -class LangKeywordsScorer +class KeywordLangMatcher { +public: + + class ScoreT + { + public: + ScoreT() {} + bool operator < (ScoreT const & s) const; + private: + friend class KeywordLangMatcher; + + ScoreT(KeywordMatcher::ScoreT const & score, int langScore); + + KeywordMatcher::ScoreT m_parentScore; + int m_langScore; + }; + +private: enum { NUM_LANG_PRIORITY_TIERS = 4 }; enum { MAX_LANGS_IN_TIER = 2 }; @@ -26,12 +42,16 @@ public: m_keywordMatcher.SetKeywords(keywords, count, prefix); } - uint32_t Score(int8_t lang, string const & name) const; - uint32_t Score(int8_t lang, StringT const & name) const; - uint32_t Score(int8_t lang, StringT const * tokens, size_t count) const; + /// @return Score of the name (greater is better). 
+ //@{ + ScoreT Score(int8_t lang, string const & name) const; + ScoreT Score(int8_t lang, StringT const & name) const; + ScoreT Score(int8_t lang, StringT const * tokens, size_t count) const; + //@} private: bool AssertIndex(pair const & ind) const; + int GetLangScore(int8_t lang) const; vector > m_languagePriorities; KeywordMatcher m_keywordMatcher; diff --git a/search/keyword_matcher.cpp b/search/keyword_matcher.cpp index b7d6a4ebf2..a8f0f77fb3 100644 --- a/search/keyword_matcher.cpp +++ b/search/keyword_matcher.cpp @@ -7,84 +7,127 @@ #include "../std/algorithm.hpp" - namespace search { +KeywordMatcher::KeywordMatcher() +{ + Clear(); +} + +void KeywordMatcher::Clear() +{ + m_keywords = NULL; + m_keywordsCount = 0; + m_prefix = NULL; +} + void KeywordMatcher::SetKeywords(StringT const * keywords, size_t count, StringT const * prefix) { - ASSERT_LESS ( count, static_cast(MAX_TOKENS), () ); - - m_keywords.resize(count); - for (size_t i = 0; i < count; ++i) - m_keywords[i] = &keywords[i]; + m_keywords = keywords; + m_keywordsCount = min(static_cast(MAX_TOKENS), count); m_prefix = prefix; if (m_prefix && m_prefix->empty()) - m_prefix = 0; + m_prefix = NULL; } -uint32_t KeywordMatcher::Score(string const & name) const +KeywordMatcher::ScoreT KeywordMatcher::Score(string const & name) const { return Score(NormalizeAndSimplifyString(name)); } -uint32_t KeywordMatcher::Score(StringT const & name) const +KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const & name) const { buffer_vector tokens; SplitUniString(name, MakeBackInsertFunctor(tokens), Delimiters()); - /// @todo Some Arabian names have a lot of tokens. - /// Trim this stuff while generation. - //ASSERT_LESS ( tokens.size(), static_cast(MAX_TOKENS), () ); - - return Score(tokens.data(), min(size_t(MAX_TOKENS-1), tokens.size())); + // Some names can have too many tokens. Trim them. 
+ return Score(tokens.data(), min(size_t(MAX_TOKENS), tokens.size())); } -uint32_t KeywordMatcher::Score(StringT const * tokens, size_t count) const +KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t count) const { - ASSERT_LESS ( count, static_cast(MAX_TOKENS), () ); + vector isQueryTokenMatched(m_keywordsCount); + vector isNameTokenMatched(count); + uint32_t numQueryTokensMatched = 0; + uint32_t sumTokenMatchDistance = 0; + uint32_t prevTokenMatchDistance = 0; + bool bPrefixMatched = true; - // boolean array of matched input tokens - unsigned char isTokenMatched[MAX_TOKENS] = { 0 }; - - // calculate penalty by keywords - add MAX_TOKENS for each unmatched keyword - uint32_t score = 0; - for (size_t i = 0; i < m_keywords.size(); ++i) - { - unsigned char isKeywordMatched = 0; - for (size_t j = 0; j < count; ++j) - if (*m_keywords[i] == tokens[j]) - isKeywordMatched = isTokenMatched[j] = 1; - - if (!isKeywordMatched) - score += MAX_TOKENS; - } - - // calculate penalty for prefix - add MAX_TOKENS for unmatched prefix - if (m_prefix) - { - bool bPrefixMatched = false; - for (size_t i = 0; i < count && !bPrefixMatched; ++i) - if (StartsWith(tokens[i].begin(), tokens[i].end(), - m_prefix->begin(), m_prefix->end())) + for (int i = 0; i < m_keywordsCount; ++i) + for (int j = 0; j < count && !isQueryTokenMatched[i]; ++j) + if (!isNameTokenMatched[j] && m_keywords[i] == tokens[j]) { - bPrefixMatched = true; + isQueryTokenMatched[i] = isNameTokenMatched[j] = true; + uint32_t const tokenMatchDistance = i - j; + sumTokenMatchDistance += abs(tokenMatchDistance - prevTokenMatchDistance); + prevTokenMatchDistance = tokenMatchDistance; } - if (!bPrefixMatched) - score += MAX_TOKENS; + if (m_prefix) + { + bPrefixMatched = false; + for (int j = 0; j < count && !bPrefixMatched; ++j) + if (!isNameTokenMatched[j] && + StartsWith(tokens[j].begin(), tokens[j].end(), m_prefix->begin(), m_prefix->end())) + { + isNameTokenMatched[j] = bPrefixMatched = true; + 
uint32_t const tokenMatchDistance = int(m_keywordsCount) - j; + sumTokenMatchDistance += abs(tokenMatchDistance - prevTokenMatchDistance); + } } - // add penalty for each unmatched token in input sequence - for (size_t i = 0; i <= count; ++i) - { - // check for token length (skip common tokens such as "de", "la", "a") - if (tokens[i].size() > 2 && !isTokenMatched[i]) - ++score; - } + for (size_t i = 0; i < isQueryTokenMatched.size(); ++i) + if (isQueryTokenMatched[i]) + ++numQueryTokensMatched; + + ScoreT score = ScoreT(); + score.m_bFullQueryMatched = bPrefixMatched && (numQueryTokensMatched == isQueryTokenMatched.size()); + score.m_bPrefixMatched = bPrefixMatched; + score.m_numQueryTokensAndPrefixMatched = numQueryTokensMatched + (bPrefixMatched ? 1 : 0); + score.m_nameTokensMatched = 0xFFFFFFFF; + for (uint32_t i = 0; i < min(size_t(32), isNameTokenMatched.size()); ++i) + if (!isNameTokenMatched[i]) + score.m_nameTokensMatched &= ~(1 << (31 - i)); + score.m_sumTokenMatchDistance = sumTokenMatchDistance; return score; } +KeywordMatcher::ScoreT::ScoreT() + : m_sumTokenMatchDistance(0), m_nameTokensMatched(0), m_numQueryTokensAndPrefixMatched(0), + m_bFullQueryMatched(false), m_bPrefixMatched(false) +{ +} + +bool KeywordMatcher::ScoreT::operator < (KeywordMatcher::ScoreT const & s) const +{ + if (m_bFullQueryMatched != s.m_bFullQueryMatched) + return m_bFullQueryMatched < s.m_bFullQueryMatched; + if (m_numQueryTokensAndPrefixMatched != s.m_numQueryTokensAndPrefixMatched) + return m_numQueryTokensAndPrefixMatched < s.m_numQueryTokensAndPrefixMatched; + if (m_bPrefixMatched != s.m_bPrefixMatched) + return m_bPrefixMatched < s.m_bPrefixMatched; + if (m_nameTokensMatched != s.m_nameTokensMatched) + return m_nameTokensMatched < s.m_nameTokensMatched; + if (m_sumTokenMatchDistance != s.m_sumTokenMatchDistance) + return m_sumTokenMatchDistance > s.m_sumTokenMatchDistance; + return false; +} + +string DebugPrint(KeywordMatcher::ScoreT const & score) +{ + ostringstream out; 
+ out << "KeywordMatcher::ScoreT("; + out << "FQM=" << score.m_bFullQueryMatched; + out << ",nQTM=" << static_cast(score.m_numQueryTokensAndPrefixMatched); + out << ",PM=" << score.m_bPrefixMatched; + out << ",NTM="; + for (int i = 31; i >= 0; --i) out << ((score.m_nameTokensMatched >> i) & 1); + out << ",STMD=" << score.m_sumTokenMatchDistance; + out << ")"; + return out.str(); +} + } // namespace search diff --git a/search/keyword_matcher.hpp b/search/keyword_matcher.hpp index 8730df2c8b..1c208c8521 100644 --- a/search/keyword_matcher.hpp +++ b/search/keyword_matcher.hpp @@ -4,7 +4,7 @@ #include "../base/string_utils.hpp" #include "../std/string.hpp" - +#include "../std/vector.hpp" namespace search { @@ -12,29 +12,46 @@ namespace search class KeywordMatcher { public: - enum { MAX_SCORE = MAX_TOKENS * MAX_TOKENS }; typedef strings::UniString StringT; - KeywordMatcher() : m_prefix(0) {} - - inline void Clear() + class ScoreT { - m_keywords.clear(); - m_prefix = 0; - } + public: + ScoreT(); + bool operator < (ScoreT const & s) const; + + private: + friend class KeywordMatcher; + friend string DebugPrint(ScoreT const & score); + + bool IsQueryMatched() const { return m_bFullQueryMatched; } + + uint32_t m_sumTokenMatchDistance; + uint32_t m_nameTokensMatched; + uint8_t m_numQueryTokensAndPrefixMatched; + bool m_bFullQueryMatched : 1; + bool m_bPrefixMatched : 1; + }; + + KeywordMatcher(); + + void Clear(); /// Store references to keywords from source array of strings. void SetKeywords(StringT const * keywords, size_t count, StringT const * prefix); - /// @return penalty of string (less is better). + /// @return Score of the name (greater is better). 
//@{ - uint32_t Score(string const & name) const; - uint32_t Score(StringT const & name) const; - uint32_t Score(StringT const * tokens, size_t count) const; + ScoreT Score(string const & name) const; + ScoreT Score(StringT const & name) const; + ScoreT Score(StringT const * tokens, size_t count) const; //@} + static bool IsQueryMatched(ScoreT const & score) { return score.IsQueryMatched(); } + private: - buffer_vector m_keywords; + StringT const * m_keywords; + size_t m_keywordsCount; StringT const * m_prefix; }; diff --git a/search/lang_keywords_scorer.cpp b/search/lang_keywords_scorer.cpp deleted file mode 100644 index 9e60263f6d..0000000000 --- a/search/lang_keywords_scorer.cpp +++ /dev/null @@ -1,77 +0,0 @@ -#include "lang_keywords_scorer.hpp" - -#include "../indexer/search_string_utils.hpp" -#include "../indexer/search_delimiters.hpp" - -#include "../base/stl_add.hpp" - -#include "../std/algorithm.hpp" - - -namespace search -{ - -void LangKeywordsScorer::SetLanguages(vector > const & languagePriorities) -{ - m_languagePriorities = languagePriorities; - -#ifdef DEBUG - ASSERT_EQUAL ( static_cast(NUM_LANG_PRIORITY_TIERS), m_languagePriorities.size(), () ); - for (int i = 0; i < NUM_LANG_PRIORITY_TIERS; ++i) - ASSERT_LESS_OR_EQUAL ( m_languagePriorities[i].size(), static_cast(MAX_LANGS_IN_TIER), () ); -#endif -} - -bool LangKeywordsScorer::AssertIndex(pair const & ind) const -{ - ASSERT_LESS ( static_cast(ind.first), m_languagePriorities.size(), () ); - ASSERT_LESS ( static_cast(ind.second), m_languagePriorities[ind.first].size(), () ); - return true; -} - -void LangKeywordsScorer::SetLanguage(pair const & ind, int8_t lang) -{ - ASSERT ( AssertIndex(ind), () ); - m_languagePriorities[ind.first][ind.second] = lang; -} - -int8_t LangKeywordsScorer::GetLanguage(pair const & ind) const -{ - ASSERT ( AssertIndex(ind), () ); - return m_languagePriorities[ind.first][ind.second]; -} - -uint32_t LangKeywordsScorer::Score(int8_t lang, string const & name) const -{ - 
return Score(lang, NormalizeAndSimplifyString(name)); -} - -uint32_t LangKeywordsScorer::Score(int8_t lang, StringT const & name) const -{ - buffer_vector tokens; - SplitUniString(name, MakeBackInsertFunctor(tokens), Delimiters()); - - /// @todo Some Arabian names have a lot of tokens. - /// Trim this stuff while generation. - //ASSERT_LESS ( tokens.size(), static_cast(MAX_TOKENS), () ); - - return Score(lang, tokens.data(), min(size_t(MAX_TOKENS-1), tokens.size())); -} - -uint32_t LangKeywordsScorer::Score(int8_t lang, StringT const * tokens, size_t count) const -{ - uint32_t const keywordScore = m_keywordMatcher.Score(tokens, count); - - // get score by language priority - uint32_t const factor = KeywordMatcher::MAX_SCORE * MAX_LANGS_IN_TIER; - uint32_t const value = keywordScore * MAX_LANGS_IN_TIER; - - for (uint32_t i = 0; i < NUM_LANG_PRIORITY_TIERS; ++i) - for (uint32_t j = 0; j < m_languagePriorities[i].size(); ++j) - if (m_languagePriorities[i][j] == lang) - return (i * factor + value + j); - - return (NUM_LANG_PRIORITY_TIERS * factor); -} - -} // namespace search diff --git a/search/search.pro b/search/search.pro index 780752e84e..1fb1f49adb 100644 --- a/search/search.pro +++ b/search/search.pro @@ -19,7 +19,7 @@ HEADERS += \ latlon_match.hpp \ approximate_string_match.hpp \ feature_offset_match.hpp \ - lang_keywords_scorer.hpp \ + keyword_lang_matcher.hpp \ params.hpp \ SOURCES += \ @@ -30,5 +30,5 @@ SOURCES += \ result.cpp \ latlon_match.cpp \ approximate_string_match.cpp \ - lang_keywords_scorer.cpp \ + keyword_lang_matcher.cpp \ params.cpp \ diff --git a/search/search_query.cpp b/search/search_query.cpp index ec7d210070..5bdbea055e 100644 --- a/search/search_query.cpp +++ b/search/search_query.cpp @@ -1,6 +1,5 @@ #include "search_query.hpp" #include "feature_offset_match.hpp" -#include "lang_keywords_scorer.hpp" #include "latlon_match.hpp" #include "search_common.hpp" @@ -441,8 +440,7 @@ namespace impl m_pFV->GetFeature(id.second, f); - uint32_t 
penalty; - m_query.GetBestMatchName(f, penalty, name); + m_query.GetBestMatchName(f, name); // country (region) name is a file name if feature isn't from World.mwm if (m_pFV->IsWorld()) @@ -592,22 +590,21 @@ namespace impl class BestNameFinder { - uint32_t & m_penalty; + KeywordLangMatcher::ScoreT m_score; string & m_name; - LangKeywordsScorer const & m_keywordsScorer; + KeywordLangMatcher const & m_keywordsScorer; public: - BestNameFinder(uint32_t & penalty, string & name, LangKeywordsScorer const & keywordsScorer) - : m_penalty(penalty), m_name(name), m_keywordsScorer(keywordsScorer) + BestNameFinder(string & name, KeywordLangMatcher const & keywordsScorer) + : m_score(), m_name(name), m_keywordsScorer(keywordsScorer) { - m_penalty = uint32_t(-1); } - bool operator()(signed char lang, string const & name) const + bool operator()(signed char lang, string const & name) { - uint32_t penalty = m_keywordsScorer.Score(lang, name); - if (penalty < m_penalty) + KeywordLangMatcher::ScoreT const score = m_keywordsScorer.Score(lang, name); + if (m_score < score) { - m_penalty = penalty; + m_score = score; m_name = name; } return true; @@ -616,19 +613,10 @@ public: } // namespace search::impl -void Query::GetBestMatchName(FeatureType const & f, uint32_t & penalty, string & name) const +void Query::GetBestMatchName(FeatureType const & f, string & name) const { - impl::BestNameFinder bestNameFinder(penalty, name, m_keywordsScorer); + impl::BestNameFinder bestNameFinder(name, m_keywordsScorer); (void)f.ForEachNameRef(bestNameFinder); - - /* - if (!f.ForEachNameRef(bestNameFinder)) - { - feature::TypesHolder types(f); - LOG(LDEBUG, (types)); - LOG(LDEBUG, (f.GetLimitRect(FeatureType::BEST_GEOMETRY))); - } - */ } Result Query::MakeResult(impl::PreResult2 const & r, set const * pPrefferedTypes/* = 0*/) const diff --git a/search/search_query.hpp b/search/search_query.hpp index fda7e1c93a..122c525036 100644 --- a/search/search_query.hpp +++ b/search/search_query.hpp @@ -1,6 +1,6 @@ 
#pragma once #include "intermediate_result.hpp" -#include "lang_keywords_scorer.hpp" +#include "keyword_lang_matcher.hpp" #include "../indexer/search_trie.hpp" #include "../indexer/index.hpp" // for Index::MwmLock @@ -173,7 +173,7 @@ private: bool MatchForSuggestionsImpl(strings::UniString const & token, int8_t lang, Results & res); void MatchForSuggestions(strings::UniString const & token, Results & res); - void GetBestMatchName(FeatureType const & f, uint32_t & penalty, string & name) const; + void GetBestMatchName(FeatureType const & f, string & name) const; Result MakeResult(impl::PreResult2 const & r, set const * pPrefferedTypes = 0) const; @@ -209,7 +209,7 @@ private: void SetLanguage(int id, int8_t lang); int8_t GetLanguage(int id) const; - LangKeywordsScorer m_keywordsScorer; + KeywordLangMatcher m_keywordsScorer; OffsetsVectorT m_offsetsInViewport[RECTSCOUNT]; diff --git a/search/search_tests/keyword_lang_matcher_test.cpp b/search/search_tests/keyword_lang_matcher_test.cpp new file mode 100644 index 0000000000..360c071d3d --- /dev/null +++ b/search/search_tests/keyword_lang_matcher_test.cpp @@ -0,0 +1,71 @@ +#include "../../testing/testing.hpp" +#include "../keyword_lang_matcher.hpp" + +#include "../../indexer/search_delimiters.hpp" +#include "../../indexer/search_string_utils.hpp" + +#include "../../base/stl_add.hpp" + +#include "../../std/vector.hpp" + +namespace +{ + +using search::KeywordLangMatcher; +typedef search::KeywordLangMatcher::ScoreT ScoreT; + +enum +{ + LANG_UNKNOWN = 1, + LANG_SOME = 2, + LANG_SOME_OTHER = 3, + LANG_HIGH_PRIORITY = 10 +}; + +KeywordLangMatcher CreateMatcher(string const & query) +{ + KeywordLangMatcher matcher; + + vector > langPriorities(4, vector()); + langPriorities[0].push_back(LANG_HIGH_PRIORITY); + // langPriorities[1] is intentionally left empty. + langPriorities[2].push_back(LANG_SOME); + langPriorities[2].push_back(LANG_SOME_OTHER); + // langPriorities[3] is intentionally left empty. 
+ matcher.SetLanguages(langPriorities); + + vector keywords; + strings::UniString prefix; + if (search::TokenizeStringAndCheckIfLastTokenIsPrefix(query, keywords, search::Delimiters())) + { + prefix = keywords.back(); + keywords.pop_back(); + } + matcher.SetKeywords(keywords.data(), keywords.size(), &prefix); + + return matcher; +} + +} // unnamed namespace + + +UNIT_TEST(KeywordMatcher_TokensMatchHasPriority) +{ +} + +UNIT_TEST(KeywordMatcher_LanguageMatchIsUsedWhenTokenMatchIsTheSame) +{ + char const * query = "test"; + char const * name = "test"; + KeywordLangMatcher matcher = CreateMatcher(query); + + TEST(matcher.Score(LANG_UNKNOWN, name) < matcher.Score(LANG_SOME, name), ()); + TEST(matcher.Score(LANG_UNKNOWN, name) < matcher.Score(LANG_SOME_OTHER, name), ()); + TEST(matcher.Score(LANG_UNKNOWN, name) < matcher.Score(LANG_HIGH_PRIORITY, name), ()); + + TEST(!(matcher.Score(LANG_SOME, name) < matcher.Score(LANG_SOME_OTHER, name)), ()); + TEST(!(matcher.Score(LANG_SOME_OTHER, name) < matcher.Score(LANG_SOME, name)), ()); + + TEST(matcher.Score(LANG_SOME, name) < matcher.Score(LANG_HIGH_PRIORITY, name), ()); + TEST(matcher.Score(LANG_SOME_OTHER, name) < matcher.Score(LANG_HIGH_PRIORITY, name), ()); +} diff --git a/search/search_tests/keyword_matcher_test.cpp b/search/search_tests/keyword_matcher_test.cpp index 2653869d5f..34cad61305 100644 --- a/search/search_tests/keyword_matcher_test.cpp +++ b/search/search_tests/keyword_matcher_test.cpp @@ -1,79 +1,300 @@ #include "../../testing/testing.hpp" #include "../keyword_matcher.hpp" + +#include "../search_common.hpp" + #include "../../indexer/search_string_utils.hpp" #include "../../indexer/search_delimiters.hpp" + #include "../../base/buffer_vector.hpp" #include "../../base/stl_add.hpp" + #include "../../std/scoped_ptr.hpp" +#include "../../std/sstream.hpp" +#include "../../std/vector.hpp" namespace { -static const uint32_t MAX_SCORE = search::KeywordMatcher::MAX_SCORE; -class Matcher +using search::KeywordMatcher; 
+typedef search::KeywordMatcher::ScoreT ScoreT; +using search::MAX_TOKENS; + +enum ExpectedMatchResult { -public: - Matcher(char const * query) - { - strings::UniString const uniQuery = search::NormalizeAndSimplifyString(query); - SplitUniString(uniQuery, MakeBackInsertFunctor(m_keywords), search::Delimiters()); - if (!uniQuery.empty() && uniQuery.back() != ' ') - { - m_prefix = m_keywords.back(); - m_keywords.pop_back(); - } + NOMATCH, + MATCHES, + ANY_RES +}; - m_matcher.SetKeywords(m_keywords.data(), m_keywords.size(), &m_prefix); +enum ExpectedScoreComparison +{ + DOES_NOT_MATTER, // Score does not matter. + PERFECTLY_EQUAL, // Matches with the score == previous. + BETTER_OR_EQUAL, // Matches with the score >= previous (greater is better). + STRONGLY_BETTER // Matches with the score > previous (greater is better). +}; + +struct KeywordMatcherTestCase +{ + ExpectedMatchResult m_eMatch; + ExpectedScoreComparison m_eMatchType; + char const * m_name; +}; + +template +void TestKeywordMatcher(char const * const query, KeywordMatcherTestCase const (&testCases)[N]) +{ + vector keywords; + strings::UniString prefix; + if (search::TokenizeStringAndCheckIfLastTokenIsPrefix(query, keywords, search::Delimiters())) + { + prefix = keywords.back(); + keywords.pop_back(); } - search::KeywordMatcher m_matcher; -private: - buffer_vector m_keywords; - strings::UniString m_prefix; -}; + KeywordMatcher matcher; + matcher.SetKeywords(keywords.data(), keywords.size(), &prefix); + ScoreT prevScore = ScoreT(); + for (size_t i = 0; i < N; ++i) + { + char const * const name = testCases[i].m_name; + char const * const prevName = (i == 0 ? 
"N/A" : testCases[i-1].m_name); + ScoreT const testScore = matcher.Score(name); + + // Test that a newly created matcher returns the same result + { + KeywordMatcher freshMatcher; + freshMatcher.SetKeywords(keywords.data(), keywords.size(), &prefix); + ScoreT const freshScore = freshMatcher.Score(name); + // TEST_EQUAL(testScore, freshScore, (query, name)); + TEST(!(testScore < freshScore), (query, name)); + TEST(!(freshScore < testScore), (query, name)); + } + + if (testCases[i].m_eMatch != ANY_RES) + { + TEST_EQUAL(testCases[i].m_eMatch == MATCHES, + KeywordMatcher::IsQueryMatched(testScore), + (query, name, testScore)); + } + + switch (testCases[i].m_eMatchType) + { + case DOES_NOT_MATTER: + break; + case PERFECTLY_EQUAL: + TEST(!(testScore < prevScore), (query, name, testScore, prevName, prevScore)); + TEST(!(prevScore < testScore), (query, name, testScore, prevName, prevScore)); + break; + case BETTER_OR_EQUAL: + TEST(!(testScore < prevScore), (query, name, testScore, prevName, prevScore)); + break; + case STRONGLY_BETTER: + TEST(prevScore < testScore, (query, name, testScore, prevName, prevScore)); + break; + default: + ASSERT(false, ()); + } + + prevScore = testScore; + } +} } // unnamed namespace -UNIT_TEST(KeywordMatcher_New) +UNIT_TEST(KeywordMatcher_Prefix) { - Matcher matcher("new "); - TEST_EQUAL(matcher.m_matcher.Score("new"), 0, ()); - TEST_EQUAL(matcher.m_matcher.Score("york"), MAX_SCORE, ()); - TEST_EQUAL(matcher.m_matcher.Score("new york"), 0, ()); + char const query[] = "new"; + KeywordMatcherTestCase const testCases[] = + { + {NOMATCH, DOES_NOT_MATTER, ""}, + {NOMATCH, DOES_NOT_MATTER, "zzz"}, + {NOMATCH, DOES_NOT_MATTER, "ne"}, + + {MATCHES, STRONGLY_BETTER, "the newark"}, + {MATCHES, BETTER_OR_EQUAL, "york new"}, + + {MATCHES, STRONGLY_BETTER, "new york gym"}, + {MATCHES, BETTER_OR_EQUAL, "new new york"}, + + {MATCHES, STRONGLY_BETTER, "new york"}, + + {MATCHES, STRONGLY_BETTER, "newark"}, + {MATCHES, BETTER_OR_EQUAL, "new"}, + }; + 
TestKeywordMatcher(query, testCases); } -UNIT_TEST(KeywordMatcher_York) +UNIT_TEST(KeywordMatcher_Keyword) { - Matcher matcher("york "); - TEST_EQUAL(matcher.m_matcher.Score("new"), MAX_SCORE, ()); - TEST_EQUAL(matcher.m_matcher.Score("york"), 0, ()); - TEST_EQUAL(matcher.m_matcher.Score("new york"), 1, ()); + char const query[] = "new "; + KeywordMatcherTestCase const testCases[] = + { + {NOMATCH, DOES_NOT_MATTER, ""}, + {NOMATCH, DOES_NOT_MATTER, "zzz"}, + {NOMATCH, DOES_NOT_MATTER, "ne"}, + {NOMATCH, DOES_NOT_MATTER, "the netherlands"}, + + {NOMATCH, STRONGLY_BETTER, "newark"}, + + {MATCHES, STRONGLY_BETTER, "york new"}, + + {MATCHES, STRONGLY_BETTER, "new york gym"}, + {MATCHES, BETTER_OR_EQUAL, "new new york"}, + + {MATCHES, STRONGLY_BETTER, "new york"}, + }; + TestKeywordMatcher(query, testCases); } -UNIT_TEST(KeywordMatcher_NewYork) +UNIT_TEST(KeywordMatcher_SanSa_ShouldMatch_SanSalvador_BetterThan_San) { - Matcher matcher1("new york "); - Matcher matcher2("new york"); - TEST_EQUAL(matcher1.m_matcher.Score("new"), MAX_SCORE, ()); - TEST_EQUAL(matcher2.m_matcher.Score("new"), MAX_SCORE, ()); - TEST_EQUAL(matcher1.m_matcher.Score("york"), MAX_SCORE, ()); - TEST_EQUAL(matcher2.m_matcher.Score("york"), MAX_SCORE, ()); - TEST_EQUAL(matcher1.m_matcher.Score("new york"), 0, ()); - TEST_EQUAL(matcher2.m_matcher.Score("new york"), 0, ()); + char const query[] = "San Sa"; + + KeywordMatcherTestCase const testCases[] = + { + {NOMATCH, DOES_NOT_MATTER, "San"}, + {MATCHES, STRONGLY_BETTER, "San Salvador"}, + }; + TestKeywordMatcher(query, testCases); } -UNIT_TEST(KeywordMatcher_YorkNew) +UNIT_TEST(KeywordMatcher_KeywordAndPrefix) { - Matcher matcher("new york "); - TEST_EQUAL(matcher.m_matcher.Score("new"), MAX_SCORE, ()); - TEST_EQUAL(matcher.m_matcher.Score("york"), MAX_SCORE, ()); - TEST_EQUAL(matcher.m_matcher.Score("new york"), 0, ()); + char const query[] = "new yo"; + + KeywordMatcherTestCase const testCases[] = + { + {NOMATCH, DOES_NOT_MATTER, "new"}, + {NOMATCH, 
DOES_NOT_MATTER, "new old"}, + {NOMATCH, DOES_NOT_MATTER, "old york"}, + + {MATCHES, STRONGLY_BETTER, "the york new"}, + + {MATCHES, STRONGLY_BETTER, "the new york"}, + {MATCHES, BETTER_OR_EQUAL, "york new the"}, + + {MATCHES, STRONGLY_BETTER, "new york pizza"}, + + {MATCHES, STRONGLY_BETTER, "york new"}, + {MATCHES, BETTER_OR_EQUAL, "yo new"}, + + {MATCHES, STRONGLY_BETTER, "new york"}, + {MATCHES, BETTER_OR_EQUAL, "new yo"}, + }; + TestKeywordMatcher(query, testCases); } -UNIT_TEST(KeywordMatcher_NewYo) +UNIT_TEST(KeywordMatcher_KeywordAndKeyword) { - Matcher matcher("new yo"); - TEST_EQUAL(matcher.m_matcher.Score("new"), MAX_SCORE, ()); - TEST_EQUAL(matcher.m_matcher.Score("york"), MAX_SCORE, ()); - TEST_EQUAL(matcher.m_matcher.Score("new york"), 0, ()); + char const query[] = "new york "; + + KeywordMatcherTestCase const testCases[] = + { + {NOMATCH, DOES_NOT_MATTER, "new"}, + {NOMATCH, DOES_NOT_MATTER, "new old"}, + {NOMATCH, DOES_NOT_MATTER, "old york"}, + {NOMATCH, DOES_NOT_MATTER, "new yorkshire"}, + {NOMATCH, DOES_NOT_MATTER, "york newcastle"}, + + {MATCHES, STRONGLY_BETTER, "the york new"}, + + {MATCHES, STRONGLY_BETTER, "the new york"}, + {MATCHES, BETTER_OR_EQUAL, "york new the"}, + + {MATCHES, STRONGLY_BETTER, "new york pizza"}, + + {MATCHES, STRONGLY_BETTER, "york new"}, + + {MATCHES, STRONGLY_BETTER, "new york"}, + }; + TestKeywordMatcher(query, testCases); +} + + +namespace +{ + +string GetManyTokens(string tokenPrefix, int tokenCount, bool countForward = true) +{ + ostringstream out; + for (int i = 0; i < tokenCount; ++i) + out << tokenPrefix << (countForward ? 
i : tokenCount - 1 - i) << " "; + return out.str(); +} + +} // unnamed namespace + +UNIT_TEST(KeywordMatcher_QueryTooLong) +{ + for (int queryLength = MAX_TOKENS - 2; queryLength <= MAX_TOKENS + 2; ++queryLength) + { + string const query = GetManyTokens("Q", queryLength); + string const queryWithPrefix = query + " Prefix"; + string const queryWithPrefixAndSomethingElse = query + " PrefixAndSomethingElse"; + + KeywordMatcherTestCase const testCases[] = + { + {NOMATCH, DOES_NOT_MATTER, ""}, + {NOMATCH, DOES_NOT_MATTER, "Q"}, + {NOMATCH, DOES_NOT_MATTER, "Q "}, + {NOMATCH, DOES_NOT_MATTER, "Q3"}, + {NOMATCH, DOES_NOT_MATTER, "Q3 "}, + {NOMATCH, DOES_NOT_MATTER, "Q3 Q"}, + {NOMATCH, DOES_NOT_MATTER, "Q3 Q4"}, + {NOMATCH, DOES_NOT_MATTER, "zzz"}, + + {NOMATCH, DOES_NOT_MATTER, "Q"}, + {ANY_RES, STRONGLY_BETTER, query.c_str()}, + + {NOMATCH, DOES_NOT_MATTER, "Q"}, + {ANY_RES, STRONGLY_BETTER, queryWithPrefix.c_str()}, + + {NOMATCH, DOES_NOT_MATTER, "Q"}, + {ANY_RES, STRONGLY_BETTER, queryWithPrefixAndSomethingElse.c_str()}, + }; + TestKeywordMatcher(query.c_str(), testCases); + TestKeywordMatcher(queryWithPrefix.c_str(), testCases); + } +} + +UNIT_TEST(KeywordMatcher_NameTooLong) +{ + string const name[] = + { + "Aa Bb " + GetManyTokens("T", MAX_TOKENS + 1), + "Aa Bb " + GetManyTokens("T", MAX_TOKENS), + "Aa Bb " + GetManyTokens("T", MAX_TOKENS - 1), + }; + + KeywordMatcherTestCase const testCases[] = + { + {NOMATCH, DOES_NOT_MATTER, "zzz"}, + + {MATCHES, STRONGLY_BETTER, name[0].c_str()}, + {MATCHES, BETTER_OR_EQUAL, name[1].c_str()}, + {MATCHES, BETTER_OR_EQUAL, name[2].c_str()}, + }; + + char const * query[] = { "a", "aa", "aa ", "b", "bb", "bb ", "t" }; + for (int i = 0; i < ARRAY_SIZE(query); ++i) + TestKeywordMatcher(query[i], testCases); +} + +UNIT_TEST(KeywordMatcher_ManyTokensInReverseOrder) +{ + string const query = GetManyTokens("Q", MAX_TOKENS); + string const name = GetManyTokens("Q", MAX_TOKENS); + string const reversedName = GetManyTokens("Q", MAX_TOKENS, 
false); + + KeywordMatcherTestCase const testCases[] = + { + {NOMATCH, DOES_NOT_MATTER, "zzz"}, + + {MATCHES, STRONGLY_BETTER, reversedName.c_str()}, + + {MATCHES, STRONGLY_BETTER, name.c_str()}, + }; + TestKeywordMatcher(query.c_str(), testCases); } diff --git a/search/search_tests/search_tests.pro b/search/search_tests/search_tests.pro index f38bd71704..a099247df9 100644 --- a/search/search_tests/search_tests.pro +++ b/search/search_tests/search_tests.pro @@ -20,6 +20,7 @@ win32 { SOURCES += \ ../../testing/testingmain.cpp \ keyword_matcher_test.cpp \ + keyword_lang_matcher_test.cpp \ latlon_match_test.cpp \ string_match_test.cpp \