diff --git a/search/keyword_matcher.cpp b/search/keyword_matcher.cpp index c8137ed8ed..67e0ca78b3 100644 --- a/search/keyword_matcher.cpp +++ b/search/keyword_matcher.cpp @@ -1,93 +1,75 @@ #include "keyword_matcher.hpp" #include "../indexer/search_delimiters.hpp" #include "../indexer/search_string_utils.hpp" -#include "../base/logging.hpp" +#include "../base/stl_add.hpp" #include "../base/string_utils.hpp" -#include "../std/bind.hpp" -#include "../std/numeric.hpp" +#include "../std/algorithm.hpp" -namespace search +search::KeywordMatcher::KeywordMatcher(strings::UniString const * const * pKeywords, + int keywordCount, + strings::UniString const * pPrefix) + : m_pKeywords(pKeywords), m_keywordCount(keywordCount), m_pPrefix(pPrefix) { -namespace impl -{ - -KeywordMatcher::KeywordMatcher(strings::UniString const * const * pKeywords, - size_t keywordsCount, - strings::UniString const & prefix, - uint32_t maxKeywordMatchCost, uint32_t maxPrefixMatchCost, - StringMatchFn keywordMatchFn, StringMatchFn prefixMatchFn) - : m_pKeywords(pKeywords), m_prefix(prefix), - m_maxKeywordMatchCost(maxKeywordMatchCost), - m_maxPrefixMatchCost(maxPrefixMatchCost), - m_keywordMatchFn(keywordMatchFn), - m_prefixMatchFn(prefixMatchFn), - m_minKeywordMatchCost(keywordsCount, m_maxKeywordMatchCost + 1), - m_minPrefixMatchCost(m_maxPrefixMatchCost + 1), - m_bestMatchNamePenalty(static_cast(-1)) -{ -#ifdef DEBUG - for (size_t i = 0; i < keywordsCount; ++i) - ASSERT(!m_pKeywords[i]->empty(), (i)); -#endif + ASSERT_LESS(m_keywordCount, int(MAX_TOKENS), ()); + m_keywordCount = min(m_keywordCount, int(MAX_TOKENS)); + if (m_pPrefix && m_pPrefix->empty()) + m_pPrefix = NULL; } -void KeywordMatcher::ProcessName(string const & name) +uint32_t search::KeywordMatcher::Score(string const & name) const { - SplitUniString(NormalizeAndSimplifyString(name), - bind(&KeywordMatcher::ProcessNameToken, this, cref(name), _1), - Delimiters()); + return Score(NormalizeAndSimplifyString(name)); } -void KeywordMatcher::ProcessNameToken(string const & name, strings::UniString const & s) +uint32_t search::KeywordMatcher::Score(strings::UniString const & name) const { - uint32_t matchPenalty = 0; - for (size_t i = 0; i < m_minKeywordMatchCost.size(); ++i) - { - strings::UniString const & keyword = *(m_pKeywords[i]); - uint32_t const matchCost = m_keywordMatchFn(&keyword[0], keyword.size(), - &s[0], s.size(), m_minKeywordMatchCost[i]); - matchPenalty += matchCost; - if (matchCost <= m_maxKeywordMatchCost) - { - if (matchCost < m_minKeywordMatchCost[i]) - { - // LOG(LDEBUG, (matchCost, name)); - m_minKeywordMatchCost[i] = matchCost; - } - } - } - - bool bPrefixMatch = false; - if (!m_prefix.empty()) - { - uint32_t const matchCost = m_prefixMatchFn(&m_prefix[0], m_prefix.size(), - &s[0], s.size(), m_minPrefixMatchCost); - matchPenalty += matchCost; - if (matchCost <= m_maxPrefixMatchCost) - { - bPrefixMatch = true; - if (matchCost < m_minPrefixMatchCost) - m_minPrefixMatchCost = matchCost; - } - } - else - { - bPrefixMatch = true; - m_minPrefixMatchCost = 0; - } - - if (bPrefixMatch && matchPenalty < m_bestMatchNamePenalty) - { - m_bestMatchName = name; - m_bestMatchNamePenalty = matchPenalty; - } + buffer_vector tokens; + SplitUniString(name, MakeBackInsertFunctor(tokens), Delimiters()); + ASSERT_LESS(tokens.size(), size_t(MAX_TOKENS), ()); + return Score(tokens.data(), static_cast(tokens.size())); } -uint32_t KeywordMatcher::GetMatchScore() const +uint32_t search::KeywordMatcher::Score(strings::UniString const * tokens, int tokenCount) const { - return accumulate(m_minKeywordMatchCost.begin(), m_minKeywordMatchCost.end(), - m_minPrefixMatchCost); -} + ASSERT_LESS(tokenCount, int(MAX_TOKENS), ()); -} // namespace search::impl -} // namespace search + // We will use this for scoring. + unsigned char isTokenMatched[MAX_TOKENS] = { 0 }; + + // Check that all keywords matched. + for (int k = 0; k < m_keywordCount; ++k) + { + unsigned char isKeywordMatched = 0; + for (int t = 0; t < tokenCount; ++t) + if (*m_pKeywords[k] == tokens[t]) + isKeywordMatched = isTokenMatched[t] = 1; + + // All keywords should be matched. + if (!isKeywordMatched) + return MAX_SCORE; + } + + // Check that prefix matched. + if (m_pPrefix) + { + bool bPrefixMatched = false; + for (int t = 0; t < tokenCount && !bPrefixMatched; ++t) + if (StartsWith(tokens[t].begin(), tokens[t].end(), + m_pPrefix->begin(), m_pPrefix->end())) + bPrefixMatched = true; + if (!bPrefixMatched) + return MAX_SCORE; + } + + // Calculate score. + int lastTokenMatched = 0; + for (int t = 0; t < tokenCount; ++t) + if (isTokenMatched[t]) + lastTokenMatched = t; + uint32_t score = 0; + for (int t = 0; t <= lastTokenMatched; ++t) + if (tokens[t].size() > 2 && !isTokenMatched[t]) + ++score; + + return score; +} diff --git a/search/keyword_matcher.hpp b/search/keyword_matcher.hpp index d202f2f288..3dcced4254 100644 --- a/search/keyword_matcher.hpp +++ b/search/keyword_matcher.hpp @@ -1,59 +1,31 @@ #pragma once -#include "../base/base.hpp" +#include "search_common.hpp" +#include "../base/assert.hpp" #include "../base/buffer_vector.hpp" #include "../base/string_utils.hpp" #include "../std/string.hpp" namespace search { -namespace impl -{ -typedef uint32_t (* StringMatchFn)(strings::UniChar const * sA, uint32_t sizeA, - strings::UniChar const * sB, uint32_t sizeB, - uint32_t maxCost); - - -// Matches keywords agains given names. class KeywordMatcher { - strings::UniString const * const * m_pKeywords; - strings::UniString const & m_prefix; - uint32_t m_maxKeywordMatchCost, m_maxPrefixMatchCost; - StringMatchFn m_keywordMatchFn, m_prefixMatchFn; - buffer_vector m_minKeywordMatchCost; - uint32_t m_minPrefixMatchCost; - string m_bestMatchName; - uint32_t m_bestMatchNamePenalty; - public: - KeywordMatcher(strings::UniString const * const * pKeywords, - size_t keywordsCount, - strings::UniString const & prefix, - uint32_t maxKeywordMatchCost, uint32_t maxPrefixMatchCost, - StringMatchFn keywordMatchFn, StringMatchFn prefixMatchFn); + enum { MAX_SCORE = MAX_TOKENS }; - void ProcessName(string const & name); - void ProcessNameToken(string const & name, strings::UniString const & token); + KeywordMatcher(strings::UniString const * const * pKeywords, int keywordCount, + strings::UniString const * pPrefix); - // Useful for FeatureType.ForEachName(), calls ProcessName() and always returns true. - bool operator () (int /*lang*/, string const & name) - { - ProcessName(name); - return true; - } - // Get total feature match score. - uint32_t GetMatchScore() const; + // Returns penalty (which is less than MAX_SCORE) if name matched, or MAX_SCORE otherwise. + uint32_t Score(string const & name) const; + uint32_t Score(strings::UniString const & name) const; + uint32_t Score(strings::UniString const * tokens, int tokenCount) const; - // Get prefix match score. - uint32_t GetPrefixMatchScore() const { return m_minPrefixMatchCost; } - - // Get match score for each keyword. - uint32_t const * GetKeywordMatchScores() const { return &m_minKeywordMatchCost[0]; } - - string GetBestMatchName() const { return m_bestMatchName; } +private: + strings::UniString const * const * m_pKeywords; + int m_keywordCount; + strings::UniString const * m_pPrefix; }; -} // namespace search::impl } // namespace search diff --git a/search/query.cpp b/search/query.cpp index 1aa6262f72..fb808f261f 100644 --- a/search/query.cpp +++ b/search/query.cpp @@ -59,15 +59,26 @@ inline uint32_t GetMaxPrefixMatchScore(int size) return 512; } -template -inline KeywordMatcher MakeMatcher(UniStringPtrVectorT const & tokens, - strings::UniString const & prefix) +struct FeatureMatcher { - return KeywordMatcher(tokens.empty() ? NULL : &tokens[0], tokens.size(), - prefix, - GetMaxKeywordMatchScore(), GetMaxPrefixMatchScore(prefix.size()), - &KeywordMatch, &PrefixMatch); -} + KeywordMatcher & m_keywordMatcher; + uint32_t m_minScore; + string m_bestName; + + explicit FeatureMatcher(KeywordMatcher & keywordMatcher) + : m_keywordMatcher(keywordMatcher), m_minScore(keywordMatcher.MAX_SCORE) {} + + bool operator () (int /*lang*/, string const & name) + { + uint32_t const score = m_keywordMatcher.Score(name); + if (score < m_minScore) + { + m_minScore = score; + m_bestName = name; + } + return true; + } +}; struct FeatureProcessor { @@ -99,23 +110,20 @@ struct FeatureProcessor if (!(keywordsSkipMask & (1 << i))) keywords.push_back(&queryKeywords[i]); - KeywordMatcher matcher(MakeMatcher(keywords, m_query.GetPrefix())); + KeywordMatcher keywordMatcher(keywords.data(), int(keywords.size()), &m_query.GetPrefix()); + FeatureMatcher matcher(keywordMatcher); feature.ForEachNameRef(matcher); - if (matcher.GetPrefixMatchScore() <= GetMaxPrefixMatchScore(m_query.GetPrefix().size())) + if (matcher.m_minScore < KeywordMatcher::MAX_SCORE) { - uint32_t const matchScore = matcher.GetMatchScore(); - if (matchScore <= GetMaxKeywordMatchScore()) - { - pair const scaleRange = feature::DrawableScaleRangeForText(feature); - if (scaleRange.first < 0) - return; - - m_query.AddResult(IntermediateResult(m_query.GetViewport(), - feature, - matcher.GetBestMatchName(), - matchScore, - scaleRange.first)); - } + pair const scaleRange = feature::DrawableScaleRangeForText(feature); + // TODO: Why scaleRange.first can be < 0? + if (scaleRange.first < 0) + return; + m_query.AddResult(IntermediateResult(m_query.GetViewport(), + feature, + matcher.m_bestName, + matcher.m_minScore, + scaleRange.first)); } } }; @@ -211,15 +219,17 @@ void Query::Search(function const & f) // TODO: Prefer user languages here. if (m_prefix.size() >= iName->m_prefixLengthToSuggest) { - KeywordMatcher matcher = MakeMatcher(vector(), m_prefix); - matcher.ProcessNameToken(string(), NormalizeAndSimplifyString(iName->m_name)); - ASSERT_LESS(iName->m_prefixLengthToSuggest, 1 << PREFIX_LEN_BITS, ()); - int const penalty = - (matcher.GetPrefixMatchScore() << PREFIX_LEN_BITS) + iName->m_prefixLengthToSuggest; - if (penalty < bestPrefixMatchPenalty) + KeywordMatcher matcher(NULL, 0, &m_prefix); + int const score = matcher.Score(iName->m_name); + if (score < KeywordMatcher::MAX_SCORE) { - bestPrefixMatchPenalty = penalty; - bestPrefixMatch = iName->m_name; + ASSERT_LESS(iName->m_prefixLengthToSuggest, 1 << PREFIX_LEN_BITS, ()); + int const penalty = (score << PREFIX_LEN_BITS) + iName->m_prefixLengthToSuggest; + if (penalty < bestPrefixMatchPenalty) + { + bestPrefixMatchPenalty = penalty; + bestPrefixMatch = iName->m_name; + } } } } diff --git a/search/search_tests/keyword_matcher_test.cpp b/search/search_tests/keyword_matcher_test.cpp index 94181f50e1..35683d9e20 100644 --- a/search/search_tests/keyword_matcher_test.cpp +++ b/search/search_tests/keyword_matcher_test.cpp @@ -1,110 +1,90 @@ #include "../../testing/testing.hpp" #include "../keyword_matcher.hpp" -#include "match_cost_mock.hpp" -#include "../approximate_string_match.hpp" #include "../../indexer/search_string_utils.hpp" -#include "../../testing/testing_utils.hpp" -#include "../../base/string_utils.hpp" +#include "../../indexer/search_delimiters.hpp" +#include "../../base/buffer_vector.hpp" +#include "../../base/stl_add.hpp" #include "../../std/scoped_ptr.hpp" -#include "../../std/vector.hpp" namespace { +static const uint32_t MAX_SCORE = search::KeywordMatcher::MAX_SCORE; -uint32_t KeywordMatchForTest(strings::UniChar const * sA, uint32_t sizeA, - strings::UniChar const * sB, uint32_t sizeB, - uint32_t maxCost) +class Matcher { - return StringMatchCost(sA, sizeA, sB, sizeB, search::MatchCostMock(), - maxCost, false); -} - -uint32_t PrefixMatchForTest(strings::UniChar const * sA, uint32_t sizeA, - strings::UniChar const * sB, uint32_t sizeB, - uint32_t maxCost) -{ - return StringMatchCost(sA, sizeA, sB, sizeB, search::MatchCostMock(), - maxCost, true); -} - -struct KeywordMatcherAdaptor -{ - explicit KeywordMatcherAdaptor(char const * prefix, - uint32_t maxKeywordMatchCost, uint32_t maxPrefixMatchCost, - char const * s0, char const * s1 = NULL) +public: + Matcher(char const * query) { - m_keywords.push_back(strings::MakeUniString(s0)); - if (s1) - m_keywords.push_back(strings::MakeUniString(s1)); + strings::UniString const uniQuery = search::NormalizeAndSimplifyString(query); + SplitUniString(uniQuery, MakeBackInsertFunctor(m_keywords), search::Delimiters()); + if (!uniQuery.empty() && uniQuery.back() != ' ') + { + m_prefix = m_keywords.back(); + m_keywords.pop_back(); + } + m_ptrs.resize(m_keywords.size()); for (size_t i = 0; i < m_keywords.size(); ++i) - m_keywordPtrs.push_back(&m_keywords[i]); - m_pMatcher.reset(new search::impl::KeywordMatcher(&m_keywordPtrs[0], m_keywordPtrs.size(), - strings::MakeUniString(prefix), - maxKeywordMatchCost, maxPrefixMatchCost, - &KeywordMatchForTest, &PrefixMatchForTest)); + m_ptrs[i] = &m_keywords[i]; + m_pMatcher.reset(new search::KeywordMatcher(m_ptrs.data(), int(m_ptrs.size()), &m_prefix)); } - vector m_keywords; - vector m_keywordPtrs; - scoped_ptr m_pMatcher; + scoped_ptr m_pMatcher; +private: + buffer_vector m_keywords; + buffer_vector m_ptrs; + strings::UniString m_prefix; }; } // unnamed namespace - -// TODO: KeywordMatcher tests. -/* -UNIT_TEST(KeywordMatcher_Smoke) +UNIT_TEST(KeywordMatcher_New) { - KeywordMatcherAdaptor matcherAdaptor("l", 3, 3, "minsk", "belarus"); - search::impl::KeywordMatcher & matcher = *matcherAdaptor.m_pMatcher; - TEST_EQUAL(matcher.GetPrefixMatchScore(), 4, ()); - TEST_EQUAL(vector(matcher.GetKeywordMatchScores(), - matcher.GetKeywordMatchScores() + 2), - Vec(4, 4), ()); - TEST_EQUAL(matcher.GetMatchScore(), 4 + 4 + 4, ()); - - matcher.ProcessName("belarrr"); - TEST_EQUAL(matcher.GetPrefixMatchScore(), 1, ()); - TEST_EQUAL(vector(matcher.GetKeywordMatchScores(), - matcher.GetKeywordMatchScores() + 2), - Vec(4, 2), ()); - TEST_EQUAL(matcher.GetMatchScore(), 1 + 4 + 2, ()); - - matcher.ProcessName("belaruu minnn"); - TEST_EQUAL(matcher.GetPrefixMatchScore(), 1, ()); - TEST_EQUAL(vector(matcher.GetKeywordMatchScores(), - matcher.GetKeywordMatchScores() + 2), - Vec(2, 1), ()); - TEST_EQUAL(matcher.GetMatchScore(), 1 + 2 + 1, ()); - - matcher.ProcessName("belaruu les minnn"); - TEST_EQUAL(matcher.GetPrefixMatchScore(), 0, ()); - TEST_EQUAL(vector(matcher.GetKeywordMatchScores(), - matcher.GetKeywordMatchScores() + 2), - Vec(2, 1), ()); - TEST_EQUAL(matcher.GetMatchScore(), 0 + 2 + 1, ()); + Matcher matcher("new "); + TEST_EQUAL(matcher.m_pMatcher->Score("new"), 0, ()); + TEST_EQUAL(matcher.m_pMatcher->Score("york"), MAX_SCORE, ()); + TEST_EQUAL(matcher.m_pMatcher->Score("new york"), 0, ()); } -UNIT_TEST(KeywordMatcher_NoPrefix) +UNIT_TEST(KeywordMatcher_York) { - KeywordMatcherAdaptor matcherAdaptor("", 3, 3, "minsk", "belarus"); - search::impl::KeywordMatcher & matcher = *matcherAdaptor.m_pMatcher; - TEST_EQUAL(matcher.GetPrefixMatchScore(), 4, ()); - TEST_EQUAL(matcher.GetMatchScore(), 4 + 4 + 4, ()); - - matcher.ProcessName("belaruu zzz minnn"); - TEST_EQUAL(matcher.GetPrefixMatchScore(), 0, ()); - TEST_EQUAL(vector(matcher.GetKeywordMatchScores(), - matcher.GetKeywordMatchScores() + 1), - Vec(2, 1), ()); - TEST_EQUAL(matcher.GetMatchScore(), 0 + 2 + 1, ()); + Matcher matcher("york "); + TEST_EQUAL(matcher.m_pMatcher->Score("new"), MAX_SCORE, ()); + TEST_EQUAL(matcher.m_pMatcher->Score("york"), 0, ()); + TEST_EQUAL(matcher.m_pMatcher->Score("new york"), 1, ()); } -UNIT_TEST(KeywordMatcher_Suomi) +UNIT_TEST(KeywordMatcher_NewYork) { - KeywordMatcherAdaptor matcherAdaptor("", 4, 4, "minsk"); - search::impl::KeywordMatcher & matcher = *matcherAdaptor.m_pMatcher; - matcher.ProcessName("Suomi"); - TEST_EQUAL(matcher.GetMatchScore(), 5, ()); + Matcher matcher1("new york "); + Matcher matcher2("new york"); + TEST_EQUAL(matcher1.m_pMatcher->Score("new"), MAX_SCORE, ()); + TEST_EQUAL(matcher2.m_pMatcher->Score("new"), MAX_SCORE, ()); + TEST_EQUAL(matcher1.m_pMatcher->Score("york"), MAX_SCORE, ()); + TEST_EQUAL(matcher2.m_pMatcher->Score("york"), MAX_SCORE, ()); + TEST_EQUAL(matcher1.m_pMatcher->Score("new york"), 0, ()); + TEST_EQUAL(matcher2.m_pMatcher->Score("new york"), 0, ()); } -*/ + +UNIT_TEST(KeywordMatcher_YorkNew) +{ + Matcher matcher("new york "); + TEST_EQUAL(matcher.m_pMatcher->Score("new"), MAX_SCORE, ()); + TEST_EQUAL(matcher.m_pMatcher->Score("york"), MAX_SCORE, ()); + TEST_EQUAL(matcher.m_pMatcher->Score("new york"), 0, ()); +} + +UNIT_TEST(KeywordMatcher_NewYo) +{ + Matcher matcher("new yo"); + TEST_EQUAL(matcher.m_pMatcher->Score("new"), MAX_SCORE, ()); + TEST_EQUAL(matcher.m_pMatcher->Score("york"), MAX_SCORE, ()); + TEST_EQUAL(matcher.m_pMatcher->Score("new york"), 0, ()); +} + + + + + + + + +