From fb7494df03673eab8e2f8df71dae6f170cd4137f Mon Sep 17 00:00:00 2001 From: vng Date: Tue, 14 Jan 2014 17:57:26 +0300 Subject: [PATCH] [search] Fixed bug with feature names scoring. --- search/keyword_lang_matcher.cpp | 11 +++++ search/keyword_lang_matcher.hpp | 5 ++- search/keyword_matcher.cpp | 42 +++++++++++++------- search/keyword_matcher.hpp | 1 + search/search_query.cpp | 4 +- search/search_tests/keyword_matcher_test.cpp | 19 +++++---- 6 files changed, 53 insertions(+), 29 deletions(-) diff --git a/search/keyword_lang_matcher.cpp b/search/keyword_lang_matcher.cpp index 11b674d0ad..0ac697c12b 100644 --- a/search/keyword_lang_matcher.cpp +++ b/search/keyword_lang_matcher.cpp @@ -11,6 +11,10 @@ namespace search { +KeywordLangMatcher::ScoreT::ScoreT() : m_langScore(numeric_limits<int>::min()) +{ +} + KeywordLangMatcher::ScoreT::ScoreT(KeywordMatcher::ScoreT const & score, int langScore) : m_parentScore(score), m_langScore(langScore) { @@ -71,4 +75,11 @@ KeywordLangMatcher::ScoreT KeywordLangMatcher::Score(int8_t lang, return ScoreT(m_keywordMatcher.Score(tokens, count), GetLangScore(lang)); } +string DebugPrint(KeywordLangMatcher::ScoreT const & score) +{ + ostringstream ss; + ss << "KLM::ScoreT(" << DebugPrint(score.m_parentScore) << ", LS=" << score.m_langScore << ")"; + return ss.str(); +} + } // namespace search diff --git a/search/keyword_lang_matcher.hpp b/search/keyword_lang_matcher.hpp index 3786dc107f..8848751be2 100644 --- a/search/keyword_lang_matcher.hpp +++ b/search/keyword_lang_matcher.hpp @@ -9,14 +9,15 @@ namespace search class KeywordLangMatcher { public: - class ScoreT { public: - ScoreT() {} + ScoreT(); bool operator < (ScoreT const & s) const; + private: friend class KeywordLangMatcher; + friend string DebugPrint(ScoreT const & score); ScoreT(KeywordMatcher::ScoreT const & score, int langScore); diff --git a/search/keyword_matcher.cpp b/search/keyword_matcher.cpp index 0f6502a748..72335aad38 100644 --- a/search/keyword_matcher.cpp +++ 
b/search/keyword_matcher.cpp @@ -38,16 +38,17 @@ KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const & name) const SplitUniString(name, MakeBackInsertFunctor(tokens), Delimiters()); // Some names can have too many tokens. Trim them. - return Score(tokens.data(), min(size_t(MAX_TOKENS), tokens.size())); + return Score(tokens.data(), tokens.size()); } KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t count) const { + count = min(count, size_t(MAX_TOKENS)); + vector<bool> isQueryTokenMatched(m_keywords.size()); vector<bool> isNameTokenMatched(count); - uint32_t numQueryTokensMatched = 0; uint32_t sumTokenMatchDistance = 0; - int32_t prevTokenMatchDistance = 0; + int8_t prevTokenMatchDistance = 0; bool bPrefixMatched = true; for (int i = 0; i < m_keywords.size(); ++i) @@ -55,7 +56,7 @@ KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t coun if (!isNameTokenMatched[j] && m_keywords[i] == tokens[j]) { isQueryTokenMatched[i] = isNameTokenMatched[j] = true; - int32_t const tokenMatchDistance = i - j; + int8_t const tokenMatchDistance = i - j; sumTokenMatchDistance += abs(tokenMatchDistance - prevTokenMatchDistance); prevTokenMatchDistance = tokenMatchDistance; } @@ -68,31 +69,37 @@ KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t coun StartsWith(tokens[j].begin(), tokens[j].end(), m_prefix.begin(), m_prefix.end())) { isNameTokenMatched[j] = bPrefixMatched = true; - int32_t const tokenMatchDistance = int(m_keywords.size()) - j; + int8_t const tokenMatchDistance = int(m_keywords.size()) - j; sumTokenMatchDistance += abs(tokenMatchDistance - prevTokenMatchDistance); } } + uint8_t numQueryTokensMatched = 0; for (size_t i = 0; i < isQueryTokenMatched.size(); ++i) if (isQueryTokenMatched[i]) ++numQueryTokensMatched; - ScoreT score = ScoreT(); + ScoreT score; score.m_bFullQueryMatched = bPrefixMatched && (numQueryTokensMatched == isQueryTokenMatched.size()); score.m_bPrefixMatched = bPrefixMatched; 
score.m_numQueryTokensAndPrefixMatched = numQueryTokensMatched + (bPrefixMatched ? 1 : 0); - score.m_nameTokensMatched = 0xFFFFFFFF; - for (uint32_t i = 0; i < min(size_t(32), isNameTokenMatched.size()); ++i) - if (!isNameTokenMatched[i]) - score.m_nameTokensMatched &= ~(1 << (31 - i)); - score.m_sumTokenMatchDistance = sumTokenMatchDistance; + score.m_nameTokensMatched = 0; + score.m_nameTokensLength = 0; + for (size_t i = 0; i < count; ++i) + { + if (isNameTokenMatched[i]) + score.m_nameTokensMatched |= (1 << (MAX_TOKENS-1 - i)); + score.m_nameTokensLength += tokens[i].size(); + } + + score.m_sumTokenMatchDistance = sumTokenMatchDistance; return score; } KeywordMatcher::ScoreT::ScoreT() - : m_sumTokenMatchDistance(0), m_nameTokensMatched(0), m_numQueryTokensAndPrefixMatched(0), - m_bFullQueryMatched(false), m_bPrefixMatched(false) + : m_sumTokenMatchDistance(0), m_nameTokensMatched(0), m_nameTokensLength(0), + m_numQueryTokensAndPrefixMatched(0), m_bFullQueryMatched(false), m_bPrefixMatched(false) { } @@ -108,7 +115,11 @@ bool KeywordMatcher::ScoreT::operator < (KeywordMatcher::ScoreT const & s) const return m_nameTokensMatched < s.m_nameTokensMatched; if (m_sumTokenMatchDistance != s.m_sumTokenMatchDistance) return m_sumTokenMatchDistance > s.m_sumTokenMatchDistance; - return false; + + if (m_bFullQueryMatched) + return m_nameTokensLength > s.m_nameTokensLength; + else + return false; } string DebugPrint(KeywordMatcher::ScoreT const & score) @@ -119,7 +130,8 @@ string DebugPrint(KeywordMatcher::ScoreT const & score) out << ",nQTM=" << static_cast<int>(score.m_numQueryTokensAndPrefixMatched); out << ",PM=" << score.m_bPrefixMatched; out << ",NTM="; - for (int i = 31; i >= 0; --i) out << ((score.m_nameTokensMatched >> i) & 1); + for (int i = MAX_TOKENS-1; i >= 0; --i) + out << ((score.m_nameTokensMatched >> i) & 1); out << ",STMD=" << score.m_sumTokenMatchDistance; out << ")"; return out.str(); diff --git a/search/keyword_matcher.hpp b/search/keyword_matcher.hpp index 
20a8e593b7..28b7d36e12 100644 --- a/search/keyword_matcher.hpp +++ b/search/keyword_matcher.hpp @@ -28,6 +28,7 @@ public: uint32_t m_sumTokenMatchDistance; uint32_t m_nameTokensMatched; + uint32_t m_nameTokensLength; uint8_t m_numQueryTokensAndPrefixMatched; bool m_bFullQueryMatched : 1; bool m_bPrefixMatched : 1; diff --git a/search/search_query.cpp b/search/search_query.cpp index 8986955c0b..e91acfd097 100644 --- a/search/search_query.cpp +++ b/search/search_query.cpp @@ -95,8 +95,8 @@ Query::Query(Index const * pIndex, // Initialize keywords scorer. // Note! This order should match the indexes arrays above. vector<vector<int8_t> > langPriorities(4); - langPriorities[0].push_back(0); // future current lang - langPriorities[1].push_back(0); // future input lang + langPriorities[0].push_back(-1); // future current lang + langPriorities[1].push_back(-1); // future input lang langPriorities[2].push_back(StringUtf8Multilang::GetLangIndex("int_name")); langPriorities[2].push_back(StringUtf8Multilang::GetLangIndex("en")); langPriorities[3].push_back(StringUtf8Multilang::GetLangIndex("default")); diff --git a/search/search_tests/keyword_matcher_test.cpp b/search/search_tests/keyword_matcher_test.cpp index 41d1c05fea..7bbb0691a9 100644 --- a/search/search_tests/keyword_matcher_test.cpp +++ b/search/search_tests/keyword_matcher_test.cpp @@ -117,11 +117,11 @@ UNIT_TEST(KeywordMatcher_Prefix) {MATCHES, STRONGLY_BETTER, "new york gym"}, {MATCHES, BETTER_OR_EQUAL, "new new york"}, - {MATCHES, STRONGLY_BETTER, "new york"}, {MATCHES, STRONGLY_BETTER, "newark"}, - {MATCHES, BETTER_OR_EQUAL, "new"}, + + {MATCHES, STRONGLY_BETTER, "new"}, }; TestKeywordMatcher(query, testCases); } @@ -135,8 +135,7 @@ UNIT_TEST(KeywordMatcher_Keyword) {NOMATCH, DOES_NOT_MATTER, "zzz"}, {NOMATCH, DOES_NOT_MATTER, "ne"}, {NOMATCH, DOES_NOT_MATTER, "the netherlands"}, - - {NOMATCH, STRONGLY_BETTER, "newark"}, + {NOMATCH, DOES_NOT_MATTER, "newark"}, {MATCHES, STRONGLY_BETTER, "york new"}, @@ -174,14 +173,15 @@ 
UNIT_TEST(KeywordMatcher_KeywordAndPrefix) {MATCHES, STRONGLY_BETTER, "the new york"}, {MATCHES, BETTER_OR_EQUAL, "york new the"}, + {MATCHES, BETTER_OR_EQUAL, "york new"}, + + {MATCHES, STRONGLY_BETTER, "yo new"}, {MATCHES, STRONGLY_BETTER, "new york pizza"}, - {MATCHES, STRONGLY_BETTER, "york new"}, - {MATCHES, BETTER_OR_EQUAL, "yo new"}, - {MATCHES, STRONGLY_BETTER, "new york"}, - {MATCHES, BETTER_OR_EQUAL, "new yo"}, + + {MATCHES, STRONGLY_BETTER, "new yo"}, }; TestKeywordMatcher(query, testCases); } @@ -203,10 +203,9 @@ UNIT_TEST(KeywordMatcher_KeywordAndKeyword) {MATCHES, STRONGLY_BETTER, "the new york"}, {MATCHES, BETTER_OR_EQUAL, "york new the"}, - {MATCHES, STRONGLY_BETTER, "new york pizza"}, - {MATCHES, STRONGLY_BETTER, "york new"}, + {MATCHES, STRONGLY_BETTER, "new york pizza"}, {MATCHES, STRONGLY_BETTER, "new york"}, }; TestKeywordMatcher(query, testCases);