[search] Fixed a bug with feature name scoring.

Authored by vng on 2014-01-14 17:57:26 +03:00; committed by Alex Zolotarev
parent 419c12b2b7
commit fb7494df03
6 changed files with 53 additions and 29 deletions
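
The gist of the change: KeywordMatcher::ScoreT gains an m_nameTokensLength field (the summed length of a name's tokens), and operator< uses it as a final tie-break once the full query has matched, so an exact name now outscores a longer name with extra tokens. In addition, the trim to MAX_TOKENS moves from the string overload of Score() into the (tokens, count) overload, and the matched-name-token bitmask is packed relative to MAX_TOKENS rather than a hard-coded 32 bits. Below is a simplified, self-contained sketch of the comparison tail as it reads after the patch (field names mirror the diff; the comparisons that precede this point in the real operator< are elided), not the project code itself.

#include <cstdint>

using std::uint32_t;

// Sketch of KeywordMatcher::ScoreT's comparison tail after this patch.
// "Less than" reads as "worse score"; the earlier comparisons are elided.
struct ScoreSketch
{
  bool m_bFullQueryMatched = false;
  uint32_t m_nameTokensMatched = 0;      // bitmask of matched name tokens, MSB-first
  uint32_t m_sumTokenMatchDistance = 0;  // smaller is better
  uint32_t m_nameTokensLength = 0;       // total length of the name's tokens

  bool operator<(ScoreSketch const & s) const
  {
    // ... comparisons on the fields elided above come first in the real code ...
    if (m_nameTokensMatched != s.m_nameTokensMatched)
      return m_nameTokensMatched < s.m_nameTokensMatched;
    if (m_sumTokenMatchDistance != s.m_sumTokenMatchDistance)
      return m_sumTokenMatchDistance > s.m_sumTokenMatchDistance;
    if (m_bFullQueryMatched)
      return m_nameTokensLength > s.m_nameTokensLength;  // new: shorter name wins the tie
    return false;
  }
};

For instance, for the query "new york" the names "new york" and "new york pizza" tie on every field compared before the new branch, and the shorter total token length (7 vs 12) now decides in favour of the exact name.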

View file

@@ -11,6 +11,10 @@
namespace search
{
KeywordLangMatcher::ScoreT::ScoreT() : m_langScore(numeric_limits<int>::min())
{
}
KeywordLangMatcher::ScoreT::ScoreT(KeywordMatcher::ScoreT const & score, int langScore)
: m_parentScore(score), m_langScore(langScore)
{
@@ -71,4 +75,11 @@ KeywordLangMatcher::ScoreT KeywordLangMatcher::Score(int8_t lang,
return ScoreT(m_keywordMatcher.Score(tokens, count), GetLangScore(lang));
}
string DebugPrint(KeywordLangMatcher::ScoreT const & score)
{
ostringstream ss;
ss << "KLM::ScoreT(" << DebugPrint(score.m_parentScore) << ", LS=" << score.m_langScore << ")";
return ss.str();
}
} // namespace search
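
Worth noting in this file: KeywordLangMatcher::ScoreT now has a real default constructor that seeds m_langScore with numeric_limits<int>::min(), so a default-constructed score behaves as a "worst possible" sentinel, and a DebugPrint overload is added for logging. A self-contained sketch of the sentinel idea, using simplified stand-in types rather than the project classes:

#include <algorithm>
#include <limits>
#include <vector>

// Simplified stand-in for the language score: a default-constructed value
// loses every comparison, so it is a safe starting point when picking the
// best score over several names.
struct LangScoreSketch
{
  int m_langScore = std::numeric_limits<int>::min();
  bool operator<(LangScoreSketch const & rhs) const { return m_langScore < rhs.m_langScore; }
};

LangScoreSketch BestOf(std::vector<LangScoreSketch> const & scores)
{
  LangScoreSketch best;  // worst possible value; any real score beats it
  for (LangScoreSketch const & s : scores)
    best = std::max(best, s);
  return best;
}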

View file

@@ -9,14 +9,15 @@ namespace search
class KeywordLangMatcher
{
public:
class ScoreT
{
public:
ScoreT() {}
ScoreT();
bool operator < (ScoreT const & s) const;
private:
friend class KeywordLangMatcher;
friend string DebugPrint(ScoreT const & score);
ScoreT(KeywordMatcher::ScoreT const & score, int langScore);

View file

@@ -38,16 +38,17 @@ KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const & name) const
SplitUniString(name, MakeBackInsertFunctor(tokens), Delimiters());
// Some names can have too many tokens. Trim them.
return Score(tokens.data(), min(size_t(MAX_TOKENS), tokens.size()));
return Score(tokens.data(), tokens.size());
}
KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t count) const
{
count = min(count, size_t(MAX_TOKENS));
vector<bool> isQueryTokenMatched(m_keywords.size());
vector<bool> isNameTokenMatched(count);
uint32_t numQueryTokensMatched = 0;
uint32_t sumTokenMatchDistance = 0;
int32_t prevTokenMatchDistance = 0;
int8_t prevTokenMatchDistance = 0;
bool bPrefixMatched = true;
for (int i = 0; i < m_keywords.size(); ++i)
@@ -55,7 +56,7 @@ KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t coun
if (!isNameTokenMatched[j] && m_keywords[i] == tokens[j])
{
isQueryTokenMatched[i] = isNameTokenMatched[j] = true;
int32_t const tokenMatchDistance = i - j;
int8_t const tokenMatchDistance = i - j;
sumTokenMatchDistance += abs(tokenMatchDistance - prevTokenMatchDistance);
prevTokenMatchDistance = tokenMatchDistance;
}
@@ -68,31 +69,37 @@ KeywordMatcher::ScoreT KeywordMatcher::Score(StringT const * tokens, size_t coun
StartsWith(tokens[j].begin(), tokens[j].end(), m_prefix.begin(), m_prefix.end()))
{
isNameTokenMatched[j] = bPrefixMatched = true;
int32_t const tokenMatchDistance = int(m_keywords.size()) - j;
int8_t const tokenMatchDistance = int(m_keywords.size()) - j;
sumTokenMatchDistance += abs(tokenMatchDistance - prevTokenMatchDistance);
}
}
uint8_t numQueryTokensMatched = 0;
for (size_t i = 0; i < isQueryTokenMatched.size(); ++i)
if (isQueryTokenMatched[i])
++numQueryTokensMatched;
ScoreT score = ScoreT();
ScoreT score;
score.m_bFullQueryMatched = bPrefixMatched && (numQueryTokensMatched == isQueryTokenMatched.size());
score.m_bPrefixMatched = bPrefixMatched;
score.m_numQueryTokensAndPrefixMatched = numQueryTokensMatched + (bPrefixMatched ? 1 : 0);
score.m_nameTokensMatched = 0xFFFFFFFF;
for (uint32_t i = 0; i < min(size_t(32), isNameTokenMatched.size()); ++i)
if (!isNameTokenMatched[i])
score.m_nameTokensMatched &= ~(1 << (31 - i));
score.m_sumTokenMatchDistance = sumTokenMatchDistance;
score.m_nameTokensMatched = 0;
score.m_nameTokensLength = 0;
for (size_t i = 0; i < count; ++i)
{
if (isNameTokenMatched[i])
score.m_nameTokensMatched |= (1 << (MAX_TOKENS-1 - i));
score.m_nameTokensLength += tokens[i].size();
}
score.m_sumTokenMatchDistance = sumTokenMatchDistance;
return score;
}
KeywordMatcher::ScoreT::ScoreT()
: m_sumTokenMatchDistance(0), m_nameTokensMatched(0), m_numQueryTokensAndPrefixMatched(0),
m_bFullQueryMatched(false), m_bPrefixMatched(false)
: m_sumTokenMatchDistance(0), m_nameTokensMatched(0), m_nameTokensLength(0),
m_numQueryTokensAndPrefixMatched(0), m_bFullQueryMatched(false), m_bPrefixMatched(false)
{
}
@@ -108,7 +115,11 @@ bool KeywordMatcher::ScoreT::operator < (KeywordMatcher::ScoreT const & s) const
return m_nameTokensMatched < s.m_nameTokensMatched;
if (m_sumTokenMatchDistance != s.m_sumTokenMatchDistance)
return m_sumTokenMatchDistance > s.m_sumTokenMatchDistance;
return false;
if (m_bFullQueryMatched)
return m_nameTokensLength > s.m_nameTokensLength;
else
return false;
}
string DebugPrint(KeywordMatcher::ScoreT const & score)
@@ -119,7 +130,8 @@ string DebugPrint(KeywordMatcher::ScoreT const & score)
out << ",nQTM=" << static_cast<int>(score.m_numQueryTokensAndPrefixMatched);
out << ",PM=" << score.m_bPrefixMatched;
out << ",NTM=";
for (int i = 31; i >= 0; --i) out << ((score.m_nameTokensMatched >> i) & 1);
for (int i = MAX_TOKENS-1; i >= 0; --i)
out << ((score.m_nameTokensMatched >> i) & 1);
out << ",STMD=" << score.m_sumTokenMatchDistance;
out << ")";
return out.str();
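
Two details of the rewritten Score() above are easy to miss in the flattened diff: the clamp to MAX_TOKENS now happens inside the (tokens, count) overload itself rather than only at the string-overload call site, and m_nameTokensMatched now starts at zero and has bits set MSB-first within MAX_TOKENS positions for matched name tokens (previously it started at 0xFFFFFFFF and had bits cleared within a hard-coded 32). As the diff reads, m_nameTokensLength accumulates the length of every (trimmed) name token, matched or not. A self-contained sketch of that bookkeeping, with kMaxTokens standing in for the project's MAX_TOKENS constant (assumed to be at most 32, since the mask is a uint32_t):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

using std::size_t;
using std::uint32_t;

size_t constexpr kMaxTokens = 32;  // stand-in for MAX_TOKENS

struct NameTokensSketch
{
  uint32_t m_matchedMask = 0;   // bit (kMaxTokens - 1 - i) is set iff name token i matched
  uint32_t m_totalLength = 0;   // summed length of the (trimmed) name tokens
};

NameTokensSketch PackNameTokens(std::vector<std::string> const & tokens,
                                std::vector<bool> const & isMatched)
{
  // The patched Score() clamps the count itself, so oversized names are handled here as well.
  size_t const count = std::min(tokens.size(), kMaxTokens);

  NameTokensSketch result;
  for (size_t i = 0; i < count; ++i)
  {
    if (isMatched[i])
      result.m_matchedMask |= (1u << (kMaxTokens - 1 - i));
    result.m_totalLength += tokens[i].size();  // accumulated for every token, matched or not
  }
  return result;
}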

View file

@@ -28,6 +28,7 @@ public:
uint32_t m_sumTokenMatchDistance;
uint32_t m_nameTokensMatched;
uint32_t m_nameTokensLength;
uint8_t m_numQueryTokensAndPrefixMatched;
bool m_bFullQueryMatched : 1;
bool m_bPrefixMatched : 1;

View file

@@ -95,8 +95,8 @@ Query::Query(Index const * pIndex,
// Initialize keywords scorer.
// Note! This order should match the indexes arrays above.
vector<vector<int8_t> > langPriorities(4);
langPriorities[0].push_back(0); // future current lang
langPriorities[1].push_back(0); // future input lang
langPriorities[0].push_back(-1); // future current lang
langPriorities[1].push_back(-1); // future input lang
langPriorities[2].push_back(StringUtf8Multilang::GetLangIndex("int_name"));
langPriorities[2].push_back(StringUtf8Multilang::GetLangIndex("en"));
langPriorities[3].push_back(StringUtf8Multilang::GetLangIndex("default"));
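
The placeholders for the not-yet-known current and input languages change from 0 to -1. The likely reason, which is an inference rather than something stated in the diff: 0 is itself a valid StringUtf8Multilang index, so a zero placeholder would give those priority buckets a real language before the actual languages are filled in, whereas -1 cannot match any language index. A small sketch of that distinction (kUnsetLang and BucketContainsLang are made-up names for illustration):

#include <cstdint>
#include <vector>

using std::int8_t;

int8_t constexpr kUnsetLang = -1;  // made-up name; the patch just writes the literal -1

// Assuming the matcher looks a language up in a priority bucket by index
// equality, a negative placeholder can never collide with a real index,
// while the old 0 placeholder is itself a valid language index.
bool BucketContainsLang(std::vector<int8_t> const & bucket, int8_t lang)
{
  for (int8_t const l : bucket)
    if (l == lang)
      return true;
  return false;
}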

View file

@@ -117,11 +117,11 @@ UNIT_TEST(KeywordMatcher_Prefix)
{MATCHES, STRONGLY_BETTER, "new york gym"},
{MATCHES, BETTER_OR_EQUAL, "new new york"},
{MATCHES, STRONGLY_BETTER, "new york"},
{MATCHES, STRONGLY_BETTER, "newark"},
{MATCHES, BETTER_OR_EQUAL, "new"},
{MATCHES, STRONGLY_BETTER, "new"},
};
TestKeywordMatcher(query, testCases);
}
@@ -135,8 +135,7 @@ UNIT_TEST(KeywordMatcher_Keyword)
{NOMATCH, DOES_NOT_MATTER, "zzz"},
{NOMATCH, DOES_NOT_MATTER, "ne"},
{NOMATCH, DOES_NOT_MATTER, "the netherlands"},
{NOMATCH, STRONGLY_BETTER, "newark"},
{NOMATCH, DOES_NOT_MATTER, "newark"},
{MATCHES, STRONGLY_BETTER, "york new"},
@@ -174,14 +173,15 @@ UNIT_TEST(KeywordMatcher_KeywordAndPrefix)
{MATCHES, STRONGLY_BETTER, "the new york"},
{MATCHES, BETTER_OR_EQUAL, "york new the"},
{MATCHES, BETTER_OR_EQUAL, "york new"},
{MATCHES, STRONGLY_BETTER, "yo new"},
{MATCHES, STRONGLY_BETTER, "new york pizza"},
{MATCHES, STRONGLY_BETTER, "york new"},
{MATCHES, BETTER_OR_EQUAL, "yo new"},
{MATCHES, STRONGLY_BETTER, "new york"},
{MATCHES, BETTER_OR_EQUAL, "new yo"},
{MATCHES, STRONGLY_BETTER, "new yo"},
};
TestKeywordMatcher(query, testCases);
}
@@ -203,10 +203,9 @@ UNIT_TEST(KeywordMatcher_KeywordAndKeyword)
{MATCHES, STRONGLY_BETTER, "the new york"},
{MATCHES, BETTER_OR_EQUAL, "york new the"},
{MATCHES, STRONGLY_BETTER, "new york pizza"},
{MATCHES, STRONGLY_BETTER, "york new"},
{MATCHES, STRONGLY_BETTER, "new york pizza"},
{MATCHES, STRONGLY_BETTER, "new york"},
};
TestKeywordMatcher(query, testCases);
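
The test updates are consistent with the new tie-break: some expectations move from BETTER_OR_EQUAL to STRONGLY_BETTER because a name equal to the query is now strictly better than a longer name that also matches all query tokens. A toy check of the quantity being compared (this is not the project's TestKeywordMatcher harness):

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

using std::uint32_t;

// Summed token length, computed the same way the patched Score() accumulates
// m_nameTokensLength.
uint32_t SumTokenLength(std::vector<std::string> const & tokens)
{
  uint32_t length = 0;
  for (std::string const & t : tokens)
    length += t.size();
  return length;
}

int main()
{
  uint32_t const exactName = SumTokenLength({"new", "york"});               // 7
  uint32_t const withExtraToken = SumTokenLength({"new", "york", "pizza"}); // 12
  // Both names fully match the query "new york"; the new tie-break prefers
  // the smaller total length, so the exact name is strictly better.
  assert(exactName < withExtraToken);
  return 0;
}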