[search] New, simpler KeywordMatcher.

This commit is contained in:
Yury Melnichek 2011-09-03 16:16:53 +02:00 committed by Alex Zolotarev
parent 8faa4f71bc
commit 005e35bad3
4 changed files with 179 additions and 235 deletions

View file

@ -1,93 +1,75 @@
#include "keyword_matcher.hpp"
#include "../indexer/search_delimiters.hpp"
#include "../indexer/search_string_utils.hpp"
#include "../base/logging.hpp"
#include "../base/stl_add.hpp"
#include "../base/string_utils.hpp"
#include "../std/bind.hpp"
#include "../std/numeric.hpp"
#include "../std/algorithm.hpp"
namespace search
search::KeywordMatcher::KeywordMatcher(strings::UniString const * const * pKeywords,
int keywordCount,
strings::UniString const * pPrefix)
: m_pKeywords(pKeywords), m_keywordCount(keywordCount), m_pPrefix(pPrefix)
{
namespace impl
{
KeywordMatcher::KeywordMatcher(strings::UniString const * const * pKeywords,
size_t keywordsCount,
strings::UniString const & prefix,
uint32_t maxKeywordMatchCost, uint32_t maxPrefixMatchCost,
StringMatchFn keywordMatchFn, StringMatchFn prefixMatchFn)
: m_pKeywords(pKeywords), m_prefix(prefix),
m_maxKeywordMatchCost(maxKeywordMatchCost),
m_maxPrefixMatchCost(maxPrefixMatchCost),
m_keywordMatchFn(keywordMatchFn),
m_prefixMatchFn(prefixMatchFn),
m_minKeywordMatchCost(keywordsCount, m_maxKeywordMatchCost + 1),
m_minPrefixMatchCost(m_maxPrefixMatchCost + 1),
m_bestMatchNamePenalty(static_cast<uint32_t>(-1))
{
#ifdef DEBUG
for (size_t i = 0; i < keywordsCount; ++i)
ASSERT(!m_pKeywords[i]->empty(), (i));
#endif
ASSERT_LESS(m_keywordCount, int(MAX_TOKENS), ());
m_keywordCount = min(m_keywordCount, int(MAX_TOKENS));
if (m_pPrefix && m_pPrefix->empty())
m_pPrefix = NULL;
}
void KeywordMatcher::ProcessName(string const & name)
uint32_t search::KeywordMatcher::Score(string const & name) const
{
SplitUniString(NormalizeAndSimplifyString(name),
bind(&KeywordMatcher::ProcessNameToken, this, cref(name), _1),
Delimiters());
return Score(NormalizeAndSimplifyString(name));
}
void KeywordMatcher::ProcessNameToken(string const & name, strings::UniString const & s)
uint32_t search::KeywordMatcher::Score(strings::UniString const & name) const
{
uint32_t matchPenalty = 0;
for (size_t i = 0; i < m_minKeywordMatchCost.size(); ++i)
{
strings::UniString const & keyword = *(m_pKeywords[i]);
uint32_t const matchCost = m_keywordMatchFn(&keyword[0], keyword.size(),
&s[0], s.size(), m_minKeywordMatchCost[i]);
matchPenalty += matchCost;
if (matchCost <= m_maxKeywordMatchCost)
{
if (matchCost < m_minKeywordMatchCost[i])
{
// LOG(LDEBUG, (matchCost, name));
m_minKeywordMatchCost[i] = matchCost;
}
}
}
bool bPrefixMatch = false;
if (!m_prefix.empty())
{
uint32_t const matchCost = m_prefixMatchFn(&m_prefix[0], m_prefix.size(),
&s[0], s.size(), m_minPrefixMatchCost);
matchPenalty += matchCost;
if (matchCost <= m_maxPrefixMatchCost)
{
bPrefixMatch = true;
if (matchCost < m_minPrefixMatchCost)
m_minPrefixMatchCost = matchCost;
}
}
else
{
bPrefixMatch = true;
m_minPrefixMatchCost = 0;
}
if (bPrefixMatch && matchPenalty < m_bestMatchNamePenalty)
{
m_bestMatchName = name;
m_bestMatchNamePenalty = matchPenalty;
}
buffer_vector<strings::UniString, MAX_TOKENS> tokens;
SplitUniString(name, MakeBackInsertFunctor(tokens), Delimiters());
ASSERT_LESS(tokens.size(), size_t(MAX_TOKENS), ());
return Score(tokens.data(), static_cast<int>(tokens.size()));
}
uint32_t KeywordMatcher::GetMatchScore() const
uint32_t search::KeywordMatcher::Score(strings::UniString const * tokens, int tokenCount) const
{
return accumulate(m_minKeywordMatchCost.begin(), m_minKeywordMatchCost.end(),
m_minPrefixMatchCost);
}
ASSERT_LESS(tokenCount, int(MAX_TOKENS), ());
} // namespace search::impl
} // namespace search
// We will use this for scoring.
unsigned char isTokenMatched[MAX_TOKENS] = { 0 };
// Check that all keywords matched.
for (int k = 0; k < m_keywordCount; ++k)
{
unsigned char isKeywordMatched = 0;
for (int t = 0; t < tokenCount; ++t)
if (*m_pKeywords[k] == tokens[t])
isKeywordMatched = isTokenMatched[t] = 1;
// All keywords should be matched.
if (!isKeywordMatched)
return MAX_SCORE;
}
// Check that prefix matched.
if (m_pPrefix)
{
bool bPrefixMatched = false;
for (int t = 0; t < tokenCount && !bPrefixMatched; ++t)
if (StartsWith(tokens[t].begin(), tokens[t].end(),
m_pPrefix->begin(), m_pPrefix->end()))
bPrefixMatched = true;
if (!bPrefixMatched)
return MAX_SCORE;
}
// Calculate score.
int lastTokenMatched = 0;
for (int t = 0; t < tokenCount; ++t)
if (isTokenMatched[t])
lastTokenMatched = t;
uint32_t score = 0;
for (int t = 0; t <= lastTokenMatched; ++t)
if (tokens[t].size() > 2 && !isTokenMatched[t])
++score;
return score;
}

View file

@ -1,59 +1,31 @@
#pragma once
#include "../base/base.hpp"
#include "search_common.hpp"
#include "../base/assert.hpp"
#include "../base/buffer_vector.hpp"
#include "../base/string_utils.hpp"
#include "../std/string.hpp"
namespace search
{
namespace impl
{
typedef uint32_t (* StringMatchFn)(strings::UniChar const * sA, uint32_t sizeA,
strings::UniChar const * sB, uint32_t sizeB,
uint32_t maxCost);
// Matches keywords against given names.
class KeywordMatcher
{
strings::UniString const * const * m_pKeywords;
strings::UniString const & m_prefix;
uint32_t m_maxKeywordMatchCost, m_maxPrefixMatchCost;
StringMatchFn m_keywordMatchFn, m_prefixMatchFn;
buffer_vector<uint32_t, 8> m_minKeywordMatchCost;
uint32_t m_minPrefixMatchCost;
string m_bestMatchName;
uint32_t m_bestMatchNamePenalty;
public:
KeywordMatcher(strings::UniString const * const * pKeywords,
size_t keywordsCount,
strings::UniString const & prefix,
uint32_t maxKeywordMatchCost, uint32_t maxPrefixMatchCost,
StringMatchFn keywordMatchFn, StringMatchFn prefixMatchFn);
enum { MAX_SCORE = MAX_TOKENS };
void ProcessName(string const & name);
void ProcessNameToken(string const & name, strings::UniString const & token);
KeywordMatcher(strings::UniString const * const * pKeywords, int keywordCount,
strings::UniString const * pPrefix);
// Useful for FeatureType.ForEachName(), calls ProcessName() and always returns true.
bool operator () (int /*lang*/, string const & name)
{
ProcessName(name);
return true;
}
// Get total feature match score.
uint32_t GetMatchScore() const;
// Returns penalty (which is less than MAX_SCORE) if name matched, or MAX_SCORE otherwise.
uint32_t Score(string const & name) const;
uint32_t Score(strings::UniString const & name) const;
uint32_t Score(strings::UniString const * tokens, int tokenCount) const;
// Get prefix match score.
uint32_t GetPrefixMatchScore() const { return m_minPrefixMatchCost; }
// Get match score for each keyword.
uint32_t const * GetKeywordMatchScores() const { return &m_minKeywordMatchCost[0]; }
string GetBestMatchName() const { return m_bestMatchName; }
private:
strings::UniString const * const * m_pKeywords;
int m_keywordCount;
strings::UniString const * m_pPrefix;
};
} // namespace search::impl
} // namespace search

View file

@ -59,15 +59,26 @@ inline uint32_t GetMaxPrefixMatchScore(int size)
return 512;
}
template <typename UniStringPtrVectorT>
inline KeywordMatcher MakeMatcher(UniStringPtrVectorT const & tokens,
strings::UniString const & prefix)
struct FeatureMatcher
{
return KeywordMatcher(tokens.empty() ? NULL : &tokens[0], tokens.size(),
prefix,
GetMaxKeywordMatchScore(), GetMaxPrefixMatchScore(prefix.size()),
&KeywordMatch, &PrefixMatch);
}
KeywordMatcher & m_keywordMatcher;
uint32_t m_minScore;
string m_bestName;
explicit FeatureMatcher(KeywordMatcher & keywordMatcher)
: m_keywordMatcher(keywordMatcher), m_minScore(keywordMatcher.MAX_SCORE) {}
// Scores one feature name and remembers it if it beats the best score seen
// so far. A strictly lower score wins, so on ties the earlier name is kept.
// Always returns true (presumably so the name iteration keeps going —
// TODO confirm against ForEachNameRef).
bool operator () (int /*lang*/, string const & name)
{
  uint32_t const candidateScore = m_keywordMatcher.Score(name);
  bool const isNewBest = candidateScore < m_minScore;
  if (isNewBest)
  {
    m_bestName = name;
    m_minScore = candidateScore;
  }
  return true;
}
};
struct FeatureProcessor
{
@ -99,23 +110,20 @@ struct FeatureProcessor
if (!(keywordsSkipMask & (1 << i)))
keywords.push_back(&queryKeywords[i]);
KeywordMatcher matcher(MakeMatcher(keywords, m_query.GetPrefix()));
KeywordMatcher keywordMatcher(keywords.data(), int(keywords.size()), &m_query.GetPrefix());
FeatureMatcher matcher(keywordMatcher);
feature.ForEachNameRef(matcher);
if (matcher.GetPrefixMatchScore() <= GetMaxPrefixMatchScore(m_query.GetPrefix().size()))
if (matcher.m_minScore < KeywordMatcher::MAX_SCORE)
{
uint32_t const matchScore = matcher.GetMatchScore();
if (matchScore <= GetMaxKeywordMatchScore())
{
pair<int, int> const scaleRange = feature::DrawableScaleRangeForText(feature);
if (scaleRange.first < 0)
return;
m_query.AddResult(IntermediateResult(m_query.GetViewport(),
feature,
matcher.GetBestMatchName(),
matchScore,
scaleRange.first));
}
pair<int, int> const scaleRange = feature::DrawableScaleRangeForText(feature);
// TODO: Why can scaleRange.first be < 0?
if (scaleRange.first < 0)
return;
m_query.AddResult(IntermediateResult(m_query.GetViewport(),
feature,
matcher.m_bestName,
matcher.m_minScore,
scaleRange.first));
}
}
};
@ -211,15 +219,17 @@ void Query::Search(function<void (Result const &)> const & f)
// TODO: Prefer user languages here.
if (m_prefix.size() >= iName->m_prefixLengthToSuggest)
{
KeywordMatcher matcher = MakeMatcher(vector<strings::UniString const *>(), m_prefix);
matcher.ProcessNameToken(string(), NormalizeAndSimplifyString(iName->m_name));
ASSERT_LESS(iName->m_prefixLengthToSuggest, 1 << PREFIX_LEN_BITS, ());
int const penalty =
(matcher.GetPrefixMatchScore() << PREFIX_LEN_BITS) + iName->m_prefixLengthToSuggest;
if (penalty < bestPrefixMatchPenalty)
KeywordMatcher matcher(NULL, 0, &m_prefix);
int const score = matcher.Score(iName->m_name);
if (score < KeywordMatcher::MAX_SCORE)
{
bestPrefixMatchPenalty = penalty;
bestPrefixMatch = iName->m_name;
ASSERT_LESS(iName->m_prefixLengthToSuggest, 1 << PREFIX_LEN_BITS, ());
int const penalty = (score << PREFIX_LEN_BITS) + iName->m_prefixLengthToSuggest;
if (penalty < bestPrefixMatchPenalty)
{
bestPrefixMatchPenalty = penalty;
bestPrefixMatch = iName->m_name;
}
}
}
}

View file

@ -1,110 +1,90 @@
#include "../../testing/testing.hpp"
#include "../keyword_matcher.hpp"
#include "match_cost_mock.hpp"
#include "../approximate_string_match.hpp"
#include "../../indexer/search_string_utils.hpp"
#include "../../testing/testing_utils.hpp"
#include "../../base/string_utils.hpp"
#include "../../indexer/search_delimiters.hpp"
#include "../../base/buffer_vector.hpp"
#include "../../base/stl_add.hpp"
#include "../../std/scoped_ptr.hpp"
#include "../../std/vector.hpp"
namespace
{
static const uint32_t MAX_SCORE = search::KeywordMatcher::MAX_SCORE;
uint32_t KeywordMatchForTest(strings::UniChar const * sA, uint32_t sizeA,
strings::UniChar const * sB, uint32_t sizeB,
uint32_t maxCost)
class Matcher
{
return StringMatchCost(sA, sizeA, sB, sizeB, search::MatchCostMock<strings::UniChar>(),
maxCost, false);
}
uint32_t PrefixMatchForTest(strings::UniChar const * sA, uint32_t sizeA,
strings::UniChar const * sB, uint32_t sizeB,
uint32_t maxCost)
{
return StringMatchCost(sA, sizeA, sB, sizeB, search::MatchCostMock<strings::UniChar>(),
maxCost, true);
}
struct KeywordMatcherAdaptor
{
explicit KeywordMatcherAdaptor(char const * prefix,
uint32_t maxKeywordMatchCost, uint32_t maxPrefixMatchCost,
char const * s0, char const * s1 = NULL)
public:
Matcher(char const * query)
{
m_keywords.push_back(strings::MakeUniString(s0));
if (s1)
m_keywords.push_back(strings::MakeUniString(s1));
strings::UniString const uniQuery = search::NormalizeAndSimplifyString(query);
SplitUniString(uniQuery, MakeBackInsertFunctor(m_keywords), search::Delimiters());
if (!uniQuery.empty() && uniQuery.back() != ' ')
{
m_prefix = m_keywords.back();
m_keywords.pop_back();
}
m_ptrs.resize(m_keywords.size());
for (size_t i = 0; i < m_keywords.size(); ++i)
m_keywordPtrs.push_back(&m_keywords[i]);
m_pMatcher.reset(new search::impl::KeywordMatcher(&m_keywordPtrs[0], m_keywordPtrs.size(),
strings::MakeUniString(prefix),
maxKeywordMatchCost, maxPrefixMatchCost,
&KeywordMatchForTest, &PrefixMatchForTest));
m_ptrs[i] = &m_keywords[i];
m_pMatcher.reset(new search::KeywordMatcher(m_ptrs.data(), int(m_ptrs.size()), &m_prefix));
}
vector<strings::UniString> m_keywords;
vector<strings::UniString const *> m_keywordPtrs;
scoped_ptr<search::impl::KeywordMatcher> m_pMatcher;
scoped_ptr<search::KeywordMatcher> m_pMatcher;
private:
buffer_vector<strings::UniString, 10> m_keywords;
buffer_vector<strings::UniString const *, 10> m_ptrs;
strings::UniString m_prefix;
};
} // unnamed namespace
// TODO: KeywordMatcher tests.
/*
UNIT_TEST(KeywordMatcher_Smoke)
UNIT_TEST(KeywordMatcher_New)
{
KeywordMatcherAdaptor matcherAdaptor("l", 3, 3, "minsk", "belarus");
search::impl::KeywordMatcher & matcher = *matcherAdaptor.m_pMatcher;
TEST_EQUAL(matcher.GetPrefixMatchScore(), 4, ());
TEST_EQUAL(vector<uint32_t>(matcher.GetKeywordMatchScores(),
matcher.GetKeywordMatchScores() + 2),
Vec<uint32_t>(4, 4), ());
TEST_EQUAL(matcher.GetMatchScore(), 4 + 4 + 4, ());
matcher.ProcessName("belarrr");
TEST_EQUAL(matcher.GetPrefixMatchScore(), 1, ());
TEST_EQUAL(vector<uint32_t>(matcher.GetKeywordMatchScores(),
matcher.GetKeywordMatchScores() + 2),
Vec<uint32_t>(4, 2), ());
TEST_EQUAL(matcher.GetMatchScore(), 1 + 4 + 2, ());
matcher.ProcessName("belaruu minnn");
TEST_EQUAL(matcher.GetPrefixMatchScore(), 1, ());
TEST_EQUAL(vector<uint32_t>(matcher.GetKeywordMatchScores(),
matcher.GetKeywordMatchScores() + 2),
Vec<uint32_t>(2, 1), ());
TEST_EQUAL(matcher.GetMatchScore(), 1 + 2 + 1, ());
matcher.ProcessName("belaruu les minnn");
TEST_EQUAL(matcher.GetPrefixMatchScore(), 0, ());
TEST_EQUAL(vector<uint32_t>(matcher.GetKeywordMatchScores(),
matcher.GetKeywordMatchScores() + 2),
Vec<uint32_t>(2, 1), ());
TEST_EQUAL(matcher.GetMatchScore(), 0 + 2 + 1, ());
Matcher matcher("new ");
TEST_EQUAL(matcher.m_pMatcher->Score("new"), 0, ());
TEST_EQUAL(matcher.m_pMatcher->Score("york"), MAX_SCORE, ());
TEST_EQUAL(matcher.m_pMatcher->Score("new york"), 0, ());
}
UNIT_TEST(KeywordMatcher_NoPrefix)
UNIT_TEST(KeywordMatcher_York)
{
KeywordMatcherAdaptor matcherAdaptor("", 3, 3, "minsk", "belarus");
search::impl::KeywordMatcher & matcher = *matcherAdaptor.m_pMatcher;
TEST_EQUAL(matcher.GetPrefixMatchScore(), 4, ());
TEST_EQUAL(matcher.GetMatchScore(), 4 + 4 + 4, ());
matcher.ProcessName("belaruu zzz minnn");
TEST_EQUAL(matcher.GetPrefixMatchScore(), 0, ());
TEST_EQUAL(vector<uint32_t>(matcher.GetKeywordMatchScores(),
matcher.GetKeywordMatchScores() + 1),
Vec<uint32_t>(2, 1), ());
TEST_EQUAL(matcher.GetMatchScore(), 0 + 2 + 1, ());
Matcher matcher("york ");
TEST_EQUAL(matcher.m_pMatcher->Score("new"), MAX_SCORE, ());
TEST_EQUAL(matcher.m_pMatcher->Score("york"), 0, ());
TEST_EQUAL(matcher.m_pMatcher->Score("new york"), 1, ());
}
UNIT_TEST(KeywordMatcher_Suomi)
UNIT_TEST(KeywordMatcher_NewYork)
{
KeywordMatcherAdaptor matcherAdaptor("", 4, 4, "minsk");
search::impl::KeywordMatcher & matcher = *matcherAdaptor.m_pMatcher;
matcher.ProcessName("Suomi");
TEST_EQUAL(matcher.GetMatchScore(), 5, ());
Matcher matcher1("new york ");
Matcher matcher2("new york");
TEST_EQUAL(matcher1.m_pMatcher->Score("new"), MAX_SCORE, ());
TEST_EQUAL(matcher2.m_pMatcher->Score("new"), MAX_SCORE, ());
TEST_EQUAL(matcher1.m_pMatcher->Score("york"), MAX_SCORE, ());
TEST_EQUAL(matcher2.m_pMatcher->Score("york"), MAX_SCORE, ());
TEST_EQUAL(matcher1.m_pMatcher->Score("new york"), 0, ());
TEST_EQUAL(matcher2.m_pMatcher->Score("new york"), 0, ());
}
*/
// Keyword order in the query must not matter: "york new " (trailing space, so
// both words are complete keywords and there is no prefix) has to match the
// name "new york" exactly like "new york " does in KeywordMatcher_NewYork.
UNIT_TEST(KeywordMatcher_YorkNew)
{
  Matcher matcher("york new ");
  // A name containing only one of the two keywords is rejected.
  TEST_EQUAL(matcher.m_pMatcher->Score("new"), MAX_SCORE, ());
  TEST_EQUAL(matcher.m_pMatcher->Score("york"), MAX_SCORE, ());
  // Both keywords are present and no unmatched token precedes the last
  // matched one, so there is no penalty.
  TEST_EQUAL(matcher.m_pMatcher->Score("new york"), 0, ());
}
// Query without a trailing space: the last word "yo" is treated as an
// unfinished prefix and "new" is the only complete keyword.
UNIT_TEST(KeywordMatcher_NewYo)
{
  Matcher queryMatcher("new yo");
  search::KeywordMatcher const & keywordMatcher = *queryMatcher.m_pMatcher;

  // "new" alone has no token starting with the prefix "yo".
  TEST_EQUAL(keywordMatcher.Score("new"), MAX_SCORE, ());
  // "york" alone satisfies the prefix but lacks the keyword "new".
  TEST_EQUAL(keywordMatcher.Score("york"), MAX_SCORE, ());
  // Keyword and prefix are both satisfied: no penalty.
  TEST_EQUAL(keywordMatcher.Score("new york"), 0, ());
}