[search] Fixed stop words processing for category synonyms.

This commit is contained in:
vng 2014-04-23 15:34:41 -04:00 committed by Alex Zolotarev
parent 9632652c23
commit 98d058924d
4 changed files with 24 additions and 1 deletion

View file

@ -12,6 +12,11 @@
namespace strings
{
/// Compares this string with the null-terminated C string @p s.
/// @param s Null-terminated character string to compare against.
/// @return true when both have the same length and every element of this
///         string compares equal to the character at the same position in @p s.
bool UniString::IsEqualAscii(char const * s) const
{
  // equal() reads size() characters from s, so the lengths have to match first.
  if (size() != strlen(s))
    return false;

  return equal(begin(), end(), s);
}
SimpleDelimiter::SimpleDelimiter(char const * delimChars)
{
string const s(delimChars);

View file

@ -25,6 +25,8 @@ public:
/// Default constructor; the BaseT base is default-constructed.
UniString() {}
/// Forwards @p n and @p c to the BaseT(n, c) constructor; @p c defaults to a
/// value-initialized UniChar. Presumably yields n copies of c - confirm against BaseT.
explicit UniString(size_t n, UniChar c = UniChar()) : BaseT(n, c) {}
/// Forwards the iterators @p b and @p e to the BaseT(b, e) constructor
/// (presumably copying the range [b, e) - confirm against BaseT).
template <class IterT> UniString(IterT b, IterT e) : BaseT(b, e) {}
/// @return true if this string has the same length as the null-terminated
///         string @p s and compares equal to it element by element.
bool IsEqualAscii(char const * s) const;
};
UniString MakeLowerCase(UniString const & s);

View file

@ -49,7 +49,8 @@ void CategoriesHolder::AddCategory(Category & cat, vector<uint32_t> & types)
for (size_t j = 0; j < tokens.size(); ++j)
for (size_t k = 0; k < types.size(); ++k)
m_name2type.insert(make_pair(tokens[j], types[k]));
if (ValidKeyToken(tokens[j]))
m_name2type.insert(make_pair(tokens[j], types[k]));
}
}
@ -57,6 +58,20 @@ void CategoriesHolder::AddCategory(Category & cat, vector<uint32_t> & types)
types.clear();
}
/// Decides whether a token may be used as a key.
/// @param s Token to check.
/// @return false when @p s equals one of the hard-coded short stop words,
///         true otherwise (any token longer than two elements is accepted).
bool CategoriesHolder::ValidKeyToken(StringT const & s)
{
  /// @todo We need to have global stop words array for the most used languages.
  char const * stopWords[] = { "a", "z", "s", "d", "di", "de", "le" };

  // Every stop word above has at most two characters, so longer tokens
  // cannot match and skip the comparison entirely.
  if (s.size() <= 2)
  {
    for (size_t idx = 0; idx != ARRAY_SIZE(stopWords); ++idx)
    {
      if (s.IsEqualAscii(stopWords[idx]))
        return false;
    }
  }

  return true;
}
void CategoriesHolder::LoadFromStream(istream & s)
{
m_type2cat.clear();

View file

@ -96,6 +96,7 @@ public:
private:
void AddCategory(Category & cat, vector<uint32_t> & types);
/// @return false if @p s is one of the short stop words rejected as a key
///         token; true for every other token.
static bool ValidKeyToken(StringT const & s);
};
inline void swap(CategoriesHolder & a, CategoriesHolder & b)