From 98d058924d40889f72ed6ce13c1d8cc0ceb87668 Mon Sep 17 00:00:00 2001
From: vng
Date: Wed, 23 Apr 2014 15:34:41 -0400
Subject: [PATCH] [search] Fixed stop words processing for category synonyms.

---
 base/string_utils.cpp         |  5 +++++
 base/string_utils.hpp         |  2 ++
 indexer/categories_holder.cpp | 17 ++++++++++++++++-
 indexer/categories_holder.hpp |  1 +
 4 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/base/string_utils.cpp b/base/string_utils.cpp
index 673a91f8ac..a4e363671f 100644
--- a/base/string_utils.cpp
+++ b/base/string_utils.cpp
@@ -12,6 +12,11 @@
 namespace strings
 {
 
+bool UniString::IsEqualAscii(char const * s) const
+{
+  return (size() == strlen(s) && equal(begin(), end(), s));
+}
+
 SimpleDelimiter::SimpleDelimiter(char const * delimChars)
 {
   string const s(delimChars);
diff --git a/base/string_utils.hpp b/base/string_utils.hpp
index f24fddf17c..a9a4523040 100644
--- a/base/string_utils.hpp
+++ b/base/string_utils.hpp
@@ -25,6 +25,8 @@ public:
   UniString() {}
   explicit UniString(size_t n, UniChar c = UniChar()) : BaseT(n, c) {}
   template <class IterT> UniString(IterT b, IterT e) : BaseT(b, e) {}
+
+  bool IsEqualAscii(char const * s) const;
 };
 
 UniString MakeLowerCase(UniString const & s);
diff --git a/indexer/categories_holder.cpp b/indexer/categories_holder.cpp
index 50dc4c58d9..97a1ba241e 100644
--- a/indexer/categories_holder.cpp
+++ b/indexer/categories_holder.cpp
@@ -49,7 +49,8 @@ void CategoriesHolder::AddCategory(Category & cat, vector<uint32_t> & types)
 
       for (size_t j = 0; j < tokens.size(); ++j)
         for (size_t k = 0; k < types.size(); ++k)
-          m_name2type.insert(make_pair(tokens[j], types[k]));
+          if (ValidKeyToken(tokens[j]))
+            m_name2type.insert(make_pair(tokens[j], types[k]));
     }
   }
 
@@ -57,6 +58,20 @@ void CategoriesHolder::AddCategory(Category & cat, vector<uint32_t> & types)
   types.clear();
 }
 
+bool CategoriesHolder::ValidKeyToken(StringT const & s)
+{
+  if (s.size() > 2)
+    return true;
+
+  /// @todo We need to have global stop words array for the most used languages.
+  char const * arr[] = { "a", "z", "s", "d", "di", "de", "le" };
+  for (size_t i = 0; i < ARRAY_SIZE(arr); ++i)
+    if (s.IsEqualAscii(arr[i]))
+      return false;
+
+  return true;
+}
+
 void CategoriesHolder::LoadFromStream(istream & s)
 {
   m_type2cat.clear();
diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp
index e756aab61e..1b740b8cc4 100644
--- a/indexer/categories_holder.hpp
+++ b/indexer/categories_holder.hpp
@@ -96,6 +96,7 @@ public:
 
 private:
   void AddCategory(Category & cat, vector<uint32_t> & types);
+  static bool ValidKeyToken(StringT const & s);
 };
 
 inline void swap(CategoriesHolder & a, CategoriesHolder & b)