diff --git a/env/strings.cpp b/env/strings.cpp index 66a347b..5f6bcbd 100644 --- a/env/strings.cpp +++ b/env/strings.cpp @@ -5,17 +5,10 @@ #include "../3rdparty/utf8proc/utf8proc.h" -#include - namespace str { -void Trim(string & s) -{ - boost::trim(s); -} - string MakeNormalizeAndLowerUtf8(string const & s) { int const count = static_cast(s.size()); diff --git a/env/strings.hpp b/env/strings.hpp index 9fd25b3..cbec89c 100644 --- a/env/strings.hpp +++ b/env/strings.hpp @@ -14,8 +14,6 @@ template string ToString(T const & t) return ss.str(); } -void Trim(string & s); - string MakeNormalizeAndLowerUtf8(string const & s); template diff --git a/storage/article_info.cpp b/storage/article_info.cpp index b043846..fdc1ee7 100644 --- a/storage/article_info.cpp +++ b/storage/article_info.cpp @@ -6,16 +6,52 @@ #include "../env/reader.hpp" #include "../env/latlon.hpp" #include "../env/assert.hpp" +#include "../env/logging.hpp" #include "../std/iterator.hpp" #include "../std/algorithm.hpp" #include "../std/cmath.hpp" #include "../std/static_assert.hpp" +#include "../std/array.hpp" +namespace +{ + class AppendString + { + string & m_str; + public: + AppendString(string & str) : m_str(str) {} + void operator() (string const & s) const + { + m_str = m_str + s + ' '; + } + }; +} + +string ArticleInfo::Title2Key(string const & s) +{ + string res; + str::Tokenize(str::MakeNormalizeAndLowerUtf8(s), " ()'\"-&\t", AppendString(res)); + return res; +} + +string ArticleInfo::Prefix2Key(string const & s) +{ + string res = Title2Key(s); + + if (!res.empty()) + { + if (s[s.size()-1] != ' ') + res.erase(res.size()-1, 1); + } + + return res; +} + void ArticleInfo::GenerateKey() { - m_key = str::MakeNormalizeAndLowerUtf8(m_title); + m_key = Title2Key(m_title); } bool ArticleInfo::IsValidCoordinates() const @@ -126,3 +162,42 @@ bool ArticleInfo::operator == (ArticleInfo const & r) const EqualCoord(m_lat, r.m_lat) && EqualCoord(m_lon, r.m_lon)); } + +namespace +{ + +bool IsStopWord(string const & 
s, size_t pos, string const & query) +{ + static char const * arr[] = { "by ", "of ", "on ", "in ", "upon ", "and ", "the " }; + for (size_t i = 0; i < ArraySize(arr); ++i) + { + size_t const len = strlen(arr[i]); + if (query.size() < len && + len + pos <= s.size() && + s.compare(pos, len, arr[i]) == 0) + { + return true; + } + } + return false; +} + +} + +bool ArticleInfo::PrefixMatchExcept1stToken(string const & query) const +{ + size_t i = 0; + while (i < m_key.size()) + { + i = m_key.find(query, i); + if (i == string::npos) + return false; + + if (i != 0 && m_key[i-1] == ' ' && !IsStopWord(m_key, i, query)) + return true; + + ++i; + } + + return false; +} diff --git a/storage/article_info.hpp b/storage/article_info.hpp index 3e0e133..d8b83c6 100644 --- a/storage/article_info.hpp +++ b/storage/article_info.hpp @@ -14,6 +14,7 @@ const int EMPTY_COORD = 1000; class ArticleInfo { void GenerateKey(); + static string Title2Key(string const & s); protected: string m_key; @@ -52,6 +53,8 @@ public: GenerateKey(); } + static string Prefix2Key(string const & s); + string const & GetTitle() const { return m_title; } string GetUrl() const { return m_url + ".html"; } string GetThumbnailUrl() const { return m_url + ".jpg"; } @@ -71,6 +74,9 @@ public: bool operator == (ArticleInfo const & r) const; + /// @param[in] query should be simplified, lower case, utf8 string (matching by m_key). 
+ bool PrefixMatchExcept1stToken(string const & query) const; + struct LessStorage { bool operator() (ArticleInfo const & i1, ArticleInfo const & i2) const diff --git a/storage/storage.cpp b/storage/storage.cpp index f7cb8e4..91782af 100644 --- a/storage/storage.cpp +++ b/storage/storage.cpp @@ -6,7 +6,6 @@ #include "../std/algorithm.hpp" #include "../std/utility.hpp" -#include "../std/map.hpp" #include "../std/iterator.hpp" @@ -38,54 +37,74 @@ void Storage::Load(string const & path) Load(reader); } +void Storage::ResultsAccumulator::Add(size_t ind) +{ + pair res = m_map.insert(make_pair(m_storage.GetUrl(ind), ind)); + + // replace redirect index with origin index + if (!res.second && m_storage.IsRedirect(res.first->second) && !m_storage.IsRedirect(ind)) + res.first->second = ind; +} + +bool Storage::ResultsAccumulator::IsExist(size_t ind) const +{ + return (m_map.find(m_storage.GetUrl(ind)) != m_map.end()); +} + +void Storage::ResultsAccumulator::GetResults(vector & out, double lat, double lon) const +{ + size_t const count = out.size(); + out.reserve(count + m_map.size()); + + for (MapT::const_iterator i = m_map.begin(); i != m_map.end(); ++i) + out.push_back(i->second); + + vector::iterator iStart = out.begin(); + advance(iStart, count); + sort(iStart, out.end(), LessScore(m_storage, lat, lon)); +} + void Storage::QueryArticleInfo(string const & prefix, double lat, double lon) { m_lastQuery.clear(); - string query = prefix; - str::Trim(query); - + string query = ArticleInfo::Prefix2Key(prefix); if (query.empty()) { size_t const count = m_info.size(); m_lastQuery.reserve(count); - // add all articles except redirects + // Add all articles except redirects for (size_t i = 0; i < count; ++i) if (!m_info[i].m_redirect) m_lastQuery.push_back(i); + + sort(m_lastQuery.begin(), m_lastQuery.end(), LessScore(*this, lat, lon)); } else { - // find range of articles by input query + // Find and add range of articles by matching input query. 
typedef vector::iterator IterT; - pair range = equal_range(m_info.begin(), m_info.end(), - str::MakeNormalizeAndLowerUtf8(query), + pair range = equal_range(m_info.begin(), m_info.end(), query, ArticleInfo::LessPrefix()); - // filter duplicating redirects - map theMap; - typedef map::iterator MapIterT; + ResultsAccumulator acc1(*this); while (range.first != range.second) { - size_t const ind = distance(m_info.begin(), range.first); - pair res = theMap.insert(make_pair(range.first->m_url, ind)); - - // replace redirect index with origin index - if (!res.second && m_info[res.first->second].m_redirect && !m_info[ind].m_redirect) - res.first->second = ind; - + acc1.Add(distance(m_info.begin(), range.first)); ++range.first; } + acc1.GetResults(m_lastQuery, lat, lon); - // assign results - m_lastQuery.reserve(theMap.size()); - for (MapIterT i = theMap.begin(); i != theMap.end(); ++i) - m_lastQuery.push_back(i->second); + // Process all articles by matching 2nd, 3rd, ... tokens + ResultsAccumulator acc2(*this); + for (size_t i = 0; i < m_info.size(); ++i) + { + if (m_info[i].PrefixMatchExcept1stToken(query) && !acc1.IsExist(i)) + acc2.Add(i); + } + acc2.GetResults(m_lastQuery, lat, lon); } - - // sort according to score - sort(m_lastQuery.begin(), m_lastQuery.end(), LessScore(*this, lat, lon)); } string Storage::FormatParentName(ArticleInfo const & info, int maxDepth) const diff --git a/storage/storage.hpp b/storage/storage.hpp index 3cd3756..4cbe8e2 100644 --- a/storage/storage.hpp +++ b/storage/storage.hpp @@ -6,10 +6,30 @@ #include "../std/vector.hpp" #include "../std/noncopyable.hpp" +#include "../std/map.hpp" class Storage : noncopyable { + string const & GetUrl(size_t ind) const { return m_info[ind].m_url; } + bool IsRedirect(size_t ind) const { return m_info[ind].m_redirect; } + + class ResultsAccumulator + { + Storage const & m_storage; + + typedef map MapT; + MapT m_map; + + public: + ResultsAccumulator(Storage & storage) : m_storage(storage) {} + + void 
Add(size_t ind); + bool IsExist(size_t ind) const; + + void GetResults(vector & out, double lat, double lon) const; + }; + public: void Load(rd::Reader & reader); void Load(string const & path); diff --git a/storage/tests/storage_test.cpp b/storage/tests/storage_test.cpp index c93a770..4a3ca4e 100644 --- a/storage/tests/storage_test.cpp +++ b/storage/tests/storage_test.cpp @@ -11,6 +11,22 @@ #include "../std/array.hpp" +TEST(ArticleInfo, PrefixMatch) +{ + ArticleInfo i(" 'Loch Lomond' and \"The Trossachs\" (National-Park)"); + + EXPECT_FALSE(i.PrefixMatchExcept1stToken("loch")); + EXPECT_TRUE(i.PrefixMatchExcept1stToken("lom")); + EXPECT_FALSE(i.PrefixMatchExcept1stToken("and")); + EXPECT_FALSE(i.PrefixMatchExcept1stToken("the")); + EXPECT_TRUE(i.PrefixMatchExcept1stToken("trossachs")); + EXPECT_TRUE(i.PrefixMatchExcept1stToken("national")); + EXPECT_TRUE(i.PrefixMatchExcept1stToken("park")); + EXPECT_FALSE(i.PrefixMatchExcept1stToken("parke")); + + EXPECT_TRUE(i.PrefixMatchExcept1stToken("the trossachs national park")); +} + namespace {