Search in whole ArticleInfo key.
This commit is contained in:
parent
50c3fe7cfa
commit
672bd9ebb9
7 changed files with 162 additions and 35 deletions
7
env/strings.cpp
vendored
7
env/strings.cpp
vendored
|
@ -5,17 +5,10 @@
|
|||
|
||||
#include "../3rdparty/utf8proc/utf8proc.h"
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
|
||||
namespace str
|
||||
{
|
||||
|
||||
/// Trims @p s in place by delegating to boost's trim algorithm, which strips
/// the leading and trailing characters it classifies as spaces.
void Trim(string & s)
{
  boost::algorithm::trim(s);
}
|
||||
|
||||
string MakeNormalizeAndLowerUtf8(string const & s)
|
||||
{
|
||||
int const count = static_cast<int>(s.size());
|
||||
|
|
2
env/strings.hpp
vendored
2
env/strings.hpp
vendored
|
@ -14,8 +14,6 @@ template <class T> string ToString(T const & t)
|
|||
return ss.str();
|
||||
}
|
||||
|
||||
void Trim(string & s);
|
||||
|
||||
string MakeNormalizeAndLowerUtf8(string const & s);
|
||||
|
||||
template <class ToDo>
|
||||
|
|
|
@ -6,16 +6,52 @@
|
|||
#include "../env/reader.hpp"
|
||||
#include "../env/latlon.hpp"
|
||||
#include "../env/assert.hpp"
|
||||
#include "../env/logging.hpp"
|
||||
|
||||
#include "../std/iterator.hpp"
|
||||
#include "../std/algorithm.hpp"
|
||||
#include "../std/cmath.hpp"
|
||||
#include "../std/static_assert.hpp"
|
||||
#include "../std/array.hpp"
|
||||
|
||||
|
||||
namespace
{
/// Functor that accumulates tokens into one string.
/// Every appended token is followed by a single space, so the destination
/// ends with a trailing space whenever at least one token was appended.
class AppendString
{
  string & m_str;

public:
  /// @param[in,out] str Destination string; it is only referenced, so it must
  /// outlive this functor.
  explicit AppendString(string & str) : m_str(str) {}

  /// Appends @p s and one separating space to the destination string.
  void operator() (string const & s) const
  {
    // Append in place: building "m_str + s + ' '" would create two temporaries
    // and copy the whole accumulated string again for every token.
    m_str += s;
    m_str += ' ';
  }
};
}
|
||||
|
||||
/// Builds a search key from a title: the title is passed through
/// str::MakeNormalizeAndLowerUtf8, split on the delimiter set below, and the
/// tokens are re-joined by AppendString (each token followed by one space).
string ArticleInfo::Title2Key(string const & s)
{
  string const normalized = str::MakeNormalizeAndLowerUtf8(s);

  string key;
  str::Tokenize(normalized, " ()'\"-&\t", AppendString(key));
  return key;
}
|
||||
|
||||
/// Converts a user-typed prefix into key form using Title2Key.
/// The last character of the key is dropped unless the raw input itself ends
/// with a space, so an unfinished last word can still match as a prefix.
string ArticleInfo::Prefix2Key(string const & s)
{
  string key = Title2Key(s);
  if (key.empty())
    return key;

  // NOTE(review): indexing s here presumes that a non-empty key implies a
  // non-empty input - confirm against str::Tokenize.
  bool const endsWithSpace = (s[s.size() - 1] == ' ');
  if (!endsWithSpace)
    key.erase(key.size() - 1, 1);

  return key;
}
|
||||
|
||||
void ArticleInfo::GenerateKey()
|
||||
{
|
||||
m_key = str::MakeNormalizeAndLowerUtf8(m_title);
|
||||
m_key = Title2Key(m_title);
|
||||
}
|
||||
|
||||
bool ArticleInfo::IsValidCoordinates() const
|
||||
|
@ -126,3 +162,42 @@ bool ArticleInfo::operator == (ArticleInfo const & r) const
|
|||
EqualCoord(m_lat, r.m_lat) &&
|
||||
EqualCoord(m_lon, r.m_lon));
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
bool IsStopWord(string const & s, size_t pos, string const & query)
|
||||
{
|
||||
static char const * arr[] = { "by ", "of ", "on ", "in ", "upon ", "and ", "the " };
|
||||
for (size_t i = 0; i < ArraySize(arr); ++i)
|
||||
{
|
||||
size_t const len = strlen(arr[i]);
|
||||
if (query.size() < len &&
|
||||
len + pos <= s.size() &&
|
||||
s.compare(pos, len, arr[i]) == 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Scans m_key for occurrences of @p query.
/// @return true for the first occurrence that is not at index 0, is directly
/// preceded by a space, and is not rejected by IsStopWord; false if no
/// occurrence qualifies.
bool ArticleInfo::PrefixMatchExcept1stToken(string const & query) const
{
  for (size_t pos = 0; pos < m_key.size(); ++pos)
  {
    pos = m_key.find(query, pos);
    if (pos == string::npos)
      break;

    // An occurrence at index 0 has no preceding space, so it never qualifies.
    bool const afterSpace = (pos != 0 && m_key[pos - 1] == ' ');
    if (afterSpace && !IsStopWord(m_key, pos, query))
      return true;
  }

  return false;
}
|
||||
|
|
|
@ -14,6 +14,7 @@ const int EMPTY_COORD = 1000;
|
|||
class ArticleInfo
|
||||
{
|
||||
void GenerateKey();
|
||||
static string Title2Key(string const & s);
|
||||
|
||||
protected:
|
||||
string m_key;
|
||||
|
@ -52,6 +53,8 @@ public:
|
|||
GenerateKey();
|
||||
}
|
||||
|
||||
static string Prefix2Key(string const & s);
|
||||
|
||||
/// @return Reference to the stored title (m_title).
string const & GetTitle() const
{
  return m_title;
}
|
||||
/// @return m_url with the ".html" suffix appended.
string GetUrl() const
{
  string url(m_url);
  url += ".html";
  return url;
}
|
||||
/// @return m_url with the ".jpg" suffix appended.
string GetThumbnailUrl() const
{
  string url(m_url);
  url += ".jpg";
  return url;
}
|
||||
|
@ -71,6 +74,9 @@ public:
|
|||
|
||||
bool operator == (ArticleInfo const & r) const;
|
||||
|
||||
/// @param[in] query Simplified, lower-case UTF-8 string; it is matched against m_key.
|
||||
bool PrefixMatchExcept1stToken(string const & query) const;
|
||||
|
||||
struct LessStorage
|
||||
{
|
||||
bool operator() (ArticleInfo const & i1, ArticleInfo const & i2) const
|
||||
|
|
|
@ -6,7 +6,6 @@
|
|||
|
||||
#include "../std/algorithm.hpp"
|
||||
#include "../std/utility.hpp"
|
||||
#include "../std/map.hpp"
|
||||
#include "../std/iterator.hpp"
|
||||
|
||||
|
||||
|
@ -38,54 +37,74 @@ void Storage::Load(string const & path)
|
|||
Load(reader);
|
||||
}
|
||||
|
||||
void Storage::ResultsAccumulator::Add(size_t ind)
|
||||
{
|
||||
pair<MapT::iterator, bool> res = m_map.insert(make_pair(m_storage.GetUrl(ind), ind));
|
||||
|
||||
// replace redirect index with origin index
|
||||
if (!res.second && m_storage.IsRedirect(res.first->second) && !m_storage.IsRedirect(ind))
|
||||
res.first->second = ind;
|
||||
}
|
||||
|
||||
bool Storage::ResultsAccumulator::IsExist(size_t ind) const
|
||||
{
|
||||
return (m_map.find(m_storage.GetUrl(ind)) != m_map.end());
|
||||
}
|
||||
|
||||
/// Appends all accumulated indices to @p out and sorts the appended part with
/// LessScore(m_storage, lat, lon).
/// Elements that were already in @p out before the call keep their positions.
void Storage::ResultsAccumulator::GetResults(vector<size_t> & out, double lat, double lon) const
{
  size_t const oldSize = out.size();
  out.reserve(oldSize + m_map.size());

  MapT::const_iterator it = m_map.begin();
  MapT::const_iterator const last = m_map.end();
  while (it != last)
  {
    out.push_back(it->second);
    ++it;
  }

  // Sort only the freshly appended tail.
  vector<size_t>::iterator const tail =
      out.begin() + static_cast<vector<size_t>::difference_type>(oldSize);
  sort(tail, out.end(), LessScore(m_storage, lat, lon));
}
|
||||
|
||||
/// Rebuilds m_lastQuery (indices into m_info) for the given search prefix.
///
/// @param[in] prefix Raw user input; it is converted with ArticleInfo::Prefix2Key.
/// @param[in] lat, lon Forwarded to LessScore, which orders the results.
///
/// For an empty key every non-redirect article is returned, sorted by LessScore.
/// Otherwise the result consists of two consecutive groups, each sorted by
/// LessScore on its own:
///   1. articles selected by equal_range with ArticleInfo::LessPrefix;
///   2. articles for which PrefixMatchExcept1stToken succeeds and whose URL is
///      not already present in group 1.
void Storage::QueryArticleInfo(string const & prefix, double lat, double lon)
{
  m_lastQuery.clear();

  string const query = ArticleInfo::Prefix2Key(prefix);
  if (query.empty())
  {
    size_t const count = m_info.size();
    m_lastQuery.reserve(count);

    // Add all articles except redirects.
    for (size_t i = 0; i < count; ++i)
      if (!m_info[i].m_redirect)
        m_lastQuery.push_back(i);

    sort(m_lastQuery.begin(), m_lastQuery.end(), LessScore(*this, lat, lon));
  }
  else
  {
    // Find and add the range of articles matching the input query.
    typedef vector<ArticleInfo>::iterator IterT;
    pair<IterT, IterT> range = equal_range(m_info.begin(), m_info.end(), query,
                                           ArticleInfo::LessPrefix());

    // The accumulator filters duplicating redirects (see ResultsAccumulator::Add).
    ResultsAccumulator acc1(*this);
    while (range.first != range.second)
    {
      acc1.Add(distance(m_info.begin(), range.first));
      ++range.first;
    }
    acc1.GetResults(m_lastQuery, lat, lon);

    // Process all articles by matching 2nd, 3rd, ... tokens.
    ResultsAccumulator acc2(*this);
    for (size_t i = 0; i < m_info.size(); ++i)
    {
      if (m_info[i].PrefixMatchExcept1stToken(query) && !acc1.IsExist(i))
        acc2.Add(i);
    }

    // GetResults sorts only what it appends, so group 1 stays in front of
    // group 2. No global re-sort must follow: it would merge the two groups.
    acc2.GetResults(m_lastQuery, lat, lon);
  }
}
|
||||
|
||||
string Storage::FormatParentName(ArticleInfo const & info, int maxDepth) const
|
||||
|
|
|
@ -6,10 +6,30 @@
|
|||
|
||||
#include "../std/vector.hpp"
|
||||
#include "../std/noncopyable.hpp"
|
||||
#include "../std/map.hpp"
|
||||
|
||||
|
||||
class Storage : noncopyable
|
||||
{
|
||||
/// @return The m_url field of the article stored at index @p ind of m_info
/// (no bounds check is performed).
string const & GetUrl(size_t ind) const
{
  ArticleInfo const & info = m_info[ind];
  return info.m_url;
}
|
||||
bool IsRedirect(size_t ind) const { return m_info[ind].m_redirect; }
|
||||
|
||||
class ResultsAccumulator
|
||||
{
|
||||
Storage const & m_storage;
|
||||
|
||||
typedef map<string, size_t> MapT;
|
||||
MapT m_map;
|
||||
|
||||
public:
|
||||
ResultsAccumulator(Storage & storage) : m_storage(storage) {}
|
||||
|
||||
void Add(size_t ind);
|
||||
bool IsExist(size_t ind) const;
|
||||
|
||||
void GetResults(vector<size_t> & out, double lat, double lon) const;
|
||||
};
|
||||
|
||||
public:
|
||||
void Load(rd::Reader & reader);
|
||||
void Load(string const & path);
|
||||
|
|
|
@ -11,6 +11,22 @@
|
|||
#include "../std/array.hpp"
|
||||
|
||||
|
||||
// Checks ArticleInfo::PrefixMatchExcept1stToken on a title that mixes quotes,
// brackets and a dash.
TEST(ArticleInfo, PrefixMatch)
{
  ArticleInfo info(" 'Loch Lomond' and \"The Trossachs\" (National-Park)");

  // Queries expected to match.
  EXPECT_TRUE(info.PrefixMatchExcept1stToken("lom"));
  EXPECT_TRUE(info.PrefixMatchExcept1stToken("trossachs"));
  EXPECT_TRUE(info.PrefixMatchExcept1stToken("national"));
  EXPECT_TRUE(info.PrefixMatchExcept1stToken("park"));
  EXPECT_TRUE(info.PrefixMatchExcept1stToken("the trossachs national park"));

  // The very first token is never matched by this method.
  EXPECT_FALSE(info.PrefixMatchExcept1stToken("loch"));

  // Bare stop words are rejected.
  EXPECT_FALSE(info.PrefixMatchExcept1stToken("and"));
  EXPECT_FALSE(info.PrefixMatchExcept1stToken("the"));

  // A query longer than the token it starts with does not match.
  EXPECT_FALSE(info.PrefixMatchExcept1stToken("parke"));
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
|
|
Reference in a new issue