Search in whole ArticleInfo key.

This commit is contained in:
vng 2013-08-12 22:01:01 +03:00
parent 50c3fe7cfa
commit 672bd9ebb9
7 changed files with 162 additions and 35 deletions

7
env/strings.cpp vendored
View file

@ -5,17 +5,10 @@
#include "../3rdparty/utf8proc/utf8proc.h"
#include <boost/algorithm/string.hpp>
namespace str
{
/// Strips leading and trailing whitespace from @p s in place.
void Trim(string & s)
{
  // Trimming each side separately yields the same result as a single boost::trim.
  boost::trim_right(s);
  boost::trim_left(s);
}
string MakeNormalizeAndLowerUtf8(string const & s)
{
int const count = static_cast<int>(s.size());

2
env/strings.hpp vendored
View file

@ -14,8 +14,6 @@ template <class T> string ToString(T const & t)
return ss.str();
}
void Trim(string & s);
string MakeNormalizeAndLowerUtf8(string const & s);
template <class ToDo>

View file

@ -6,16 +6,52 @@
#include "../env/reader.hpp"
#include "../env/latlon.hpp"
#include "../env/assert.hpp"
#include "../env/logging.hpp"
#include "../std/iterator.hpp"
#include "../std/algorithm.hpp"
#include "../std/cmath.hpp"
#include "../std/static_assert.hpp"
#include "../std/array.hpp"
namespace
{
class AppendString
{
string & m_str;
public:
AppendString(string & str) : m_str(str) {}
void operator() (string const & s) const
{
m_str = m_str + s + ' ';
}
};
}
/// Builds the search key for a title.
/// The title is passed through str::MakeNormalizeAndLowerUtf8, split on the
/// delimiter set below, and every resulting token is appended to the key
/// followed by a single space (so a non-empty key always ends with ' ').
string ArticleInfo::Title2Key(string const & s)
{
  string const normalized = str::MakeNormalizeAndLowerUtf8(s);

  string key;
  str::Tokenize(normalized, " ()'\"-&\t", AppendString(key));
  return key;
}
/// Converts a user-typed prefix into key form (see Title2Key).
/// The trailing space that Title2Key adds after the last token is kept only
/// when the input itself ends with ' '; otherwise it is removed, so the last
/// token stays an open prefix.
string ArticleInfo::Prefix2Key(string const & s)
{
  string key = Title2Key(s);
  if (key.empty())
    return key;

  bool const endsWithSpace = (*s.rbegin() == ' ');
  if (!endsWithSpace)
    key.resize(key.size() - 1);   // drop the separator appended after the last token

  return key;
}
/// Recomputes m_key from m_title using Title2Key.
void ArticleInfo::GenerateKey()
{
  // A preceding assignment from str::MakeNormalizeAndLowerUtf8(m_title) was a
  // dead store (overwritten immediately), so only the Title2Key form remains.
  m_key = Title2Key(m_title);
}
bool ArticleInfo::IsValidCoordinates() const
@ -126,3 +162,42 @@ bool ArticleInfo::operator == (ArticleInfo const & r) const
EqualCoord(m_lat, r.m_lat) &&
EqualCoord(m_lon, r.m_lon));
}
namespace
{
/// Checks whether a stop word starts at offset @p pos of @p s.
/// Each table entry includes its trailing space. An entry is considered only
/// when @p query is strictly shorter than that entry and the entry fits
/// inside @p s starting at @p pos.
/// @return true if such an entry matches @p s at @p pos, false otherwise.
bool IsStopWord(string const & s, size_t pos, string const & query)
{
  static char const * kStopWords[] = { "by ", "of ", "on ", "in ", "upon ", "and ", "the " };

  size_t const count = ArraySize(kStopWords);
  for (size_t idx = 0; idx != count; ++idx)
  {
    char const * const word = kStopWords[idx];
    size_t const wordLen = strlen(word);

    if (query.size() >= wordLen)
      continue;
    if (pos + wordLen > s.size())
      continue;
    if (s.compare(pos, wordLen, word) == 0)
      return true;
  }

  return false;
}
}
/// Looks for @p query inside m_key at the start of any token except the first.
/// An occurrence counts when it is not at offset 0, is directly preceded by
/// ' ', and IsStopWord does not reject it.
/// @return true on the first such occurrence, false if there is none.
bool ArticleInfo::PrefixMatchExcept1stToken(string const & query) const
{
  for (size_t pos = 0; pos < m_key.size(); ++pos)
  {
    pos = m_key.find(query, pos);
    if (pos == string::npos)
      break;

    bool const startsLaterToken = (pos != 0 && m_key[pos - 1] == ' ');
    if (startsLaterToken && !IsStopWord(m_key, pos, query))
      return true;
  }

  return false;
}

View file

@ -14,6 +14,7 @@ const int EMPTY_COORD = 1000;
class ArticleInfo
{
void GenerateKey();
static string Title2Key(string const & s);
protected:
string m_key;
@ -52,6 +53,8 @@ public:
GenerateKey();
}
static string Prefix2Key(string const & s);
string const & GetTitle() const { return m_title; }
string GetUrl() const { return m_url + ".html"; }
string GetThumbnailUrl() const { return m_url + ".jpg"; }
@ -71,6 +74,9 @@ public:
bool operator == (ArticleInfo const & r) const;
/// @param[in] query should be simplified, lower case, utf8 string (matching by m_key).
bool PrefixMatchExcept1stToken(string const & query) const;
struct LessStorage
{
bool operator() (ArticleInfo const & i1, ArticleInfo const & i2) const

View file

@ -6,7 +6,6 @@
#include "../std/algorithm.hpp"
#include "../std/utility.hpp"
#include "../std/map.hpp"
#include "../std/iterator.hpp"
@ -38,54 +37,74 @@ void Storage::Load(string const & path)
Load(reader);
}
void Storage::ResultsAccumulator::Add(size_t ind)
{
pair<MapT::iterator, bool> res = m_map.insert(make_pair(m_storage.GetUrl(ind), ind));
// replace redirect index with origin index
if (!res.second && m_storage.IsRedirect(res.first->second) && !m_storage.IsRedirect(ind))
res.first->second = ind;
}
/// @return true if an entry with the same url as article @p ind is already stored.
bool Storage::ResultsAccumulator::IsExist(size_t ind) const
{
  return m_map.count(m_storage.GetUrl(ind)) != 0;
}
/// Appends all accumulated indices to @p out and sorts only the appended
/// tail with LessScore(m_storage, lat, lon); elements already in @p out keep
/// their positions.
void Storage::ResultsAccumulator::GetResults(vector<size_t> & out, double lat, double lon) const
{
  size_t const oldSize = out.size();
  out.reserve(oldSize + m_map.size());

  MapT::const_iterator it = m_map.begin();
  while (it != m_map.end())
  {
    out.push_back(it->second);
    ++it;
  }

  sort(out.begin() + oldSize, out.end(), LessScore(m_storage, lat, lon));
}
/// Rebuilds m_lastQuery (indices into m_info) for the given search prefix.
/// An empty query collects every non-redirect article; otherwise articles are
/// collected first by key-prefix range (equal_range with LessPrefix) and then
/// by PrefixMatchExcept1stToken on every article not already collected.
/// Ordering is done with the LessScore comparator built from (lat, lon).
/// NOTE(review): `query` and `range` are each declared twice in this body, and
/// a map-based accumulation (theMap) sits next to the ResultsAccumulator one -
/// two variants of the same logic appear to be present; confirm which is intended.
void Storage::QueryArticleInfo(string const & prefix, double lat, double lon)
{
m_lastQuery.clear();
string query = prefix;
str::Trim(query);
string query = ArticleInfo::Prefix2Key(prefix);
if (query.empty())
{
size_t const count = m_info.size();
m_lastQuery.reserve(count);
// add all articles except redirects
// Add all articles except redirects
for (size_t i = 0; i < count; ++i)
if (!m_info[i].m_redirect)
m_lastQuery.push_back(i);
sort(m_lastQuery.begin(), m_lastQuery.end(), LessScore(*this, lat, lon));
}
else
{
// find range of articles by input query
// Find and add range of articles by matching input query.
typedef vector<ArticleInfo>::iterator IterT;
pair<IterT, IterT> range = equal_range(m_info.begin(), m_info.end(),
str::MakeNormalizeAndLowerUtf8(query),
pair<IterT, IterT> range = equal_range(m_info.begin(), m_info.end(), query,
ArticleInfo::LessPrefix());
// filter duplicating redirects
map<string, size_t> theMap;
typedef map<string, size_t>::iterator MapIterT;
ResultsAccumulator acc1(*this);
while (range.first != range.second)
{
size_t const ind = distance(m_info.begin(), range.first);
pair<MapIterT, bool> res = theMap.insert(make_pair(range.first->m_url, ind));
// replace redirect index with origin index
if (!res.second && m_info[res.first->second].m_redirect && !m_info[ind].m_redirect)
res.first->second = ind;
acc1.Add(distance(m_info.begin(), range.first));
++range.first;
}
// Append the prefix-range matches; GetResults sorts only what it appends.
acc1.GetResults(m_lastQuery, lat, lon);
// assign results
m_lastQuery.reserve(theMap.size());
for (MapIterT i = theMap.begin(); i != theMap.end(); ++i)
m_lastQuery.push_back(i->second);
// Process all articles by matching 2nd, 3rd, ... tokens
ResultsAccumulator acc2(*this);
for (size_t i = 0; i < m_info.size(); ++i)
{
// Skip articles whose url was already collected by the prefix-range pass.
if (m_info[i].PrefixMatchExcept1stToken(query) && !acc1.IsExist(i))
acc2.Add(i);
}
acc2.GetResults(m_lastQuery, lat, lon);
}
// sort according to score
// NOTE(review): this re-sorts the whole vector, which would discard the
// acc1-before-acc2 grouping produced above - confirm it is still wanted.
sort(m_lastQuery.begin(), m_lastQuery.end(), LessScore(*this, lat, lon));
}
string Storage::FormatParentName(ArticleInfo const & info, int maxDepth) const

View file

@ -6,10 +6,30 @@
#include "../std/vector.hpp"
#include "../std/noncopyable.hpp"
#include "../std/map.hpp"
class Storage : noncopyable
{
/// @return the m_url of the article stored at position @p ind of m_info (no bounds check).
string const & GetUrl(size_t ind) const
{
  ArticleInfo const & info = m_info[ind];
  return info.m_url;
}
/// @return the m_redirect flag of the article stored at position @p ind of m_info (no bounds check).
bool IsRedirect(size_t ind) const
{
  ArticleInfo const & info = m_info[ind];
  return info.m_redirect;
}
class ResultsAccumulator
{
Storage const & m_storage;
typedef map<string, size_t> MapT;
MapT m_map;
public:
ResultsAccumulator(Storage & storage) : m_storage(storage) {}
void Add(size_t ind);
bool IsExist(size_t ind) const;
void GetResults(vector<size_t> & out, double lat, double lon) const;
};
public:
void Load(rd::Reader & reader);
void Load(string const & path);

View file

@ -11,6 +11,22 @@
#include "../std/array.hpp"
// Exercises ArticleInfo::PrefixMatchExcept1stToken on a title containing
// quotes, parentheses, a hyphen and stop words.
// NOTE(review): assumes the constructor builds the key via Title2Key, i.e.
// presumably "loch lomond and the trossachs national park " - confirm.
TEST(ArticleInfo, PrefixMatch)
{
ArticleInfo i(" 'Loch Lomond' and \"The Trossachs\" (National-Park)");
// Matches at the very first token are excluded by design.
EXPECT_FALSE(i.PrefixMatchExcept1stToken("loch"));
EXPECT_TRUE(i.PrefixMatchExcept1stToken("lom"));
// "and " and "the " are in the stop-word table and the query is shorter than the entry.
EXPECT_FALSE(i.PrefixMatchExcept1stToken("and"));
EXPECT_FALSE(i.PrefixMatchExcept1stToken("the"));
EXPECT_TRUE(i.PrefixMatchExcept1stToken("trossachs"));
// '(' and '-' are token delimiters, so both halves of "National-Park" start a token.
EXPECT_TRUE(i.PrefixMatchExcept1stToken("national"));
EXPECT_TRUE(i.PrefixMatchExcept1stToken("park"));
EXPECT_FALSE(i.PrefixMatchExcept1stToken("parke"));
// A query longer than the stop word is not rejected even though it starts with "the ".
EXPECT_TRUE(i.PrefixMatchExcept1stToken("the trossachs national park"));
}
namespace
{