[search] Use normalization and case folding. Refactor TokenizeIterator.

This commit is contained in:
Yury Melnichek 2011-05-29 17:02:34 +02:00 committed by Alex Zolotarev
parent 2929b89ced
commit 2380dd73be
2 changed files with 16 additions and 8 deletions

View file

@ -31,11 +31,10 @@ inline UniString MakeUniString(string const & s)
return result;
}
template <typename DelimFuncT>
template <typename DelimFuncT, typename UniCharIterT = UniString::const_iterator>
class TokenizeIterator
{
typedef utf8::unchecked::iterator<string::const_iterator> Utf8IterT;
Utf8IterT m_beg, m_end, m_finish;
UniCharIterT m_beg, m_end, m_finish;
DelimFuncT m_delimFunc;
/// Explicitly disabled, because we're storing iterators for string
@ -68,6 +67,12 @@ public:
move();
}
/// Constructs a tokenizer over the UniString @p s, keeping a copy of @p delimFunc
/// in m_delimFunc (presumably the predicate that decides which characters are delimiters).
/// Only iterators into @p s are stored - the string itself is not copied - so @p s
/// must stay alive and unmodified for as long as this iterator is used; passing a
/// temporary would leave the stored iterators dangling.
/// Requires UniCharIterT to be constructible from UniString::const_iterator.
TokenizeIterator(UniString const & s, DelimFuncT delimFunc)
: m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc)
{
// Token range starts empty at the beginning of s (m_beg == m_end); move() is defined
// outside this excerpt - presumably it advances to the first token. TODO confirm.
move();
}
string operator*() const
{
ASSERT( m_beg != m_finish, ("dereferencing of empty iterator") );
@ -86,7 +91,7 @@ public:
{
if (!*this)
return false;
TokenizeIterator<DelimFuncT> copy(*this);
TokenizeIterator<DelimFuncT, UniCharIterT> copy(*this);
++copy;
return !copy;
}
@ -94,7 +99,7 @@ public:
UniString GetUniString() const
{
UniString result;
Utf8IterT iter(m_beg);
UniCharIterT iter(m_beg);
while (iter != m_end)
{
result.push_back(*iter);
@ -113,7 +118,8 @@ public:
bool operator()(UniChar c) const;
};
typedef TokenizeIterator<SimpleDelimiter> SimpleTokenizer;
typedef TokenizeIterator<SimpleDelimiter,
::utf8::unchecked::iterator<string::const_iterator> > SimpleTokenizer;
template <typename FunctorT>
void Tokenize(string const & str, char const * delims, FunctorT f)

View file

@ -15,8 +15,10 @@ namespace impl
/// Splits @p s into tokens with strings::TokenizeIterator using @p delims and calls
/// @p f with each token as a UniString.
/// NOTE(review): this listing is a rendered diff without +/- markers. The first loop
/// (tokenizing s directly and lower-casing each token) looks like the pre-change code,
/// and the lines from `uniS` onward look like its replacement - confirm against the repository.
template <class DelimsT, typename F>
void SplitAndNormalizeAndSimplifyString(string const & s, F f, DelimsT const & delims)
{
for (strings::TokenizeIterator<DelimsT> iter(s, delims); iter; ++iter)
f(strings::MakeLowerCase(iter.GetUniString()));
// Convert the whole input to a UniString and pass it through MakeLowerCase once,
// before tokenizing, instead of processing every token separately.
strings::UniString uniS = strings::MakeLowerCase(strings::MakeUniString(s));
// Presumably normalizes uniS in place (the return value, if any, is not used). TODO confirm.
strings::Normalize(uniS);
// uniS is a local that outlives the loop, so the iterators held by the tokenizer stay valid.
for (strings::TokenizeIterator<DelimsT> iter(uniS, delims); iter; ++iter)
f(iter.GetUniString());
}
struct MatchCostData