From 2380dd73bea3e20199cf3534a7c21ef1bd1cbab3 Mon Sep 17 00:00:00 2001
From: Yury Melnichek
Date: Sun, 29 May 2011 17:02:34 +0200
Subject: [PATCH] [search] Use normalization and case folding. Refactor TokenizeIterator.

---
 base/string_utils.hpp   | 18 ++++++++++++------
 search/string_match.hpp |  6 ++++--
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/base/string_utils.hpp b/base/string_utils.hpp
index e13eb2b7de..4926611351 100644
--- a/base/string_utils.hpp
+++ b/base/string_utils.hpp
@@ -31,11 +31,10 @@ inline UniString MakeUniString(string const & s)
   return result;
 }
 
-template
+template
 class TokenizeIterator
 {
-  typedef utf8::unchecked::iterator Utf8IterT;
-  Utf8IterT m_beg, m_end, m_finish;
+  UniCharIterT m_beg, m_end, m_finish;
   DelimFuncT m_delimFunc;
 
   /// Explicitly disabled, because we're storing iterators for string
@@ -68,6 +67,12 @@ public:
     move();
   }
 
+  TokenizeIterator(UniString const & s, DelimFuncT delimFunc)
+  : m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc)
+  {
+    move();
+  }
+
   string operator*() const
   {
     ASSERT( m_beg != m_finish, ("dereferencing of empty iterator") );
@@ -86,7 +91,7 @@ public:
   {
     if (!*this)
       return false;
-    TokenizeIterator copy(*this);
+    TokenizeIterator copy(*this);
     ++copy;
     return !copy;
   }
@@ -94,7 +99,7 @@ public:
   UniString GetUniString() const
   {
     UniString result;
-    Utf8IterT iter(m_beg);
+    UniCharIterT iter(m_beg);
     while (iter != m_end)
     {
       result.push_back(*iter);
@@ -113,7 +118,8 @@ public:
   bool operator()(UniChar c) const;
 };
 
-typedef TokenizeIterator SimpleTokenizer;
+typedef TokenizeIterator >
+    SimpleTokenizer;
 
 template
 void Tokenize(string const & str, char const * delims, FunctorT f)
diff --git a/search/string_match.hpp b/search/string_match.hpp
index 97e31f808b..db91f6d002 100644
--- a/search/string_match.hpp
+++ b/search/string_match.hpp
@@ -15,8 +15,10 @@ namespace impl
 template
 void SplitAndNormalizeAndSimplifyString(string const & s, F f, DelimsT const & delims)
 {
-  for (strings::TokenizeIterator iter(s, delims); iter; ++iter)
-    f(strings::MakeLowerCase(iter.GetUniString()));
+  strings::UniString uniS = strings::MakeLowerCase(strings::MakeUniString(s));
+  strings::Normalize(uniS);
+  for (strings::TokenizeIterator iter(uniS, delims); iter; ++iter)
+    f(iter.GetUniString());
 }
 
 struct MatchCostData