From 90d01a652f231684cd48b1ea9bde382cf562c990 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Wed, 15 Jun 2016 14:43:38 +0300 Subject: [PATCH] [base] Implemented empty-tokens-support to tokenizer. --- base/base_tests/string_utils_test.cpp | 69 +++++++-- base/string_utils.cpp | 4 +- base/string_utils.hpp | 202 +++++++++++++++++--------- 3 files changed, 193 insertions(+), 82 deletions(-) diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp index aa7e79074d..6e8c36a6b7 100644 --- a/base/base_tests/string_utils_test.cpp +++ b/base/base_tests/string_utils_test.cpp @@ -332,32 +332,47 @@ struct FunctorTester size_t & m_index; vector const & m_tokens; - explicit FunctorTester(size_t & counter, vector const & tokens) - : m_index(counter), m_tokens(tokens) {} + FunctorTester(size_t & counter, vector const & tokens) + : m_index(counter), m_tokens(tokens) + { + } + void operator()(string const & s) { TEST_EQUAL(s, m_tokens[m_index++], ()); } }; -void TestIter(string const & str, char const * delims, vector const & tokens) +void TestIter(string const & s, char const * delims, vector const & tokens) { - strings::SimpleTokenizer it(str, delims); + strings::SimpleTokenizer it(s, delims); for (size_t i = 0; i < tokens.size(); ++i) { - TEST_EQUAL(true, it, (str, delims, i)); - TEST_EQUAL(i == tokens.size() - 1, it.IsLast(), ()); - TEST_EQUAL(*it, tokens[i], (str, delims, i)); + TEST_EQUAL(true, it, (s, delims, i)); + TEST_EQUAL(*it, tokens[i], (s, delims, i)); ++it; } - TEST_EQUAL(false, it, (str, delims)); + TEST_EQUAL(false, it, (s, delims)); size_t counter = 0; FunctorTester f = FunctorTester(counter, tokens); - strings::Tokenize(str, delims, f); + strings::Tokenize(s, delims, f); TEST_EQUAL(counter, tokens.size(), ()); } +void TestIterWithEmptyTokens(string const & s, char const * delims, vector const & tokens) +{ + strings::SimpleTokenizerWithEmptyTokens it(s, delims); + + for (size_t i = 0; i < tokens.size(); ++i) + { + TEST_EQUAL(true, it, (s, delims, i)); + TEST_EQUAL(*it, tokens[i], (s, delims, i)); + ++it; + } + TEST_EQUAL(false, it, (s, delims)); +} + UNIT_TEST(SimpleTokenizer) { vector tokens; @@ -402,6 +417,42 @@ UNIT_TEST(SimpleTokenizer) TEST_EQUAL(vector(SimpleTokenizer(str, ","), SimpleTokenizer()), (vector{"a", "b", "c"}), ()); } + + { + string const s = ""; + vector tokens = {""}; + TestIterWithEmptyTokens(s, ",", tokens); + } + + { + string const s = ","; + vector tokens = {"", ""}; + TestIterWithEmptyTokens(s, ",", tokens); + } + + { + string const s = ",,"; + vector tokens = {"", "", ""}; + TestIterWithEmptyTokens(s, ",", tokens); + } + + { + string const s = "Hello, World!"; + vector tokens = {s}; + TestIterWithEmptyTokens(s, "", tokens); + } + + { + string const s = "Hello, World!"; + vector tokens = {"Hello", " World", ""}; + TestIterWithEmptyTokens(s, ",!", tokens); + } + + { + string const s = ",a,b,,c,d,"; + vector tokens = {"", "a", "b", "", "c", "d", ""}; + TestIterWithEmptyTokens(s, ",", tokens); + } } UNIT_TEST(LastUniChar) diff --git a/base/string_utils.cpp b/base/string_utils.cpp index d5fe4d7c43..d36b33a853 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -16,9 +16,9 @@ bool UniString::IsEqualAscii(char const * s) const return (size() == strlen(s) && equal(begin(), end(), s)); } -SimpleDelimiter::SimpleDelimiter(char const * delimChars) +SimpleDelimiter::SimpleDelimiter(char const * delims) { - string const s(delimChars); + string const s(delims); string::const_iterator it = s.begin(); while (it != s.end()) m_delims.push_back(utf8::unchecked::next(it)); diff --git a/base/string_utils.hpp b/base/string_utils.hpp index 85396b7f19..33b7af7d2d 100644 --- a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -89,82 +89,150 @@ bool IsASCIILatin(UniChar c); inline string DebugPrint(UniString const & s) { return ToUtf8(s); } -template +template class TokenizeIterator { - UniCharIterT m_beg, m_end, m_finish; - DelimFuncT m_delimFunc; - - void move() - { - m_beg = m_end; - while (m_beg != m_finish) - { - if (m_delimFunc(*m_beg)) - ++m_beg; - else - break; - } - m_end = m_beg; - while (m_end != m_finish) - { - if (m_delimFunc(*m_end)) - break; - else - ++m_end; - } - } - public: - /// @warning string S must be not temporary! - TokenizeIterator(string const & s, DelimFuncT const & delimFunc) - : m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc) + using difference_type = std::ptrdiff_t; + using value_type = string; + using pointer = void; + using reference = string; + using iterator_category = std::input_iterator_tag; + + // *NOTE* |s| must be not temporary! + TokenizeIterator(string const & s, TDelimFn const & delimFn) + : m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn) { - move(); + Move(); } - /// @warning unistring S must be not temporary! - TokenizeIterator(UniString const & s, DelimFuncT const & delimFunc) - : m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc) + // *NOTE* |s| must be not temporary! + TokenizeIterator(UniString const & s, TDelimFn const & delimFn) + : m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn) { - move(); + Move(); } - /// Use default-constructed iterator for operator == to determine an end of a token stream. + // Use default-constructed iterator for operator == to determine an + // end of a token stream. TokenizeIterator() = default; - /// Explicitly disabled, because we're storing iterators for string - TokenizeIterator(char const *, DelimFuncT const &) = delete; - string operator*() const { - ASSERT(m_beg != m_finish, ("dereferencing of empty iterator")); - return string(m_beg.base(), m_end.base()); + ASSERT(m_start != m_finish, ("Dereferencing of empty iterator.")); + return string(m_start.base(), m_end.base()); } - operator bool() const { return m_beg != m_finish; } - + UniString GetUniString() const + { + ASSERT(m_start != m_finish, ("Dereferencing of empty iterator.")); + return UniString(m_start, m_end); + } + + operator bool() const { return m_start != m_finish; } + TokenizeIterator & operator++() { - move(); - return (*this); + Move(); + return *this; } - bool IsLast() const - { - if (!*this) - return false; - - TokenizeIterator copy(*this); - ++copy; - return !copy; - } - - UniString GetUniString() const { return UniString(m_beg, m_end); } - /// Same as operator bool() in expression it == end(...) + // Same as operator bool() in expression it == end(...). bool operator==(TokenizeIterator const &) { return !(*this); } - /// Same as operator bool() in expression it != end(...) + + // Same as operator bool() in expression it != end(...). bool operator!=(TokenizeIterator const &) { return (*this); } + +private: + void Move() + { + m_start = m_end; + while (m_start != m_finish && m_delimFn(*m_start)) + ++m_start; + + m_end = m_start; + while (m_end != m_finish && !m_delimFn(*m_end)) + ++m_end; + } + + TIt m_start; + TIt m_end; + TIt m_finish; + TDelimFn m_delimFn; +}; + +template +class TokenizeIterator +{ +public: + using difference_type = std::ptrdiff_t; + using value_type = string; + using pointer = void; + using reference = string; + using iterator_category = std::input_iterator_tag; + + // *NOTE* |s| must be not temporary! + TokenizeIterator(string const & s, TDelimFn const & delimFn) + : m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn), m_finished(false) + { + while (m_end != m_finish && !m_delimFn(*m_end)) + ++m_end; + } + + // Use default-constructed iterator for operator == to determine an + // end of a token stream. + TokenizeIterator() = default; + + string operator*() const + { + ASSERT(!m_finished, ("Dereferencing of empty iterator.")); + return string(m_start.base(), m_end.base()); + } + + UniString GetUniString() const + { + ASSERT(!m_finished, ("Dereferencing of empty iterator.")); + return UniString(m_start, m_end); + } + + operator bool() const { return !m_finished; } + + TokenizeIterator & operator++() + { + Move(); + return *this; + } + + // Same as operator bool() in expression it == end(...). + bool operator==(TokenizeIterator const &) { return !(*this); } + + // Same as operator bool() in expression it != end(...). + bool operator!=(TokenizeIterator const &) { return (*this); } + +private: + void Move() + { + if (m_end == m_finish) + { + ASSERT(!m_finished, ()); + m_start = m_end = m_finish; + m_finished = true; + return; + } + + m_start = m_end; + ++m_start; + + m_end = m_start; + while (m_end != m_finish && !m_delimFn(*m_end)) + ++m_end; + } + + TIt m_start; + TIt m_end; + TIt m_finish; + TDelimFn m_delimFn; + bool m_finished; }; class SimpleDelimiter @@ -172,15 +240,20 @@ class SimpleDelimiter UniString m_delims; public: - SimpleDelimiter(char const * delimChars); + SimpleDelimiter(char const * delims); + // Used in TokenizeIterator to allow past the end iterator construction. SimpleDelimiter() = default; /// @return true if c is delimiter bool operator()(UniChar c) const; }; -typedef TokenizeIterator> - SimpleTokenizer; +using SimpleTokenizer = + TokenizeIterator, + false /* KeepEmptyTokens */>; +using SimpleTokenizerWithEmptyTokens = + TokenizeIterator, + true /* KeepEmptyTokens */>; template void Tokenize(string const & str, char const * delims, TFunctor && f) @@ -389,16 +462,3 @@ size_t EditDistance(TIter const & b1, TIter const & e1, TIter const & b2, TIter return prev[m]; } } // namespace strings - -namespace std -{ -template -struct iterator_traits> -{ - using difference_type = std::ptrdiff_t; - using value_type = string; - using pointer = void; - using reference = string; - using iterator_category = std::input_iterator_tag; -}; -} // namespace std