forked from organicmaps/organicmaps
Merge pull request #3571 from ygorshenin/tokenizer-with-empty-tokens
[base] Implemented empty-tokens-support to tokenizer.
This commit is contained in:
commit
9201925db0
3 changed files with 241 additions and 82 deletions
|
@ -332,32 +332,47 @@ struct FunctorTester
|
|||
size_t & m_index;
|
||||
vector<string> const & m_tokens;
|
||||
|
||||
explicit FunctorTester(size_t & counter, vector<string> const & tokens)
|
||||
: m_index(counter), m_tokens(tokens) {}
|
||||
FunctorTester(size_t & counter, vector<string> const & tokens)
|
||||
: m_index(counter), m_tokens(tokens)
|
||||
{
|
||||
}
|
||||
|
||||
void operator()(string const & s)
|
||||
{
|
||||
TEST_EQUAL(s, m_tokens[m_index++], ());
|
||||
}
|
||||
};
|
||||
|
||||
void TestIter(string const & str, char const * delims, vector<string> const & tokens)
|
||||
void TestIter(string const & s, char const * delims, vector<string> const & tokens)
|
||||
{
|
||||
strings::SimpleTokenizer it(str, delims);
|
||||
strings::SimpleTokenizer it(s, delims);
|
||||
for (size_t i = 0; i < tokens.size(); ++i)
|
||||
{
|
||||
TEST_EQUAL(true, it, (str, delims, i));
|
||||
TEST_EQUAL(i == tokens.size() - 1, it.IsLast(), ());
|
||||
TEST_EQUAL(*it, tokens[i], (str, delims, i));
|
||||
TEST(it, (s, delims, i));
|
||||
TEST_EQUAL(*it, tokens[i], (s, delims, i));
|
||||
++it;
|
||||
}
|
||||
TEST_EQUAL(false, it, (str, delims));
|
||||
TEST(!it, (s, delims));
|
||||
|
||||
size_t counter = 0;
|
||||
FunctorTester f = FunctorTester(counter, tokens);
|
||||
strings::Tokenize(str, delims, f);
|
||||
FunctorTester f(counter, tokens);
|
||||
strings::Tokenize(s, delims, f);
|
||||
TEST_EQUAL(counter, tokens.size(), ());
|
||||
}
|
||||
|
||||
// Verifies that SimpleTokenizerWithEmptyTokens, constructed from |s|
// and |delims|, yields exactly the strings in |tokens|, in order, and
// that the iterator reports exhaustion right after the last one.
void TestIterWithEmptyTokens(string const & s, char const * delims, vector<string> const & tokens)
{
  strings::SimpleTokenizerWithEmptyTokens it(s, delims);

  // The tokenizer advances in lock-step with the index into |tokens|.
  for (size_t i = 0; i < tokens.size(); ++i, ++it)
  {
    TEST(it, (s, delims, i));
    TEST_EQUAL(*it, tokens[i], (s, delims, i));
  }

  // No extra tokens may remain once every expected one has been seen.
  TEST(!it, (s, delims));
}
|
||||
|
||||
UNIT_TEST(SimpleTokenizer)
|
||||
{
|
||||
vector<string> tokens;
|
||||
|
@ -402,6 +417,42 @@ UNIT_TEST(SimpleTokenizer)
|
|||
TEST_EQUAL(vector<string>(SimpleTokenizer(str, ","), SimpleTokenizer()),
|
||||
(vector<string>{"a", "b", "c"}), ());
|
||||
}
|
||||
|
||||
{
|
||||
string const s = "";
|
||||
vector<string> const tokens = {""};
|
||||
TestIterWithEmptyTokens(s, ",", tokens);
|
||||
}
|
||||
|
||||
{
|
||||
string const s = ";";
|
||||
vector<string> const tokens = {"", ""};
|
||||
TestIterWithEmptyTokens(s, ";", tokens);
|
||||
}
|
||||
|
||||
{
|
||||
string const s = ";;";
|
||||
vector<string> const tokens = {"", "", ""};
|
||||
TestIterWithEmptyTokens(s, ";", tokens);
|
||||
}
|
||||
|
||||
{
|
||||
string const s = "Hello, World!";
|
||||
vector<string> const tokens = {s};
|
||||
TestIterWithEmptyTokens(s, "", tokens);
|
||||
}
|
||||
|
||||
{
|
||||
string const s = "Hello, World!";
|
||||
vector<string> const tokens = {"Hello", " World", ""};
|
||||
TestIterWithEmptyTokens(s, ",!", tokens);
|
||||
}
|
||||
|
||||
{
|
||||
string const s = ";a;b;;c;d;";
|
||||
vector<string> const tokens = {"", "a", "b", "", "c", "d", ""};
|
||||
TestIterWithEmptyTokens(s, ";", tokens);
|
||||
}
|
||||
}
|
||||
|
||||
UNIT_TEST(LastUniChar)
|
||||
|
|
|
@ -16,9 +16,9 @@ bool UniString::IsEqualAscii(char const * s) const
|
|||
return (size() == strlen(s) && equal(begin(), end(), s));
|
||||
}
|
||||
|
||||
SimpleDelimiter::SimpleDelimiter(char const * delimChars)
|
||||
SimpleDelimiter::SimpleDelimiter(char const * delims)
|
||||
{
|
||||
string const s(delimChars);
|
||||
string const s(delims);
|
||||
string::const_iterator it = s.begin();
|
||||
while (it != s.end())
|
||||
m_delims.push_back(utf8::unchecked::next(it));
|
||||
|
|
|
@ -89,82 +89,198 @@ bool IsASCIILatin(UniChar c);
|
|||
|
||||
inline string DebugPrint(UniString const & s) { return ToUtf8(s); }
|
||||
|
||||
template <typename DelimFuncT, typename UniCharIterT = UniString::const_iterator>
|
||||
template <typename TDelimFn, typename TIt = UniString::const_iterator, bool KeepEmptyTokens = false>
|
||||
class TokenizeIterator
|
||||
{
|
||||
UniCharIterT m_beg, m_end, m_finish;
|
||||
DelimFuncT m_delimFunc;
|
||||
|
||||
void move()
|
||||
{
|
||||
m_beg = m_end;
|
||||
while (m_beg != m_finish)
|
||||
{
|
||||
if (m_delimFunc(*m_beg))
|
||||
++m_beg;
|
||||
else
|
||||
break;
|
||||
}
|
||||
m_end = m_beg;
|
||||
while (m_end != m_finish)
|
||||
{
|
||||
if (m_delimFunc(*m_end))
|
||||
break;
|
||||
else
|
||||
++m_end;
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
/// @warning string S must be not temporary!
|
||||
TokenizeIterator(string const & s, DelimFuncT const & delimFunc)
|
||||
: m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc)
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using value_type = string;
|
||||
using pointer = void;
|
||||
using reference = string;
|
||||
using iterator_category = std::input_iterator_tag;
|
||||
|
||||
// *NOTE* |s| must be not temporary!
|
||||
TokenizeIterator(string const & s, TDelimFn const & delimFn)
|
||||
: m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn)
|
||||
{
|
||||
move();
|
||||
Move();
|
||||
}
|
||||
|
||||
/// @warning unistring S must be not temporary!
|
||||
TokenizeIterator(UniString const & s, DelimFuncT const & delimFunc)
|
||||
: m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc)
|
||||
// *NOTE* |s| must be not temporary!
|
||||
TokenizeIterator(UniString const & s, TDelimFn const & delimFn)
|
||||
: m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn)
|
||||
{
|
||||
move();
|
||||
Move();
|
||||
}
|
||||
|
||||
/// Use default-constructed iterator for operator == to determine an end of a token stream.
|
||||
// Use default-constructed iterator for operator == to determine the
|
||||
// end of the token stream.
|
||||
TokenizeIterator() = default;
|
||||
|
||||
/// Explicitly disabled, because we're storing iterators for string
|
||||
TokenizeIterator(char const *, DelimFuncT const &) = delete;
|
||||
|
||||
string operator*() const
|
||||
{
|
||||
ASSERT(m_beg != m_finish, ("dereferencing of empty iterator"));
|
||||
return string(m_beg.base(), m_end.base());
|
||||
ASSERT(m_start != m_finish, ("Dereferencing of empty iterator."));
|
||||
return string(m_start.base(), m_end.base());
|
||||
}
|
||||
|
||||
operator bool() const { return m_beg != m_finish; }
|
||||
|
||||
UniString GetUniString() const
|
||||
{
|
||||
ASSERT(m_start != m_finish, ("Dereferencing of empty iterator."));
|
||||
return UniString(m_start, m_end);
|
||||
}
|
||||
|
||||
operator bool() const { return m_start != m_finish; }
|
||||
|
||||
TokenizeIterator & operator++()
|
||||
{
|
||||
move();
|
||||
return (*this);
|
||||
Move();
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool IsLast() const
|
||||
bool operator==(TokenizeIterator const & rhs) const
|
||||
{
|
||||
if (!*this)
|
||||
return false;
|
||||
|
||||
TokenizeIterator<DelimFuncT, UniCharIterT> copy(*this);
|
||||
++copy;
|
||||
return !copy;
|
||||
if (!*this && !rhs)
|
||||
return true;
|
||||
if (*this && rhs)
|
||||
return m_start == rhs.m_start && m_end == rhs.m_end && m_finish == rhs.m_finish;
|
||||
return false;
|
||||
}
|
||||
|
||||
UniString GetUniString() const { return UniString(m_beg, m_end); }
|
||||
/// Same as operator bool() in expression it == end(...)
|
||||
bool operator==(TokenizeIterator const &) { return !(*this); }
|
||||
/// Same as operator bool() in expression it != end(...)
|
||||
bool operator!=(TokenizeIterator const &) { return (*this); }
|
||||
bool operator!=(TokenizeIterator const & rhs) const { return !(*this == rhs); }
|
||||
|
||||
private:
|
||||
void Move()
|
||||
{
|
||||
m_start = m_end;
|
||||
while (m_start != m_finish && m_delimFn(*m_start))
|
||||
++m_start;
|
||||
|
||||
m_end = m_start;
|
||||
while (m_end != m_finish && !m_delimFn(*m_end))
|
||||
++m_end;
|
||||
}
|
||||
|
||||
// Token is defined as a pair (|m_start|, |m_end|), where:
|
||||
//
|
||||
// * m_start < m_end
|
||||
// * m_start == begin or m_delimFn(m_start - 1)
|
||||
// * m_end == m_finish or m_delimFn(m_end)
|
||||
// * for all i from [m_start, m_end): !m_delimFn(i)
|
||||
//
|
||||
// This version of TokenizeIterator iterates over all tokens and
|
||||
// keeps the invariant above.
|
||||
TIt m_start;
|
||||
TIt m_end;
|
||||
|
||||
// The end of the string the iterator iterates over.
|
||||
TIt m_finish;
|
||||
|
||||
TDelimFn m_delimFn;
|
||||
};
|
||||
|
||||
template <typename TDelimFn, typename TIt>
|
||||
class TokenizeIterator<TDelimFn, TIt, true /* KeepEmptyTokens */>
|
||||
{
|
||||
public:
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using value_type = string;
|
||||
using pointer = void;
|
||||
using reference = string;
|
||||
using iterator_category = std::input_iterator_tag;
|
||||
|
||||
// *NOTE* |s| must be not temporary!
|
||||
TokenizeIterator(string const & s, TDelimFn const & delimFn)
|
||||
: m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn), m_finished(false)
|
||||
{
|
||||
while (m_end != m_finish && !m_delimFn(*m_end))
|
||||
++m_end;
|
||||
}
|
||||
|
||||
// *NOTE* |s| must be not temporary!
|
||||
TokenizeIterator(UniString const & s, TDelimFn const & delimFn)
|
||||
: m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn), m_finished(false)
|
||||
{
|
||||
while (m_end != m_finish && !m_delimFn(*m_end))
|
||||
++m_end;
|
||||
}
|
||||
|
||||
// Use default-constructed iterator for operator == to determine the
|
||||
// end of the token stream.
|
||||
TokenizeIterator() = default;
|
||||
|
||||
string operator*() const
|
||||
{
|
||||
ASSERT(!m_finished, ("Dereferencing of empty iterator."));
|
||||
return string(m_start.base(), m_end.base());
|
||||
}
|
||||
|
||||
UniString GetUniString() const
|
||||
{
|
||||
ASSERT(!m_finished, ("Dereferencing of empty iterator."));
|
||||
return UniString(m_start, m_end);
|
||||
}
|
||||
|
||||
operator bool() const { return !m_finished; }
|
||||
|
||||
TokenizeIterator & operator++()
|
||||
{
|
||||
Move();
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool operator==(TokenizeIterator const & rhs) const
|
||||
{
|
||||
if (!*this && !rhs)
|
||||
return true;
|
||||
if (*this && rhs)
|
||||
{
|
||||
return m_start == rhs.m_start && m_end == rhs.m_end && m_finish == rhs.m_finish &&
|
||||
m_finished == rhs.m_finished;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool operator!=(TokenizeIterator const & rhs) const { return !(*this == rhs); }
|
||||
|
||||
private:
|
||||
void Move()
|
||||
{
|
||||
if (m_end == m_finish)
|
||||
{
|
||||
ASSERT(!m_finished, ());
|
||||
m_start = m_end = m_finish;
|
||||
m_finished = true;
|
||||
return;
|
||||
}
|
||||
|
||||
m_start = m_end;
|
||||
++m_start;
|
||||
|
||||
m_end = m_start;
|
||||
while (m_end != m_finish && !m_delimFn(*m_end))
|
||||
++m_end;
|
||||
}
|
||||
|
||||
// Token is defined as a pair (|m_start|, |m_end|), where:
|
||||
//
|
||||
// * m_start <= m_end
|
||||
// * m_start == begin or m_delimFn(m_start - 1)
|
||||
// * m_end == m_finish or m_delimFn(m_end)
|
||||
// * for all i from [m_start, m_end): !m_delimFn(i)
|
||||
//
|
||||
// This version of TokenizeIterator iterates over all tokens and
|
||||
// keeps the invariant above.
|
||||
TIt m_start;
|
||||
TIt m_end;
|
||||
|
||||
// The end of the string the iterator iterates over.
|
||||
TIt m_finish;
|
||||
|
||||
TDelimFn m_delimFn;
|
||||
|
||||
// When true, iterator is at the end position and is not valid
|
||||
// anymore.
|
||||
bool m_finished;
|
||||
};
|
||||
|
||||
class SimpleDelimiter
|
||||
|
@ -172,15 +288,20 @@ class SimpleDelimiter
|
|||
UniString m_delims;
|
||||
|
||||
public:
|
||||
SimpleDelimiter(char const * delimChars);
|
||||
SimpleDelimiter(char const * delims);
|
||||
|
||||
// Used in TokenizeIterator to allow past the end iterator construction.
|
||||
SimpleDelimiter() = default;
|
||||
/// @return true if c is delimiter
|
||||
bool operator()(UniChar c) const;
|
||||
};
|
||||
|
||||
typedef TokenizeIterator<SimpleDelimiter, ::utf8::unchecked::iterator<string::const_iterator>>
|
||||
SimpleTokenizer;
|
||||
using SimpleTokenizer =
|
||||
TokenizeIterator<SimpleDelimiter, ::utf8::unchecked::iterator<string::const_iterator>,
|
||||
false /* KeepEmptyTokens */>;
|
||||
using SimpleTokenizerWithEmptyTokens =
|
||||
TokenizeIterator<SimpleDelimiter, ::utf8::unchecked::iterator<string::const_iterator>,
|
||||
true /* KeepEmptyTokens */>;
|
||||
|
||||
template <typename TFunctor>
|
||||
void Tokenize(string const & str, char const * delims, TFunctor && f)
|
||||
|
@ -389,16 +510,3 @@ size_t EditDistance(TIter const & b1, TIter const & e1, TIter const & b2, TIter
|
|||
return prev[m];
|
||||
}
|
||||
} // namespace strings
|
||||
|
||||
namespace std
|
||||
{
|
||||
template <typename... Args>
|
||||
struct iterator_traits<strings::TokenizeIterator<Args...>>
|
||||
{
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using value_type = string;
|
||||
using pointer = void;
|
||||
using reference = string;
|
||||
using iterator_category = std::input_iterator_tag;
|
||||
};
|
||||
} // namespace std
|
||||
|
|
Loading…
Add table
Reference in a new issue