Refactored base/string_utils

This commit is contained in:
Alex Zolotarev 2011-05-23 17:36:45 +02:00 committed by Alex Zolotarev
parent aca0b79b0d
commit d6389417b0
9 changed files with 241 additions and 150 deletions

View file

@ -1,6 +1,8 @@
#include "../../testing/testing.hpp"
#include "../string_utils.hpp"
#include "../../std/bind.hpp"
UNIT_TEST(make_lower_case)
{
string s;
@ -51,3 +53,63 @@ UNIT_TEST(to_string)
TEST_EQUAL(strings::to_string(0.56), "0.56", ());
TEST_EQUAL(strings::to_string(-100.2), "-100.2", ());
}
// Functor passed to strings::Tokenize; verifies that tokens arrive
// in exactly the expected order, advancing a shared counter as it goes.
struct FunctorTester
{
  size_t & m_index;                 // external counter, incremented per token
  vector<string> const & m_tokens;  // expected tokens, in order

  explicit FunctorTester(size_t & counter, vector<string> const & tokens)
    : m_index(counter), m_tokens(tokens)
  {
  }

  void operator()(string const & s)
  {
    size_t const i = m_index;
    ++m_index;
    TEST_EQUAL(s, m_tokens[i], ());
  }
};
void TestIter(string const & str, char const * delims, vector<string> const & tokens)
{
strings::SimpleTokenizer it(str, delims);
for (size_t i = 0; i < tokens.size(); ++i)
{
TEST_EQUAL(true, it, (str, delims, i));
TEST_EQUAL(i == tokens.size() - 1, it.IsLast(), ());
TEST_EQUAL(*it, tokens[i], (str, delims, i));
++it;
}
TEST_EQUAL(false, it, (str, delims));
size_t counter = 0;
FunctorTester f = FunctorTester(counter, tokens);
strings::Tokenize(str, delims, f);
TEST_EQUAL(counter, tokens.size(), ());
}
UNIT_TEST(SimpleTokenizer)
{
  vector<string> tokens;

  // Empty input and/or empty delimiter set yields no tokens.
  TestIter("", "", tokens);
  TestIter("", "; ", tokens);
  // An input made entirely of delimiters also yields no tokens.
  TestIter(" : ; , ;", "; :,", tokens);

  {
    char const * s[] = {"hello"};
    tokens.assign(s, s + ARRAY_SIZE(s));
    TestIter("hello", ";", tokens);
  }

  {
    char const * s[] = {"hello", "world"};
    tokens.assign(s, s + ARRAY_SIZE(s));
    TestIter(" hello, world!", ", !", tokens);
  }

  {
    // UTF-8 (Arabic) input: one of the delimiter characters is multi-byte,
    // exercising the tokenizer's UTF-8-aware delimiter matching.
    char const * s[] = {"\xD9\x80", "\xD8\xA7\xD9\x84\xD9\x85\xD9\x88\xD8\xA7\xD9\x81\xD9\x82",
                        "\xD8\xAC"};
    tokens.assign(s, s + ARRAY_SIZE(s));
    TestIter("\xD9\x87\xD9\x80 - \xD8\xA7\xD9\x84\xD9\x85\xD9\x88\xD8\xA7\xD9\x81\xD9\x82 \xD9\x87\xD8\xAC",
             " -\xD9\x87", tokens);
  }
}

View file

@ -18,6 +18,8 @@ public:
typedef T const & const_reference;
typedef T & reference;
typedef size_t size_type;
typedef T const * const_iterator;
typedef T * iterator;
buffer_vector() : m_size(0) {}
explicit buffer_vector(size_t n, T c = T()) : m_size(0)

View file

@ -2,35 +2,27 @@
#include "assert.hpp"
#include "../std/sstream.hpp"
#include "../std/iterator.hpp"
#include <locale> // for make_lower_case
namespace strings
{
TokenizeIterator::TokenizeIterator(string const & s, char const * delim)
: m_start(0), m_src(s), m_delim(delim)
SimpleDelimiter::SimpleDelimiter(char const * delimChars)
{
move();
string const s(delimChars);
string::const_iterator it = s.begin();
while (it != s.end())
m_delims.push_back(utf8::unchecked::next(it));
}
void TokenizeIterator::move()
bool SimpleDelimiter::operator()(UniChar c) const
{
m_end = m_src.find_first_of(m_delim, m_start);
if (m_end == string::npos) m_end = m_src.size();
}
string TokenizeIterator::operator*() const
{
ASSERT ( !end(), ("dereference of empty iterator") );
return m_src.substr(m_start, m_end - m_start);
}
TokenizeIterator & TokenizeIterator::operator++()
{
m_start = m_end + 1;
move();
return (*this);
for (UniString::const_iterator it = m_delims.begin(); it != m_delims.end(); ++it)
if (*it == c)
return true;
return false;
}
bool to_int(char const * s, int & i)

View file

@ -1,127 +1,162 @@
#pragma once
#include "../base/buffer_vector.hpp"
#include "../std/string.hpp"
#include "../std/stdint.hpp"
#include "../std/sstream.hpp"
#include "../3party/utfcpp/source/utf8/unchecked.h"
/// All methods work with strings in utf-8 format
namespace strings
{
// get substrings from s divided by delim and pass them to toDo
template <class ToDo> void TokenizeString(string const & s, char const * delim, ToDo toDo)
typedef uint32_t UniChar;
typedef buffer_vector<UniChar, 32> UniString;
template <typename DelimFuncT>
class TokenizeIterator
{
utf8::unchecked::iterator<string::const_iterator> m_beg, m_end, m_finish;
DelimFuncT m_delimFunc;
void move()
{
size_t const count = s.size();
size_t i = 0;
while (i < count)
m_beg = m_end;
while (m_beg != m_finish)
{
i = s.find_first_not_of(delim, i);
if (i == string::npos) return;
size_t e = s.find_first_of(delim, i);
if (e == string::npos) e = count;
toDo(s.substr(i, e-i));
i = e + 1;
if (m_delimFunc(*m_beg))
++m_beg;
else
break;
}
m_end = m_beg;
while (m_end != m_finish)
{
if (m_delimFunc(*m_end))
break;
else
++m_end;
}
}
/// string tokenizer iterator
class TokenizeIterator
public:
TokenizeIterator(string const & s, DelimFuncT delimFunc)
: m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc)
{
size_t m_start, m_end;
string const & m_src;
char const * m_delim;
void move();
public:
TokenizeIterator(string const & s, char const * delim);
string operator*() const;
TokenizeIterator & operator++();
bool end() const { return (m_start >= m_end); }
size_t is_last() const { return (m_end == m_src.size()); }
};
template <class T, size_t N, class TT> bool IsInArray(T (&arr) [N], TT const & t)
{
for (size_t i = 0; i < N; ++i)
if (arr[i] == t) return true;
return false;
move();
}
bool to_int(char const * s, int & i);
bool to_uint64(char const * s, uint64_t & i);
bool to_int64(char const * s, int64_t & i);
bool to_double(char const * s, double & d);
template <class T>
string to_string(T i)
string operator*() const
{
ostringstream ss;
ss << i;
return ss.str();
ASSERT( m_beg != m_finish, ("dereferencing of empty iterator") );
return string(m_beg.base(), m_end.base());
}
operator bool() const { return m_beg != m_finish; }
TokenizeIterator & operator++()
{
move();
return (*this);
}
inline bool to_int(string const & s, int & i) { return to_int(s.c_str(), i); }
inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); }
inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); }
inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); }
void make_lower_case(string & s);
bool equal_no_case(string s1, string s2);
inline string ToUtf8(wstring const & wstr)
bool IsLast() const
{
string result;
utf8::unchecked::utf16to8(wstr.begin(), wstr.end(), back_inserter(result));
return result;
if (!*this)
return false;
TokenizeIterator<DelimFuncT> copy(*this);
++copy;
return !copy;
}
};
inline wstring FromUtf8(string const & str)
class SimpleDelimiter
{
UniString m_delims;
public:
SimpleDelimiter(char const * delimChars);
/// @return true if c is delimiter
bool operator()(UniChar c) const;
};
typedef TokenizeIterator<SimpleDelimiter> SimpleTokenizer;
template <typename FunctorT>
void Tokenize(string const & str, char const * delims, FunctorT f)
{
SimpleTokenizer iter(str, delims);
while (iter)
{
wstring result;
utf8::unchecked::utf8to16(str.begin(), str.end(), back_inserter(result));
return result;
}
template <typename ItT, typename DelimiterT>
typename ItT::value_type JoinStrings(ItT begin, ItT end, DelimiterT const & delimiter)
{
typedef typename ItT::value_type StringT;
if (begin == end) return StringT();
StringT result = *begin++;
for (ItT it = begin; it != end; ++it)
{
result += delimiter;
result += *it;
}
return result;
}
template <typename ContainerT, typename DelimiterT>
typename ContainerT::value_type JoinStrings(ContainerT const & container,
DelimiterT const & delimiter)
{
return JoinStrings(container.begin(), container.end(), delimiter);
}
inline bool IsPrefixOf(string const & s1, string const & s2)
{
if (s1.size() > s2.size()) return false;
for (size_t i = 0; i < s1.size(); ++i)
{
if (s1[i] != s2[i]) return false;
}
return true;
f(*iter);
++iter;
}
}
template <class T, size_t N, class TT> bool IsInArray(T (&arr) [N], TT const & t)
{
for (size_t i = 0; i < N; ++i)
if (arr[i] == t) return true;
return false;
}
bool to_int(char const * s, int & i);
bool to_uint64(char const * s, uint64_t & i);
bool to_int64(char const * s, int64_t & i);
bool to_double(char const * s, double & d);
/// Converts any streamable value to its string representation
/// using default ostream formatting.
template <class T>
string to_string(T i)
{
  ostringstream out;
  out << i;
  return out.str();
}
/// Convenience overloads forwarding to the char const * conversion
/// functions declared above; same return-value semantics.
inline bool to_int(string const & s, int & i) { return to_int(s.c_str(), i); }
inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); }
inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); }
inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); }
void make_lower_case(string & s);
bool equal_no_case(string s1, string s2);
/// Converts wstr (treated as UTF-16 code units by utf8cpp) to a UTF-8 string.
/// NOTE(review): assumes wstring elements are UTF-16 — on platforms where
/// wchar_t is 32-bit this deserves verification.
inline string ToUtf8(wstring const & wstr)
{
  string res;
  utf8::unchecked::utf16to8(wstr.begin(), wstr.end(), back_inserter(res));
  return res;
}
/// Decodes a UTF-8 string into a wstring of UTF-16 code units (via utf8cpp).
/// NOTE(review): produces UTF-16 units regardless of sizeof(wchar_t) — confirm
/// this matches callers' expectations on non-Windows platforms.
inline wstring FromUtf8(string const & str)
{
  wstring res;
  utf8::unchecked::utf8to16(str.begin(), str.end(), back_inserter(res));
  return res;
}
/// Concatenates the strings in [begin, end), inserting delimiter between
/// consecutive elements. Returns an empty string for an empty range.
template <typename ItT, typename DelimiterT>
typename ItT::value_type JoinStrings(ItT begin, ItT end, DelimiterT const & delimiter)
{
  typedef typename ItT::value_type StringT;

  StringT result;
  if (begin == end)
    return result;

  result = *begin;
  while (++begin != end)
  {
    result += delimiter;
    result += *begin;
  }
  return result;
}
/// Container-level convenience overload: joins all elements of container
/// with delimiter by forwarding to the iterator-range JoinStrings above.
template <typename ContainerT, typename DelimiterT>
typename ContainerT::value_type JoinStrings(ContainerT const & container,
DelimiterT const & delimiter)
{
return JoinStrings(container.begin(), container.end(), delimiter);
}
}

View file

@ -69,7 +69,7 @@ namespace borders
m2::RectD rect;
PolygonLoader loader(baseDir, simplifyCountriesLevel, country, rect);
strings::TokenizeString(line, "|", loader);
strings::Tokenize(line, "|", loader);
if (!country.m_regions.IsEmpty())
countries.Add(country, rect);
}

View file

@ -30,11 +30,11 @@ namespace ftype {
static char const * aTrue[] = { "yes", "true", "1", "*" };
static char const * aFalse[] = { "no", "false", "-1" };
string_utils::TokenizeIterator it(v, "|");
while (!it.end())
strings::SimpleTokenizer it(v, "|");
while (it)
{
if (string_utils::IsInArray(aTrue, *it)) return 1;
if (string_utils::IsInArray(aFalse, *it)) return -1;
if (strings::IsInArray(aTrue, *it)) return 1;
if (strings::IsInArray(aFalse, *it)) return -1;
++it;
}
@ -71,7 +71,7 @@ namespace ftype {
{
static char const * rules[] = { "line", "tunnel", "area", "symbol", "caption", "text",
"circle", "pathText", "wayMarker" };
return string_utils::IsInArray(rules, e);
return strings::IsInArray(rules, e);
}
uint8_t get_rule_type()
@ -87,9 +87,9 @@ namespace ftype {
}
ASSERT ( !e.empty(), () );
string_utils::TokenizeIterator it(e, "|");
strings::SimpleTokenizer it(e, "|");
uint8_t ret = 0;
while (!it.end())
while (it)
{
string const & s = *it;
if (s == "node")
@ -110,7 +110,7 @@ namespace ftype {
// addclass appear in small scales (6-11)
// don't skip it during parsing, but we don't process it like a rule
"addclass" };
return (string_utils::IsInArray(elems, e) || is_draw_rule(e));
return (strings::IsInArray(elems, e) || is_draw_rule(e));
}
/// check if it's processing key
@ -118,7 +118,7 @@ namespace ftype {
{
static char const * bad[] = { "osmarender:render", "osmarender:rendername",
"osmarender:renderref", "addr:housenumber" };
return (!k.empty() && !string_utils::IsInArray(bad, k));
return (!k.empty() && !strings::IsInArray(bad, k));
}
static bool is_valid_value(string const & v)
@ -132,13 +132,13 @@ namespace ftype {
static char const * mark[] = { "bridge", "tunnel", "area", "lock", "oneway", "junction",
"embankment", "cutting", "motorroad", "cycleway",
"bicycle", "horse", "capital", "fee" };
return string_utils::IsInArray(mark, k);
return strings::IsInArray(mark, k);
}
static bool process_feature_like_mark_from_root(string const & /*k*/, string const & v)
{
static char const * mark[] = { "turning_circle", "dyke", "dike", "levee", "embankment" };
return string_utils::IsInArray(mark, v);
return strings::IsInArray(mark, v);
}
static bool process_feature_like_mark(string const & k, string const & v)
@ -150,7 +150,7 @@ namespace ftype {
static bool is_skip_element_by_key(string const & k)
{
static char const * skip[] = { "addr:housenumber", "fixme" };
return string_utils::IsInArray(skip, k);
return strings::IsInArray(skip, k);
}
/// skip element and all it's sub-elements
@ -176,8 +176,8 @@ namespace ftype {
void AddAttr(string name, string value)
{
// make lower case for equivalent string comparison
string_utils::make_lower_case(name);
string_utils::make_lower_case(value);
strings::make_lower_case(name);
strings::make_lower_case(value);
if ((name == "k") && is_skip_element_by_key(value))
m_forceSkip = true;
@ -261,8 +261,8 @@ namespace ftype {
string v = e.attr["v"];
if (!is_valid_value(v)) continue;
string_utils::TokenizeIterator iK(k, "|");
if (iK.is_last())
strings::SimpleTokenizer iK(k, "|");
if (iK.IsLast())
{
// process one key
ASSERT ( *iK == k, () );
@ -324,8 +324,8 @@ namespace ftype {
}
// process values
string_utils::TokenizeIterator iV(v, "|");
while (!iV.end())
strings::SimpleTokenizer iV(v, "|");
while (iV)
{
bool const b1 = process_feature_like_mark_from_root(k, *iV);
if (b1 || process_feature_like_mark(k, *iV))
@ -355,18 +355,18 @@ namespace ftype {
{
char const * aTry[] = { "natural", "landuse" };
while (!iK.end())
while (iK)
{
// let's try to add root keys
bool addMode = (pParent == get_root() && string_utils::IsInArray(aTry, *iK));
bool addMode = (pParent == get_root() && strings::IsInArray(aTry, *iK));
ClassifObject * p = (addMode ? pParent->Add(*iK) : pParent->Find(*iK));
if (p && (get_mark_value(*iK, v) == 0))
{
if (p->IsCriterion()) p = pParent;
string_utils::TokenizeIterator iV(v, "|");
while (!iV.end())
strings::SimpleTokenizer iV(v, "|");
while (iV)
{
ClassifObject * pp = (addMode ? p->Add(*iV) : p->Find(*iV));
if (pp)
@ -525,7 +525,7 @@ namespace ftype {
// get names
string lang;
string_utils::TokenizeString(k, "\t :", get_lang(lang));
strings::Tokenize(k, "\t :", get_lang(lang));
if (!lang.empty())
m_params.name.AddString(lang, v);
@ -552,7 +552,7 @@ namespace ftype {
if (k == "population")
{
int n;
if (string_utils::to_int(v, n))
if (strings::to_int(v, n))
m_params.rank = static_cast<uint8_t>(log(double(n)) / log(1.1));
}

View file

@ -23,9 +23,9 @@ namespace drule
{
int * arrParams[] = { &m_scale, &m_type, &m_index, &m_priority };
strings::TokenizeIterator it(s, "|");
strings::SimpleTokenizer it(s, "|");
size_t i = 0;
while (!it.end())
while (it)
{
ASSERT ( i < ARRAY_SIZE(arrParams), (i) );

View file

@ -213,7 +213,7 @@ namespace drule {
template <> dash_array_t get_value<dash_array_t>(string const & s)
{
dash_array_t ret;
strings::TokenizeString(s, " \tpx,", bind(&dash_array_t::add, ref(ret), _1));
strings::Tokenize(s, " \tpx,", bind(&dash_array_t::add, ref(ret), _1));
/// @see http://www.w3.org/TR/SVG/painting.html stroke-dasharray
size_t const count = ret.m_v.size();
@ -866,7 +866,7 @@ Key RulesHolder::CreateRuleImpl1(string const & name,
#endif
attrs_map_t a;
strings::TokenizeString(clValue, " \t", bind(&RulesHolder::PushAttributes, this, _1, ref(a)));
strings::Tokenize(clValue, " \t", bind(&RulesHolder::PushAttributes, this, _1, ref(a)));
for (attrs_map_t::const_iterator i = attrs.begin(); i != attrs.end(); ++i)
if (!strings::IsInArray(arrClassTags, i->first))

View file

@ -99,7 +99,7 @@ namespace languages
CodesT currentCodes;
Collector c(currentCodes);
strings::TokenizeString(settingsString, LANG_DELIMETER, c);
strings::Tokenize(settingsString, LANG_DELIMETER, c);
GetSupportedLanguages(outLanguages);
Sort(currentCodes, outLanguages);