From d6389417b03a3d992054c309e4b56a8d580dab1c Mon Sep 17 00:00:00 2001 From: Alex Zolotarev Date: Mon, 23 May 2011 17:36:45 +0200 Subject: [PATCH] Refactored base/string_utils --- base/base_tests/string_utils_test.cpp | 62 +++++++ base/buffer_vector.hpp | 2 + base/string_utils.cpp | 30 ++-- base/string_utils.hpp | 237 +++++++++++++++----------- generator/borders_loader.cpp | 2 +- generator/osm2type.cpp | 48 +++--- indexer/drawing_rule_def.cpp | 4 +- indexer/drawing_rules.cpp | 4 +- map/languages.cpp | 2 +- 9 files changed, 241 insertions(+), 150 deletions(-) diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp index f259c6fc98..9e582ada71 100644 --- a/base/base_tests/string_utils_test.cpp +++ b/base/base_tests/string_utils_test.cpp @@ -1,6 +1,8 @@ #include "../../testing/testing.hpp" #include "../string_utils.hpp" +#include "../../std/bind.hpp" + UNIT_TEST(make_lower_case) { string s; @@ -51,3 +53,63 @@ UNIT_TEST(to_string) TEST_EQUAL(strings::to_string(0.56), "0.56", ()); TEST_EQUAL(strings::to_string(-100.2), "-100.2", ()); } + +struct FunctorTester +{ + size_t & m_index; + vector const & m_tokens; + + explicit FunctorTester(size_t & counter, vector const & tokens) + : m_index(counter), m_tokens(tokens) {} + void operator()(string const & s) + { + TEST_EQUAL(s, m_tokens[m_index++], ()); + } +}; + +void TestIter(string const & str, char const * delims, vector const & tokens) +{ + strings::SimpleTokenizer it(str, delims); + for (size_t i = 0; i < tokens.size(); ++i) + { + TEST_EQUAL(true, it, (str, delims, i)); + TEST_EQUAL(i == tokens.size() - 1, it.IsLast(), ()); + TEST_EQUAL(*it, tokens[i], (str, delims, i)); + ++it; + } + TEST_EQUAL(false, it, (str, delims)); + + size_t counter = 0; + FunctorTester f = FunctorTester(counter, tokens); + strings::Tokenize(str, delims, f); + TEST_EQUAL(counter, tokens.size(), ()); +} + +UNIT_TEST(SimpleTokenizer) +{ + vector tokens; + TestIter("", "", tokens); + TestIter("", "; ", tokens); + TestIter(" : ; , ;", "; :,", tokens); + + { + char const * s[] = {"hello"}; + tokens.assign(&s[0], &s[0] + ARRAY_SIZE(s)); + TestIter("hello", ";", tokens); + } + + { + char const * s[] = {"hello", "world"}; + tokens.assign(&s[0], &s[0] + ARRAY_SIZE(s)); + TestIter(" hello, world!", ", !", tokens); + } + + { + char const * s[] = {"\xD9\x80", "\xD8\xA7\xD9\x84\xD9\x85\xD9\x88\xD8\xA7\xD9\x81\xD9\x82", + "\xD8\xAC"}; + tokens.assign(&s[0], &s[0] + ARRAY_SIZE(s)); + TestIter("\xD9\x87\xD9\x80 - \xD8\xA7\xD9\x84\xD9\x85\xD9\x88\xD8\xA7\xD9\x81\xD9\x82 \xD9\x87\xD8\xAC", + " -\xD9\x87", tokens); + } + +} diff --git a/base/buffer_vector.hpp b/base/buffer_vector.hpp index 1255faafde..24d9bb2d62 100644 --- a/base/buffer_vector.hpp +++ b/base/buffer_vector.hpp @@ -18,6 +18,8 @@ public: typedef T const & const_reference; typedef T & reference; typedef size_t size_type; + typedef T const * const_iterator; + typedef T * iterator; buffer_vector() : m_size(0) {} explicit buffer_vector(size_t n, T c = T()) : m_size(0) diff --git a/base/string_utils.cpp b/base/string_utils.cpp index 48d85c6186..6b676cc395 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -2,35 +2,27 @@ #include "assert.hpp" #include "../std/sstream.hpp" +#include "../std/iterator.hpp" #include // for make_lower_case namespace strings { -TokenizeIterator::TokenizeIterator(string const & s, char const * delim) -: m_start(0), m_src(s), m_delim(delim) +SimpleDelimiter::SimpleDelimiter(char const * delimChars) { - move(); + string const s(delimChars); + string::const_iterator it = s.begin(); + while (it != s.end()) + m_delims.push_back(utf8::unchecked::next(it)); } -void TokenizeIterator::move() +bool SimpleDelimiter::operator()(UniChar c) const { - m_end = m_src.find_first_of(m_delim, m_start); - if (m_end == string::npos) m_end = m_src.size(); -} - -string TokenizeIterator::operator*() const -{ - ASSERT ( !end(), ("dereference of empty iterator") ); - return m_src.substr(m_start, m_end - m_start); -} - -TokenizeIterator & TokenizeIterator::operator++() -{ - m_start = m_end + 1; - move(); - return (*this); + for (UniString::const_iterator it = m_delims.begin(); it != m_delims.end(); ++it) + if (*it == c) + return true; + return false; } bool to_int(char const * s, int & i) diff --git a/base/string_utils.hpp b/base/string_utils.hpp index 482c12810f..9ce3c7ff25 100644 --- a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -1,127 +1,162 @@ #pragma once +#include "../base/buffer_vector.hpp" + #include "../std/string.hpp" #include "../std/stdint.hpp" +#include "../std/sstream.hpp" #include "../3party/utfcpp/source/utf8/unchecked.h" +/// All methods work with strings in utf-8 format namespace strings { - // get substrings from s divided by delim and pass them to toDo - template void TokenizeString(string const & s, char const * delim, ToDo toDo) + +typedef uint32_t UniChar; +typedef buffer_vector UniString; + +template +class TokenizeIterator +{ + utf8::unchecked::iterator m_beg, m_end, m_finish; + DelimFuncT m_delimFunc; + + void move() { - size_t const count = s.size(); - size_t i = 0; - while (i < count) + m_beg = m_end; + while (m_beg != m_finish) { - i = s.find_first_not_of(delim, i); - if (i == string::npos) return; - - size_t e = s.find_first_of(delim, i); - if (e == string::npos) e = count; - - toDo(s.substr(i, e-i)); - - i = e + 1; + if (m_delimFunc(*m_beg)) + ++m_beg; + else + break; + } + m_end = m_beg; + while (m_end != m_finish) + { + if (m_delimFunc(*m_end)) + break; + else + ++m_end; } } - /// string tokenizer iterator - class TokenizeIterator +public: + TokenizeIterator(string const & s, DelimFuncT delimFunc) + : m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc) { - size_t m_start, m_end; - - string const & m_src; - char const * m_delim; - - void move(); - - public: - TokenizeIterator(string const & s, char const * delim); - - string operator*() const; - - TokenizeIterator & operator++(); - - bool end() const { return (m_start >= m_end); } - size_t is_last() const { return (m_end == m_src.size()); } - }; - - template bool IsInArray(T (&arr) [N], TT const & t) - { - for (size_t i = 0; i < N; ++i) - if (arr[i] == t) return true; - return false; + move(); } - bool to_int(char const * s, int & i); - bool to_uint64(char const * s, uint64_t & i); - bool to_int64(char const * s, int64_t & i); - bool to_double(char const * s, double & d); - - template - string to_string(T i) + string operator*() const { - ostringstream ss; - ss << i; - return ss.str(); + ASSERT( m_beg != m_finish, ("dereferencing of empty iterator") ); + return string(m_beg.base(), m_end.base()); + } + operator bool() const { return m_beg != m_finish; } + + TokenizeIterator & operator++() + { + move(); + return (*this); } - inline bool to_int(string const & s, int & i) { return to_int(s.c_str(), i); } - inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); } - inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); } - inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); } - - void make_lower_case(string & s); - bool equal_no_case(string s1, string s2); - - inline string ToUtf8(wstring const & wstr) + bool IsLast() const { - string result; - utf8::unchecked::utf16to8(wstr.begin(), wstr.end(), back_inserter(result)); - return result; + if (!*this) + return false; + TokenizeIterator copy(*this); + ++copy; + return !copy; } +}; - inline wstring FromUtf8(string const & str) +class SimpleDelimiter +{ + UniString m_delims; +public: + SimpleDelimiter(char const * delimChars); + /// @return true if c is delimiter + bool operator()(UniChar c) const; +}; + +typedef TokenizeIterator SimpleTokenizer; + +template +void Tokenize(string const & str, char const * delims, FunctorT f) +{ + SimpleTokenizer iter(str, delims); + while (iter) { - wstring result; - utf8::unchecked::utf8to16(str.begin(), str.end(), back_inserter(result)); - return result; - } - - template - typename ItT::value_type JoinStrings(ItT begin, ItT end, DelimiterT const & delimiter) - { - typedef typename ItT::value_type StringT; - - if (begin == end) return StringT(); - - StringT result = *begin++; - for (ItT it = begin; it != end; ++it) - { - result += delimiter; - result += *it; - } - - return result; - } - - template - typename ContainerT::value_type JoinStrings(ContainerT const & container, - DelimiterT const & delimiter) - { - return JoinStrings(container.begin(), container.end(), delimiter); - } - - inline bool IsPrefixOf(string const & s1, string const & s2) - { - if (s1.size() > s2.size()) return false; - - for (size_t i = 0; i < s1.size(); ++i) - { - if (s1[i] != s2[i]) return false; - } - - return true; + f(*iter); + ++iter; } } + +template bool IsInArray(T (&arr) [N], TT const & t) +{ + for (size_t i = 0; i < N; ++i) + if (arr[i] == t) return true; + return false; +} + +bool to_int(char const * s, int & i); +bool to_uint64(char const * s, uint64_t & i); +bool to_int64(char const * s, int64_t & i); +bool to_double(char const * s, double & d); + +template +string to_string(T i) +{ + ostringstream ss; + ss << i; + return ss.str(); +} + +inline bool to_int(string const & s, int & i) { return to_int(s.c_str(), i); } +inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); } +inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); } +inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); } + +void make_lower_case(string & s); +bool equal_no_case(string s1, string s2); + +inline string ToUtf8(wstring const & wstr) +{ + string result; + utf8::unchecked::utf16to8(wstr.begin(), wstr.end(), back_inserter(result)); + return result; +} + +inline wstring FromUtf8(string const & str) +{ + wstring result; + utf8::unchecked::utf8to16(str.begin(), str.end(), back_inserter(result)); + return result; +} + +template +typename ItT::value_type JoinStrings(ItT begin, ItT end, DelimiterT const & delimiter) +{ + typedef typename ItT::value_type StringT; + + if (begin == end) return StringT(); + + StringT result = *begin++; + for (ItT it = begin; it != end; ++it) + { + result += delimiter; + result += *it; + } + + return result; +} + +template +typename ContainerT::value_type JoinStrings(ContainerT const & container, + DelimiterT const & delimiter) +{ + return JoinStrings(container.begin(), container.end(), delimiter); +} + +} diff --git a/generator/borders_loader.cpp b/generator/borders_loader.cpp index cf0165fa66..e372af6afc 100644 --- a/generator/borders_loader.cpp +++ b/generator/borders_loader.cpp @@ -69,7 +69,7 @@ namespace borders m2::RectD rect; PolygonLoader loader(baseDir, simplifyCountriesLevel, country, rect); - strings::TokenizeString(line, "|", loader); + strings::Tokenize(line, "|", loader); if (!country.m_regions.IsEmpty()) countries.Add(country, rect); } diff --git a/generator/osm2type.cpp b/generator/osm2type.cpp index 64dcda2166..a0ef846f74 100644 --- a/generator/osm2type.cpp +++ b/generator/osm2type.cpp @@ -30,11 +30,11 @@ namespace ftype { static char const * aTrue[] = { "yes", "true", "1", "*" }; static char const * aFalse[] = { "no", "false", "-1" }; - string_utils::TokenizeIterator it(v, "|"); - while (!it.end()) + strings::SimpleTokenizer it(v, "|"); + while (it) { - if (string_utils::IsInArray(aTrue, *it)) return 1; - if (string_utils::IsInArray(aFalse, *it)) return -1; + if (strings::IsInArray(aTrue, *it)) return 1; + if (strings::IsInArray(aFalse, *it)) return -1; ++it; } @@ -71,7 +71,7 @@ namespace ftype { { static char const * rules[] = { "line", "tunnel", "area", "symbol", "caption", "text", "circle", "pathText", "wayMarker" }; - return string_utils::IsInArray(rules, e); + return strings::IsInArray(rules, e); } uint8_t get_rule_type() @@ -87,9 +87,9 @@ namespace ftype { } ASSERT ( !e.empty(), () ); - string_utils::TokenizeIterator it(e, "|"); + strings::SimpleTokenizer it(e, "|"); uint8_t ret = 0; - while (!it.end()) + while (it) { string const & s = *it; if (s == "node") @@ -110,7 +110,7 @@ namespace ftype { // addclass appear in small scales (6-11) // don't skip it during parsing, but we don't process it like a rule "addclass" }; - return (string_utils::IsInArray(elems, e) || is_draw_rule(e)); + return (strings::IsInArray(elems, e) || is_draw_rule(e)); } /// check if it's processing key @@ -118,7 +118,7 @@ namespace ftype { { static char const * bad[] = { "osmarender:render", "osmarender:rendername", "osmarender:renderref", "addr:housenumber" }; - return (!k.empty() && !string_utils::IsInArray(bad, k)); + return (!k.empty() && !strings::IsInArray(bad, k)); } static bool is_valid_value(string const & v) @@ -132,13 +132,13 @@ namespace ftype { static char const * mark[] = { "bridge", "tunnel", "area", "lock", "oneway", "junction", "embankment", "cutting", "motorroad", "cycleway", "bicycle", "horse", "capital", "fee" }; - return string_utils::IsInArray(mark, k); + return strings::IsInArray(mark, k); } static bool process_feature_like_mark_from_root(string const & /*k*/, string const & v) { static char const * mark[] = { "turning_circle", "dyke", "dike", "levee", "embankment" }; - return string_utils::IsInArray(mark, v); + return strings::IsInArray(mark, v); } static bool process_feature_like_mark(string const & k, string const & v) @@ -150,7 +150,7 @@ namespace ftype { static bool is_skip_element_by_key(string const & k) { static char const * skip[] = { "addr:housenumber", "fixme" }; - return string_utils::IsInArray(skip, k); + return strings::IsInArray(skip, k); } /// skip element and all it's sub-elements @@ -176,8 +176,8 @@ namespace ftype { void AddAttr(string name, string value) { // make lower case for equivalent string comparison - string_utils::make_lower_case(name); - string_utils::make_lower_case(value); + strings::make_lower_case(name); + strings::make_lower_case(value); if ((name == "k") && is_skip_element_by_key(value)) m_forceSkip = true; @@ -261,8 +261,8 @@ namespace ftype { string v = e.attr["v"]; if (!is_valid_value(v)) continue; - string_utils::TokenizeIterator iK(k, "|"); - if (iK.is_last()) + strings::SimpleTokenizer iK(k, "|"); + if (iK.IsLast()) { // process one key ASSERT ( *iK == k, () ); @@ -324,8 +324,8 @@ namespace ftype { } // process values - string_utils::TokenizeIterator iV(v, "|"); - while (!iV.end()) + strings::SimpleTokenizer iV(v, "|"); + while (iV) { bool const b1 = process_feature_like_mark_from_root(k, *iV); if (b1 || process_feature_like_mark(k, *iV)) @@ -355,18 +355,18 @@ namespace ftype { { char const * aTry[] = { "natural", "landuse" }; - while (!iK.end()) + while (iK) { // let's try to add root keys - bool addMode = (pParent == get_root() && string_utils::IsInArray(aTry, *iK)); + bool addMode = (pParent == get_root() && strings::IsInArray(aTry, *iK)); ClassifObject * p = (addMode ? pParent->Add(*iK) : pParent->Find(*iK)); if (p && (get_mark_value(*iK, v) == 0)) { if (p->IsCriterion()) p = pParent; - string_utils::TokenizeIterator iV(v, "|"); - while (!iV.end()) + strings::SimpleTokenizer iV(v, "|"); + while (iV) { ClassifObject * pp = (addMode ? p->Add(*iV) : p->Find(*iV)); if (pp) @@ -525,7 +525,7 @@ namespace ftype { // get names string lang; - string_utils::TokenizeString(k, "\t :", get_lang(lang)); + strings::Tokenize(k, "\t :", get_lang(lang)); if (!lang.empty()) m_params.name.AddString(lang, v); @@ -552,7 +552,7 @@ namespace ftype { if (k == "population") { int n; - if (string_utils::to_int(v, n)) + if (strings::to_int(v, n)) m_params.rank = static_cast(log(double(n)) / log(1.1)); } diff --git a/indexer/drawing_rule_def.cpp b/indexer/drawing_rule_def.cpp index aefcbc6d26..f4da5f5f6c 100644 --- a/indexer/drawing_rule_def.cpp +++ b/indexer/drawing_rule_def.cpp @@ -23,9 +23,9 @@ namespace drule { int * arrParams[] = { &m_scale, &m_type, &m_index, &m_priority }; - strings::TokenizeIterator it(s, "|"); + strings::SimpleTokenizer it(s, "|"); size_t i = 0; - while (!it.end()) + while (it) { ASSERT ( i < ARRAY_SIZE(arrParams), (i) ); diff --git a/indexer/drawing_rules.cpp b/indexer/drawing_rules.cpp index 83e241fc39..ac2b9f709b 100644 --- a/indexer/drawing_rules.cpp +++ b/indexer/drawing_rules.cpp @@ -213,7 +213,7 @@ namespace drule { template <> dash_array_t get_value(string const & s) { dash_array_t ret; - strings::TokenizeString(s, " \tpx,", bind(&dash_array_t::add, ref(ret), _1)); + strings::Tokenize(s, " \tpx,", bind(&dash_array_t::add, ref(ret), _1)); /// @see http://www.w3.org/TR/SVG/painting.html stroke-dasharray size_t const count = ret.m_v.size(); @@ -866,7 +866,7 @@ Key RulesHolder::CreateRuleImpl1(string const & name, #endif attrs_map_t a; - strings::TokenizeString(clValue, " \t", bind(&RulesHolder::PushAttributes, this, _1, ref(a))); + strings::Tokenize(clValue, " \t", bind(&RulesHolder::PushAttributes, this, _1, ref(a))); for (attrs_map_t::const_iterator i = attrs.begin(); i != attrs.end(); ++i) if (!strings::IsInArray(arrClassTags, i->first)) diff --git a/map/languages.cpp b/map/languages.cpp index effba1a417..33ab59c188 100644 --- a/map/languages.cpp +++ b/map/languages.cpp @@ -99,7 +99,7 @@ namespace languages CodesT currentCodes; Collector c(currentCodes); - strings::TokenizeString(settingsString, LANG_DELIMETER, c); + strings::Tokenize(settingsString, LANG_DELIMETER, c); GetSupportedLanguages(outLanguages); Sort(currentCodes, outLanguages);