Refactored base/string_utils

This commit is contained in:
Alex Zolotarev 2011-05-23 17:36:45 +02:00 committed by Alex Zolotarev
parent aca0b79b0d
commit d6389417b0
9 changed files with 241 additions and 150 deletions

View file

@ -1,6 +1,8 @@
#include "../../testing/testing.hpp"
#include "../string_utils.hpp"
#include "../../std/bind.hpp"
UNIT_TEST(make_lower_case)
{
string s;
@ -51,3 +53,63 @@ UNIT_TEST(to_string)
TEST_EQUAL(strings::to_string(0.56), "0.56", ());
TEST_EQUAL(strings::to_string(-100.2), "-100.2", ());
}
// Functor passed to strings::Tokenize; verifies that tokens arrive
// in exactly the expected order, advancing a shared counter as it goes.
struct FunctorTester
{
  size_t & m_index;                 // external counter, incremented per token
  vector<string> const & m_tokens;  // expected tokens, in order

  explicit FunctorTester(size_t & counter, vector<string> const & tokens)
    : m_index(counter), m_tokens(tokens)
  {
  }

  void operator()(string const & s)
  {
    size_t const i = m_index;
    ++m_index;
    TEST_EQUAL(s, m_tokens[i], ());
  }
};
void TestIter(string const & str, char const * delims, vector<string> const & tokens)
{
strings::SimpleTokenizer it(str, delims);
for (size_t i = 0; i < tokens.size(); ++i)
{
TEST_EQUAL(true, it, (str, delims, i));
TEST_EQUAL(i == tokens.size() - 1, it.IsLast(), ());
TEST_EQUAL(*it, tokens[i], (str, delims, i));
++it;
}
TEST_EQUAL(false, it, (str, delims));
size_t counter = 0;
FunctorTester f = FunctorTester(counter, tokens);
strings::Tokenize(str, delims, f);
TEST_EQUAL(counter, tokens.size(), ());
}
UNIT_TEST(SimpleTokenizer)
{
  vector<string> tokens;

  // Empty input and/or empty delimiter set yields no tokens.
  TestIter("", "", tokens);
  TestIter("", "; ", tokens);
  // An input made entirely of delimiters also yields no tokens.
  TestIter(" : ; , ;", "; :,", tokens);

  {
    char const * s[] = {"hello"};
    tokens.assign(s, s + ARRAY_SIZE(s));
    TestIter("hello", ";", tokens);
  }

  {
    char const * s[] = {"hello", "world"};
    tokens.assign(s, s + ARRAY_SIZE(s));
    TestIter(" hello, world!", ", !", tokens);
  }

  {
    // UTF-8 (Arabic) input: one of the delimiter characters is multi-byte,
    // exercising the tokenizer's UTF-8-aware delimiter matching.
    char const * s[] = {"\xD9\x80", "\xD8\xA7\xD9\x84\xD9\x85\xD9\x88\xD8\xA7\xD9\x81\xD9\x82",
                        "\xD8\xAC"};
    tokens.assign(s, s + ARRAY_SIZE(s));
    TestIter("\xD9\x87\xD9\x80 - \xD8\xA7\xD9\x84\xD9\x85\xD9\x88\xD8\xA7\xD9\x81\xD9\x82 \xD9\x87\xD8\xAC",
             " -\xD9\x87", tokens);
  }
}

View file

@ -18,6 +18,8 @@ public:
typedef T const & const_reference;
typedef T & reference;
typedef size_t size_type;
typedef T const * const_iterator;
typedef T * iterator;
buffer_vector() : m_size(0) {}
explicit buffer_vector(size_t n, T c = T()) : m_size(0)

View file

@ -2,35 +2,27 @@
#include "assert.hpp"
#include "../std/sstream.hpp"
#include "../std/iterator.hpp"
#include <locale> // for make_lower_case
namespace strings
{
TokenizeIterator::TokenizeIterator(string const & s, char const * delim)
: m_start(0), m_src(s), m_delim(delim)
SimpleDelimiter::SimpleDelimiter(char const * delimChars)
{
move();
string const s(delimChars);
string::const_iterator it = s.begin();
while (it != s.end())
m_delims.push_back(utf8::unchecked::next(it));
}
void TokenizeIterator::move()
bool SimpleDelimiter::operator()(UniChar c) const
{
m_end = m_src.find_first_of(m_delim, m_start);
if (m_end == string::npos) m_end = m_src.size();
}
string TokenizeIterator::operator*() const
{
ASSERT ( !end(), ("dereference of empty iterator") );
return m_src.substr(m_start, m_end - m_start);
}
TokenizeIterator & TokenizeIterator::operator++()
{
m_start = m_end + 1;
move();
return (*this);
for (UniString::const_iterator it = m_delims.begin(); it != m_delims.end(); ++it)
if (*it == c)
return true;
return false;
}
bool to_int(char const * s, int & i)

View file

@ -1,127 +1,162 @@
#pragma once
#include "../base/buffer_vector.hpp"
#include "../std/string.hpp"
#include "../std/stdint.hpp"
#include "../std/sstream.hpp"
#include "../3party/utfcpp/source/utf8/unchecked.h"
/// All methods work with strings in utf-8 format
namespace strings
{
// get substrings from s divided by delim and pass them to toDo
template <class ToDo> void TokenizeString(string const & s, char const * delim, ToDo toDo)
typedef uint32_t UniChar;
typedef buffer_vector<UniChar, 32> UniString;
template <typename DelimFuncT>
class TokenizeIterator
{
utf8::unchecked::iterator<string::const_iterator> m_beg, m_end, m_finish;
DelimFuncT m_delimFunc;
void move()
{
size_t const count = s.size();
size_t i = 0;
while (i < count)
m_beg = m_end;
while (m_beg != m_finish)
{
i = s.find_first_not_of(delim, i);
if (i == string::npos) return;
size_t e = s.find_first_of(delim, i);
if (e == string::npos) e = count;
toDo(s.substr(i, e-i));
i = e + 1;
if (m_delimFunc(*m_beg))
++m_beg;
else
break;
}
m_end = m_beg;
while (m_end != m_finish)
{
if (m_delimFunc(*m_end))
break;
else
++m_end;
}
}
/// string tokenizer iterator
class TokenizeIterator
public:
TokenizeIterator(string const & s, DelimFuncT delimFunc)
: m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc)
{
size_t m_start, m_end;
string const & m_src;
char const * m_delim;
void move();
public:
TokenizeIterator(string const & s, char const * delim);
string operator*() const;
TokenizeIterator & operator++();
bool end() const { return (m_start >= m_end); }
size_t is_last() const { return (m_end == m_src.size()); }
};
template <class T, size_t N, class TT> bool IsInArray(T (&arr) [N], TT const & t)
{
for (size_t i = 0; i < N; ++i)
if (arr[i] == t) return true;
return false;
move();
}
bool to_int(char const * s, int & i);
bool to_uint64(char const * s, uint64_t & i);
bool to_int64(char const * s, int64_t & i);
bool to_double(char const * s, double & d);
template <class T>
string to_string(T i)
string operator*() const
{
ostringstream ss;
ss << i;
return ss.str();
ASSERT( m_beg != m_finish, ("dereferencing of empty iterator") );
return string(m_beg.base(), m_end.base());
}
operator bool() const { return m_beg != m_finish; }
TokenizeIterator & operator++()
{
move();
return (*this);
}
inline bool to_int(string const & s, int & i) { return to_int(s.c_str(), i); }
inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); }
inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); }
inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); }
void make_lower_case(string & s);
bool equal_no_case(string s1, string s2);
inline string ToUtf8(wstring const & wstr)
bool IsLast() const
{
string result;
utf8::unchecked::utf16to8(wstr.begin(), wstr.end(), back_inserter(result));
return result;
if (!*this)
return false;
TokenizeIterator<DelimFuncT> copy(*this);
++copy;
return !copy;
}
};
inline wstring FromUtf8(string const & str)
class SimpleDelimiter
{
UniString m_delims;
public:
SimpleDelimiter(char const * delimChars);
/// @return true if c is delimiter
bool operator()(UniChar c) const;
};
typedef TokenizeIterator<SimpleDelimiter> SimpleTokenizer;
template <typename FunctorT>
void Tokenize(string const & str, char const * delims, FunctorT f)
{
SimpleTokenizer iter(str, delims);
while (iter)
{
wstring result;
utf8::unchecked::utf8to16(str.begin(), str.end(), back_inserter(result));
return result;
}
template <typename ItT, typename DelimiterT>
typename ItT::value_type JoinStrings(ItT begin, ItT end, DelimiterT const & delimiter)
{
typedef typename ItT::value_type StringT;
if (begin == end) return StringT();
StringT result = *begin++;
for (ItT it = begin; it != end; ++it)
{
result += delimiter;
result += *it;
}
return result;
}
template <typename ContainerT, typename DelimiterT>
typename ContainerT::value_type JoinStrings(ContainerT const & container,
DelimiterT const & delimiter)
{
return JoinStrings(container.begin(), container.end(), delimiter);
}
inline bool IsPrefixOf(string const & s1, string const & s2)
{
if (s1.size() > s2.size()) return false;
for (size_t i = 0; i < s1.size(); ++i)
{
if (s1[i] != s2[i]) return false;
}
return true;
f(*iter);
++iter;
}
}
template <class T, size_t N, class TT> bool IsInArray(T (&arr) [N], TT const & t)
{
for (size_t i = 0; i < N; ++i)
if (arr[i] == t) return true;
return false;
}
bool to_int(char const * s, int & i);
bool to_uint64(char const * s, uint64_t & i);
bool to_int64(char const * s, int64_t & i);
bool to_double(char const * s, double & d);
/// Converts any streamable value to its string representation
/// using default ostream formatting.
template <class T>
string to_string(T i)
{
  ostringstream out;
  out << i;
  return out.str();
}
/// Convenience overloads forwarding to the char const * conversion
/// functions declared above; same return-value semantics.
inline bool to_int(string const & s, int & i) { return to_int(s.c_str(), i); }
inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); }
inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); }
inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); }
void make_lower_case(string & s);
bool equal_no_case(string s1, string s2);
/// Converts wstr (treated as UTF-16 code units by utf8cpp) to a UTF-8 string.
/// NOTE(review): assumes wstring elements are UTF-16 — on platforms where
/// wchar_t is 32-bit this deserves verification.
inline string ToUtf8(wstring const & wstr)
{
  string res;
  utf8::unchecked::utf16to8(wstr.begin(), wstr.end(), back_inserter(res));
  return res;
}
/// Decodes a UTF-8 string into a wstring of UTF-16 code units (via utf8cpp).
/// NOTE(review): produces UTF-16 units regardless of sizeof(wchar_t) — confirm
/// this matches callers' expectations on non-Windows platforms.
inline wstring FromUtf8(string const & str)
{
  wstring res;
  utf8::unchecked::utf8to16(str.begin(), str.end(), back_inserter(res));
  return res;
}
/// Concatenates the strings in [begin, end), inserting delimiter between
/// consecutive elements. Returns an empty string for an empty range.
template <typename ItT, typename DelimiterT>
typename ItT::value_type JoinStrings(ItT begin, ItT end, DelimiterT const & delimiter)
{
  typedef typename ItT::value_type StringT;

  StringT result;
  if (begin == end)
    return result;

  result = *begin;
  while (++begin != end)
  {
    result += delimiter;
    result += *begin;
  }
  return result;
}
/// Container-level convenience overload: joins all elements of container
/// with delimiter by forwarding to the iterator-range JoinStrings above.
template <typename ContainerT, typename DelimiterT>
typename ContainerT::value_type JoinStrings(ContainerT const & container,
DelimiterT const & delimiter)
{
return JoinStrings(container.begin(), container.end(), delimiter);
}
}

View file

@ -69,7 +69,7 @@ namespace borders
m2::RectD rect;
PolygonLoader loader(baseDir, simplifyCountriesLevel, country, rect);
strings::TokenizeString(line, "|", loader);
strings::Tokenize(line, "|", loader);
if (!country.m_regions.IsEmpty())
countries.Add(country, rect);
}

View file

@ -30,11 +30,11 @@ namespace ftype {
static char const * aTrue[] = { "yes", "true", "1", "*" };
static char const * aFalse[] = { "no", "false", "-1" };
string_utils::TokenizeIterator it(v, "|");
while (!it.end())
strings::SimpleTokenizer it(v, "|");
while (it)
{
if (string_utils::IsInArray(aTrue, *it)) return 1;
if (string_utils::IsInArray(aFalse, *it)) return -1;
if (strings::IsInArray(aTrue, *it)) return 1;
if (strings::IsInArray(aFalse, *it)) return -1;
++it;
}
@ -71,7 +71,7 @@ namespace ftype {
{
static char const * rules[] = { "line", "tunnel", "area", "symbol", "caption", "text",
"circle", "pathText", "wayMarker" };
return string_utils::IsInArray(rules, e);
return strings::IsInArray(rules, e);
}
uint8_t get_rule_type()
@ -87,9 +87,9 @@ namespace ftype {
}
ASSERT ( !e.empty(), () );
string_utils::TokenizeIterator it(e, "|");
strings::SimpleTokenizer it(e, "|");
uint8_t ret = 0;
while (!it.end())
while (it)
{
string const & s = *it;
if (s == "node")
@ -110,7 +110,7 @@ namespace ftype {
// addclass appear in small scales (6-11)
// don't skip it during parsing, but we don't process it like a rule
"addclass" };
return (string_utils::IsInArray(elems, e) || is_draw_rule(e));
return (strings::IsInArray(elems, e) || is_draw_rule(e));
}
/// check if it's processing key
@ -118,7 +118,7 @@ namespace ftype {
{
static char const * bad[] = { "osmarender:render", "osmarender:rendername",
"osmarender:renderref", "addr:housenumber" };
return (!k.empty() && !string_utils::IsInArray(bad, k));
return (!k.empty() && !strings::IsInArray(bad, k));
}
static bool is_valid_value(string const & v)
@ -132,13 +132,13 @@ namespace ftype {
static char const * mark[] = { "bridge", "tunnel", "area", "lock", "oneway", "junction",
"embankment", "cutting", "motorroad", "cycleway",
"bicycle", "horse", "capital", "fee" };
return string_utils::IsInArray(mark, k);
return strings::IsInArray(mark, k);
}
static bool process_feature_like_mark_from_root(string const & /*k*/, string const & v)
{
static char const * mark[] = { "turning_circle", "dyke", "dike", "levee", "embankment" };
return string_utils::IsInArray(mark, v);
return strings::IsInArray(mark, v);
}
static bool process_feature_like_mark(string const & k, string const & v)
@ -150,7 +150,7 @@ namespace ftype {
static bool is_skip_element_by_key(string const & k)
{
static char const * skip[] = { "addr:housenumber", "fixme" };
return string_utils::IsInArray(skip, k);
return strings::IsInArray(skip, k);
}
/// skip element and all it's sub-elements
@ -176,8 +176,8 @@ namespace ftype {
void AddAttr(string name, string value)
{
// make lower case for equivalent string comparison
string_utils::make_lower_case(name);
string_utils::make_lower_case(value);
strings::make_lower_case(name);
strings::make_lower_case(value);
if ((name == "k") && is_skip_element_by_key(value))
m_forceSkip = true;
@ -261,8 +261,8 @@ namespace ftype {
string v = e.attr["v"];
if (!is_valid_value(v)) continue;
string_utils::TokenizeIterator iK(k, "|");
if (iK.is_last())
strings::SimpleTokenizer iK(k, "|");
if (iK.IsLast())
{
// process one key
ASSERT ( *iK == k, () );
@ -324,8 +324,8 @@ namespace ftype {
}
// process values
string_utils::TokenizeIterator iV(v, "|");
while (!iV.end())
strings::SimpleTokenizer iV(v, "|");
while (iV)
{
bool const b1 = process_feature_like_mark_from_root(k, *iV);
if (b1 || process_feature_like_mark(k, *iV))
@ -355,18 +355,18 @@ namespace ftype {
{
char const * aTry[] = { "natural", "landuse" };
while (!iK.end())
while (iK)
{
// let's try to add root keys
bool addMode = (pParent == get_root() && string_utils::IsInArray(aTry, *iK));
bool addMode = (pParent == get_root() && strings::IsInArray(aTry, *iK));
ClassifObject * p = (addMode ? pParent->Add(*iK) : pParent->Find(*iK));
if (p && (get_mark_value(*iK, v) == 0))
{
if (p->IsCriterion()) p = pParent;
string_utils::TokenizeIterator iV(v, "|");
while (!iV.end())
strings::SimpleTokenizer iV(v, "|");
while (iV)
{
ClassifObject * pp = (addMode ? p->Add(*iV) : p->Find(*iV));
if (pp)
@ -525,7 +525,7 @@ namespace ftype {
// get names
string lang;
string_utils::TokenizeString(k, "\t :", get_lang(lang));
strings::Tokenize(k, "\t :", get_lang(lang));
if (!lang.empty())
m_params.name.AddString(lang, v);
@ -552,7 +552,7 @@ namespace ftype {
if (k == "population")
{
int n;
if (string_utils::to_int(v, n))
if (strings::to_int(v, n))
m_params.rank = static_cast<uint8_t>(log(double(n)) / log(1.1));
}

View file

@ -23,9 +23,9 @@ namespace drule
{
int * arrParams[] = { &m_scale, &m_type, &m_index, &m_priority };
strings::TokenizeIterator it(s, "|");
strings::SimpleTokenizer it(s, "|");
size_t i = 0;
while (!it.end())
while (it)
{
ASSERT ( i < ARRAY_SIZE(arrParams), (i) );

View file

@ -213,7 +213,7 @@ namespace drule {
template <> dash_array_t get_value<dash_array_t>(string const & s)
{
dash_array_t ret;
strings::TokenizeString(s, " \tpx,", bind(&dash_array_t::add, ref(ret), _1));
strings::Tokenize(s, " \tpx,", bind(&dash_array_t::add, ref(ret), _1));
/// @see http://www.w3.org/TR/SVG/painting.html stroke-dasharray
size_t const count = ret.m_v.size();
@ -866,7 +866,7 @@ Key RulesHolder::CreateRuleImpl1(string const & name,
#endif
attrs_map_t a;
strings::TokenizeString(clValue, " \t", bind(&RulesHolder::PushAttributes, this, _1, ref(a)));
strings::Tokenize(clValue, " \t", bind(&RulesHolder::PushAttributes, this, _1, ref(a)));
for (attrs_map_t::const_iterator i = attrs.begin(); i != attrs.end(); ++i)
if (!strings::IsInArray(arrClassTags, i->first))

View file

@ -99,7 +99,7 @@ namespace languages
CodesT currentCodes;
Collector c(currentCodes);
strings::TokenizeString(settingsString, LANG_DELIMETER, c);
strings::Tokenize(settingsString, LANG_DELIMETER, c);
GetSupportedLanguages(outLanguages);
Sort(currentCodes, outLanguages);