From d6389417b03a3d992054c309e4b56a8d580dab1c Mon Sep 17 00:00:00 2001
From: Alex Zolotarev <deathbaba@gmail.com>
Date: Mon, 23 May 2011 17:36:45 +0200
Subject: [PATCH] Refactored base/string_utils

---
 base/base_tests/string_utils_test.cpp |  62 +++++++
 base/buffer_vector.hpp                |   2 +
 base/string_utils.cpp                 |  30 ++--
 base/string_utils.hpp                 | 237 +++++++++++++++-----------
 generator/borders_loader.cpp          |   2 +-
 generator/osm2type.cpp                |  48 +++---
 indexer/drawing_rule_def.cpp          |   4 +-
 indexer/drawing_rules.cpp             |   4 +-
 map/languages.cpp                     |   2 +-
 9 files changed, 241 insertions(+), 150 deletions(-)
diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp
index f259c6fc98..9e582ada71 100644
--- a/base/base_tests/string_utils_test.cpp
+++ b/base/base_tests/string_utils_test.cpp
@@ -1,6 +1,8 @@
 #include "../../testing/testing.hpp"
 #include "../string_utils.hpp"
 
+#include "../../std/bind.hpp"
+
 UNIT_TEST(make_lower_case)
 {
   string s;
@@ -51,3 +53,63 @@ UNIT_TEST(to_string)
   TEST_EQUAL(strings::to_string(0.56), "0.56", ());
   TEST_EQUAL(strings::to_string(-100.2), "-100.2", ());
 }
+
+struct FunctorTester  // callback for Tokenize(): compares each delivered token with the expected list
+{
+  size_t & m_index;                 // tokens received so far; refers to the caller's counter
+  vector<string> const & m_tokens;  // expected tokens, in delivery order
+  explicit FunctorTester(size_t & counter, vector<string> const & tokens) : m_index(counter), m_tokens(tokens) {}
+  void operator()(string const & s)
+  {
+    TEST_EQUAL(true, m_index < m_tokens.size(), (s, m_index));  // a surplus token must fail the test ...
+    if (m_index < m_tokens.size()) { TEST_EQUAL(s, m_tokens[m_index], ()); }  // ... instead of indexing past the end
+    ++m_index;  // always count, so the caller's final size check also reports surplus tokens
+  }
+};
+
+void TestIter(string const & str, char const * delims, vector<string> const & tokens)  // iterator and Tokenize() must both yield tokens
+{
+  strings::SimpleTokenizer it(str, delims);
+  for (size_t i = 0; i < tokens.size(); ++i)
+  {
+    TEST_EQUAL(true, it, (str, delims, i));  // the iterator must still hold a token for every expected entry
+    TEST_EQUAL(i == tokens.size() - 1, it.IsLast(), ());  // IsLast() is expected only on the final token
+    TEST_EQUAL(*it, tokens[i], (str, delims, i));
+    ++it;
+  }
+  TEST_EQUAL(false, it, (str, delims));  // nothing may remain after the expected tokens
+
+  size_t counter = 0;  // advanced by FunctorTester through its reference member
+  FunctorTester f = FunctorTester(counter, tokens);
+  strings::Tokenize(str, delims, f);  // same input, this time through the functor-based API
+  TEST_EQUAL(counter, tokens.size(), ());
+}
+
+UNIT_TEST(SimpleTokenizer)
+{
+  vector<string> tokens;  // left empty for the first three cases: no tokens are expected
+  TestIter("", "", tokens);
+  TestIter("", "; ", tokens);
+  TestIter("  : ;  , ;", "; :,", tokens);  // input made only of delimiter characters
+
+  {
+    char const * s[] = {"hello"};
+    tokens.assign(&s[0], &s[0] + ARRAY_SIZE(s));
+    TestIter("hello", ";", tokens);  // delimiter does not occur: the whole input is one token
+  }
+
+  {
+    char const * s[] = {"hello", "world"};
+    tokens.assign(&s[0], &s[0] + ARRAY_SIZE(s));
+    TestIter(" hello, world!", ", !", tokens);  // leading, consecutive and trailing delimiters
+  }
+
+  {
+    char const * s[] = {"\xD9\x80", "\xD8\xA7\xD9\x84\xD9\x85\xD9\x88\xD8\xA7\xD9\x81\xD9\x82",
+                       "\xD8\xAC"};
+    tokens.assign(&s[0], &s[0] + ARRAY_SIZE(s));
+    TestIter("\xD9\x87\xD9\x80 - \xD8\xA7\xD9\x84\xD9\x85\xD9\x88\xD8\xA7\xD9\x81\xD9\x82 \xD9\x87\xD8\xAC",
+             " -\xD9\x87", tokens);  // two-byte delimiter D9 87; token D9 80 shares its lead byte and must stay whole
+  }
+
+}
diff --git a/base/buffer_vector.hpp b/base/buffer_vector.hpp
index 1255faafde..24d9bb2d62 100644
--- a/base/buffer_vector.hpp
+++ b/base/buffer_vector.hpp
@@ -18,6 +18,8 @@ public:
   typedef T const & const_reference;
   typedef T & reference;
   typedef size_t size_type;
+  typedef T const * const_iterator;
+  typedef T * iterator;
 
   buffer_vector() : m_size(0) {}
   explicit buffer_vector(size_t n, T c = T()) : m_size(0)
diff --git a/base/string_utils.cpp b/base/string_utils.cpp
index 48d85c6186..6b676cc395 100644
--- a/base/string_utils.cpp
+++ b/base/string_utils.cpp
@@ -2,35 +2,27 @@
 #include "assert.hpp"
 
 #include "../std/sstream.hpp"
+#include "../std/iterator.hpp"
 
 #include <locale>   // for make_lower_case
 
 namespace strings
 {
 
-TokenizeIterator::TokenizeIterator(string const & s, char const * delim)
-: m_start(0), m_src(s), m_delim(delim)
+SimpleDelimiter::SimpleDelimiter(char const * delimChars)
 {
-  move();
+  // Decode the utf-8 delimiter list once; lookups then compare whole code points.
+  string const encoded(delimChars);
+  for (string::const_iterator pos = encoded.begin(); pos != encoded.end();)
+    m_delims.push_back(utf8::unchecked::next(pos));
 }
 
-void TokenizeIterator::move()
+bool SimpleDelimiter::operator()(UniChar c) const
 {
-  m_end = m_src.find_first_of(m_delim, m_start);
-  if (m_end == string::npos) m_end = m_src.size();
-}
-
-string TokenizeIterator::operator*() const
-{
-  ASSERT ( !end(), ("dereference of empty iterator") );
-  return m_src.substr(m_start, m_end - m_start);
-}
-
-TokenizeIterator & TokenizeIterator::operator++()
-{
-  m_start = m_end + 1;
-  move();
-  return (*this);
+  UniString::const_iterator cur = m_delims.begin();
+  while (cur != m_delims.end() && *cur != c)  // stop at the first stored delimiter equal to c
+    ++cur;
+  return cur != m_delims.end();
 }
 
 bool to_int(char const * s, int & i)
diff --git a/base/string_utils.hpp b/base/string_utils.hpp
index 482c12810f..9ce3c7ff25 100644
--- a/base/string_utils.hpp
+++ b/base/string_utils.hpp
@@ -1,127 +1,162 @@
 #pragma once
 
+#include "../base/buffer_vector.hpp"
+
 #include "../std/string.hpp"
 #include "../std/stdint.hpp"
+#include "../std/sstream.hpp"
 
 #include "../3party/utfcpp/source/utf8/unchecked.h"
 
+/// All methods work with strings in utf-8 format
 namespace strings
 {
-  // get substrings from s divided by delim and pass them to toDo
-  template <class ToDo> void TokenizeString(string const & s, char const * delim, ToDo toDo)
+
+typedef uint32_t UniChar;
+typedef buffer_vector<UniChar, 32> UniString;
+
+template <typename DelimFuncT>  // DelimFuncT is called with each decoded code point; a true result marks a delimiter
+class TokenizeIterator  // forward-only tokenizer over a utf-8 string; yields the non-empty runs between delimiters
+{
+  utf8::unchecked::iterator<string::const_iterator> m_beg, m_end, m_finish;  // [m_beg, m_end) = current token, m_finish = end of input
+  DelimFuncT m_delimFunc;  // own copy of the predicate passed to the constructor
+
+  void move()  // advance to the next token, or to m_finish when none is left
   {
-    size_t const count = s.size();
-    size_t i = 0;
-    while (i < count)
+    m_beg = m_end;  // resume right after the previous token
+    while (m_beg != m_finish)  // skip a run of delimiters
     {
-      i = s.find_first_not_of(delim, i);
-      if (i == string::npos) return;
-
-      size_t e = s.find_first_of(delim, i);
-      if (e == string::npos) e = count;
-
-      toDo(s.substr(i, e-i));
-
-      i = e + 1;
+      if (m_delimFunc(*m_beg))
+        ++m_beg;
+      else
+        break;
+    }
+    m_end = m_beg;
+    while (m_end != m_finish)  // extend the token up to the next delimiter or the end of input
+    {
+      if (m_delimFunc(*m_end))
+        break;
+      else
+        ++m_end;
     }
   }
 
-  /// string tokenizer iterator
-  class TokenizeIterator
+public:
+  TokenizeIterator(string const & s, DelimFuncT delimFunc)  // keeps iterators into s, so s must outlive the tokenizer
+  : m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc)
   {
-    size_t m_start, m_end;
-
-    string const & m_src;
-    char const * m_delim;
-
-    void move();
-
-  public:
-    TokenizeIterator(string const & s, char const * delim);
-
-    string operator*() const;
-
-    TokenizeIterator & operator++();
-
-    bool end() const { return (m_start >= m_end); }
-    size_t is_last() const { return (m_end == m_src.size()); }
-  };
-
-  template <class T, size_t N, class TT> bool IsInArray(T (&arr) [N], TT const & t)
-  {
-    for (size_t i = 0; i < N; ++i)
-      if (arr[i] == t) return true;
-    return false;
+    move();  // position on the first token, if there is one
   }
 
-  bool to_int(char const * s, int & i);
-  bool to_uint64(char const * s, uint64_t & i);
-  bool to_int64(char const * s, int64_t & i);
-  bool to_double(char const * s, double & d);
-
-  template <class T>
-  string to_string(T i)
+  string operator*() const  // returns a copy of the current token's bytes
   {
-    ostringstream ss;
-    ss << i;
-    return ss.str();
+    ASSERT( m_beg != m_finish, ("dereferencing of empty iterator") );
+    return string(m_beg.base(), m_end.base());  // base() exposes the underlying string iterators
+  }
+  operator bool() const { return m_beg != m_finish; }  // true while a token is available
+
+  TokenizeIterator & operator++()
+  {
+    move();
+    return (*this);
   }
 
-  inline bool to_int(string const & s, int & i) { return to_int(s.c_str(), i); }
-  inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); }
-  inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); }
-  inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); }
-
-  void make_lower_case(string & s);
-  bool equal_no_case(string s1, string s2);
-
-  inline string ToUtf8(wstring const & wstr)
+  bool IsLast() const  // true if a current token exists and no further token follows it
   {
-    string result;
-    utf8::unchecked::utf16to8(wstr.begin(), wstr.end(), back_inserter(result));
-    return result;
+    if (!*this)
+      return false;
+    TokenizeIterator<DelimFuncT> copy(*this);  // look ahead on a copy so *this is not advanced
+    ++copy;
+    return !copy;
   }
+};
 
-  inline wstring FromUtf8(string const & str)
+class SimpleDelimiter  // delimiter predicate built from a list of utf-8 characters
+{
+  UniString m_delims;  // the delimiters, stored as decoded code points
+public:
+  SimpleDelimiter(char const * delimChars);  // intentionally not explicit: Tokenize() passes a char const * as the delimiter
+  /// @return true if c is one of the delimiters given to the constructor
+  bool operator()(UniChar c) const;
+};
+
+typedef TokenizeIterator<SimpleDelimiter> SimpleTokenizer;
+
+/// Calls f(token) for every non-empty token of str; delims lists the utf-8 delimiter characters.
+template <typename FunctorT>
+void Tokenize(string const & str, char const * delims, FunctorT f)
+{
+  for (SimpleTokenizer token(str, delims); token; ++token)
   {
-    wstring result;
-    utf8::unchecked::utf8to16(str.begin(), str.end(), back_inserter(result));
-    return result;
-  }
-
-  template <typename ItT, typename DelimiterT>
-  typename ItT::value_type JoinStrings(ItT begin, ItT end, DelimiterT const & delimiter)
-  {
-    typedef typename ItT::value_type StringT;
-
-    if (begin == end) return StringT();
-
-    StringT result = *begin++;
-    for (ItT it = begin; it != end; ++it)
-    {
-      result += delimiter;
-      result += *it;
-    }
-
-    return result;
-  }
-
-  template <typename ContainerT, typename DelimiterT>
-  typename ContainerT::value_type JoinStrings(ContainerT const & container,
-                                              DelimiterT const & delimiter)
-  {
-    return JoinStrings(container.begin(), container.end(), delimiter);
-  }
-
-  inline bool IsPrefixOf(string const & s1, string const & s2)
-  {
-    if (s1.size() > s2.size()) return false;
-
-    for (size_t i = 0; i < s1.size(); ++i)
-    {
-      if (s1[i] != s2[i]) return false;
-    }
-
-    return true;
+    // f is taken by value and receives each token as a temporary string.
+    f(*token);
   }
 }
+
+template <class T, size_t N, class TT> bool IsInArray(T (&arr) [N], TT const & t)
+{
+  size_t idx = 0;  // ends at the position of the first element equal to t, or at N if there is none
+  while (idx < N && !(arr[idx] == t)) ++idx;
+  return idx < N;
+}
+
+bool to_int(char const * s, int & i);
+bool to_uint64(char const * s, uint64_t & i);
+bool to_int64(char const * s, int64_t & i);
+bool to_double(char const * s, double & d);
+
+/// Converts i to text with operator<< on an ostringstream in its default state.
+template <class T> string to_string(T i)
+{
+  ostringstream out;
+  out << i;
+  return out.str();
+}
+
+inline bool to_int(string const & s, int & i) { return to_int(s.c_str(), i); }  // forwards to the char const * overload
+inline bool to_uint64(string const & s, uint64_t & i) { return to_uint64(s.c_str(), i); }  // forwards to the char const * overload
+inline bool to_int64(string const & s, int64_t & i) { return to_int64(s.c_str(), i); }  // forwards to the char const * overload
+inline bool to_double(string const & s, double & d) { return to_double(s.c_str(), d); }  // forwards to the char const * overload
+
+void make_lower_case(string & s);
+bool equal_no_case(string s1, string s2);
+
+inline string ToUtf8(wstring const & wstr)
+{
+  string utf8Bytes;  // utf16to8 reads wstr as utf-16 and appends the utf-8 encoding here
+  utf8::unchecked::utf16to8(wstr.begin(), wstr.end(), back_inserter(utf8Bytes));
+  return utf8Bytes;
+}
+
+inline wstring FromUtf8(string const & str)
+{
+  wstring utf16Units;  // utf8to16 reads str as utf-8 and appends the utf-16 code units here
+  utf8::unchecked::utf8to16(str.begin(), str.end(), back_inserter(utf16Units));
+  return utf16Units;
+}
+
+/// Joins [begin, end) with delimiter between neighbours; an empty range gives a default-constructed value.
+template <typename ItT, typename DelimiterT>
+typename ItT::value_type JoinStrings(ItT begin, ItT end, DelimiterT const & delimiter)
+{
+  typedef typename ItT::value_type StringT;
+  if (begin == end)
+    return StringT();
+
+  StringT joined = *begin;
+  while (++begin != end)
+  {
+    joined += delimiter;
+    joined += *begin;
+  }
+  return joined;
+}
+
+template <typename ContainerT, typename DelimiterT>
+typename ContainerT::value_type JoinStrings(ContainerT const & container,
+                                            DelimiterT const & delimiter)
+{
+  return JoinStrings(container.begin(), container.end(), delimiter);  // whole-container form of the iterator overload
+}
+
+}
diff --git a/generator/borders_loader.cpp b/generator/borders_loader.cpp
index cf0165fa66..e372af6afc 100644
--- a/generator/borders_loader.cpp
+++ b/generator/borders_loader.cpp
@@ -69,7 +69,7 @@ namespace borders
       m2::RectD rect;
 
       PolygonLoader loader(baseDir, simplifyCountriesLevel, country, rect);
-      strings::TokenizeString(line, "|", loader);
+      strings::Tokenize(line, "|", loader);
       if (!country.m_regions.IsEmpty())
         countries.Add(country, rect);
     }
diff --git a/generator/osm2type.cpp b/generator/osm2type.cpp
index 64dcda2166..a0ef846f74 100644
--- a/generator/osm2type.cpp
+++ b/generator/osm2type.cpp
@@ -30,11 +30,11 @@ namespace ftype {
       static char const * aTrue[] = { "yes", "true", "1", "*" };
       static char const * aFalse[] = { "no", "false", "-1" };
 
-      string_utils::TokenizeIterator it(v, "|");
-      while (!it.end())
+      strings::SimpleTokenizer it(v, "|");
+      while (it)
       {
-        if (string_utils::IsInArray(aTrue, *it)) return 1;
-        if (string_utils::IsInArray(aFalse, *it)) return -1;
+        if (strings::IsInArray(aTrue, *it)) return 1;
+        if (strings::IsInArray(aFalse, *it)) return -1;
         ++it;
       }
 
@@ -71,7 +71,7 @@ namespace ftype {
       {
         static char const * rules[] = { "line", "tunnel", "area", "symbol", "caption", "text",
                                         "circle", "pathText", "wayMarker" };
-        return string_utils::IsInArray(rules, e);
+        return strings::IsInArray(rules, e);
       }
 
       uint8_t get_rule_type()
@@ -87,9 +87,9 @@ namespace ftype {
         }
         ASSERT ( !e.empty(), () );
 
-        string_utils::TokenizeIterator it(e, "|");
+        strings::SimpleTokenizer it(e, "|");
         uint8_t ret = 0;
-        while (!it.end())
+        while (it)
         {
           string const & s = *it;
           if (s == "node")
@@ -110,7 +110,7 @@ namespace ftype {
           // addclass appear in small scales (6-11)
           // don't skip it during parsing, but we don't process it like a rule
                                         "addclass" };
-        return (string_utils::IsInArray(elems, e) || is_draw_rule(e));
+        return (strings::IsInArray(elems, e) || is_draw_rule(e));
       }
 
       /// check if it's processing key
@@ -118,7 +118,7 @@ namespace ftype {
       {
         static char const * bad[] = { "osmarender:render", "osmarender:rendername",
                                       "osmarender:renderref", "addr:housenumber" };
-        return (!k.empty() && !string_utils::IsInArray(bad, k));
+        return (!k.empty() && !strings::IsInArray(bad, k));
       }
 
       static bool is_valid_value(string const & v)
@@ -132,13 +132,13 @@ namespace ftype {
         static char const * mark[] = {  "bridge", "tunnel", "area", "lock", "oneway", "junction",
                                         "embankment", "cutting", "motorroad", "cycleway",
                                         "bicycle", "horse", "capital", "fee" };
-        return string_utils::IsInArray(mark, k);
+        return strings::IsInArray(mark, k);
       }
 
       static bool process_feature_like_mark_from_root(string const & /*k*/, string const & v)
       {
         static char const * mark[] = { "turning_circle", "dyke", "dike", "levee", "embankment" };
-        return string_utils::IsInArray(mark, v);
+        return strings::IsInArray(mark, v);
       }
 
       static bool process_feature_like_mark(string const & k, string const & v)
@@ -150,7 +150,7 @@ namespace ftype {
       static bool is_skip_element_by_key(string const & k)
       {
         static char const * skip[] = { "addr:housenumber", "fixme" };
-        return string_utils::IsInArray(skip, k);
+        return strings::IsInArray(skip, k);
       }
 
       /// skip element and all it's sub-elements
@@ -176,8 +176,8 @@ namespace ftype {
       void AddAttr(string name, string value)
       {
         // make lower case for equivalent string comparison
-        string_utils::make_lower_case(name);
-        string_utils::make_lower_case(value);
+        strings::make_lower_case(name);
+        strings::make_lower_case(value);
 
         if ((name == "k") && is_skip_element_by_key(value))
           m_forceSkip = true;
@@ -261,8 +261,8 @@ namespace ftype {
             string v = e.attr["v"];
             if (!is_valid_value(v)) continue;
 
-            string_utils::TokenizeIterator iK(k, "|");
-            if (iK.is_last())
+            strings::SimpleTokenizer iK(k, "|");
+            if (iK.IsLast())
             {
               // process one key
               ASSERT ( *iK == k, () );
@@ -324,8 +324,8 @@ namespace ftype {
                   }
 
                   // process values
-                  string_utils::TokenizeIterator iV(v, "|");
-                  while (!iV.end())
+                  strings::SimpleTokenizer iV(v, "|");
+                  while (iV)
                   {
                     bool const b1 = process_feature_like_mark_from_root(k, *iV);
                     if (b1 || process_feature_like_mark(k, *iV))
@@ -355,18 +355,18 @@ namespace ftype {
             {
               char const * aTry[] = { "natural", "landuse" };
 
-              while (!iK.end())
+              while (iK)
               {
                 // let's try to add root keys
-                bool addMode = (pParent == get_root() && string_utils::IsInArray(aTry, *iK));
+                bool addMode = (pParent == get_root() && strings::IsInArray(aTry, *iK));
 
                 ClassifObject * p = (addMode ? pParent->Add(*iK) : pParent->Find(*iK));
                 if (p && (get_mark_value(*iK, v) == 0))
                 {
                   if (p->IsCriterion()) p = pParent;
 
-                  string_utils::TokenizeIterator iV(v, "|");
-                  while (!iV.end())
+                  strings::SimpleTokenizer iV(v, "|");
+                  while (iV)
                   {
                     ClassifObject * pp = (addMode ? p->Add(*iV) : p->Find(*iV));
                     if (pp)
@@ -525,7 +525,7 @@ namespace ftype {
 
         // get names
         string lang;
-        string_utils::TokenizeString(k, "\t :", get_lang(lang));
+        strings::Tokenize(k, "\t :", get_lang(lang));
         if (!lang.empty())
           m_params.name.AddString(lang, v);
 
@@ -552,7 +552,7 @@ namespace ftype {
         if (k == "population")
         {
           int n;
-          if (string_utils::to_int(v, n))
+          if (strings::to_int(v, n))
             m_params.rank = static_cast<uint8_t>(log(double(n)) / log(1.1));
         }
 
diff --git a/indexer/drawing_rule_def.cpp b/indexer/drawing_rule_def.cpp
index aefcbc6d26..f4da5f5f6c 100644
--- a/indexer/drawing_rule_def.cpp
+++ b/indexer/drawing_rule_def.cpp
@@ -23,9 +23,9 @@ namespace drule
   {
     int * arrParams[] = { &m_scale, &m_type, &m_index, &m_priority };
 
-    strings::TokenizeIterator it(s, "|");
+    strings::SimpleTokenizer it(s, "|");
     size_t i = 0;
-    while (!it.end())
+    while (it)
     {
       ASSERT ( i < ARRAY_SIZE(arrParams), (i) );
 
diff --git a/indexer/drawing_rules.cpp b/indexer/drawing_rules.cpp
index 83e241fc39..ac2b9f709b 100644
--- a/indexer/drawing_rules.cpp
+++ b/indexer/drawing_rules.cpp
@@ -213,7 +213,7 @@ namespace drule {
   template <> dash_array_t get_value<dash_array_t>(string const & s)
   {
     dash_array_t ret;
-    strings::TokenizeString(s, " \tpx,", bind(&dash_array_t::add, ref(ret), _1));
+    strings::Tokenize(s, " \tpx,", bind(&dash_array_t::add, ref(ret), _1));
 
     /// @see http://www.w3.org/TR/SVG/painting.html stroke-dasharray
     size_t const count = ret.m_v.size();
@@ -866,7 +866,7 @@ Key RulesHolder::CreateRuleImpl1(string const & name,
 #endif
 
   attrs_map_t a;
-  strings::TokenizeString(clValue, " \t", bind(&RulesHolder::PushAttributes, this, _1, ref(a)));
+  strings::Tokenize(clValue, " \t", bind(&RulesHolder::PushAttributes, this, _1, ref(a)));
 
   for (attrs_map_t::const_iterator i = attrs.begin(); i != attrs.end(); ++i)
     if (!strings::IsInArray(arrClassTags, i->first))
diff --git a/map/languages.cpp b/map/languages.cpp
index effba1a417..33ab59c188 100644
--- a/map/languages.cpp
+++ b/map/languages.cpp
@@ -99,7 +99,7 @@ namespace languages
 
     CodesT currentCodes;
     Collector c(currentCodes);
-    strings::TokenizeString(settingsString, LANG_DELIMETER, c);
+    strings::Tokenize(settingsString, LANG_DELIMETER, c);
 
     GetSupportedLanguages(outLanguages);
     Sort(currentCodes, outLanguages);