From cd9242df500e2de8044f3a34950510c8811569e3 Mon Sep 17 00:00:00 2001
From: Alex Zolotarev
Date: Mon, 16 May 2011 21:48:57 +0200
Subject: [PATCH] Added base/utf8_string.hpp::Split()

---
 base/base.pro                        |  2 +
 base/base_tests/base_tests.pro       |  1 +
 base/base_tests/utf8_string_test.cpp | 49 +++++++++++++++++++++
 base/utf8_string.cpp                 | 71 ++++++++++++++++++++++++++++++
 base/utf8_string.hpp                 | 18 ++++++++
 std/iterator.hpp                     |  1 +
 6 files changed, 142 insertions(+)
 create mode 100644 base/base_tests/utf8_string_test.cpp
 create mode 100644 base/utf8_string.cpp
 create mode 100644 base/utf8_string.hpp

diff --git a/base/base.pro b/base/base.pro
index ee82058bd1..3cc6a9c79c 100644
--- a/base/base.pro
+++ b/base/base.pro
@@ -20,6 +20,7 @@ SOURCES += \
   memory_mapped_file.cpp \
   path_utils.cpp \
   condition.cpp \
+  utf8_string.cpp \
 
 HEADERS += \
   SRC_FIRST.hpp \
@@ -61,3 +62,4 @@ HEADERS += \
   buffer_vector.hpp \
   path_utils.hpp \
   array_adapters.hpp \
+  utf8_string.hpp \
diff --git a/base/base_tests/base_tests.pro b/base/base_tests/base_tests.pro
index 8c66fff5b7..6d98e5aee5 100644
--- a/base/base_tests/base_tests.pro
+++ b/base/base_tests/base_tests.pro
@@ -29,5 +29,6 @@ SOURCES += \
   matrix_test.cpp \
   commands_queue_test.cpp \
   buffer_vector_test.cpp \
+  utf8_string_test.cpp \
 
 HEADERS +=
diff --git a/base/base_tests/utf8_string_test.cpp b/base/base_tests/utf8_string_test.cpp
new file mode 100644
index 0000000000..c0282a1413
--- /dev/null
+++ b/base/base_tests/utf8_string_test.cpp
@@ -0,0 +1,49 @@
+#include "../../testing/testing.hpp"
+
+#include "../utf8_string.hpp"
+
+using namespace utf8_string;
+
+// Test-only delimiter predicate: a few ASCII separators plus U+0336.
+bool IsDelimeter(uint32_t symbol)
+{
+  switch (symbol)
+  {
+  case ' ':
+  case '-':
+  case '/':
+  case ',':
+  case '.':
+  case 0x0336:
+    return true;
+  }
+  return false;
+}
+
+UNIT_TEST(Utf8_Split)
+{
+  vector<string> result;
+  // Empty and delimiter-only inputs yield no words.
+  TEST(!Split("", result, &IsDelimeter), ());
+  TEST_EQUAL(result.size(), 0, ());
+
+  TEST(!Split(" - ,. ", result, &IsDelimeter), ());
+  TEST_EQUAL(result.size(), 0, ());
+
+  TEST(Split("London - is the capital of babai-city.", result, &IsDelimeter), ());
+  TEST_EQUAL(result.size(), 7, ());
+  TEST_EQUAL(result[0], "London", ());
+  TEST_EQUAL(result[6], "city", ());
+
+  // UTF-8 bytes of a Russian phrase ("The dollar went up by 500 roubles kopecks")
+  // with U+0336 (combining long stroke overlay) in front of the last word.
+  char const * s = "- \xD0\x94\xD0\xBE\xD0\xBB\xD0\xBB\xD0\xB0\xD1\x80\x20\xD0\xBF\xD0\xBE\xD0\xB4\xD0"
+                   "\xBE\xD1\x80\xD0\xBE\xD0\xB6\xD0\xB0\xD0\xBB\x20\xD0\xBD\xD0\xB0\x20\x35\x30\x30"
+                   "\x20\xD1\x80\xD1\x83\xD0\xB1\xD0\xBB\xD0\xB5\xD0\xB9\x20\xCC\xB6\xD0\xBA\xD0\xBE"
+                   "\xD0\xBF\xD0\xB5\xD0\xB5\xD0\xBA -";
+  TEST(Split(s, result, &IsDelimeter), ());
+  TEST_EQUAL(result.size(), 6, ());
+  TEST_EQUAL(result[3], "500", ());
+  TEST_EQUAL(result[4], "\xD1\x80\xD1\x83\xD0\xB1\xD0\xBB\xD0\xB5\xD0\xB9", ());
+  TEST_EQUAL(result[5], "\xD0\xBA\xD0\xBE\xD0\xBF\xD0\xB5\xD0\xB5\xD0\xBA", ());
+}
diff --git a/base/utf8_string.cpp b/base/utf8_string.cpp
new file mode 100644
index 0000000000..53c9a5d211
--- /dev/null
+++ b/base/utf8_string.cpp
@@ -0,0 +1,71 @@
+#include "utf8_string.hpp"
+
+#include "../std/iterator.hpp"
+
+#include "../3party/utfcpp/source/utf8/unchecked.h"
+
+namespace utf8_string
+{
+  /// Splits @p str into delimiter-separated words (see utf8_string.hpp).
+  /// NOTE(review): decoding goes through utf8::unchecked, so @p str is presumably
+  /// expected to be well-formed UTF-8 - TODO confirm that callers guarantee it.
+  bool Split(string const & str, vector<string> & out, IsDelimiterFuncT f)
+  {
+    out.clear();
+    string::const_iterator curr = str.begin();
+    string::const_iterator end = str.end();
+    string word;
+    back_insert_iterator<string> inserter = back_inserter(word);
+    while (curr != end)
+    {
+      // Decode one code point and advance curr past it.
+      uint32_t symbol = ::utf8::unchecked::next(curr);
+      if (f(symbol))
+      {
+        // A delimiter closes the current word; runs of delimiters add nothing.
+        if (!word.empty())
+        {
+          out.push_back(word);
+          word.clear();
+          inserter = back_inserter(word);
+        }
+      }
+      else
+      {
+        // Re-encode the code point as UTF-8 at the end of the current word.
+        inserter = utf8::unchecked::append(symbol, inserter);
+      }
+    }
+    // Flush the trailing word when the string does not end with a delimiter.
+    if (!word.empty())
+      out.push_back(word);
+    return !out.empty();
+  }
+
+  /// @return true for code points that separate words in search queries.
+  bool IsSearchDelimiter(uint32_t symbol)
+  {
+    // Fast path: every ASCII character in [0x20, 0x2F] is a delimiter.
+    if (symbol >= ' ' && symbol < '0')
+      return true;
+
+    switch (symbol)
+    {
+    case ':':
+    case ';':
+    case '[':
+    case ']':
+    case '\\':
+    case '^':
+    case '_':
+    case '`':
+    case '{':
+    case '}':
+    case '|':
+    case '~':
+    case 0x0336:
+      return true;
+    }
+    return false;
+  }
+}
diff --git a/base/utf8_string.hpp b/base/utf8_string.hpp
new file mode 100644
index 0000000000..d1239bd35d
--- /dev/null
+++ b/base/utf8_string.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../std/string.hpp"
+#include "../std/vector.hpp"
+
+namespace utf8_string
+{
+  /// Predicate deciding whether a decoded code point separates words.
+  typedef bool (*IsDelimiterFuncT)(uint32_t);
+  /// Default delimiter set intended for search: ASCII 0x20..0x2F, a handful
+  /// of other ASCII punctuation marks and U+0336.
+  bool IsSearchDelimiter(uint32_t symbol);
+  /// Splits UTF-8 encoded @p str into words separated by code points for
+  /// which @p f returns true. @p out is cleared first and then receives the
+  /// non-empty words in their original order.
+  /// @return true if at least one word was found.
+  bool Split(string const & str, vector<string> & out, IsDelimiterFuncT f = &IsSearchDelimiter);
+}
diff --git a/std/iterator.hpp b/std/iterator.hpp
index 11a3a60dff..bb7436ce6f 100644
--- a/std/iterator.hpp
+++ b/std/iterator.hpp
@@ -12,6 +12,7 @@ using std::distance;
 using std::iterator_traits;
 using std::istream_iterator;
 using std::insert_iterator;
+using std::back_insert_iterator;
 
 #ifdef DEBUG_NEW
 #define new DEBUG_NEW