Added base/utf8_string.hpp::Split()

This commit is contained in:
Alex Zolotarev 2011-05-16 21:48:57 +02:00 committed by Alex Zolotarev
parent f3a38af8c4
commit cd9242df50
6 changed files with 125 additions and 0 deletions

View file

@ -20,6 +20,7 @@ SOURCES += \
memory_mapped_file.cpp \
path_utils.cpp \
condition.cpp \
utf8_string.cpp \
HEADERS += \
SRC_FIRST.hpp \
@ -61,3 +62,4 @@ HEADERS += \
buffer_vector.hpp \
path_utils.hpp \
array_adapters.hpp \
utf8_string.hpp \

View file

@ -29,5 +29,6 @@ SOURCES += \
matrix_test.cpp \
commands_queue_test.cpp \
buffer_vector_test.cpp \
utf8_string_test.cpp \
HEADERS +=

View file

@ -0,0 +1,46 @@
#include "../../testing/testing.hpp"
#include "../utf8_string.hpp"
using namespace utf8_string;
/// Test-only delimiter predicate: space, '-', '/', ',', '.' and
/// U+0336 (COMBINING LONG STROKE OVERLAY) separate words; everything else is word content.
bool IsDelimeter(uint32_t symbol)
{
  return symbol == ' '
      || symbol == '-'
      || symbol == '/'
      || symbol == ','
      || symbol == '.'
      || symbol == 0x0336;
}
UNIT_TEST(Utf8_Split)
{
  // Reused across all cases below; Split() clears it on every call.
  vector<string> words;

  // Empty input: no words, Split returns false.
  TEST(!Split("", words, &IsDelimeter), ());
  TEST_EQUAL(words.size(), 0, ());

  // Input made only of delimiters: still no words.
  TEST(!Split(" - ,. ", words, &IsDelimeter), ());
  TEST_EQUAL(words.size(), 0, ());

  // ASCII sentence with runs of delimiters, a hyphenated word and a trailing dot.
  TEST(Split("London - is the capital of babai-city.", words, &IsDelimeter), ());
  TEST_EQUAL(words.size(), 7, ());
  TEST_EQUAL(words[0], "London", ());
  TEST_EQUAL(words[6], "city", ());

  // UTF-8 encoded Russian sentence (roughly "The dollar went up by 500 rubles kopecks"),
  // wrapped in "- " / " -" and with U+0336 (\xCC\xB6) placed right before the last word,
  // so multi-byte letters and a multi-byte delimiter are both exercised.
  char const * text = "- \xD0\x94\xD0\xBE\xD0\xBB\xD0\xBB\xD0\xB0\xD1\x80\x20\xD0\xBF\xD0\xBE\xD0\xB4\xD0"
  "\xBE\xD1\x80\xD0\xBE\xD0\xB6\xD0\xB0\xD0\xBB\x20\xD0\xBD\xD0\xB0\x20\x35\x30\x30"
  "\x20\xD1\x80\xD1\x83\xD0\xB1\xD0\xBB\xD0\xB5\xD0\xB9\x20\xCC\xB6\xD0\xBA\xD0\xBE"
  "\xD0\xBF\xD0\xB5\xD0\xB5\xD0\xBA -";
  TEST(Split(text, words, &IsDelimeter), ());
  TEST_EQUAL(words.size(), 6, ());
  TEST_EQUAL(words[3], "500", ());
  TEST_EQUAL(words[4], "\xD1\x80\xD1\x83\xD0\xB1\xD0\xBB\xD0\xB5\xD0\xB9", ());
  TEST_EQUAL(words[5], "\xD0\xBA\xD0\xBE\xD0\xBF\xD0\xB5\xD0\xB5\xD0\xBA", ());
}

63
base/utf8_string.cpp Normal file
View file

@ -0,0 +1,63 @@
#include "utf8_string.hpp"
#include "../std/iterator.hpp"
#include "../3party/utfcpp/source/utf8/unchecked.h"
namespace utf8_string
{
/// Number of bytes the UTF-8 sequence introduced by @p lead claims to occupy.
/// Only the 2-, 3- and 4-byte lead patterns are recognised; any other byte
/// (ASCII, stray continuation byte, invalid lead) is counted as a single byte,
/// so the truncation guard in Split() never rejects it.
static int Utf8SequenceLength(unsigned char lead)
{
  if ((lead & 0xE0) == 0xC0)
    return 2;
  if ((lead & 0xF0) == 0xE0)
    return 3;
  if ((lead & 0xF8) == 0xF0)
    return 4;
  return 1;
}

/// Splits UTF-8 encoded @p str into words separated by delimiter code points.
///
/// @param str  Input text, decoded code point by code point.
/// @param out  Receives the words in order of appearance; cleared first.
///             Empty words (between consecutive delimiters) are never stored.
/// @param f    Predicate called for every decoded code point; returns true for delimiters.
/// @return true if at least one word was produced.
///
/// A multi-byte sequence that is cut off by the end of @p str is dropped
/// instead of being decoded, because decoding it would read past the string.
bool Split(string const & str, vector<string> & out, IsDelimiterFuncT f)
{
  out.clear();

  string::const_iterator curr = str.begin();
  string::const_iterator const end = str.end();

  string word;
  while (curr != end)
  {
    // unchecked::next() trusts the lead byte and does no bounds checking; on a
    // truncated tail it would step curr beyond end and the loop would never stop.
    if (end - curr < Utf8SequenceLength(static_cast<unsigned char>(*curr)))
      break;

    uint32_t const symbol = ::utf8::unchecked::next(curr);
    if (f(symbol))
    {
      // A delimiter closes the current word, if there is one.
      if (!word.empty())
      {
        out.push_back(word);
        word.clear();
      }
    }
    else
    {
      // Re-encode the code point as UTF-8 at the end of the current word.
      ::utf8::unchecked::append(symbol, back_inserter(word));
    }
  }

  // Flush the last word when the input does not end with a delimiter.
  if (!word.empty())
    out.push_back(word);

  return !out.empty();
}
/// Default delimiter predicate for Split().
/// Returns true for ASCII 0x20..0x2F (space through '/'), for ':' and ';',
/// for '[' '\\' ']' '^' '_' '`', for '{' '|' '}' '~', and for U+0336.
/// Digits, letters, '<' '=' '>' '?' '@' and all other code points are not delimiters.
bool IsSearchDelimiter(uint32_t symbol)
{
  // The ASCII delimiters form contiguous runs, so they are tested as ranges.
  bool const spaceToSlash = symbol >= ' ' && symbol < '0';
  bool const colonOrSemicolon = symbol == ':' || symbol == ';';
  bool const bracketToBacktick = symbol >= '[' && symbol <= '`';
  bool const braceToTilde = symbol >= '{' && symbol <= '~';
  // U+0336 COMBINING LONG STROKE OVERLAY.
  bool const strokeOverlay = symbol == 0x0336;

  return spaceToSlash || colonOrSemicolon || bracketToBacktick || braceToTilde || strokeOverlay;
}
}

12
base/utf8_string.hpp Normal file
View file

@ -0,0 +1,12 @@
#pragma once
#include "../std/string.hpp"
#include "../std/vector.hpp"
/// Helpers for splitting UTF-8 encoded strings into words.
namespace utf8_string
{
/// Predicate over a decoded Unicode code point: returns true if the
/// code point should be treated as a word delimiter.
typedef bool (*IsDelimiterFuncT)(uint32_t);
/// Delimiters optimal for search: ASCII 0x20..0x2F, a fixed set of
/// ASCII punctuation characters, and U+0336.
bool IsSearchDelimiter(uint32_t symbol);
/// Splits @p str into words separated by code points for which @p f returns true.
/// @p out is cleared first and then receives the non-empty words in order.
/// @p str is decoded with utf8::unchecked, so it is expected to be valid UTF-8.
/// @return true if at least one word was found.
bool Split(string const & str, vector<string> & out, IsDelimiterFuncT f = &IsSearchDelimiter);
}

View file

@ -12,6 +12,7 @@ using std::distance;
using std::iterator_traits;
using std::istream_iterator;
using std::insert_iterator;
using std::back_insert_iterator;
#ifdef DEBUG_NEW
#define new DEBUG_NEW