forked from organicmaps/organicmaps
Added base/utf8_string.hpp::Split()
This commit is contained in:
parent
f3a38af8c4
commit
cd9242df50
6 changed files with 125 additions and 0 deletions
|
@ -20,6 +20,7 @@ SOURCES += \
|
|||
memory_mapped_file.cpp \
|
||||
path_utils.cpp \
|
||||
condition.cpp \
|
||||
utf8_string.cpp \
|
||||
|
||||
HEADERS += \
|
||||
SRC_FIRST.hpp \
|
||||
|
@ -61,3 +62,4 @@ HEADERS += \
|
|||
buffer_vector.hpp \
|
||||
path_utils.hpp \
|
||||
array_adapters.hpp \
|
||||
utf8_string.hpp \
|
||||
|
|
|
@ -29,5 +29,6 @@ SOURCES += \
|
|||
matrix_test.cpp \
|
||||
commands_queue_test.cpp \
|
||||
buffer_vector_test.cpp \
|
||||
utf8_string_test.cpp \
|
||||
|
||||
HEADERS +=
|
||||
|
|
46
base/base_tests/utf8_string_test.cpp
Normal file
46
base/base_tests/utf8_string_test.cpp
Normal file
|
@ -0,0 +1,46 @@
|
|||
#include "../../testing/testing.hpp"
|
||||
|
||||
#include "../utf8_string.hpp"
|
||||
|
||||
using namespace utf8_string;
|
||||
|
||||
/// Delimiter predicate used only by the tests in this file.
///
/// Declared static so that it has internal linkage and cannot clash with a
/// same-named helper in another test source linked into the same binary.
///
/// @param symbol  Unicode code point to classify.
/// @return true for space, '-', '/', ',', '.' and U+0336
///         (COMBINING LONG STROKE OVERLAY); false for everything else.
static bool IsDelimeter(uint32_t symbol)
{
  switch (symbol)
  {
  case ' ':
  case '-':
  case '/':
  case ',':
  case '.':
  case 0x0336:  // combining long stroke overlay
    return true;
  default:
    return false;
  }
}
|
||||
|
||||
/// Checks Split() with the IsDelimeter predicate on empty, delimiter-only,
/// plain ASCII and multi-byte (Cyrillic) inputs.
UNIT_TEST(Utf8_Split)
{
  vector<string> tokens;

  // Empty input yields no tokens and a false result.
  TEST(!Split("", tokens, &IsDelimeter), ());
  TEST_EQUAL(tokens.size(), 0, ());

  // Input made of delimiters only behaves like empty input.
  TEST(!Split(" - ,. ", tokens, &IsDelimeter), ());
  TEST_EQUAL(tokens.size(), 0, ());

  // ASCII sentence: runs of delimiters collapse, '-' and '.' split words.
  TEST(Split("London - is the capital of babai-city.", tokens, &IsDelimeter), ());
  TEST_EQUAL(tokens.size(), 7, ());
  TEST_EQUAL(tokens[0], "London", ());
  TEST_EQUAL(tokens[6], "city", ());

  // UTF-8 bytes of a Russian phrase meaning "The dollar rose by 500 rubles
  // kopecks", wrapped in "- " and " -". The last word is preceded by U+0336
  // (bytes CC B6), which IsDelimeter treats as a delimiter.
  char const * const phrase = "- \xD0\x94\xD0\xBE\xD0\xBB\xD0\xBB\xD0\xB0\xD1\x80\x20\xD0\xBF\xD0\xBE\xD0\xB4\xD0"
  "\xBE\xD1\x80\xD0\xBE\xD0\xB6\xD0\xB0\xD0\xBB\x20\xD0\xBD\xD0\xB0\x20\x35\x30\x30"
  "\x20\xD1\x80\xD1\x83\xD0\xB1\xD0\xBB\xD0\xB5\xD0\xB9\x20\xCC\xB6\xD0\xBA\xD0\xBE"
  "\xD0\xBF\xD0\xB5\xD0\xB5\xD0\xBA -";
  TEST(Split(phrase, tokens, &IsDelimeter), ());
  TEST_EQUAL(tokens.size(), 6, ());
  TEST_EQUAL(tokens[3], "500", ());
  TEST_EQUAL(tokens[4], "\xD1\x80\xD1\x83\xD0\xB1\xD0\xBB\xD0\xB5\xD0\xB9", ());
  TEST_EQUAL(tokens[5], "\xD0\xBA\xD0\xBE\xD0\xBF\xD0\xB5\xD0\xB5\xD0\xBA", ());
}
|
63
base/utf8_string.cpp
Normal file
63
base/utf8_string.cpp
Normal file
|
@ -0,0 +1,63 @@
|
|||
#include "utf8_string.hpp"
|
||||
|
||||
#include "../std/iterator.hpp"
|
||||
|
||||
#include "../3party/utfcpp/source/utf8/unchecked.h"
|
||||
|
||||
namespace utf8_string
|
||||
{
|
||||
/// Splits a UTF-8 string into tokens separated by delimiter code points.
///
/// The input is decoded with the unchecked utfcpp decoder, so it is expected
/// to be well-formed UTF-8; malformed input is not detected here.
///
/// @param str  Text to split.
/// @param out  Receives the tokens in order of appearance; any previous
///             content is discarded. Empty tokens are never stored.
/// @param f    Predicate invoked for every decoded code point; a true result
///             marks the code point as a delimiter (it is dropped).
/// @return true if at least one token was stored in @p out.
bool Split(string const & str, vector<string> & out, IsDelimiterFuncT f)
{
  out.clear();

  string token;
  string::const_iterator pos = str.begin();
  string::const_iterator const last = str.end();
  while (pos != last)
  {
    // Decodes one code point and advances pos past its bytes.
    uint32_t const codePoint = ::utf8::unchecked::next(pos);

    if (!f(codePoint))
    {
      // Regular symbol: re-encode it at the tail of the current token.
      utf8::unchecked::append(codePoint, back_inserter(token));
      continue;
    }

    // Delimiter: consecutive delimiters produce no empty tokens.
    if (token.empty())
      continue;

    out.push_back(token);
    token.clear();
  }

  // Flush the trailing token when the string does not end with a delimiter.
  if (!token.empty())
    out.push_back(token);

  return !out.empty();
}
|
||||
|
||||
/// Default delimiter predicate for splitting search queries.
///
/// Every ASCII code point that is not a letter or a digit is a delimiter.
/// This covers the space and punctuation the function always handled, and
/// additionally tab, newline and the other control characters, the
/// '<' '=' '>' '?' '@' group and DEL, which previously stayed glued to the
/// neighbouring words.
/// Outside ASCII only U+0336 (COMBINING LONG STROKE OVERLAY) is a delimiter.
///
/// @param symbol  Unicode code point to classify.
/// @return true if @p symbol separates words, false otherwise.
bool IsSearchDelimiter(uint32_t symbol)
{
  // ASCII fast path: anything that is not alphanumeric separates words.
  if (symbol < 0x80)
  {
    bool const isDigit = symbol >= '0' && symbol <= '9';
    bool const isUpper = symbol >= 'A' && symbol <= 'Z';
    bool const isLower = symbol >= 'a' && symbol <= 'z';
    return !(isDigit || isUpper || isLower);
  }

  // Combining long stroke overlay (used to strike text through).
  return symbol == 0x0336;
}
|
||||
}
|
12
base/utf8_string.hpp
Normal file
12
base/utf8_string.hpp
Normal file
|
@ -0,0 +1,12 @@
|
|||
#pragma once
|
||||
|
||||
#include "../std/string.hpp"
|
||||
#include "../std/vector.hpp"
|
||||
|
||||
/// Helpers for splitting UTF-8 encoded strings into tokens.
namespace utf8_string
|
||||
{
|
||||
/// Predicate that Split() calls with each decoded code point;
/// a true result marks that code point as a delimiter.
typedef bool (*IsDelimiterFuncT)(uint32_t);
|
||||
/// Delimiters optimal for search.
|
||||
bool IsSearchDelimiter(uint32_t symbol);
|
||||
/// Splits @p str into tokens separated by code points accepted by @p f.
/// @p out is cleared first and never receives empty tokens.
/// @return true if at least one token was stored in @p out.
bool Split(string const & str, vector<string> & out, IsDelimiterFuncT f = &IsSearchDelimiter);
|
||||
}
|
|
@ -12,6 +12,7 @@ using std::distance;
|
|||
using std::iterator_traits;
|
||||
using std::istream_iterator;
|
||||
using std::insert_iterator;
|
||||
using std::back_insert_iterator;
|
||||
|
||||
#ifdef DEBUG_NEW
|
||||
#define new DEBUG_NEW
|
||||
|
|
Loading…
Add table
Reference in a new issue