mirror of
https://github.com/nemtrif/utfcpp.git
synced 2025-04-05 05:25:07 +00:00
Add append16 function
Support for appending codepoints to existing utf16 encoded strings. See #91
This commit is contained in:
parent
fffc67effd
commit
b455811a4e
8 changed files with 150 additions and 1 deletions
52
README.md
52
README.md
|
@ -219,6 +219,58 @@ Note that `append` does not allocate any memory - it is the burden of the caller
|
|||
|
||||
In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown.
|
||||
|
||||
#### utf8::append16
|
||||
|
||||
Available in version 4.0 and later. Requires a C++11 compliant compiler.
|
||||
|
||||
Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string.
|
||||
|
||||
```cpp
|
||||
void append16(utfchar32_t cp, std::u16string& s);
|
||||
```
|
||||
|
||||
`cp`: a code point to append to the string.
|
||||
`s`: a UTF-16 encoded string to append the code point to.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
std::u16string u;
|
||||
append16(0x0448, u);
|
||||
assert (u[0] == 0x0448 && u.length() == 1);
|
||||
```
|
||||
|
||||
In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown.
|
||||
|
||||
#### utf8::append16
|
||||
|
||||
Available in version 4.0 and later.
|
||||
|
||||
Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string.
|
||||
|
||||
```cpp
|
||||
template <typename word_iterator>
|
||||
word_iterator append16(utfchar32_t cp, word_iterator result);
|
||||
```
|
||||
|
||||
`word_iterator`: an output iterator.
|
||||
`cp`: a 32 bit integer representing a code point to append to the sequence.
|
||||
`result`: an output iterator to the place in the sequence where to append the code point.
|
||||
Return value: an iterator pointing to the place after the newly appended sequence.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
unsigned short u[2] = {0,0};
|
||||
unsigned short* end = append16(0x0448, u);
|
||||
assert (u[0] == 0x0448 && u[1] == 0);
|
||||
```
|
||||
|
||||
Note that `append16` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. To make things more interesting, `append16` can add either one or two words to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated.
|
||||
|
||||
In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::next
|
||||
|
||||
Available in version 1.0 and later.
|
||||
|
|
|
@ -84,6 +84,15 @@ namespace utf8
|
|||
append(cp, std::back_inserter(s));
|
||||
}
|
||||
|
||||
template <typename word_iterator>
|
||||
word_iterator append16(utfchar32_t cp, word_iterator result)
|
||||
{
|
||||
if (!utf8::internal::is_code_point_valid(cp))
|
||||
throw invalid_code_point(cp);
|
||||
|
||||
return internal::append16(cp, result);
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
|
||||
{
|
||||
|
|
|
@ -119,6 +119,11 @@ namespace internal
|
|||
return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
|
||||
}
|
||||
|
||||
inline bool is_in_bmp(utfchar32_t cp)
|
||||
{
|
||||
return cp < utfchar32_t(0x10000);
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
int sequence_length(octet_iterator lead_it)
|
||||
{
|
||||
|
@ -343,12 +348,44 @@ namespace internal
|
|||
|
||||
// The caller uses some other kind of output operator - not covered above
|
||||
// Note that in this case we are not able to determine octet_type
|
||||
// so we assume it's utfchar_8; that can cause a conversion warning if we are wrong.
|
||||
// so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong.
|
||||
template <typename octet_iterator>
|
||||
octet_iterator append(utfchar32_t cp, octet_iterator result) {
|
||||
return append<octet_iterator, utfchar8_t>(cp, result);
|
||||
}
|
||||
|
||||
// Internal implementation of both checked and unchecked append16() function
|
||||
// This function will be invoked by the overloads below, as they will know
|
||||
// the word_type.
|
||||
template <typename word_iterator, typename word_type>
|
||||
word_iterator append16(utfchar32_t cp, word_iterator result) {
|
||||
if (is_in_bmp(cp))
|
||||
*(result++) = static_cast<word_type>(cp);
|
||||
else {
|
||||
// Code points from the supplementary planes are encoded via surrogate pairs
|
||||
*(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10));
|
||||
*(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Hopefully, most common case: the caller uses back_inserter
|
||||
// i.e. append16(cp, std::back_inserter(str));
|
||||
template<typename container_type>
|
||||
std::back_insert_iterator<container_type> append16
|
||||
(utfchar32_t cp, std::back_insert_iterator<container_type> result) {
|
||||
return append16<std::back_insert_iterator<container_type>,
|
||||
typename container_type::value_type>(cp, result);
|
||||
}
|
||||
|
||||
// The caller uses some other kind of output operator - not covered above
|
||||
// Note that in this case we are not able to determine word_type
|
||||
// so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong.
|
||||
template <typename word_iterator>
|
||||
word_iterator append16(utfchar32_t cp, word_iterator result) {
|
||||
return append16<word_iterator, utfchar16_t>(cp, result);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// The library API - functions intended to be called by the users
|
||||
|
|
|
@ -32,6 +32,11 @@ DEALINGS IN THE SOFTWARE.
|
|||
|
||||
namespace utf8
|
||||
{
|
||||
inline void append16(utfchar32_t cp, std::u16string& s)
|
||||
{
|
||||
append16(cp, std::back_inserter(s));
|
||||
}
|
||||
|
||||
inline std::string utf16to8(const std::u16string& s)
|
||||
{
|
||||
std::string result;
|
||||
|
|
|
@ -40,6 +40,12 @@ namespace utf8
|
|||
return internal::append(cp, result);
|
||||
}
|
||||
|
||||
template <typename word_iterator>
|
||||
word_iterator append16(utfchar32_t cp, word_iterator result)
|
||||
{
|
||||
return internal::append16(cp, result);
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
|
||||
{
|
||||
|
|
|
@ -47,6 +47,22 @@ TEST(CheckedAPITests, test_append)
|
|||
EXPECT_EQ (c[1], 0);
|
||||
}
|
||||
|
||||
TEST(CheckedAPITests, test_append16)
|
||||
{
|
||||
utfchar16_t u[5] = {0,0};
|
||||
append16(0x0448, u);
|
||||
EXPECT_EQ (u[0], 0x0448);
|
||||
EXPECT_EQ (u[1], 0x0000);
|
||||
|
||||
append16(0x65e5, u);
|
||||
EXPECT_EQ (u[0], 0x65e5);
|
||||
EXPECT_EQ (u[1], 0x0000);
|
||||
|
||||
append16(0x10346, u);
|
||||
EXPECT_EQ (u[0], 0xd800);
|
||||
EXPECT_EQ (u[1], 0xdf46);
|
||||
}
|
||||
|
||||
TEST(CheckedAPITests, test_next)
|
||||
{
|
||||
const char* twochars = "\xe6\x97\xa5\xd1\x88";
|
||||
|
|
|
@ -37,6 +37,14 @@ TEST(CPP11APITests, test_append)
|
|||
EXPECT_EQ (u.length(), 4);
|
||||
}
|
||||
|
||||
TEST(CPP11APITests, test_append16)
|
||||
{
|
||||
u16string u;
|
||||
append16(0x0448, u);
|
||||
EXPECT_EQ (u[0], char16_t(0x0448));
|
||||
EXPECT_EQ (u.length(), 1);
|
||||
}
|
||||
|
||||
TEST(CPP11APITests, test_utf16to8)
|
||||
{
|
||||
u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
|
||||
|
|
|
@ -40,6 +40,22 @@ TEST(UnCheckedAPITests, test_append)
|
|||
EXPECT_EQ (u[4], 0);
|
||||
}
|
||||
|
||||
TEST(UnCheckedAPITests, test_append16)
|
||||
{
|
||||
unsigned short u[5] = {0,0};
|
||||
utf8::unchecked::append16(0x0448, u);
|
||||
EXPECT_EQ (u[0], 0x0448);
|
||||
EXPECT_EQ (u[1], 0x0000);
|
||||
|
||||
utf8::unchecked::append16(0x65e5, u);
|
||||
EXPECT_EQ (u[0], 0x65e5);
|
||||
EXPECT_EQ (u[1], 0x0000);
|
||||
|
||||
utf8::unchecked::append16(0x10346, u);
|
||||
EXPECT_EQ (u[0], 0xd800);
|
||||
EXPECT_EQ (u[1], 0xdf46);
|
||||
}
|
||||
|
||||
TEST(UnCheckedAPITests, test_next)
|
||||
{
|
||||
const char* twochars = "\xe6\x97\xa5\xd1\x88";
|
||||
|
|
Loading…
Add table
Reference in a new issue