Add append16 function

Support for appending codepoints to existing utf16 encoded strings.

See #91
This commit is contained in:
nemtrif 2023-06-25 15:52:08 -04:00
parent fffc67effd
commit b455811a4e
8 changed files with 150 additions and 1 deletions

View file

@ -219,6 +219,58 @@ Note that `append` does not allocate any memory - it is the burden of the caller
In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown.
#### utf8::append16
Available in version 4.0 and later. Requires a C++11 compliant compiler.
Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string.
```cpp
void append16(utfchar32_t cp, std::u16string& s);
```
`cp`: a code point to append to the string.
`s`: a utf-16 encoded string to append the code point to.
Example of use:
```cpp
std::u16string u;
append16(0x0448, u);
assert (u[0] == 0x0448 && u.length() == 1);
```
In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown.
#### utf8::append16
Available in version 4.0 and later.
Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string.
```cpp
template <typename word_iterator>
word_iterator append16(utfchar32_t cp, word_iterator result);
```
`word_iterator`: an output iterator.
`cp`: a 32 bit integer representing a code point to append to the sequence.
`result`: an output iterator to the place in the sequence where to append the code point.
Return value: an iterator pointing to the place after the newly appended sequence.
Example of use:
```cpp
unsigned short u[2] = {0,0};
unsigned short* end = append16(0x0448, u);
assert (u[0] == 0x0448 && u[1] == 0);
```
Note that `append16` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. To make things more interesting, `append16` can add either one or two words to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated.
In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown.
#### utf8::next
Available in version 1.0 and later.

View file

@ -84,6 +84,15 @@ namespace utf8
append(cp, std::back_inserter(s));
}
// Checked flavour of append16: writes the UTF-16 encoding of cp through result.
// cp is first run through is_code_point_valid(); a rejected value is reported
// by throwing invalid_code_point(cp) and nothing is written.
// Returns whatever internal::append16 returns for the given iterator.
template <typename word_iterator>
word_iterator append16(utfchar32_t cp, word_iterator result)
{
    const bool cp_is_valid = utf8::internal::is_code_point_valid(cp);
    if (cp_is_valid)
        return internal::append16(cp, result);

    throw invalid_code_point(cp);
}
template <typename octet_iterator, typename output_iterator>
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
{

View file

@ -119,6 +119,11 @@ namespace internal
return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
}
// True when cp is below 0x10000, i.e. it does not belong to the
// supplementary planes.
inline bool is_in_bmp(utfchar32_t cp)
{
    const utfchar32_t first_beyond_bmp = utfchar32_t(0x10000);
    return cp < first_beyond_bmp;
}
template <typename octet_iterator>
int sequence_length(octet_iterator lead_it)
{
@ -343,12 +348,44 @@ namespace internal
// The caller uses some other kind of output operator - not covered above
// Note that in this case we are not able to determine octet_type
// so we assume it's utfchar_8; that can cause a conversion warning if we are wrong.
// so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong.
template <typename octet_iterator>
octet_iterator append(utfchar32_t cp, octet_iterator result) {
    // Nothing tells us the octet type of an arbitrary iterator: fall back to utfchar8_t
    typedef utfchar8_t fallback_octet_type;
    return append<octet_iterator, fallback_octet_type>(cp, result);
}
// Shared worker behind both the checked and the unchecked append16().
// The overloads that follow call it with word_type spelled out explicitly;
// every value written through result is cast to word_type first.
// Returns result advanced past the one or two words that were written.
template <typename word_iterator, typename word_type>
word_iterator append16(utfchar32_t cp, word_iterator result) {
    if (!is_in_bmp(cp)) {
        // Supplementary-plane code point: emit a surrogate pair, lead word first
        *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10));
        *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
        return result;
    }

    // BMP code point: a single word carries the value unchanged
    *(result++) = static_cast<word_type>(cp);
    return result;
}
// Expected to be the usual call: the output goes through a back_inserter,
// e.g. append16(cp, std::back_inserter(str));
// The word type is taken from the container's value_type.
template<typename container_type>
std::back_insert_iterator<container_type> append16
        (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
    typedef std::back_insert_iterator<container_type> inserter_type;
    typedef typename container_type::value_type word_type;
    return append16<inserter_type, word_type>(cp, result);
}
// Catch-all for every other kind of output iterator.
// word_type cannot be determined here, so utfchar16_t is assumed;
// a wrong guess can show up as a conversion warning.
template <typename word_iterator>
word_iterator append16(utfchar32_t cp, word_iterator result) {
    typedef utfchar16_t assumed_word_type;
    return append16<word_iterator, assumed_word_type>(cp, result);
}
} // namespace internal
/// The library API - functions intended to be called by the users

View file

@ -32,6 +32,11 @@ DEALINGS IN THE SOFTWARE.
namespace utf8
{
// Appends the UTF-16 encoding of cp to the end of s by forwarding to the
// iterator-based append16 with a back inserter bound to s.
inline void append16(utfchar32_t cp, std::u16string& s)
{
    std::back_insert_iterator<std::u16string> tail = std::back_inserter(s);
    append16(cp, tail);
}
inline std::string utf16to8(const std::u16string& s)
{
std::string result;

View file

@ -40,6 +40,12 @@ namespace utf8
return internal::append(cp, result);
}
// Unchecked flavour of append16: cp is handed to internal::append16 as is,
// without any validation being performed here.
// Returns the iterator produced by internal::append16.
template <typename word_iterator>
word_iterator append16(utfchar32_t cp, word_iterator result)
{
    word_iterator past_written = internal::append16(cp, result);
    return past_written;
}
template <typename octet_iterator, typename output_iterator>
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
{

View file

@ -47,6 +47,22 @@ TEST(CheckedAPITests, test_append)
EXPECT_EQ (c[1], 0);
}
// Checked append16 writing into a raw array: two code points that fit in a
// single word each, then one that needs a surrogate pair. Every call writes
// from the start of the array again.
TEST(CheckedAPITests, test_append16)
{
    utfchar16_t words[5] = {0,0};

    // One word written, second slot stays zero
    append16(0x0448, words);
    EXPECT_EQ (words[0], 0x0448);
    EXPECT_EQ (words[1], 0x0000);

    // Overwrites the first slot only
    append16(0x65e5, words);
    EXPECT_EQ (words[0], 0x65e5);
    EXPECT_EQ (words[1], 0x0000);

    // Supplementary code point: lead and trail surrogate
    append16(0x10346, words);
    EXPECT_EQ (words[0], 0xd800);
    EXPECT_EQ (words[1], 0xdf46);
}
TEST(CheckedAPITests, test_next)
{
const char* twochars = "\xe6\x97\xa5\xd1\x88";

View file

@ -37,6 +37,14 @@ TEST(CPP11APITests, test_append)
EXPECT_EQ (u.length(), 4);
}
// append16 on a u16string: one single-word code point appended to an empty
// string must leave exactly that one word in it.
TEST(CPP11APITests, test_append16)
{
    u16string text;

    append16(0x0448, text);

    EXPECT_EQ (text[0], char16_t(0x0448));
    EXPECT_EQ (text.length(), 1);
}
TEST(CPP11APITests, test_utf16to8)
{
u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};

View file

@ -40,6 +40,22 @@ TEST(UnCheckedAPITests, test_append)
EXPECT_EQ (u[4], 0);
}
// Unchecked append16 writing into a raw array: two code points that fit in a
// single word each, then one that needs a surrogate pair. Every call writes
// from the start of the array again.
TEST(UnCheckedAPITests, test_append16)
{
    unsigned short words[5] = {0,0};

    // One word written, second slot stays zero
    utf8::unchecked::append16(0x0448, words);
    EXPECT_EQ (words[0], 0x0448);
    EXPECT_EQ (words[1], 0x0000);

    // Overwrites the first slot only
    utf8::unchecked::append16(0x65e5, words);
    EXPECT_EQ (words[0], 0x65e5);
    EXPECT_EQ (words[1], 0x0000);

    // Supplementary code point: lead and trail surrogate
    utf8::unchecked::append16(0x10346, words);
    EXPECT_EQ (words[0], 0xd800);
    EXPECT_EQ (words[1], 0xdf46);
}
TEST(UnCheckedAPITests, test_next)
{
const char* twochars = "\xe6\x97\xa5\xd1\x88";