From b455811a4e861c8f95a59628f51a3fdc06568fa3 Mon Sep 17 00:00:00 2001 From: nemtrif Date: Sun, 25 Jun 2023 15:52:08 -0400 Subject: [PATCH] Add append16 function Support for appending codepoints to existing utf16 encoded strings. See #91 --- README.md | 52 ++++++++++++++++++++++++++++++++++++++ source/utf8/checked.h | 9 +++++++ source/utf8/core.h | 39 +++++++++++++++++++++++++++- source/utf8/cpp11.h | 5 ++++ source/utf8/unchecked.h | 6 +++++ tests/test_checked_api.h | 16 ++++++++++++ tests/test_cpp11.cpp | 8 ++++++ tests/test_unchecked_api.h | 16 ++++++++++++ 8 files changed, 150 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 887e26c..1dc0657 100644 --- a/README.md +++ b/README.md @@ -219,6 +219,58 @@ Note that `append` does not allocate any memory - it is the burden of the caller In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. +#### utf8::append16 + +Available in version 4.0 and later. Requires a C++11 compliant compiler. + +Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string. + +```cpp +void append16(utfchar32_t cp, std::u16string& s); +``` + +`cp`: a code point to append to the string. +`s`: a utf-16 encoded string to append the code point to. + +Example of use: + +```cpp +std::u16string u; +append16(0x0448, u); +assert (u[0] == 0x0448 && u.length() == 1); +``` + +In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + +#### utf8::append16 + +Available in version 4.0 and later. + +Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string. + +```cpp +template <typename word_iterator> +word_iterator append16(utfchar32_t cp, word_iterator result); +``` + +`word_iterator`: an output iterator. +`cp`: a 32 bit integer representing a code point to append to the sequence. +`result`: an output iterator to the place in the sequence where to append the code point. 
+Return value: an iterator pointing to the place after the newly appended sequence. + +Example of use: + +```cpp +unsigned short u[2] = {0,0}; +unsigned short* end = append16(0x0448, u); +assert (u[0] == 0x0448 && u[1] == 0); +``` + +Note that `append16` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. To make things more interesting, `append16` can add either one or two words to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated. + +In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + + #### utf8::next Available in version 1.0 and later. diff --git a/source/utf8/checked.h b/source/utf8/checked.h index d3c3a75..2d159f5 100644 --- a/source/utf8/checked.h +++ b/source/utf8/checked.h @@ -84,6 +84,15 @@ namespace utf8 append(cp, std::back_inserter(s)); } + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append16(cp, result); + } + template output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) { diff --git a/source/utf8/core.h b/source/utf8/core.h index fbf4535..096e529 100644 --- a/source/utf8/core.h +++ b/source/utf8/core.h @@ -119,6 +119,11 @@ namespace internal return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); } + inline bool is_in_bmp(utfchar32_t cp) + { + return cp < utfchar32_t(0x10000); + } + template int sequence_length(octet_iterator lead_it) { @@ -343,12 +348,44 @@ namespace internal // The caller uses some other kind of output operator - not covered above // Note that in this case we are not able to determine octet_type - // so we assume it's utfchar_8; that can cause a conversion warning if we are wrong. 
+ // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. template octet_iterator append(utfchar32_t cp, octet_iterator result) { return append(cp, result); } + // Internal implementation of both checked and unchecked append16() function + // This function will be invoked by the overloads below, as they will know + // the word_type. + template + word_iterator append16(utfchar32_t cp, word_iterator result) { + if (is_in_bmp(cp)) + *(result++) = static_cast(cp); + else { + // Code points from the supplementary planes are encoded via surrogate pairs + *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); + *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return result; + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append16(cp, std::back_inserter(str)); + template + std::back_insert_iterator append16 + (utfchar32_t cp, std::back_insert_iterator result) { + return append16, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine word_type + // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. + template + word_iterator append16(utfchar32_t cp, word_iterator result) { + return append16(cp, result); + } + } // namespace internal /// The library API - functions intended to be called by the users diff --git a/source/utf8/cpp11.h b/source/utf8/cpp11.h index fc63e10..691633c 100644 --- a/source/utf8/cpp11.h +++ b/source/utf8/cpp11.h @@ -32,6 +32,11 @@ DEALINGS IN THE SOFTWARE. 
namespace utf8 { + inline void append16(utfchar32_t cp, std::u16string& s) + { + append16(cp, std::back_inserter(s)); + } + inline std::string utf16to8(const std::u16string& s) { std::string result; diff --git a/source/utf8/unchecked.h b/source/utf8/unchecked.h index ede8916..835c429 100644 --- a/source/utf8/unchecked.h +++ b/source/utf8/unchecked.h @@ -40,6 +40,12 @@ namespace utf8 return internal::append(cp, result); } + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + return internal::append16(cp, result); + } + template output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) { diff --git a/tests/test_checked_api.h b/tests/test_checked_api.h index d25c352..7672fb3 100644 --- a/tests/test_checked_api.h +++ b/tests/test_checked_api.h @@ -47,6 +47,22 @@ TEST(CheckedAPITests, test_append) EXPECT_EQ (c[1], 0); } +TEST(CheckedAPITests, test_append16) +{ + utfchar16_t u[5] = {0,0}; + append16(0x0448, u); + EXPECT_EQ (u[0], 0x0448); + EXPECT_EQ (u[1], 0x0000); + + append16(0x65e5, u); + EXPECT_EQ (u[0], 0x65e5); + EXPECT_EQ (u[1], 0x0000); + + append16(0x10346, u); + EXPECT_EQ (u[0], 0xd800); + EXPECT_EQ (u[1], 0xdf46); +} + TEST(CheckedAPITests, test_next) { const char* twochars = "\xe6\x97\xa5\xd1\x88"; diff --git a/tests/test_cpp11.cpp b/tests/test_cpp11.cpp index ee4ddd8..ee3518a 100644 --- a/tests/test_cpp11.cpp +++ b/tests/test_cpp11.cpp @@ -37,6 +37,14 @@ TEST(CPP11APITests, test_append) EXPECT_EQ (u.length(), 4); } +TEST(CPP11APITests, test_append16) +{ + u16string u; + append16(0x0448, u); + EXPECT_EQ (u[0], char16_t(0x0448)); + EXPECT_EQ (u.length(), 1); +} + TEST(CPP11APITests, test_utf16to8) { u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; diff --git a/tests/test_unchecked_api.h b/tests/test_unchecked_api.h index 10c5991..455c9c6 100644 --- a/tests/test_unchecked_api.h +++ b/tests/test_unchecked_api.h @@ -40,6 +40,22 @@ TEST(UnCheckedAPITests, 
test_append) EXPECT_EQ (u[4], 0); } +TEST(UnCheckedAPITests, test_append16) +{ + unsigned short u[5] = {0,0}; + utf8::unchecked::append16(0x0448, u); + EXPECT_EQ (u[0], 0x0448); + EXPECT_EQ (u[1], 0x0000); + + utf8::unchecked::append16(0x65e5, u); + EXPECT_EQ (u[0], 0x65e5); + EXPECT_EQ (u[1], 0x0000); + + utf8::unchecked::append16(0x10346, u); + EXPECT_EQ (u[0], 0xd800); + EXPECT_EQ (u[1], 0xdf46); +} + TEST(UnCheckedAPITests, test_next) { const char* twochars = "\xe6\x97\xa5\xd1\x88";