From ba331cf8e76254a49cf96c25c940287151b0ce96 Mon Sep 17 00:00:00 2001 From: nemtrif Date: Sun, 23 Apr 2023 11:17:44 -0400 Subject: [PATCH] Support for C++20 u8string Issue #89 --- README.md | 220 ++++++++++++++++++++++++++++++++++++++++++ source/utf8/checked.h | 4 +- source/utf8/cpp20.h | 124 ++++++++++++++++++++++++ tests/CMakeLists.txt | 10 ++ tests/test_cpp17.cpp | 4 +- tests/test_cpp20.cpp | 77 +++++++++++++++ 6 files changed, 436 insertions(+), 3 deletions(-) create mode 100644 source/utf8/cpp20.h create mode 100644 tests/test_cpp20.cpp diff --git a/README.md b/README.md index a2800fb..887e26c 100644 --- a/README.md +++ b/README.md @@ -416,6 +416,53 @@ Example of use: In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. +#### utf8::utf16tou8 + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-16 encoded string to UTF-8. + +```cpp +std::u8string utf16tou8(const std::u16string& s); +``` + +`s`: a UTF-16 encoded string. +Return value: A UTF-8 encoded string. + +Example of use: + +```cpp + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u8string u = utf16tou8(utf16string); + assert (u.size() == 10); +``` + +In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + +#### utf8::utf16tou8 + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-16 encoded string to UTF-8. + +```cpp +std::u8string utf16tou8(std::u16string_view s); +``` + +`s`: a UTF-16 encoded string. +Return value: A UTF-8 encoded string. + +Example of use: + +```cpp + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview(utf16string); + u8string u = utf16tou8(utf16stringview); + assert (u.size() == 10); +``` + +In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. 
+ #### utf8::utf16to8 @@ -496,6 +543,58 @@ assert (utf16result[3] == 0xdd1e); In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. +#### utf8::utf8to16 + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-8 encoded string to UTF-16. + +```cpp +std::u16string utf8to16(const std::u8string& s); +``` + +`s`: a UTF-8 encoded string to convert. +Return value: A UTF-16 encoded string. + +Example of use: + +```cpp +std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +std::u16string utf16result = utf8to16(utf8_with_surrogates); +assert (utf16result.length() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + + +#### utf8::utf8to16 + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-8 encoded string to UTF-16. + +```cpp +std::u16string utf8to16(const std::u8string_view& s); +``` + +`s`: a UTF-8 encoded string to convert. +Return value: A UTF-16 encoded string. + +Example of use: + +```cpp +std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +std::u8string_view utf8stringview {utf8_with_surrogates}; +std::u16string utf16result = utf8to16(utf8stringview); +assert (utf16result.length() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + #### utf8::utf8to16 @@ -527,6 +626,78 @@ assert (utf16result[3] == 0xdd1e); In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 seqence, a `utf8::not_enough_room` exception is thrown. +#### utf8::utf32to8 + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. 
+ +```cpp +std::string utf32to8(const std::u32string& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +string utf8result = utf32to8(utf32string); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + +#### utf8::utf32tou8 + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::u8string utf32tou8(const std::u32string& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +u8string utf8result = utf32tou8(utf32string); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + +#### utf8::utf32tou8 + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::u8string utf32tou8(const std::u32string_view& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +u32string_view utf32stringview(utf32string); +u8string utf8result = utf32tou8(utf32stringview); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + #### utf8::utf32to8 Available in version 3.0 and later. Requires a C++ 11 compliant compiler. @@ -604,6 +775,55 @@ assert (utf8result.size() == 9); In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. +#### utf8::utf8to32 + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-8 encoded string to UTF-32. + +```cpp +std::u32string utf8to32(const std::u8string& s); +``` + +`s`: a UTF-8 encoded string. 
+Return value: a UTF-32 encoded string. + +Example of use: + +```cpp +const std::u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; +u32string utf32result = utf8to32(twochars); +assert (utf32result.size() == 2); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + + +#### utf8::utf8to32 + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-8 encoded string to UTF-32. + +```cpp +std::u32string utf8to32(const std::u8string_view& s); +``` + +`s`: a UTF-8 encoded string. +Return value: a UTF-32 encoded string. + +Example of use: + +```cpp +const u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; +const u8string_view stringview{twochars}; +u32string utf32result = utf8to32(stringview); +assert (utf32result.size() == 2); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + + #### utf8::utf8to32 Available in version 3.0 and later. Requires a C++ 11 compliant compiler. diff --git a/source/utf8/checked.h b/source/utf8/checked.h index c9c0bf3..d3c3a75 100644 --- a/source/utf8/checked.h +++ b/source/utf8/checked.h @@ -328,7 +328,9 @@ namespace utf8 } // namespace utf8 -#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later +#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later +#include "cpp20.h" +#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later #include "cpp17.h" #elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later #include "cpp11.h" diff --git a/source/utf8/cpp20.h b/source/utf8/cpp20.h new file mode 100644 index 0000000..07b61d0 --- /dev/null +++ b/source/utf8/cpp20.h @@ -0,0 +1,124 @@ +// Copyright 2022 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to 
whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp17.h" + +namespace utf8 +{ + inline std::u8string utf16tou8(const std::u16string& s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf16tou8(std::u16string_view s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string_view& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline 
std::u8string utf32tou8(const std::u32string_view& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string_view& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(const std::u8string& s) + { + std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::u8string::npos : static_cast<std::size_t>(invalid - s.begin()); // npos when no invalid sequence was found + } + + inline bool is_valid(const std::u8string& s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::u8string replace_invalid(const std::u8string& s) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(const std::u8string& s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f3ce258..7d56582 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,7 @@ add_executable(negative ${PROJECT_SOURCE_DIR}/tests/negative.cpp) add_executable(cpp11 ${PROJECT_SOURCE_DIR}/tests/test_cpp11.cpp) add_executable(cpp17 ${PROJECT_SOURCE_DIR}/tests/test_cpp17.cpp) +add_executable(cpp20 ${PROJECT_SOURCE_DIR}/tests/test_cpp20.cpp) add_executable(apitests ${PROJECT_SOURCE_DIR}/tests/apitests.cpp) add_executable(noexceptionstests ${PROJECT_SOURCE_DIR}/tests/noexceptionstests.cpp) @@ -8,6 +9,7 @@ 
add_executable(noexceptionstests ${PROJECT_SOURCE_DIR}/tests/noexceptionstests.c target_link_libraries(negative PRIVATE utf8::cpp) target_link_libraries(cpp11 PRIVATE utf8::cpp) target_link_libraries(cpp17 PRIVATE utf8::cpp) +target_link_libraries(cpp20 PRIVATE utf8::cpp) target_link_libraries(apitests PRIVATE utf8::cpp) target_link_libraries(noexceptionstests PRIVATE utf8::cpp) @@ -35,9 +37,17 @@ set_target_properties(cpp17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO) +set_target_properties(cpp20 + PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED YES + CXX_EXTENSIONS NO) + + add_test(negative_test negative ${PROJECT_SOURCE_DIR}/tests/test_data/utf8_invalid.txt) add_test(cpp11_test cpp11) add_test(cpp17_test cpp17) +add_test(cpp20_test cpp20) add_test(api_test apitests) add_test(noexceptions_test noexceptionstests) diff --git a/tests/test_cpp17.cpp b/tests/test_cpp17.cpp index 4b87816..a38e6f7 100644 --- a/tests/test_cpp17.cpp +++ b/tests/test_cpp17.cpp @@ -10,8 +10,8 @@ using namespace std; TEST(CPP17APITests, test_utf16to8) { u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - u16string_view utf16stringview(u16string); - string u = utf16to8(utf16string); + u16string_view utf16stringview(utf16string); + string u = utf16to8(utf16stringview); EXPECT_EQ (u.size(), 10); } diff --git a/tests/test_cpp20.cpp b/tests/test_cpp20.cpp new file mode 100644 index 0000000..50dbe30 --- /dev/null +++ b/tests/test_cpp20.cpp @@ -0,0 +1,77 @@ +#include "../extern/ftest/ftest.h" +#define UTF_CPP_CPLUSPLUS 202002L +#include "utf8.h" +#include <string> +using namespace utf8; +using namespace std; + +TEST(CPP20APITests, test_utf16tou8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview{utf16string}; + u8string u = utf16tou8(utf16string); + EXPECT_EQ (u.size(), 10); + u = utf16tou8(utf16stringview); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP20APITests, test_utf8to16) +{ + u8string 
utf8_with_surrogates{u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"}; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CPP20APITests, test_utf32tou8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + u32string_view utf32stringview{utf32string}; + u8string utf8result = utf32tou8(utf32stringview); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP20APITests, test_utf8to32) +{ + u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP20APITests, test_find_invalid) +{ + u8string utf_invalid = u8"\xe6\x97\xa5\xd1\x88\xfa"; + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP20APITests, test_is_valid) +{ + u8string utf_invalid = u8"\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP20APITests, test_replace_invalid) +{ + u8string invalid_sequence = u8"a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const u8string fixed_invalid_sequence = u8"a????z"; + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP20APITests, test_starts_with_bom) +{ + u8string byte_order_mark = u8"\xef\xbb\xbf"; + bool bbom = starts_with_bom(byte_order_mark); + EXPECT_TRUE (bbom); + u8string threechars = u8"\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +}