Support for C++20 u8string

Issue #89
This commit is contained in:
nemtrif 2023-04-23 11:17:44 -04:00
parent 2af99eae7a
commit ba331cf8e7
6 changed files with 436 additions and 3 deletions

220
README.md
View file

@ -416,6 +416,53 @@ Example of use:
In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown.
#### utf8::utf16tou8
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
Converts a UTF-16 encoded string to UTF-8.
```cpp
std::u8string utf16tou8(const std::u16string& s);
```
`s`: a UTF-16 encoded string.
Return value: A UTF-8 encoded string.
Example of use:
```cpp
u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
u8string u = utf16tou8(utf16string);
assert (u.size() == 10);
```
In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown.
#### utf8::utf16tou8
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
Converts a UTF-16 encoded string to UTF-8.
```cpp
std::u8string utf16tou8(std::u16string_view s);
```
`s`: a UTF-16 encoded string.
Return value: A UTF-8 encoded string.
Example of use:
```cpp
u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
u16string_view utf16stringview(utf16string);
u8string u = utf16tou8(utf16stringview);
assert (u.size() == 10);
```
In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown.
#### utf8::utf16to8
@ -496,6 +543,58 @@ assert (utf16result[3] == 0xdd1e);
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
#### utf8::utf8to16
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
Converts a UTF-8 encoded string to UTF-16.
```cpp
std::u16string utf8to16(const std::u8string& s);
```
`s`: a UTF-8 encoded string to convert.
Return value: A UTF-16 encoded string.
Example of use:
```cpp
std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
std::u16string utf16result = utf8to16(utf8_with_surrogates);
assert (utf16result.length() == 4);
assert (utf16result[2] == 0xd834);
assert (utf16result[3] == 0xdd1e);
```
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
#### utf8::utf8to16
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
Converts a UTF-8 encoded string to UTF-16.
```cpp
std::u16string utf8to16(const std::u8string_view& s);
```
`s`: a UTF-8 encoded string to convert.
Return value: A UTF-16 encoded string.
Example of use:
```cpp
std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
std::u8string_view utf8stringview {utf8_with_surrogates};
std::u16string utf16result = utf8to16(utf8stringview);
assert (utf16result.length() == 4);
assert (utf16result[2] == 0xd834);
assert (utf16result[3] == 0xdd1e);
```
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
#### utf8::utf8to16
@ -527,6 +626,78 @@ assert (utf16result[3] == 0xdd1e);
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown.
#### utf8::utf32to8
Available in version 3.0 and later. Requires a C++ 11 compliant compiler.
Converts a UTF-32 encoded string to UTF-8.
```cpp
std::string utf32to8(const std::u32string& s);
```
`s`: a UTF-32 encoded string.
Return value: a UTF-8 encoded string.
Example of use:
```cpp
u32string utf32string = {0x448, 0x65E5, 0x10346};
string utf8result = utf32to8(utf32string);
assert (utf8result.size() == 9);
```
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
#### utf8::utf32tou8
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
Converts a UTF-32 encoded string to UTF-8.
```cpp
std::u8string utf32tou8(const std::u32string& s);
```
`s`: a UTF-32 encoded string.
Return value: a UTF-8 encoded string.
Example of use:
```cpp
u32string utf32string = {0x448, 0x65E5, 0x10346};
u8string utf8result = utf32tou8(utf32string);
assert (utf8result.size() == 9);
```
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
#### utf8::utf32tou8
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
Converts a UTF-32 encoded string to UTF-8.
```cpp
std::u8string utf32tou8(const std::u32string_view& s);
```
`s`: a UTF-32 encoded string.
Return value: a UTF-8 encoded string.
Example of use:
```cpp
u32string utf32string = {0x448, 0x65E5, 0x10346};
u32string_view utf32stringview(utf32string);
u8string utf8result = utf32tou8(utf32stringview);
assert (utf8result.size() == 9);
```
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
#### utf8::utf32to8
Available in version 3.0 and later. Requires a C++ 11 compliant compiler.
@ -604,6 +775,55 @@ assert (utf8result.size() == 9);
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
#### utf8::utf8to32
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
Converts a UTF-8 encoded string to UTF-32.
```cpp
std::u32string utf8to32(const std::u8string& s);
```
`s`: a UTF-8 encoded string.
Return value: a UTF-32 encoded string.
Example of use:
```cpp
const std::u8string twochars = u8"\xe6\x97\xa5\xd1\x88";
u32string utf32result = utf8to32(twochars);
assert (utf32result.size() == 2);
```
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
#### utf8::utf8to32
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
Converts a UTF-8 encoded string to UTF-32.
```cpp
std::u32string utf8to32(const std::u8string_view& s);
```
`s`: a UTF-8 encoded string.
Return value: a UTF-32 encoded string.
Example of use:
```cpp
const u8string twochars = u8"\xe6\x97\xa5\xd1\x88";
const u8string_view stringview{twochars};
u32string utf32result = utf8to32(stringview);
assert (utf32result.size() == 2);
```
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
#### utf8::utf8to32
Available in version 3.0 and later. Requires a C++ 11 compliant compiler.

View file

@ -328,7 +328,9 @@ namespace utf8
} // namespace utf8
#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
#include "cpp20.h"
#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
#include "cpp17.h"
#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
#include "cpp11.h"

124
source/utf8/cpp20.h Normal file
View file

@ -0,0 +1,124 @@
// Copyright 2022 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9
#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9
#include "cpp17.h"
namespace utf8
{
// Converts a UTF-16 encoded std::u16string to a UTF-8 encoded std::u8string.
// The whole input range is forwarded to the iterator-based utf16to8, which
// appends the produced octets to the returned string.
inline std::u8string utf16tou8(const std::u16string& s)
{
    std::u8string utf8_out;
    auto sink = std::back_inserter(utf8_out);
    utf16to8(s.cbegin(), s.cend(), sink);
    return utf8_out;
}
// Converts the UTF-16 text referenced by a std::u16string_view to a UTF-8
// encoded std::u8string via the iterator-based utf16to8.
inline std::u8string utf16tou8(std::u16string_view s)
{
    std::u8string utf8_out;
    auto sink = std::back_inserter(utf8_out);
    utf16to8(s.cbegin(), s.cend(), sink);
    return utf8_out;
}
inline std::u16string utf8to16(const std::u8string& s)
{
std::u16string result;
utf8to16(s.begin(), s.end(), std::back_inserter(result));
return result;
}
inline std::u16string utf8to16(const std::u8string_view& s)
{
std::u16string result;
utf8to16(s.begin(), s.end(), std::back_inserter(result));
return result;
}
// Converts a UTF-32 encoded std::u32string to a UTF-8 encoded std::u8string.
// The conversion itself is done by the iterator-based utf32to8.
inline std::u8string utf32tou8(const std::u32string& s)
{
    std::u8string utf8_out;
    auto sink = std::back_inserter(utf8_out);
    utf32to8(s.cbegin(), s.cend(), sink);
    return utf8_out;
}
// Converts the UTF-32 text referenced by a std::u32string_view to a UTF-8
// encoded std::u8string via the iterator-based utf32to8.
inline std::u8string utf32tou8(const std::u32string_view& s)
{
    std::u8string utf8_out;
    auto sink = std::back_inserter(utf8_out);
    utf32to8(s.cbegin(), s.cend(), sink);
    return utf8_out;
}
inline std::u32string utf8to32(const std::u8string& s)
{
std::u32string result;
utf8to32(s.begin(), s.end(), std::back_inserter(result));
return result;
}
inline std::u32string utf8to32(const std::u8string_view& s)
{
std::u32string result;
utf8to32(s.begin(), s.end(), std::back_inserter(result));
return result;
}
// Scans a std::u8string with the iterator-based find_invalid and reports where
// that scan stopped.
//   s: the string to check.
// Returns the offset (from the start of s) of the position returned by the
// iterator-based find_invalid, or std::u8string::npos when that position is
// s.end().
// The sentinel is spelled with the u8string type this overload works on; it
// has the same value as std::string_view::npos (both are size_t(-1)), so
// callers comparing against either constant keep working.
inline std::size_t find_invalid(const std::u8string& s)
{
    const std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end());
    if (invalid == s.end())
        return std::u8string::npos;
    return static_cast<std::size_t>(invalid - s.begin());
}
// Returns the result of the iterator-based is_valid applied to the full
// range of the given std::u8string.
inline bool is_valid(const std::u8string& s)
{
    const auto first = s.begin();
    const auto last = s.end();
    return is_valid(first, last);
}
// Builds a new std::u8string from s by running the iterator-based
// replace_invalid over the whole input, passing `replacement` through as the
// substitute code point.
inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement)
{
    std::u8string cleaned;
    auto sink = std::back_inserter(cleaned);
    replace_invalid(s.cbegin(), s.cend(), sink, replacement);
    return cleaned;
}
// Builds a new std::u8string from s by running the three-argument
// iterator-based replace_invalid (no explicit replacement code point) over
// the whole input.
inline std::u8string replace_invalid(const std::u8string& s)
{
    std::u8string cleaned;
    auto sink = std::back_inserter(cleaned);
    replace_invalid(s.cbegin(), s.cend(), sink);
    return cleaned;
}
// Returns the result of the iterator-based starts_with_bom applied to the
// full range of the given std::u8string.
inline bool starts_with_bom(const std::u8string& s)
{
    const auto first = s.begin();
    const auto last = s.end();
    return starts_with_bom(first, last);
}
} // namespace utf8
#endif // header guard

View file

@ -1,6 +1,7 @@
# Test executables: each target is built from a single source file under tests/.
add_executable(negative ${PROJECT_SOURCE_DIR}/tests/negative.cpp)
add_executable(cpp11 ${PROJECT_SOURCE_DIR}/tests/test_cpp11.cpp)
add_executable(cpp17 ${PROJECT_SOURCE_DIR}/tests/test_cpp17.cpp)
add_executable(cpp20 ${PROJECT_SOURCE_DIR}/tests/test_cpp20.cpp)
add_executable(apitests ${PROJECT_SOURCE_DIR}/tests/apitests.cpp)
add_executable(noexceptionstests ${PROJECT_SOURCE_DIR}/tests/noexceptionstests.cpp)
@ -8,6 +9,7 @@ add_executable(noexceptionstests ${PROJECT_SOURCE_DIR}/tests/noexceptionstests.c
# Link every test executable privately against the utf8::cpp target.
target_link_libraries(negative PRIVATE utf8::cpp)
target_link_libraries(cpp11 PRIVATE utf8::cpp)
target_link_libraries(cpp17 PRIVATE utf8::cpp)
target_link_libraries(cpp20 PRIVATE utf8::cpp)
target_link_libraries(apitests PRIVATE utf8::cpp)
target_link_libraries(noexceptionstests PRIVATE utf8::cpp)
@ -35,9 +37,17 @@ set_target_properties(cpp17
CXX_STANDARD_REQUIRED YES
CXX_EXTENSIONS NO)
# Build the cpp20 test target as C++20: the standard is required (no fallback
# to an older one) and compiler-specific extensions are turned off.
set_target_properties(cpp20
PROPERTIES
CXX_STANDARD 20
CXX_STANDARD_REQUIRED YES
CXX_EXTENSIONS NO)
# Register each executable as a test. negative_test is the only one given an
# argument: the path to tests/test_data/utf8_invalid.txt.
add_test(negative_test negative ${PROJECT_SOURCE_DIR}/tests/test_data/utf8_invalid.txt)
add_test(cpp11_test cpp11)
add_test(cpp17_test cpp17)
add_test(cpp20_test cpp20)
add_test(api_test apitests)
add_test(noexceptions_test noexceptionstests)

View file

@ -10,8 +10,8 @@ using namespace std;
// Calls utf16to8 with a u16string_view over five UTF-16 code units (the last
// two form a surrogate pair) and expects 10 octets in the resulting string.
// The stale lines `u16string_view utf16stringview(u16string);` (which parses
// as a function declaration, not a view over the data) and the duplicate
// definition of `u` are removed so the view really wraps utf16string and is
// the argument under test.
TEST(CPP17APITests, test_utf16to8)
{
    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
    u16string_view utf16stringview(utf16string);
    string u = utf16to8(utf16stringview);
    EXPECT_EQ (u.size(), 10);
}

77
tests/test_cpp20.cpp Normal file
View file

@ -0,0 +1,77 @@
#include "../extern/ftest/ftest.h"
#define UTF_CPP_CPLUSPLUS 202002L
#include "utf8.h"
#include <string>
using namespace utf8;
using namespace std;
// Feeds the same five UTF-16 code units (the last two form a surrogate pair)
// to utf16tou8 as a u16string and as a u16string_view; each call is expected
// to yield 10 octets.
TEST(CPP20APITests, test_utf16tou8)
{
    const u16string source{0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
    const u16string_view source_view{source};

    const u8string from_string = utf16tou8(source);
    EXPECT_EQ(from_string.size(), 10);

    const u8string from_view = utf16tou8(source_view);
    EXPECT_EQ(from_view.size(), 10);
}
// Converts a nine-octet UTF-8 string with utf8to16 and expects four UTF-16
// code units, the last two being the surrogate pair 0xd834 0xdd1e.
// The test name is corrected (it read "tes20t_utf8to16"), and the same input
// is also passed as a u8string_view so both utf8to16 overloads are exercised.
TEST(CPP20APITests, test_utf8to16)
{
    u8string utf8_with_surrogates{u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"};
    u16string utf16result = utf8to16(utf8_with_surrogates);
    EXPECT_EQ (utf16result.size(), 4);
    EXPECT_EQ (utf16result[2], 0xd834);
    EXPECT_EQ (utf16result[3], 0xdd1e);

    // Same input through the u8string_view overload.
    u8string_view utf8stringview{utf8_with_surrogates};
    utf16result = utf8to16(utf8stringview);
    EXPECT_EQ (utf16result.size(), 4);
    EXPECT_EQ (utf16result[2], 0xd834);
    EXPECT_EQ (utf16result[3], 0xdd1e);
}
// Passes three UTF-32 code points to utf32tou8 through a u32string_view and
// expects 9 octets in the result.
TEST(CPP20APITests, test_utf32tou8)
{
    const u32string source{0x448, 0x65E5, 0x10346};
    const u32string_view source_view{source};
    const u8string encoded = utf32tou8(source_view);
    EXPECT_EQ(encoded.size(), 9);
}
// Converts a five-octet u8string with utf8to32 and expects two code points.
TEST(CPP20APITests, test_utf8to32)
{
    const u8string two_code_points{u8"\xe6\x97\xa5\xd1\x88"};
    const u32string decoded = utf8to32(two_code_points);
    EXPECT_EQ(decoded.size(), 2);
}
// The input is five octets followed by a trailing 0xfa octet; find_invalid is
// expected to report offset 5.
TEST(CPP20APITests, test_find_invalid)
{
    const u8string malformed{u8"\xe6\x97\xa5\xd1\x88\xfa"};
    const std::size_t first_bad_offset = find_invalid(malformed);
    EXPECT_EQ(first_bad_offset, 5);
}
// is_valid is expected to reject the string ending in a 0xfa octet and to
// accept the nine-octet string used by the utf8to16 test.
TEST(CPP20APITests, test_is_valid)
{
    const u8string malformed{u8"\xe6\x97\xa5\xd1\x88\xfa"};
    const bool malformed_ok = is_valid(malformed);
    EXPECT_FALSE(malformed_ok);

    const u8string well_formed{u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"};
    const bool well_formed_ok = is_valid(well_formed);
    EXPECT_TRUE(well_formed_ok);
}
// Runs replace_invalid with '?' as the replacement over an input that mixes
// 'a', several stray/ill-formed octet groups, and 'z'. The output must pass
// is_valid and compare equal to "a????z".
TEST(CPP20APITests, test_replace_invalid)
{
    const u8string broken_input{u8"a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"};
    const u8string repaired = replace_invalid(broken_input, u8'?');

    const bool repaired_ok = is_valid(repaired);
    EXPECT_TRUE(repaired_ok);

    const u8string expected{u8"a????z"};
    EXPECT_EQ(expected, repaired);
}
// starts_with_bom is expected to be true for the three octets ef bb bf and
// false for a string that begins with other octets.
TEST(CPP20APITests, test_starts_with_bom)
{
    const u8string bom_only{u8"\xef\xbb\xbf"};
    const bool has_bom = starts_with_bom(bom_only);
    EXPECT_TRUE(has_bom);

    const u8string no_bom_text{u8"\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"};
    const bool has_no_bom = starts_with_bom(no_bom_text);
    EXPECT_FALSE(has_no_bom);
}