mirror of
https://github.com/nemtrif/utfcpp.git
synced 2025-04-05 05:25:07 +00:00
parent
2af99eae7a
commit
ba331cf8e7
6 changed files with 436 additions and 3 deletions
220
README.md
220
README.md
|
@ -416,6 +416,53 @@ Example of use:
|
|||
|
||||
In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown.
|
||||
|
||||
#### utf8::utf16tou8
|
||||
|
||||
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
|
||||
|
||||
Converts a UTF-16 encoded string to UTF-8.
|
||||
|
||||
```cpp
|
||||
std::u8string utf16tou8(const std::u16string& s);
|
||||
```
|
||||
|
||||
`s`: a UTF-16 encoded string.
|
||||
Return value: A UTF-8 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
|
||||
u8string u = utf16tou8(utf16string);
|
||||
assert (u.size() == 10);
|
||||
```
|
||||
|
||||
In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown.
|
||||
|
||||
#### utf8::utf16tou8
|
||||
|
||||
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
|
||||
|
||||
Converts a UTF-16 encoded string to UTF-8.
|
||||
|
||||
```cpp
|
||||
std::u8string utf16tou8(std::u16string_view s);
|
||||
```
|
||||
|
||||
`s`: a UTF-16 encoded string.
|
||||
Return value: A UTF-8 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
|
||||
u16string_view utf16stringview(utf16string);
|
||||
u8string u = utf16tou8(utf16stringview);
|
||||
assert (u.size() == 10);
|
||||
```
|
||||
|
||||
In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf16to8
|
||||
|
||||
|
@ -496,6 +543,58 @@ assert (utf16result[3] == 0xdd1e);
|
|||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
|
||||
|
||||
#### utf8::utf8to16
|
||||
|
||||
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
|
||||
|
||||
Converts a UTF-8 encoded string to UTF-16.
|
||||
|
||||
```cpp
|
||||
std::u16string utf8to16(const std::u8string& s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string to convert.
|
||||
Return value: A UTF-16 encoded string
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
|
||||
std::u16string utf16result = utf8to16(utf8_with_surrogates);
|
||||
assert (utf16result.length() == 4);
|
||||
assert (utf16result[2] == 0xd834);
|
||||
assert (utf16result[3] == 0xdd1e);
|
||||
```
|
||||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf8to16
|
||||
|
||||
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
|
||||
|
||||
Converts a UTF-8 encoded string to UTF-16.
|
||||
|
||||
```cpp
|
||||
std::u16string utf8to16(const std::u8string_view& s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string to convert.
|
||||
Return value: A UTF-16 encoded string
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
|
||||
std::u8string_view utf8stringview {utf8_with_surrogates};
|
||||
std::u16string utf16result = utf8to16(utf8stringview);
|
||||
assert (utf16result.length() == 4);
|
||||
assert (utf16result[2] == 0xd834);
|
||||
assert (utf16result[3] == 0xdd1e);
|
||||
```
|
||||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf8to16
|
||||
|
||||
|
@ -527,6 +626,78 @@ assert (utf16result[3] == 0xdd1e);
|
|||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown.
|
||||
|
||||
#### utf8::utf32to8
|
||||
|
||||
Available in version 3.0 and later. Requires a C++ 11 compliant compiler.
|
||||
|
||||
Converts a UTF-32 encoded string to UTF-8.
|
||||
|
||||
```cpp
|
||||
std::string utf32to8(const std::u32string& s);
|
||||
```
|
||||
|
||||
`s`: a UTF-32 encoded string.
|
||||
Return value: a UTF-8 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
u32string utf32string = {0x448, 0x65E5, 0x10346};
|
||||
string utf8result = utf32to8(utf32string);
|
||||
assert (utf8result.size() == 9);
|
||||
```
|
||||
|
||||
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
|
||||
|
||||
#### utf8::utf32tou8
|
||||
|
||||
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
|
||||
|
||||
Converts a UTF-32 encoded string to UTF-8.
|
||||
|
||||
```cpp
|
||||
std::u8string utf32tou8(const std::u32string& s);
|
||||
```
|
||||
|
||||
`s`: a UTF-32 encoded string.
|
||||
Return value: a UTF-8 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
u32string utf32string = {0x448, 0x65E5, 0x10346};
|
||||
u8string utf8result = utf32tou8(utf32string);
|
||||
assert (utf8result.size() == 9);
|
||||
```
|
||||
|
||||
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf32tou8
|
||||
|
||||
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
|
||||
|
||||
Converts a UTF-32 encoded string to UTF-8.
|
||||
|
||||
```cpp
|
||||
std::u8string utf32tou8(const std::u32string_view& s);
|
||||
```
|
||||
|
||||
`s`: a UTF-32 encoded string.
|
||||
Return value: a UTF-8 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
u32string utf32string = {0x448, 0x65E5, 0x10346};
|
||||
u32string_view utf32stringview(utf32string);
|
||||
u8string utf8result = utf32tou8(utf32stringview);
|
||||
assert (utf8result.size() == 9);
|
||||
```
|
||||
|
||||
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf32to8
|
||||
|
||||
Available in version 3.0 and later. Requires a C++ 11 compliant compiler.
|
||||
|
@ -604,6 +775,55 @@ assert (utf8result.size() == 9);
|
|||
|
||||
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
|
||||
|
||||
#### utf8::utf8to32
|
||||
|
||||
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
|
||||
|
||||
Converts a UTF-8 encoded string to UTF-32.
|
||||
|
||||
```cpp
|
||||
std::u32string utf8to32(const std::u8string& s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string.
|
||||
Return value: a UTF-32 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
const std::u8string twochars = u8"\xe6\x97\xa5\xd1\x88";
|
||||
u32string utf32result = utf8to32(twochars);
|
||||
assert (utf32result.size() == 2);
|
||||
```
|
||||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf8to32
|
||||
|
||||
Available in version 4.0 and later. Requires a C++ 20 compliant compiler.
|
||||
|
||||
Converts a UTF-8 encoded string to UTF-32.
|
||||
|
||||
```cpp
|
||||
std::u32string utf8to32(const std::u8string_view& s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string.
|
||||
Return value: a UTF-32 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
const u8string twochars = u8"\xe6\x97\xa5\xd1\x88";
|
||||
const u8string_view stringview{twochars};
|
||||
u32string utf32result = utf8to32(stringview);
|
||||
assert (utf32result.size() == 2);
|
||||
```
|
||||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf8to32
|
||||
|
||||
Available in version 3.0 and later. Requires a C++ 11 compliant compiler.
|
||||
|
|
|
@ -328,7 +328,9 @@ namespace utf8
|
|||
|
||||
} // namespace utf8
|
||||
|
||||
#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
|
||||
#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
|
||||
#include "cpp20.h"
|
||||
#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
|
||||
#include "cpp17.h"
|
||||
#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
|
||||
#include "cpp11.h"
|
||||
|
|
124
source/utf8/cpp20.h
Normal file
124
source/utf8/cpp20.h
Normal file
|
@ -0,0 +1,124 @@
|
|||
// Copyright 2022 Nemanja Trifunovic
|
||||
|
||||
/*
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9
|
||||
#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9
|
||||
|
||||
#include "cpp17.h"
|
||||
|
||||
namespace utf8
|
||||
{
|
||||
/// Converts a UTF-16 encoded std::u16string to a UTF-8 encoded std::u8string.
/// The conversion itself is done by the iterator-range overload of utf16to8;
/// per the README, an invalid UTF-16 sequence raises utf8::invalid_utf16.
/// @param s UTF-16 encoded input.
/// @return the UTF-8 encoded result.
inline std::u8string utf16tou8(const std::u16string& s)
{
    std::u8string utf8_out;
    // Append each produced code unit to the end of the output string.
    utf16to8(s.cbegin(), s.cend(), std::back_inserter(utf8_out));
    return utf8_out;
}
|
||||
|
||||
/// Converts the UTF-16 text viewed by `s` to a UTF-8 encoded std::u8string.
/// The conversion itself is done by the iterator-range overload of utf16to8;
/// per the README, an invalid UTF-16 sequence raises utf8::invalid_utf16.
/// @param s view over UTF-16 encoded input (taken by value).
/// @return the UTF-8 encoded result.
inline std::u8string utf16tou8(std::u16string_view s)
{
    std::u8string utf8_out;
    // Append each produced code unit to the end of the output string.
    utf16to8(s.cbegin(), s.cend(), std::back_inserter(utf8_out));
    return utf8_out;
}
|
||||
|
||||
inline std::u16string utf8to16(const std::u8string& s)
|
||||
{
|
||||
std::u16string result;
|
||||
utf8to16(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::u16string utf8to16(const std::u8string_view& s)
|
||||
{
|
||||
std::u16string result;
|
||||
utf8to16(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Converts a UTF-32 encoded std::u32string to a UTF-8 encoded std::u8string.
/// The conversion itself is done by the iterator-range overload of utf32to8;
/// per the README, an invalid UTF-32 string raises utf8::invalid_code_point.
/// @param s UTF-32 encoded input.
/// @return the UTF-8 encoded result.
inline std::u8string utf32tou8(const std::u32string& s)
{
    std::u8string utf8_out;
    // Append each produced code unit to the end of the output string.
    utf32to8(s.cbegin(), s.cend(), std::back_inserter(utf8_out));
    return utf8_out;
}
|
||||
|
||||
/// Converts the UTF-32 text viewed by `s` to a UTF-8 encoded std::u8string.
/// The conversion itself is done by the iterator-range overload of utf32to8;
/// per the README, an invalid UTF-32 string raises utf8::invalid_code_point.
/// @param s view over UTF-32 encoded input.
/// @return the UTF-8 encoded result.
inline std::u8string utf32tou8(const std::u32string_view& s)
{
    std::u8string utf8_out;
    // Append each produced code unit to the end of the output string.
    utf32to8(s.cbegin(), s.cend(), std::back_inserter(utf8_out));
    return utf8_out;
}
|
||||
|
||||
inline std::u32string utf8to32(const std::u8string& s)
|
||||
{
|
||||
std::u32string result;
|
||||
utf8to32(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::u32string utf8to32(const std::u8string_view& s)
|
||||
{
|
||||
std::u32string result;
|
||||
utf8to32(s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::size_t find_invalid(const std::u8string& s)
|
||||
{
|
||||
std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end());
|
||||
return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
|
||||
}
|
||||
|
||||
inline bool is_valid(const std::u8string& s)
|
||||
{
|
||||
return is_valid(s.begin(), s.end());
|
||||
}
|
||||
|
||||
/// Convenience overload: feeds the whole of `s` through the iterator-range
/// replace_invalid, collecting its output in a new std::u8string.
/// @param s           the std::u8string to process.
/// @param replacement code point forwarded to the iterator-range overload.
/// @return the collected output.
inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement)
{
    std::u8string cleaned;
    replace_invalid(s.cbegin(), s.cend(), std::back_inserter(cleaned), replacement);
    return cleaned;
}
|
||||
|
||||
/// Convenience overload: feeds the whole of `s` through the three-argument
/// iterator-range replace_invalid (no explicit replacement code point),
/// collecting its output in a new std::u8string.
/// @param s the std::u8string to process.
/// @return the collected output.
inline std::u8string replace_invalid(const std::u8string& s)
{
    std::u8string cleaned;
    replace_invalid(s.cbegin(), s.cend(), std::back_inserter(cleaned));
    return cleaned;
}
|
||||
|
||||
inline bool starts_with_bom(const std::u8string& s)
|
||||
{
|
||||
return starts_with_bom(s.begin(), s.end());
|
||||
}
|
||||
|
||||
} // namespace utf8
|
||||
|
||||
#endif // header guard
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
add_executable(negative ${PROJECT_SOURCE_DIR}/tests/negative.cpp)
|
||||
add_executable(cpp11 ${PROJECT_SOURCE_DIR}/tests/test_cpp11.cpp)
|
||||
add_executable(cpp17 ${PROJECT_SOURCE_DIR}/tests/test_cpp17.cpp)
|
||||
add_executable(cpp20 ${PROJECT_SOURCE_DIR}/tests/test_cpp20.cpp)
|
||||
add_executable(apitests ${PROJECT_SOURCE_DIR}/tests/apitests.cpp)
|
||||
|
||||
add_executable(noexceptionstests ${PROJECT_SOURCE_DIR}/tests/noexceptionstests.cpp)
|
||||
|
@ -8,6 +9,7 @@ add_executable(noexceptionstests ${PROJECT_SOURCE_DIR}/tests/noexceptionstests.c
|
|||
target_link_libraries(negative PRIVATE utf8::cpp)
|
||||
target_link_libraries(cpp11 PRIVATE utf8::cpp)
|
||||
target_link_libraries(cpp17 PRIVATE utf8::cpp)
|
||||
target_link_libraries(cpp20 PRIVATE utf8::cpp)
|
||||
target_link_libraries(apitests PRIVATE utf8::cpp)
|
||||
target_link_libraries(noexceptionstests PRIVATE utf8::cpp)
|
||||
|
||||
|
@ -35,9 +37,17 @@ set_target_properties(cpp17
|
|||
CXX_STANDARD_REQUIRED YES
|
||||
CXX_EXTENSIONS NO)
|
||||
|
||||
set_target_properties(cpp20
|
||||
PROPERTIES
|
||||
CXX_STANDARD 20
|
||||
CXX_STANDARD_REQUIRED YES
|
||||
CXX_EXTENSIONS NO)
|
||||
|
||||
|
||||
add_test(negative_test negative ${PROJECT_SOURCE_DIR}/tests/test_data/utf8_invalid.txt)
|
||||
add_test(cpp11_test cpp11)
|
||||
add_test(cpp17_test cpp17)
|
||||
add_test(cpp20_test cpp20)
|
||||
add_test(api_test apitests)
|
||||
add_test(noexceptions_test noexceptionstests)
|
||||
|
||||
|
|
|
@ -10,8 +10,8 @@ using namespace std;
|
|||
TEST(CPP17APITests, test_utf16to8)
|
||||
{
|
||||
u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
|
||||
u16string_view utf16stringview(u16string);
|
||||
string u = utf16to8(utf16string);
|
||||
u16string_view utf16stringview(utf16string);
|
||||
string u = utf16to8(utf16stringview);
|
||||
EXPECT_EQ (u.size(), 10);
|
||||
}
|
||||
|
||||
|
|
77
tests/test_cpp20.cpp
Normal file
77
tests/test_cpp20.cpp
Normal file
|
@ -0,0 +1,77 @@
|
|||
#include "../extern/ftest/ftest.h"
|
||||
#define UTF_CPP_CPLUSPLUS 202002L
|
||||
#include "utf8.h"
|
||||
#include <string>
|
||||
using namespace utf8;
|
||||
using namespace std;
|
||||
|
||||
// utf16tou8 must yield 10 UTF-8 code units for this input, both through the
// u16string overload and through the u16string_view overload.
TEST(CPP20APITests, test_utf16tou8)
{
    const u16string utf16string{0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
    const u16string_view utf16stringview = utf16string;

    // std::u16string overload.
    u8string u = utf16tou8(utf16string);
    EXPECT_EQ (u.size(), 10);

    // std::u16string_view overload.
    u = utf16tou8(utf16stringview);
    EXPECT_EQ (u.size(), 10);
}
|
||||
|
||||
// utf8to16 must produce 4 UTF-16 code units for this input, the last two
// being the surrogate pair 0xd834 0xdd1e. Both the u8string and the
// u8string_view overloads are checked, mirroring test_utf16tou8.
// (The test name previously contained the typo "tes20t_".)
TEST(CPP20APITests, test_utf8to16)
{
    u8string utf8_with_surrogates{u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"};

    // std::u8string overload.
    u16string utf16result = utf8to16(utf8_with_surrogates);
    EXPECT_EQ (utf16result.size(), 4);
    EXPECT_EQ (utf16result[2], 0xd834);
    EXPECT_EQ (utf16result[3], 0xdd1e);

    // std::u8string_view overload.
    u8string_view utf8stringview{utf8_with_surrogates};
    utf16result = utf8to16(utf8stringview);
    EXPECT_EQ (utf16result.size(), 4);
    EXPECT_EQ (utf16result[2], 0xd834);
    EXPECT_EQ (utf16result[3], 0xdd1e);
}
|
||||
|
||||
// utf32tou8 must yield 9 UTF-8 code units for this input. Both the
// u32string_view and the u32string overloads are checked, mirroring
// test_utf16tou8.
TEST(CPP20APITests, test_utf32tou8)
{
    u32string utf32string = {0x448, 0x65E5, 0x10346};
    u32string_view utf32stringview{utf32string};

    // std::u32string_view overload.
    u8string utf8result = utf32tou8(utf32stringview);
    EXPECT_EQ (utf8result.size(), 9);

    // std::u32string overload.
    utf8result = utf32tou8(utf32string);
    EXPECT_EQ (utf8result.size(), 9);
}
|
||||
|
||||
// utf8to32 must yield 2 code points for this input. Both the u8string and
// the u8string_view overloads are checked, mirroring test_utf16tou8.
TEST(CPP20APITests, test_utf8to32)
{
    u8string twochars = u8"\xe6\x97\xa5\xd1\x88";

    // std::u8string overload.
    u32string utf32result = utf8to32(twochars);
    EXPECT_EQ (utf32result.size(), 2);

    // std::u8string_view overload.
    u8string_view twocharsview{twochars};
    utf32result = utf8to32(twocharsview);
    EXPECT_EQ (utf32result.size(), 2);
}
|
||||
|
||||
// find_invalid on a u8string must report offset 5, the index of the
// trailing 0xfa code unit in this input.
TEST(CPP20APITests, test_find_invalid)
{
    const u8string utf_invalid{u8"\xe6\x97\xa5\xd1\x88\xfa"};
    const std::size_t invalid = find_invalid(utf_invalid);
    EXPECT_EQ (invalid, 5);
}
|
||||
|
||||
// is_valid on a u8string must reject the input ending in 0xfa and accept
// the well-formed input.
TEST(CPP20APITests, test_is_valid)
{
    // Rejected case.
    const u8string utf_invalid{u8"\xe6\x97\xa5\xd1\x88\xfa"};
    bool bvalid = is_valid(utf_invalid);
    EXPECT_FALSE (bvalid);

    // Accepted case.
    const u8string utf8_with_surrogates{u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"};
    bvalid = is_valid(utf8_with_surrogates);
    EXPECT_TRUE (bvalid);
}
|
||||
|
||||
// replace_invalid with '?' as the replacement must turn the malformed input
// into the valid string "a????z".
TEST(CPP20APITests, test_replace_invalid)
{
    const u8string invalid_sequence{u8"a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"};
    const u8string fixed_invalid_sequence{u8"a????z"};

    const u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?');

    // The output must itself pass validation...
    bool bvalid = is_valid(replace_invalid_result);
    EXPECT_TRUE (bvalid);
    // ...and match the expected text exactly.
    EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result);
}
|
||||
|
||||
// starts_with_bom on a u8string must accept the bytes EF BB BF and reject a
// string that begins with other content.
TEST(CPP20APITests, test_starts_with_bom)
{
    // Input consisting only of the byte order mark.
    const u8string byte_order_mark{u8"\xef\xbb\xbf"};
    bool bbom = starts_with_bom(byte_order_mark);
    EXPECT_TRUE (bbom);

    // Input that does not begin with the byte order mark.
    const u8string threechars{u8"\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"};
    bool no_bbom = starts_with_bom(threechars);
    EXPECT_FALSE (no_bbom);
}
|
Loading…
Add table
Reference in a new issue