mirror of
https://github.com/nemtrif/utfcpp.git
synced 2025-04-05 05:25:07 +00:00
Add support for endianness hints, requested in issue #57
This introduces a new enum "endianess" that specifies the source/target endianness, and new utf16to8/utf8to16 overloads that accept hints or force a byte swap. Hints are checked against the platform endianness, detected at runtime in is_byte_swap_required(). Added tests for the code paths introduced.
This commit is contained in:
parent
01a6693977
commit
cbb7c0c126
5 changed files with 283 additions and 57 deletions
|
@ -201,46 +201,47 @@ namespace utf8
|
|||
return dist;
|
||||
}
|
||||
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
octet_iterator utf16to8 (bool swapbytes, u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start != end) {
|
||||
uint32_t cp = utf8::internal::mask16(*start++);
|
||||
// Take care of surrogate pairs first
|
||||
if (utf8::internal::is_lead_surrogate(cp)) {
|
||||
if (start != end) {
|
||||
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
|
||||
if (utf8::internal::is_trail_surrogate(trail_surrogate))
|
||||
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||
else
|
||||
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
|
||||
}
|
||||
else
|
||||
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||
|
||||
}
|
||||
// Lone trail surrogate
|
||||
else if (utf8::internal::is_trail_surrogate(cp))
|
||||
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||
|
||||
result = utf8::append(cp, result);
|
||||
}
|
||||
return result;
|
||||
if (swapbytes)
|
||||
return utf16to8_checked<swapped>(start, end, result);
|
||||
else
|
||||
return utf16to8_checked<unswapped>(start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
octet_iterator utf16to8(endianess hint, u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start < end) {
|
||||
uint32_t cp = utf8::next(start, end);
|
||||
if (cp > 0xffff) { //make a surrogate pair
|
||||
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
|
||||
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
|
||||
}
|
||||
else
|
||||
*result++ = static_cast<uint16_t>(cp);
|
||||
}
|
||||
return result;
|
||||
return utf16to8(internal::is_byte_swap_required(hint), start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
return utf16to8(false, start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16 (bool swapbytes, octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
if (swapbytes)
|
||||
return utf8to16_unchecked<swapped>(start, end, result);
|
||||
else
|
||||
return utf8to16_unchecked<unswapped>(start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
return utf8to16(false, start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16(endianess hint, octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
return utf8to16(internal::is_byte_swap_required(hint), start, end, result);
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename u32bit_iterator>
|
||||
|
|
|
@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE.
|
|||
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
|
||||
|
||||
#include <iterator>
|
||||
#include <stdexcept>
|
||||
|
||||
// Determine the C++ standard version.
|
||||
// If the user defines UTF_CPP_CPLUSPLUS, use that.
|
||||
|
@ -56,6 +57,12 @@ namespace utf8
|
|||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
enum endianess
|
||||
{
|
||||
little_endian,
|
||||
big_endian,
|
||||
};
|
||||
|
||||
// Helper code - not intended to be directly called by the library users. May be changed at any time
|
||||
namespace internal
|
||||
{
|
||||
|
@ -72,16 +79,33 @@ namespace internal
|
|||
// Maximum valid value for a Unicode code point
|
||||
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
|
||||
|
||||
// Perform an runtime check to determine if byte swap is required
|
||||
// with the given endianess hint
|
||||
inline bool is_byte_swap_required(endianess hint)
|
||||
{
|
||||
switch (hint)
|
||||
{
|
||||
case little_endian:
|
||||
return (*(const uint16_t*)"\1\0" >> 8) == 1;
|
||||
case big_endian:
|
||||
return (*(const uint16_t*)"\0\1" >> 8) == 1;
|
||||
default:
|
||||
throw std::runtime_error("Unexpected endianess hint");
|
||||
}
|
||||
}
|
||||
|
||||
template<typename octet_type>
|
||||
inline uint8_t mask8(octet_type oc)
|
||||
{
|
||||
return static_cast<uint8_t>(0xff & oc);
|
||||
}
|
||||
|
||||
template<typename u16_type>
|
||||
inline uint16_t mask16(u16_type oc)
|
||||
{
|
||||
return static_cast<uint16_t>(0xffff & oc);
|
||||
}
|
||||
|
||||
template<typename octet_type>
|
||||
inline bool is_trail(octet_type oc)
|
||||
{
|
||||
|
@ -330,7 +354,100 @@ namespace internal
|
|||
((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
|
||||
((it != end) && (utf8::internal::mask8(*it)) == bom[2])
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
struct swapped
|
||||
{
|
||||
static inline uint16_t handle(uint16_t c)
|
||||
{
|
||||
// Perform byte swap
|
||||
return ((c & 0xff00) >> 8) | ((c & 0x00ff) << 8);
|
||||
}
|
||||
};
|
||||
|
||||
struct unswapped
|
||||
{
|
||||
static inline uint16_t handle(uint16_t c)
|
||||
{
|
||||
// Just return same number
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
// Handle reading/writing of utf16 character, swapping byte if needed/requested
|
||||
#define HANDLE_U16C(handler, x) handler::handle(static_cast<uint16_t>(x & 0xffff))
|
||||
|
||||
template <typename swap_handler, typename u16bit_iterator, typename octet_iterator>
|
||||
octet_iterator utf16to8_checked(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start != end) {
|
||||
uint32_t cp = HANDLE_U16C(swap_handler, *start++);
|
||||
// Take care of surrogate pairs first
|
||||
if (utf8::internal::is_lead_surrogate(cp)) {
|
||||
if (start != end) {
|
||||
uint32_t trail_surrogate = HANDLE_U16C(swap_handler, *start++);
|
||||
if (utf8::internal::is_trail_surrogate(trail_surrogate))
|
||||
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||
else
|
||||
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
|
||||
}
|
||||
else
|
||||
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||
|
||||
}
|
||||
// Lone trail surrogate
|
||||
else if (utf8::internal::is_trail_surrogate(cp))
|
||||
throw invalid_utf16(static_cast<uint16_t>(cp));
|
||||
|
||||
result = utf8::append(cp, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename swap_handler, typename u16bit_iterator, typename octet_iterator>
|
||||
octet_iterator utf16to8_unchecked(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start != end) {
|
||||
uint32_t cp = HANDLE_U16C(swap_handler, *start++);
|
||||
// Take care of surrogate pairs first
|
||||
if (utf8::internal::is_lead_surrogate(cp)) {
|
||||
uint32_t trail_surrogate = HANDLE_U16C(swap_handler, *start++);
|
||||
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||
}
|
||||
result = utf8::unchecked::append(cp, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename swap_handler, typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16_checked(bool swapbytes, octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
while (start < end) {
|
||||
uint32_t cp = utf8::next(start, end);
|
||||
if (cp > 0xffff) { //make a surrogate pair
|
||||
*result++ = HANDLE_U16C(swap_handler, (cp >> 10) + internal::LEAD_OFFSET);
|
||||
*result++ = HANDLE_U16C(swap_handler, (cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
|
||||
}
|
||||
else
|
||||
*result++ = HANDLE_U16C(swap_handler, cp);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename swap_handler, typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16_unchecked(octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
while (start < end) {
|
||||
uint32_t cp = utf8::unchecked::next(start);
|
||||
if (cp > 0xffff) { //make a surrogate pair
|
||||
*result++ = HANDLE_U16C(swap_handler, (cp >> 10) + internal::LEAD_OFFSET);
|
||||
*result++ = HANDLE_U16C(swap_handler, (cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
|
||||
}
|
||||
else
|
||||
*result++ = HANDLE_U16C(swap_handler, cp);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
} // namespace utf8
|
||||
|
||||
#endif // header guard
|
||||
|
|
|
@ -46,6 +46,20 @@ namespace utf8
|
|||
return result;
|
||||
}
|
||||
|
||||
inline std::string utf16to8(const std::u16string& s, endianess hint)
|
||||
{
|
||||
std::string result;
|
||||
utf16to8(hint, s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::string utf16to8(const std::u16string& s, bool swapbytes)
|
||||
{
|
||||
std::string result;
|
||||
utf16to8(swapbytes, s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::u16string utf8to16(const std::string& s)
|
||||
{
|
||||
std::u16string result;
|
||||
|
@ -53,6 +67,20 @@ namespace utf8
|
|||
return result;
|
||||
}
|
||||
|
||||
inline std::u16string utf8to16(const std::string& s, endianess hint)
|
||||
{
|
||||
std::u16string result;
|
||||
utf8to16(hint, s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::u16string utf8to16(const std::string& s, bool swapbytes)
|
||||
{
|
||||
std::u16string result;
|
||||
utf8to16(swapbytes, s.begin(), s.end(), std::back_inserter(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::string utf32to8(const std::u32string& s)
|
||||
{
|
||||
std::string result;
|
||||
|
|
|
@ -168,33 +168,45 @@ namespace utf8
|
|||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
octet_iterator utf16to8 (bool swapbytes, u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start != end) {
|
||||
uint32_t cp = utf8::internal::mask16(*start++);
|
||||
// Take care of surrogate pairs first
|
||||
if (utf8::internal::is_lead_surrogate(cp)) {
|
||||
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
|
||||
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
|
||||
}
|
||||
result = utf8::unchecked::append(cp, result);
|
||||
}
|
||||
return result;
|
||||
if (swapbytes)
|
||||
return utf16to8_unchecked<swapped>(start, end, result);
|
||||
else
|
||||
return utf16to8_unchecked<unswapped>(start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
while (start < end) {
|
||||
uint32_t cp = utf8::unchecked::next(start);
|
||||
if (cp > 0xffff) { //make a surrogate pair
|
||||
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
|
||||
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
|
||||
}
|
||||
else
|
||||
*result++ = static_cast<uint16_t>(cp);
|
||||
}
|
||||
return result;
|
||||
return utf16to8(false, start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
octet_iterator utf16to8(endianess hint, u16bit_iterator start, u16bit_iterator end, octet_iterator result)
|
||||
{
|
||||
return utf16to8(internal::is_byte_swap_required(hint), start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16 (bool swapbytes, octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
if (swapbytes)
|
||||
return utf8to16_unchecked<swapped>(start, end, result);
|
||||
else
|
||||
return utf8to16_unchecked<unswapped>(start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
return utf8to16(false, start, end, result);
|
||||
}
|
||||
|
||||
template <typename u16bit_iterator, typename octet_iterator>
|
||||
u16bit_iterator utf8to16(endianess hint, octet_iterator start, octet_iterator end, u16bit_iterator result)
|
||||
{
|
||||
return utf8to16(internal::is_byte_swap_required(hint), start, end, result);
|
||||
}
|
||||
|
||||
template <typename octet_iterator, typename u32bit_iterator>
|
||||
|
|
|
@ -6,6 +6,8 @@ using namespace std;
|
|||
|
||||
#if __cplusplus >= 201103L // C++ 11 or later
|
||||
|
||||
static void swap_u16_raw_buffer(vector<uint8_t>& buffer);
|
||||
|
||||
TEST(CPP11APITests, test_append)
|
||||
{
|
||||
string u;
|
||||
|
@ -103,4 +105,70 @@ TEST(CPP11APITests, test_starts_with_bom)
|
|||
bool no_bbom = starts_with_bom(threechars);
|
||||
EXPECT_FALSE (no_bbom);
|
||||
}
|
||||
|
||||
static void test_utf16to8_endianess_hint(const vector<uint8_t>& u16_src_raw, endianess hint, const string& u8_ref)
|
||||
{
|
||||
u16string u16_src((const char16_t*)u16_src_raw.data(), u16_src_raw.size() / 2);
|
||||
string u8_conv = utf16to8(u16_src, hint);
|
||||
EXPECT_EQ(u8_conv, u8_ref);
|
||||
|
||||
u8_conv.clear();
|
||||
unchecked::utf16to8(hint, u16_src.begin(), u16_src.end(), std::back_inserter(u8_conv));
|
||||
EXPECT_EQ(u8_conv, u8_ref);
|
||||
|
||||
u8_conv.clear();
|
||||
unchecked::utf16to8(!internal::is_byte_swap_required(hint), u16_src.begin(), u16_src.end(), std::back_inserter(u8_conv));
|
||||
EXPECT_NE(u8_conv, u8_ref);
|
||||
}
|
||||
|
||||
TEST(CPP11APITests, test_utf16to8_endianess_hint)
|
||||
{
|
||||
// https://r12a.github.io/app-conversion/?q=%F0%A0%94%80%F0%A0%81%85%F0%A0%81%BAad%F0%A0%81%BC
|
||||
string u8_ref = "\xF0\xA0\x94\x80\xF0\xA0\x81\x85\xF0\xA0\x81\xBA\x61\x64\xF0\xA0\x81\xBC";
|
||||
vector<uint8_t> u16_src_raw = { 0xD8, 0x41, 0xDD, 0x00, 0xD8, 0x40, 0xDC, 0x45, 0xD8, 0x40,
|
||||
0xDC, 0x7A, 0x00, 0x61, 0x00, 0x64, 0xD8, 0x40, 0xDC, 0x7C };
|
||||
|
||||
test_utf16to8_endianess_hint(u16_src_raw, endianess::big_endian, u8_ref);
|
||||
swap_u16_raw_buffer(u16_src_raw);
|
||||
test_utf16to8_endianess_hint(u16_src_raw, endianess::little_endian, u8_ref);
|
||||
}
|
||||
|
||||
static void test_utf8to16_endianess_hint(const string& u8_src, const vector<uint8_t>& u16_ref_raw, endianess hint)
|
||||
{
|
||||
u16string u16_ref((const char16_t*)u16_ref_raw.data(), u16_ref_raw.size() / 2);
|
||||
u16string u16_conv = utf8::utf8to16(u8_src, hint);
|
||||
EXPECT_EQ(u16_conv, u16_ref);
|
||||
|
||||
u16_conv.clear();
|
||||
unchecked::utf8to16(hint, u8_src.begin(), u8_src.end(), std::back_inserter(u16_conv));
|
||||
EXPECT_EQ(u16_conv, u16_ref);
|
||||
|
||||
u16_conv.clear();
|
||||
unchecked::utf8to16(!internal::is_byte_swap_required(hint), u8_src.begin(), u8_src.end(), std::back_inserter(u16_conv));
|
||||
EXPECT_NE(u16_conv, u16_ref);
|
||||
}
|
||||
|
||||
TEST(CPP11APITests, test_utf8to16_endianess_hint)
|
||||
{
|
||||
// https://r12a.github.io/app-conversion/?q=hello
|
||||
string u8_src = "hello";
|
||||
vector<uint8_t> u16_ref_raw = { 0x00, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F };
|
||||
test_utf8to16_endianess_hint(u8_src, u16_ref_raw, endianess::big_endian);
|
||||
swap_u16_raw_buffer(u16_ref_raw);
|
||||
test_utf8to16_endianess_hint(u8_src, u16_ref_raw, endianess::little_endian);
|
||||
}
|
||||
|
||||
// Swap bytes in utf-16 characters
|
||||
void swap_u16_raw_buffer(vector<uint8_t>& buffer)
|
||||
{
|
||||
size_t loopUpperLimit = buffer.size() / 2;
|
||||
for (size_t i = 0; i < loopUpperLimit; i++)
|
||||
{
|
||||
size_t offesetIdx = i * 2;
|
||||
uint8_t temp = buffer[offesetIdx];
|
||||
buffer[offesetIdx + 0] = buffer[offesetIdx + 1];
|
||||
buffer[offesetIdx + 1] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // C++ 11 or later
|
||||
|
|
Loading…
Add table
Reference in a new issue