Add support for endianness hints, requested in issue #57

This introduces a new enum "endianess" that specifies the source/target
endianness, and new utf16to8/utf8to16 overloads that accept hints or
force a byte swap. Hints are checked against the platform endianness
detected at runtime in is_byte_swap_required(). Added tests for the
code paths introduced.
This commit is contained in:
Francesco Pretto 2020-04-25 20:48:09 +02:00
parent 01a6693977
commit cbb7c0c126
5 changed files with 283 additions and 57 deletions

View file

@ -201,46 +201,47 @@ namespace utf8
return dist;
}
// NOTE(review): this hunk is rendered without +/- diff markers, so removed and
// added lines are interleaved. The annotations below mark which lines appear
// to belong to which side; confirm against the real diff.
template <typename u16bit_iterator, typename octet_iterator>
// Appears to be the pre-change signature (no byte-swap argument).
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
// Appears to be the post-change signature: swapbytes selects the swap policy used below.
octet_iterator utf16to8 (bool swapbytes, u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
// Appears to be the pre-change body: inline UTF-16 -> UTF-8 loop that throws
// invalid_utf16 on unpaired surrogates.
while (start != end) {
uint32_t cp = utf8::internal::mask16(*start++);
// Take care of surrogate pairs first
if (utf8::internal::is_lead_surrogate(cp)) {
if (start != end) {
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
if (utf8::internal::is_trail_surrogate(trail_surrogate))
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
else
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
}
else
throw invalid_utf16(static_cast<uint16_t>(cp));
}
// Lone trail surrogate
else if (utf8::internal::is_trail_surrogate(cp))
throw invalid_utf16(static_cast<uint16_t>(cp));
result = utf8::append(cp, result);
}
return result;
// Appears to be the post-change body: dispatch to utf16to8_checked with the
// swapped or unswapped policy according to swapbytes.
if (swapbytes)
return utf16to8_checked<swapped>(start, end, result);
else
return utf16to8_checked<unswapped>(start, end, result);
}
// NOTE(review): hunk rendered without +/- diff markers; removed and added
// lines are interleaved. Annotations mark the apparent side of each part.
template <typename u16bit_iterator, typename octet_iterator>
// Appears to be the pre-change line: signature of the utf8to16 that used to sit here.
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
// Appears to be the post-change line: utf16to8 overload taking an endianess hint.
octet_iterator utf16to8(endianess hint, u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
// Appears to be the pre-change body: UTF-8 -> UTF-16 loop built on utf8::next(start, end).
while (start < end) {
uint32_t cp = utf8::next(start, end);
if (cp > 0xffff) { //make a surrogate pair
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
}
else
*result++ = static_cast<uint16_t>(cp);
}
return result;
// Appears to be the post-change body: turn the hint into a swap flag via
// internal::is_byte_swap_required and forward to the bool overload.
return utf16to8(internal::is_byte_swap_required(hint), start, end, result);
}
// Convert the UTF-16 range [start, end) to UTF-8 written through result,
// without byte swapping: forwards to the bool overload with the flag off.
// Returns whatever that overload returns.
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
    const bool no_swap = false;
    return utf16to8(no_swap, start, end, result);
}
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (bool swapbytes, octet_iterator start, octet_iterator end, u16bit_iterator result)
{
if (swapbytes)
return utf8to16_unchecked<swapped>(start, end, result);
else
return utf8to16_unchecked<unswapped>(start, end, result);
}
// Convert the UTF-8 range [start, end) to UTF-16 written through result,
// without byte swapping: forwards to the bool overload with the flag off.
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    const bool no_swap = false;
    return utf8to16(no_swap, start, end, result);
}
// Convert the UTF-8 range [start, end) to UTF-16 written through result,
// using an endianess hint: internal::is_byte_swap_required(hint) decides
// whether the bool overload is asked to swap bytes.
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16(endianess hint, octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    const bool swap_needed = internal::is_byte_swap_required(hint);
    return utf8to16(swap_needed, start, end, result);
}
template <typename octet_iterator, typename u32bit_iterator>

View file

@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE.
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include <iterator>
#include <stdexcept>
// Determine the C++ standard version.
// If the user defines UTF_CPP_CPLUSPLUS, use that.
@ -56,6 +57,12 @@ namespace utf8
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// Byte order of UTF-16 data, passed as a hint to the utf16to8/utf8to16
// overloads; internal::is_byte_swap_required() maps a hint to a swap decision.
// No trailing comma after the last enumerator: it is only permitted from
// C++11 onwards.
enum endianess
{
    little_endian,
    big_endian
};
// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
@ -72,16 +79,33 @@ namespace internal
// Maximum valid value for a Unicode code point
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
// Perform a runtime check to determine whether a byte swap is required for
// UTF-16 data of the given endianess.
//   hint - byte order of the UTF-16 data.
// Returns true when the hinted byte order differs from the byte order this
// code is running on, false when they match.
// Throws std::runtime_error if hint is not one of the enumerators.
inline bool is_byte_swap_required(endianess hint)
{
    // Look at the first byte of a real 16-bit object through an unsigned char
    // pointer (which may alias any object). It holds 1 when the low-order byte
    // is stored first. Reading a string literal through a uint16_t pointer
    // instead would break the aliasing rules and may be misaligned.
    const uint16_t probe = 1;
    const bool low_byte_first = *reinterpret_cast<const unsigned char*>(&probe) == 1;
    switch (hint)
    {
    case little_endian:
        return !low_byte_first;
    case big_endian:
        return low_byte_first;
    default:
        throw std::runtime_error("Unexpected endianess hint");
    }
}
// Keep only the low 8 bits of oc and return them as a uint8_t.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    const uint8_t low_octet = static_cast<uint8_t>(0xff & oc);
    return low_octet;
}
// Keep only the low 16 bits of oc and return them as a uint16_t.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    const uint16_t low_unit = static_cast<uint16_t>(0xffff & oc);
    return low_unit;
}
template<typename octet_type>
inline bool is_trail(octet_type oc)
{
@ -330,7 +354,100 @@ namespace internal
((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
((it != end) && (utf8::internal::mask8(*it)) == bom[2])
);
}
}
// Swap policy: handle() returns its argument with the two bytes exchanged.
struct swapped
{
    static inline uint16_t handle(uint16_t c)
    {
        // Move the high byte down and the low byte up, then recombine
        const unsigned int high_to_low = (c >> 8) & 0x00ffu;
        const unsigned int low_to_high = (c << 8) & 0xff00u;
        return static_cast<uint16_t>(high_to_low | low_to_high);
    }
};
// Swap policy: handle() is the identity and returns its argument unchanged.
struct unswapped
{
    static inline uint16_t handle(uint16_t c) { return c; }
};
// Handle reading/writing of a UTF-16 code unit, swapping bytes if needed/requested.
//   handler - policy type exposing static handle(uint16_t), e.g. swapped/unswapped
//   x       - value to mask to 16 bits and pass to handler::handle
// x is parenthesized so that an argument containing an operator with lower
// precedence than '&' is masked as a whole rather than being regrouped.
#define HANDLE_U16C(handler, x) handler::handle(static_cast<uint16_t>((x) & 0xffff))
// Convert the UTF-16 range [start, end) to UTF-8, appending through result.
// Each 16-bit unit is passed through swap_handler::handle (via HANDLE_U16C)
// before being interpreted.
// Throws invalid_utf16 carrying the offending unit when a lead surrogate is
// the last unit, when a lead surrogate is not followed by a trail surrogate,
// or when a trail surrogate appears on its own.
// Returns the output iterator after the last utf8::append call.
// NOTE(review): this template names utf8::append and invalid_utf16; confirm
// they are declared before this definition is seen by the compiler.
template <typename swap_handler, typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8_checked(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
    while (start != end) {
        uint32_t code_point = HANDLE_U16C(swap_handler, *start++);
        if (utf8::internal::is_lead_surrogate(code_point)) {
            // A lead surrogate needs a following unit...
            if (start == end)
                throw invalid_utf16(static_cast<uint16_t>(code_point));
            // ...and that unit has to be a trail surrogate
            const uint32_t trail = HANDLE_U16C(swap_handler, *start++);
            if (!utf8::internal::is_trail_surrogate(trail))
                throw invalid_utf16(static_cast<uint16_t>(trail));
            code_point = (code_point << 10) + trail + internal::SURROGATE_OFFSET;
        }
        else if (utf8::internal::is_trail_surrogate(code_point)) {
            // Trail surrogate with no lead before it
            throw invalid_utf16(static_cast<uint16_t>(code_point));
        }
        result = utf8::append(code_point, result);
    }
    return result;
}
// Convert the UTF-16 range [start, end) to UTF-8 via utf8::unchecked::append.
// Each 16-bit unit is passed through swap_handler::handle (via HANDLE_U16C).
// No validation is performed: after a lead surrogate the next unit is read
// without testing start against end and without checking that it is a trail
// surrogate.
// Returns the output iterator after the last append.
template <typename swap_handler, typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8_unchecked(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
    while (start != end) {
        const uint32_t first_unit = HANDLE_U16C(swap_handler, *start++);
        uint32_t code_point = first_unit;
        if (utf8::internal::is_lead_surrogate(first_unit)) {
            // Combine with the unit that follows to form the full code point
            const uint32_t trail = HANDLE_U16C(swap_handler, *start++);
            code_point = (first_unit << 10) + trail + internal::SURROGATE_OFFSET;
        }
        result = utf8::unchecked::append(code_point, result);
    }
    return result;
}
// Convert the UTF-8 range [start, end) to UTF-16 written through result,
// decoding with utf8::next(start, end). Every emitted 16-bit unit is passed
// through swap_handler::handle (via HANDLE_U16C). Code points above 0xffff
// are written as a lead/trail surrogate pair.
//   swapbytes - not consulted by this function; the swap behaviour comes
//               from the swap_handler template argument.
// Returns the output iterator after the last unit written.
// NOTE(review): this template names utf8::next; confirm it is declared before
// this definition is seen by the compiler.
template <typename swap_handler, typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16_checked(bool swapbytes, octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    static_cast<void>(swapbytes);
    while (start < end) {
        const uint32_t code_point = utf8::next(start, end);
        if (code_point <= 0xffff) {
            // Fits in a single 16-bit unit
            *result++ = HANDLE_U16C(swap_handler, code_point);
            continue;
        }
        // Needs a surrogate pair
        *result++ = HANDLE_U16C(swap_handler, (code_point >> 10) + internal::LEAD_OFFSET);
        *result++ = HANDLE_U16C(swap_handler, (code_point & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
    }
    return result;
}
// Convert the UTF-8 range [start, end) to UTF-16 written through result,
// decoding with utf8::unchecked::next(start). Every emitted 16-bit unit is
// passed through swap_handler::handle (via HANDLE_U16C). Code points above
// 0xffff are written as a lead/trail surrogate pair.
// Returns the output iterator after the last unit written.
template <typename swap_handler, typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16_unchecked(octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    while (start < end) {
        const uint32_t code_point = utf8::unchecked::next(start);
        if (code_point <= 0xffff) {
            // Fits in a single 16-bit unit
            *result++ = HANDLE_U16C(swap_handler, code_point);
            continue;
        }
        // Needs a surrogate pair
        *result++ = HANDLE_U16C(swap_handler, (code_point >> 10) + internal::LEAD_OFFSET);
        *result++ = HANDLE_U16C(swap_handler, (code_point & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
    }
    return result;
}
} // namespace utf8
#endif // header guard

View file

@ -46,6 +46,20 @@ namespace utf8
return result;
}
// Convert a UTF-16 string to a UTF-8 std::string; hint is forwarded to the
// iterator-based utf16to8 overload that takes an endianess.
inline std::string utf16to8(const std::u16string& s, endianess hint)
{
    std::string utf8_text;
    utf16to8(hint, s.begin(), s.end(), std::back_inserter(utf8_text));
    return utf8_text;
}
// Convert a UTF-16 string to a UTF-8 std::string; swapbytes is forwarded to
// the iterator-based utf16to8 overload that takes a bool swap flag.
inline std::string utf16to8(const std::u16string& s, bool swapbytes)
{
    std::string utf8_text;
    utf16to8(swapbytes, s.begin(), s.end(), std::back_inserter(utf8_text));
    return utf8_text;
}
inline std::u16string utf8to16(const std::string& s)
{
std::u16string result;
@ -53,6 +67,20 @@ namespace utf8
return result;
}
// Convert a UTF-8 string to a std::u16string; hint is forwarded to the
// iterator-based utf8to16 overload that takes an endianess.
inline std::u16string utf8to16(const std::string& s, endianess hint)
{
    std::u16string utf16_text;
    utf8to16(hint, s.begin(), s.end(), std::back_inserter(utf16_text));
    return utf16_text;
}
// Convert a UTF-8 string to a std::u16string; swapbytes is forwarded to the
// iterator-based utf8to16 overload that takes a bool swap flag.
inline std::u16string utf8to16(const std::string& s, bool swapbytes)
{
    std::u16string utf16_text;
    utf8to16(swapbytes, s.begin(), s.end(), std::back_inserter(utf16_text));
    return utf16_text;
}
inline std::string utf32to8(const std::u32string& s)
{
std::string result;

View file

@ -168,33 +168,45 @@ namespace utf8
}
// NOTE(review): hunk rendered without +/- diff markers; removed and added
// lines are interleaved. Annotations mark the apparent side of each part.
template <typename u16bit_iterator, typename octet_iterator>
// Appears to be the pre-change signature (no byte-swap argument).
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
// Appears to be the post-change signature: swapbytes selects the swap policy used below.
octet_iterator utf16to8 (bool swapbytes, u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
// Appears to be the pre-change body: inline non-validating UTF-16 -> UTF-8 loop.
while (start != end) {
uint32_t cp = utf8::internal::mask16(*start++);
// Take care of surrogate pairs first
if (utf8::internal::is_lead_surrogate(cp)) {
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
}
result = utf8::unchecked::append(cp, result);
}
return result;
// Appears to be the post-change body: dispatch to utf16to8_unchecked with the
// swapped or unswapped policy according to swapbytes.
if (swapbytes)
return utf16to8_unchecked<swapped>(start, end, result);
else
return utf16to8_unchecked<unswapped>(start, end, result);
}
// NOTE(review): hunk rendered without +/- diff markers; removed and added
// lines are interleaved. Annotations mark the apparent side of each part.
template <typename u16bit_iterator, typename octet_iterator>
// Appears to be the pre-change line: signature of the utf8to16 that used to sit here.
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
// Appears to be the post-change line: utf16to8 overload without a swap argument.
octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
// Appears to be the pre-change body: UTF-8 -> UTF-16 loop built on utf8::unchecked::next(start).
while (start < end) {
uint32_t cp = utf8::unchecked::next(start);
if (cp > 0xffff) { //make a surrogate pair
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
}
else
*result++ = static_cast<uint16_t>(cp);
}
return result;
// Appears to be the post-change body: forward to the bool overload with swapping off.
return utf16to8(false, start, end, result);
}
// Convert the UTF-16 range [start, end) to UTF-8 written through result,
// using an endianess hint: internal::is_byte_swap_required(hint) decides
// whether the bool overload is asked to swap bytes.
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8(endianess hint, u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
    const bool swap_needed = internal::is_byte_swap_required(hint);
    return utf16to8(swap_needed, start, end, result);
}
// Convert the UTF-8 range [start, end) to UTF-16 written through result
// using utf8to16_unchecked.
//   swapbytes - true selects the swapped policy, false the unswapped one.
// Returns the output iterator returned by utf8to16_unchecked.
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (bool swapbytes, octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    return swapbytes
        ? utf8to16_unchecked<swapped>(start, end, result)
        : utf8to16_unchecked<unswapped>(start, end, result);
}
// Convert the UTF-8 range [start, end) to UTF-16 written through result,
// without byte swapping: forwards to the bool overload with the flag off.
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    const bool no_swap = false;
    return utf8to16(no_swap, start, end, result);
}
// Convert the UTF-8 range [start, end) to UTF-16 written through result,
// using an endianess hint: internal::is_byte_swap_required(hint) decides
// whether the bool overload is asked to swap bytes.
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16(endianess hint, octet_iterator start, octet_iterator end, u16bit_iterator result)
{
    const bool swap_needed = internal::is_byte_swap_required(hint);
    return utf8to16(swap_needed, start, end, result);
}
template <typename octet_iterator, typename u32bit_iterator>

View file

@ -6,6 +6,8 @@ using namespace std;
#if __cplusplus >= 201103L // C++ 11 or later
static void swap_u16_raw_buffer(vector<uint8_t>& buffer);
TEST(CPP11APITests, test_append)
{
string u;
@ -103,4 +105,70 @@ TEST(CPP11APITests, test_starts_with_bom)
bool no_bbom = starts_with_bom(threechars);
EXPECT_FALSE (no_bbom);
}
// Check UTF-16 -> UTF-8 conversion with an endianess hint.
//   u16_src_raw - UTF-16 input as raw bytes laid out in the byte order named by hint
//   hint        - byte order of u16_src_raw
//   u8_ref      - expected UTF-8 output
// Expects the string overload and unchecked::utf16to8 to reproduce u8_ref when
// given the hint, and the unchecked conversion with the opposite swap decision
// to produce something different.
static void test_utf16to8_endianess_hint(const vector<uint8_t>& u16_src_raw, endianess hint, const string& u8_ref)
{
    // Copy the raw bytes into the string's own storage through an unsigned
    // char pointer. Reading the uint8_t buffer through a char16_t pointer
    // would break the aliasing rules.
    u16string u16_src(u16_src_raw.size() / 2, u'\0');
    unsigned char* u16_bytes = reinterpret_cast<unsigned char*>(&u16_src[0]);
    for (size_t i = 0; i < u16_src.size() * sizeof(char16_t); i++)
        u16_bytes[i] = u16_src_raw[i];

    string u8_conv = utf16to8(u16_src, hint);
    EXPECT_EQ(u8_conv, u8_ref);
    u8_conv.clear();
    unchecked::utf16to8(hint, u16_src.begin(), u16_src.end(), std::back_inserter(u8_conv));
    EXPECT_EQ(u8_conv, u8_ref);
    u8_conv.clear();
    // Deliberately use the wrong swap decision: the output must not match
    unchecked::utf16to8(!internal::is_byte_swap_required(hint), u16_src.begin(), u16_src.end(), std::back_inserter(u8_conv));
    EXPECT_NE(u8_conv, u8_ref);
}
// Runs the UTF-16 -> UTF-8 hint check twice on the same text: first with the
// raw bytes as listed and the big_endian hint, then with every byte pair
// swapped in place and the little_endian hint.
TEST(CPP11APITests, test_utf16to8_endianess_hint)
{
// https://r12a.github.io/app-conversion/?q=%F0%A0%94%80%F0%A0%81%85%F0%A0%81%BAad%F0%A0%81%BC
// Expected UTF-8 bytes
string u8_ref = "\xF0\xA0\x94\x80\xF0\xA0\x81\x85\xF0\xA0\x81\xBA\x61\x64\xF0\xA0\x81\xBC";
// UTF-16 input as raw bytes, high byte of each unit first
vector<uint8_t> u16_src_raw = { 0xD8, 0x41, 0xDD, 0x00, 0xD8, 0x40, 0xDC, 0x45, 0xD8, 0x40,
0xDC, 0x7A, 0x00, 0x61, 0x00, 0x64, 0xD8, 0x40, 0xDC, 0x7C };
test_utf16to8_endianess_hint(u16_src_raw, endianess::big_endian, u8_ref);
// Swap each byte pair in place so the same buffer holds the low-byte-first layout
swap_u16_raw_buffer(u16_src_raw);
test_utf16to8_endianess_hint(u16_src_raw, endianess::little_endian, u8_ref);
}
// Check UTF-8 -> UTF-16 conversion with an endianess hint.
//   u8_src      - UTF-8 input
//   u16_ref_raw - expected UTF-16 output as raw bytes laid out in the byte order named by hint
//   hint        - byte order requested for the output
// Expects the string overload and unchecked::utf8to16 to reproduce the
// reference when given the hint, and the unchecked conversion with the
// opposite swap decision to produce something different.
static void test_utf8to16_endianess_hint(const string& u8_src, const vector<uint8_t>& u16_ref_raw, endianess hint)
{
    // Copy the raw bytes into the string's own storage through an unsigned
    // char pointer. Reading the uint8_t buffer through a char16_t pointer
    // would break the aliasing rules.
    u16string u16_ref(u16_ref_raw.size() / 2, u'\0');
    unsigned char* u16_bytes = reinterpret_cast<unsigned char*>(&u16_ref[0]);
    for (size_t i = 0; i < u16_ref.size() * sizeof(char16_t); i++)
        u16_bytes[i] = u16_ref_raw[i];

    u16string u16_conv = utf8::utf8to16(u8_src, hint);
    EXPECT_EQ(u16_conv, u16_ref);
    u16_conv.clear();
    unchecked::utf8to16(hint, u8_src.begin(), u8_src.end(), std::back_inserter(u16_conv));
    EXPECT_EQ(u16_conv, u16_ref);
    u16_conv.clear();
    // Deliberately use the wrong swap decision: the output must not match
    unchecked::utf8to16(!internal::is_byte_swap_required(hint), u8_src.begin(), u8_src.end(), std::back_inserter(u16_conv));
    EXPECT_NE(u16_conv, u16_ref);
}
// Runs the UTF-8 -> UTF-16 hint check twice on the same text: first against
// the reference bytes as listed with the big_endian hint, then against the
// same bytes with every pair swapped in place and the little_endian hint.
TEST(CPP11APITests, test_utf8to16_endianess_hint)
{
// https://r12a.github.io/app-conversion/?q=hello
string u8_src = "hello";
// Expected UTF-16 output as raw bytes, high byte of each unit first
vector<uint8_t> u16_ref_raw = { 0x00, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F };
test_utf8to16_endianess_hint(u8_src, u16_ref_raw, endianess::big_endian);
// Swap each byte pair in place so the same buffer holds the low-byte-first layout
swap_u16_raw_buffer(u16_ref_raw);
test_utf8to16_endianess_hint(u8_src, u16_ref_raw, endianess::little_endian);
}
// Exchange the two bytes of every consecutive byte pair in buffer, in place.
// When the buffer has an odd number of bytes the last byte is left as is.
void swap_u16_raw_buffer(vector<uint8_t>& buffer)
{
    for (size_t first = 0; first + 1 < buffer.size(); first += 2)
    {
        const size_t second = first + 1;
        const uint8_t saved = buffer[first];
        buffer[first] = buffer[second];
        buffer[second] = saved;
    }
}
#endif // C++ 11 or later