mirror of
https://github.com/nemtrif/utfcpp.git
synced 2025-04-05 05:25:07 +00:00
Add feature to escape non-printable characters
This commit is contained in:
parent
d8b92208fd
commit
4fda8574a4
2 changed files with 82 additions and 15 deletions
|
@ -33,6 +33,14 @@ DEALINGS IN THE SOFTWARE.
|
|||
|
||||
namespace utf8
|
||||
{
|
||||
enum {
|
||||
// Escaping non-printable characters with the "\u" prefix.
|
||||
// This flag should be used in the `replace_invalid` funciton while
|
||||
// sending JSON format data to a web browser or other things likely which expected
|
||||
// u to comply the JSON specification.
|
||||
ESCAPE_NON_PRINTABLE_CHARACTERS = 1 << 1,
|
||||
};
|
||||
|
||||
// Base for the exceptions that may be thrown from the library
|
||||
class exception : public ::std::exception {
|
||||
};
|
||||
|
@ -67,13 +75,61 @@ namespace utf8
|
|||
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
|
||||
};
|
||||
|
||||
class invalid_hex_value : public exception {
|
||||
public:
|
||||
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid hex value"; }
|
||||
}
|
||||
|
||||
/// The library API - functions intended to be called by the users
|
||||
|
||||
inline char hex2char(char hex)
|
||||
{
|
||||
static char table[] = {'0', '1', '2', '3', '4', '5', '6', '7'
|
||||
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
|
||||
|
||||
if (hex > static_cast<char>((sizeof(table) / sizeof(char))))
|
||||
throw utf8::invalid_hex_value;
|
||||
|
||||
return table[hex];
|
||||
}
|
||||
|
||||
|
||||
template <typename octet_iterator>
|
||||
octet_iterator escape_append(uint32_t cp, octet_iterator out)
|
||||
{
|
||||
if (!utf8::internal::is_code_point_valid(cp))
|
||||
throw utf8::invalid_code_point(cp);
|
||||
|
||||
*out++ = '\\';
|
||||
*out++ = 'u';
|
||||
if (cp <= 0xffff) {
|
||||
// Outputing the escaped UTF8 character which formating it's length to 4 by prepending
|
||||
// the character '0'.
|
||||
*out++ = utf8::hex2char((cp & 0x0000f000) >> 12);
|
||||
*out++ = utf8::hex2char((cp & 0x00000f00) >> 8);
|
||||
*out++ = utf8::hex2char((cp & 0x000000f0) >> 4);
|
||||
*out++ = utf8::hex2char((cp * 0x0000000f));
|
||||
} else {
|
||||
*out++ = utf8::hex2char((cp & 0xf0000000) >> 28);
|
||||
*out++ = utf8::hex2char((cp & 0x0f000000) >> 24);
|
||||
*out++ = utf8::hex2char((cp & 0x00f00000) >> 20);
|
||||
*out++ = utf8::hex2char((cp & 0x000f0000) >> 16);
|
||||
*out++ = '\\';
|
||||
*out++ = 'u';
|
||||
*out++ = utf8::hex2char((cp & 0x0000f000) >> 12);
|
||||
*out++ = utf8::hex2char((cp & 0x00000f00) >> 8);
|
||||
*out++ = utf8::hex2char((cp & 0x000000f0) >> 4);
|
||||
*out++ = utf8::hex2char((cp * 0x0000000f));
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
octet_iterator append(uint32_t cp, octet_iterator result)
|
||||
{
|
||||
if (!utf8::internal::is_code_point_valid(cp))
|
||||
throw invalid_code_point(cp);
|
||||
throw utf8::invalid_code_point(cp);
|
||||
|
||||
if (cp < 0x80) // one octet
|
||||
*(result++) = static_cast<uint8_t>(cp);
|
||||
|
@ -96,15 +152,21 @@ namespace utf8
|
|||
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
|
||||
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement, int flags)
|
||||
{
|
||||
while (start != end) {
|
||||
octet_iterator sequence_start = start;
|
||||
internal::utf_error err_code = utf8::internal::validate_next(start, end);
|
||||
uint32_t cp = 0;
|
||||
internal::utf_error err_code = internal::validate_next(start, end, &cp);
|
||||
switch (err_code) {
|
||||
case internal::UTF8_OK :
|
||||
for (octet_iterator it = sequence_start; it != start; ++it)
|
||||
*out++ = *it;
|
||||
if ((flags & utf8::ESCAPE_NON_PRINTABLE_CHARACTERS)
|
||||
&& internal::is_c0c1_control_code(cp)) {
|
||||
out = utf8::escape_append(cp, out);
|
||||
} else {
|
||||
for (octet_iterator it = sequence_start; it != start; ++it)
|
||||
*out++ = *it;
|
||||
}
|
||||
break;
|
||||
case internal::NOT_ENOUGH_ROOM:
|
||||
out = utf8::append (replacement, out);
|
||||
|
@ -129,10 +191,10 @@ namespace utf8
|
|||
}
|
||||
|
||||
template <typename octet_iterator, typename output_iterator>
|
||||
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
|
||||
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, int flags = 0)
|
||||
{
|
||||
static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
|
||||
return utf8::replace_invalid(start, end, out, replacement_marker);
|
||||
return utf8::replace_invalid(start, end, out, replacement_marker, flags);
|
||||
}
|
||||
|
||||
template <typename octet_iterator>
|
||||
|
@ -150,7 +212,7 @@ namespace utf8
|
|||
case internal::OVERLONG_SEQUENCE :
|
||||
throw invalid_utf8(*it);
|
||||
case internal::INVALID_CODE_POINT :
|
||||
throw invalid_code_point(cp);
|
||||
throw utf8::invalid_code_point(cp);
|
||||
}
|
||||
return cp;
|
||||
}
|
||||
|
|
|
@ -133,21 +133,26 @@ namespace internal
|
|||
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
|
||||
{
|
||||
if (cp < 0x80) {
|
||||
if (length != 1)
|
||||
if (length != 1)
|
||||
return true;
|
||||
}
|
||||
else if (cp < 0x800) {
|
||||
if (length != 2)
|
||||
if (length != 2)
|
||||
return true;
|
||||
}
|
||||
else if (cp < 0x10000) {
|
||||
if (length != 3)
|
||||
if (length != 3)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
inline bool is_c0c1_control_code(uint32_t cp)
|
||||
{
|
||||
return (cp < 0x1f) || (cp >= 0x80 && cp <= 0x9f);
|
||||
}
|
||||
|
||||
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
|
||||
|
||||
/// Helper for get_sequence_x
|
||||
|
@ -163,7 +168,7 @@ namespace internal
|
|||
return UTF8_OK;
|
||||
}
|
||||
|
||||
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
|
||||
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
|
||||
|
||||
/// get_sequence_x functions decode utf-8 sequences of the length x
|
||||
template <typename octet_iterator>
|
||||
|
@ -180,7 +185,7 @@ namespace internal
|
|||
template <typename octet_iterator>
|
||||
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
|
||||
{
|
||||
if (it == end)
|
||||
if (it == end)
|
||||
return NOT_ENOUGH_ROOM;
|
||||
|
||||
code_point = utf8::internal::mask8(*it);
|
||||
|
@ -197,7 +202,7 @@ namespace internal
|
|||
{
|
||||
if (it == end)
|
||||
return NOT_ENOUGH_ROOM;
|
||||
|
||||
|
||||
code_point = utf8::internal::mask8(*it);
|
||||
|
||||
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
|
||||
|
@ -282,7 +287,7 @@ namespace internal
|
|||
else
|
||||
err = OVERLONG_SEQUENCE;
|
||||
}
|
||||
else
|
||||
else
|
||||
err = INVALID_CODE_POINT;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue