From 2af93bd6700a3ba10ab1c97998285087be119734 Mon Sep 17 00:00:00 2001 From: Nemanja Trifunovic Date: Fri, 30 Apr 2021 15:06:55 -0400 Subject: [PATCH] Documentation: C++ 17 support. --- README.md | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/README.md b/README.md index ee6ef47..a519cdb 100644 --- a/README.md +++ b/README.md @@ -392,6 +392,30 @@ Example of use: In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. +#### utf8::utf16to8 + +Available in version 3.2 and later. Requires a C++ 17 compliant compiler. + +Converts a UTF-16 encoded string to UTF-8. + +```cpp +std::string utf16to8(std::u16string_view s); +``` + +`s`: a UTF-16 encoded string. +Return value: A UTF-8 encoded string. + +Example of use: + +```cpp + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview(utf16string); + string u = utf16to8(utf16stringview); + assert (u.size() == 10); +``` + +In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + #### utf8::utf16to8 @@ -447,6 +471,32 @@ assert (utf16result[3] == 0xdd1e); In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. +#### utf8::utf8to16 + +Available in version 3.2 and later. Requires a C++ 17 compliant compiler. + +Converts a UTF-8 encoded string to UTF-16. + +```cpp +std::u16string utf8to16(std::string_view s); +``` + +`s`: a UTF-8 encoded string to convert. +Return value: A UTF-16 encoded string. + +Example of use: + +```cpp +string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +u16string utf16result = utf8to16(utf8_with_surrogates); +assert (utf16result.length() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + + #### utf8::utf8to16 Available in version 1.0 and later. 
@@ -500,6 +550,31 @@ assert (utf8result.size() == 9); In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. +#### utf8::utf32to8 + +Available in version 3.2 and later. Requires a C++ 17 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::string utf32to8(std::u32string_view s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +u32string_view utf32stringview(utf32string); +string utf8result = utf32to8(utf32stringview); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + #### utf8::utf32to8 Available in version 1.0 and later. @@ -552,6 +627,29 @@ assert (utf32result.size() == 2); In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. +#### utf8::utf8to32 + +Available in version 3.2 and later. Requires a C++ 17 compliant compiler. + +Converts a UTF-8 encoded string to UTF-32. + +```cpp +std::u32string utf8to32(std::string_view s); +``` + +`s`: a UTF-8 encoded string. +Return value: a UTF-32 encoded string. + +Example of use: + +```cpp +string_view twochars = "\xe6\x97\xa5\xd1\x88"; +u32string utf32result = utf8to32(twochars); +assert (utf32result.size() == 2); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + #### utf8::utf8to32 @@ -605,6 +703,30 @@ assert (invalid == 5); This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. +#### utf8::find_invalid + +Available in version 3.2 and later. Requires a C++ 17 compliant compiler. + +Detects an invalid sequence within a UTF-8 string. + +```cpp +std::size_t find_invalid(std::string_view s); +``` + +`s`: a UTF-8 encoded string. 
+Return value: the index of the first invalid octet in the UTF-8 string. In case none were found, it equals `std::string_view::npos`. + +Example of use: + +```cpp +string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; +auto invalid = find_invalid(utf_invalid); +assert (invalid == 5); +``` + +This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it before doing any of the _unchecked_ operations on it. + + #### utf8::find_invalid Available in version 1.0 and later. @@ -654,6 +776,30 @@ assert (bvalid == false); You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid. +#### utf8::is_valid + +Available in version 3.2 and later. Requires a C++ 17 compliant compiler. + +Checks whether a string object contains valid UTF-8 encoded text. + +```cpp +bool is_valid(std::string_view s); +``` + +`s`: a UTF-8 encoded string. +Return value: `true` if the string contains valid UTF-8 encoded text; `false` if not. + +Example of use: + +```cpp +string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; +bool bvalid = is_valid(utf_invalid); +assert (bvalid == false); +``` + +You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid. + + #### utf8::is_valid Available in version 1.0 and later. @@ -706,6 +852,33 @@ const string fixed_invalid_sequence = "a????z"; assert (fixed_invalid_sequence == replace_invalid_result); ``` +#### utf8::replace_invalid + +Available in version 3.2 and later. Requires a C++ 17 compliant compiler. + +Replaces all invalid UTF-8 sequences within a string with a replacement marker. + +```cpp +std::string replace_invalid(std::string_view s, char32_t replacement); +std::string replace_invalid(std::string_view s); +``` + +`s`: a UTF-8 encoded string. 
+`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd`. +Return value: A UTF-8 encoded string with replaced invalid sequences. + +Example of use: + +```cpp +string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +string replace_invalid_result = replace_invalid(invalid_sequence, '?'); +bool bvalid = is_valid(replace_invalid_result); +assert (bvalid); +const string fixed_invalid_sequence = "a????z"; +assert (fixed_invalid_sequence == replace_invalid_result); +``` + + #### utf8::replace_invalid Available in version 2.0 and later. @@ -768,6 +941,34 @@ assert (no_bbom == false); The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. +#### utf8::starts_with_bom + +Available in version 3.2 and later. Requires a C++ 17 compliant compiler. + +Checks whether a string starts with a UTF-8 byte order mark (BOM). + +```cpp +bool starts_with_bom(std::string_view s); +``` + +`s`: a UTF-8 encoded string. +Return value: `true` if the string starts with a UTF-8 byte order mark; `false` if not. + +Example of use: + +```cpp +string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; +string_view byte_order_mark_view(byte_order_mark); +bool bbom = starts_with_bom(byte_order_mark_view); +assert (bbom); +string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; +bool no_bbom = starts_with_bom(threechars); +assert (!no_bbom); + ``` + +The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. + + #### utf8::starts_with_bom Available in version 2.3 and later.