mirror of
https://github.com/nemtrif/utfcpp.git
synced 2025-04-04 13:05:06 +00:00
Documentation: C++ 17 support.
This commit is contained in:
parent
b6cd18f929
commit
2af93bd670
1 changed files with 201 additions and 0 deletions
201
README.md
201
README.md
|
@ -392,6 +392,30 @@ Example of use:
|
|||
|
||||
In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown.
|
||||
|
||||
#### utf8::utf16to8
|
||||
|
||||
Available in version 3.2 and later. Requires a C++ 17 compliant compiler.
|
||||
|
||||
Converts a UTF-16 encoded string to UTF-8.
|
||||
|
||||
```cpp
|
||||
std::string utf16to8(std::u16string_view s);
|
||||
```
|
||||
|
||||
`s`: a UTF-16 encoded string.
|
||||
Return value: A UTF-8 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
|
||||
u16string_view utf16stringview(utf16string);
|
||||
string u = utf16to8(utf16stringview);
|
||||
assert (u.size() == 10);
|
||||
```
|
||||
|
||||
In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf16to8
|
||||
|
||||
|
@ -447,6 +471,32 @@ assert (utf16result[3] == 0xdd1e);
|
|||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
|
||||
|
||||
#### utf8::utf8to16
|
||||
|
||||
Available in version 3.2 and later. Requires a C++ 17 compliant compiler.
|
||||
|
||||
Converts a UTF-8 encoded string to UTF-16.
|
||||
|
||||
```cpp
|
||||
std::u16string utf8to16(std::string_view s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string to convert.
|
||||
Return value: A UTF-16 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
|
||||
u16string utf16result = utf8to16(utf8_with_surrogates);
|
||||
assert (utf16result.length() == 4);
|
||||
assert (utf16result[2] == 0xd834);
|
||||
assert (utf16result[3] == 0xdd1e);
|
||||
```
|
||||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf8to16
|
||||
|
||||
Available in version 1.0 and later.
|
||||
|
@ -500,6 +550,31 @@ assert (utf8result.size() == 9);
|
|||
|
||||
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
|
||||
|
||||
#### utf8::utf32to8
|
||||
|
||||
Available in version 3.2 and later. Requires a C++ 17 compliant compiler.
|
||||
|
||||
Converts a UTF-32 encoded string to UTF-8.
|
||||
|
||||
```cpp
|
||||
std::string utf32to8(std::u32string_view s);
|
||||
```
|
||||
|
||||
`s`: a UTF-32 encoded string.
|
||||
Return value: a UTF-8 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
u32string utf32string = {0x448, 0x65E5, 0x10346};
|
||||
u32string_view utf32stringview(utf32string);
|
||||
string utf8result = utf32to8(utf32stringview);
|
||||
assert (utf8result.size() == 9);
|
||||
```
|
||||
|
||||
In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf32to8
|
||||
|
||||
Available in version 1.0 and later.
|
||||
|
@ -552,6 +627,29 @@ assert (utf32result.size() == 2);
|
|||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
|
||||
|
||||
#### utf8::utf8to32
|
||||
|
||||
Available in version 3.2 and later. Requires a C++ 17 compliant compiler.
|
||||
|
||||
Converts a UTF-8 encoded string to UTF-32.
|
||||
|
||||
```cpp
|
||||
std::u32string utf8to32(std::string_view s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string.
|
||||
Return value: a UTF-32 encoded string.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
string_view twochars = "\xe6\x97\xa5\xd1\x88";
|
||||
u32string utf32result = utf8to32(twochars);
|
||||
assert (utf32result.size() == 2);
|
||||
```
|
||||
|
||||
In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
|
||||
|
||||
|
||||
#### utf8::utf8to32
|
||||
|
||||
|
@ -605,6 +703,30 @@ assert (invalid == 5);
|
|||
|
||||
This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it before doing any of the _unchecked_ operations on it.
|
||||
|
||||
#### utf8::find_invalid
|
||||
|
||||
Available in version 3.2 and later. Requires a C++ 17 compliant compiler.
|
||||
|
||||
Detects an invalid sequence within a UTF-8 string.
|
||||
|
||||
```cpp
|
||||
std::size_t find_invalid(std::string_view s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string.
|
||||
Return value: the index of the first invalid octet in the UTF-8 string. In case none were found, equals `std::string_view::npos`.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
|
||||
auto invalid = find_invalid(utf_invalid);
|
||||
assert (invalid == 5);
|
||||
```
|
||||
|
||||
This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it before doing any of the _unchecked_ operations on it.
|
||||
|
||||
|
||||
#### utf8::find_invalid
|
||||
|
||||
Available in version 1.0 and later.
|
||||
|
@ -654,6 +776,30 @@ assert (bvalid == false);
|
|||
|
||||
You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid.
|
||||
|
||||
#### utf8::is_valid
|
||||
|
||||
Available in version 3.2 and later. Requires a C++ 17 compliant compiler.
|
||||
|
||||
Checks whether a string object contains valid UTF-8 encoded text.
|
||||
|
||||
```cpp
|
||||
bool is_valid(std::string_view s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string.
|
||||
Return value: `true` if the string contains valid UTF-8 encoded text; `false` if not.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
|
||||
bool bvalid = is_valid(utf_invalid);
|
||||
assert (bvalid == false);
|
||||
```
|
||||
|
||||
You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid.
|
||||
|
||||
|
||||
#### utf8::is_valid
|
||||
|
||||
Available in version 1.0 and later.
|
||||
|
@ -706,6 +852,33 @@ const string fixed_invalid_sequence = "a????z";
|
|||
assert (fixed_invalid_sequence == replace_invalid_result);
|
||||
```
|
||||
|
||||
#### utf8::replace_invalid
|
||||
|
||||
Available in version 3.2 and later. Requires a C++ 17 compliant compiler.
|
||||
|
||||
Replaces all invalid UTF-8 sequences within a string with a replacement marker.
|
||||
|
||||
```cpp
|
||||
std::string replace_invalid(std::string_view s, char32_t replacement);
|
||||
std::string replace_invalid(std::string_view s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string.
|
||||
`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd`.
|
||||
Return value: A UTF-8 encoded string with replaced invalid sequences.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
|
||||
string replace_invalid_result = replace_invalid(invalid_sequence, '?');
|
||||
bool bvalid = is_valid(replace_invalid_result);
|
||||
assert (bvalid);
|
||||
const string fixed_invalid_sequence = "a????z";
|
||||
assert (fixed_invalid_sequence == replace_invalid_result);
|
||||
```
|
||||
|
||||
|
||||
#### utf8::replace_invalid
|
||||
|
||||
Available in version 2.0 and later.
|
||||
|
@ -768,6 +941,34 @@ assert (no_bbom == false);
|
|||
The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.
|
||||
|
||||
|
||||
#### utf8::starts_with_bom
|
||||
|
||||
Available in version 3.2 and later. Requires a C++ 17 compliant compiler.
|
||||
|
||||
Checks whether a string starts with a UTF-8 byte order mark (BOM).
|
||||
|
||||
```cpp
|
||||
bool starts_with_bom(std::string_view s);
|
||||
```
|
||||
|
||||
`s`: a UTF-8 encoded string.
|
||||
Return value: `true` if the string starts with a UTF-8 byte order mark; `false` if not.
|
||||
|
||||
Example of use:
|
||||
|
||||
```cpp
|
||||
string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)};
|
||||
string_view byte_order_mark_view(byte_order_mark);
|
||||
bool bbom = starts_with_bom(byte_order_mark_view);
|
||||
assert (bbom);
|
||||
string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
|
||||
bool no_bbom = starts_with_bom(threechars);
|
||||
assert (!no_bbom);
|
||||
```
|
||||
|
||||
The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.
|
||||
|
||||
|
||||
#### utf8::starts_with_bom
|
||||
|
||||
Available in version 2.3 and later.
|
||||
|
|
Loading…
Add table
Reference in a new issue