diff --git a/3party/utfcpp/doc/ReleaseNotes b/3party/utfcpp/doc/ReleaseNotes old mode 100644 new mode 100755 index 857a72b756..c0c01a3c61 --- a/3party/utfcpp/doc/ReleaseNotes +++ b/3party/utfcpp/doc/ReleaseNotes @@ -1,9 +1,12 @@ utf8 cpp library -Release 2.2.4 +Release 2.3.1 -This is a minor bug fix release that improves converting from utf-16 to utf-8 error detection. +A bug fix release. Special thanks to dalle and Ivan Sorokin who reported the bugs. -Changes from version 2.2.3 -- Bug fix [2857454] dereference invalid iterator when lead surrogate was last element of the string. +Changes from version 2.3 +- Bug fix [3025042]: is_bom documentation issue. +- Bug fix [3083640]: is_code_point_valid incorrectly returns false. +- Bug fix [3167987]: prior moves it before start. +- Bug fix [3185087]: utf8::prior and utf8::previous documentation issue. Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes diff --git a/3party/utfcpp/doc/utf8cpp.html b/3party/utfcpp/doc/utf8cpp.html old mode 100644 new mode 100755 index 069c2be521..c0a89a27de --- a/3party/utfcpp/doc/utf8cpp.html +++ b/3party/utfcpp/doc/utf8cpp.html @@ -67,7 +67,7 @@
- cp
: A 32 bit integer representing a code point to append to the
+ octet_iterator
: an output iterator.
+ cp
: a 32 bit integer representing a code point to append to the
sequence.
- result
: An output iterator to the place in the sequence where to
+ result
: an output iterator to the place in the sequence where to
append the code point.
- Return value: An iterator pointing to the place
+ Return value: an iterator pointing to the place
after the newly appended sequence.
@@ -326,6 +327,7 @@ uint32_t next(octet_iterator& it, octet_iterator end);
+ octet_iterator
: an input iterator.
it
: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
beginning of the next code point.
@@ -370,6 +372,7 @@ uint32_t peek_next(octet_iterator it, octet_iterator end);
+ octet_iterator
: an input iterator.
it
: an iterator pointing to the beginning of an UTF-8
encoded code point.
end
: end of the UTF-8 sequence to be processed. If it
@@ -400,7 +403,7 @@ assert (w == twochars);
Available in version 1.02 and later.
- Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it + Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point.
@@ -411,6 +414,7 @@ uint32_t prior(octet_iterator& it, octet_iterator start);
+ octet_iterator
: a bidirectional iterator.
it
: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.
@@ -437,7 +441,9 @@ assert (w == twochars);
This function has two purposes: one is two iterate backwards through a UTF-8
encoded string. Note that it is usually a better idea to iterate forward instead,
since utf8::next
is faster. The second purpose is to find a beginning
- of a UTF-8 sequence if we have a random position within a string.
+ of a UTF-8 sequence if we have a random position within a string. Note that in that
+ case utf8::prior
may not detect an invalid UTF-8 sequence in some scenarios:
+ for instance if there are superfluous trail octets, it will just skip them.
it
will typically point to the beginning of
@@ -447,10 +453,12 @@ assert (w == twochars);
beginning with that octet is decoded to a 32 bit representation and returned.
- In case pass_end
is reached before a UTF-8 lead octet is hit, or if an
+ In case start
is reached before a UTF-8 lead octet is hit, or if an
invalid UTF-8 sequence is started by the lead octet, an invalid_utf8
exception is thrown.
In case start
equals it
, a not_enough_room
+ exception is thrown.
+ octet_iterator
: a random access iterator.
it
: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.
@@ -507,7 +516,7 @@ assert (w == twochars);
beginning with that octet is decoded to a 32 bit representation and returned.
- In case pass_end
is reached before a UTF-8 lead octet is hit, or if an
+ In case pass_start
is reached before a UTF-8 lead octet is hit, or if an
invalid UTF-8 sequence is started by the lead octet, an invalid_utf8
exception is thrown
+ octet_iterator
: an input iterator.
+ distance_type
: an integral type convertible to octet_iterator
's difference type.
it
: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
nth following code point.
@@ -574,8 +585,9 @@ assert (w == twochars + 5);
+ octet_iterator
: an input iterator.
first
: an iterator to a beginning of a UTF-8 encoded code point.
- last
: an iterator to a "post-end" of the last UTF-8 encoded code
+ last
: an iterator to a "post-end" of the last UTF-8 encoded code
point in the sequence we are trying to determine the length. It can be the
beginning of a new code point, or not.
Return value the distance between the iterators,
@@ -619,6 +631,8 @@ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_itera
+ u16bit_iterator
: an input iterator.
+ octet_iterator
: an output iterator.
start
: an iterator pointing to the beginning of the UTF-16 encoded
string to convert.
end
: an iterator pointing to pass-the-end of the UTF-16 encoded
@@ -661,6 +675,8 @@ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_itera
+ octet_iterator
: an input iterator.
+ u16bit_iterator
: an output iterator.
start
: an iterator pointing to the beginning of the UTF-8 encoded
string to convert. < br /> end
: an iterator pointing to
pass-the-end of the UTF-8 encoded string to convert.
@@ -705,6 +721,8 @@ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_itera
+ octet_iterator
: an output iterator.
+ u32bit_iterator
: an input iterator.
start
: an iterator pointing to the beginning of the UTF-32 encoded
string to convert.
end
: an iterator pointing to pass-the-end of the UTF-32 encoded
@@ -747,6 +765,8 @@ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_itera
+ octet_iterator
: an input iterator.
+ u32bit_iterator
: an output iterator.
start
: an iterator pointing to the beginning of the UTF-8 encoded
string to convert.
end
: an iterator pointing to pass-the-end of the UTF-8 encoded string
@@ -787,6 +807,7 @@ assert (utf32result.size() == 2);
octet_iterator find_invalid(octet_iterator start, octet_iterator end);
+ octet_iterator
: an input iterator.
start
: an iterator pointing to the beginning of the UTF-8 string to
test for validity.
end
: an iterator pointing to pass-the-end of the UTF-8 string to test
@@ -827,6 +848,7 @@ assert (invalid == utf_invalid + 5);
+ octet_iterator
: an input iterator.
start
: an iterator pointing to the beginning of the UTF-8 string to
test for validity.
end
: an iterator pointing to pass-the-end of the UTF-8 string to test
@@ -868,6 +890,8 @@ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output
+ octet_iterator
: an input iterator.
+ output_iterator
: an output iterator.
start
: an iterator pointing to the beginning of the UTF-8 string to
look for invalid UTF-8 sequences.
end
: an iterator pointing to pass-the-end of the UTF-8 string to look
@@ -904,11 +928,48 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
If end
does not point to the past-of-end of a UTF-8 sequence, a
utf8::not_enough_room
exception is thrown.
+ Available in version 2.3 and later. Replaces deprecated is_bom()
function.
+
+ Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM) +
++template <typename octet_iterator> +bool starts_with_bom (octet_iterator it, octet_iterator end); ++
+ octet_iterator
: an input iterator.
+ it
: beginning of the octet sequence to check
+ end
: past-the-end of the sequence to check
+ Return value: true
if the sequence
+ starts with a UTF-8 byte order mark; false
if not.
+
+ Example of use: +
++unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; +bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); +assert (bbom == true); ++
+ The typical use of this function is to check the first three bytes of a file. If + they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 + encoded text. +
- Available in version 1.0 and later.
+ Available in version 1.0 and later. Deprecated in version 2.3. starts_with_bom()
should be used
+ instead.
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM) @@ -916,9 +977,10 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
template <typename octet_iterator> -bool is_bom (octet_iterator it); +bool is_bom (octet_iterator it); // Deprecated
+ octet_iterator
: an input iterator.
it
: beginning of the 3-octet sequence to check
Return value: true
if the sequence
is UTF-8 byte order mark; false
if not.
@@ -938,9 +1000,113 @@ assert (bbom == true);
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
encoded text.
+ If a sequence is
+ shorter than three bytes, an invalid iterator will be dereferenced. Therefore, this function is deprecated
+ in favor of starts_with_bom()
that takes the end of the sequence as an argument.
+
+ Available in version 2.3 and later. +
++ Base class for the exceptions thrown by UTF CPP library functions. +
++class exception : public std::exception {}; ++
+ Example of use: +
++try { + code_that_uses_utf_cpp_library(); +} +catch(const utf8::exception& utfcpp_ex) { + cerr << utfcpp_ex.what(); +} ++ +
+ Available in version 1.0 and later. +
+
+ Thrown by UTF8 CPP functions such as advance
and next
if a UTF-8 sequence represents an invalid code point.
+
+class invalid_code_point : public exception { +public: + uint32_t code_point() const; +}; + ++
+ Member function code_point()
can be used to determine the invalid code point that
+ caused the exception to be thrown.
+
+ Available in version 1.0 and later. +
+
+ Thrown by UTF8 CPP functions such as next
and prior
if an invalid UTF-8 sequence
+ is detected during decoding.
+
+class invalid_utf8 : public exception { +public: + uint8_t utf8_octet() const; +}; ++ +
+ Member function utf8_octet()
can be used to determine the beginning of the byte
+ sequence that caused the exception to be thrown.
+
+ Available in version 1.0 and later. +
+
+ Thrown by UTF8 CPP function utf16to8
if an invalid UTF-16 sequence
+ is detected during decoding.
+
+class invalid_utf16 : public exception { +public: + uint16_t utf16_word() const; +}; ++ +
+ Member function utf16_word()
can be used to determine the UTF-16 code unit
+ that caused the exception to be thrown.
+
+ Available in version 1.0 and later. +
+
+ Thrown by UTF8 CPP functions such as next
if the end of the decoded UTF-8 sequence
+ was reached before the code point was decoded.
+
+class not_enough_room : public exception {}; +