Updated utf8cpp to 2.3.1

This commit is contained in:
Alex Zolotarev 2011-10-04 16:16:13 +03:00 committed by Alex Zolotarev
parent c824cfa2d4
commit 1da1ba7602
6 changed files with 251 additions and 62 deletions

11
3party/utfcpp/doc/ReleaseNotes Normal file → Executable file
View file

@ -1,9 +1,12 @@
utf8 cpp library
Release 2.2.4
Release 2.3.1
This is a minor bug fix release that improves converting from utf-16 to utf-8 error detection.
A bug fix release. Special thanks to dalle and Ivan Sorokin who reported the bugs.
Changes from version 2.2.3
- Bug fix [2857454] dereference invalid iterator when lead surrogate was last element of the string.
Changes from version 2.3
- Bug fix [3025042]: is_bom documentation issue.
- Bug fix [3083640]: is_code_point_valid incorrectly returns false.
- Bug fix [3167987]: prior moves it before start.
- Bug fix [3185087]: utf8::prior and utf8::previous documentation issue.
Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes

188
3party/utfcpp/doc/utf8cpp.html Normal file → Executable file
View file

@ -67,7 +67,7 @@
<li>
<a href=#fixinvalid>Ensure that a string contains valid UTF-8 text</a>
</li>
</li>
</ul>
<li>
<a href="#reference">Reference</a>
<ul class="toc">
@ -275,11 +275,12 @@ octet_iterator append(uint32_t cp, octet_iterator result);
</pre>
<p>
<code>cp</code>: A 32 bit integer representing a code point to append to the
<code>octet_iterator</code>: an output iterator.<br>
<code>cp</code>: a 32 bit integer representing a code point to append to the
sequence.<br>
<code>result</code>: An output iterator to the place in the sequence where to
<code>result</code>: an output iterator to the place in the sequence where to
append the code point.<br>
<span class="return_value">Return value</span>: An iterator pointing to the place
<span class="return_value">Return value</span>: an iterator pointing to the place
after the newly appended sequence.
</p>
<p>
@ -326,6 +327,7 @@ uint32_t next(octet_iterator&amp; it, octet_iterator end);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
beginning of the next code point.<br>
@ -370,6 +372,7 @@ uint32_t peek_next(octet_iterator it, octet_iterator end);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: an iterator pointing to the beginning of an UTF-8
encoded code point.<br>
<code>end</code>: end of the UTF-8 sequence to be processed. If <code>it</code>
@ -400,7 +403,7 @@ assert (w == twochars);
Available in version 1.02 and later.
</p>
<p>
Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
code point and returns the 32 bits representation of the code point.
</p>
@ -411,6 +414,7 @@ uint32_t prior(octet_iterator&amp; it, octet_iterator start);
</pre>
<p>
<code>octet_iterator</code>: a bidirectional iterator.<br>
<code>it</code>: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.<br>
@ -437,7 +441,9 @@ assert (w == twochars);
This function has two purposes: one is to iterate backwards through a UTF-8
encoded string. Note that it is usually a better idea to iterate forward instead,
since <code>utf8::next</code> is faster. The second purpose is to find a beginning
of a UTF-8 sequence if we have a random position within a string.
of a UTF-8 sequence if we have a random position within a string. Note that in that
case <code>utf8::prior</code> may not detect an invalid UTF-8 sequence in some scenarios:
for instance if there are superfluous trail octets, it will just skip them.
</p>
<p>
<code>it</code> will typically point to the beginning of
@ -447,10 +453,12 @@ assert (w == twochars);
beginning with that octet is decoded to a 32 bit representation and returned.
</p>
<p>
In case <code>pass_end</code> is reached before a UTF-8 lead octet is hit, or if an
In case <code>start</code> is reached before a UTF-8 lead octet is hit, or if an
invalid UTF-8 sequence is started by the lead octet, an <code>invalid_utf8</code>
exception is thrown.
</p>
<p>In case <code>start</code> equals <code>it</code>, a <code>not_enough_room</code>
exception is thrown.
</p>
<h4>
utf8::previous
</h4>
@ -469,6 +477,7 @@ uint32_t previous(octet_iterator&amp; it, octet_iterator pass_start);
</pre>
<p>
<code>octet_iterator</code>: a random access iterator.<br>
<code>it</code>: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.<br>
@ -507,7 +516,7 @@ assert (w == twochars);
beginning with that octet is decoded to a 32 bit representation and returned.
</p>
<p>
In case <code>pass_end</code> is reached before a UTF-8 lead octet is hit, or if an
In case <code>pass_start</code> is reached before a UTF-8 lead octet is hit, or if an
invalid UTF-8 sequence is started by the lead octet, an <code>invalid_utf8</code>
exception is thrown.
</p>
@ -529,6 +538,8 @@ assert (w == twochars);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>distance_type</code>: an integral type convertible to <code>octet_iterator</code>'s difference type.<br>
<code>it</code>: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
nth following code point.<br>
@ -574,8 +585,9 @@ assert (w == twochars + <span class="literal">5</span>);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>first</code>: an iterator to a beginning of a UTF-8 encoded code point.<br>
<code>last</code>: an iterator to a "post-end" of the last UTF-8 encoded code
<code>last</code>: an iterator to a "post-end" of the last UTF-8 encoded code
point in the sequence we are trying to determine the length. It can be the
beginning of a new code point, or not.<br>
<span class="return_value">Return value</span>: the distance between the iterators,
@ -619,6 +631,8 @@ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_itera
</pre>
<p>
<code>u16bit_iterator</code>: an input iterator.<br>
<code>octet_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-16 encoded
string to convert.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-16 encoded
@ -661,6 +675,8 @@ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_itera
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>u16bit_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 encoded
string to convert.<br> <code>end</code>: an iterator pointing to
pass-the-end of the UTF-8 encoded string to convert.<br>
@ -705,6 +721,8 @@ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_itera
</pre>
<p>
<code>octet_iterator</code>: an output iterator.<br>
<code>u32bit_iterator</code>: an input iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-32 encoded
string to convert.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-32 encoded
@ -747,6 +765,8 @@ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_itera
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>u32bit_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 encoded
string to convert.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 encoded string
@ -787,6 +807,7 @@ assert (utf32result.size() == <span class="literal">2</span>);
octet_iterator find_invalid(octet_iterator start, octet_iterator end);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
test for validity.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 string to test
@ -827,6 +848,7 @@ assert (invalid == utf_invalid + <span class="literal">5</span>);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
test for validity.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 string to test
@ -868,6 +890,8 @@ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>output_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
look for invalid UTF-8 sequences.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 string to look
@ -904,11 +928,48 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
If <code>end</code> does not point to the past-of-end of a UTF-8 sequence, a
<code>utf8::not_enough_room</code> exception is thrown.
</p>
<h4>
utf8::starts_with_bom
</h4>
<p class="version">
Available in version 2.3 and later. Replaces deprecated <code>is_bom()</code> function.
</p>
<p>
Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM)
</p>
<pre>
<span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt;
<span class="keyword">bool</span> starts_with_bom (octet_iterator it, octet_iterator end);
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: beginning of the octet sequence to check<br>
<code>end</code>: pass-end of the sequence to check<br>
<span class="return_value">Return value</span>: <code>true</code> if the sequence
starts with a UTF-8 byte order mark; <code>false</code> if not.
</p>
<p>
Example of use:
</p>
<pre>
<span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
"literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
"literal">0xbf</span>};
<span class="keyword">bool</span> bbom = starts_with_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
assert (bbom == <span class="literal">true</span>);
</pre>
<p>
The typical use of this function is to check the first three bytes of a file. If
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
encoded text.
</p>
<h4>
utf8::is_bom
</h4>
<p class="version">
Available in version 1.0 and later.
Available in version 1.0 and later. Deprecated in version 2.3. <code>starts_with_bom()</code> should be used
instead.
</p>
<p>
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
@ -916,9 +977,10 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
<pre>
<span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt;
<span class="keyword">bool</span> is_bom (octet_iterator it);
<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
</pre>
<p>
<code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: beginning of the 3-octet sequence to check<br>
<span class="return_value">Return value</span>: <code>true</code> if the sequence
is UTF-8 byte order mark; <code>false</code> if not.
@ -938,9 +1000,113 @@ assert (bbom == <span class="literal">true</span>);
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
encoded text.
</p>
<p>
If a sequence is
shorter than three bytes, an invalid iterator will be dereferenced. Therefore, this function is deprecated
in favor of <code>starts_with_bom()</code>, which takes the end of the sequence as an argument.
</p>
<h3 id="typesutf8">
Types From utf8 Namespace
</h3>
<h4>utf8::exception
</h4>
<p class="version">
Available in version 2.3 and later.
</p>
<p>
Base class for the exceptions thrown by UTF CPP library functions.
</p>
<pre>
<span class="keyword">class</span> exception : <span class="keyword">public</span> std::exception {};
</pre>
<p>
Example of use:
</p>
<pre>
<span class="keyword">try</span> {
code_that_uses_utf_cpp_library();
}
<span class="keyword">catch</span>(<span class="keyword">const</span> utf8::exception&amp; utfcpp_ex) {
cerr &lt;&lt; utfcpp_ex.what();
}
</pre>
<h4>utf8::invalid_code_point
</h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p>
Thrown by UTF8 CPP functions such as <code>advance</code> and <code>next</code> if a UTF-8 sequence represents an invalid code point.
</p>
<pre>
<span class="keyword">class</span> invalid_code_point : <span class="keyword">public</span> exception {
<span class="keyword">public</span>:
uint32_t code_point() <span class="keyword">const</span>;
};
</pre>
<p>
Member function <code>code_point()</code> can be used to determine the invalid code point that
caused the exception to be thrown.
</p>
<h4>utf8::invalid_utf8
</h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p>
Thrown by UTF8 CPP functions such as <code>next</code> and <code>prior</code> if an invalid UTF-8 sequence
is detected during decoding.
</p>
<pre>
<span class="keyword">class</span> invalid_utf8 : <span class="keyword">public</span> exception {
<span class="keyword">public</span>:
uint8_t utf8_octet() <span class="keyword">const</span>;
};
</pre>
<p>
Member function <code>utf8_octet()</code> can be used to determine the beginning of the byte
sequence that caused the exception to be thrown.
</p>
<h4>utf8::invalid_utf16
</h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p>
Thrown by UTF8 CPP function <code>utf16to8</code> if an invalid UTF-16 sequence
is detected during decoding.
</p>
<pre>
<span class="keyword">class</span> invalid_utf16 : <span class="keyword">public</span> exception {
<span class="keyword">public</span>:
uint16_t utf16_word() <span class="keyword">const</span>;
};
</pre>
<p>
Member function <code>utf16_word()</code> can be used to determine the UTF-16 code unit
that caused the exception to be thrown.
</p>
<h4>utf8::not_enough_room
</h4>
<p class="version">
Available in version 1.0 and later.
</p>
<p>
Thrown by UTF8 CPP functions such as <code>next</code> if the end of the decoded UTF-8 sequence
was reached before the code point was decoded.
</p>
<pre>
<span class="keyword">class</span> not_enough_room : <span class="keyword">public</span> exception {};
</pre>
<h4>
utf8::iterator
</h4>

View file

@ -1,34 +1,34 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "utf8/checked.h"
#include "utf8/unchecked.h"
#endif // header guard
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "utf8/checked.h"
#include "utf8/unchecked.h"
#endif // header guard

View file

@ -33,8 +33,12 @@ DEALINGS IN THE SOFTWARE.
namespace utf8
{
// Common base type for the exceptions the library may throw.
// It declares no members of its own beyond what std::exception provides.
class exception : public std::exception {};
// Exceptions that may be thrown from the library functions.
class invalid_code_point : public std::exception {
class invalid_code_point : public exception {
uint32_t cp;
public:
invalid_code_point(uint32_t cp) : cp(cp) {}
@ -42,7 +46,7 @@ namespace utf8
uint32_t code_point() const {return cp;}
};
class invalid_utf8 : public std::exception {
class invalid_utf8 : public exception {
uint8_t u8;
public:
invalid_utf8 (uint8_t u) : u8(u) {}
@ -50,7 +54,7 @@ namespace utf8
uint8_t utf8_octet() const {return u8;}
};
class invalid_utf16 : public std::exception {
class invalid_utf16 : public exception {
uint16_t u16;
public:
invalid_utf16 (uint16_t u) : u16(u) {}
@ -58,7 +62,7 @@ namespace utf8
uint16_t utf16_word() const {return u16;}
};
class not_enough_room : public std::exception {
class not_enough_room : public exception {
public:
virtual const char* what() const throw() { return "Not enough space"; }
};
@ -157,13 +161,17 @@ namespace utf8
template <typename octet_iterator>
uint32_t prior(octet_iterator& it, octet_iterator start)
{
octet_iterator end = it;
{
// can't do much if it == start
if (it == start)
throw not_enough_room();
octet_iterator end = it;
// Go back until we hit either a lead octet or start
while (internal::is_trail(*(--it)))
if (it < start)
if (it == start)
throw invalid_utf8(*it); // error - no lead byte in the sequence
octet_iterator temp = it;
return next(temp, end);
return peek_next(it, end);
}
/// Deprecated in versions that include "prior"
@ -249,7 +257,7 @@ namespace utf8
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
while (start < end)
while (start != end)
(*result++) = next(start, end);
return result;

16
3party/utfcpp/source/utf8/core.h Normal file → Executable file
View file

@ -92,7 +92,7 @@ namespace internal
template <typename u32>
inline bool is_code_point_valid(u32 cp)
{
return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
return (cp <= CODE_POINT_MAX && !is_surrogate(cp));
}
template <typename octet_iterator>
@ -112,7 +112,8 @@ namespace internal
return 0;
}
inline bool is_overlong_sequence(uint32_t cp, int length)
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
if (cp < 0x80) {
if (length != 1)
@ -330,6 +331,17 @@ namespace internal
return (find_invalid(start, end) == end);
}
// Reports whether the octet range [it, end) begins with the three octets
// bom[0], bom[1], bom[2], in that order.
//
// it:  start of the octet sequence to inspect (taken by value, so the
//      caller's iterator is not advanced).
// end: one past the last octet of the sequence.
//
// `end` is compared before every dereference, so a range holding fewer than
// three octets yields false.
template <typename octet_iterator>
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
{
    // First octet.
    if (it == end || internal::mask8(*it) != bom[0])
        return false;
    ++it;

    // Second octet.
    if (it == end || internal::mask8(*it) != bom[1])
        return false;
    ++it;

    // Third octet decides the result; no further advance is needed.
    return it != end && internal::mask8(*it) == bom[2];
}
//Deprecated in release 2.3
template <typename octet_iterator>
inline bool is_bom (octet_iterator it)
{

2
3party/utfcpp/source/utf8/unchecked.h Normal file → Executable file
View file

@ -144,7 +144,7 @@ namespace utf8
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
while (start != end) {
while (start < end) {
uint32_t cp = next(start);
if (cp > 0xffff) { //make a surrogate pair
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);