mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-23004 UTF-8 forward
This commit is contained in:
parent
32131dc2ff
commit
2be45bf6f8
4 changed files with 286 additions and 16 deletions
|
@ -517,7 +517,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
if(U8_IS_TRAIL(__t1)) { \
|
||||
++(i); \
|
||||
} \
|
||||
} else /* c>=0xf0 */ { \
|
||||
} else /* b>=0xf0 */ { \
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
|
||||
++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
|
||||
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <string_view>
|
||||
#ifdef UTYPES_H
|
||||
#include "unicode/utf16.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "unicode/uversion.h"
|
||||
#else
|
||||
// TODO: Remove checks for UTYPES_H and replacement definitions.
|
||||
|
@ -48,7 +49,6 @@ namespace header {}
|
|||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
// Some defined behaviors for handling ill-formed Unicode strings.
|
||||
// TODO: For 8-bit strings, the SURROGATE option does not have an equivalent -- static_assert.
|
||||
typedef enum UIllFormedBehavior {
|
||||
U_BEHAVIOR_NEGATIVE,
|
||||
U_BEHAVIOR_FFFD,
|
||||
|
@ -57,17 +57,6 @@ typedef enum UIllFormedBehavior {
|
|||
|
||||
namespace U_HEADER_ONLY_NAMESPACE {
|
||||
|
||||
// Handle ill-formed UTF-16: One unpaired surrogate.
|
||||
// @internal
|
||||
template<typename CP32, UIllFormedBehavior behavior>
|
||||
CP32 uprv_u16Sub(CP32 surrogate) {
|
||||
switch (behavior) {
|
||||
case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
|
||||
case U_BEHAVIOR_FFFD: return 0xfffd;
|
||||
case U_BEHAVIOR_SURROGATE: return surrogate;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Result of validating and decoding a minimal Unicode code unit sequence.
|
||||
* Returned from validating Unicode string code point iterators.
|
||||
|
@ -198,6 +187,152 @@ private:
|
|||
template<typename UnitIter, typename CP32, UIllFormedBehavior behavior, typename = void>
|
||||
class UTFImpl;
|
||||
|
||||
// UTF-8
|
||||
template<typename UnitIter, typename CP32, UIllFormedBehavior behavior>
|
||||
class UTFImpl<
|
||||
UnitIter,
|
||||
CP32,
|
||||
behavior,
|
||||
std::enable_if_t<
|
||||
sizeof(typename std::iterator_traits<UnitIter>::value_type) == 1>> {
|
||||
static_assert(behavior != U_BEHAVIOR_SURROGATE,
|
||||
"For 8-bit strings, the SURROGATE option does not have an equivalent.");
|
||||
public:
|
||||
// Handle ill-formed UTF-8
|
||||
static CP32 sub() {
|
||||
switch (behavior) {
|
||||
case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
|
||||
case U_BEHAVIOR_FFFD: return 0xfffd;
|
||||
}
|
||||
}
|
||||
|
||||
static void inc(UnitIter &p, UnitIter limit) {
|
||||
// TODO: assert p != limit -- more precisely: start <= p < limit
|
||||
// Very similar to U8_FWD_1().
|
||||
uint8_t b = *p;
|
||||
++p;
|
||||
if (U8_IS_LEAD(b) && p != limit) {
|
||||
uint8_t t1 = *p;
|
||||
if ((0xe0 <= b && b < 0xf0)) {
|
||||
if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
|
||||
++p != limit && U8_IS_TRAIL(*p)) {
|
||||
++p;
|
||||
}
|
||||
} else if (b < 0xe0) {
|
||||
if (U8_IS_TRAIL(t1)) {
|
||||
++p;
|
||||
}
|
||||
} else /* b >= 0xf0 */ {
|
||||
if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
|
||||
++p != limit && U8_IS_TRAIL(*p) &&
|
||||
++p != limit && U8_IS_TRAIL(*p)) {
|
||||
++p;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static CodeUnits<UnitIter, CP32> readAndInc(UnitIter &p, UnitIter limit) {
|
||||
// TODO: assert p != limit -- more precisely: start <= p < limit
|
||||
// Very similar to U8_NEXT_OR_FFFD().
|
||||
UnitIter p0 = p;
|
||||
CP32 c = uint8_t(*p);
|
||||
++p;
|
||||
if (U8_IS_SINGLE(c)) {
|
||||
return {c, 1, true, p0};
|
||||
}
|
||||
uint8_t length = 1;
|
||||
uint8_t t = 0;
|
||||
if (p != limit &&
|
||||
// fetch/validate/assemble all but last trail byte
|
||||
(c >= 0xe0 ?
|
||||
(c < 0xf0 ? // U+0800..U+FFFF except surrogates
|
||||
U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
|
||||
(t &= 0x3f, 1)
|
||||
: // U+10000..U+10FFFF
|
||||
(c -= 0xf0) <= 4 &&
|
||||
U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
|
||||
(c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
|
||||
(t = *p - 0x80) <= 0x3f) &&
|
||||
// valid second-to-last trail byte
|
||||
(c = (c << 6) | t, ++length, ++p != limit)
|
||||
: // U+0080..U+07FF
|
||||
c >= 0xc2 && (c &= 0x1f, 1)) &&
|
||||
// last trail byte
|
||||
(t = *p - 0x80) <= 0x3f) {
|
||||
c = (c << 6) | t;
|
||||
++length;
|
||||
++p;
|
||||
return {c, length, true, p0};
|
||||
}
|
||||
return {sub(), length, false, p0}; // ill-formed
|
||||
}
|
||||
|
||||
static CodeUnits<UnitIter, CP32> singlePassReadAndInc(UnitIter &p, UnitIter limit) {
|
||||
// TODO: assert p != limit -- more precisely: start <= p < limit
|
||||
// Very similar to U8_NEXT_OR_FFFD().
|
||||
CP32 c = uint8_t(*p);
|
||||
++p;
|
||||
if (U8_IS_SINGLE(c)) {
|
||||
return {c, 1, true};
|
||||
}
|
||||
uint8_t length = 1;
|
||||
uint8_t t = 0;
|
||||
if (p != limit &&
|
||||
// fetch/validate/assemble all but last trail byte
|
||||
(c >= 0xe0 ?
|
||||
(c < 0xf0 ? // U+0800..U+FFFF except surrogates
|
||||
U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
|
||||
(t &= 0x3f, 1)
|
||||
: // U+10000..U+10FFFF
|
||||
(c -= 0xf0) <= 4 &&
|
||||
U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
|
||||
(c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
|
||||
(t = *p - 0x80) <= 0x3f) &&
|
||||
// valid second-to-last trail byte
|
||||
(c = (c << 6) | t, ++length, ++p != limit)
|
||||
: // U+0080..U+07FF
|
||||
c >= 0xc2 && (c &= 0x1f, 1)) &&
|
||||
// last trail byte
|
||||
(t = *p - 0x80) <= 0x3f) {
|
||||
c = (c << 6) | t;
|
||||
++length;
|
||||
++p;
|
||||
return {c, length, true};
|
||||
}
|
||||
return {sub(), length, false}; // ill-formed
|
||||
}
|
||||
|
||||
static CodeUnits<UnitIter, CP32> decAndRead(UnitIter start, UnitIter &p) {
|
||||
// TODO: assert p != start -- more precisely: start < p <= limit
|
||||
// Very similar to U8_PREV_OR_FFFD().
|
||||
CP32 c = *--p;
|
||||
if (!U8_IS_SURROGATE(c)) {
|
||||
return {c, 1, true, p};
|
||||
} else {
|
||||
UnitIter p1;
|
||||
uint16_t c2;
|
||||
if (U8_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U8_IS_LEAD(c2 = *--p1))) {
|
||||
p = p1;
|
||||
c = U8_GET_SUPPLEMENTARY(c2, c);
|
||||
return {c, 2, true, p};
|
||||
} else {
|
||||
return {sub(c), 1, false, p};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void moveToReadAndIncStart(UnitIter &p, int8_t &state) {
|
||||
// state > 0 after readAndInc()
|
||||
do { --p; } while (--state != 0);
|
||||
}
|
||||
|
||||
static void moveToDecAndReadLimit(UnitIter &p, int8_t &state) {
|
||||
// state < 0 after decAndRead()
|
||||
do { ++p; } while (++state != 0);
|
||||
}
|
||||
};
|
||||
|
||||
// UTF-16
|
||||
template<typename UnitIter, typename CP32, UIllFormedBehavior behavior>
|
||||
class UTFImpl<
|
||||
|
@ -207,6 +342,15 @@ class UTFImpl<
|
|||
std::enable_if_t<
|
||||
sizeof(typename std::iterator_traits<UnitIter>::value_type) == 2>> {
|
||||
public:
|
||||
// Handle ill-formed UTF-16: One unpaired surrogate.
|
||||
static CP32 sub(CP32 surrogate) {
|
||||
switch (behavior) {
|
||||
case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
|
||||
case U_BEHAVIOR_FFFD: return 0xfffd;
|
||||
case U_BEHAVIOR_SURROGATE: return surrogate;
|
||||
}
|
||||
}
|
||||
|
||||
static void inc(UnitIter &p, UnitIter limit) {
|
||||
// TODO: assert p != limit -- more precisely: start <= p < limit
|
||||
// Very similar to U16_FWD_1().
|
||||
|
@ -232,7 +376,7 @@ public:
|
|||
c = U16_GET_SUPPLEMENTARY(c, c2);
|
||||
return {c, 2, true, p0};
|
||||
} else {
|
||||
return {uprv_u16Sub<CP32, behavior>(c), 1, false, p0};
|
||||
return {sub(c), 1, false, p0};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -251,7 +395,7 @@ public:
|
|||
c = U16_GET_SUPPLEMENTARY(c, c2);
|
||||
return {c, 2, true};
|
||||
} else {
|
||||
return {uprv_u16Sub<CP32, behavior>(c), 1, false};
|
||||
return {sub(c), 1, false};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -270,7 +414,7 @@ public:
|
|||
c = U16_GET_SUPPLEMENTARY(c2, c);
|
||||
return {c, 2, true, p};
|
||||
} else {
|
||||
return {uprv_u16Sub<CP32, behavior>(c), 1, false, p};
|
||||
return {sub(c), 1, false, p};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,6 +49,7 @@ extern IntlTest *createStaticUnicodeSetsTest();
|
|||
#endif
|
||||
static IntlTest *createUHashTest();
|
||||
extern IntlTest *createU16IteratorTest();
|
||||
extern IntlTest *createU8IteratorTest();
|
||||
|
||||
void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
|
||||
{
|
||||
|
@ -86,6 +87,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &
|
|||
TESTCASE_AUTO_CREATE_CLASS(UHashTest);
|
||||
TESTCASE_AUTO_CREATE_CLASS(USetHeaderOnlyTest);
|
||||
TESTCASE_AUTO_CREATE_CLASS(U16IteratorTest);
|
||||
TESTCASE_AUTO_CREATE_CLASS(U8IteratorTest);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
|
|
@ -303,3 +303,127 @@ void U16IteratorTest::testFwdIter() {
|
|||
}
|
||||
|
||||
// TODO: test back & forth with bidirectional iterator (not random access, not contiguous)
|
||||
|
||||
class U8IteratorTest : public IntlTest {
|
||||
public:
|
||||
U8IteratorTest() {}
|
||||
|
||||
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override;
|
||||
|
||||
void testGood();
|
||||
void testNegative();
|
||||
void testFFFD();
|
||||
void testSinglePassIter();
|
||||
void testFwdIter();
|
||||
};
|
||||
|
||||
// Factory for the test framework: returns a new U8IteratorTest.
// The caller receives a raw pointer to a heap-allocated object.
extern IntlTest *createU8IteratorTest() {
    IntlTest *suite = new U8IteratorTest();
    return suite;
}
|
||||
|
||||
void U8IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
|
||||
if(exec) {
|
||||
logln("TestSuite U8IteratorTest: ");
|
||||
}
|
||||
TESTCASE_AUTO_BEGIN;
|
||||
TESTCASE_AUTO(testGood);
|
||||
// TODO: TESTCASE_AUTO(testNegative);
|
||||
// TODO: TESTCASE_AUTO(testFFFD);
|
||||
TESTCASE_AUTO(testSinglePassIter);
|
||||
TESTCASE_AUTO(testFwdIter);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
void U8IteratorTest::testGood() {
|
||||
std::string_view good(reinterpret_cast<const char*>(u8"abçカ🚴"));
|
||||
U16StringCodePoints<char, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
|
||||
auto iter = range.begin();
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
|
||||
assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
|
||||
++iter; // pre-increment
|
||||
auto units = *iter;
|
||||
assertEquals("iter[1] * codePoint", u'b', units.codePoint());
|
||||
assertEquals("iter[1] * length", 1, units.length());
|
||||
assertTrue("iter[1] * wellFormed", units.wellFormed());
|
||||
assertTrue("iter[1] * stringView()",
|
||||
units.stringView() == std::string_view(reinterpret_cast<const char*>(u8"b")));
|
||||
++iter;
|
||||
assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment
|
||||
assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
|
||||
++iter;
|
||||
// Fetch the current code point twice.
|
||||
assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
|
||||
units = *iter++;
|
||||
assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
|
||||
assertEquals("iter[4] * length", 4, units.length());
|
||||
assertTrue("iter[4] * wellFormed", units.wellFormed());
|
||||
assertTrue("iter[4] * stringView()",
|
||||
units.stringView() == std::string_view(reinterpret_cast<const char*>(u8"🚴")));
|
||||
assertTrue("iter == endIter", iter == range.end());
|
||||
}
|
||||
|
||||
void U8IteratorTest::testSinglePassIter() {
|
||||
SinglePassSource<char> good(reinterpret_cast<const char*>(u8"abçカ🚴"));
|
||||
SinglePassIter<char> goodBegin(good);
|
||||
SinglePassIter<char> goodLimit{};
|
||||
U16Iterator<SinglePassIter<char>, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin(
|
||||
goodBegin, goodLimit);
|
||||
U16Iterator<SinglePassIter<char>, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit);
|
||||
auto iter = rangeBegin;
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
|
||||
assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
|
||||
++iter; // pre-increment
|
||||
auto units = *iter;
|
||||
assertEquals("iter[1] * codePoint", u'b', units.codePoint());
|
||||
assertEquals("iter[1] * length", 1, units.length());
|
||||
assertTrue("iter[1] * wellFormed", units.wellFormed());
|
||||
// No units.stringView() when the unit iterator is not a pointer.
|
||||
// No data() for a single-pass unit iterator.
|
||||
++iter;
|
||||
assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment
|
||||
assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
|
||||
++iter;
|
||||
// Fetch the current code point twice.
|
||||
assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
|
||||
units = *iter++;
|
||||
assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
|
||||
assertEquals("iter[4] * length", 4, units.length());
|
||||
assertTrue("iter[4] * wellFormed", units.wellFormed());
|
||||
assertTrue("iter == endIter", iter == rangeLimit);
|
||||
}
|
||||
|
||||
void U8IteratorTest::testFwdIter() {
|
||||
std::string_view good(reinterpret_cast<const char*>(u8"abçカ🚴"));
|
||||
FwdIter<char> goodBegin(good.data());
|
||||
FwdIter<char> goodLimit(good.data() + good.length());
|
||||
U16Iterator<FwdIter<char>, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin(
|
||||
goodBegin, goodBegin, goodLimit);
|
||||
U16Iterator<FwdIter<char>, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit);
|
||||
// TODO: U16StringCodePoints<FwdIter, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
|
||||
auto iter = rangeBegin;
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
|
||||
assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
|
||||
++iter; // pre-increment
|
||||
auto units = *iter;
|
||||
assertEquals("iter[1] * codePoint", u'b', units.codePoint());
|
||||
assertEquals("iter[1] * length", 1, units.length());
|
||||
assertTrue("iter[1] * wellFormed", units.wellFormed());
|
||||
// No units.stringView() when the unit iterator is not a pointer.
|
||||
assertTrue("iter[1] * data()[0]", *units.data() == u8'b');
|
||||
++iter;
|
||||
assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment
|
||||
assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
|
||||
++iter;
|
||||
// Fetch the current code point twice.
|
||||
assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
|
||||
units = *iter++;
|
||||
assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
|
||||
assertEquals("iter[4] * length", 4, units.length());
|
||||
assertTrue("iter[4] * wellFormed", units.wellFormed());
|
||||
FwdIter<char> data = units.data();
|
||||
assertTrue("iter[4] * data()[0]", *data++ == u8"🚴"[0]);
|
||||
assertTrue("iter[4] * data()[1]", *data++ == u8"🚴"[1]);
|
||||
assertTrue("iter[4] * data()[2]", *data++ == u8"🚴"[2]);
|
||||
assertTrue("iter[4] * data()[3]", *data == u8"🚴"[3]);
|
||||
assertTrue("iter == endIter", iter == rangeLimit);
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue