ICU-23004 UTF-8 forward

This commit is contained in:
Markus Scherer 2025-02-28 14:50:35 -08:00
parent 32131dc2ff
commit 2be45bf6f8
4 changed files with 286 additions and 16 deletions

View file

@ -517,7 +517,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
if(U8_IS_TRAIL(__t1)) { \
++(i); \
} \
} else /* c>=0xf0 */ { \
} else /* b>=0xf0 */ { \
if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \

View file

@ -17,6 +17,7 @@
#include <string_view>
#ifdef UTYPES_H
#include "unicode/utf16.h"
#include "unicode/utf8.h"
#include "unicode/uversion.h"
#else
// TODO: Remove checks for UTYPES_H and replacement definitions.
@ -48,7 +49,6 @@ namespace header {}
#ifndef U_HIDE_DRAFT_API
// Some defined behaviors for handling ill-formed Unicode strings.
// TODO: For 8-bit strings, the SURROGATE option does not have an equivalent -- static_assert.
typedef enum UIllFormedBehavior {
U_BEHAVIOR_NEGATIVE,
U_BEHAVIOR_FFFD,
@ -57,17 +57,6 @@ typedef enum UIllFormedBehavior {
namespace U_HEADER_ONLY_NAMESPACE {
// Handle ill-formed UTF-16: One unpaired surrogate.
// @internal
template<typename CP32, UIllFormedBehavior behavior>
CP32 uprv_u16Sub(CP32 surrogate) {
    // Selects the value reported for one unpaired surrogate, according to the
    // compile-time `behavior`:
    //   U_BEHAVIOR_NEGATIVE  -> U_SENTINEL
    //   U_BEHAVIOR_FFFD      -> U+FFFD
    //   U_BEHAVIOR_SURROGATE -> the surrogate code unit itself
    switch (behavior) {
        case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
        case U_BEHAVIOR_SURROGATE: return surrogate;
        case U_BEHAVIOR_FFFD: break;
    }
    // Unconditional final return: an enum object can hold values other than its
    // enumerators, so a switch alone would let control flow off the end of a
    // non-void function (undefined behavior, and a -Wreturn-type warning).
    return 0xfffd;
}
/**
* Result of validating and decoding a minimal Unicode code unit sequence.
* Returned from validating Unicode string code point iterators.
@ -198,6 +187,152 @@ private:
template<typename UnitIter, typename CP32, UIllFormedBehavior behavior, typename = void>
class UTFImpl;
// UTF-8
/**
 * UTF-8 implementation of the code unit sequence primitives:
 * selected when the unit iterator's value_type is one byte wide.
 *
 * All functions are static; the caller owns the iterators and any state.
 * Ill-formed input is reported via sub(), never via the SURROGATE behavior
 * (which has no UTF-8 equivalent and is rejected at compile time).
 */
template<typename UnitIter, typename CP32, UIllFormedBehavior behavior>
class UTFImpl<
        UnitIter,
        CP32,
        behavior,
        std::enable_if_t<
            sizeof(typename std::iterator_traits<UnitIter>::value_type) == 1>> {
    static_assert(behavior != U_BEHAVIOR_SURROGATE,
                  "For 8-bit strings, the SURROGATE option does not have an equivalent.");
public:
    /**
     * Returns the value reported for an ill-formed UTF-8 byte sequence:
     * U_SENTINEL for U_BEHAVIOR_NEGATIVE, otherwise U+FFFD.
     */
    static CP32 sub() {
        switch (behavior) {
            case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
            case U_BEHAVIOR_FFFD: break;
            case U_BEHAVIOR_SURROGATE: break;  // excluded by the static_assert above
        }
        // Unconditional final return so that no path flows off the end of this
        // non-void function, and every enumerator is handled in the switch.
        return 0xfffd;
    }

    /**
     * Advances p past one code unit sequence without decoding it.
     * Requires p != limit on entry. For ill-formed input, p stops after the
     * longest prefix that still looks like the start of a valid sequence.
     */
    static void inc(UnitIter &p, UnitIter limit) {
        // TODO: assert p != limit -- more precisely: start <= p < limit
        // Very similar to U8_FWD_1().
        uint8_t b = *p;
        ++p;
        if (U8_IS_LEAD(b) && p != limit) {
            uint8_t t1 = *p;
            if ((0xe0 <= b && b < 0xf0)) {
                // 3-byte lead: the first trail byte range depends on the lead byte.
                if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
                        ++p != limit && U8_IS_TRAIL(*p)) {
                    ++p;
                }
            } else if (b < 0xe0) {
                // 2-byte lead
                if (U8_IS_TRAIL(t1)) {
                    ++p;
                }
            } else /* b >= 0xf0 */ {
                // 4-byte lead: the first trail byte range depends on the lead byte.
                if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
                        ++p != limit && U8_IS_TRAIL(*p) &&
                        ++p != limit && U8_IS_TRAIL(*p)) {
                    ++p;
                }
            }
        }
    }

    /**
     * Decodes the code unit sequence starting at p and advances p past it.
     * Requires p != limit on entry.
     *
     * The result is built from (code point, number of bytes consumed,
     * well-formed flag, iterator to the first byte). For ill-formed input the
     * code point is sub() and the flag is false.
     */
    static CodeUnits<UnitIter, CP32> readAndInc(UnitIter &p, UnitIter limit) {
        // TODO: assert p != limit -- more precisely: start <= p < limit
        // Very similar to U8_NEXT_OR_FFFD().
        UnitIter p0 = p;
        CP32 c = uint8_t(*p);
        ++p;
        if (U8_IS_SINGLE(c)) {
            return {c, 1, true, p0};
        }
        // length counts the bytes consumed so far; it is also the length of
        // the ill-formed sequence if validation stops early.
        uint8_t length = 1;
        uint8_t t = 0;
        // The condition below validates and assembles in one short-circuit
        // expression; the comma operators perform the side effects only after
        // the preceding checks have passed.
        if (p != limit &&
                // fetch/validate/assemble all but last trail byte
                (c >= 0xe0 ?
                    (c < 0xf0 ?  // U+0800..U+FFFF except surrogates
                        U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
                        (t &= 0x3f, 1)
                    :  // U+10000..U+10FFFF
                        (c -= 0xf0) <= 4 &&
                        U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
                        (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
                        (t = *p - 0x80) <= 0x3f) &&
                    // valid second-to-last trail byte
                    (c = (c << 6) | t, ++length, ++p != limit)
                :  // U+0080..U+07FF
                    c >= 0xc2 && (c &= 0x1f, 1)) &&
                // last trail byte
                (t = *p - 0x80) <= 0x3f) {
            c = (c << 6) | t;
            ++length;
            ++p;
            return {c, length, true, p0};
        }
        return {sub(), length, false, p0};  // ill-formed
    }

    /**
     * Same decoding as readAndInc(), for single-pass unit iterators:
     * the start position is not remembered, so the result carries only
     * (code point, number of bytes consumed, well-formed flag).
     */
    static CodeUnits<UnitIter, CP32> singlePassReadAndInc(UnitIter &p, UnitIter limit) {
        // TODO: assert p != limit -- more precisely: start <= p < limit
        // Very similar to U8_NEXT_OR_FFFD().
        CP32 c = uint8_t(*p);
        ++p;
        if (U8_IS_SINGLE(c)) {
            return {c, 1, true};
        }
        uint8_t length = 1;
        uint8_t t = 0;
        if (p != limit &&
                // fetch/validate/assemble all but last trail byte
                (c >= 0xe0 ?
                    (c < 0xf0 ?  // U+0800..U+FFFF except surrogates
                        U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
                        (t &= 0x3f, 1)
                    :  // U+10000..U+10FFFF
                        (c -= 0xf0) <= 4 &&
                        U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
                        (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
                        (t = *p - 0x80) <= 0x3f) &&
                    // valid second-to-last trail byte
                    (c = (c << 6) | t, ++length, ++p != limit)
                :  // U+0080..U+07FF
                    c >= 0xc2 && (c &= 0x1f, 1)) &&
                // last trail byte
                (t = *p - 0x80) <= 0x3f) {
            c = (c << 6) | t;
            ++length;
            ++p;
            return {c, length, true};
        }
        return {sub(), length, false};  // ill-formed
    }

    /**
     * Moves p backward to the start of the code unit sequence that ends just
     * before p, and decodes it. Requires p != start on entry.
     *
     * The result is built from (code point, number of bytes, well-formed flag,
     * iterator to the first byte); on return p equals that first-byte iterator.
     * A truncated but otherwise valid prefix of a 3- or 4-byte sequence is
     * reported as one ill-formed sequence of 2 or 3 bytes; any other
     * ill-formed byte is reported on its own with length 1.
     */
    static CodeUnits<UnitIter, CP32> decAndRead(UnitIter start, UnitIter &p) {
        // TODO: assert p != start -- more precisely: start < p <= limit
        // Very similar to U8_PREV_OR_FFFD().
        // Read via uint8_t so that a signed char unit is not sign-extended.
        CP32 c = uint8_t(*--p);
        if (U8_IS_SINGLE(c)) {
            return {c, 1, true, p};
        }
        // c is the last byte of the candidate sequence; p1 probes further back
        // and p is only moved once a longer sequence has been accepted.
        if (U8_IS_TRAIL(c) && p != start) {
            UnitIter p1 = p;
            uint8_t b1 = *--p1;
            if (U8_IS_LEAD(b1)) {
                if (b1 < 0xe0) {
                    // Complete 2-byte sequence.
                    p = p1;
                    c = ((b1 - 0xc0) << 6) | (c & 0x3f);
                    return {c, 2, true, p};
                } else if (b1 < 0xf0 ?
                               U8_IS_VALID_LEAD3_AND_T1(b1, c) :
                               U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
                    // Truncated 3- or 4-byte sequence.
                    p = p1;
                    return {sub(), 2, false, p};
                }
            } else if (U8_IS_TRAIL(b1) && p1 != start) {
                // Extract the value bits from the last trail byte.
                c &= 0x3f;
                uint8_t b2 = *--p1;
                if (0xe0 <= b2 && b2 <= 0xf4) {
                    if (b2 < 0xf0) {
                        b2 &= 0xf;
                        if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
                            // Complete 3-byte sequence.
                            p = p1;
                            c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
                            return {c, 3, true, p};
                        }
                    } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
                        // Truncated 4-byte sequence.
                        p = p1;
                        return {sub(), 3, false, p};
                    }
                } else if (U8_IS_TRAIL(b2) && p1 != start) {
                    uint8_t b3 = *--p1;
                    if (0xf0 <= b3 && b3 <= 0xf4) {
                        b3 &= 7;
                        if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
                            // Complete 4-byte sequence.
                            p = p1;
                            c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
                            return {c, 4, true, p};
                        }
                    }
                }
            }
        }
        // Single ill-formed byte: p already points at it.
        return {sub(), 1, false, p};
    }

    /**
     * Moves p back by `state` code units and counts state down to 0.
     * Expects state > 0 on entry.
     */
    static void moveToReadAndIncStart(UnitIter &p, int8_t &state) {
        // state > 0 after readAndInc()
        do { --p; } while (--state != 0);
    }

    /**
     * Moves p forward by `-state` code units and counts state up to 0.
     * Expects state < 0 on entry.
     */
    static void moveToDecAndReadLimit(UnitIter &p, int8_t &state) {
        // state < 0 after decAndRead()
        do { ++p; } while (++state != 0);
    }
};
// UTF-16
template<typename UnitIter, typename CP32, UIllFormedBehavior behavior>
class UTFImpl<
@ -207,6 +342,15 @@ class UTFImpl<
std::enable_if_t<
sizeof(typename std::iterator_traits<UnitIter>::value_type) == 2>> {
public:
// Handle ill-formed UTF-16: One unpaired surrogate.
static CP32 sub(CP32 surrogate) {
    // Selects the value reported for one unpaired surrogate, according to the
    // compile-time `behavior`:
    //   U_BEHAVIOR_NEGATIVE  -> U_SENTINEL
    //   U_BEHAVIOR_FFFD      -> U+FFFD
    //   U_BEHAVIOR_SURROGATE -> the surrogate code unit itself
    switch (behavior) {
        case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
        case U_BEHAVIOR_SURROGATE: return surrogate;
        case U_BEHAVIOR_FFFD: break;
    }
    // Unconditional final return: a switch alone would let control flow off the
    // end of a non-void function for a value outside the enumerators
    // (undefined behavior, and a -Wreturn-type warning).
    return 0xfffd;
}
static void inc(UnitIter &p, UnitIter limit) {
// TODO: assert p != limit -- more precisely: start <= p < limit
// Very similar to U16_FWD_1().
@ -232,7 +376,7 @@ public:
c = U16_GET_SUPPLEMENTARY(c, c2);
return {c, 2, true, p0};
} else {
return {uprv_u16Sub<CP32, behavior>(c), 1, false, p0};
return {sub(c), 1, false, p0};
}
}
}
@ -251,7 +395,7 @@ public:
c = U16_GET_SUPPLEMENTARY(c, c2);
return {c, 2, true};
} else {
return {uprv_u16Sub<CP32, behavior>(c), 1, false};
return {sub(c), 1, false};
}
}
}
@ -270,7 +414,7 @@ public:
c = U16_GET_SUPPLEMENTARY(c2, c);
return {c, 2, true, p};
} else {
return {uprv_u16Sub<CP32, behavior>(c), 1, false, p};
return {sub(c), 1, false, p};
}
}
}

View file

@ -49,6 +49,7 @@ extern IntlTest *createStaticUnicodeSetsTest();
#endif
static IntlTest *createUHashTest();
extern IntlTest *createU16IteratorTest();
extern IntlTest *createU8IteratorTest();
void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{
@ -86,6 +87,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &
TESTCASE_AUTO_CREATE_CLASS(UHashTest);
TESTCASE_AUTO_CREATE_CLASS(USetHeaderOnlyTest);
TESTCASE_AUTO_CREATE_CLASS(U16IteratorTest);
TESTCASE_AUTO_CREATE_CLASS(U8IteratorTest);
TESTCASE_AUTO_END;
}

View file

@ -303,3 +303,127 @@ void U16IteratorTest::testFwdIter() {
}
// TODO: test back & forth with bidirectional iterator (not random access, not contiguous)
// Test suite for code point iteration over 8-bit (UTF-8) strings.
class U8IteratorTest : public IntlTest {
public:
    U8IteratorTest() = default;

    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override;

    // Well-formed input, read through different kinds of unit iterators.
    void testGood();
    void testSinglePassIter();
    void testFwdIter();

    // Ill-formed input; these are still TODO entries in runIndexedTest().
    void testNegative();
    void testFFFD();
};
// Factory for the UTF-8 iterator test suite; returns a newly allocated instance.
extern IntlTest *createU8IteratorTest() {
    IntlTest *suite = new U8IteratorTest();
    return suite;
}
// Test-framework entry point for this suite.
// NOTE(review): the TESTCASE_AUTO* macros are defined outside this view; they
// presumably number the listed test functions in order and use `index`, `exec`
// and `name` to select, run and report them -- confirm against intltest.h.
void U8IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
if(exec) {
// Log the suite banner only when exec is set.
logln("TestSuite U8IteratorTest: ");
}
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(testGood);
// testNegative() and testFFFD() are declared in the class but not registered yet.
// TODO: TESTCASE_AUTO(testNegative);
// TODO: TESTCASE_AUTO(testFFFD);
TESTCASE_AUTO(testSinglePassIter);
TESTCASE_AUTO(testFwdIter);
TESTCASE_AUTO_END;
}
// Iterates over well-formed UTF-8 held in a string_view (pointer unit iterator)
// and checks code points, lengths, well-formedness and the stringView() slices.
void U8IteratorTest::testGood() {
    // Five code points: two ASCII, then ç, カ and 🚴.
    std::string_view good(reinterpret_cast<const char*>(u8"abçカ🚴"));
    U16StringCodePoints<char, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
    auto iter = range.begin();
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
    assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
    ++iter;  // pre-increment
    auto units = *iter;
    assertEquals("iter[1] * codePoint", u'b', units.codePoint());
    assertEquals("iter[1] * length", 1, units.length());
    assertTrue("iter[1] * wellFormed", units.wellFormed());
    assertTrue("iter[1] * stringView()",
               units.stringView() == std::string_view(reinterpret_cast<const char*>(u8"b")));
    ++iter;
    assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint());  // post-increment
    // The fourth code point of the input string is カ.
    assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
    ++iter;
    // Fetch the current code point twice.
    assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
    units = *iter++;
    assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
    assertEquals("iter[4] * length", 4, units.length());
    assertTrue("iter[4] * wellFormed", units.wellFormed());
    assertTrue("iter[4] * stringView()",
               units.stringView() == std::string_view(reinterpret_cast<const char*>(u8"🚴")));
    assertTrue("iter == endIter", iter == range.end());
}
// Iterates over well-formed UTF-8 through a single-pass unit iterator and
// checks code points, lengths and well-formedness.
void U8IteratorTest::testSinglePassIter() {
    // Five code points: two ASCII, then ç, カ and 🚴.
    SinglePassSource<char> good(reinterpret_cast<const char*>(u8"abçカ🚴"));
    SinglePassIter<char> goodBegin(good);
    SinglePassIter<char> goodLimit{};
    U16Iterator<SinglePassIter<char>, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin(
        goodBegin, goodLimit);
    U16Iterator<SinglePassIter<char>, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit);
    auto iter = rangeBegin;
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
    assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
    ++iter;  // pre-increment
    auto units = *iter;
    assertEquals("iter[1] * codePoint", u'b', units.codePoint());
    assertEquals("iter[1] * length", 1, units.length());
    assertTrue("iter[1] * wellFormed", units.wellFormed());
    // No units.stringView() when the unit iterator is not a pointer.
    // No data() for a single-pass unit iterator.
    ++iter;
    assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint());  // post-increment
    // The fourth code point of the input string is カ.
    assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
    ++iter;
    // Fetch the current code point twice.
    assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
    units = *iter++;
    assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
    assertEquals("iter[4] * length", 4, units.length());
    assertTrue("iter[4] * wellFormed", units.wellFormed());
    assertTrue("iter == endIter", iter == rangeLimit);
}
// Iterates over well-formed UTF-8 through a forward (non-pointer) unit iterator
// and checks code points, lengths, well-formedness and the bytes behind data().
void U8IteratorTest::testFwdIter() {
    // Five code points: two ASCII, then ç, カ and 🚴.
    std::string_view good(reinterpret_cast<const char*>(u8"abçカ🚴"));
    FwdIter<char> goodBegin(good.data());
    FwdIter<char> goodLimit(good.data() + good.length());
    U16Iterator<FwdIter<char>, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin(
        goodBegin, goodBegin, goodLimit);
    U16Iterator<FwdIter<char>, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit);
    // TODO: U16StringCodePoints<FwdIter, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
    auto iter = rangeBegin;
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
    assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
    ++iter;  // pre-increment
    auto units = *iter;
    assertEquals("iter[1] * codePoint", u'b', units.codePoint());
    assertEquals("iter[1] * length", 1, units.length());
    assertTrue("iter[1] * wellFormed", units.wellFormed());
    // No units.stringView() when the unit iterator is not a pointer.
    assertTrue("iter[1] * data()[0]", *units.data() == u8'b');
    ++iter;
    assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint());  // post-increment
    // The fourth code point of the input string is カ.
    assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
    ++iter;
    // Fetch the current code point twice.
    assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
    units = *iter++;
    assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
    assertEquals("iter[4] * length", 4, units.length());
    assertTrue("iter[4] * wellFormed", units.wellFormed());
    // Walk the four bytes of the last sequence through the unit iterator.
    FwdIter<char> data = units.data();
    assertTrue("iter[4] * data()[0]", *data++ == u8"🚴"[0]);
    assertTrue("iter[4] * data()[1]", *data++ == u8"🚴"[1]);
    assertTrue("iter[4] * data()[2]", *data++ == u8"🚴"[2]);
    assertTrue("iter[4] * data()[3]", *data == u8"🚴"[3]);
    assertTrue("iter == endIter", iter == rangeLimit);
}