ICU-23004 UTF-32

This commit is contained in:
Markus Scherer 2025-03-03 19:23:32 -08:00
parent 74278fd566
commit 0c5fa181ff
3 changed files with 224 additions and 2 deletions

View file

@ -89,11 +89,14 @@ namespace header {}
#ifndef U_HIDE_DRAFT_API
// Some defined behaviors for handling ill-formed Unicode strings.
// A value of this enum is passed as a template argument (see the UTFImpl
// specializations) and selects what is reported for an ill-formed sequence.
// TODO: For UTF-32, we have basically orthogonal conditions for surrogate vs. out-of-range.
// Maybe make U_BEHAVIOR_SURROGATE return FFFD for out-of-range?
// NOTE(review): the UTF-32 UTFImpl::sub() in this file already returns U+FFFD
// for U_BEHAVIOR_SURROGATE, matching the enumerator comment below --
// confirm whether the TODO above is still open.
typedef enum UIllFormedBehavior {
// Returns a negative value instead of a code point.
U_BEHAVIOR_NEGATIVE,
// Returns U+FFFD Replacement Character.
U_BEHAVIOR_FFFD,
// UTF-8: Not allowed;
// UTF-16: returns the unpaired surrogate;
// UTF-32: returns the surrogate code point, or U+FFFD if out of range.
U_BEHAVIOR_SURROGATE
} UIllFormedBehavior;
@ -534,6 +537,87 @@ public:
}
};
// UTF-32: trivial, but still validating.
//
// Partial specialization of UTFImpl, selected when the unit iterator's
// value_type is 4 bytes wide. Every code point occupies exactly one code unit,
// so moving forward/backward is a single ++/--. Reading still validates:
// - well-formed: 0..0xD7FF and 0xE000..0x10FFFF
// - ill-formed:  surrogate code points 0xD800..0xDFFF -> subSurrogate()
// - ill-formed:  anything above 0x10FFFF              -> sub()
// The limit/start parameters are unused here because no look-ahead or
// look-behind is needed for a one-unit encoding.
template<typename UnitIter, typename CP32, UIllFormedBehavior behavior>
class UTFImpl<
        UnitIter,
        CP32,
        behavior,
        std::enable_if_t<
            sizeof(typename std::iterator_traits<UnitIter>::value_type) == 4>> {
public:
    // Handle ill-formed UTF-32: Out of range.
    // Returns U_SENTINEL for U_BEHAVIOR_NEGATIVE, otherwise U+FFFD.
    static inline CP32 sub() {
        // No default label, so compilers can still warn about unhandled enumerators.
        switch (behavior) {
            case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
            case U_BEHAVIOR_FFFD:
            case U_BEHAVIOR_SURROGATE: break;
        }
        // Explicit return after the switch: control must never flow off the end
        // of a value-returning function.
        return 0xfffd;
    }

    // Handle ill-formed UTF-32: One unpaired surrogate.
    // Returns U_SENTINEL for U_BEHAVIOR_NEGATIVE, U+FFFD for U_BEHAVIOR_FFFD,
    // and the surrogate code point itself for U_BEHAVIOR_SURROGATE.
    static inline CP32 subSurrogate(CP32 surrogate) {
        switch (behavior) {
            case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
            case U_BEHAVIOR_SURROGATE: return surrogate;
            case U_BEHAVIOR_FFFD: break;
        }
        // Also reached only for U_BEHAVIOR_FFFD among the valid enumerators;
        // guarantees a return value on every path.
        return 0xfffd;
    }

    // Advances p by one code point (== one code unit) without reading it.
    static inline void inc(UnitIter &p, UnitIter /*limit*/) {
        ++p;
    }

    // Moves p back by one code point (== one code unit) without reading it.
    static inline void dec(UnitIter /*start*/, UnitIter &p) {
        --p;
    }

    // Reads the code unit at p, advances p past it, and returns the result
    // together with the iterator to the start of the unit.
    static inline CodeUnits<UnitIter, CP32> readAndInc(UnitIter &p, UnitIter /*limit*/) {
        UnitIter p0 = p;
        uint32_t uc = *p;
        ++p;
        bool ok;
        CP32 c = decode(uc, ok);
        return {c, 1, ok, p0};
    }

    // Same as readAndInc() but does not keep a copy of the start iterator.
    // The unit is dereferenced exactly once, before the increment.
    static inline CodeUnits<UnitIter, CP32> singlePassReadAndInc(UnitIter &p, UnitIter /*limit*/) {
        uint32_t uc = *p;
        ++p;
        bool ok;
        CP32 c = decode(uc, ok);
        return {c, 1, ok};
    }

    // Moves p back by one code unit and reads it; p is left at the start of
    // the code point that was read.
    static inline CodeUnits<UnitIter, CP32> decAndRead(UnitIter /*start*/, UnitIter &p) {
        --p;
        uint32_t uc = *p;
        bool ok;
        CP32 c = decode(uc, ok);
        return {c, 1, ok, p};
    }

    // Moves p from the start of the code point just read by decAndRead()
    // to its limit (one unit forward) and resets the caller's state to 0.
    static inline void moveToDecAndReadLimit(UnitIter &p, int8_t &state) {
        // state < 0 after decAndRead()
        ++p;
        state = 0;
    }

private:
    // Classifies one UTF-32 code unit. Sets wellFormed and returns either the
    // code point itself or the behavior-specific substitute.
    static inline CP32 decode(uint32_t uc, bool &wellFormed) {
        if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
            wellFormed = true;
            return static_cast<CP32>(uc);
        }
        wellFormed = false;
        if (uc < 0xe000) {
            // 0xD800..0xDFFF: a surrogate code point is never valid in UTF-32.
            return subSurrogate(static_cast<CP32>(uc));
        }
        // Above 0x10FFFF: outside the Unicode code space.
        return sub();
    }
};
#endif
/**

View file

@ -50,6 +50,7 @@ extern IntlTest *createStaticUnicodeSetsTest();
static IntlTest *createUHashTest();
extern IntlTest *createU16IteratorTest();
extern IntlTest *createU8IteratorTest();
extern IntlTest *createU32IteratorTest();
void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
{
@ -88,6 +89,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &
TESTCASE_AUTO_CREATE_CLASS(USetHeaderOnlyTest);
TESTCASE_AUTO_CREATE_CLASS(U16IteratorTest);
TESTCASE_AUTO_CREATE_CLASS(U8IteratorTest);
TESTCASE_AUTO_CREATE_CLASS(U32IteratorTest);
TESTCASE_AUTO_END;
}

View file

@ -462,3 +462,139 @@ void U8IteratorTest::testFwdIter() {
assertTrue("iter[4] * data()[3]", *data == u8"🚴"[3]);
assertTrue("iter == endIter", iter == rangeLimit);
}
// Test suite for code point iteration over UTF-32 (char32_t) code units.
class U32IteratorTest : public IntlTest {
public:
U32IteratorTest() {}
// Dispatches to the individual test functions below.
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override;
// Well-formed input, iterated via UTFStringCodePoints over a u32string_view.
void testGood();
// Declared but not yet registered in runIndexedTest() (see the TODOs there).
void testNegative();
// Declared but not yet registered in runIndexedTest() (see the TODOs there).
void testFFFD();
// Well-formed input, iterated via a SinglePassIter unit iterator.
void testSinglePassIter();
// Well-formed input, iterated via a FwdIter unit iterator.
void testFwdIter();
};
// Factory hook: creates a new heap-allocated U32IteratorTest and hands it
// back through the IntlTest base-class pointer.
extern IntlTest *createU32IteratorTest() {
    IntlTest *suite = new U32IteratorTest();
    return suite;
}
// Test dispatcher for this suite.
// Logs the suite name when exec is set, then lists the test functions through
// the TESTCASE_AUTO macros (defined elsewhere; presumably they consume
// index/exec/name -- verify against the macro definitions).
void U32IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
if(exec) {
logln("TestSuite U32IteratorTest: ");
}
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(testGood);
// testNegative and testFFFD are declared in the class but not registered yet.
// TODO: TESTCASE_AUTO(testNegative);
// TODO: TESTCASE_AUTO(testFFFD);
TESTCASE_AUTO(testSinglePassIter);
TESTCASE_AUTO(testFwdIter);
TESTCASE_AUTO_END;
}
// Iterates the well-formed UTF-32 string "abçカ🚴" through UTFStringCodePoints
// and checks, for each of the five code points, the decoded value and (for
// selected positions) length, well-formedness and stringView().
// Also checks that the iterator over char32_t pointers is bidirectional.
void U32IteratorTest::testGood() {
    std::u32string_view good(U"abçカ🚴"sv);
    UTFStringCodePoints<char32_t, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
    assertTrue(
        "bidirectional_iterator_tag",
        std::is_same_v<
            typename std::iterator_traits<
                UTFIterator<char32_t *, UChar32, U_BEHAVIOR_NEGATIVE>>::iterator_category,
            std::bidirectional_iterator_tag>);
    auto iter = range.begin();
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
    assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
    ++iter;  // pre-increment
    auto units = *iter;
    assertEquals("iter[1] * codePoint", u'b', units.codePoint());
    assertEquals("iter[1] * length", 1, units.length());
    assertTrue("iter[1] * wellFormed", units.wellFormed());
    assertTrue("iter[1] * stringView()", units.stringView() == U"b"sv);
    ++iter;
    assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint());  // post-increment
    // The fourth code point of the input is U+30AB (カ); an empty character
    // literal here would not compile.
    assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
    ++iter;
    // Fetch the current code point twice.
    assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
    units = *iter++;
    assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
    // In UTF-32 even a supplementary code point is a single code unit.
    assertEquals("iter[4] * length", 1, units.length());
    assertTrue("iter[4] * wellFormed", units.wellFormed());
    assertTrue("iter[4] * stringView()", units.stringView() == U"🚴"sv);
    assertTrue("iter == endIter", iter == range.end());
}
// Iterates the well-formed UTF-32 string "abçカ🚴" through a UTFIterator
// built on SinglePassIter<char32_t> and checks the decoded code points.
// Also checks that the resulting iterator category is input_iterator_tag.
void U32IteratorTest::testSinglePassIter() {
    SinglePassSource<char32_t> good(U"abçカ🚴"sv);
    SinglePassIter<char32_t> goodBegin(good);
    SinglePassIter<char32_t> goodLimit{};
    UTFIterator<SinglePassIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin(
        goodBegin, goodLimit);
    UTFIterator<SinglePassIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit);
    assertTrue(
        "input_iterator_tag",
        std::is_same_v<
            typename std::iterator_traits<
                UTFIterator<SinglePassIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE>>::iterator_category,
            std::input_iterator_tag>);
    auto iter = rangeBegin;
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
    assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
    ++iter;  // pre-increment
    auto units = *iter;
    assertEquals("iter[1] * codePoint", u'b', units.codePoint());
    assertEquals("iter[1] * length", 1, units.length());
    assertTrue("iter[1] * wellFormed", units.wellFormed());
    // No units.stringView() when the unit iterator is not a pointer.
    // No data() for a single-pass unit iterator.
    ++iter;
    assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint());  // post-increment
    // The fourth code point of the input is U+30AB (カ); an empty character
    // literal here would not compile.
    assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
    ++iter;
    // Fetch the current code point twice.
    assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
    units = *iter++;
    assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
    // In UTF-32 even a supplementary code point is a single code unit.
    assertEquals("iter[4] * length", 1, units.length());
    assertTrue("iter[4] * wellFormed", units.wellFormed());
    assertTrue("iter == endIter", iter == rangeLimit);
}
// Iterates the well-formed UTF-32 string "abçカ🚴" through a UTFIterator
// built on FwdIter<char32_t> and checks the decoded code points plus the
// unit iterator returned by data().
// Also checks that the resulting iterator category is forward_iterator_tag.
void U32IteratorTest::testFwdIter() {
    std::u32string_view good(U"abçカ🚴"sv);
    FwdIter<char32_t> goodBegin(good.data());
    FwdIter<char32_t> goodLimit(good.data() + good.length());
    UTFIterator<FwdIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin(goodBegin, goodLimit);
    UTFIterator<FwdIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit);
    // TODO: UTFStringCodePoints<FwdIter, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
    assertTrue(
        "forward_iterator_tag",
        std::is_same_v<
            typename std::iterator_traits<
                UTFIterator<FwdIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE>>::iterator_category,
            std::forward_iterator_tag>);
    auto iter = rangeBegin;
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
    assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
    ++iter;  // pre-increment
    auto units = *iter;
    assertEquals("iter[1] * codePoint", u'b', units.codePoint());
    assertEquals("iter[1] * length", 1, units.length());
    assertTrue("iter[1] * wellFormed", units.wellFormed());
    // No units.stringView() when the unit iterator is not a pointer.
    assertTrue("iter[1] * data()[0]", *units.data() == u'b');
    ++iter;
    assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint());  // post-increment
    // The fourth code point of the input is U+30AB (カ); an empty character
    // literal here would not compile.
    assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
    ++iter;
    // Fetch the current code point twice.
    assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
    units = *iter++;
    assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
    // In UTF-32 even a supplementary code point is a single code unit.
    assertEquals("iter[4] * length", 1, units.length());
    assertTrue("iter[4] * wellFormed", units.wellFormed());
    FwdIter<char32_t> data = units.data();
    assertTrue("iter[4] * data()[0]", *data == U"🚴"[0]);
    assertTrue("iter == endIter", iter == rangeLimit);
}