From 0c5fa181ff953144a07dbc4114356cfde35d8aef Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 3 Mar 2025 19:23:32 -0800 Subject: [PATCH] ICU-23004 UTF-32 --- icu4c/source/common/unicode/utfiter.h | 88 ++++++++++++- icu4c/source/test/intltest/itutil.cpp | 2 + icu4c/source/test/intltest/utfitertest.cpp | 136 +++++++++++++++++++++ 3 files changed, 224 insertions(+), 2 deletions(-) diff --git a/icu4c/source/common/unicode/utfiter.h b/icu4c/source/common/unicode/utfiter.h index 0ce01fc34ec..3ec78b36966 100644 --- a/icu4c/source/common/unicode/utfiter.h +++ b/icu4c/source/common/unicode/utfiter.h @@ -89,11 +89,14 @@ namespace header {} #ifndef U_HIDE_DRAFT_API // Some defined behaviors for handling ill-formed Unicode strings. -// TODO: For UTF-32, we have basically orthogonal conditions for surrogate vs. out-of-range. -// Maybe make U_BEHAVIOR_SURROGATE return FFFD for out-of-range? typedef enum UIllFormedBehavior { + // Returns a negative value instead of a code point. U_BEHAVIOR_NEGATIVE, + // Returns U+FFFD Replacement Character. U_BEHAVIOR_FFFD, + // UTF-8: Not allowed; + // UTF-16: returns the unpaired surrogate; + // UTF-32: returns the surrogate code point, or U+FFFD if out of range. U_BEHAVIOR_SURROGATE } UIllFormedBehavior; @@ -534,6 +537,87 @@ public: } }; +// UTF-32: trivial (each code point is read from exactly one code unit), but still validating: surrogates (0xd800..0xdfff) go to subSurrogate(), values above 0x10ffff go to sub(). NOTE(review): the template parameter/argument lists in this patch text (e.g. on the `template` line below) appear to be missing -- confirm against the upstream commit. +template +class UTFImpl< + UnitIter, + CP32, + behavior, + std::enable_if_t< + sizeof(typename std::iterator_traits::value_type) == 4>> { +public: + // Handle ill-formed UTF-32: Out of range. Returns U_SENTINEL for U_BEHAVIOR_NEGATIVE, else 0xfffd (for both U_BEHAVIOR_FFFD and U_BEHAVIOR_SURROGATE). + static inline CP32 sub() { + switch (behavior) { + case U_BEHAVIOR_NEGATIVE: return U_SENTINEL; + case U_BEHAVIOR_FFFD: + case U_BEHAVIOR_SURROGATE: return 0xfffd; + } + } + + // Handle ill-formed UTF-32: One unpaired surrogate. Returns U_SENTINEL for U_BEHAVIOR_NEGATIVE, 0xfffd for U_BEHAVIOR_FFFD, or the surrogate itself for U_BEHAVIOR_SURROGATE. 
+ static inline CP32 subSurrogate(CP32 surrogate) { + switch (behavior) { + case U_BEHAVIOR_NEGATIVE: return U_SENTINEL; + case U_BEHAVIOR_FFFD: return 0xfffd; + case U_BEHAVIOR_SURROGATE: return surrogate; + } + } + + static inline void inc(UnitIter &p, UnitIter /*limit*/) { + ++p; + } + + static inline void dec(UnitIter /*start*/, UnitIter &p) { + --p; + } + + static inline CodeUnits readAndInc(UnitIter &p, UnitIter /*limit*/) { + UnitIter p0 = p; + uint32_t uc = *p; + CP32 c = uc; + ++p; + if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) { + return {c, 1, true, p0}; + } else if (uc < 0xe000) { + return {subSurrogate(c), 1, false, p0}; + } else { + return {sub(), 1, false, p0}; + } + } + + static inline CodeUnits singlePassReadAndInc(UnitIter &p, UnitIter /*limit*/) { + uint32_t uc = *p; + CP32 c = uc; + ++p; + if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) { + return {c, 1, true}; + } else if (uc < 0xe000) { + return {subSurrogate(c), 1, false}; + } else { + return {sub(), 1, false}; + } + } + + static inline CodeUnits decAndRead(UnitIter /*start*/, UnitIter &p) { + uint32_t uc = *--p; + CP32 c = uc; + if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) { + return {c, 1, true, p}; + } else if (uc < 0xe000) { + return {subSurrogate(c), 1, false, p}; + } else { + return {sub(), 1, false, p}; + } + } + + static inline void moveToDecAndReadLimit(UnitIter &p, int8_t &state) { + // state < 0 after decAndRead(); step forward over the one code unit and reset the state to 0. + ++p; + state = 0; + } +}; + #endif /** diff --git a/icu4c/source/test/intltest/itutil.cpp b/icu4c/source/test/intltest/itutil.cpp index e2ae799781f..75af141f590 100644 --- a/icu4c/source/test/intltest/itutil.cpp +++ b/icu4c/source/test/intltest/itutil.cpp @@ -50,6 +50,7 @@ extern IntlTest *createStaticUnicodeSetsTest(); static IntlTest *createUHashTest(); extern IntlTest *createU16IteratorTest(); extern IntlTest *createU8IteratorTest(); +extern IntlTest *createU32IteratorTest(); void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, 
const char* &name, char* par ) { @@ -88,6 +89,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* & TESTCASE_AUTO_CREATE_CLASS(USetHeaderOnlyTest); TESTCASE_AUTO_CREATE_CLASS(U16IteratorTest); TESTCASE_AUTO_CREATE_CLASS(U8IteratorTest); + TESTCASE_AUTO_CREATE_CLASS(U32IteratorTest); TESTCASE_AUTO_END; } diff --git a/icu4c/source/test/intltest/utfitertest.cpp b/icu4c/source/test/intltest/utfitertest.cpp index b937c309fd0..ddf920c1e2f 100644 --- a/icu4c/source/test/intltest/utfitertest.cpp +++ b/icu4c/source/test/intltest/utfitertest.cpp @@ -462,3 +462,139 @@ void U8IteratorTest::testFwdIter() { assertTrue("iter[4] * data()[3]", *data == u8"🚴"[3]); assertTrue("iter == endIter", iter == rangeLimit); } + +class U32IteratorTest : public IntlTest { +public: + U32IteratorTest() {} + + void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override; + + void testGood(); + void testNegative(); + void testFFFD(); + void testSinglePassIter(); + void testFwdIter(); +}; + +extern IntlTest *createU32IteratorTest() { + return new U32IteratorTest(); +} + +void U32IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) { + if(exec) { + logln("TestSuite U32IteratorTest: "); + } + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(testGood); + // TODO: TESTCASE_AUTO(testNegative); (declared in the class above; not yet implemented in this patch) + // TODO: TESTCASE_AUTO(testFFFD); (declared in the class above; not yet implemented in this patch) + TESTCASE_AUTO(testSinglePassIter); + TESTCASE_AUTO(testFwdIter); + TESTCASE_AUTO_END; +} + +void U32IteratorTest::testGood() { + std::u32string_view good(U"abçカ🚴"sv); + UTFStringCodePoints range(good); + assertTrue( + "bidirectional_iterator_tag", + std::is_same_v< + typename std::iterator_traits< + UTFIterator>::iterator_category, + std::bidirectional_iterator_tag>); + auto iter = range.begin(); + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); + assertEquals("iter[0] -> codePoint", u'a', iter->codePoint()); + ++iter; // pre-increment + auto units = *iter; + 
assertEquals("iter[1] * codePoint", u'b', units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertTrue("iter[1] * wellFormed", units.wellFormed()); + assertTrue("iter[1] * stringView()", units.stringView() == U"b"sv); + ++iter; + assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment + assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint()); + ++iter; + // Fetch the current code point twice: the first dereference must leave the iterator in place. + assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint()); + units = *iter++; + assertEquals("iter[4] * codePoint", U'🚴', units.codePoint()); + assertEquals("iter[4] * length", 1, units.length()); + assertTrue("iter[4] * wellFormed", units.wellFormed()); + assertTrue("iter[4] * stringView()", units.stringView() == U"🚴"sv); + assertTrue("iter == endIter", iter == range.end()); +} + +void U32IteratorTest::testSinglePassIter() { + SinglePassSource good(U"abçカ🚴"sv); + SinglePassIter goodBegin(good); + SinglePassIter goodLimit{}; + UTFIterator, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin( + goodBegin, goodLimit); + UTFIterator, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit); + assertTrue( + "input_iterator_tag", + std::is_same_v< + typename std::iterator_traits< + UTFIterator, UChar32, U_BEHAVIOR_NEGATIVE>>::iterator_category, + std::input_iterator_tag>); + auto iter = rangeBegin; + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); + assertEquals("iter[0] -> codePoint", u'a', iter->codePoint()); + ++iter; // pre-increment + auto units = *iter; + assertEquals("iter[1] * codePoint", u'b', units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertTrue("iter[1] * wellFormed", units.wellFormed()); + // No units.stringView() when the unit iterator is not a pointer. + // No data() for a single-pass unit iterator. 
+ ++iter; + assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment + assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint()); + ++iter; + // Fetch the current code point twice: the first dereference must leave the iterator in place. + assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint()); + units = *iter++; + assertEquals("iter[4] * codePoint", U'🚴', units.codePoint()); + assertEquals("iter[4] * length", 1, units.length()); + assertTrue("iter[4] * wellFormed", units.wellFormed()); + assertTrue("iter == endIter", iter == rangeLimit); +} + +void U32IteratorTest::testFwdIter() { + std::u32string_view good(U"abçカ🚴"sv); + FwdIter goodBegin(good.data()); + FwdIter goodLimit(good.data() + good.length()); + UTFIterator, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin(goodBegin, goodLimit); + UTFIterator, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit); + // TODO: UTFStringCodePoints range(good); + assertTrue( + "forward_iterator_tag", + std::is_same_v< + typename std::iterator_traits< + UTFIterator, UChar32, U_BEHAVIOR_NEGATIVE>>::iterator_category, + std::forward_iterator_tag>); + auto iter = rangeBegin; + assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); + assertEquals("iter[0] -> codePoint", u'a', iter->codePoint()); + ++iter; // pre-increment + auto units = *iter; + assertEquals("iter[1] * codePoint", u'b', units.codePoint()); + assertEquals("iter[1] * length", 1, units.length()); + assertTrue("iter[1] * wellFormed", units.wellFormed()); + // No units.stringView() when the unit iterator is not a pointer. + assertTrue("iter[1] * data()[0]", *units.data() == u'b'); + ++iter; + assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment + assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint()); + ++iter; + // Fetch the current code point twice: the first dereference must leave the iterator in place. 
+ assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint()); + units = *iter++; + assertEquals("iter[4] * codePoint", U'🚴', units.codePoint()); + assertEquals("iter[4] * length", 1, units.length()); + assertTrue("iter[4] * wellFormed", units.wellFormed()); + FwdIter data = units.data(); + assertTrue("iter[4] * data()[0]", *data == U"🚴"[0]); + assertTrue("iter == endIter", iter == rangeLimit); +}