template param: code point type

2025-04-21 04:29:31 +00:00 · 2024-12-26 12:45:17 -08:00 · 2024-12-26 12:45:17 -08:00 · 64ea1100ea
commit 64ea1100ea
parent 6851e8db32
2 changed files with 30 additions and 30 deletions
--- a/icu4c/source/common/unicode/utf16cppiter.h
+++ b/icu4c/source/common/unicode/utf16cppiter.h
@ -29,30 +29,26 @@ namespace U_HEADER_ONLY_NAMESPACE {
 //
 // TODO: A possible alternative to an enum might be some kind of function template
 // which would be fully customizable.
-// The operator*() return value might then want to be a template parameter as well.
-// For example, for a well-formed sequence, the return value could be
-// a tuple of (code point, well-formed), or a string view, or...
-// (And then the caller could choose between UChar32 and char32_t.)
-// However, all of that would make the API more complex and daunting.
 enum U16IllFormedBehavior {
    U16_BEHAVIOR_NEGATIVE,
    U16_BEHAVIOR_FFFD,
    U16_BEHAVIOR_SURROGATE
 };

-// TODO: Consider a template parameter for UChar32 vs. char32_t vs. uint32_t.
-
 /**
 * A code unit sequence for one code point returned by U16Iterator.
+ * TODO: Share with UTF-8?
 *
 * TODO: check doxygen syntax for template parameters
- * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t
+ * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
+ * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
+ *             should be signed if U16_BEHAVIOR_NEGATIVE
 * @draft ICU 77
 */
-template<typename Unit16>
+template<typename Unit16, typename CP32>
 struct U16OneSeq {
    // Order of fields with padding and access frequency in mind.
-    UChar32 codePoint = 0;
+    CP32 codePoint = 0;
    uint8_t length = 0;
    bool isWellFormed = false;
    const Unit16 *data;
@ -61,18 +57,20 @@ struct U16OneSeq {
        return std::basic_string_view<Unit16>(data, length);
    }

-    // TODO: std::optional<UChar32> maybeCodePoint() const ? (nullopt if !isWellFormed)
+    // TODO: std::optional<CP32> maybeCodePoint() const ? (nullopt if !isWellFormed)
 };

 /**
 * Validating iterator over the code points in a Unicode 16-bit string.
 *
 * TODO: check doxygen syntax for template parameters
- * @param Unit16 char16_t or uint16_t or (on Windows) wchar_t
+ * @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
+ * @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
+ *             should be signed if U16_BEHAVIOR_NEGATIVE
 * @param U16IllFormedBehavior TODO
 * @draft ICU 77
 */
-template<typename Unit16, U16IllFormedBehavior behavior>
+template<typename Unit16, typename CP32, U16IllFormedBehavior behavior>
 class U16Iterator {
 public:
    // TODO: make private, make friends
@ -86,10 +84,10 @@ public:
    bool operator==(const U16Iterator &other) const { return p == other.p; }
    bool operator!=(const U16Iterator &other) const { return !operator==(other); }

-    U16OneSeq<Unit16> operator*() const {
+    U16OneSeq<Unit16, CP32> operator*() const {
        // TODO: assert p != limit -- more precisely: start <= p < limit
        // Similar to U16_NEXT_OR_FFFD().
-        UChar32 c = *p;
+        CP32 c = *p;
        if (!U16_IS_SURROGATE(c)) {
            return {c, 1, true, p};
        } else {
@ -118,7 +116,7 @@ public:
        // More similar to U16_NEXT_OR_FFFD() than U16_FWD_1() to try to help the compiler
        // amortize work between operator*() and operator++(int) in typical *it++ usage.
        // Otherwise this is slightly less efficient because it tests a lead surrogate twice.
-        UChar32 c = *p++;
+        CP32 c = *p++;
        if (U16_IS_SURROGATE(c) &&
                U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
            ++p;
@ -126,9 +124,13 @@ public:
        return result;
    }

+    // TODO: operator--()
+    // TODO: maybe fused readAndInc()?
+    // TODO: maybe fused decAndRead()?
+
 private:
    // Handle ill-formed UTF-16: One unpaired surrogate.
-    UChar32 sub(UChar32 surrogate) const {
+    CP32 sub(CP32 surrogate) const {
        switch (behavior) {
            case U16_BEHAVIOR_NEGATIVE: return U_SENTINEL;
            case U16_BEHAVIOR_FFFD: return 0xfffd;
@ -149,7 +151,7 @@ private:
 * @return a code point iterator.
 * @draft ICU 77
 */
-template<typename Unit16, U16IllFormedBehavior behavior>
+template<typename Unit16, typename CP32, U16IllFormedBehavior behavior>
 class U16StringCodePoints {
 public:
    /**
@ -162,12 +164,12 @@ public:
    U16StringCodePoints(const U16StringCodePoints &other) = default;

    /** @draft ICU 77 */
-    U16Iterator<Unit16, behavior> begin() const {
+    U16Iterator<Unit16, CP32, behavior> begin() const {
        return {s.data(), s.data(), s.data() + s.length()};
    }

    /** @draft ICU 77 */
-    U16Iterator<Unit16, behavior> end() const {
+    U16Iterator<Unit16, CP32, behavior> end() const {
        const Unit16 *limit = s.data() + s.length();
        return {s.data(), limit, limit};
    }
@ -183,8 +185,6 @@ private:
 // template<typename Unit16>
 // class U16UnsafeIterator
 // TODO: only p, no start, no limit
-// TODO: can/should we read the code point only in operator*()?
-// if we read it in the constructor, then we would still need start/limit...

 }  // namespace U_HEADER_ONLY_NAMESPACE

--- a/icu4c/source/test/intltest/utfcppitertest.cpp
+++ b/icu4c/source/test/intltest/utfcppitertest.cpp
@ -58,11 +58,11 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam
 void U16IteratorTest::testGood() {
    IcuTestErrorCode errorCode(*this, "testGood");
    std::u16string_view good(u"abçカ🚴"sv);
-    U16StringCodePoints<char16_t, U16_BEHAVIOR_NEGATIVE> range(good);
+    U16StringCodePoints<char16_t, UChar32, U16_BEHAVIOR_NEGATIVE> range(good);
    auto iter = range.begin();
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
    ++iter;  // pre-increment
-    U16OneSeq<char16_t> seq = *iter;
+    auto seq = *iter;
    assertEquals("iter[1] * codePoint", u'b', seq.codePoint);
    assertEquals("iter[1] * length", 1, seq.length);
    assertTrue("iter[1] * isWellFormed", seq.isWellFormed);
@ -83,11 +83,11 @@ void U16IteratorTest::testNegative() {
    IcuTestErrorCode errorCode(*this, "testNegative");
    static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
    std::u16string_view bad(badChars, 5);
-    U16StringCodePoints<char16_t, U16_BEHAVIOR_NEGATIVE> range(bad);
+    U16StringCodePoints<char16_t, UChar32, U16_BEHAVIOR_NEGATIVE> range(bad);
    auto iter = range.begin();
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
    ++iter;  // pre-increment
-    U16OneSeq<char16_t> seq = *iter;
+    auto seq = *iter;
    assertEquals("iter[1] * codePoint", -1, seq.codePoint);
    assertEquals("iter[1] * length", 1, seq.length);
    assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
@ -107,11 +107,11 @@ void U16IteratorTest::testFFFD() {
    IcuTestErrorCode errorCode(*this, "testFFFD");
    static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
    std::u16string_view bad(badChars, 5);
-    U16StringCodePoints<char16_t, U16_BEHAVIOR_FFFD> range(bad);
+    U16StringCodePoints<char16_t, char32_t, U16_BEHAVIOR_FFFD> range(bad);
    auto iter = range.begin();
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
    ++iter;  // pre-increment
-    U16OneSeq<char16_t> seq = *iter;
+    auto seq = *iter;
    assertEquals("iter[1] * codePoint", 0xfffd, seq.codePoint);
    assertEquals("iter[1] * length", 1, seq.length);
    assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
@ -131,11 +131,11 @@ void U16IteratorTest::testSurrogate() {
    IcuTestErrorCode errorCode(*this, "testSurrogate");
    static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
    std::u16string_view bad(badChars, 5);
-    U16StringCodePoints<char16_t, U16_BEHAVIOR_SURROGATE> range(bad);
+    U16StringCodePoints<char16_t, uint32_t, U16_BEHAVIOR_SURROGATE> range(bad);
    auto iter = range.begin();
    assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
    ++iter;  // pre-increment
-    U16OneSeq<char16_t> seq = *iter;
+    auto seq = *iter;
    assertEquals("iter[1] * codePoint", 0xd900, seq.codePoint);
    assertEquals("iter[1] * length", 1, seq.length);
    assertFalse("iter[1] * isWellFormed", seq.isWellFormed);