diff --git a/icu4c/source/common/unicode/utfiter.h b/icu4c/source/common/unicode/utfiter.h index 66082111819..a49d795a9c7 100644 --- a/icu4c/source/common/unicode/utfiter.h +++ b/icu4c/source/common/unicode/utfiter.h @@ -1338,6 +1338,20 @@ private: std::basic_string_view s; }; +/** + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam UIllFormedBehavior How to handle ill-formed Unicode strings + * @return a UTFStringCodePoints<Unit, CP32, behavior> + * for the given std::basic_string_view<Unit>, + * deducing the Unit character type + * @draft ICU 78 + */ +template +auto utfStringCodePoints(StringView s) { + return UTFStringCodePoints(s); +} + // Non-validating iterators ------------------------------------------------ *** /** @@ -1354,8 +1368,7 @@ private: * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: * UTF-8: char or char8_t or uint8_t; * UTF-16: char16_t or uint16_t or (on Windows) wchar_t - * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t * @draft ICU 78 */ template @@ -1741,8 +1754,7 @@ namespace U_HEADER_ONLY_NAMESPACE { * @tparam Unit Code unit type: * UTF-8: char or char8_t or uint8_t; * UTF-16: char16_t or uint16_t or (on Windows) wchar_t - * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U_BEHAVIOR_NEGATIVE + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t * @draft ICU 78 */ template @@ -1790,21 +1802,32 @@ private: std::basic_string_view s; }; +/** + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t + * @return an UnsafeUTFStringCodePoints<Unit, CP32> + * for the given std::basic_string_view<Unit>, + * deducing the Unit character type + * @draft ICU 78 + */ +template +auto unsafeUTFStringCodePoints(StringView s) { + return UnsafeUTFStringCodePoints(s); +} + // ------------------------------------------------------------------------- *** // TODO: remove experimental sample code #ifndef UTYPES_H int32_t rangeLoop16(std::u16string_view s) { - header::UTFStringCodePoints range(s); int32_t sum = 0; - for (auto units : range) { + for (auto units : header::utfStringCodePoints(s)) { sum += units.codePoint(); } return sum; } int32_t loopIterPlusPlus16(std::u16string_view s) { - header::UTFStringCodePoints range(s); + auto range = header::utfStringCodePoints(s); int32_t sum = 0; auto iter = range.begin(); auto limit = range.end(); @@ -1815,7 +1838,7 @@ int32_t loopIterPlusPlus16(std::u16string_view s) { } int32_t backwardLoop16(std::u16string_view s) { - header::UTFStringCodePoints range(s); + auto range = header::utfStringCodePoints(s); int32_t sum = 0; auto start = range.begin(); auto iter = range.end(); @@ -1826,7 +1849,7 @@ int32_t backwardLoop16(std::u16string_view s) { } int32_t reverseLoop16(std::u16string_view s) { - header::UTFStringCodePoints range(s); + auto range = header::utfStringCodePoints(s); int32_t sum = 0; for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { sum += iter->codePoint(); @@ -1835,16 +1858,15 @@ int32_t reverseLoop16(std::u16string_view s) { } int32_t unsafeRangeLoop16(std::u16string_view s) { - header::UnsafeUTFStringCodePoints range(s); int32_t sum = 0; - for (auto units : range) { + for (auto units : header::unsafeUTFStringCodePoints(s)) { sum += units.codePoint(); } return sum; } int32_t unsafeReverseLoop16(std::u16string_view s) { - header::UnsafeUTFStringCodePoints range(s); + auto range = header::unsafeUTFStringCodePoints(s); int32_t sum = 0; for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { sum += iter->codePoint(); @@ -1853,16 +1875,15 @@ int32_t unsafeReverseLoop16(std::u16string_view s) { } int32_t rangeLoop8(std::string_view s) { - header::UTFStringCodePoints range(s); int32_t sum = 0; - for (auto units : range) { + for (auto units : header::utfStringCodePoints(s)) { sum += units.codePoint(); } return sum; } int32_t reverseLoop8(std::string_view s) { - header::UTFStringCodePoints range(s); + auto range = header::utfStringCodePoints(s); int32_t sum = 0; for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { sum += iter->codePoint(); @@ -1882,16 +1903,15 @@ int32_t macroLoop8(std::string_view s) { } int32_t unsafeRangeLoop8(std::string_view s) { - header::UnsafeUTFStringCodePoints range(s); int32_t sum = 0; - for (auto units : range) { + for (auto units : header::unsafeUTFStringCodePoints(s)) { sum += units.codePoint(); } return sum; } int32_t unsafeReverseLoop8(std::string_view s) { - header::UnsafeUTFStringCodePoints range(s); + auto range = header::unsafeUTFStringCodePoints(s); int32_t sum = 0; for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { sum += iter->codePoint(); diff --git a/icu4c/source/test/intltest/utfitertest.cpp b/icu4c/source/test/intltest/utfitertest.cpp index ddf920c1e2f..721aa147515 100644 --- a/icu4c/source/test/intltest/utfitertest.cpp +++ b/icu4c/source/test/intltest/utfitertest.cpp @@ -22,6 +22,7 @@ using namespace std::string_view_literals; using U_HEADER_ONLY_NAMESPACE::UTFIterator; using U_HEADER_ONLY_NAMESPACE::UTFStringCodePoints; +using U_HEADER_ONLY_NAMESPACE::utfStringCodePoints; // Shared state for one or more copies of single-pass iterators. // Similar to https://en.cppreference.com/w/cpp/iterator/istreambuf_iterator @@ -141,7 +142,7 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam void U16IteratorTest::testGood() { std::u16string_view good(u"abçカ🚴"sv); - UTFStringCodePoints range(good); + auto range = utfStringCodePoints(good); // TODO: Try to un-hardcode the iterator types in these checks via declspec. assertTrue( "bidirectional_iterator_tag", @@ -175,7 +176,7 @@ void U16IteratorTest::testGood() { void U16IteratorTest::testNegative() { static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - UTFStringCodePoints range(bad); + auto range = utfStringCodePoints(bad); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); assertEquals("iter[0] -> codePoint", u'a', iter->codePoint()); @@ -200,7 +201,7 @@ void U16IteratorTest::testNegative() { void U16IteratorTest::testFFFD() { static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - UTFStringCodePoints range(bad); + auto range = utfStringCodePoints(bad); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); assertEquals("iter[0] -> codePoint", u'a', iter->codePoint()); @@ -224,7 +225,7 @@ void U16IteratorTest::testFFFD() { void U16IteratorTest::testSurrogate() { static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' }; std::u16string_view bad(badChars, 5); - UTFStringCodePoints range(bad); + auto range = utfStringCodePoints(bad); auto iter = range.begin(); assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint()); assertEquals("iter[0] -> codePoint", u'a', iter->codePoint()); @@ -354,7 +355,7 @@ void U8IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name void U8IteratorTest::testGood() { std::string_view good(reinterpret_cast(u8"abçカ🚴")); - UTFStringCodePoints range(good); + auto range = utfStringCodePoints(good); assertTrue( "bidirectional_iterator_tag", std::is_same_v< @@ -495,7 +496,7 @@ void U32IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam void U32IteratorTest::testGood() { std::u32string_view good(U"abçカ🚴"sv); - UTFStringCodePoints range(good); + auto range = utfStringCodePoints(good); assertTrue( "bidirectional_iterator_tag", std::is_same_v<