From 6fb4eca4937880347ad7551dc896397418f740c0 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 4 Apr 2025 11:44:20 -0700 Subject: [PATCH] ICU-23004 fix utfStringCodePoints(): StringView -> 5 string_view overloads --- icu4c/source/common/unicode/platform.h | 2 + icu4c/source/common/unicode/utfiterator.h | 175 ++++++++++++++++-- .../source/test/intltest/utfiteratortest.cpp | 15 +- 3 files changed, 163 insertions(+), 29 deletions(-) diff --git a/icu4c/source/common/unicode/platform.h b/icu4c/source/common/unicode/platform.h index b2fcb21ef13..45741a83deb 100644 --- a/icu4c/source/common/unicode/platform.h +++ b/icu4c/source/common/unicode/platform.h @@ -479,6 +479,8 @@ /* Otherwise use the predefined value. */ #elif !defined(__cplusplus) # define U_CPLUSPLUS_VERSION 0 +#elif __cplusplus >= 202002L || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) +# define U_CPLUSPLUS_VERSION 20 #elif __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) # define U_CPLUSPLUS_VERSION 17 #elif __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) diff --git a/icu4c/source/common/unicode/utfiterator.h b/icu4c/source/common/unicode/utfiterator.h index e022185bdeb..9e1d6a2140a 100644 --- a/icu4c/source/common/unicode/utfiterator.h +++ b/icu4c/source/common/unicode/utfiterator.h @@ -1632,21 +1632,94 @@ auto utfIterator(UnitIter p) { /** * UTFStringCodePoints factory function for a "range" of code points in a string, * which validates while decoding. - * Deduces the Unit template parameter from the input. + * Avoids having to explicitly specify the Unit template parameter for the UTFStringCodePoints. * * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; * should be signed if UTF_BEHAVIOR_NEGATIVE * @tparam behavior How to handle ill-formed Unicode strings - * @tparam StringView Can usually be omitted/deduced: A std::basic_string_view<Unit> - * @param s input string_view + * @param s input string * @return a UTFStringCodePoints<CP32, behavior, Unit> - * for the given std::basic_string_view<Unit>, - * deducing the Unit character type + * for the given std::basic_string_view<Unit> * @draft ICU 78 */ -template -auto utfStringCodePoints(StringView s) { - return UTFStringCodePoints(s); +template +auto utfStringCodePoints(std::string_view s) { + return UTFStringCodePoints(s); +} + +/** + * UTFStringCodePoints factory function for a "range" of code points in a string, + * which validates while decoding. + * Avoids having to explicitly specify the Unit template parameter for the UTFStringCodePoints. + * + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if UTF_BEHAVIOR_NEGATIVE + * @tparam behavior How to handle ill-formed Unicode strings + * @param s input string + * @return a UTFStringCodePoints<CP32, behavior, Unit> + * for the given std::basic_string_view<Unit> + * @draft ICU 78 + */ +template +auto utfStringCodePoints(std::u16string_view s) { + return UTFStringCodePoints(s); +} + +/** + * UTFStringCodePoints factory function for a "range" of code points in a string, + * which validates while decoding. + * Avoids having to explicitly specify the Unit template parameter for the UTFStringCodePoints. + * + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if UTF_BEHAVIOR_NEGATIVE + * @tparam behavior How to handle ill-formed Unicode strings + * @param s input string + * @return a UTFStringCodePoints<CP32, behavior, Unit> + * for the given std::basic_string_view<Unit> + * @draft ICU 78 + */ +template +auto utfStringCodePoints(std::u32string_view s) { + return UTFStringCodePoints(s); +} + +#if U_CPLUSPLUS_VERSION >= 20 +// The new type char8_t is distinct from char. u8"literals" are now char8_t literals. +/** + * UTFStringCodePoints factory function for a "range" of code points in a string, + * which validates while decoding. + * Avoids having to explicitly specify the Unit template parameter for the UTFStringCodePoints. + * + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if UTF_BEHAVIOR_NEGATIVE + * @tparam behavior How to handle ill-formed Unicode strings + * @param s input string + * @return a UTFStringCodePoints<CP32, behavior, Unit> + * for the given std::basic_string_view<Unit> + * @draft ICU 78 + */ +template +auto utfStringCodePoints(std::u8string_view s) { + return UTFStringCodePoints(s); +} +#endif + +/** + * UTFStringCodePoints factory function for a "range" of code points in a string, + * which validates while decoding. + * Avoids having to explicitly specify the Unit template parameter for the UTFStringCodePoints. + * + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if UTF_BEHAVIOR_NEGATIVE + * @tparam behavior How to handle ill-formed Unicode strings + * @param s input string + * @return a UTFStringCodePoints<CP32, behavior, Unit> + * for the given std::basic_string_view<Unit> + * @draft ICU 78 + */ +template +auto utfStringCodePoints(std::wstring_view s) { + return UTFStringCodePoints(s); } // Non-validating iterators ------------------------------------------------ *** @@ -2206,19 +2279,89 @@ auto unsafeUTFIterator(UnitIter iter) { /** * UnsafeUTFStringCodePoints factory function for a "range" of code points in a string. * The string must be well-formed. - * Deduces the Unit template parameter from the input. + * Avoids having to explicitly specify the Unit template parameter + * for the UnsafeUTFStringCodePoints. * * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t - * @tparam StringView Can usually be omitted/deduced: A std::basic_string_view<Unit> - * @param s input string_view + * @param s input string * @return an UnsafeUTFStringCodePoints<CP32, Unit> - * for the given std::basic_string_view<Unit>, - * deducing the Unit character type + * for the given std::basic_string_view<Unit> * @draft ICU 78 */ -template -auto unsafeUTFStringCodePoints(StringView s) { - return UnsafeUTFStringCodePoints(s); +template +auto unsafeUTFStringCodePoints(std::string_view s) { + return UnsafeUTFStringCodePoints(s); +} + +/** + * UnsafeUTFStringCodePoints factory function for a "range" of code points in a string. + * The string must be well-formed. + * Avoids having to explicitly specify the Unit template parameter + * for the UnsafeUTFStringCodePoints. + * + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t + * @param s input string + * @return an UnsafeUTFStringCodePoints<CP32, Unit> + * for the given std::basic_string_view<Unit> + * @draft ICU 78 + */ +template +auto unsafeUTFStringCodePoints(std::u16string_view s) { + return UnsafeUTFStringCodePoints(s); +} + +/** + * UnsafeUTFStringCodePoints factory function for a "range" of code points in a string. + * The string must be well-formed. + * Avoids having to explicitly specify the Unit template parameter + * for the UnsafeUTFStringCodePoints. + * + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t + * @param s input string + * @return an UnsafeUTFStringCodePoints<CP32, Unit> + * for the given std::basic_string_view<Unit> + * @draft ICU 78 + */ +template +auto unsafeUTFStringCodePoints(std::u32string_view s) { + return UnsafeUTFStringCodePoints(s); +} + +#if U_CPLUSPLUS_VERSION >= 20 +// The new type char8_t is distinct from char. u8"literals" are now char8_t literals. +/** + * UnsafeUTFStringCodePoints factory function for a "range" of code points in a string. + * The string must be well-formed. + * Avoids having to explicitly specify the Unit template parameter + * for the UnsafeUTFStringCodePoints. + * + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t + * @param s input string + * @return an UnsafeUTFStringCodePoints<CP32, Unit> + * for the given std::basic_string_view<Unit> + * @draft ICU 78 + */ +template +auto unsafeUTFStringCodePoints(std::u8string_view s) { + return UnsafeUTFStringCodePoints(s); +} +#endif + +/** + * UnsafeUTFStringCodePoints factory function for a "range" of code points in a string. + * The string must be well-formed. + * Avoids having to explicitly specify the Unit template parameter + * for the UnsafeUTFStringCodePoints. + * + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t + * @param s input string + * @return an UnsafeUTFStringCodePoints<CP32, Unit> + * for the given std::basic_string_view<Unit> + * @draft ICU 78 + */ +template +auto unsafeUTFStringCodePoints(std::wstring_view s) { + return UnsafeUTFStringCodePoints(s); } // ------------------------------------------------------------------------- *** diff --git a/icu4c/source/test/intltest/utfiteratortest.cpp b/icu4c/source/test/intltest/utfiteratortest.cpp index 4247df35762..f903eb82d61 100644 --- a/icu4c/source/test/intltest/utfiteratortest.cpp +++ b/icu4c/source/test/intltest/utfiteratortest.cpp @@ -507,22 +507,11 @@ public: template void testLongLinearContig(const ImplTest &test) { initLong(); - // TODO: fix utfStringCodePoints() & unsafeUTFStringCodePoints() - // to *actually take* string_view arguments. - // Currently, if we pass in a string, then the function makes a temporary copy - // of the string and creates an [Unsafe]UTFStringCodePoints which - // takes a copy of a string_view *which refers to the temporary copy* - // which then goes out of scope, taking its heap buffer with it. - // Look at unicode/char16ptr.h ConvertibleToU16StringView. - // If this means that we can no longer deduce the Unit type, then maybe - // remove these functions. - // If we can keep them, then pass test.str directly into the ...CodePoints() function. - std::basic_string_view sv{test.str}; if constexpr (mode == UNSAFE) { - auto range = unsafeUTFStringCodePoints(sv); + auto range = unsafeUTFStringCodePoints(test.str); testLongLinear(test, range.begin(), range.end()); } else { - auto range = utfStringCodePoints(sv); + auto range = utfStringCodePoints(test.str); testLongLinear(test, range.begin(), range.end()); } }