From 201248ad7fc2a5297be90352f9365a741a87e63d Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Sun, 9 Mar 2025 11:27:07 -0700 Subject: [PATCH] ICU-23004 Feinschliff --- icu4c/source/common/unicode/utfiter.h | 42 +++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/icu4c/source/common/unicode/utfiter.h b/icu4c/source/common/unicode/utfiter.h index ba928d96b3c..aa077b7a70f 100644 --- a/icu4c/source/common/unicode/utfiter.h +++ b/icu4c/source/common/unicode/utfiter.h @@ -914,12 +914,15 @@ public: // Constructor with start <= p < limit. // All of these iterators/pointers should be at code point boundaries. + // Not enabled if UnitIter is a single-pass input_iterator. + // TODO: Should we enable this only for a bidirectional_iterator? inline UTFIterator(UnitIter start, UnitIter p, UnitIter limit) : p_(p), start_(start), limit_(limit), units_(0, 0, false, p) {} // Constructs an iterator with start=p. inline UTFIterator(UnitIter p, UnitIter limit) : p_(p), start_(p), limit_(limit), units_(0, 0, false, p) {} // Constructs an iterator start or limit sentinel. + // Requires UnitIter to be copyable. inline UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p) {} inline UTFIterator(UTFIterator &&src) noexcept = default; @@ -1322,13 +1325,12 @@ public: /** @draft ICU 78 */ UTFIterator begin() const { - return {s.data(), s.data(), s.data() + s.length()}; + return {s.begin(), s.begin(), s.end()}; } /** @draft ICU 78 */ UTFIterator end() const { - const Unit *limit = s.data() + s.length(); - return {s.data(), limit, limit}; + return {s.begin(), s.end(), s.end()}; } /** @@ -1353,6 +1355,7 @@ private: /** * UTFIterator factory function for start <= p < limit. + * Not enabled if UnitIter is a single-pass input_iterator. * * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t * @tparam behavior How to handle ill-formed Unicode strings @@ -1851,12 +1854,12 @@ public: /** @draft ICU 78 */ UnsafeUTFIterator begin() const { - return {s.data()}; + return {s.begin()}; } /** @draft ICU 78 */ UnsafeUTFIterator end() const { - return {s.data() + s.length()}; + return {s.end()}; } /** @@ -2004,6 +2007,35 @@ int32_t unsafeReverseLoop8(std::string_view s) { } return sum; } + +char32_t firstCodePointOrFFFD16(std::u16string_view s) { + if (s.empty()) { return 0xfffd; } + auto range = utfStringCodePoints(s); + return range.begin()->codePoint(); +} + +std::string_view firstSequence8(std::string_view s) { + if (s.empty()) { return {}; } + auto range = utfStringCodePoints(s); + auto units = *(range.begin()); + if (units.wellFormed()) { + return units.stringView(); + } else { + return {}; + } +} + +char32_t unsafeFirstCodePointOrFFFD8(std::string_view s) { + if (s.empty()) { return 0xfffd; } + auto range = unsafeUTFStringCodePoints(s); + return range.begin()->codePoint(); +} + +std::string_view unsafeFirstSequence8(std::string_view s) { + if (s.empty()) { return {}; } + auto range = unsafeUTFStringCodePoints(s); + return range.begin()->stringView(); +} #endif } // namespace U_HEADER_ONLY_NAMESPACE