From 65c155de9ba59fb1683842e27936bfa9eae32539 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 7 Mar 2025 10:37:12 -0800 Subject: [PATCH] ICU-23004 derive CodeUnits from UnsafeCodeUnits --- icu4c/source/common/unicode/utfiter.h | 163 +++++++++++++------------- 1 file changed, 83 insertions(+), 80 deletions(-) diff --git a/icu4c/source/common/unicode/utfiter.h b/icu4c/source/common/unicode/utfiter.h index 5fd8620c5f8..66082111819 100644 --- a/icu4c/source/common/unicode/utfiter.h +++ b/icu4c/source/common/unicode/utfiter.h @@ -106,93 +106,17 @@ typedef enum UIllFormedBehavior { namespace U_HEADER_ONLY_NAMESPACE { /** - * Result of validating and decoding a minimal Unicode code unit sequence. - * Returned from validating Unicode string code point iterators. - * - * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: - * UTF-8: char or char8_t or uint8_t; - * UTF-16: char16_t or uint16_t or (on Windows) wchar_t - * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; - * should be signed if U_BEHAVIOR_NEGATIVE - * @draft ICU 78 - */ -template -class CodeUnits { - using Unit = typename std::iterator_traits::value_type; -public: - // @internal - CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter data) : - c(codePoint), len(length), ok(wellFormed), p(data) {} - - CodeUnits(const CodeUnits &other) = default; - CodeUnits &operator=(const CodeUnits &other) = default; - - UChar32 codePoint() const { return c; } - - bool wellFormed() const { return ok; } - - UnitIter data() const { return p; } - - uint8_t length() const { return len; } - - template - std::enable_if_t< - std::is_pointer_v, - std::basic_string_view> - stringView() const { - return std::basic_string_view(p, len); - } - -private: - // Order of fields with padding and access frequency in mind. - CP32 c; - uint8_t len; - bool ok; - UnitIter p; -}; - -#ifndef U_IN_DOXYGEN -// Partial template specialization for single-pass input iterator. 
-// No UnitIter field, no getter for it, no stringView(). -template -class CodeUnits< - UnitIter, - CP32, - std::enable_if_t< - !std::is_base_of_v< - std::forward_iterator_tag, - typename std::iterator_traits::iterator_category>>> { -public: - // @internal - CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) : - c(codePoint), len(length), ok(wellFormed) {} - - CodeUnits(const CodeUnits &other) = default; - CodeUnits &operator=(const CodeUnits &other) = default; - - UChar32 codePoint() const { return c; } - - bool wellFormed() const { return ok; } - - uint8_t length() const { return len; } - -private: - // Order of fields with padding and access frequency in mind. - CP32 c; - uint8_t len; - bool ok; -}; -#endif // U_IN_DOXYGEN - -/** - * Result of decoding a minimal Unicode code unit sequence which must be well-formed. + * Result of decoding a minimal Unicode code unit sequence. * Returned from non-validating Unicode string code point iterators. + * Base class for class CodeUnits which is returned from validating iterators. * * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: * UTF-8: char or char8_t or uint8_t; * UTF-16: char16_t or uint16_t or (on Windows) wchar_t * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; * should be signed if U_BEHAVIOR_NEGATIVE + * @see UnsafeUTFIterator + * @see UnsafeUTFStringCodePoints * @draft ICU 78 */ template @@ -206,12 +130,33 @@ public: UnsafeCodeUnits(const UnsafeCodeUnits &other) = default; UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default; + /** + * @return the Unicode code point decoded from the code unit sequence. + * If the sequence is ill-formed and the iterator validates, + * then this is a replacement value according to the iterator's + * UIllFormedBehavior template parameter. + * @draft ICU 78 + */ UChar32 codePoint() const { return c; } + /** + * @return the start of the minimal Unicode code unit sequence. 
+ * Not enabled if UnitIter is a single-pass input_iterator. + * @draft ICU 78 + */ UnitIter data() const { return p; } + /** + * @return the length of the minimal Unicode code unit sequence. + * @draft ICU 78 + */ uint8_t length() const { return len; } + /** + * @return a string_view of the minimal Unicode code unit sequence. + * Enabled only if UnitIter is a pointer. + * @draft ICU 78 + */ template std::enable_if_t< std::is_pointer_v, @@ -256,6 +201,64 @@ private: }; #endif // U_IN_DOXYGEN +/** + * Result of validating and decoding a minimal Unicode code unit sequence. + * Returned from validating Unicode string code point iterators. + * Adds function wellFormed() to base class UnsafeCodeUnits. + * + * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: + * UTF-8: char or char8_t or uint8_t; + * UTF-16: char16_t or uint16_t or (on Windows) wchar_t + * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; + * should be signed if U_BEHAVIOR_NEGATIVE + * @see UTFIterator + * @see UTFStringCodePoints + * @draft ICU 78 + */ +template +class CodeUnits : public UnsafeCodeUnits { +public: + // @internal + CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter data) : + UnsafeCodeUnits(codePoint, length, data), ok(wellFormed) {} + + CodeUnits(const CodeUnits &other) = default; + CodeUnits &operator=(const CodeUnits &other) = default; + + bool wellFormed() const { return ok; } + +private: + bool ok; +}; + +#ifndef U_IN_DOXYGEN +// Partial template specialization for single-pass input iterator. +// No UnitIter field, no getter for it, no stringView(). 
+template +class CodeUnits< + UnitIter, + CP32, + std::enable_if_t< + !std::is_base_of_v< + std::forward_iterator_tag, + typename std::iterator_traits::iterator_category>>> : + public UnsafeCodeUnits { +public: + // @internal + CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) : + UnsafeCodeUnits(codePoint, length), ok(wellFormed) {} + + CodeUnits(const CodeUnits &other) = default; + CodeUnits &operator=(const CodeUnits &other) = default; + + bool wellFormed() const { return ok; } + +private: + bool ok; +}; +#endif // U_IN_DOXYGEN + +// Validating implementations --------------------------------------------- *** #ifndef U_IN_DOXYGEN // @internal