From 518967ab0119f6b9f06db0d06bcff06833701c33 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 28 Feb 2025 15:32:30 -0800 Subject: [PATCH] ICU-23004 UTF-8 backward --- icu4c/source/common/unicode/utfiter.h | 102 +++++++++++++++++++++----- 1 file changed, 82 insertions(+), 20 deletions(-) diff --git a/icu4c/source/common/unicode/utfiter.h b/icu4c/source/common/unicode/utfiter.h index c6c75b98da9..11b88d8a92a 100644 --- a/icu4c/source/common/unicode/utfiter.h +++ b/icu4c/source/common/unicode/utfiter.h @@ -39,6 +39,14 @@ namespace header {} #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) #define U16_GET_SUPPLEMENTARY(lead, trail) \ (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) +// unicode/utf8.h +#define U8_IS_SINGLE(c) (((c)&0x80)==0) +#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32) +#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40) +#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" +#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) +#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" +#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) #endif /** @@ -265,7 +273,7 @@ public: ++p; return {c, length, true, p0}; } - return {sub(), length, false, p0}; // ill-formed + return {sub(), length, false, p0}; } static CodeUnits singlePassReadAndInc(UnitIter &p, UnitIter limit) { @@ -300,26 +308,62 @@ public: ++p; return {c, length, true}; } - return {sub(), length, false}; // ill-formed + return {sub(), length, false}; } static CodeUnits decAndRead(UnitIter start, UnitIter &p) { // TODO: assert p != start -- more precisely: start < p <= limit // Very similar to U8_PREV_OR_FFFD(). - CP32 c = *--p; - if (!U8_IS_SURROGATE(c)) { + CP32 c = uint8_t(*--p); + if (U8_IS_SINGLE(c)) { return {c, 1, true, p}; - } else { - UnitIter p1; - uint16_t c2; - if (U8_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U8_IS_LEAD(c2 = *--p1))) { - p = p1; - c = U8_GET_SUPPLEMENTARY(c2, c); - return {c, 2, true, p}; - } else { - return {sub(c), 1, false, p}; + } + if (U8_IS_TRAIL(c) && p != start) { + UnitIter p1 = p; + uint8_t b1 = *--p1; + if (U8_IS_LEAD(b1)) { + if (b1 < 0xe0) { + p = p1; + c = ((b1 - 0xc0) << 6) | (c & 0x3f); + return {c, 2, true, p}; + } else if (b1 < 0xf0 ? + U8_IS_VALID_LEAD3_AND_T1(b1, c) : + U8_IS_VALID_LEAD4_AND_T1(b1, c)) { + // Truncated 3- or 4-byte sequence. + p = p1; + return {sub(), 2, false, p}; + } + } else if (U8_IS_TRAIL(b1) && p1 != start) { + // Extract the value bits from the last trail byte. + c &= 0x3f; + uint8_t b2 = *--p1; + if (0xe0 <= b2 && b2 <= 0xf4) { + if (b2 < 0xf0) { + b2 &= 0xf; + if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + p = p1; + c = (b2 << 12) | ((b1 & 0x3f) << 6) | c; + return {c, 3, true, p}; + } + } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { + // Truncated 4-byte sequence. + p = p1; + return {sub(), 3, false, p}; + } + } else if (U8_IS_TRAIL(b2) && p1 != start) { + uint8_t b3 = *--p1; + if (0xf0 <= b3 && b3 <= 0xf4) { + b3 &= 7; + if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { + p = p1; + c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c; + return {c, 4, true, p}; + } + } + } } } + return {sub(), 1, false, p}; } static void moveToReadAndIncStart(UnitIter &p, int8_t &state) { @@ -1023,7 +1067,7 @@ private: // TODO: remove experimental sample code #ifndef UTYPES_H -int32_t rangeLoop(std::u16string_view s) { +int32_t rangeLoop16(std::u16string_view s) { header::U16StringCodePoints range(s); int32_t sum = 0; for (auto units : range) { @@ -1032,7 +1076,7 @@ int32_t rangeLoop(std::u16string_view s) { return sum; } -int32_t loopIterPlusPlus(std::u16string_view s) { +int32_t loopIterPlusPlus16(std::u16string_view s) { header::U16StringCodePoints range(s); int32_t sum = 0; auto iter = range.begin(); @@ -1043,7 +1087,7 @@ int32_t loopIterPlusPlus(std::u16string_view s) { return sum; } -int32_t backwardLoop(std::u16string_view s) { +int32_t backwardLoop16(std::u16string_view s) { header::U16StringCodePoints range(s); int32_t sum = 0; auto start = range.begin(); @@ -1054,7 +1098,7 @@ int32_t backwardLoop(std::u16string_view s) { return sum; } -int32_t reverseLoop(std::u16string_view s) { +int32_t reverseLoop16(std::u16string_view s) { header::U16StringCodePoints range(s); int32_t sum = 0; for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { @@ -1063,7 +1107,7 @@ int32_t reverseLoop(std::u16string_view s) { return sum; } -int32_t unsafeRangeLoop(std::u16string_view s) { +int32_t unsafeRangeLoop16(std::u16string_view s) { header::U16UnsafeStringCodePoints range(s); int32_t sum = 0; for (auto units : range) { @@ -1072,11 +1116,29 @@ int32_t unsafeRangeLoop(std::u16string_view s) { return sum; } -int32_t unsafeReverseLoop(std::u16string_view s) { +int32_t unsafeReverseLoop16(std::u16string_view s) { header::U16UnsafeStringCodePoints range(s); int32_t sum = 0; for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { - sum += (*iter).codePoint(); + sum += iter->codePoint(); + } + return sum; +} + +int32_t rangeLoop8(std::string_view s) { + header::U16StringCodePoints range(s); + int32_t sum = 0; + for (auto units : range) { + sum += units.codePoint(); + } + return sum; +} + +int32_t reverseLoop(std::string_view s) { + header::U16StringCodePoints range(s); + int32_t sum = 0; + for (auto iter = range.rbegin(); iter != range.rend(); ++iter) { + sum += iter->codePoint(); } return sum; }