ICU-23004 utfStringCodePoints(StringView) deduces Unit

This commit is contained in:
Markus Scherer 2025-03-07 11:25:20 -08:00
parent 65c155de9b
commit d1e9fb7003
2 changed files with 45 additions and 24 deletions

View file

@ -1338,6 +1338,20 @@ private:
std::basic_string_view<Unit> s;
};
/**
 * Creates a UTFStringCodePoints range for a string (view), deducing the Unit character type
 * from the argument so that callers only spell out the code point type and the behavior.
 *
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
 *              should be signed if U_BEHAVIOR_NEGATIVE
 * @tparam behavior How to handle ill-formed Unicode strings
 * @tparam StringView Deduced argument type, normally a std::basic_string_view&lt;Unit&gt;;
 *              its value_type member is used as the Unit character type
 * @param s The string to iterate over. It must outlive the returned range.
 * @return a UTFStringCodePoints&lt;Unit, CP32, behavior&gt;
 *         for the given std::basic_string_view&lt;Unit&gt;,
 *         deducing the Unit character type
 * @draft ICU 78
 */
template<typename CP32, UIllFormedBehavior behavior, typename StringView>
auto utfStringCodePoints(const StringView &s) {
    // Take the argument by reference rather than by value: StringView is deduced, so it can
    // be an owning type such as std::basic_string. A by-value parameter would then be a
    // function-local copy, and a range that keeps only a view of it (as the
    // std::basic_string_view<Unit> member suggests) would dangle as soon as we return.
    using Unit = typename StringView::value_type;
    return UTFStringCodePoints<Unit, CP32, behavior>(s);
}
// Non-validating iterators ------------------------------------------------ ***
/**
@ -1354,8 +1368,7 @@ private:
* @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
* UTF-8: char or char8_t or uint8_t;
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
* should be signed if U_BEHAVIOR_NEGATIVE
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
* @draft ICU 78
*/
template<typename UnitIter, typename CP32, typename = void>
@ -1741,8 +1754,7 @@ namespace U_HEADER_ONLY_NAMESPACE {
* @tparam Unit Code unit type:
* UTF-8: char or char8_t or uint8_t;
* UTF-16: char16_t or uint16_t or (on Windows) wchar_t
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
* should be signed if U_BEHAVIOR_NEGATIVE
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
* @draft ICU 78
*/
template<typename Unit, typename CP32>
@ -1790,21 +1802,32 @@ private:
std::basic_string_view<Unit> s;
};
/**
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
* @return an UnsafeUTFStringCodePoints&lt;Unit, CP32&gt;
* for the given std::basic_string_view&lt;Unit&gt;,
* deducing the Unit character type
* @draft ICU 78
*/
template<typename CP32, typename StringView>
auto unsafeUTFStringCodePoints(StringView s) {
return UnsafeUTFStringCodePoints<typename StringView::value_type, CP32>(s);
}
// ------------------------------------------------------------------------- ***
// TODO: remove experimental sample code
#ifndef UTYPES_H
// Experimental sample: returns the sum of the code points obtained by a range-based for loop
// over the char16_t string s, using the validating range with U_BEHAVIOR_NEGATIVE.
// NOTE(review): this listing appears to contain both the pre-change lines (explicit `range`
// object and loop over it) and the post-change line (loop over utfStringCodePoints()) of the
// commit; confirm against the actual header which of them are present.
int32_t rangeLoop16(std::u16string_view s) {
header::UTFStringCodePoints<char16_t, UChar32, U_BEHAVIOR_NEGATIVE> range(s);
int32_t sum = 0;
for (auto units : range) {
for (auto units : header::utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(s)) {
// Each element exposes the decoded code point via codePoint().
sum += units.codePoint();
}
return sum;
}
int32_t loopIterPlusPlus16(std::u16string_view s) {
header::UTFStringCodePoints<char16_t, UChar32, U_BEHAVIOR_NEGATIVE> range(s);
auto range = header::utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(s);
int32_t sum = 0;
auto iter = range.begin();
auto limit = range.end();
@ -1815,7 +1838,7 @@ int32_t loopIterPlusPlus16(std::u16string_view s) {
}
int32_t backwardLoop16(std::u16string_view s) {
header::UTFStringCodePoints<char16_t, UChar32, U_BEHAVIOR_NEGATIVE> range(s);
auto range = header::utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(s);
int32_t sum = 0;
auto start = range.begin();
auto iter = range.end();
@ -1826,7 +1849,7 @@ int32_t backwardLoop16(std::u16string_view s) {
}
int32_t reverseLoop16(std::u16string_view s) {
header::UTFStringCodePoints<char16_t, UChar32, U_BEHAVIOR_NEGATIVE> range(s);
auto range = header::utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(s);
int32_t sum = 0;
for (auto iter = range.rbegin(); iter != range.rend(); ++iter) {
sum += iter->codePoint();
@ -1835,16 +1858,15 @@ int32_t reverseLoop16(std::u16string_view s) {
}
// Experimental sample: returns the sum of the code points obtained by a range-based for loop
// over the char16_t string s, using the non-validating ("unsafe") range.
// NOTE(review): this listing appears to contain both the pre-change lines (explicit `range`
// object and loop over it) and the post-change line (loop over unsafeUTFStringCodePoints())
// of the commit; confirm against the actual header which of them are present.
int32_t unsafeRangeLoop16(std::u16string_view s) {
header::UnsafeUTFStringCodePoints<char16_t, UChar32> range(s);
int32_t sum = 0;
for (auto units : range) {
for (auto units : header::unsafeUTFStringCodePoints<UChar32>(s)) {
// Each element exposes the decoded code point via codePoint().
sum += units.codePoint();
}
return sum;
}
int32_t unsafeReverseLoop16(std::u16string_view s) {
header::UnsafeUTFStringCodePoints<char16_t, UChar32> range(s);
auto range = header::unsafeUTFStringCodePoints<UChar32>(s);
int32_t sum = 0;
for (auto iter = range.rbegin(); iter != range.rend(); ++iter) {
sum += iter->codePoint();
@ -1853,16 +1875,15 @@ int32_t unsafeReverseLoop16(std::u16string_view s) {
}
// Experimental sample: returns the sum of the code points obtained by a range-based for loop
// over the char string s, using the validating range with U_BEHAVIOR_NEGATIVE.
// NOTE(review): this listing appears to contain both the pre-change lines (explicit `range`
// object and loop over it) and the post-change line (loop over utfStringCodePoints()) of the
// commit; confirm against the actual header which of them are present.
int32_t rangeLoop8(std::string_view s) {
header::UTFStringCodePoints<char, UChar32, U_BEHAVIOR_NEGATIVE> range(s);
int32_t sum = 0;
for (auto units : range) {
for (auto units : header::utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(s)) {
// Each element exposes the decoded code point via codePoint().
sum += units.codePoint();
}
return sum;
}
int32_t reverseLoop8(std::string_view s) {
header::UTFStringCodePoints<char, UChar32, U_BEHAVIOR_NEGATIVE> range(s);
auto range = header::utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(s);
int32_t sum = 0;
for (auto iter = range.rbegin(); iter != range.rend(); ++iter) {
sum += iter->codePoint();
@ -1882,16 +1903,15 @@ int32_t macroLoop8(std::string_view s) {
}
// Experimental sample: returns the sum of the code points obtained by a range-based for loop
// over the char string s, using the non-validating ("unsafe") range.
// NOTE(review): this listing appears to contain both the pre-change lines (explicit `range`
// object and loop over it) and the post-change line (loop over unsafeUTFStringCodePoints())
// of the commit; confirm against the actual header which of them are present.
int32_t unsafeRangeLoop8(std::string_view s) {
header::UnsafeUTFStringCodePoints<char, UChar32> range(s);
int32_t sum = 0;
for (auto units : range) {
for (auto units : header::unsafeUTFStringCodePoints<UChar32>(s)) {
// Each element exposes the decoded code point via codePoint().
sum += units.codePoint();
}
return sum;
}
int32_t unsafeReverseLoop8(std::string_view s) {
header::UnsafeUTFStringCodePoints<char, UChar32> range(s);
auto range = header::unsafeUTFStringCodePoints<UChar32>(s);
int32_t sum = 0;
for (auto iter = range.rbegin(); iter != range.rend(); ++iter) {
sum += iter->codePoint();

View file

@ -22,6 +22,7 @@ using namespace std::string_view_literals;
using U_HEADER_ONLY_NAMESPACE::UTFIterator;
using U_HEADER_ONLY_NAMESPACE::UTFStringCodePoints;
using U_HEADER_ONLY_NAMESPACE::utfStringCodePoints;
// Shared state for one or more copies of single-pass iterators.
// Similar to https://en.cppreference.com/w/cpp/iterator/istreambuf_iterator
@ -141,7 +142,7 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam
void U16IteratorTest::testGood() {
std::u16string_view good(u"abçカ🚴"sv);
UTFStringCodePoints<char16_t, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
auto range = utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(good);
// TODO: Try to un-hardcode the iterator types in these checks via decltype.
assertTrue(
"bidirectional_iterator_tag",
@ -175,7 +176,7 @@ void U16IteratorTest::testGood() {
void U16IteratorTest::testNegative() {
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
std::u16string_view bad(badChars, 5);
UTFStringCodePoints<char16_t, UChar32, U_BEHAVIOR_NEGATIVE> range(bad);
auto range = utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(bad);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
@ -200,7 +201,7 @@ void U16IteratorTest::testNegative() {
void U16IteratorTest::testFFFD() {
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
std::u16string_view bad(badChars, 5);
UTFStringCodePoints<char16_t, char32_t, U_BEHAVIOR_FFFD> range(bad);
auto range = utfStringCodePoints<char32_t, U_BEHAVIOR_FFFD>(bad);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
@ -224,7 +225,7 @@ void U16IteratorTest::testFFFD() {
void U16IteratorTest::testSurrogate() {
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
std::u16string_view bad(badChars, 5);
UTFStringCodePoints<char16_t, uint32_t, U_BEHAVIOR_SURROGATE> range(bad);
auto range = utfStringCodePoints<uint32_t, U_BEHAVIOR_SURROGATE>(bad);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
@ -354,7 +355,7 @@ void U8IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name
void U8IteratorTest::testGood() {
std::string_view good(reinterpret_cast<const char*>(u8"abçカ🚴"));
UTFStringCodePoints<char, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
auto range = utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(good);
assertTrue(
"bidirectional_iterator_tag",
std::is_same_v<
@ -495,7 +496,7 @@ void U32IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam
void U32IteratorTest::testGood() {
std::u32string_view good(U"abçカ🚴"sv);
UTFStringCodePoints<char32_t, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
auto range = utfStringCodePoints<UChar32, U_BEHAVIOR_NEGATIVE>(good);
assertTrue(
"bidirectional_iterator_tag",
std::is_same_v<