template param: code point type

This commit is contained in:
Markus Scherer 2024-12-26 12:45:17 -08:00
parent 6851e8db32
commit 64ea1100ea
2 changed files with 30 additions and 30 deletions

View file

@ -29,30 +29,26 @@ namespace U_HEADER_ONLY_NAMESPACE {
//
// TODO: A possible alternative to an enum might be some kind of function template
// which would be fully customizable.
// The operator*() return value might then want to be a template parameter as well.
// For example, for a well-formed sequence, the return value could be
// a tuple of (code point, well-formed), or a string view, or...
// (And then the caller could choose between UChar32 and char32_t.)
// However, all of that would make the API more complex and daunting.
enum U16IllFormedBehavior {
    /** An unpaired surrogate is reported as a negative code point value (U_SENTINEL). */
    U16_BEHAVIOR_NEGATIVE,
    /** An unpaired surrogate is reported as U+FFFD. */
    U16_BEHAVIOR_FFFD,
    /** An unpaired surrogate is reported as its own surrogate code point value. */
    U16_BEHAVIOR_SURROGATE
};
// TODO: Consider a template parameter for UChar32 vs. char32_t vs. uint32_t.
/**
* A code unit sequence for one code point returned by U16Iterator.
* TODO: Share with UTF-8?
*
* TODO: check doxygen syntax for template parameters
* @param Unit16 char16_t or uint16_t or (on Windows) wchar_t
* @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
* @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
* should be signed if U16_BEHAVIOR_NEGATIVE
* @draft ICU 77
*/
template<typename Unit16>
template<typename Unit16, typename CP32>
struct U16OneSeq {
// Order of fields with padding and access frequency in mind.
UChar32 codePoint = 0;
CP32 codePoint = 0;
uint8_t length = 0;
bool isWellFormed = false;
const Unit16 *data;
@ -61,18 +57,20 @@ struct U16OneSeq {
return std::basic_string_view<Unit16>(data, length);
}
// TODO: std::optional<UChar32> maybeCodePoint() const ? (nullopt if !isWellFormed)
// TODO: std::optional<CP32> maybeCodePoint() const ? (nullopt if !isWellFormed)
};
/**
* Validating iterator over the code points in a Unicode 16-bit string.
*
* TODO: check doxygen syntax for template parameters
* @param Unit16 char16_t or uint16_t or (on Windows) wchar_t
* @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
* @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
* should be signed if U16_BEHAVIOR_NEGATIVE
* @param behavior How to report ill-formed UTF-16 (an unpaired surrogate); one of the U16IllFormedBehavior values
* @draft ICU 77
*/
template<typename Unit16, U16IllFormedBehavior behavior>
template<typename Unit16, typename CP32, U16IllFormedBehavior behavior>
class U16Iterator {
public:
// TODO: make private, make friends
@ -86,10 +84,10 @@ public:
bool operator==(const U16Iterator &other) const { return p == other.p; }
bool operator!=(const U16Iterator &other) const { return !operator==(other); }
U16OneSeq<Unit16> operator*() const {
U16OneSeq<Unit16, CP32> operator*() const {
// TODO: assert p != limit -- more precisely: start <= p < limit
// Similar to U16_NEXT_OR_FFFD().
UChar32 c = *p;
CP32 c = *p;
if (!U16_IS_SURROGATE(c)) {
return {c, 1, true, p};
} else {
@ -118,7 +116,7 @@ public:
// More similar to U16_NEXT_OR_FFFD() than U16_FWD_1() to try to help the compiler
// amortize work between operator*() and operator++(int) in typical *it++ usage.
// Otherwise this is slightly less efficient because it tests a lead surrogate twice.
UChar32 c = *p++;
CP32 c = *p++;
if (U16_IS_SURROGATE(c) &&
U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
++p;
@ -126,9 +124,13 @@ public:
return result;
}
// TODO: operator--()
// TODO: maybe fused readAndInc()?
// TODO: maybe fused decAndRead()?
private:
// Handle ill-formed UTF-16: One unpaired surrogate.
UChar32 sub(UChar32 surrogate) const {
CP32 sub(CP32 surrogate) const {
switch (behavior) {
case U16_BEHAVIOR_NEGATIVE: return U_SENTINEL;
case U16_BEHAVIOR_FFFD: return 0xfffd;
@ -149,7 +151,7 @@ private:
* @return a code point iterator.
* @draft ICU 77
*/
template<typename Unit16, U16IllFormedBehavior behavior>
template<typename Unit16, typename CP32, U16IllFormedBehavior behavior>
class U16StringCodePoints {
public:
/**
@ -162,12 +164,12 @@ public:
U16StringCodePoints(const U16StringCodePoints &other) = default;
/** @draft ICU 77 */
U16Iterator<Unit16, behavior> begin() const {
U16Iterator<Unit16, CP32, behavior> begin() const {
return {s.data(), s.data(), s.data() + s.length()};
}
/** @draft ICU 77 */
U16Iterator<Unit16, behavior> end() const {
U16Iterator<Unit16, CP32, behavior> end() const {
const Unit16 *limit = s.data() + s.length();
return {s.data(), limit, limit};
}
@ -183,8 +185,6 @@ private:
// template<typename Unit16>
// class U16UnsafeIterator
// TODO: only p, no start, no limit
// TODO: can/should we read the code point only in operator*()?
// if we read it in the constructor, then we would still need start/limit...
} // namespace U_HEADER_ONLY_NAMESPACE

View file

@ -58,11 +58,11 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam
void U16IteratorTest::testGood() {
IcuTestErrorCode errorCode(*this, "testGood");
std::u16string_view good(u"abçカ🚴"sv);
U16StringCodePoints<char16_t, U16_BEHAVIOR_NEGATIVE> range(good);
U16StringCodePoints<char16_t, UChar32, U16_BEHAVIOR_NEGATIVE> range(good);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
++iter; // pre-increment
U16OneSeq<char16_t> seq = *iter;
auto seq = *iter;
assertEquals("iter[1] * codePoint", u'b', seq.codePoint);
assertEquals("iter[1] * length", 1, seq.length);
assertTrue("iter[1] * isWellFormed", seq.isWellFormed);
@ -83,11 +83,11 @@ void U16IteratorTest::testNegative() {
IcuTestErrorCode errorCode(*this, "testNegative");
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
std::u16string_view bad(badChars, 5);
U16StringCodePoints<char16_t, U16_BEHAVIOR_NEGATIVE> range(bad);
U16StringCodePoints<char16_t, UChar32, U16_BEHAVIOR_NEGATIVE> range(bad);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
++iter; // pre-increment
U16OneSeq<char16_t> seq = *iter;
auto seq = *iter;
assertEquals("iter[1] * codePoint", -1, seq.codePoint);
assertEquals("iter[1] * length", 1, seq.length);
assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
@ -107,11 +107,11 @@ void U16IteratorTest::testFFFD() {
IcuTestErrorCode errorCode(*this, "testFFFD");
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
std::u16string_view bad(badChars, 5);
U16StringCodePoints<char16_t, U16_BEHAVIOR_FFFD> range(bad);
U16StringCodePoints<char16_t, char32_t, U16_BEHAVIOR_FFFD> range(bad);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
++iter; // pre-increment
U16OneSeq<char16_t> seq = *iter;
auto seq = *iter;
assertEquals("iter[1] * codePoint", 0xfffd, seq.codePoint);
assertEquals("iter[1] * length", 1, seq.length);
assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
@ -131,11 +131,11 @@ void U16IteratorTest::testSurrogate() {
IcuTestErrorCode errorCode(*this, "testSurrogate");
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
std::u16string_view bad(badChars, 5);
U16StringCodePoints<char16_t, U16_BEHAVIOR_SURROGATE> range(bad);
U16StringCodePoints<char16_t, uint32_t, U16_BEHAVIOR_SURROGATE> range(bad);
auto iter = range.begin();
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
++iter; // pre-increment
U16OneSeq<char16_t> seq = *iter;
auto seq = *iter;
assertEquals("iter[1] * codePoint", 0xd900, seq.codePoint);
assertEquals("iter[1] * length", 1, seq.length);
assertFalse("iter[1] * isWellFormed", seq.isWellFormed);