mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 04:29:31 +00:00
template param: code point type
This commit is contained in:
parent
6851e8db32
commit
64ea1100ea
2 changed files with 30 additions and 30 deletions
|
@ -29,30 +29,26 @@ namespace U_HEADER_ONLY_NAMESPACE {
|
|||
//
|
||||
// TODO: A possible alternative to an enum might be some kind of function template
|
||||
// which would be fully customizable.
|
||||
// The operator*() return value might then want to be a template parameter as well.
|
||||
// For example, for a well-formed sequence, the return value could be
|
||||
// a tuple of (code point, well-formed), or a string view, or...
|
||||
// (And then the caller could choose between UChar32 and char32_t.)
|
||||
// However, all of that would make the API more complex and daunting.
|
||||
// Selects which code point value the validating UTF-16 iterator reports
// when it encounters ill-formed input, that is, one unpaired surrogate.
// Passed as a non-type template argument to U16Iterator / U16StringCodePoints.
enum U16IllFormedBehavior {
|
||||
// Report a negative sentinel (U_SENTINEL) for an unpaired surrogate.
// The code point type should be signed when this behavior is selected.
U16_BEHAVIOR_NEGATIVE,
|
||||
// Report U+FFFD (0xfffd) for an unpaired surrogate.
U16_BEHAVIOR_FFFD,
|
||||
// Report the unpaired surrogate's own value as the code point
// (for example, a lone 0xd900 is reported as 0xd900).
U16_BEHAVIOR_SURROGATE
|
||||
};
|
||||
|
||||
// TODO: Consider a template parameter for UChar32 vs. char32_t vs. uint32_t.
|
||||
|
||||
/**
|
||||
* A code unit sequence for one code point returned by U16Iterator.
|
||||
* TODO: Share with UTF-8?
|
||||
*
|
||||
* TODO: check doxygen syntax for template parameters
|
||||
* @param Unit16 char16_t or uint16_t or (on Windows) wchar_t
|
||||
* @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
|
||||
* @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
|
||||
* should be signed if U16_BEHAVIOR_NEGATIVE
|
||||
* @draft ICU 77
|
||||
*/
|
||||
template<typename Unit16>
|
||||
template<typename Unit16, typename CP32>
|
||||
struct U16OneSeq {
|
||||
// Order of fields with padding and access frequency in mind.
|
||||
UChar32 codePoint = 0;
|
||||
CP32 codePoint = 0;
|
||||
uint8_t length = 0;
|
||||
bool isWellFormed = false;
|
||||
const Unit16 *data;
|
||||
|
@ -61,18 +57,20 @@ struct U16OneSeq {
|
|||
return std::basic_string_view<Unit16>(data, length);
|
||||
}
|
||||
|
||||
// TODO: std::optional<UChar32> maybeCodePoint() const ? (nullopt if !isWellFormed)
|
||||
// TODO: std::optional<CP32> maybeCodePoint() const ? (nullopt if !isWellFormed)
|
||||
};
|
||||
|
||||
/**
|
||||
* Validating iterator over the code points in a Unicode 16-bit string.
|
||||
*
|
||||
* TODO: check doxygen syntax for template parameters
|
||||
* @param Unit16 char16_t or uint16_t or (on Windows) wchar_t
|
||||
* @param Unit16 Code unit type: char16_t or uint16_t or (on Windows) wchar_t
|
||||
* @param CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
|
||||
* should be signed if U16_BEHAVIOR_NEGATIVE
|
||||
* @param U16IllFormedBehavior TODO
|
||||
* @draft ICU 77
|
||||
*/
|
||||
template<typename Unit16, U16IllFormedBehavior behavior>
|
||||
template<typename Unit16, typename CP32, U16IllFormedBehavior behavior>
|
||||
class U16Iterator {
|
||||
public:
|
||||
// TODO: make private, make friends
|
||||
|
@ -86,10 +84,10 @@ public:
|
|||
bool operator==(const U16Iterator &other) const { return p == other.p; }
|
||||
bool operator!=(const U16Iterator &other) const { return !operator==(other); }
|
||||
|
||||
U16OneSeq<Unit16> operator*() const {
|
||||
U16OneSeq<Unit16, CP32> operator*() const {
|
||||
// TODO: assert p != limit -- more precisely: start <= p < limit
|
||||
// Similar to U16_NEXT_OR_FFFD().
|
||||
UChar32 c = *p;
|
||||
CP32 c = *p;
|
||||
if (!U16_IS_SURROGATE(c)) {
|
||||
return {c, 1, true, p};
|
||||
} else {
|
||||
|
@ -118,7 +116,7 @@ public:
|
|||
// More similar to U16_NEXT_OR_FFFD() than U16_FWD_1() to try to help the compiler
|
||||
// amortize work between operator*() and operator++(int) in typical *it++ usage.
|
||||
// Otherwise this is slightly less efficient because it tests a lead surrogate twice.
|
||||
UChar32 c = *p++;
|
||||
CP32 c = *p++;
|
||||
if (U16_IS_SURROGATE(c) &&
|
||||
U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
|
||||
++p;
|
||||
|
@ -126,9 +124,13 @@ public:
|
|||
return result;
|
||||
}
|
||||
|
||||
// TODO: operator--()
|
||||
// TODO: maybe fused readAndInc()?
|
||||
// TODO: maybe fused decAndRead()?
|
||||
|
||||
private:
|
||||
// Handle ill-formed UTF-16: One unpaired surrogate.
|
||||
UChar32 sub(UChar32 surrogate) const {
|
||||
CP32 sub(CP32 surrogate) const {
|
||||
switch (behavior) {
|
||||
case U16_BEHAVIOR_NEGATIVE: return U_SENTINEL;
|
||||
case U16_BEHAVIOR_FFFD: return 0xfffd;
|
||||
|
@ -149,7 +151,7 @@ private:
|
|||
* @return a code point iterator.
|
||||
* @draft ICU 77
|
||||
*/
|
||||
template<typename Unit16, U16IllFormedBehavior behavior>
|
||||
template<typename Unit16, typename CP32, U16IllFormedBehavior behavior>
|
||||
class U16StringCodePoints {
|
||||
public:
|
||||
/**
|
||||
|
@ -162,12 +164,12 @@ public:
|
|||
U16StringCodePoints(const U16StringCodePoints &other) = default;
|
||||
|
||||
/** @draft ICU 77 */
|
||||
U16Iterator<Unit16, behavior> begin() const {
|
||||
U16Iterator<Unit16, CP32, behavior> begin() const {
|
||||
return {s.data(), s.data(), s.data() + s.length()};
|
||||
}
|
||||
|
||||
/** @draft ICU 77 */
|
||||
U16Iterator<Unit16, behavior> end() const {
|
||||
U16Iterator<Unit16, CP32, behavior> end() const {
|
||||
const Unit16 *limit = s.data() + s.length();
|
||||
return {s.data(), limit, limit};
|
||||
}
|
||||
|
@ -183,8 +185,6 @@ private:
|
|||
// template<typename Unit16>
|
||||
// class U16UnsafeIterator
|
||||
// TODO: only p, no start, no limit
|
||||
// TODO: can/should we read the code point only in operator*()?
|
||||
// if we read it in the constructor, then we would still need start/limit...
|
||||
|
||||
} // namespace U_HEADER_ONLY_NAMESPACE
|
||||
|
||||
|
|
|
@ -58,11 +58,11 @@ void U16IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&nam
|
|||
void U16IteratorTest::testGood() {
|
||||
IcuTestErrorCode errorCode(*this, "testGood");
|
||||
std::u16string_view good(u"abçカ🚴"sv);
|
||||
U16StringCodePoints<char16_t, U16_BEHAVIOR_NEGATIVE> range(good);
|
||||
U16StringCodePoints<char16_t, UChar32, U16_BEHAVIOR_NEGATIVE> range(good);
|
||||
auto iter = range.begin();
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
|
||||
++iter; // pre-increment
|
||||
U16OneSeq<char16_t> seq = *iter;
|
||||
auto seq = *iter;
|
||||
assertEquals("iter[1] * codePoint", u'b', seq.codePoint);
|
||||
assertEquals("iter[1] * length", 1, seq.length);
|
||||
assertTrue("iter[1] * isWellFormed", seq.isWellFormed);
|
||||
|
@ -83,11 +83,11 @@ void U16IteratorTest::testNegative() {
|
|||
IcuTestErrorCode errorCode(*this, "testNegative");
|
||||
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
|
||||
std::u16string_view bad(badChars, 5);
|
||||
U16StringCodePoints<char16_t, U16_BEHAVIOR_NEGATIVE> range(bad);
|
||||
U16StringCodePoints<char16_t, UChar32, U16_BEHAVIOR_NEGATIVE> range(bad);
|
||||
auto iter = range.begin();
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
|
||||
++iter; // pre-increment
|
||||
U16OneSeq<char16_t> seq = *iter;
|
||||
auto seq = *iter;
|
||||
assertEquals("iter[1] * codePoint", -1, seq.codePoint);
|
||||
assertEquals("iter[1] * length", 1, seq.length);
|
||||
assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
|
||||
|
@ -107,11 +107,11 @@ void U16IteratorTest::testFFFD() {
|
|||
IcuTestErrorCode errorCode(*this, "testFFFD");
|
||||
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
|
||||
std::u16string_view bad(badChars, 5);
|
||||
U16StringCodePoints<char16_t, U16_BEHAVIOR_FFFD> range(bad);
|
||||
U16StringCodePoints<char16_t, char32_t, U16_BEHAVIOR_FFFD> range(bad);
|
||||
auto iter = range.begin();
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
|
||||
++iter; // pre-increment
|
||||
U16OneSeq<char16_t> seq = *iter;
|
||||
auto seq = *iter;
|
||||
assertEquals("iter[1] * codePoint", 0xfffd, seq.codePoint);
|
||||
assertEquals("iter[1] * length", 1, seq.length);
|
||||
assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
|
||||
|
@ -131,11 +131,11 @@ void U16IteratorTest::testSurrogate() {
|
|||
IcuTestErrorCode errorCode(*this, "testSurrogate");
|
||||
static const char16_t badChars[] = { u'a', 0xd900, u'b', 0xdc05, u'ç' };
|
||||
std::u16string_view bad(badChars, 5);
|
||||
U16StringCodePoints<char16_t, U16_BEHAVIOR_SURROGATE> range(bad);
|
||||
U16StringCodePoints<char16_t, uint32_t, U16_BEHAVIOR_SURROGATE> range(bad);
|
||||
auto iter = range.begin();
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint);
|
||||
++iter; // pre-increment
|
||||
U16OneSeq<char16_t> seq = *iter;
|
||||
auto seq = *iter;
|
||||
assertEquals("iter[1] * codePoint", 0xd900, seq.codePoint);
|
||||
assertEquals("iter[1] * length", 1, seq.length);
|
||||
assertFalse("iter[1] * isWellFormed", seq.isWellFormed);
|
||||
|
|
Loading…
Add table
Reference in a new issue