mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-23004 UTF-32
This commit is contained in:
parent
74278fd566
commit
0c5fa181ff
3 changed files with 224 additions and 2 deletions
|
@ -89,11 +89,14 @@ namespace header {}
|
|||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
// Some defined behaviors for handling ill-formed Unicode strings.
|
||||
// TODO: For UTF-32, we have basically orthogonal conditions for surrogate vs. out-of-range.
|
||||
// Maybe make U_BEHAVIOR_SURROGATE return FFFD for out-of-range?
|
||||
typedef enum UIllFormedBehavior {
    // Ill-formed input is reported as a negative value rather than a code point.
    U_BEHAVIOR_NEGATIVE = 0,
    // Ill-formed input is reported as U+FFFD Replacement Character.
    U_BEHAVIOR_FFFD = 1,
    // Per encoding form:
    // - UTF-8: not allowed;
    // - UTF-16: the unpaired surrogate itself is returned;
    // - UTF-32: the surrogate code point is returned, or U+FFFD if the unit is out of range.
    U_BEHAVIOR_SURROGATE = 2
} UIllFormedBehavior;
|
||||
|
||||
|
@ -534,6 +537,87 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
// UTF-32: trivial, but still validating
|
||||
template<typename UnitIter, typename CP32, UIllFormedBehavior behavior>
|
||||
class UTFImpl<
|
||||
UnitIter,
|
||||
CP32,
|
||||
behavior,
|
||||
std::enable_if_t<
|
||||
sizeof(typename std::iterator_traits<UnitIter>::value_type) == 4>> {
|
||||
public:
|
||||
// Handle ill-formed UTF-32: Out of range.
|
||||
static inline CP32 sub() {
|
||||
switch (behavior) {
|
||||
case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
|
||||
case U_BEHAVIOR_FFFD:
|
||||
case U_BEHAVIOR_SURROGATE: return 0xfffd;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle ill-formed UTF-32: One unpaired surrogate.
|
||||
static inline CP32 subSurrogate(CP32 surrogate) {
|
||||
switch (behavior) {
|
||||
case U_BEHAVIOR_NEGATIVE: return U_SENTINEL;
|
||||
case U_BEHAVIOR_FFFD: return 0xfffd;
|
||||
case U_BEHAVIOR_SURROGATE: return surrogate;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void inc(UnitIter &p, UnitIter /*limit*/) {
|
||||
++p;
|
||||
}
|
||||
|
||||
static inline void dec(UnitIter /*start*/, UnitIter &p) {
|
||||
--p;
|
||||
}
|
||||
|
||||
static inline CodeUnits<UnitIter, CP32> readAndInc(UnitIter &p, UnitIter /*limit*/) {
|
||||
UnitIter p0 = p;
|
||||
uint32_t uc = *p;
|
||||
CP32 c = uc;
|
||||
++p;
|
||||
if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
|
||||
return {c, 1, true, p0};
|
||||
} else if (uc < 0xe000) {
|
||||
return {subSurrogate(c), 1, false, p0};
|
||||
} else {
|
||||
return {sub(), 1, false, p0};
|
||||
}
|
||||
}
|
||||
|
||||
static inline CodeUnits<UnitIter, CP32> singlePassReadAndInc(UnitIter &p, UnitIter /*limit*/) {
|
||||
uint32_t uc = *p;
|
||||
CP32 c = uc;
|
||||
++p;
|
||||
if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
|
||||
return {c, 1, true};
|
||||
} else if (uc < 0xe000) {
|
||||
return {subSurrogate(c), 1, false};
|
||||
} else {
|
||||
return {sub(), 1, false};
|
||||
}
|
||||
}
|
||||
|
||||
static inline CodeUnits<UnitIter, CP32> decAndRead(UnitIter /*start*/, UnitIter &p) {
|
||||
uint32_t uc = *--p;
|
||||
CP32 c = uc;
|
||||
if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
|
||||
return {c, 1, true, p};
|
||||
} else if (uc < 0xe000) {
|
||||
return {subSurrogate(c), 1, false, p};
|
||||
} else {
|
||||
return {sub(), 1, false, p};
|
||||
}
|
||||
}
|
||||
|
||||
static inline void moveToDecAndReadLimit(UnitIter &p, int8_t &state) {
|
||||
// state < 0 after decAndRead()
|
||||
++p;
|
||||
state = 0;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
|
|
@ -50,6 +50,7 @@ extern IntlTest *createStaticUnicodeSetsTest();
|
|||
static IntlTest *createUHashTest();
|
||||
extern IntlTest *createU16IteratorTest();
|
||||
extern IntlTest *createU8IteratorTest();
|
||||
extern IntlTest *createU32IteratorTest();
|
||||
|
||||
void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par )
|
||||
{
|
||||
|
@ -88,6 +89,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &
|
|||
TESTCASE_AUTO_CREATE_CLASS(USetHeaderOnlyTest);
|
||||
TESTCASE_AUTO_CREATE_CLASS(U16IteratorTest);
|
||||
TESTCASE_AUTO_CREATE_CLASS(U8IteratorTest);
|
||||
TESTCASE_AUTO_CREATE_CLASS(U32IteratorTest);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
|
|
@ -462,3 +462,139 @@ void U8IteratorTest::testFwdIter() {
|
|||
assertTrue("iter[4] * data()[3]", *data == u8"🚴"[3]);
|
||||
assertTrue("iter == endIter", iter == rangeLimit);
|
||||
}
|
||||
|
||||
class U32IteratorTest : public IntlTest {
|
||||
public:
|
||||
U32IteratorTest() {}
|
||||
|
||||
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override;
|
||||
|
||||
void testGood();
|
||||
void testNegative();
|
||||
void testFFFD();
|
||||
void testSinglePassIter();
|
||||
void testFwdIter();
|
||||
};
|
||||
|
||||
// Factory for the UTF-32 iterator test suite.
// Returns a newly allocated U32IteratorTest.
IntlTest *createU32IteratorTest() {
    IntlTest *suite = new U32IteratorTest();
    return suite;
}
|
||||
|
||||
void U32IteratorTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
|
||||
if(exec) {
|
||||
logln("TestSuite U32IteratorTest: ");
|
||||
}
|
||||
TESTCASE_AUTO_BEGIN;
|
||||
TESTCASE_AUTO(testGood);
|
||||
// TODO: TESTCASE_AUTO(testNegative);
|
||||
// TODO: TESTCASE_AUTO(testFFFD);
|
||||
TESTCASE_AUTO(testSinglePassIter);
|
||||
TESTCASE_AUTO(testFwdIter);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
void U32IteratorTest::testGood() {
|
||||
std::u32string_view good(U"abçカ🚴"sv);
|
||||
UTFStringCodePoints<char32_t, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
|
||||
assertTrue(
|
||||
"bidirectional_iterator_tag",
|
||||
std::is_same_v<
|
||||
typename std::iterator_traits<
|
||||
UTFIterator<char32_t *, UChar32, U_BEHAVIOR_NEGATIVE>>::iterator_category,
|
||||
std::bidirectional_iterator_tag>);
|
||||
auto iter = range.begin();
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
|
||||
assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
|
||||
++iter; // pre-increment
|
||||
auto units = *iter;
|
||||
assertEquals("iter[1] * codePoint", u'b', units.codePoint());
|
||||
assertEquals("iter[1] * length", 1, units.length());
|
||||
assertTrue("iter[1] * wellFormed", units.wellFormed());
|
||||
assertTrue("iter[1] * stringView()", units.stringView() == U"b"sv);
|
||||
++iter;
|
||||
assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment
|
||||
assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
|
||||
++iter;
|
||||
// Fetch the current code point twice.
|
||||
assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
|
||||
units = *iter++;
|
||||
assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
|
||||
assertEquals("iter[4] * length", 1, units.length());
|
||||
assertTrue("iter[4] * wellFormed", units.wellFormed());
|
||||
assertTrue("iter[4] * stringView()", units.stringView() == U"🚴"sv);
|
||||
assertTrue("iter == endIter", iter == range.end());
|
||||
}
|
||||
|
||||
void U32IteratorTest::testSinglePassIter() {
|
||||
SinglePassSource<char32_t> good(U"abçカ🚴"sv);
|
||||
SinglePassIter<char32_t> goodBegin(good);
|
||||
SinglePassIter<char32_t> goodLimit{};
|
||||
UTFIterator<SinglePassIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin(
|
||||
goodBegin, goodLimit);
|
||||
UTFIterator<SinglePassIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit);
|
||||
assertTrue(
|
||||
"input_iterator_tag",
|
||||
std::is_same_v<
|
||||
typename std::iterator_traits<
|
||||
UTFIterator<SinglePassIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE>>::iterator_category,
|
||||
std::input_iterator_tag>);
|
||||
auto iter = rangeBegin;
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
|
||||
assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
|
||||
++iter; // pre-increment
|
||||
auto units = *iter;
|
||||
assertEquals("iter[1] * codePoint", u'b', units.codePoint());
|
||||
assertEquals("iter[1] * length", 1, units.length());
|
||||
assertTrue("iter[1] * wellFormed", units.wellFormed());
|
||||
// No units.stringView() when the unit iterator is not a pointer.
|
||||
// No data() for a single-pass unit iterator.
|
||||
++iter;
|
||||
assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment
|
||||
assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
|
||||
++iter;
|
||||
// Fetch the current code point twice.
|
||||
assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
|
||||
units = *iter++;
|
||||
assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
|
||||
assertEquals("iter[4] * length", 1, units.length());
|
||||
assertTrue("iter[4] * wellFormed", units.wellFormed());
|
||||
assertTrue("iter == endIter", iter == rangeLimit);
|
||||
}
|
||||
|
||||
void U32IteratorTest::testFwdIter() {
|
||||
std::u32string_view good(U"abçカ🚴"sv);
|
||||
FwdIter<char32_t> goodBegin(good.data());
|
||||
FwdIter<char32_t> goodLimit(good.data() + good.length());
|
||||
UTFIterator<FwdIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE> rangeBegin(goodBegin, goodLimit);
|
||||
UTFIterator<FwdIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE> rangeLimit(goodLimit);
|
||||
// TODO: UTFStringCodePoints<FwdIter, UChar32, U_BEHAVIOR_NEGATIVE> range(good);
|
||||
assertTrue(
|
||||
"forward_iterator_tag",
|
||||
std::is_same_v<
|
||||
typename std::iterator_traits<
|
||||
UTFIterator<FwdIter<char32_t>, UChar32, U_BEHAVIOR_NEGATIVE>>::iterator_category,
|
||||
std::forward_iterator_tag>);
|
||||
auto iter = rangeBegin;
|
||||
assertEquals("iter[0] * codePoint", u'a', (*iter).codePoint());
|
||||
assertEquals("iter[0] -> codePoint", u'a', iter->codePoint());
|
||||
++iter; // pre-increment
|
||||
auto units = *iter;
|
||||
assertEquals("iter[1] * codePoint", u'b', units.codePoint());
|
||||
assertEquals("iter[1] * length", 1, units.length());
|
||||
assertTrue("iter[1] * wellFormed", units.wellFormed());
|
||||
// No units.stringView() when the unit iterator is not a pointer.
|
||||
assertTrue("iter[1] * data()[0]", *units.data() == u'b');
|
||||
++iter;
|
||||
assertEquals("iter[2] * codePoint", u'ç', (*iter++).codePoint()); // post-increment
|
||||
assertEquals("iter[3] -> codePoint", u'カ', iter->codePoint());
|
||||
++iter;
|
||||
// Fetch the current code point twice.
|
||||
assertEquals("iter[4.0] * codePoint", U'🚴', (*iter).codePoint());
|
||||
units = *iter++;
|
||||
assertEquals("iter[4] * codePoint", U'🚴', units.codePoint());
|
||||
assertEquals("iter[4] * length", 1, units.length());
|
||||
assertTrue("iter[4] * wellFormed", units.wellFormed());
|
||||
FwdIter<char32_t> data = units.data();
|
||||
assertTrue("iter[4] * data()[0]", *data == U"🚴"[0]);
|
||||
assertTrue("iter == endIter", iter == rangeLimit);
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue