ICU-20984 StringPiece & ByteSink overloads for char8_t*

This commit is contained in:
Markus Scherer 2020-03-12 19:21:24 -07:00
parent d2d144a5bd
commit 524748c6bf
11 changed files with 258 additions and 77 deletions

View file

@ -687,13 +687,13 @@ void toUpper(uint32_t options,
if (change) {
ByteSinkUtil::appendTwoBytes(upper, sink);
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
sink.Append(reinterpret_cast<const char*>(u8"\u0308"), 2); // restore or add a dialytika
sink.AppendU8(u8"\u0308", 2); // restore or add a dialytika
}
if (addTonos) {
sink.Append(reinterpret_cast<const char*>(u8"\u0301"), 2);
sink.AppendU8(u8"\u0301", 2);
}
while (numYpogegrammeni > 0) {
sink.Append(reinterpret_cast<const char*>(u8"\u0399"), 2);
sink.AppendU8(u8"\u0399", 2);
--numYpogegrammeni;
}
}

View file

@ -71,6 +71,40 @@ public:
*/
virtual void Append(const char* bytes, int32_t n) = 0;
#ifndef U_HIDE_DRAFT_API
/**
* Appends n bytes to this. Same as Append().
* Call AppendU8() with u8"string literals" which are const char * in C++11
* but const char8_t * in C++20.
* If the compiler does support char8_t as a distinct type,
* then an AppendU8() overload for that is defined and will be chosen.
*
* @param bytes the pointer to the bytes
* @param n the number of bytes; must be non-negative
* @draft ICU 67
*/
inline void AppendU8(const char* bytes, int32_t n) {
// Non-virtual convenience wrapper: the bytes are handed to the virtual
// Append() unchanged, so output still goes through the subclass's Append().
Append(bytes, n);
}
#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
/**
* Appends n bytes to this. Same as Append() but for a const char8_t * pointer.
* Call AppendU8() with u8"string literals" which are const char * in C++11
* but const char8_t * in C++20.
* If the compiler does support char8_t as a distinct type,
* then this AppendU8() overload for that is defined and will be chosen.
*
* @param bytes the pointer to the bytes
* @param n the number of bytes; must be non-negative
* @draft ICU 67
*/
inline void AppendU8(const char8_t* bytes, int32_t n) {
// Only the pointer type changes; the same n bytes are passed on to Append().
Append(reinterpret_cast<const char*>(bytes), n);
}
#endif
#endif // U_HIDE_DRAFT_API
/**
* Returns a writable buffer for appending and writes the buffer's capacity to
* *result_capacity. Guarantees *result_capacity>=min_capacity.

View file

@ -67,19 +67,50 @@ class U_COMMON_API StringPiece : public UMemory {
* Default constructor, creates an empty StringPiece.
* @stable ICU 4.2
*/
StringPiece() : ptr_(NULL), length_(0) { }
StringPiece() : ptr_(nullptr), length_(0) { }
/**
* Constructs from a NUL-terminated const char * pointer.
* @param str a NUL-terminated const char * pointer
* @stable ICU 4.2
*/
StringPiece(const char* str);
#ifndef U_HIDE_DRAFT_API
#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
/**
* Constructs from a NUL-terminated const char8_t * pointer.
* Delegates to the const char * constructor after casting the pointer type.
* @param str a NUL-terminated const char8_t * pointer
* @draft ICU 67
*/
StringPiece(const char8_t* str) : StringPiece(reinterpret_cast<const char*>(str)) {}
#endif
/**
* Constructs an empty StringPiece.
* Needed for type disambiguation from multiple other overloads:
* a bare nullptr argument could otherwise convert to more than one of the
* pointer parameter types (const char *, and const char8_t * where available).
* The resulting piece has a null data pointer and length 0.
* @param p nullptr
* @draft ICU 67
*/
StringPiece(std::nullptr_t p) : ptr_(p), length_(0) {}
#endif // U_HIDE_DRAFT_API
/**
* Constructs from a std::string.
* Stores str.data() and str.size(); the characters are not copied.
* @param str the std::string whose contents this piece refers to
* @stable ICU 4.2
*/
StringPiece(const std::string& str)
// NOTE(review): size() is narrowed to int32_t without a range check;
// presumably callers never pass strings longer than INT32_MAX — confirm.
: ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { }
#ifndef U_HIDE_DRAFT_API
#if defined(__cpp_lib_char8_t) || defined(U_IN_DOXYGEN)
/**
* Constructs from a std::u8string.
* Stores str.data() (viewed as const char *) and str.size();
* the characters are not copied.
* @param str the std::u8string whose contents this piece refers to
* @draft ICU 67
*/
StringPiece(const std::u8string& str)
// NOTE(review): size() is narrowed to int32_t without a range check;
// presumably callers never pass strings longer than INT32_MAX — confirm.
: ptr_(reinterpret_cast<const char*>(str.data())),
length_(static_cast<int32_t>(str.size())) { }
#endif
#endif // U_HIDE_DRAFT_API
#ifndef U_HIDE_DRAFT_API
/**
* Constructs from some other implementation of a string piece class, from any
@ -88,7 +119,7 @@ class U_COMMON_API StringPiece : public UMemory {
* \code{.cpp}
*
* struct OtherStringPieceClass {
* const char* data();
* const char* data(); // or const char8_t*
* size_t size();
* };
*
@ -97,16 +128,25 @@ class U_COMMON_API StringPiece : public UMemory {
* The other string piece class will typically be std::string_view from C++17
* or absl::string_view from Abseil.
*
* Starting with C++20, data() may also return a const char8_t* pointer,
* as from std::u8string_view.
*
* @param str the other string piece
* @draft ICU 65
*/
template <typename T,
typename = typename std::enable_if<
std::is_same<decltype(T().data()), const char*>::value &&
(std::is_same<decltype(T().data()), const char*>::value
#if defined(__cpp_char8_t)
|| std::is_same<decltype(T().data()), const char8_t*>::value
#endif
) &&
std::is_same<decltype(T().size()), size_t>::value>::type>
StringPiece(T str)
: ptr_(str.data()), length_(static_cast<int32_t>(str.size())) {}
: ptr_(reinterpret_cast<const char*>(str.data())),
length_(static_cast<int32_t>(str.size())) {}
#endif // U_HIDE_DRAFT_API
/**
* Constructs from a const char * pointer and a specified length.
* @param offset a const char * pointer (need not be terminated)
@ -114,6 +154,19 @@ class U_COMMON_API StringPiece : public UMemory {
* @stable ICU 4.2
*/
StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { }
#ifndef U_HIDE_DRAFT_API
#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
/**
* Constructs from a const char8_t * pointer and a specified length.
* Delegates to the (const char *, int32_t) constructor after casting the pointer type.
* @param str a const char8_t * pointer (need not be terminated)
* @param len the length of the string; must be non-negative
* @draft ICU 67
*/
StringPiece(const char8_t* str, int32_t len) :
StringPiece(reinterpret_cast<const char*>(str), len) {}
#endif
#endif // U_HIDE_DRAFT_API
/**
* Substring of another StringPiece.
* @param x the other StringPiece
@ -132,7 +185,7 @@ class U_COMMON_API StringPiece : public UMemory {
StringPiece(const StringPiece& x, int32_t pos, int32_t len);
/**
* Returns the string pointer. May be NULL if it is empty.
* Returns the string pointer. May be nullptr if it is empty.
*
* data() may return a pointer to a buffer with embedded NULs, and the
* returned buffer may or may not be null terminated. Therefore it is
@ -165,7 +218,7 @@ class U_COMMON_API StringPiece : public UMemory {
* Sets to an empty string.
* @stable ICU 4.2
*/
void clear() { ptr_ = NULL; length_ = 0; }
void clear() { ptr_ = nullptr; length_ = 0; }
/**
* Reset the stringpiece to refer to new data.
@ -182,6 +235,29 @@ class U_COMMON_API StringPiece : public UMemory {
*/
void set(const char* str);
#ifndef U_HIDE_DRAFT_API
#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
/**
* Resets the stringpiece to refer to new data.
* Forwards to the const char * overload of set() after casting the pointer type.
* @param xdata pointer to the new string data. Need not be NUL-terminated.
* @param len the length of the new data
* @draft ICU 67
*/
inline void set(const char8_t* xdata, int32_t len) {
set(reinterpret_cast<const char*>(xdata), len);
}
/**
* Resets the stringpiece to refer to new data.
* Forwards to the const char * overload of set() after casting the pointer type.
* @param str a pointer to a NUL-terminated string.
* @draft ICU 67
*/
inline void set(const char8_t* str) {
set(reinterpret_cast<const char*>(str));
}
#endif
#endif // U_HIDE_DRAFT_API
/**
* Removes the first n string units.
* @param n prefix length, must be non-negative and <=length()

View file

@ -22,6 +22,7 @@
#include "unicode/sortkey.h"
#include "unicode/std_string.h"
#include "unicode/strenum.h"
#include "unicode/stringpiece.h"
#include "unicode/tblcoll.h"
#include "unicode/uiter.h"
#include "unicode/uniset.h"
@ -293,15 +294,15 @@ void CollationTest::TestIllegalUTF8() {
}
coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
static const char *strings[] = {
static const StringPiece strings[] = {
// string with U+FFFD == illegal byte sequence
reinterpret_cast<const char*>(u8"a\uFFFDz"), reinterpret_cast<const char*>("a\x80z"), // trail byte
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xc1\x81z"), // non-shortest form
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xe0\x82\x83z"), // non-shortest form
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xed\xa0\x80z"), // lead surrogate: would be U+D800
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xed\xbf\xbfz"), // trail surrogate: would be U+DFFF
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xf0\x8f\xbf\xbfz"), // non-shortest form
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xf4\x90\x80\x80z") // out of range: would be U+110000
u8"a\uFFFDz", "a\x80z", // trail byte
u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
};
for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {

View file

@ -23,6 +23,7 @@
typedef struct ExpectedResult {
double value;
// Invariant characters, will be converted to UTF-16 and then unescaped.
const char *expected;
} ExpectedResult;
@ -185,38 +186,38 @@ static ExpectedResult kChineseCurrencyTestData[] = {
{123456789012345.0, "\\u00A5120\\u4E07\\u4EBF"},
};
static ExpectedResult kGermanCurrencyTestData[] = {
{1.0, reinterpret_cast<const char*>(u8"1\\u00A0\\u20AC")},
{12.0, reinterpret_cast<const char*>(u8"12\\u00A0\\u20AC")},
{123.0, reinterpret_cast<const char*>(u8"120\\u00A0\\u20AC")},
{1234.0, reinterpret_cast<const char*>(u8"1200\\u00A0\\u20AC")},
{12345.0, reinterpret_cast<const char*>(u8"12.000\\u00A0\\u20AC")},
{123456.0, reinterpret_cast<const char*>(u8"120.000\\u00A0\\u20AC")},
{1234567.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Mio.\\u00A0\\u20AC")},
{12345678.0, reinterpret_cast<const char*>(u8"12\\u00A0Mio.\\u00A0\\u20AC")},
{123456789.0, reinterpret_cast<const char*>(u8"120\\u00A0Mio.\\u00A0\\u20AC")},
{1234567890.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Mrd.\\u00A0\\u20AC")},
{12345678901.0, reinterpret_cast<const char*>(u8"12\\u00A0Mrd.\\u00A0\\u20AC")},
{123456789012.0, reinterpret_cast<const char*>(u8"120\\u00A0Mrd.\\u00A0\\u20AC")},
{1234567890123.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Bio.\\u00A0\\u20AC")},
{12345678901234.0, reinterpret_cast<const char*>(u8"12\\u00A0Bio.\\u00A0\\u20AC")},
{123456789012345.0, reinterpret_cast<const char*>(u8"120\\u00A0Bio.\\u00A0\\u20AC")},
{1.0, "1\\u00A0\\u20AC"},
{12.0, "12\\u00A0\\u20AC"},
{123.0, "120\\u00A0\\u20AC"},
{1234.0, "1200\\u00A0\\u20AC"},
{12345.0, "12.000\\u00A0\\u20AC"},
{123456.0, "120.000\\u00A0\\u20AC"},
{1234567.0, "1,2\\u00A0Mio.\\u00A0\\u20AC"},
{12345678.0, "12\\u00A0Mio.\\u00A0\\u20AC"},
{123456789.0, "120\\u00A0Mio.\\u00A0\\u20AC"},
{1234567890.0, "1,2\\u00A0Mrd.\\u00A0\\u20AC"},
{12345678901.0, "12\\u00A0Mrd.\\u00A0\\u20AC"},
{123456789012.0, "120\\u00A0Mrd.\\u00A0\\u20AC"},
{1234567890123.0, "1,2\\u00A0Bio.\\u00A0\\u20AC"},
{12345678901234.0, "12\\u00A0Bio.\\u00A0\\u20AC"},
{123456789012345.0, "120\\u00A0Bio.\\u00A0\\u20AC"},
};
static ExpectedResult kEnglishCurrencyTestData[] = {
{1.0, reinterpret_cast<const char*>(u8"$1")},
{12.0, reinterpret_cast<const char*>(u8"$12")},
{123.0, reinterpret_cast<const char*>(u8"$120")},
{1234.0, reinterpret_cast<const char*>(u8"$1.2K")},
{12345.0, reinterpret_cast<const char*>(u8"$12K")},
{123456.0, reinterpret_cast<const char*>(u8"$120K")},
{1234567.0, reinterpret_cast<const char*>(u8"$1.2M")},
{12345678.0, reinterpret_cast<const char*>(u8"$12M")},
{123456789.0, reinterpret_cast<const char*>(u8"$120M")},
{1234567890.0, reinterpret_cast<const char*>(u8"$1.2B")},
{12345678901.0, reinterpret_cast<const char*>(u8"$12B")},
{123456789012.0, reinterpret_cast<const char*>(u8"$120B")},
{1234567890123.0, reinterpret_cast<const char*>(u8"$1.2T")},
{12345678901234.0, reinterpret_cast<const char*>(u8"$12T")},
{123456789012345.0, reinterpret_cast<const char*>(u8"$120T")},
{1.0, "$1"},
{12.0, "$12"},
{123.0, "$120"},
{1234.0, "$1.2K"},
{12345.0, "$12K"},
{123456.0, "$120K"},
{1234567.0, "$1.2M"},
{12345678.0, "$12M"},
{123456789.0, "$120M"},
{1234567890.0, "$1.2B"},
{12345678901.0, "$12B"},
{123456789012.0, "$120B"},
{1234567890123.0, "$1.2T"},
{12345678901234.0, "$12T"},
{123456789012345.0, "$120T"},
};

View file

@ -31,6 +31,7 @@
#include "unicode/localpointer.h"
#include "unicode/regex.h"
#include "unicode/stringpiece.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
@ -5838,11 +5839,11 @@ void RegexTest::TestBug12884() {
REGEX_ASSERT(status == U_REGEX_TIME_OUT);
// UText, wrapping non-UTF-16 text, also takes a different execution path.
const char *text8 = reinterpret_cast<const char*>(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
StringPiece text8(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
"carácter, sin importar la plataforma, sin importar el programa,"
"sin importar el idioma.");
status = U_ZERO_ERROR;
LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status));
REGEX_CHECK_STATUS;
m.reset(ut.getAlias());
m.find(status);

View file

@ -1314,7 +1314,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() {
Edits edits;
int32_t length = CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT,
reinterpret_cast<const char*>(u8"IstanBul"), 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
reinterpret_cast<const char*>(u8"IstanBul"), 8,
dest, UPRV_LENGTHOF(dest), &edits, errorCode);
assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"),
UnicodeString::fromUTF8(StringPiece(dest, length)));
static const EditChange lowerExpectedChanges[] = {
@ -1330,7 +1331,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() {
edits.reset();
length = CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT,
reinterpret_cast<const char*>(u8"Πατάτα"), 6 * 2, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
reinterpret_cast<const char*>(u8"Πατάτα"), 6 * 2,
dest, UPRV_LENGTHOF(dest), &edits, errorCode);
assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"),
UnicodeString::fromUTF8(StringPiece(dest, length)));
static const EditChange upperExpectedChanges[] = {
@ -1370,7 +1372,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() {
// No explicit nor automatic edits.reset(). Edits should be appended.
length = CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_EDITS_NO_RESET |
U_FOLD_CASE_EXCLUDE_SPECIAL_I,
reinterpret_cast<const char*>(u8"IßtanBul"), 1 + 2 + 6, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
reinterpret_cast<const char*>(u8"IßtanBul"), 1 + 2 + 6,
dest, UPRV_LENGTHOF(dest), &edits, errorCode);
assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"),
UnicodeString::fromUTF8(StringPiece(dest, length)));
static const EditChange foldExpectedChanges[] = {
@ -1454,44 +1457,44 @@ void StringCaseTest::TestCaseMapUTF8ToString() {
StringByteSink<std::string> sink(&dest);
// Omit unchanged text.
CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, reinterpret_cast<const char*>(u8"IstanBul"), sink, nullptr, errorCode);
CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, u8"IstanBul", sink, nullptr, errorCode);
assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), UnicodeString::fromUTF8(dest));
dest.clear();
CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, reinterpret_cast<const char*>(u8"Πατάτα"), sink, nullptr, errorCode);
CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, u8"Πατάτα", sink, nullptr, errorCode);
assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"),
UnicodeString::fromUTF8(dest));
#if !UCONFIG_NO_BREAK_ITERATION
dest.clear();
CaseMap::utf8ToTitle(
"nl", U_OMIT_UNCHANGED_TEXT | U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE,
nullptr, reinterpret_cast<const char*>(u8"IjssEL IglOo"), sink, nullptr, errorCode);
nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode);
assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"J"),
UnicodeString::fromUTF8(dest));
#endif
dest.clear();
CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I,
reinterpret_cast<const char*>(u8"IßtanBul"), sink, nullptr, errorCode);
u8"IßtanBul", sink, nullptr, errorCode);
assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"),
UnicodeString::fromUTF8(dest));
// Return the whole result string.
dest.clear();
CaseMap::utf8ToLower("tr", 0, reinterpret_cast<const char*>(u8"IstanBul"), sink, nullptr, errorCode);
CaseMap::utf8ToLower("tr", 0, u8"IstanBul", sink, nullptr, errorCode);
assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıstanbul"),
UnicodeString::fromUTF8(dest));
dest.clear();
CaseMap::utf8ToUpper("el", 0, reinterpret_cast<const char*>(u8"Πατάτα"), sink, nullptr, errorCode);
CaseMap::utf8ToUpper("el", 0, u8"Πατάτα", sink, nullptr, errorCode);
assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΠΑΤΑΤΑ"),
UnicodeString::fromUTF8(dest));
#if !UCONFIG_NO_BREAK_ITERATION
dest.clear();
CaseMap::utf8ToTitle("nl", U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE,
nullptr, reinterpret_cast<const char*>(u8"IjssEL IglOo"), sink, nullptr, errorCode);
nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode);
assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"IJssEL IglOo"),
UnicodeString::fromUTF8(dest));
#endif
dest.clear();
CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, reinterpret_cast<const char*>(u8"IßtanBul"), sink, nullptr, errorCode);
CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, u8"IßtanBul", sink, nullptr, errorCode);
assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ısstanbul"),
UnicodeString::fromUTF8(dest));
}

View file

@ -248,9 +248,11 @@ void StringTest::runIndexedTest(int32_t index, UBool exec, const char *&name, ch
#ifdef U_HAVE_STRING_VIEW
TESTCASE_AUTO(TestStringPieceStringView);
#endif
TESTCASE_AUTO(TestStringPieceU8);
TESTCASE_AUTO(TestByteSink);
TESTCASE_AUTO(TestCheckedArrayByteSink);
TESTCASE_AUTO(TestStringByteSink);
TESTCASE_AUTO(TestStringByteSinkAppendU8);
TESTCASE_AUTO(TestCharString);
TESTCASE_AUTO(TestCStr);
TESTCASE_AUTO(Testctou);
@ -265,7 +267,7 @@ StringTest::TestStringPiece() {
errln("StringPiece() failed");
}
// Construct from NULL const char * pointer.
StringPiece null(NULL);
StringPiece null((const char *)nullptr);
if(!null.empty() || null.data()!=NULL || null.length()!=0 || null.size()!=0) {
errln("StringPiece(NULL) failed");
}
@ -395,7 +397,7 @@ StringTest::TestStringPiece() {
void
StringTest::TestStringPieceComparisons() {
StringPiece empty;
StringPiece null(NULL);
StringPiece null(nullptr);
StringPiece abc("abc");
StringPiece abcd("abcdefg", 4);
StringPiece abx("abx");
@ -521,6 +523,52 @@ StringTest::TestStringPieceStringView() {
}
#endif
// Exercises the StringPiece constructors and set() overloads with u8"..." literals,
// plus construction from nullptr and (where available) std::u8string_view / std::u8string.
void
StringTest::TestStringPieceU8() {
// ICU-20984 "mitigate some C++20 char8_t breakages"
// For the following APIs there are overloads for both
// const char * and const char8_t *.
// A u8"string literal" has one type or the other
// depending on C++ version and compiler settings.
// NOTE(review): expected values are written as hex escapes, presumably to pin
// the exact byte values independent of the ordinary literal encoding — confirm.

// Constructor from a NUL-terminated u8 literal.
StringPiece abc(u8"abc");
assertEquals("abc.length", 3, abc.length());
assertEquals("abc", "\x61\x62\x63", abc.data());
// Constructor from pointer + length: only the first 3 of the 6 literal bytes.
// data() still points into "abcdef", so the bytes are checked one by one
// rather than by comparing data() as a C string.
StringPiece abc3(u8"abcdef", 3);
assertEquals("abc3.length", 3, abc3.length());
assertEquals("abc3[0]", 0x61, abc3.data()[0]);
assertEquals("abc3[1]", 0x62, abc3.data()[1]);
assertEquals("abc3[2]", 0x63, abc3.data()[2]);
// set() with a NUL-terminated u8 literal replaces the initial "q".
StringPiece uvw("q");
uvw.set(u8"uvw");
assertEquals("uvw.length", 3, uvw.length());
assertEquals("uvw", "\x75\x76\x77", uvw.data());
// set() with pointer + length replaces the initial "r" with the first 3 bytes.
StringPiece xyz("r");
xyz.set(u8"xyzXYZ", 3);
assertEquals("xyz.length", 3, xyz.length());
assertEquals("xyz[0]", 0x78, xyz.data()[0]);
assertEquals("xyz[1]", 0x79, xyz.data()[1]);
assertEquals("xyz[2]", 0x7a, xyz.data()[2]);
// Construction from a bare nullptr must compile and yield an empty piece.
StringPiece null(nullptr);
assertTrue("null is empty", null.empty());
assertTrue("null is null", null.data() == nullptr);
#ifdef __cpp_lib_char8_t
// Only when the standard library provides the char8_t string types.
std::u8string_view u8sv(u8"sv"); // C++20
StringPiece u8svsp(u8sv);
assertEquals("u8svsp.length", 2, u8svsp.length());
assertEquals("u8svsp", "\x73\x76", u8svsp.data());
std::u8string u8str(u8"str"); // C++20
StringPiece u8strsp(u8str);
assertEquals("u8strsp.length", 3, u8strsp.length());
assertEquals("u8strsp", "\x73\x74\x72", u8strsp.data());
#endif // __cpp_lib_char8_t
}
// Verify that ByteSink is subclassable and Flush() overridable.
class SimpleByteSink : public ByteSink {
public:
@ -653,6 +701,20 @@ StringTest::TestStringByteSink() {
}
}
// Exercises ByteSink::AppendU8() through a StringByteSink, once with an
// ordinary string literal and once with a u8"..." literal.
void
StringTest::TestStringByteSinkAppendU8() {
// ICU-20984 "mitigate some C++20 char8_t breakages"
// For the following APIs there are overloads for both
// const char * and const char8_t *.
// A u8"string literal" has one type or the other
// depending on C++ version and compiler settings.
std::string result("abc");
StringByteSink<std::string> sink(&result);
// Ordinary literal: always the const char * overload.
sink.AppendU8("def", 3);
// u8 literal with n=4: only "ghij" of "ghijkl" is expected to be appended.
sink.AppendU8(u8"ghijkl", 4);
// Expected result: the initial "abc", then "def", then the bytes for "ghij".
assertEquals("abcdefghij", "abcdef\x67\x68\x69\x6a", result.c_str());
}
#if defined(_MSC_VER)
#include <vector>
#endif

View file

@ -49,9 +49,11 @@ private:
#ifdef U_HAVE_STRING_VIEW
void TestStringPieceStringView();
#endif
void TestStringPieceU8();
void TestByteSink();
void TestCheckedArrayByteSink();
void TestStringByteSink();
void TestStringByteSinkAppendU8();
void TestSTLCompatibility();
void TestCharString();
void TestCStr();

View file

@ -14,6 +14,7 @@
#include "unicode/errorcode.h"
#include "unicode/normlzr.h"
#include "unicode/stringoptions.h"
#include "unicode/stringpiece.h"
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
#include "unicode/schriter.h"
@ -1573,15 +1574,15 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) {
return;
}
static const char *const src =
reinterpret_cast<const char*>(u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161\u11A8\u3133 ");
std::string expected = reinterpret_cast<const char*>(u8" aääạ\u0308\u0308,가각갃 ");
static const StringPiece src =
u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161\u11A8\u3133 ";
StringPiece expected = u8" aääạ\u0308\u0308,가각갃 ";
std::string result;
StringByteSink<std::string> sink(&result, static_cast<int32_t>(expected.length()));
Edits edits;
nfkc_cf->normalizeUTF8(0, src, sink, &edits, errorCode);
assertSuccess("normalizeUTF8 with Edits", errorCode.get());
assertEquals("normalizeUTF8 with Edits", expected.c_str(), result.c_str());
assertEquals("normalizeUTF8 with Edits", expected.data(), result.c_str());
static const EditChange expectedChanges[] = {
{ FALSE, 2, 2 }, // 2 spaces
{ TRUE, 1, 1 }, // A→a
@ -1607,12 +1608,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode));
// Omit unchanged text.
expected = reinterpret_cast<const char*>(u8"aääạ\u0308\u0308가각갃");
expected = u8"aääạ\u0308\u0308가각갃";
result.clear();
edits.reset();
nfkc_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
assertSuccess("normalizeUTF8 omit unchanged", errorCode.get());
assertEquals("normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
assertEquals("normalizeUTF8 omit unchanged", expected.data(), result.c_str());
assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 9, edits.numberOfChanges());
TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged",
@ -1623,12 +1624,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
// With filter: The normalization code does not see the "A" substrings.
UnicodeSet filter(u"[^A]", errorCode);
FilteredNormalizer2 fn2(*nfkc_cf, filter);
expected = reinterpret_cast<const char*>(u8" AäA\u0308A\u0323\u0308\u0308,가각갃 ");
expected = u8" AäA\u0308A\u0323\u0308\u0308,가각갃 ";
result.clear();
edits.reset();
fn2.normalizeUTF8(0, src, sink, &edits, errorCode);
assertSuccess("filtered normalizeUTF8", errorCode.get());
assertEquals("filtered normalizeUTF8", expected.c_str(), result.c_str());
assertEquals("filtered normalizeUTF8", expected.data(), result.c_str());
static const EditChange filteredChanges[] = {
{ FALSE, 3, 3 }, // 2 spaces + A
{ TRUE, 2, 2 }, // Ä→ä
@ -1655,12 +1656,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
// Omit unchanged text.
// Note that the result is not normalized because the inner normalizer
// does not see text across filter spans.
expected = reinterpret_cast<const char*>(u8"ä\u0323\u0308\u0308가각갃");
expected = u8"ä\u0323\u0308\u0308가각갃";
result.clear();
edits.reset();
fn2.normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
assertSuccess("filtered normalizeUTF8 omit unchanged", errorCode.get());
assertEquals("filtered normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
assertEquals("filtered normalizeUTF8 omit unchanged", expected.data(), result.c_str());
assertTrue("filtered normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
assertEquals("filtered normalizeUTF8 omit unchanged numberOfChanges", 7, edits.numberOfChanges());
TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8 omit unchanged",
@ -1777,13 +1778,13 @@ BasicNormalizerTest::TestComposeJamoTBase() {
assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode));
assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode));
std::string s8(reinterpret_cast<const char*>(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7\u11A7"));
std::string expected8(reinterpret_cast<const char*>(u8"\u11A7\u11A7\u11A7"));
StringPiece s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7\u11A7");
StringPiece expected8(u8"\u11A7\u11A7\u11A7");
std::string result8;
StringByteSink<std::string> sink(&result8, static_cast<int32_t>(expected8.length()));
StringByteSink<std::string> sink(&result8, expected8.length());
nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode);
assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get());
assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), result8.c_str());
assertEquals("normalizeUTF8(LV+11A7)", expected8.data(), result8.c_str());
assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode));
assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));
}

View file

@ -160,7 +160,7 @@ void UTS46Test::TestAPI() {
char buffer[100];
TestCheckedArrayByteSink sink(buffer, UPRV_LENGTHOF(buffer));
errorCode=U_ZERO_ERROR;
nontrans->labelToUnicodeUTF8(StringPiece(NULL, 5), sink, info, errorCode);
nontrans->labelToUnicodeUTF8(StringPiece((const char *)NULL, 5), sink, info, errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || sink.NumberOfBytesWritten()!=0) {
errln("N.labelToUnicodeUTF8(StringPiece(NULL, 5)) did not set illegal-argument-error ",
"or did output something - %s",