mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-20984 StringPiece & ByteSink overloads for char8_t*
This commit is contained in:
parent
d2d144a5bd
commit
524748c6bf
11 changed files with 258 additions and 77 deletions
|
@ -687,13 +687,13 @@ void toUpper(uint32_t options,
|
|||
if (change) {
|
||||
ByteSinkUtil::appendTwoBytes(upper, sink);
|
||||
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
|
||||
sink.Append(reinterpret_cast<const char*>(u8"\u0308"), 2); // restore or add a dialytika
|
||||
sink.AppendU8(u8"\u0308", 2); // restore or add a dialytika
|
||||
}
|
||||
if (addTonos) {
|
||||
sink.Append(reinterpret_cast<const char*>(u8"\u0301"), 2);
|
||||
sink.AppendU8(u8"\u0301", 2);
|
||||
}
|
||||
while (numYpogegrammeni > 0) {
|
||||
sink.Append(reinterpret_cast<const char*>(u8"\u0399"), 2);
|
||||
sink.AppendU8(u8"\u0399", 2);
|
||||
--numYpogegrammeni;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -71,6 +71,40 @@ public:
|
|||
*/
|
||||
virtual void Append(const char* bytes, int32_t n) = 0;
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Appends n bytes to this. Same as Append().
|
||||
* Call AppendU8() with u8"string literals" which are const char * in C++11
|
||||
* but const char8_t * in C++20.
|
||||
* If the compiler does support char8_t as a distinct type,
|
||||
* then an AppendU8() overload for that is defined and will be chosen.
|
||||
*
|
||||
* @param bytes the pointer to the bytes
|
||||
* @param n the number of bytes; must be non-negative
|
||||
* @draft ICU 67
|
||||
*/
|
||||
inline void AppendU8(const char* bytes, int32_t n) {
|
||||
// The pointer is already a const char*, so forward unchanged to Append().
Append(bytes, n);
|
||||
}
|
||||
|
||||
#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
|
||||
/**
|
||||
* Appends n bytes to this. Same as Append() but for a const char8_t * pointer.
|
||||
* Call AppendU8() with u8"string literals" which are const char * in C++11
|
||||
* but const char8_t * in C++20.
|
||||
* If the compiler does support char8_t as a distinct type,
|
||||
* then this AppendU8() overload for that is defined and will be chosen.
|
||||
*
|
||||
* @param bytes the pointer to the bytes
|
||||
* @param n the number of bytes; must be non-negative
|
||||
* @draft ICU 67
|
||||
*/
|
||||
inline void AppendU8(const char8_t* bytes, int32_t n) {
|
||||
// Reinterpret the char8_t pointer as const char* and forward to Append();
// the byte count n is passed through unchanged.
Append(reinterpret_cast<const char*>(bytes), n);
|
||||
}
|
||||
#endif
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Returns a writable buffer for appending and writes the buffer's capacity to
|
||||
* *result_capacity. Guarantees *result_capacity>=min_capacity.
|
||||
|
|
|
@ -67,19 +67,50 @@ class U_COMMON_API StringPiece : public UMemory {
|
|||
* Default constructor, creates an empty StringPiece.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
StringPiece() : ptr_(NULL), length_(0) { }
|
||||
StringPiece() : ptr_(nullptr), length_(0) { }
|
||||
|
||||
/**
|
||||
* Constructs from a NUL-terminated const char * pointer.
|
||||
* @param str a NUL-terminated const char * pointer
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
StringPiece(const char* str);
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
|
||||
/**
|
||||
* Constructs from a NUL-terminated const char8_t * pointer.
|
||||
* @param str a NUL-terminated const char8_t * pointer
|
||||
* @draft ICU 67
|
||||
*/
|
||||
// Delegates to the const char* constructor after reinterpreting the pointer.
StringPiece(const char8_t* str) : StringPiece(reinterpret_cast<const char*>(str)) {}
|
||||
#endif
|
||||
/**
|
||||
* Constructs an empty StringPiece.
|
||||
* Needed for type disambiguation from multiple other overloads.
|
||||
* @param p nullptr
|
||||
* @draft ICU 67
|
||||
*/
|
||||
// p can only be nullptr, so ptr_ is initialized to null and length_ to 0.
StringPiece(std::nullptr_t p) : ptr_(p), length_(0) {}
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Constructs from a std::string.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
// Stores str.data() and str.size(); the size_t size is narrowed to int32_t.
StringPiece(const std::string& str)
|
||||
: ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { }
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
#if defined(__cpp_lib_char8_t) || defined(U_IN_DOXYGEN)
|
||||
/**
|
||||
* Constructs from a std::u8string.
|
||||
* @draft ICU 67
|
||||
*/
|
||||
// Same as the std::string constructor, except that the char8_t data pointer
// is reinterpreted as const char*; the size_t size is narrowed to int32_t.
StringPiece(const std::u8string& str)
|
||||
: ptr_(reinterpret_cast<const char*>(str.data())),
|
||||
length_(static_cast<int32_t>(str.size())) { }
|
||||
#endif
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Constructs from some other implementation of a string piece class, from any
|
||||
|
@ -88,7 +119,7 @@ class U_COMMON_API StringPiece : public UMemory {
|
|||
* \code{.cpp}
|
||||
*
|
||||
* struct OtherStringPieceClass {
|
||||
* const char* data();
|
||||
* const char* data(); // or const char8_t*
|
||||
* size_t size();
|
||||
* };
|
||||
*
|
||||
|
@ -97,16 +128,25 @@ class U_COMMON_API StringPiece : public UMemory {
|
|||
* The other string piece class will typically be std::string_view from C++17
|
||||
* or absl::string_view from Abseil.
|
||||
*
|
||||
* Starting with C++20, data() may also return a const char8_t* pointer,
|
||||
* as from std::u8string_view.
|
||||
*
|
||||
* @param str the other string piece
|
||||
* @draft ICU 65
|
||||
*/
|
||||
template <typename T,
|
||||
typename = typename std::enable_if<
|
||||
std::is_same<decltype(T().data()), const char*>::value &&
|
||||
(std::is_same<decltype(T().data()), const char*>::value
|
||||
#if defined(__cpp_char8_t)
|
||||
|| std::is_same<decltype(T().data()), const char8_t*>::value
|
||||
#endif
|
||||
) &&
|
||||
std::is_same<decltype(T().size()), size_t>::value>::type>
|
||||
StringPiece(T str)
|
||||
: ptr_(str.data()), length_(static_cast<int32_t>(str.size())) {}
|
||||
: ptr_(reinterpret_cast<const char*>(str.data())),
|
||||
length_(static_cast<int32_t>(str.size())) {}
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Constructs from a const char * pointer and a specified length.
|
||||
* @param offset a const char * pointer (need not be terminated)
|
||||
|
@ -114,6 +154,19 @@ class U_COMMON_API StringPiece : public UMemory {
|
|||
* @stable ICU 4.2
|
||||
*/
|
||||
// Stores the pointer and the length exactly as given; nothing is checked here.
StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { }
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
|
||||
/**
|
||||
* Constructs from a const char8_t * pointer and a specified length.
|
||||
* @param str a const char8_t * pointer (need not be terminated)
|
||||
* @param len the length of the string; must be non-negative
|
||||
* @draft ICU 67
|
||||
*/
|
||||
// Delegates to the (const char*, int32_t) constructor after reinterpreting
// the pointer; len is passed through unchanged.
StringPiece(const char8_t* str, int32_t len) :
|
||||
StringPiece(reinterpret_cast<const char*>(str), len) {}
|
||||
#endif
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Substring of another StringPiece.
|
||||
* @param x the other StringPiece
|
||||
|
@ -132,7 +185,7 @@ class U_COMMON_API StringPiece : public UMemory {
|
|||
StringPiece(const StringPiece& x, int32_t pos, int32_t len);
|
||||
|
||||
/**
|
||||
* Returns the string pointer. May be NULL if it is empty.
|
||||
* Returns the string pointer. May be nullptr if it is empty.
|
||||
*
|
||||
* data() may return a pointer to a buffer with embedded NULs, and the
|
||||
* returned buffer may or may not be null terminated. Therefore it is
|
||||
|
@ -165,7 +218,7 @@ class U_COMMON_API StringPiece : public UMemory {
|
|||
* Sets to an empty string.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
void clear() { ptr_ = NULL; length_ = 0; }
|
||||
void clear() { ptr_ = nullptr; length_ = 0; }
|
||||
|
||||
/**
|
||||
* Reset the stringpiece to refer to new data.
|
||||
|
@ -182,6 +235,29 @@ class U_COMMON_API StringPiece : public UMemory {
|
|||
*/
|
||||
void set(const char* str);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
|
||||
/**
|
||||
* Resets the stringpiece to refer to new data.
|
||||
* @param xdata pointer to the new string data. Need not be NUL-terminated.
|
||||
* @param len the length of the new data
|
||||
* @draft ICU 67
|
||||
*/
|
||||
inline void set(const char8_t* xdata, int32_t len) {
|
||||
// Reinterpret the pointer and forward to the two-argument const char* set().
set(reinterpret_cast<const char*>(xdata), len);
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the stringpiece to refer to new data.
|
||||
* @param str a pointer to a NUL-terminated string.
|
||||
* @draft ICU 67
|
||||
*/
|
||||
inline void set(const char8_t* str) {
|
||||
// Reinterpret the pointer and forward to set(const char*).
set(reinterpret_cast<const char*>(str));
|
||||
}
|
||||
#endif
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Removes the first n string units.
|
||||
* @param n prefix length, must be non-negative and <=length()
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "unicode/sortkey.h"
|
||||
#include "unicode/std_string.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/tblcoll.h"
|
||||
#include "unicode/uiter.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
@ -293,15 +294,15 @@ void CollationTest::TestIllegalUTF8() {
|
|||
}
|
||||
coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
|
||||
|
||||
static const char *strings[] = {
|
||||
static const StringPiece strings[] = {
|
||||
// string with U+FFFD == illegal byte sequence
|
||||
reinterpret_cast<const char*>(u8"a\uFFFDz"), reinterpret_cast<const char*>("a\x80z"), // trail byte
|
||||
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xc1\x81z"), // non-shortest form
|
||||
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xe0\x82\x83z"), // non-shortest form
|
||||
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xed\xa0\x80z"), // lead surrogate: would be U+D800
|
||||
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xed\xbf\xbfz"), // trail surrogate: would be U+DFFF
|
||||
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xf0\x8f\xbf\xbfz"), // non-shortest form
|
||||
reinterpret_cast<const char*>(u8"a\uFFFD\uFFFD\uFFFD\uFFFDz"), reinterpret_cast<const char*>("a\xf4\x90\x80\x80z") // out of range: would be U+110000
|
||||
u8"a\uFFFDz", "a\x80z", // trail byte
|
||||
u8"a\uFFFD\uFFFDz", "a\xc1\x81z", // non-shortest form
|
||||
u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z", // non-shortest form
|
||||
u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z", // lead surrogate: would be U+D800
|
||||
u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF
|
||||
u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz", // non-shortest form
|
||||
u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z" // out of range: would be U+110000
|
||||
};
|
||||
|
||||
for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
|
||||
// Pairs a double with a string. NOTE(review): presumably one test case's input
// value and the text expected for it; the code that reads these fields is not
// visible here, so confirm against the users of this struct.
typedef struct ExpectedResult {
|
||||
double value;
|
||||
// Invariant characters, will be converted to UTF-16 and then unescaped.
|
||||
const char *expected;
|
||||
} ExpectedResult;
|
||||
|
||||
|
@ -185,38 +186,38 @@ static ExpectedResult kChineseCurrencyTestData[] = {
|
|||
{123456789012345.0, "\\u00A5120\\u4E07\\u4EBF"},
|
||||
};
|
||||
static ExpectedResult kGermanCurrencyTestData[] = {
|
||||
{1.0, reinterpret_cast<const char*>(u8"1\\u00A0\\u20AC")},
|
||||
{12.0, reinterpret_cast<const char*>(u8"12\\u00A0\\u20AC")},
|
||||
{123.0, reinterpret_cast<const char*>(u8"120\\u00A0\\u20AC")},
|
||||
{1234.0, reinterpret_cast<const char*>(u8"1200\\u00A0\\u20AC")},
|
||||
{12345.0, reinterpret_cast<const char*>(u8"12.000\\u00A0\\u20AC")},
|
||||
{123456.0, reinterpret_cast<const char*>(u8"120.000\\u00A0\\u20AC")},
|
||||
{1234567.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Mio.\\u00A0\\u20AC")},
|
||||
{12345678.0, reinterpret_cast<const char*>(u8"12\\u00A0Mio.\\u00A0\\u20AC")},
|
||||
{123456789.0, reinterpret_cast<const char*>(u8"120\\u00A0Mio.\\u00A0\\u20AC")},
|
||||
{1234567890.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Mrd.\\u00A0\\u20AC")},
|
||||
{12345678901.0, reinterpret_cast<const char*>(u8"12\\u00A0Mrd.\\u00A0\\u20AC")},
|
||||
{123456789012.0, reinterpret_cast<const char*>(u8"120\\u00A0Mrd.\\u00A0\\u20AC")},
|
||||
{1234567890123.0, reinterpret_cast<const char*>(u8"1,2\\u00A0Bio.\\u00A0\\u20AC")},
|
||||
{12345678901234.0, reinterpret_cast<const char*>(u8"12\\u00A0Bio.\\u00A0\\u20AC")},
|
||||
{123456789012345.0, reinterpret_cast<const char*>(u8"120\\u00A0Bio.\\u00A0\\u20AC")},
|
||||
{1.0, "1\\u00A0\\u20AC"},
|
||||
{12.0, "12\\u00A0\\u20AC"},
|
||||
{123.0, "120\\u00A0\\u20AC"},
|
||||
{1234.0, "1200\\u00A0\\u20AC"},
|
||||
{12345.0, "12.000\\u00A0\\u20AC"},
|
||||
{123456.0, "120.000\\u00A0\\u20AC"},
|
||||
{1234567.0, "1,2\\u00A0Mio.\\u00A0\\u20AC"},
|
||||
{12345678.0, "12\\u00A0Mio.\\u00A0\\u20AC"},
|
||||
{123456789.0, "120\\u00A0Mio.\\u00A0\\u20AC"},
|
||||
{1234567890.0, "1,2\\u00A0Mrd.\\u00A0\\u20AC"},
|
||||
{12345678901.0, "12\\u00A0Mrd.\\u00A0\\u20AC"},
|
||||
{123456789012.0, "120\\u00A0Mrd.\\u00A0\\u20AC"},
|
||||
{1234567890123.0, "1,2\\u00A0Bio.\\u00A0\\u20AC"},
|
||||
{12345678901234.0, "12\\u00A0Bio.\\u00A0\\u20AC"},
|
||||
{123456789012345.0, "120\\u00A0Bio.\\u00A0\\u20AC"},
|
||||
};
|
||||
static ExpectedResult kEnglishCurrencyTestData[] = {
|
||||
{1.0, reinterpret_cast<const char*>(u8"$1")},
|
||||
{12.0, reinterpret_cast<const char*>(u8"$12")},
|
||||
{123.0, reinterpret_cast<const char*>(u8"$120")},
|
||||
{1234.0, reinterpret_cast<const char*>(u8"$1.2K")},
|
||||
{12345.0, reinterpret_cast<const char*>(u8"$12K")},
|
||||
{123456.0, reinterpret_cast<const char*>(u8"$120K")},
|
||||
{1234567.0, reinterpret_cast<const char*>(u8"$1.2M")},
|
||||
{12345678.0, reinterpret_cast<const char*>(u8"$12M")},
|
||||
{123456789.0, reinterpret_cast<const char*>(u8"$120M")},
|
||||
{1234567890.0, reinterpret_cast<const char*>(u8"$1.2B")},
|
||||
{12345678901.0, reinterpret_cast<const char*>(u8"$12B")},
|
||||
{123456789012.0, reinterpret_cast<const char*>(u8"$120B")},
|
||||
{1234567890123.0, reinterpret_cast<const char*>(u8"$1.2T")},
|
||||
{12345678901234.0, reinterpret_cast<const char*>(u8"$12T")},
|
||||
{123456789012345.0, reinterpret_cast<const char*>(u8"$120T")},
|
||||
{1.0, "$1"},
|
||||
{12.0, "$12"},
|
||||
{123.0, "$120"},
|
||||
{1234.0, "$1.2K"},
|
||||
{12345.0, "$12K"},
|
||||
{123456.0, "$120K"},
|
||||
{1234567.0, "$1.2M"},
|
||||
{12345678.0, "$12M"},
|
||||
{123456789.0, "$120M"},
|
||||
{1234567890.0, "$1.2B"},
|
||||
{12345678901.0, "$12B"},
|
||||
{123456789012.0, "$120B"},
|
||||
{1234567890123.0, "$1.2T"},
|
||||
{12345678901234.0, "$12T"},
|
||||
{123456789012345.0, "$120T"},
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
@ -5838,11 +5839,11 @@ void RegexTest::TestBug12884() {
|
|||
REGEX_ASSERT(status == U_REGEX_TIME_OUT);
|
||||
|
||||
// UText, wrapping non-UTF-16 text, also takes a different execution path.
|
||||
const char *text8 = reinterpret_cast<const char*>(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
|
||||
StringPiece text8(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
|
||||
"carácter, sin importar la plataforma, sin importar el programa,"
|
||||
"sin importar el idioma.");
|
||||
status = U_ZERO_ERROR;
|
||||
LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
|
||||
LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status));
|
||||
REGEX_CHECK_STATUS;
|
||||
m.reset(ut.getAlias());
|
||||
m.find(status);
|
||||
|
|
|
@ -1314,7 +1314,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() {
|
|||
Edits edits;
|
||||
|
||||
int32_t length = CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT,
|
||||
reinterpret_cast<const char*>(u8"IstanBul"), 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
reinterpret_cast<const char*>(u8"IstanBul"), 8,
|
||||
dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"),
|
||||
UnicodeString::fromUTF8(StringPiece(dest, length)));
|
||||
static const EditChange lowerExpectedChanges[] = {
|
||||
|
@ -1330,7 +1331,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() {
|
|||
|
||||
edits.reset();
|
||||
length = CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT,
|
||||
reinterpret_cast<const char*>(u8"Πατάτα"), 6 * 2, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
reinterpret_cast<const char*>(u8"Πατάτα"), 6 * 2,
|
||||
dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"),
|
||||
UnicodeString::fromUTF8(StringPiece(dest, length)));
|
||||
static const EditChange upperExpectedChanges[] = {
|
||||
|
@ -1370,7 +1372,8 @@ void StringCaseTest::TestCaseMapUTF8WithEdits() {
|
|||
// No explicit nor automatic edits.reset(). Edits should be appended.
|
||||
length = CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_EDITS_NO_RESET |
|
||||
U_FOLD_CASE_EXCLUDE_SPECIAL_I,
|
||||
reinterpret_cast<const char*>(u8"IßtanBul"), 1 + 2 + 6, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
reinterpret_cast<const char*>(u8"IßtanBul"), 1 + 2 + 6,
|
||||
dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"),
|
||||
UnicodeString::fromUTF8(StringPiece(dest, length)));
|
||||
static const EditChange foldExpectedChanges[] = {
|
||||
|
@ -1454,44 +1457,44 @@ void StringCaseTest::TestCaseMapUTF8ToString() {
|
|||
StringByteSink<std::string> sink(&dest);
|
||||
|
||||
// Omit unchanged text.
|
||||
CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, reinterpret_cast<const char*>(u8"IstanBul"), sink, nullptr, errorCode);
|
||||
CaseMap::utf8ToLower("tr", U_OMIT_UNCHANGED_TEXT, u8"IstanBul", sink, nullptr, errorCode);
|
||||
assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), UnicodeString::fromUTF8(dest));
|
||||
dest.clear();
|
||||
CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, reinterpret_cast<const char*>(u8"Πατάτα"), sink, nullptr, errorCode);
|
||||
CaseMap::utf8ToUpper("el", U_OMIT_UNCHANGED_TEXT, u8"Πατάτα", sink, nullptr, errorCode);
|
||||
assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"),
|
||||
UnicodeString::fromUTF8(dest));
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
dest.clear();
|
||||
CaseMap::utf8ToTitle(
|
||||
"nl", U_OMIT_UNCHANGED_TEXT | U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE,
|
||||
nullptr, reinterpret_cast<const char*>(u8"IjssEL IglOo"), sink, nullptr, errorCode);
|
||||
nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode);
|
||||
assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"J"),
|
||||
UnicodeString::fromUTF8(dest));
|
||||
#endif
|
||||
dest.clear();
|
||||
CaseMap::utf8Fold(U_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I,
|
||||
reinterpret_cast<const char*>(u8"IßtanBul"), sink, nullptr, errorCode);
|
||||
u8"IßtanBul", sink, nullptr, errorCode);
|
||||
assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"),
|
||||
UnicodeString::fromUTF8(dest));
|
||||
|
||||
// Return the whole result string.
|
||||
dest.clear();
|
||||
CaseMap::utf8ToLower("tr", 0, reinterpret_cast<const char*>(u8"IstanBul"), sink, nullptr, errorCode);
|
||||
CaseMap::utf8ToLower("tr", 0, u8"IstanBul", sink, nullptr, errorCode);
|
||||
assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıstanbul"),
|
||||
UnicodeString::fromUTF8(dest));
|
||||
dest.clear();
|
||||
CaseMap::utf8ToUpper("el", 0, reinterpret_cast<const char*>(u8"Πατάτα"), sink, nullptr, errorCode);
|
||||
CaseMap::utf8ToUpper("el", 0, u8"Πατάτα", sink, nullptr, errorCode);
|
||||
assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΠΑΤΑΤΑ"),
|
||||
UnicodeString::fromUTF8(dest));
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
dest.clear();
|
||||
CaseMap::utf8ToTitle("nl", U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE,
|
||||
nullptr, reinterpret_cast<const char*>(u8"IjssEL IglOo"), sink, nullptr, errorCode);
|
||||
nullptr, u8"IjssEL IglOo", sink, nullptr, errorCode);
|
||||
assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"IJssEL IglOo"),
|
||||
UnicodeString::fromUTF8(dest));
|
||||
#endif
|
||||
dest.clear();
|
||||
CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, reinterpret_cast<const char*>(u8"IßtanBul"), sink, nullptr, errorCode);
|
||||
CaseMap::utf8Fold(U_FOLD_CASE_EXCLUDE_SPECIAL_I, u8"IßtanBul", sink, nullptr, errorCode);
|
||||
assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ısstanbul"),
|
||||
UnicodeString::fromUTF8(dest));
|
||||
}
|
||||
|
|
|
@ -248,9 +248,11 @@ void StringTest::runIndexedTest(int32_t index, UBool exec, const char *&name, ch
|
|||
#ifdef U_HAVE_STRING_VIEW
|
||||
TESTCASE_AUTO(TestStringPieceStringView);
|
||||
#endif
|
||||
TESTCASE_AUTO(TestStringPieceU8);
|
||||
TESTCASE_AUTO(TestByteSink);
|
||||
TESTCASE_AUTO(TestCheckedArrayByteSink);
|
||||
TESTCASE_AUTO(TestStringByteSink);
|
||||
TESTCASE_AUTO(TestStringByteSinkAppendU8);
|
||||
TESTCASE_AUTO(TestCharString);
|
||||
TESTCASE_AUTO(TestCStr);
|
||||
TESTCASE_AUTO(Testctou);
|
||||
|
@ -265,7 +267,7 @@ StringTest::TestStringPiece() {
|
|||
errln("StringPiece() failed");
|
||||
}
|
||||
// Construct from NULL const char * pointer.
|
||||
StringPiece null(NULL);
|
||||
StringPiece null((const char *)nullptr);
|
||||
if(!null.empty() || null.data()!=NULL || null.length()!=0 || null.size()!=0) {
|
||||
errln("StringPiece(NULL) failed");
|
||||
}
|
||||
|
@ -395,7 +397,7 @@ StringTest::TestStringPiece() {
|
|||
void
|
||||
StringTest::TestStringPieceComparisons() {
|
||||
StringPiece empty;
|
||||
StringPiece null(NULL);
|
||||
StringPiece null(nullptr);
|
||||
StringPiece abc("abc");
|
||||
StringPiece abcd("abcdefg", 4);
|
||||
StringPiece abx("abx");
|
||||
|
@ -521,6 +523,52 @@ StringTest::TestStringPieceStringView() {
|
|||
}
|
||||
#endif
|
||||
|
||||
// Exercises StringPiece construction and set() with u8"..." literals, with
// nullptr, and (when available) with std::u8string_view / std::u8string.
void
|
||||
StringTest::TestStringPieceU8() {
|
||||
// ICU-20984 "mitigate some C++20 char8_t breakages"
|
||||
// For the following APIs there are overloads for both
|
||||
// const char * and const char8_t *.
|
||||
// A u8"string literal" has one type or the other
|
||||
// depending on C++ version and compiler settings.
|
||||
// Single-argument constructor from a NUL-terminated u8 literal.
StringPiece abc(u8"abc");
|
||||
assertEquals("abc.length", 3, abc.length());
|
||||
assertEquals("abc", "\x61\x62\x63", abc.data());
|
||||
|
||||
// Pointer + explicit length: only the first 3 of the 6 literal bytes are
// viewed, so the bytes are checked one at a time rather than as a C string.
StringPiece abc3(u8"abcdef", 3);
|
||||
assertEquals("abc3.length", 3, abc3.length());
|
||||
assertEquals("abc3[0]", 0x61, abc3.data()[0]);
|
||||
assertEquals("abc3[1]", 0x62, abc3.data()[1]);
|
||||
assertEquals("abc3[2]", 0x63, abc3.data()[2]);
|
||||
|
||||
// set() with a NUL-terminated u8 literal replaces the initial "q".
StringPiece uvw("q");
|
||||
uvw.set(u8"uvw");
|
||||
assertEquals("uvw.length", 3, uvw.length());
|
||||
assertEquals("uvw", "\x75\x76\x77", uvw.data());
|
||||
|
||||
// set() with pointer + length replaces the initial "r" with the first
// 3 bytes of the literal; again checked byte by byte.
StringPiece xyz("r");
|
||||
xyz.set(u8"xyzXYZ", 3);
|
||||
assertEquals("xyz.length", 3, xyz.length());
|
||||
assertEquals("xyz[0]", 0x78, xyz.data()[0]);
|
||||
assertEquals("xyz[1]", 0x79, xyz.data()[1]);
|
||||
assertEquals("xyz[2]", 0x7a, xyz.data()[2]);
|
||||
|
||||
// Construction from a bare nullptr must yield an empty piece whose
// data() is null.
StringPiece null(nullptr);
|
||||
assertTrue("null is empty", null.empty());
|
||||
assertTrue("null is null", null.data() == nullptr);
|
||||
|
||||
// Only compiled when the standard library advertises char8_t support.
#ifdef __cpp_lib_char8_t
|
||||
std::u8string_view u8sv(u8"sv"); // C++20
|
||||
StringPiece u8svsp(u8sv);
|
||||
assertEquals("u8svsp.length", 2, u8svsp.length());
|
||||
assertEquals("u8svsp", "\x73\x76", u8svsp.data());
|
||||
|
||||
std::u8string u8str(u8"str"); // C++20
|
||||
StringPiece u8strsp(u8str);
|
||||
assertEquals("u8strsp.length", 3, u8strsp.length());
|
||||
assertEquals("u8strsp", "\x73\x74\x72", u8strsp.data());
|
||||
#endif // __cpp_lib_char8_t
|
||||
}
|
||||
|
||||
// Verify that ByteSink is subclassable and Flush() overridable.
|
||||
class SimpleByteSink : public ByteSink {
|
||||
public:
|
||||
|
@ -653,6 +701,20 @@ StringTest::TestStringByteSink() {
|
|||
}
|
||||
}
|
||||
|
||||
// Checks that AppendU8() accepts both an ordinary literal and a u8 literal
// and appends the requested number of bytes to a StringByteSink's string.
void
|
||||
StringTest::TestStringByteSinkAppendU8() {
|
||||
// ICU-20984 "mitigate some C++20 char8_t breakages"
|
||||
// For the following APIs there are overloads for both
|
||||
// const char * and const char8_t *.
|
||||
// A u8"string literal" has one type or the other
|
||||
// depending on C++ version and compiler settings.
|
||||
// The sink appends to result, which already holds "abc".
std::string result("abc");
|
||||
StringByteSink<std::string> sink(&result);
|
||||
// Ordinary narrow literal: a const char* argument.
sink.AppendU8("def", 3);
|
||||
// u8 literal with n=4: only "ghij" of "ghijkl" is appended.
sink.AppendU8(u8"ghijkl", 4);
|
||||
// \x67..\x6a are the byte values of 'g'..'j' in UTF-8/ASCII.
assertEquals("abcdefghij", "abcdef\x67\x68\x69\x6a", result.c_str());
|
||||
}
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <vector>
|
||||
#endif
|
||||
|
|
|
@ -49,9 +49,11 @@ private:
|
|||
#ifdef U_HAVE_STRING_VIEW
|
||||
void TestStringPieceStringView();
|
||||
#endif
|
||||
void TestStringPieceU8();
|
||||
void TestByteSink();
|
||||
void TestCheckedArrayByteSink();
|
||||
void TestStringByteSink();
|
||||
void TestStringByteSinkAppendU8();
|
||||
void TestSTLCompatibility();
|
||||
void TestCharString();
|
||||
void TestCStr();
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include "unicode/errorcode.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/usetiter.h"
|
||||
#include "unicode/schriter.h"
|
||||
|
@ -1573,15 +1574,15 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) {
|
||||
return;
|
||||
}
|
||||
static const char *const src =
|
||||
reinterpret_cast<const char*>(u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 ");
|
||||
std::string expected = reinterpret_cast<const char*>(u8" aääạ\u0308ạ\u0308,가각갃 ");
|
||||
static const StringPiece src =
|
||||
u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 ";
|
||||
StringPiece expected = u8" aääạ\u0308ạ\u0308,가각갃 ";
|
||||
std::string result;
|
||||
StringByteSink<std::string> sink(&result, static_cast<int32_t>(expected.length()));
|
||||
Edits edits;
|
||||
nfkc_cf->normalizeUTF8(0, src, sink, &edits, errorCode);
|
||||
assertSuccess("normalizeUTF8 with Edits", errorCode.get());
|
||||
assertEquals("normalizeUTF8 with Edits", expected.c_str(), result.c_str());
|
||||
assertEquals("normalizeUTF8 with Edits", expected.data(), result.c_str());
|
||||
static const EditChange expectedChanges[] = {
|
||||
{ FALSE, 2, 2 }, // 2 spaces
|
||||
{ TRUE, 1, 1 }, // A→a
|
||||
|
@ -1607,12 +1608,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode));
|
||||
|
||||
// Omit unchanged text.
|
||||
expected = reinterpret_cast<const char*>(u8"aääạ\u0308ạ\u0308가각갃");
|
||||
expected = u8"aääạ\u0308ạ\u0308가각갃";
|
||||
result.clear();
|
||||
edits.reset();
|
||||
nfkc_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
|
||||
assertSuccess("normalizeUTF8 omit unchanged", errorCode.get());
|
||||
assertEquals("normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
|
||||
assertEquals("normalizeUTF8 omit unchanged", expected.data(), result.c_str());
|
||||
assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
|
||||
assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 9, edits.numberOfChanges());
|
||||
TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged",
|
||||
|
@ -1623,12 +1624,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
// With filter: The normalization code does not see the "A" substrings.
|
||||
UnicodeSet filter(u"[^A]", errorCode);
|
||||
FilteredNormalizer2 fn2(*nfkc_cf, filter);
|
||||
expected = reinterpret_cast<const char*>(u8" AäA\u0308A\u0323\u0308ạ\u0308,가각갃 ");
|
||||
expected = u8" AäA\u0308A\u0323\u0308ạ\u0308,가각갃 ";
|
||||
result.clear();
|
||||
edits.reset();
|
||||
fn2.normalizeUTF8(0, src, sink, &edits, errorCode);
|
||||
assertSuccess("filtered normalizeUTF8", errorCode.get());
|
||||
assertEquals("filtered normalizeUTF8", expected.c_str(), result.c_str());
|
||||
assertEquals("filtered normalizeUTF8", expected.data(), result.c_str());
|
||||
static const EditChange filteredChanges[] = {
|
||||
{ FALSE, 3, 3 }, // 2 spaces + A
|
||||
{ TRUE, 2, 2 }, // Ä→ä
|
||||
|
@ -1655,12 +1656,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
// Omit unchanged text.
|
||||
// Note that the result is not normalized because the inner normalizer
|
||||
// does not see text across filter spans.
|
||||
expected = reinterpret_cast<const char*>(u8"ä\u0323\u0308ạ\u0308가각갃");
|
||||
expected = u8"ä\u0323\u0308ạ\u0308가각갃";
|
||||
result.clear();
|
||||
edits.reset();
|
||||
fn2.normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
|
||||
assertSuccess("filtered normalizeUTF8 omit unchanged", errorCode.get());
|
||||
assertEquals("filtered normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
|
||||
assertEquals("filtered normalizeUTF8 omit unchanged", expected.data(), result.c_str());
|
||||
assertTrue("filtered normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
|
||||
assertEquals("filtered normalizeUTF8 omit unchanged numberOfChanges", 7, edits.numberOfChanges());
|
||||
TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8 omit unchanged",
|
||||
|
@ -1777,13 +1778,13 @@ BasicNormalizerTest::TestComposeJamoTBase() {
|
|||
assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode));
|
||||
assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode));
|
||||
|
||||
std::string s8(reinterpret_cast<const char*>(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7"));
|
||||
std::string expected8(reinterpret_cast<const char*>(u8"가\u11A7가\u11A7가\u11A7"));
|
||||
StringPiece s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");
|
||||
StringPiece expected8(u8"가\u11A7가\u11A7가\u11A7");
|
||||
std::string result8;
|
||||
StringByteSink<std::string> sink(&result8, static_cast<int32_t>(expected8.length()));
|
||||
StringByteSink<std::string> sink(&result8, expected8.length());
|
||||
nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode);
|
||||
assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get());
|
||||
assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), result8.c_str());
|
||||
assertEquals("normalizeUTF8(LV+11A7)", expected8.data(), result8.c_str());
|
||||
assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode));
|
||||
assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));
|
||||
}
|
||||
|
|
|
@ -160,7 +160,7 @@ void UTS46Test::TestAPI() {
|
|||
char buffer[100];
|
||||
TestCheckedArrayByteSink sink(buffer, UPRV_LENGTHOF(buffer));
|
||||
errorCode=U_ZERO_ERROR;
|
||||
nontrans->labelToUnicodeUTF8(StringPiece(NULL, 5), sink, info, errorCode);
|
||||
nontrans->labelToUnicodeUTF8(StringPiece((const char *)NULL, 5), sink, info, errorCode);
|
||||
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || sink.NumberOfBytesWritten()!=0) {
|
||||
errln("N.labelToUnicodeUTF8(StringPiece(NULL, 5)) did not set illegal-argument-error ",
|
||||
"or did output something - %s",
|
||||
|
|
Loading…
Add table
Reference in a new issue