mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-04 13:05:31 +00:00
ICU-22718 Export disallowed/ignored UTS 46 data for ICU4X
This commit is contained in:
parent
43b0901a0e
commit
b8389b1186
1 changed files with 41 additions and 17 deletions
|
@ -755,9 +755,12 @@ void computeDecompositions(const char* basename,
|
|||
std::vector<uint32_t> nonRecursive32;
|
||||
LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));
|
||||
|
||||
UBool uts46 = false;
|
||||
|
||||
if (uprv_strcmp(basename, "nfkd") == 0) {
|
||||
mainNormalizer = Normalizer2::getNFKDInstance(status);
|
||||
} else if (uprv_strcmp(basename, "uts46d") == 0) {
|
||||
uts46 = true;
|
||||
mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status);
|
||||
} else {
|
||||
mainNormalizer = nfdNormalizer;
|
||||
|
@ -817,23 +820,38 @@ void computeDecompositions(const char* basename,
|
|||
nfcNormalizer->normalize(dst, nfc, status);
|
||||
nonNfdOrRoundTrips = (src == nfc);
|
||||
}
|
||||
int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
|
||||
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
|
||||
// Characters that normalize to nothing or to U+FFFD (without the
|
||||
// input being U+FFFD) in ICU4C's UTS 46 normalization normalize
|
||||
// as in NFD in ICU4X's UTF 46 normalization in the interest
|
||||
// of data size and ICU4X's normalizer being unable to handle
|
||||
// normalizing to nothing.
|
||||
// When UTS 46 is implemented on top of ICU4X, a preprocessing
|
||||
// step is supposed to remove these characters before the
|
||||
// normalization step.
|
||||
if (uprv_strcmp(basename, "uts46d") != 0) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
if (uts46) {
|
||||
// Work around https://unicode-org.atlassian.net/browse/ICU-22658
|
||||
// TODO: Remove the workaround after data corresponding to
|
||||
// https://www.unicode.org/L2/L2024/24061.htm#179-C36 lands
|
||||
// for Unicode 16.
|
||||
switch (c) {
|
||||
case 0x2F868:
|
||||
dst.truncate(0);
|
||||
dst.append(UChar32(0x36FC));
|
||||
break;
|
||||
case 0x2F874:
|
||||
dst.truncate(0);
|
||||
dst.append(UChar32(0x5F53));
|
||||
break;
|
||||
case 0x2F91F:
|
||||
dst.truncate(0);
|
||||
dst.append(UChar32(0x243AB));
|
||||
break;
|
||||
case 0x2F95F:
|
||||
dst.truncate(0);
|
||||
dst.append(UChar32(0x7AEE));
|
||||
break;
|
||||
case 0x2F9BF:
|
||||
dst.truncate(0);
|
||||
dst.append(UChar32(0x45D7));
|
||||
break;
|
||||
}
|
||||
nfdNormalizer->normalize(src, dst, status);
|
||||
len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
|
||||
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
|
||||
}
|
||||
int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
|
||||
|
||||
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
|
||||
if (!uts46) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
|
@ -951,7 +969,13 @@ void computeDecompositions(const char* basename,
|
|||
if (!nonNfdOrRoundTrips) {
|
||||
compositionPassthroughBound = c;
|
||||
}
|
||||
if (len == 1 && utf32[0] <= 0xFFFF) {
|
||||
if (!len) {
|
||||
if (!uts46) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
pendingTrieInsertions.push_back({c, 0xFFFFFFFF, false});
|
||||
} else if (len == 1 && utf32[0] <= 0xFFFF) {
|
||||
if (startsWithBackwardCombiningStarter) {
|
||||
if (mainNormalizer == nfdNormalizer) {
|
||||
// Not supposed to happen in NFD
|
||||
|
|
Loading…
Add table
Reference in a new issue