mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-22095 Export ICU4X normalization data with tries only without Unicode sets
This commit is contained in:
parent
86166e0a2d
commit
c258f3d6f8
1 changed files with 100 additions and 14 deletions
|
@ -428,9 +428,7 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t
|
|||
status));
|
||||
handleError(status, basename);
|
||||
|
||||
if (!reference) {
|
||||
usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
|
||||
} else {
|
||||
if (reference) {
|
||||
if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
|
||||
// NFD expectations don't hold. The set must not contain the half-width
|
||||
// kana voicing marks and must contain iota subscript.
|
||||
|
@ -484,6 +482,28 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t
|
|||
handleError(status, basename);
|
||||
}
|
||||
|
||||
void writeNopCompositionPassThrough(const char* basename) {
|
||||
IcuToolErrorCode status("icuexportdata: writeNopCompositionPassThrough");
|
||||
FILE* f = prepareOutputFile(basename);
|
||||
|
||||
fprintf(f, "first = 0x0\n");
|
||||
|
||||
LocalUMutableCPTriePointer builder(umutablecptrie_open(0xFF, 0xFF, status));
|
||||
|
||||
LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
|
||||
builder.getAlias(),
|
||||
trieType,
|
||||
UCPTRIE_VALUE_BITS_8,
|
||||
status));
|
||||
handleError(status, basename);
|
||||
|
||||
fprintf(f, "[trie]\n");
|
||||
usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
|
||||
|
||||
fclose(f);
|
||||
handleError(status, basename);
|
||||
}
|
||||
|
||||
void writePotentialCompositionPassThrough(const char* basename, const Normalizer2* norm, const USet* decompositionStartsWithNonStarter, const USet* decompositionStartsWithBackwardCombiningStarter, USet* potentialPassthroughAndNotBackwardCombining) {
|
||||
IcuToolErrorCode status("icuexportdata: writePotentialCompositionPassThrough");
|
||||
FILE* f = prepareOutputFile(basename);
|
||||
|
@ -517,12 +537,46 @@ void writePotentialCompositionPassThrough(const char* basename, const Normalizer
|
|||
}
|
||||
}
|
||||
|
||||
// The surrogate range forms a useless discontinuity. The code
|
||||
// that reads from the set never looks up by surrage, so let's
|
||||
// put the surrogate range in the set as a micro-optimization.
|
||||
uset_addRange(potentialPassthroughAndNotBackwardCombining, 0xD800, 0xDFFF);
|
||||
// There are fancier ways to do this, but let's keep things
|
||||
// very simple: Deliberately not working this into the above
|
||||
// loop and not extracting this from the inversion list
|
||||
// directly.
|
||||
for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
|
||||
if (!uset_contains(potentialPassthroughAndNotBackwardCombining, c)) {
|
||||
fprintf(f, "first = 0x%X\n", c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 8 bits per trie value. Default is 0, which means pass-through.
|
||||
// That is, the lookup key isn't actually a UChar32 but a UChar32
|
||||
// divided by 8, but that's still in range, so things work despite
|
||||
// the data structure not being meant to be used like this.
|
||||
LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
|
||||
|
||||
for (int32_t i = 0; i < ((0x10FFFF + 1)/8); ++i) {
|
||||
uint32_t trieVal = 0;
|
||||
for (int32_t j = 0; j < 8; ++j) {
|
||||
UChar32 c = i*8 + j;
|
||||
if (!uset_contains(potentialPassthroughAndNotBackwardCombining, c)) {
|
||||
trieVal |= (1 << j);
|
||||
}
|
||||
}
|
||||
if (trieVal) {
|
||||
umutablecptrie_set(builder.getAlias(), UChar32(i), trieVal, status);
|
||||
}
|
||||
}
|
||||
|
||||
LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
|
||||
builder.getAlias(),
|
||||
trieType,
|
||||
UCPTRIE_VALUE_BITS_8,
|
||||
status));
|
||||
handleError(status, basename);
|
||||
|
||||
fprintf(f, "[trie]\n");
|
||||
usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
|
||||
|
||||
usrc_writeUnicodeSet(f, potentialPassthroughAndNotBackwardCombining, UPRV_TARGET_SYNTAX_TOML);
|
||||
fclose(f);
|
||||
handleError(status, basename);
|
||||
}
|
||||
|
@ -619,15 +673,29 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
|
|||
bool startsWithNonStarter = u_getCombiningClass(utf32[0]);
|
||||
if (startsWithNonStarter) {
|
||||
uset_add(decompositionStartsWithNonStarter, c);
|
||||
if (src != dst && !(c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F)) {
|
||||
// A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
} else if (uset_contains(backwardCombiningStarters, c)) {
|
||||
uset_add(decompositionStartsWithBackwardCombiningStarter, c);
|
||||
}
|
||||
if (c != 2 && len == 1 && utf32[0] == 2) {
|
||||
// 2 is reserved as a marker for decomposition starts with non-starter.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
if (mainNormalizer != nfdNormalizer) {
|
||||
UnicodeString nfd;
|
||||
nfdNormalizer->normalize(src, nfd, status);
|
||||
if (dst == nfd) {
|
||||
continue;
|
||||
}
|
||||
} else if (startsWithNonStarter) {
|
||||
// Insert a special marker
|
||||
len = 1;
|
||||
utf32[0] = 2; // magic value (1 is reserved for U+FDFA)
|
||||
} else {
|
||||
if (src == dst) {
|
||||
continue;
|
||||
|
@ -681,24 +749,38 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
|
|||
}
|
||||
}
|
||||
}
|
||||
if (startsWithNonStarter && !(c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F)) {
|
||||
// A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
if (len == 1 && utf32[0] <= 0xFFFF) {
|
||||
if (utf32[0] == 1) {
|
||||
// 1 is reserved as a marker for the expansion of U+FDFA.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, FALSE});
|
||||
// U+0345 is hard-coded in ICU4X
|
||||
if (!(c == 0x0345 && utf32[0] == 0x03B9)) {
|
||||
pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, FALSE});
|
||||
}
|
||||
} else if (len == 2 && utf32[0] <= 0xFFFF && utf32[1] <= 0xFFFF && !u_getCombiningClass(utf32[0]) && u_getCombiningClass(utf32[1])) {
|
||||
for (int32_t i = 0; i < len; ++i) {
|
||||
if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
|
||||
// Assert that iota subscript and half-width voicing marks never occur in these
|
||||
// expansions in the normalization forms where they are special.
|
||||
printf("HER c: %X\n", c);
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
}
|
||||
pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), FALSE});
|
||||
} else {
|
||||
UBool supplementary = FALSE;
|
||||
UBool nonInitialStarter = FALSE;
|
||||
for (int32_t i = 0; i < len; ++i) {
|
||||
if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
|
||||
// Assert that iota subscript and half-width voicing marks never occur in these
|
||||
// expansions in the normalization forms where they are special.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, basename);
|
||||
}
|
||||
|
||||
if (utf32[i] > 0xFFFF) {
|
||||
supplementary = TRUE;
|
||||
}
|
||||
|
@ -1100,6 +1182,8 @@ int exportNorm() {
|
|||
std::vector<uint16_t> storage16;
|
||||
std::vector<uint32_t> storage32;
|
||||
|
||||
// Note: the USets are not exported. They are only used to check that a new
|
||||
// Unicode version doesn't violate expectations that are hard-coded in ICU4X.
|
||||
USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
|
||||
USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
|
||||
std::vector<PendingDescriptor> nfdPendingTrieInsertions;
|
||||
|
@ -1139,6 +1223,8 @@ int exportNorm() {
|
|||
USet* uts46PotentialPassthroughAndNotBackwardCombining = uset_openEmpty();
|
||||
writePotentialCompositionPassThrough("uts46", nullptr, uts46DecompositionStartsWithNonStarter, uts46DecompositionStartsWithBackwardCombiningStarter, uts46PotentialPassthroughAndNotBackwardCombining);
|
||||
|
||||
writeNopCompositionPassThrough("passthroughnop");
|
||||
|
||||
// Check that NFKC set has no characters that NFC doesn't also have.
|
||||
uset_removeAll(nfkcPotentialPassthroughAndNotBackwardCombining, nfcPotentialPassthroughAndNotBackwardCombining);
|
||||
if (!uset_isEmpty(nfkcPotentialPassthroughAndNotBackwardCombining)) {
|
||||
|
|
Loading…
Add table
Reference in a new issue