mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-22968 Rearrange bits in trie values in normalization data export for ICU4X
This commit is contained in:
parent
1b8118049f
commit
494e8cdc93
1 changed files with 133 additions and 199 deletions
|
@ -489,12 +489,29 @@ FILE* prepareOutputFile(const char* basename) {
|
|||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
struct PendingDescriptor {
|
||||
class PendingDescriptor {
|
||||
public:
|
||||
UChar32 scalar;
|
||||
uint32_t descriptor;
|
||||
uint32_t descriptorOrFlags;
|
||||
// If false, we use the above fields only. If true, descriptorOrFlags only
|
||||
// contains the two highest-bit flags and the rest is computed later
|
||||
// from the fields below.
|
||||
UBool complex;
|
||||
UBool supplementary;
|
||||
UBool onlyNonStartersInTrail;
|
||||
uint32_t len;
|
||||
uint32_t offset;
|
||||
|
||||
PendingDescriptor(UChar32 scalar, uint32_t descriptor);
|
||||
PendingDescriptor(UChar32 scalar, uint32_t flags, UBool supplementary, UBool onlyNonStartersInTrail, uint32_t len, uint32_t offset);
|
||||
};
|
||||
|
||||
PendingDescriptor::PendingDescriptor(UChar32 scalar, uint32_t descriptor)
|
||||
: scalar(scalar), descriptorOrFlags(descriptor), complex(false), supplementary(false), onlyNonStartersInTrail(false), len(0), offset(0) {}
|
||||
|
||||
PendingDescriptor::PendingDescriptor(UChar32 scalar, uint32_t flags, UBool supplementary, UBool onlyNonStartersInTrail, uint32_t len, uint32_t offset)
|
||||
: scalar(scalar), descriptorOrFlags(flags), complex(true), supplementary(supplementary), onlyNonStartersInTrail(onlyNonStartersInTrail), len(len), offset(offset) {}
|
||||
|
||||
void writeCanonicalCompositions(USet* backwardCombiningStarters) {
|
||||
IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
|
||||
const char* basename = "compositions";
|
||||
|
@ -557,21 +574,18 @@ void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_
|
|||
fclose(f);
|
||||
}
|
||||
|
||||
void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, char16_t passthroughCap) {
|
||||
IcuToolErrorCode status("icuexportdata: writeDecompositionData");
|
||||
FILE* f = prepareOutputFile(basename);
|
||||
|
||||
// Zero is a magic number that means the character decomposes to itself.
|
||||
LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
|
||||
|
||||
void pendingInsertionsToTrie(const char* basename, UMutableCPTrie* trie, const std::vector<PendingDescriptor>& pendingTrieInsertions, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16) {
|
||||
IcuToolErrorCode status("icuexportdata: pendingInsertionsToTrie");
|
||||
// Iterate backwards to insert lower code points in the trie first in case it matters
|
||||
// for trie block allocation.
|
||||
for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
|
||||
const PendingDescriptor& pending = pendingTrieInsertions[i];
|
||||
uint32_t additional = 0;
|
||||
if (!(pending.descriptor & 0xFFFC0000)) {
|
||||
uint32_t offset = pending.descriptor & 0xFFF;
|
||||
if (pending.complex) {
|
||||
uint32_t additional = 0;
|
||||
uint32_t offset = pending.offset;
|
||||
uint32_t len = pending.len;
|
||||
if (!pending.supplementary) {
|
||||
len -= 2;
|
||||
if (offset >= baseSize16) {
|
||||
// This is an offset to supplementary 16-bit data. We have
|
||||
// 16-bit base data and 32-bit base data before. However,
|
||||
|
@ -579,6 +593,7 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t
|
|||
additional = baseSize32;
|
||||
}
|
||||
} else {
|
||||
len -= 1;
|
||||
if (offset >= baseSize32) {
|
||||
// This is an offset to supplementary 32-bit data. We have 16-bit
|
||||
// base data, 32-bit base data, and 16-bit supplementary data before.
|
||||
|
@ -591,21 +606,55 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t
|
|||
additional = baseSize16;
|
||||
}
|
||||
}
|
||||
// +1 to make offset always non-zero
|
||||
offset += 1;
|
||||
if (offset + additional > 0xFFF) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
if (len > 7) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
umutablecptrie_set(trie, pending.scalar, pending.descriptorOrFlags | (uint32_t(pending.onlyNonStartersInTrail) << 4) | len | (offset + additional) << 16, status);
|
||||
} else {
|
||||
umutablecptrie_set(trie, pending.scalar, pending.descriptorOrFlags, status);
|
||||
}
|
||||
// It turns out it's better to swap the halves compared to the initial
|
||||
// idea in order to put special marker values close to zero so that
|
||||
// an important marker value becomes 1, so it's efficient to compare
|
||||
// "1 or 0". Unfortunately, going through all the code to swap
|
||||
// things is too error prone, so let's do the swapping here in one
|
||||
// place.
|
||||
uint32_t oldTrieValue = pending.descriptor + additional;
|
||||
uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16);
|
||||
umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status);
|
||||
}
|
||||
}
|
||||
|
||||
/// Marker that the decomposition does not round trip via NFC.
|
||||
const uint32_t NON_ROUND_TRIP_MASK = (1 << 30);
|
||||
|
||||
/// Marker that the first character of the decomposition can combine
|
||||
/// backwards.
|
||||
const uint32_t BACKWARD_COMBINING_MASK = (1 << 31);
|
||||
|
||||
void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, const std::vector<PendingDescriptor>& nfdPendingTrieInsertions, char16_t passthroughCap) {
|
||||
IcuToolErrorCode status("icuexportdata: writeDecompositionData");
|
||||
FILE* f = prepareOutputFile(basename);
|
||||
|
||||
// Zero is a magic number that means the character decomposes to itself.
|
||||
LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
|
||||
|
||||
if (uprv_strcmp(basename, "uts46d") != 0) {
|
||||
// Make surrogates decompose to U+FFFD. Don't do this for UTS 46, since this
|
||||
// optimization is only used by the UTF-16 slice mode, and UTS 46 is not
|
||||
// supported in slice modes (which do not support ignorables).
|
||||
// Mark these as potentially backward-combining, to make lead surrogates
|
||||
// for non-BMP characters that are backward-combining count as
|
||||
// backward-combining just in case, though the backward-combiningness
|
||||
// is not actually being looked at today.
|
||||
umutablecptrie_setRange(builder.getAlias(), 0xD800, 0xDFFF, NON_ROUND_TRIP_MASK | BACKWARD_COMBINING_MASK | 0xFFFD, status);
|
||||
}
|
||||
|
||||
// Add a marker value for Hangul syllables
|
||||
umutablecptrie_setRange(builder.getAlias(), 0xAC00, 0xD7A3, 1, status);
|
||||
|
||||
// First put the NFD data in the trie, to be partially overwritten in the NFKD and UTS 46 cases.
|
||||
// This is easier than changing the logic that computes the pending insertions.
|
||||
pendingInsertionsToTrie(basename, builder.getAlias(), nfdPendingTrieInsertions, baseSize16, baseSize32, supplementSize16);
|
||||
pendingInsertionsToTrie(basename, builder.getAlias(), pendingTrieInsertions, baseSize16, baseSize32, supplementSize16);
|
||||
LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
|
||||
builder.getAlias(),
|
||||
trieType,
|
||||
|
@ -613,6 +662,7 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t
|
|||
status));
|
||||
handleError(status, __LINE__, basename);
|
||||
|
||||
// The ICU4X side has changed enough that this whole block of expectation checking might be better removed.
|
||||
if (reference) {
|
||||
if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
|
||||
// NFD expectations don't hold. The set must not contain the half-width
|
||||
|
@ -628,13 +678,9 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t
|
|||
USet* iotaSubscript = uset_openEmpty();
|
||||
uset_add(iotaSubscript, 0x0345);
|
||||
|
||||
uint8_t flags = 0;
|
||||
|
||||
USet* halfWidthCheck = uset_cloneAsThawed(uset);
|
||||
uset_removeAll(halfWidthCheck, reference);
|
||||
if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
|
||||
flags |= 1;
|
||||
} else if (!uset_isEmpty(halfWidthCheck)) {
|
||||
if (!uset_equals(halfWidthCheck, halfWidthVoicing) && !uset_isEmpty(halfWidthCheck)) {
|
||||
// The result was neither empty nor contained exactly
|
||||
// the two half-width voicing marks. The ICU4X
|
||||
// normalizer doesn't know how to deal with this case.
|
||||
|
@ -655,72 +701,14 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t
|
|||
|
||||
uset_close(iotaSubscript);
|
||||
uset_close(halfWidthVoicing);
|
||||
|
||||
fprintf(f, "flags = 0x%X\n", flags);
|
||||
fprintf(f, "cap = 0x%X\n", passthroughCap);
|
||||
}
|
||||
fprintf(f, "cap = 0x%X\n", passthroughCap);
|
||||
fprintf(f, "[trie]\n");
|
||||
usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
|
||||
fclose(f);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
|
||||
// Special marker for the NFKD form of U+FDFA
|
||||
const int32_t FDFA_MARKER = 3;
|
||||
|
||||
// Special marker for characters whose decomposition starts with a non-starter
|
||||
// and the decomposition isn't the character itself.
|
||||
const int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2;
|
||||
|
||||
// Special marker for starters that decompose to themselves but that may
|
||||
// combine backwards under canonical composition
|
||||
const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1;
|
||||
|
||||
/// Marker that a complex decomposition isn't round-trippable
|
||||
/// under re-composition.
|
||||
///
|
||||
/// TODO: When taking a data format break, swap this around with
|
||||
/// `BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER`.
|
||||
const uint32_t NON_ROUND_TRIP_MARKER = 1;
|
||||
|
||||
/// Marker that a complex decomposition starts with a starter
|
||||
/// that can combine backwards.
|
||||
///
|
||||
/// TODO: When taking a data format break, swap this around with
|
||||
/// `NON_ROUND_TRIP_MARKER` to use the same bit as with characters
|
||||
/// that decompose to self but can combine backwards.
|
||||
const uint32_t BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER = 2;
|
||||
|
||||
UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) {
|
||||
if (knownToRoundTrip) {
|
||||
return true;
|
||||
}
|
||||
// Nuktas, Hebrew presentation forms and polytonic Greek with oxia
|
||||
// are special-cased in ICU4X.
|
||||
if (c >= 0xFB1D && c <= 0xFB4E) {
|
||||
// Hebrew presentation forms
|
||||
return true;
|
||||
}
|
||||
if (c >= 0x1F71 && c <= 0x1FFB) {
|
||||
// Polytonic Greek with oxia
|
||||
return true;
|
||||
}
|
||||
if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) {
|
||||
// Nukta
|
||||
return true;
|
||||
}
|
||||
// To avoid more branchiness, 4 characters that decompose to
|
||||
// a BMP starter followed by a BMP non-starter are excluded
|
||||
// from being encoded directly into the trie value and are
|
||||
// handled as complex decompositions instead. These are:
|
||||
// U+0F76 TIBETAN VOWEL SIGN VOCALIC R
|
||||
// U+0F78 TIBETAN VOWEL SIGN VOCALIC L
|
||||
// U+212B ANGSTROM SIGN
|
||||
// U+2ADC FORKING
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Find the slice `needle` within `storage` and return its index, failing which,
|
||||
// append all elements of `needle` to `storage` and return the index of it at the end.
|
||||
template<typename T>
|
||||
|
@ -749,6 +737,8 @@ size_t findOrAppend(std::vector<T>& storage, const UChar32* needle, size_t needl
|
|||
|
||||
|
||||
// Computes data for canonical decompositions
|
||||
// See components/normalizer/trie-value-format.md in the ICU4X repo
|
||||
// for documentation of the trie value format.
|
||||
void computeDecompositions(const char* basename,
|
||||
const USet* backwardCombiningStarters,
|
||||
std::vector<uint16_t>& storage16,
|
||||
|
@ -814,12 +804,23 @@ void computeDecompositions(const char* basename,
|
|||
// Surrogate
|
||||
continue;
|
||||
}
|
||||
if (c == 0xFFFD) {
|
||||
// REPLACEMENT CHARACTER
|
||||
// This character is a starter that decomposes to self,
|
||||
// so without a special case here it would end up as
|
||||
// passthrough-eligible in all normalization forms.
|
||||
// However, in the potentially-ill-formed UTF-8 case
|
||||
// UTF-8 errors return U+FFFD from the iterator, and
|
||||
// errors need to be treated as ineligible for
|
||||
// passthrough on the slice fast path. Giving
|
||||
// U+FFFD a trie value whose flags make it ineligible
|
||||
// for passthrough avoids a specific U+FFFD branch on
|
||||
// the passthrough fast path.
|
||||
pendingTrieInsertions.push_back({c, NON_ROUND_TRIP_MASK | BACKWARD_COMBINING_MASK});
|
||||
continue;
|
||||
}
|
||||
UnicodeString src;
|
||||
UnicodeString dst;
|
||||
// True if we're building non-NFD or we're building NFD but
|
||||
// the `c` round trips to NFC.
|
||||
// False if we're building NFD and `c` does not round trip to NFC.
|
||||
UBool nonNfdOrRoundTrips = true;
|
||||
src.append(c);
|
||||
if (mainNormalizer != nfdNormalizer) {
|
||||
UnicodeString inter;
|
||||
|
@ -827,39 +828,12 @@ void computeDecompositions(const char* basename,
|
|||
nfdNormalizer->normalize(inter, dst, status);
|
||||
} else {
|
||||
nfdNormalizer->normalize(src, dst, status);
|
||||
UnicodeString nfc;
|
||||
nfcNormalizer->normalize(dst, nfc, status);
|
||||
nonNfdOrRoundTrips = (src == nfc);
|
||||
}
|
||||
if (uts46) {
|
||||
// Work around https://unicode-org.atlassian.net/browse/ICU-22658
|
||||
// TODO: Remove the workaround after data corresponding to
|
||||
// https://www.unicode.org/L2/L2024/24061.htm#179-C36 lands
|
||||
// for Unicode 16.
|
||||
switch (c) {
|
||||
case 0x2F868:
|
||||
dst.truncate(0);
|
||||
dst.append(static_cast<UChar32>(0x36FC));
|
||||
break;
|
||||
case 0x2F874:
|
||||
dst.truncate(0);
|
||||
dst.append(static_cast<UChar32>(0x5F53));
|
||||
break;
|
||||
case 0x2F91F:
|
||||
dst.truncate(0);
|
||||
dst.append(static_cast<UChar32>(0x243AB));
|
||||
break;
|
||||
case 0x2F95F:
|
||||
dst.truncate(0);
|
||||
dst.append(static_cast<UChar32>(0x7AEE));
|
||||
break;
|
||||
case 0x2F9BF:
|
||||
dst.truncate(0);
|
||||
dst.append(static_cast<UChar32>(0x45D7));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeString nfc;
|
||||
nfcNormalizer->normalize(dst, nfc, status);
|
||||
UBool roundTripsViaCanonicalComposition = (src == nfc);
|
||||
|
||||
int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
|
||||
|
||||
if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
|
||||
|
@ -880,7 +854,7 @@ void computeDecompositions(const char* basename,
|
|||
compositionPassthroughBound = c;
|
||||
uset_add(decompositionStartsWithNonStarter, c);
|
||||
if (src != dst) {
|
||||
if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) {
|
||||
if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || (c == 0xFF9E && utf32[0] == 0x3099) || (c == 0xFF9F && utf32[0] == 0x309A)) {
|
||||
specialNonStarterDecomposition = true;
|
||||
} else {
|
||||
// A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
|
||||
|
@ -893,18 +867,6 @@ void computeDecompositions(const char* basename,
|
|||
startsWithBackwardCombiningStarter = true;
|
||||
uset_add(decompositionStartsWithBackwardCombiningStarter, c);
|
||||
}
|
||||
if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
if (mainNormalizer != nfdNormalizer) {
|
||||
UnicodeString nfd;
|
||||
nfdNormalizer->normalize(src, nfd, status);
|
||||
|
@ -913,24 +875,29 @@ void computeDecompositions(const char* basename,
|
|||
}
|
||||
decompositionPassthroughBound = c;
|
||||
compositionPassthroughBound = c;
|
||||
} else if (firstCombiningClass) {
|
||||
}
|
||||
if (firstCombiningClass) {
|
||||
len = 1;
|
||||
if (specialNonStarterDecomposition) {
|
||||
utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value
|
||||
// Special marker
|
||||
pendingTrieInsertions.push_back({c, NON_ROUND_TRIP_MASK | BACKWARD_COMBINING_MASK | 0xD900 | u_getCombiningClass(c)});
|
||||
} else {
|
||||
// Use the surrogate range to store the canonical combining class
|
||||
utf32[0] = 0xD800 | static_cast<UChar32>(firstCombiningClass);
|
||||
// XXX: Should non-starters that decompose to self be marked as non-round-trippable in
|
||||
// case such semantics turn out to be more useful for `NON_ROUND_TRIP_MASK`?
|
||||
pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_MASK | 0xD800 | static_cast<uint32_t>(firstCombiningClass)});
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
if (src == dst) {
|
||||
if (startsWithBackwardCombiningStarter) {
|
||||
pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false});
|
||||
pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_MASK});
|
||||
}
|
||||
continue;
|
||||
}
|
||||
decompositionPassthroughBound = c;
|
||||
// ICU4X hard-codes ANGSTROM SIGN
|
||||
if (c != 0x212B) {
|
||||
if (c != 0x212B && mainNormalizer == nfdNormalizer) {
|
||||
UnicodeString raw;
|
||||
if (!nfdNormalizer->getRawDecomposition(c, raw)) {
|
||||
// We're always supposed to have a non-recursive decomposition
|
||||
|
@ -978,7 +945,7 @@ void computeDecompositions(const char* basename,
|
|||
}
|
||||
}
|
||||
}
|
||||
if (!nonNfdOrRoundTrips) {
|
||||
if (!roundTripsViaCanonicalComposition) {
|
||||
compositionPassthroughBound = c;
|
||||
}
|
||||
if (!len) {
|
||||
|
@ -986,7 +953,7 @@ void computeDecompositions(const char* basename,
|
|||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
pendingTrieInsertions.push_back({c, 0xFFFFFFFF, false});
|
||||
pendingTrieInsertions.push_back({c, uint32_t(0xFFFFFFFF)});
|
||||
} else if (len == 1 && ((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) {
|
||||
// Singleton decompositions to conjoining jamo.
|
||||
if (mainNormalizer == nfdNormalizer) {
|
||||
|
@ -994,16 +961,18 @@ void computeDecompositions(const char* basename,
|
|||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) << 16, false});
|
||||
pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) | NON_ROUND_TRIP_MASK | (startsWithBackwardCombiningStarter ? BACKWARD_COMBINING_MASK : 0)});
|
||||
} else if (!startsWithBackwardCombiningStarter && len == 1 && utf32[0] <= 0xFFFF) {
|
||||
pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) << 16, false});
|
||||
} else if (!startsWithBackwardCombiningStarter &&
|
||||
pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) | NON_ROUND_TRIP_MASK | (startsWithBackwardCombiningStarter ? BACKWARD_COMBINING_MASK : 0)});
|
||||
} else if (c != 0x212B && // ANGSTROM SIGN is special to make the Harfbuzz case branch less in the more common case.
|
||||
!startsWithBackwardCombiningStarter &&
|
||||
len == 2 &&
|
||||
utf32[0] <= 0xFFFF &&
|
||||
utf32[1] <= 0xFFFF &&
|
||||
utf32[0] <= 0x7FFF &&
|
||||
utf32[1] <= 0x7FFF &&
|
||||
utf32[0] > 0x1F &&
|
||||
utf32[1] > 0x1F &&
|
||||
!u_getCombiningClass(utf32[0]) &&
|
||||
u_getCombiningClass(utf32[1]) &&
|
||||
permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) {
|
||||
u_getCombiningClass(utf32[1])) {
|
||||
for (int32_t i = 0; i < len; ++i) {
|
||||
if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
|
||||
// Assert that iota subscript and half-width voicing marks never occur in these
|
||||
|
@ -1012,7 +981,7 @@ void computeDecompositions(const char* basename,
|
|||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
}
|
||||
pendingTrieInsertions.push_back({c, (static_cast<uint32_t>(utf32[0]) << 16) | static_cast<uint32_t>(utf32[1]), false});
|
||||
pendingTrieInsertions.push_back({c, static_cast<uint32_t>(utf32[0]) | (static_cast<uint32_t>(utf32[1]) << 15) | (roundTripsViaCanonicalComposition ? 0 : NON_ROUND_TRIP_MASK)});
|
||||
} else {
|
||||
UBool supplementary = false;
|
||||
UBool nonInitialStarter = false;
|
||||
|
@ -1046,73 +1015,38 @@ void computeDecompositions(const char* basename,
|
|||
if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
|
||||
if (len == 18 && c == 0xFDFA) {
|
||||
// Special marker for the one character whose decomposition
|
||||
// is too long.
|
||||
pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary});
|
||||
// is too long. (Too long even if we took the fourth bit into use!)
|
||||
pendingTrieInsertions.push_back({c, NON_ROUND_TRIP_MASK | 1});
|
||||
continue;
|
||||
} else {
|
||||
// Note: There's a fourth bit available, but let's error out
|
||||
// if it's ever needed so that it doesn't get used without
|
||||
// updating docs.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
}
|
||||
} else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
|
||||
// Note: There's a fourth bit available, but let's error out
|
||||
// if it's ever needed so that it doesn't get used without
|
||||
// updating docs.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
// Complex decomposition
|
||||
// Format for 16-bit value:
|
||||
// 15..13: length minus two for 16-bit case and length minus one for
|
||||
// the 32-bit case. Length 8 needs to fit in three bits in
|
||||
// the 16-bit case, and this way the value is future-proofed
|
||||
// up to 9 in the 16-bit case. Zero is unused and length one
|
||||
// in the 16-bit case goes directly into the trie.
|
||||
// 12: 1 if all trailing characters are guaranteed non-starters,
|
||||
// 0 if no guarantees about non-starterness.
|
||||
// Note: The bit choice is this way around to allow for
|
||||
// dynamically falling back to not having this but instead
|
||||
// having one more bit for length by merely choosing
|
||||
// different masks.
|
||||
// 11..0: Start offset in storage. The offset is to the logical
|
||||
// sequence of scalars16, scalars32, supplementary_scalars16,
|
||||
// supplementary_scalars32.
|
||||
uint32_t descriptor = static_cast<uint32_t>(!nonInitialStarter) << 12;
|
||||
if (!supplementary) {
|
||||
descriptor |= (static_cast<uint32_t>(len) - 2) << 13;
|
||||
} else {
|
||||
descriptor |= (static_cast<uint32_t>(len) - 1) << 13;
|
||||
}
|
||||
if (descriptor & 0xFFF) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
|
||||
size_t index = 0;
|
||||
if (!supplementary) {
|
||||
index = findOrAppend(storage16, utf32, len);
|
||||
} else {
|
||||
index = findOrAppend(storage32, utf32, len);
|
||||
}
|
||||
if (index > 0xFFF) {
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
descriptor |= static_cast<uint32_t>(index);
|
||||
if (!descriptor || descriptor > 0xFFFF) {
|
||||
// > 0xFFFF should never happen if the code above is correct.
|
||||
// == 0 should not happen due to the nature of the data.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
handleError(status, __LINE__, basename);
|
||||
}
|
||||
uint32_t nonRoundTripMarker = 0;
|
||||
if (!nonNfdOrRoundTrips) {
|
||||
nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16);
|
||||
}
|
||||
uint32_t canCombineBackwardsMarker = 0;
|
||||
if (startsWithBackwardCombiningStarter) {
|
||||
canCombineBackwardsMarker = (BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER << 16);
|
||||
}
|
||||
pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker | canCombineBackwardsMarker, supplementary});
|
||||
pendingTrieInsertions.push_back({c, (startsWithBackwardCombiningStarter ? BACKWARD_COMBINING_MASK : 0) | (roundTripsViaCanonicalComposition ? 0 : NON_ROUND_TRIP_MASK), supplementary, !nonInitialStarter, uint32_t(len), uint32_t(index)});
|
||||
}
|
||||
}
|
||||
if (storage16.size() + storage32.size() > 0xFFF) {
|
||||
// We actually have 14 bits available, but let's error out so
|
||||
// that docs can be updated when taking a reserved bit out of
|
||||
// potential future flag usage.
|
||||
status.set(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
if (f) {
|
||||
|
@ -1489,9 +1423,9 @@ int exportNorm() {
|
|||
uint32_t supplementSize16 = storage16.size() - baseSize16;
|
||||
uint32_t supplementSize32 = storage32.size() - baseSize32;
|
||||
|
||||
writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, static_cast<char16_t>(nfcBound));
|
||||
writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, static_cast<char16_t>(nfkcBound));
|
||||
writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, static_cast<char16_t>(uts46Bound));
|
||||
writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, nfdPendingTrieInsertions, static_cast<char16_t>(nfcBound));
|
||||
writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, nfdPendingTrieInsertions, static_cast<char16_t>(nfkcBound));
|
||||
writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, nfdPendingTrieInsertions, static_cast<char16_t>(uts46Bound));
|
||||
|
||||
writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
|
||||
writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
|
||||
|
|
Loading…
Add table
Reference in a new issue