ICU-22757 Remove allow list of known contractions with precomposed form from ICU4X mode of genuca

This assumes that future cases will also be OK to ignore in their precomposed form, since the
addition that was seen in the Unicode 16 alpha was OK to ignore.
This commit is contained in:
Henri Sivonen 2024-05-08 11:25:50 +03:00 committed by Markus Scherer
parent 564c92d666
commit f5056cb46a
4 changed files with 16 additions and 28 deletions

View file

@ -23,10 +23,7 @@ rm $ICU_SRC/icu4c/source/common/propname_data.h
rm $ICU_SRC/icu4c/source/common/*_props_data.h
rm $ICU4C_DATA_IN/*.icu
rm $ICU4C_DATA_IN/*.nrm
# TODO: Back to deleting coll/*.icu once ICU4X data generation is fixed.
# rm $ICU4C_DATA_IN/coll/*.icu
rm $ICU4C_DATA_IN/coll/ucadata-implicithan.icu
rm $ICU4C_DATA_IN/coll/ucadata-unihan.icu
rm $ICU4C_DATA_IN/coll/*.icu
# icu4c/source/i18n/collationfcd.cpp is generated by genuca;
# probably hard to build genuca without depending on the old version.
@ -49,6 +46,5 @@ bazelisk run //tools/unicode/c/genprops $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --hanOrder implicit $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --hanOrder radical-stroke $ICU_SRC/icu4c
# Also generate the ICU4X versions
# TODO: Currently fails with early Unicode 16.0 FractionalUCA.txt.
# bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c
# bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c

View file

@ -586,27 +586,19 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
if (s != sInNfd) {
// s is not in NFD, so it cannot match in ICU4X, since ICU4X only
// does NFD lookups.
// Now check that we're only rejecting known cases.
if (s.length() == 2) {
char16_t second = s.charAt(1);
if (second == 0x0F73 || second == 0x0F75 || second == 0x0F81) {
// Second is a special decomposing Tibetan vowel sign.
// These also get added in the decomposed form, so ignoring
// this instance is OK.
return;
}
if (c == 0xFDD1 && second == 0xAC00) {
// This strange contraction exists in the root and
// doesn't have a decomposed counterpart there.
// This won't match in ICU4X anyway and is very strange:
// Unassigned Arabic presentation form contracting with
// the very first Hangul syllable. Let's ignore this
// explicitly.
return;
}
}
// Unknown case worth investigating if ever found.
errorCode = U_UNSUPPORTED_ERROR;
// As of Unicode 16 alpha, the cases that come here are:
//
// 1. The second character is a special decomposing Tibetan vowel
// sign. These are OK to ignore in the precomposed form, since
// the decomposed form is added also.
// 2. Likewise for KIRAT RAI VOWEL SIGN AA followed by KIRAT RAI VOWEL SIGN AI
// and other such cases.
// For details see the normalization section of
// https://www.unicode.org/review/pri497/pri497-background.html
// 3. U+FDD1 followed by U+AC00 is a marker for the alphabetical
// index feature of ICU4C, which at this time does not have
// a counterpart in ICU4X.
return;
}