ICU-23052 Fix addLikelySubtags

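Remove the hard-coded "und_Latn" special-case entries from the conversion tool (LikelySubtagsBuilder) by fixing the lookup fallback in the LikelySubtags code (C++ and Java) instead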

ICU-23052 Assert the value will never be 0
This commit is contained in:
Frank Tang 2025-03-19 17:29:15 -07:00
parent ee90520429
commit 2d1c3ed684
5 changed files with 2083 additions and 2049 deletions

View file

@ -715,13 +715,29 @@ LSR LikelySubtags::maximize(StringPiece language, StringPiece script, StringPiec
} else {
iter.resetToState64(state);
value = trieNext(iter, "", 0);
U_ASSERT(value > 0);
U_ASSERT(value != 0);
// For the case of und_Latn
if (value < 0) {
retainLanguage = !language.empty();
retainScript = !script.empty();
retainRegion = !region.empty();
// Fallback to und_$region =>
iter.resetToState64(trieUndState); // "und" ("*")
value = trieNext(iter, "", 0);
U_ASSERT(value == 0);
int64_t trieUndEmptyState = iter.getState64();
value = trieNext(iter, region, 0);
// Fallback to und =>
if (value < 0) {
iter.resetToState64(trieUndEmptyState);
value = trieNext(iter, "", 0);
U_ASSERT(value > 0);
}
}
}
}
}
U_ASSERT(value < lsrsLength);
const LSR &matched = lsrs[value];
if (returnInputIfUnmatch &&
(!(matchLanguage || matchScript || (matchRegion && language.empty())))) {
return LSR("", "", "", LSR::EXPLICIT_LSR, errorCode); // no matching.
@ -731,18 +747,23 @@ LSR LikelySubtags::maximize(StringPiece language, StringPiece script, StringPiec
}
if (!(retainLanguage || retainScript || retainRegion)) {
U_ASSERT(value >= 0);
// Quickly return a copy of the lookup-result LSR
// without new allocation of the subtags.
const LSR &matched = lsrs[value];
return LSR(matched.language, matched.script, matched.region, matched.flags);
}
if (!retainLanguage) {
language = matched.language;
U_ASSERT(value >= 0);
language = lsrs[value].language;
}
if (!retainScript) {
script = matched.script;
U_ASSERT(value >= 0);
script = lsrs[value].script;
}
if (!retainRegion) {
region = matched.region;
U_ASSERT(value >= 0);
region = lsrs[value].region;
}
int32_t retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
// retainOldMask flags = LSR explicit-subtag flags

File diff suppressed because it is too large Load diff

View file

@ -355,11 +355,27 @@ public final class LikelySubtags {
} else {
iter.resetToState64(state);
value = trieNext(iter, "", 0);
assert value > 0;
assert value != 0;
if (value < 0) {
retainLanguage = !language.isEmpty();
retainScript = !script.isEmpty();
retainRegion = !region.isEmpty();
// Fallback to und_$region =>
iter.resetToState64(trieUndState); // "und" ("*")
value = trieNext(iter, "", 0);
assert value == 0;
long trieUndEmptyState = iter.getState64();
value = trieNext(iter, region, 0);
// Fallback to und =>
if (value < 0) {
iter.resetToState64(trieUndEmptyState);
value = trieNext(iter, "", 0);
assert value > 0;
}
}
}
}
}
LSR result = lsrs[value];
if (returnInputIfUnmatch &&
(!(matchLanguage || matchScript || (matchRegion && language.isEmpty())))) {
@ -370,17 +386,21 @@ public final class LikelySubtags {
}
if (! (retainLanguage || retainScript || retainRegion)) {
assert result.flags == LSR.IMPLICIT_LSR;
return result;
assert value >= 0;
assert lsrs[value].flags == LSR.IMPLICIT_LSR;
return lsrs[value];
}
if (!retainLanguage) {
language = result.language;
assert value >= 0;
language = lsrs[value].language;
}
if (!retainScript) {
script = result.script;
assert value >= 0;
script = lsrs[value].script;
}
if (!retainRegion) {
region = result.region;
assert value >= 0;
region = lsrs[value].region;
}
int retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
// retainOldMask flags = LSR explicit-subtag flags

View file

@ -268,12 +268,6 @@ final class LikelySubtagsBuilder {
}
});
// Add the special case for "und-Latn" => "en-Latn-US" (which is a bit of a
// hack for language matching).
// Temporary patch. Needs an update of the ICU algorithm to match CLDR.
// See https://unicode-org.atlassian.net/browse/ICU-23052
set(lsrTable, "und", "Latn", "", lsr("en", "Latn", "US"));
set(lsrTable, "und", "Latn", "RS", lsr("sr", "Latn", "RS"));
logger.fine(lsrTable::toString);
// Ensure that if "und-RR" => "ll-Ssss-RR", then we also add "Ssss" => "RR".
@ -293,10 +287,11 @@ final class LikelySubtagsBuilder {
// Check that every level has "*" (mapped from "und" or "").
lsrTable.forEach((lang, scripts) -> {
checkArgument(scripts.containsKey("*"), "missing likely subtag mapping for: %s", asLocale(lang));
checkArgument(asLocale(lang).equals("und_Latn") || scripts.containsKey("*"), "missing likely subtag mapping for: %s", asLocale(lang));
scripts.forEach(
(script, regions) -> checkArgument(regions.containsKey("*"),
"missing likely subtag mapping for: %s", asLocale(lang, script)));
(script, regions) -> checkArgument(
(asLocale(lang, script).equals("und_Latn")) || regions.containsKey("*"),
"missing likely subtag mapping for: %s", asLocale(lang, script)));
});
return lsrTable;
}