ICU-22742 Fix handling of XA, XB, XC for addLikelySubtags

Add more tests.

ICU-22742 Add PSACCENT, PSBIDI and PSCRACK variants

ICU-22742 Add java tests

ICU-22742 extend tests

ICU-22742 wrap java
This commit is contained in:
Frank Tang 2024-04-19 13:45:32 -07:00 committed by Mihai Nita
parent a91cbd6578
commit 3aa8b8c5ee
6 changed files with 345 additions and 103 deletions

View file

@ -564,47 +564,40 @@ LSR LikelySubtags::makeMaximizedLsr(const char *language, const char *script, co
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
char c1;
if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
switch (c1) {
case 'A':
if (returnInputIfUnmatch) {
return LSR(language, script, region, LSR::EXPLICIT_LSR);
if (!returnInputIfUnmatch) {
char c1;
if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
switch (c1) {
case 'A':
return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'B':
return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'C':
return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
default: // normal locale
break;
}
return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'B':
if (returnInputIfUnmatch) {
return LSR(language, script, region, LSR::EXPLICIT_LSR);
}
return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'C':
if (returnInputIfUnmatch) {
return LSR(language, script, region, LSR::EXPLICIT_LSR);
}
return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
default: // normal locale
break;
}
}
if (variant[0] == 'P' && variant[1] == 'S') {
int32_t lsrFlags = *region == 0 ?
LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
if (uprv_strcmp(variant, "PSACCENT") == 0) {
return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
*region == 0 ? "XA" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSBIDI") == 0) {
return LSR(PSEUDO_BIDI_PREFIX, language, script,
*region == 0 ? "XB" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSCRACK") == 0) {
return LSR(PSEUDO_CRACKED_PREFIX, language, script,
*region == 0 ? "XC" : region, lsrFlags, errorCode);
if (variant[0] == 'P' && variant[1] == 'S') {
int32_t lsrFlags = *region == 0 ?
LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
if (uprv_strcmp(variant, "PSACCENT") == 0) {
return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
*region == 0 ? "XA" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSBIDI") == 0) {
return LSR(PSEUDO_BIDI_PREFIX, language, script,
*region == 0 ? "XB" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSCRACK") == 0) {
return LSR(PSEUDO_CRACKED_PREFIX, language, script,
*region == 0 ? "XC" : region, lsrFlags, errorCode);
}
// else normal locale
}
// else normal locale
}
} // end of if (!returnInputIfUnmatch)
language = getCanonical(languageAliases, language);
// (We have no script mappings.)

View file

@ -3913,17 +3913,17 @@ const char* const basic_maximize_data[][2] = {
"zh_Hani",
"zh_Hani_CN" // If change, please also update common/unicode/uloc.h
}, {
// ICU-22545
// ICU-22545 & ICU-22742
"en_XA",
"en_XA"
"en_Latn_XA"
}, {
// ICU-22545
"en_XB",
"en_XB"
// ICU-22545 & ICU-22742
"ar_XB",
"ar_Arab_XB"
}, {
// ICU-22545
"en_XC",
"en_XC"
// ICU-22545 & ICU-22742
"ru_XC",
"ru_Cyrl_XC"
}
};

View file

@ -233,6 +233,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
#endif
TESTCASE_AUTO(TestSetIsBogus);
TESTCASE_AUTO(TestParallelAPIValues);
TESTCASE_AUTO(TestPseudoLocales);
TESTCASE_AUTO(TestAddLikelySubtags);
TESTCASE_AUTO(TestMinimizeSubtags);
TESTCASE_AUTO(TestAddLikelyAndMinimizeSubtags);
@ -1740,6 +1741,119 @@ LocaleTest::TestSetIsBogus() {
}
/**
 * ICU-22742: checks Locale::addLikelySubtags() on pseudo-locale tags
 * (regions XA/XB/XC and variants PSACCENT/PSBIDI/PSCRACK).
 *
 * Each input tag is parsed with Locale::forLanguageTag(), maximized with
 * addLikelySubtags(), and its getName() is compared with that of the parsed
 * expected tag. Every pair is then checked a second time with the same
 * extension suffix appended to both the input and the expected tag.
 */
void LocaleTest::TestPseudoLocales() {
    // input locale tag, expected locale tag
    static const struct {
        const char* const input;
        const char* const expected;
    } test_cases[] = {
        // language + region, en
        { "en-XA", "en-Latn-XA" },
        { "en-XB", "en-Latn-XB" },
        { "en-XC", "en-Latn-XC" },
        // language + region, ar
        { "ar-XA", "ar-Arab-XA" },
        { "ar-XB", "ar-Arab-XB" },
        { "ar-XC", "ar-Arab-XC" },
        // language + region, something other than en, ar
        { "ru-XA", "ru-Cyrl-XA" },
        { "el-XB", "el-Grek-XB" },
        // undefined language + region
        { "und-XA", "en-Latn-XA" },
        { "und-XB", "en-Latn-XB" },
        { "und-XC", "en-Latn-XC" },
        // undefined language + script + region
        { "und-Latn-XA", "en-Latn-XA" },
        { "und-Latn-XB", "en-Latn-XB" },
        { "und-Latn-XC", "en-Latn-XC" },
        { "und-Arab-XA", "ar-Arab-XA" },
        { "und-Arab-XB", "ar-Arab-XB" },
        { "und-Arab-XC", "ar-Arab-XC" },
        { "und-Cyrl-XA", "ru-Cyrl-XA" },
        { "und-Grek-XB", "el-Grek-XB" },
        // Make sure the script is not damaged, when correct
        { "ru-Cyrl-XA", "ru-Cyrl-XA" },
        { "el-Grek-XB", "el-Grek-XB" },
        // Make sure the script is not damaged, even if it is wrong
        { "ru-Grek-XA", "ru-Grek-XA" },
        { "el-Cyrl-XB", "el-Cyrl-XB" },
        // PS variants combined with a pseudo region
        { "en-XA-PSACCENT", "en-Latn-XA-psaccent" },
        { "en-XA-PSBIDI", "en-Latn-XA-psbidi" },
        { "en-XA-PSCRACK", "en-Latn-XA-pscrack" },
        { "ar-XB-PSACCENT", "ar-Arab-XB-psaccent" },
        { "ar-XB-PSBIDI", "ar-Arab-XB-psbidi" },
        { "ar-XB-PSCRACK", "ar-Arab-XB-pscrack" },
        { "en-XC-PSACCENT", "en-Latn-XC-psaccent" },
        { "en-XC-PSBIDI", "en-Latn-XC-psbidi" },
        { "en-XC-PSCRACK", "en-Latn-XC-pscrack" },
        // PS variants combined with a regular region
        { "en-US-PSACCENT", "en-Latn-US-psaccent" },
        { "en-US-PSBIDI", "en-Latn-US-psbidi" },
        { "en-US-PSCRACK", "en-Latn-US-pscrack" },
        { "ar-EG-PSACCENT", "ar-Arab-EG-psaccent" },
        { "ar-EG-PSBIDI", "ar-Arab-EG-psbidi" },
        { "ar-EG-PSCRACK", "ar-Arab-EG-pscrack" },
        // PS variants without a region
        { "en-PSACCENT", "en-Latn-US-psaccent" },
        { "en-PSBIDI", "en-Latn-US-psbidi" },
        { "en-PSCRACK", "en-Latn-US-pscrack" },
        { "ar-PSACCENT", "ar-Arab-EG-psaccent" },
        { "ar-PSBIDI", "ar-Arab-EG-psbidi" },
        { "ar-PSCRACK", "ar-Arab-EG-pscrack" },
        // PS variants with an undefined language
        { "und-US-PSACCENT", "en-Latn-US-psaccent" },
        { "und-US-PSBIDI", "en-Latn-US-psbidi" },
        { "und-US-PSCRACK", "en-Latn-US-pscrack" },
        { "und-EG-PSACCENT", "ar-Arab-EG-psaccent" },
        { "und-EG-PSBIDI", "ar-Arab-EG-psbidi" },
        { "und-EG-PSCRACK", "ar-Arab-EG-pscrack" },
        { "und-PSACCENT", "en-Latn-US-psaccent" },
        { "und-PSBIDI", "en-Latn-US-psbidi" },
        { "und-PSCRACK", "en-Latn-US-pscrack" },
    };
    // Suffix appended to both the input and the expected tag for the second
    // round of checks. The private-use subtags ("somethin", "more") stay
    // within the 8-character limit that BCP 47 puts on a subtag.
    const std::string extensions("-u-nu-Deva-hc-h23-fw-mon-mu-celsius-x-somethin-more");
    IcuTestErrorCode status(*this, "TestPseudoLocales()");
    for (const auto& item : test_cases) {
        const char* const inputTag = item.input;
        const char* const expectedTag = item.expected;

        // Plain tag.
        Locale result = Locale::forLanguageTag(inputTag, status);
        result.addLikelySubtags(status);
        status.errIfFailureAndReset("\"%s\"", inputTag);
        Locale expected = Locale::forLanguageTag(expectedTag, status);
        status.errIfFailureAndReset("\"%s\"", expectedTag);
        assertEquals(inputTag, expected.getName(), result.getName());

        // Same tag with the extension suffix appended.
        std::string extendedTag(inputTag);
        extendedTag.append(extensions);
        result = Locale::forLanguageTag(extendedTag, status);
        result.addLikelySubtags(status);
        // Pass the tag as an argument, never as the printf-style format itself.
        status.errIfFailureAndReset("\"%s\"", extendedTag.c_str());
        std::string expectedExtendedTag(expectedTag);
        expectedExtendedTag.append(extensions);
        expected = Locale::forLanguageTag(expectedExtendedTag, status);
        status.errIfFailureAndReset("\"%s\"", expectedExtendedTag.c_str());
        assertEquals(extendedTag.c_str(), expected.getName(), result.getName());
    }
}
void
LocaleTest::TestAddLikelySubtags() {
IcuTestErrorCode status(*this, "TestAddLikelySubtags()");
@ -3971,20 +4085,45 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() {
"zh_Hani_CN", // If change, please also update common/unicode/locid.h
"zh_Hani"
}, {
// ICU-22545
"en_XA",
// ICU-22545 & ICU-22742
"en_XA",
"en_Latn_XA",
"en_XA",
}, {
// ICU-22545
"en_XB",
"en_XB",
"en_XB",
// ICU-22545 & ICU-22742
"ar_XB",
"ar_Arab_XB",
"ar_XB",
}, {
// ICU-22545
"en_XC",
"en_XC",
"en_XC",
// ICU-22545 & ICU-22742
"ru_XC",
"ru_Cyrl_XC",
"ru_XC",
}, {
// ICU-22742
"en_PSACCENT",
"en_Latn_US_PSACCENT",
"en__PSACCENT"
}, {
"ar_PSBIDI",
"ar_Arab_EG_PSBIDI",
"ar__PSBIDI"
}, {
"ru_PSCRACK",
"ru_Cyrl_RU_PSCRACK",
"ru__PSCRACK"
}, {
"ar_PSACCENT",
"ar_Arab_EG_PSACCENT",
"ar__PSACCENT"
}, {
"ru_PSBIDI",
"ru_Cyrl_RU_PSBIDI",
"ru__PSBIDI"
}, {
"en_PSCRACK",
"en_Latn_US_PSCRACK",
"en__PSCRACK"
}
};

View file

@ -129,6 +129,7 @@ public:
void TestKnownCanonicalizedListCorrect();
void TestConstructorAcceptsBCP47();
void TestPseudoLocales();
void TestAddLikelySubtags();
void TestMinimizeSubtags();
void TestAddLikelyAndMinimizeSubtags();

View file

@ -1970,17 +1970,36 @@ public class ULocaleTest extends CoreTestFmwk {
"zh_Hani",
"zh_Hani_CN" // If change, please also update ULocale.java
}, {
// ICU-22545
"en_XA",
// ICU-22545 & ICU-22742
"en_XA",
"en_Latn_XA",
}, {
// ICU-22545
"en_XB",
"en_XB",
// ICU-22545 & ICU-22742
"ar_XB",
"ar_Arab_XB",
}, {
// ICU-22545
"en_XC",
"en_XC",
// ICU-22545 & ICU-22742
"ru_XC",
"ru_Cyrl_XC",
}, {
// ICU-22742
"en_PSACCENT",
"en_Latn_US_PSACCENT",
}, {
"ar_PSBIDI",
"ar_Arab_EG_PSBIDI",
}, {
"ru_PSCRACK",
"ru_Cyrl_RU_PSCRACK",
}, {
"ar_PSACCENT",
"ar_Arab_EG_PSACCENT",
}, {
"ru_PSBIDI",
"ru_Cyrl_RU_PSBIDI",
}, {
"en_PSCRACK",
"en_Latn_US_PSCRACK",
}
};
@ -5590,6 +5609,103 @@ public class ULocaleTest extends CoreTestFmwk {
return tests;
}
/**
 * ICU-22742: checks ULocale.addLikelySubtags() on pseudo-locale tags
 * (regions XA/XB/XC and variants PSACCENT/PSBIDI/PSCRACK).
 *
 * Each input tag is parsed with ULocale.forLanguageTag(), maximized with
 * addLikelySubtags(), and compared with the parsed expected tag. Every pair
 * is then checked a second time with the same extension suffix appended to
 * both the input and the expected tag.
 */
@Test
public void TestPseudoLocales() {
    // input locale tag, expected locale tag
    String[][] testCases = new String[][] {
        // language + region, en
        { "en-XA", "en-Latn-XA" },
        { "en-XB", "en-Latn-XB" },
        { "en-XC", "en-Latn-XC" },
        // language + region, ar
        { "ar-XA", "ar-Arab-XA" },
        { "ar-XB", "ar-Arab-XB" },
        { "ar-XC", "ar-Arab-XC" },
        // language + region, something other than en, ar
        { "ru-XA", "ru-Cyrl-XA" },
        { "el-XB", "el-Grek-XB" },
        // undefined language + region
        { "und-XA", "en-Latn-XA" },
        { "und-XB", "en-Latn-XB" },
        { "und-XC", "en-Latn-XC" },
        // undefined language + script + region
        { "und-Latn-XA", "en-Latn-XA" },
        { "und-Latn-XB", "en-Latn-XB" },
        { "und-Latn-XC", "en-Latn-XC" },
        { "und-Arab-XA", "ar-Arab-XA" },
        { "und-Arab-XB", "ar-Arab-XB" },
        { "und-Arab-XC", "ar-Arab-XC" },
        { "und-Cyrl-XA", "ru-Cyrl-XA" },
        { "und-Grek-XB", "el-Grek-XB" },
        // Make sure the script is not damaged, when correct
        { "ru-Cyrl-XA", "ru-Cyrl-XA" },
        { "el-Grek-XB", "el-Grek-XB" },
        // Make sure the script is not damaged, even if it is wrong
        { "ru-Grek-XA", "ru-Grek-XA" },
        { "el-Cyrl-XB", "el-Cyrl-XB" },
        // PS variants combined with a pseudo region
        { "en-XA-PSACCENT", "en-Latn-XA-psaccent" },
        { "en-XA-PSBIDI", "en-Latn-XA-psbidi" },
        { "en-XA-PSCRACK", "en-Latn-XA-pscrack" },
        { "ar-XB-PSACCENT", "ar-Arab-XB-psaccent" },
        { "ar-XB-PSBIDI", "ar-Arab-XB-psbidi" },
        { "ar-XB-PSCRACK", "ar-Arab-XB-pscrack" },
        { "en-XC-PSACCENT", "en-Latn-XC-psaccent" },
        { "en-XC-PSBIDI", "en-Latn-XC-psbidi" },
        { "en-XC-PSCRACK", "en-Latn-XC-pscrack" },
        // PS variants combined with a regular region
        { "en-US-PSACCENT", "en-Latn-US-psaccent" },
        { "en-US-PSBIDI", "en-Latn-US-psbidi" },
        { "en-US-PSCRACK", "en-Latn-US-pscrack" },
        { "ar-EG-PSACCENT", "ar-Arab-EG-psaccent" },
        { "ar-EG-PSBIDI", "ar-Arab-EG-psbidi" },
        { "ar-EG-PSCRACK", "ar-Arab-EG-pscrack" },
        // PS variants without a region
        { "en-PSACCENT", "en-Latn-US-psaccent" },
        { "en-PSBIDI", "en-Latn-US-psbidi" },
        { "en-PSCRACK", "en-Latn-US-pscrack" },
        { "ar-PSACCENT", "ar-Arab-EG-psaccent" },
        { "ar-PSBIDI", "ar-Arab-EG-psbidi" },
        { "ar-PSCRACK", "ar-Arab-EG-pscrack" },
        // PS variants with an undefined language
        { "und-US-PSACCENT", "en-Latn-US-psaccent" },
        { "und-US-PSBIDI", "en-Latn-US-psbidi" },
        { "und-US-PSCRACK", "en-Latn-US-pscrack" },
        { "und-EG-PSACCENT", "ar-Arab-EG-psaccent" },
        { "und-EG-PSBIDI", "ar-Arab-EG-psbidi" },
        { "und-EG-PSCRACK", "ar-Arab-EG-pscrack" },
        { "und-PSACCENT", "en-Latn-US-psaccent" },
        { "und-PSBIDI", "en-Latn-US-psbidi" },
        { "und-PSCRACK", "en-Latn-US-pscrack" },
    };
    // Suffix appended to both the input and the expected tag for the second
    // round of checks. BCP 47 limits a subtag to 8 characters, so the
    // private-use part is spelled "somethin": a 9-character subtag would be
    // ill-formed and would keep the private-use part from being exercised.
    String extensions = "-u-nu-Deva-hc-h23-fw-mon-mu-celsius-x-somethin-more";
    for (String[] testCase : testCases) {
        String inputTag = testCase[0];
        String expectedTag = testCase[1];

        // Plain tag.
        ULocale result = ULocale.addLikelySubtags(ULocale.forLanguageTag(inputTag));
        ULocale expected = ULocale.forLanguageTag(expectedTag);
        assertEquals("pseudo-locales(" + inputTag + ")", expected, result);

        // Same tag with the extension suffix appended; the message names the
        // extended tag so a failure here can be told apart from the one above.
        String extendedTag = inputTag + extensions;
        result = ULocale.addLikelySubtags(ULocale.forLanguageTag(extendedTag));
        expected = ULocale.forLanguageTag(expectedTag + extensions);
        assertEquals("pseudo-locales(" + extendedTag + ")", expected, result);
    }
}
@Test
@Parameters(method = "readLikelySubtagsTestCases")
public void likelySubtagsDataDriven(TestCase test) {

View file

@ -213,49 +213,42 @@ public final class LikelySubtags {
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
if (region.length() == 2 && region.charAt(0) == 'X') {
switch (region.charAt(1)) {
case 'A':
if (returnInputIfUnmatch) {
return new LSR(language, script, region, LSR.EXPLICIT_LSR);
if (!returnInputIfUnmatch) {
if (region.length() == 2 && region.charAt(0) == 'X') {
switch (region.charAt(1)) {
case 'A':
return new LSR(PSEUDO_ACCENTS_PREFIX + language,
PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR);
case 'B':
return new LSR(PSEUDO_BIDI_PREFIX + language,
PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR);
case 'C':
return new LSR(PSEUDO_CRACKED_PREFIX + language,
PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR);
default: // normal locale
break;
}
return new LSR(PSEUDO_ACCENTS_PREFIX + language,
PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR);
case 'B':
if (returnInputIfUnmatch) {
return new LSR(language, script, region, LSR.EXPLICIT_LSR);
}
return new LSR(PSEUDO_BIDI_PREFIX + language,
PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR);
case 'C':
if (returnInputIfUnmatch) {
return new LSR(language, script, region, LSR.EXPLICIT_LSR);
}
return new LSR(PSEUDO_CRACKED_PREFIX + language,
PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR);
default: // normal locale
break;
}
}
if (variant.startsWith("PS")) {
int lsrFlags = region.isEmpty() ?
LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR;
switch (variant) {
case "PSACCENT":
return new LSR(PSEUDO_ACCENTS_PREFIX + language,
PSEUDO_ACCENTS_PREFIX + script,
region.isEmpty() ? "XA" : region, lsrFlags);
case "PSBIDI":
return new LSR(PSEUDO_BIDI_PREFIX + language,
PSEUDO_BIDI_PREFIX + script,
region.isEmpty() ? "XB" : region, lsrFlags);
case "PSCRACK":
return new LSR(PSEUDO_CRACKED_PREFIX + language,
PSEUDO_CRACKED_PREFIX + script,
region.isEmpty() ? "XC" : region, lsrFlags);
default: // normal locale
break;
if (variant.startsWith("PS")) {
int lsrFlags = region.isEmpty() ?
LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR;
switch (variant) {
case "PSACCENT":
return new LSR(PSEUDO_ACCENTS_PREFIX + language,
PSEUDO_ACCENTS_PREFIX + script,
region.isEmpty() ? "XA" : region, lsrFlags);
case "PSBIDI":
return new LSR(PSEUDO_BIDI_PREFIX + language,
PSEUDO_BIDI_PREFIX + script,
region.isEmpty() ? "XB" : region, lsrFlags);
case "PSCRACK":
return new LSR(PSEUDO_CRACKED_PREFIX + language,
PSEUDO_CRACKED_PREFIX + script,
region.isEmpty() ? "XC" : region, lsrFlags);
default: // normal locale
break;
}
}
}