ICU-21460 Changed the ULocale initializers to allow locale IDs that use BCP47 syntax, but with '_' as a field delimiter.

(APIs that specifically require BCP47 syntax are unaffected; they still require '-'.)
This commit is contained in:
Rich Gillam 2021-08-17 16:28:32 -07:00
parent 0d407fc616
commit 01e1adc9e4
7 changed files with 66 additions and 14 deletions

View file

@ -1477,21 +1477,37 @@ _canonicalize(const char* localeID,
ByteSink& sink,
uint32_t options,
UErrorCode* err) {
if (U_FAILURE(*err)) {
return;
}
int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
PreflightingLocaleIDBuffer tempBuffer;
PreflightingLocaleIDBuffer tempBuffer; // if localeID has a BCP47 extension, tmpLocaleID points to this
const char* origLocaleID;
const char* tmpLocaleID;
const char* keywordAssign = NULL;
const char* separatorIndicator = NULL;
if (U_FAILURE(*err)) {
return;
}
if (_hasBCP47Extension(localeID)) {
CharString localeIDWithHyphens;
const char* localeIDPtr = localeID;
// convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
if (uprv_strchr(localeID, '_') != nullptr && localeID[1] != '-' && localeID[1] != '_') {
localeIDWithHyphens.append(localeID, -1, *err);
if (U_SUCCESS(*err)) {
for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
if (*p == '_') {
*p = '-';
}
}
localeIDPtr = localeIDWithHyphens.data();
}
}
do {
tempBuffer.requestedCapacity = _ConvertBCP47(tmpLocaleID, localeID,
tempBuffer.getBuffer(), tempBuffer.getCapacity(), err);
tempBuffer.requestedCapacity = _ConvertBCP47(tmpLocaleID, localeIDPtr, tempBuffer.getBuffer(),
tempBuffer.getCapacity(), err);
} while (tempBuffer.needToTryAgain(err));
} else {
if (localeID==NULL) {

View file

@ -90,6 +90,7 @@ void addCollAPITest(TestNode** root)
addTest(root, &TestBengaliSortKey, "tscoll/capitst/TestBengaliSortKey");
addTest(root, &TestGetKeywordValuesForLocale, "tscoll/capitst/TestGetKeywordValuesForLocale");
addTest(root, &TestStrcollNull, "tscoll/capitst/TestStrcollNull");
addTest(root, &TestLocaleIDWithUnderscoreAndExtension, "tscoll/capitst/TestLocaleIDWithUnderscoreAndExtension");
}
void TestGetSetAttr(void) {
@ -2565,4 +2566,18 @@ static void TestStrcollNull(void) {
ucol_close(coll);
}
static void TestLocaleIDWithUnderscoreAndExtension(void) {
UErrorCode err = U_ZERO_ERROR;
UCollator* c1 = ucol_open("en-US-u-kn-true", &err);
UCollator* c2 = ucol_open("en_US-u-kn-true", &err);
if (assertSuccess("Failed to create collators", &err)) {
assertTrue("Comparison using \"normal\" collator failed", !ucol_greater(c1, u"2", -1, u"10", -1));
assertTrue("Comparison using \"bad\" collator failed", !ucol_greater(c2, u"2", -1, u"10", -1));
}
ucol_close(c1);
ucol_close(c2);
}
#endif /* #if !UCONFIG_NO_COLLATION */

View file

@ -136,6 +136,11 @@
* test strcoll with null arg
*/
static void TestStrcollNull(void);
/**
* Simple test for ICU-21460. The issue affects all components, but was originally reported against collation.
*/
static void TestLocaleIDWithUnderscoreAndExtension(void);
#endif /* #if !UCONFIG_NO_COLLATION */

View file

@ -3723,13 +3723,13 @@ const char* const basic_maximize_data[][2] = {
""
}, {
"de_u_co_phonebk",
"de_Latn_DE_U_CO_PHONEBK"
"de_Latn_DE@collation=phonebook"
}, {
"de_Latn_u_co_phonebk",
"de_Latn_DE_U_CO_PHONEBK"
"de_Latn_DE@collation=phonebook"
}, {
"de_Latn_DE_u_co_phonebk",
"de_Latn_DE_U_CO_PHONEBK"
"de_Latn_DE@collation=phonebook"
}, {
"_Arab@em=emoji",
"ar_Arab_EG@em=emoji"
@ -6377,7 +6377,7 @@ static const struct {
{"hant-cmn-cn", "hant", 4},
{"zh-cmn-TW", "cmn_TW", FULL_LENGTH},
{"zh-x_t-ab", "zh", 2},
{"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes", 15},
{"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes", 15},
/* #20140 dupe keys in U-extension */
{"zh-u-ca-chinese-ca-gregory", "zh@calendar=chinese", FULL_LENGTH},
{"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", FULL_LENGTH},

View file

@ -4805,7 +4805,7 @@ void LocaleTest::TestCanonicalization(void)
{ "x-piglatin_ML.MBE", "x-piglatin_ML.MBE", "x-piglatin_ML" },
{ "i-cherokee_US.utf7", "i-cherokee_US.utf7", "i-cherokee_US" },
{ "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA" },
{ "no-no-ny.utf8@B", "no_NO_NY.utf8@B", "no_NO_B_NY" /* not: "nn_NO" [alan ICU3.0] */ }, /* @ ignored unless variant is empty */
{ "no-no-ny.utf8@B", "no_NO_NY.utf8@B", "no_NO@b=ny" /* not: "nn_NO" [alan ICU3.0] */ }, /* @ ignored unless variant is empty */
/* fleshing out canonicalization */
/* trim space and sort keywords, ';' is separator so not present at end in canonical form */

View file

@ -1131,10 +1131,13 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
* @stable ICU 3.0
*/
public static String getName(String localeID){
String tmpLocaleID;
String tmpLocaleID = localeID;
// Convert BCP47 id if necessary
if (localeID != null && !localeID.contains("@") && getShortestSubtagLength(localeID) == 1) {
tmpLocaleID = forLanguageTag(localeID).getName();
if (localeID.indexOf('_') >= 0 && localeID.charAt(1) != '_' && localeID.charAt(1) != '-') {
tmpLocaleID = localeID.replace('_', '-');
}
tmpLocaleID = forLanguageTag(tmpLocaleID).getName();
if (tmpLocaleID.length() == 0) {
tmpLocaleID = localeID;
}

View file

@ -1702,4 +1702,17 @@ public class CollationAPITest extends TestFmwk {
errln("unexpected exception for tailoring many characters at the end of symbols: " + e);
}
}
/**
 * Builds one collator from the BCP47 tag "en-US-u-kn-true" and one from the
 * same tag written with '_' between language and region ("en_US-u-kn-true"),
 * then checks that both order "2" before "10". Any exception thrown while
 * creating or using the collators is reported through errln.
 */
@Test
public void TestBogusLocaleID() {
    try {
        Collator hyphenCollator = Collator.getInstance(new ULocale("en-US-u-kn-true"));
        Collator underscoreCollator = Collator.getInstance(new ULocale("en_US-u-kn-true"));

        // Reference case: the well-formed BCP47 tag.
        boolean hyphenOrdersTwoFirst = hyphenCollator.compare("2", "10") < 0;
        assertTrue("Comparison using \"normal\" collator failed", hyphenOrdersTwoFirst);

        // Case under test: same tag, but with an underscore delimiter.
        boolean underscoreOrdersTwoFirst = underscoreCollator.compare("2", "10") < 0;
        assertTrue("Comparison using \"bad\" collator failed", underscoreOrdersTwoFirst);
    } catch (Exception e) {
        errln("Exception creating collators: " + e);
    }
}
}