ICU-21460 Changed the ULocale initializers to allow locale IDs that use BCP47 syntax, but with '_' as a field delimiter.

(APIs that specifically require BCP47 syntax are unaffected; they still require '-'.)
2025-04-14 17:24:01 +00:00 · 2021-08-17 16:28:32 -07:00 · 2021-08-17 16:28:32 -07:00 · 01e1adc9e4
commit 01e1adc9e4
parent 0d407fc616
7 changed files with 66 additions and 14 deletions
--- a/icu4c/source/common/uloc.cpp
+++ b/icu4c/source/common/uloc.cpp
@ -1477,21 +1477,37 @@ _canonicalize(const char* localeID,
              ByteSink& sink,
              uint32_t options,
              UErrorCode* err) {
+    if (U_FAILURE(*err)) {
+        return;
+    }
+
    int32_t j, fieldCount=0, scriptSize=0, variantSize=0;
-    PreflightingLocaleIDBuffer tempBuffer;
+    PreflightingLocaleIDBuffer tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
    const char* origLocaleID;
    const char* tmpLocaleID;
    const char* keywordAssign = NULL;
    const char* separatorIndicator = NULL;

-    if (U_FAILURE(*err)) {
-        return;
-    }
-
    if (_hasBCP47Extension(localeID)) {
+        CharString localeIDWithHyphens;
+        const char* localeIDPtr = localeID;
+
+        // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
+        if (uprv_strchr(localeID, '_') != nullptr && localeID[1] != '-' && localeID[1] != '_') {
+            localeIDWithHyphens.append(localeID, -1, *err);
+            if (U_SUCCESS(*err)) {
+                for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
+                    if (*p == '_') {
+                        *p = '-';
+                    }
+                }
+                localeIDPtr = localeIDWithHyphens.data();
+            }
+        }
+        
        do {
-            tempBuffer.requestedCapacity = _ConvertBCP47(tmpLocaleID, localeID,
-                tempBuffer.getBuffer(), tempBuffer.getCapacity(), err);
+            tempBuffer.requestedCapacity = _ConvertBCP47(tmpLocaleID, localeIDPtr, tempBuffer.getBuffer(),
+                                                         tempBuffer.getCapacity(), err);
        } while (tempBuffer.needToTryAgain(err));
    } else {
        if (localeID==NULL) {
--- a/icu4c/source/test/cintltst/capitst.c
+++ b/icu4c/source/test/cintltst/capitst.c
@ -90,6 +90,7 @@ void addCollAPITest(TestNode** root)
    addTest(root, &TestBengaliSortKey, "tscoll/capitst/TestBengaliSortKey");
    addTest(root, &TestGetKeywordValuesForLocale, "tscoll/capitst/TestGetKeywordValuesForLocale");
    addTest(root, &TestStrcollNull, "tscoll/capitst/TestStrcollNull");
+    addTest(root, &TestLocaleIDWithUnderscoreAndExtension, "tscoll/capitst/TestLocaleIDWithUnderscoreAndExtension");
 }

 void TestGetSetAttr(void) {
@ -2565,4 +2566,18 @@ static void TestStrcollNull(void) {
    ucol_close(coll);
 }

+static void TestLocaleIDWithUnderscoreAndExtension(void) {  /* ICU-21460 regression test: a locale ID that mixes '_' with a "-u-" extension must yield a collator that passes the same check as the all-hyphen ID */
+    UErrorCode err = U_ZERO_ERROR;  /* shared by both ucol_open() calls and checked once below */
+    UCollator* c1 = ucol_open("en-US-u-kn-true", &err);  /* reference: every field separated by '-' */
+    UCollator* c2 = ucol_open("en_US-u-kn-true", &err);  /* same ID, but with '_' between language and region */
+    
+    if (assertSuccess("Failed to create collators", &err)) {  /* comparisons run only when err reports success */
+        assertTrue("Comparison using \"normal\" collator failed", !ucol_greater(c1, u"2", -1, u"10", -1));  /* "2" must not compare greater than "10" */
+        assertTrue("Comparison using \"bad\" collator failed", !ucol_greater(c2, u"2", -1, u"10", -1));  /* identical expectation for the '_' form */
+    }
+    
+    ucol_close(c1);  /* closed unconditionally, i.e. also when the success check above failed */
+    ucol_close(c2);
+}
+
 #endif /* #if !UCONFIG_NO_COLLATION */
--- a/icu4c/source/test/cintltst/capitst.h
+++ b/icu4c/source/test/cintltst/capitst.h
@ -136,6 +136,11 @@
     * test strcoll with null arg
     */
    static void TestStrcollNull(void);
+ 
+    /**
+     * Simple test for ICU-21460.  The issue affects all components, but was originally reported against collation.
+     */
+    static void TestLocaleIDWithUnderscoreAndExtension(void);

 #endif /* #if !UCONFIG_NO_COLLATION */

--- a/icu4c/source/test/cintltst/cloctst.c
+++ b/icu4c/source/test/cintltst/cloctst.c
@ -3723,13 +3723,13 @@ const char* const basic_maximize_data[][2] = {
    ""
  }, {
     "de_u_co_phonebk",
-     "de_Latn_DE_U_CO_PHONEBK"
+     "de_Latn_DE@collation=phonebook"
  }, {
     "de_Latn_u_co_phonebk",
-     "de_Latn_DE_U_CO_PHONEBK"
+      "de_Latn_DE@collation=phonebook"
  }, {
     "de_Latn_DE_u_co_phonebk",
-     "de_Latn_DE_U_CO_PHONEBK"
+      "de_Latn_DE@collation=phonebook"
  }, {
    "_Arab@em=emoji",
    "ar_Arab_EG@em=emoji"
@ -6377,7 +6377,7 @@ static const struct {
    {"hant-cmn-cn", "hant", 4},
    {"zh-cmn-TW", "cmn_TW", FULL_LENGTH},
    {"zh-x_t-ab", "zh", 2},
-    {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes",  15},
+    {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes", 15},
    /* #20140 dupe keys in U-extension */
    {"zh-u-ca-chinese-ca-gregory", "zh@calendar=chinese", FULL_LENGTH},
    {"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", FULL_LENGTH},
--- a/icu4c/source/test/intltest/loctest.cpp
+++ b/icu4c/source/test/intltest/loctest.cpp
@ -4805,7 +4805,7 @@ void LocaleTest::TestCanonicalization(void)
        { "x-piglatin_ML.MBE", "x-piglatin_ML.MBE", "x-piglatin_ML" },
        { "i-cherokee_US.utf7", "i-cherokee_US.utf7", "i-cherokee_US" },
        { "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA" },
-        { "no-no-ny.utf8@B", "no_NO_NY.utf8@B", "no_NO_B_NY" /* not: "nn_NO" [alan ICU3.0] */ }, /* @ ignored unless variant is empty */
+        { "no-no-ny.utf8@B", "no_NO_NY.utf8@B", "no_NO@b=ny" /* not: "nn_NO" [alan ICU3.0] */ }, /* @ ignored unless variant is empty */

        /* fleshing out canonicalization */
        /* trim space and sort keywords, ';' is separator so not present at end in canonical form */
--- a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java
@ -1131,10 +1131,13 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
     * @stable ICU 3.0
     */
    public static String getName(String localeID){
-        String tmpLocaleID;
+        String tmpLocaleID = localeID;
        // Convert BCP47 id if necessary
        if (localeID != null && !localeID.contains("@") && getShortestSubtagLength(localeID) == 1) {
-            tmpLocaleID = forLanguageTag(localeID).getName();
+            if (localeID.indexOf('_') >= 0 && localeID.charAt(1) != '_' && localeID.charAt(1) != '-') {
+                tmpLocaleID = localeID.replace('_', '-');
+            }
+            tmpLocaleID = forLanguageTag(tmpLocaleID).getName();
            if (tmpLocaleID.length() == 0) {
                tmpLocaleID = localeID;
            }
--- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationAPITest.java
+++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationAPITest.java
@ -1702,4 +1702,17 @@ public class CollationAPITest extends TestFmwk {
            errln("unexpected exception for tailoring many characters at the end of symbols: " + e);
        }
    }
+
+    @Test
+    public void TestBogusLocaleID() {  // ICU-21460: a locale ID mixing '_' with a "-u-" extension must give a collator that passes the same check as the all-hyphen ID
+        try {
+            Collator c1 = Collator.getInstance(new ULocale("en-US-u-kn-true"));  // reference: every field separated by '-'
+            Collator c2 = Collator.getInstance(new ULocale("en_US-u-kn-true"));  // same ID, but with '_' between language and region
+
+            assertTrue("Comparison using \"normal\" collator failed", c1.compare("2", "10") < 0);  // "2" must order before "10"
+            assertTrue("Comparison using \"bad\" collator failed", c2.compare("2", "10") < 0);  // identical expectation for the '_' form
+        } catch (Exception e) {  // any exception while building or using the collators is reported as a test error
+            errln("Exception creating collators: " + e);
+        }
+    }
 }