diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp index c6c976394de..b0766165604 100644 --- a/icu4c/source/common/locid.cpp +++ b/icu4c/source/common/locid.cpp @@ -38,6 +38,7 @@ #include "unicode/strenum.h" #include "unicode/stringpiece.h" #include "unicode/uloc.h" +#include "unicode/ures.h" #include "bytesinkutil.h" #include "charstr.h" @@ -509,6 +510,36 @@ Locale::operator==( const Locale& other) const #define ISASCIIALPHA(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) +namespace { + +CharString& AppendLSCVE(CharString& out, const char* language, const char* script, + const char* country, const char* variants, const char* extension, + UErrorCode& status) { + out.append(language, status); + if (script && script[0] != '\0') { + out.append('_', status); + out.append(script, status); + } + if (country && country[0] != '\0') { + out.append('_', status); + out.append(country, status); + } + if (variants && variants[0] != '\0') { + if ((script == nullptr || script[0] == '\0') && + (country == nullptr || country[0] == '\0')) { + out.append('_', status); + } + out.append('_', status); + out.append(variants, status); + } + if (extension && extension[0] != '\0') { + out.append(extension, status); + } + return out; +} + +} // namespace + /*This function initializes a Locale from a C locale ID*/ Locale& Locale::init(const char* localeID, UBool canonicalize) { @@ -632,6 +663,195 @@ Locale& Locale::init(const char* localeID, UBool canonicalize) break; } + if (canonicalize) { + UErrorCode status = U_ZERO_ERROR; + // TODO: Try to use ResourceDataValue and ures_getValueWithFallback() etc. 
+ LocalUResourceBundlePointer metadata(ures_openDirect(NULL, "metadata", &status)); + LocalUResourceBundlePointer metadataAlias(ures_getByKey(metadata.getAlias(), "alias", NULL, &status)); + // Look up the metadata:alias:language:$key:replacement entries + // key could be one of the following: + // language + // language_Script_REGION + // language_REGION + // language_variant + do { + // The resource structure looks like + // metadata { + // alias { + // language { + // art_lojban { + // replacement{"jbo"} + // } + // ... + // ks_Arab_IN { + // replacement{"ks_IN"} + // } + // ... + // no { + // replacement{"nb"} + // } + // .... + // zh_CN { + // replacement{"zh_Hans_CN"} + // } + // } + // ... + // } + // } + LocalUResourceBundlePointer languageAlias(ures_getByKey(metadataAlias.getAlias(), "language", NULL, &status)); + if (U_FAILURE(status)) + break; + CharString temp; + // Handle cases of key pattern "language _ variant" + // ex: Map "art_lojban" to "jbo" + const char* variants = getVariant(); + if (variants != nullptr && variants[0] != '\0') { + const char* begin = variants; + const char* end = begin; + // We may have multiple variants, need to look at each of + // them. + do { + status = U_ZERO_ERROR; + end = uprv_strchr(begin, '_'); + int32_t len = (end == nullptr) ? 
int32_t(uprv_strlen(begin)) : int32_t(end - begin); + temp.clear().append(getLanguage(), status).append("_", status).append(begin, len, status); + LocalUResourceBundlePointer languageVariantAlias( + ures_getByKey(languageAlias.getAlias(), + temp.data(), + NULL, &status)); + temp.clear().appendInvariantChars( + UnicodeString(ures_getStringByKey(languageVariantAlias.getAlias(), "replacement", nullptr, &status)), status); + if (U_SUCCESS(status)) { + CharString newVar; + if (begin != variants) { + newVar.append(variants, begin - variants - 1, status); + } + if (end != nullptr) { + if (begin != variants) { + newVar.append("_", status); + } + newVar.append(end + 1, status); + } + Locale l(temp.data()); + init(AppendLSCVE(temp.clear(), + l.getLanguage(), + (getScript() != nullptr && getScript()[0] != '\0') ? getScript() : l.getScript(), + (getCountry() != nullptr && getCountry()[0] != '\0') ? getCountry() : l.getCountry(), + newVar.data(), + uprv_strchr(fullName, '@'), status).data(), false); + break; + } + begin = end + 1; + } while (end != nullptr); + } // End of handle language _ variant + // Handle cases of key pattern "language _ Script _ REGION" + // ex: Map "ks_Arab_IN" to "ks_IN" + if (getScript() != nullptr && getScript()[0] != '\0' && + getCountry() != nullptr && getCountry()[0] != '\0') { + status = U_ZERO_ERROR; + LocalUResourceBundlePointer replacedAlias( + ures_getByKey(languageAlias.getAlias(), + AppendLSCVE(temp.clear(), getLanguage(), getScript(), getCountry(), + nullptr, nullptr, status).data(), NULL, &status)); + temp.clear().appendInvariantChars( + UnicodeString(ures_getStringByKey(replacedAlias.getAlias(), "replacement", nullptr, &status)), status); + if (U_SUCCESS(status)) { + Locale l(temp.data()); + init(AppendLSCVE(temp.clear(), + l.getLanguage(), + l.getScript(), + l.getCountry(), + getVariant(), + uprv_strchr(fullName, '@'), status).data(), false); + } + } // End of handle language _ Script _ REGION + // Handle cases of key pattern "language _ 
REGION" + // ex: Map "zh_CN" to "zh_Hans_CN" + if (getCountry() != nullptr && getCountry()[0] != '\0') { + status = U_ZERO_ERROR; + LocalUResourceBundlePointer replacedAlias( + ures_getByKey(languageAlias.getAlias(), + AppendLSCVE(temp.clear(), getLanguage(), nullptr, getCountry(), + nullptr, nullptr, status).data(), NULL, &status)); + temp.clear().appendInvariantChars( + UnicodeString(ures_getStringByKey(replacedAlias.getAlias(), "replacement", nullptr, &status)), status); + if (U_SUCCESS(status)) { + Locale l(temp.data()); + init(AppendLSCVE(temp.clear(), + l.getLanguage(), + (getScript() != nullptr && getScript()[0] != '\0') ? getScript() : l.getScript(), + l.getCountry(), + getVariant(), + uprv_strchr(fullName, '@'), status).data(), false); + } + } // End of handle "language _ REGION" + // Handle cases of key pattern "language" + // ex: Map "no" to "nb" + { + status = U_ZERO_ERROR; + LocalUResourceBundlePointer replaceLanguageAlias(ures_getByKey(languageAlias.getAlias(), getLanguage(), NULL, &status)); + temp.clear().appendInvariantChars( + UnicodeString(ures_getStringByKey(replaceLanguageAlias.getAlias(), "replacement", nullptr, &status)), status); + if (U_SUCCESS(status)) { + Locale l(temp.data()); + init(AppendLSCVE(temp.clear(), + l.getLanguage(), + (getScript() != nullptr && getScript()[0] != '\0') ? getScript() : l.getScript(), + (getCountry() != nullptr && getCountry()[0] != '\0') ? getCountry() : l.getCountry(), + getVariant(), + uprv_strchr(fullName, '@'), status).data(), false); + } + } // End of handle "language" + + // Look up the metadata:alias:territory:$key:replacement entries + // key is region code. + if (getCountry() != nullptr) { + status = U_ZERO_ERROR; + // The resource structure looks like + // metadata { + // alias { + // ... + // territory: { + // 172 { + // replacement{"RU AM AZ BY GE KG KZ MD TJ TM UA UZ"} + // } + // ... 
+ // 554 { + // replacement{"NZ"} + // } + // } + // } + // } + LocalUResourceBundlePointer territoryAlias(ures_getByKey(metadataAlias.getAlias(), "territory", NULL, &status)); + LocalUResourceBundlePointer countryAlias(ures_getByKey(territoryAlias.getAlias(), getCountry(), NULL, &status)); + UnicodeString replacements( + ures_getStringByKey(countryAlias.getAlias(), "replacement", nullptr, &status)); + if (U_SUCCESS(status)) { + CharString replacedCountry; + int32_t delPos = replacements.indexOf(' '); + if (delPos == -1) { + replacedCountry.appendInvariantChars(replacements, status); + } else { + Locale l(AppendLSCVE(temp.clear(), getLanguage(), nullptr, getScript(), + nullptr, nullptr, status).data()); + l.addLikelySubtags(status); + if (replacements.indexOf(UnicodeString(l.getCountry())) != -1) { + replacedCountry.append(l.getCountry(), status); + } else { + replacedCountry.appendInvariantChars(replacements.getBuffer(), delPos, status); + } + } + init(AppendLSCVE(temp.clear(), + getLanguage(), + getScript(), + replacedCountry.data(), + getVariant(), + uprv_strchr(fullName, '@'), status).data(), false); + } + } // End of handle REGION + } while (0); + } // if (canonicalize) { + // successful end of init() return *this; } while(0); /*loop doesn't iterate*/ @@ -778,6 +998,25 @@ Locale::minimizeSubtags(UErrorCode& status) { } } +void +Locale::canonicalize(UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + if (isBogus()) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + CharString uncanonicalized(fullName, status); + if (U_FAILURE(status)) { + return; + } + init(uncanonicalized.data(), /*canonicalize=*/TRUE); + if (isBogus()) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } +} + Locale U_EXPORT2 Locale::forLanguageTag(StringPiece tag, UErrorCode& status) { diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index 4f3afcb6725..ad5dd6430c9 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ 
-1681,7 +1681,7 @@ _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendT const char *pKey = NULL; /* LDML key */ const char *pType = NULL; /* LDML type */ - char bcpKeyBuf[9]; /* BCP key length is always 2 for now */ + char bcpKeyBuf[3]; /* BCP key length is always 2 for now */ U_ASSERT(pBcpKey != NULL); @@ -1690,6 +1690,7 @@ _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendT *status = U_ILLEGAL_ARGUMENT_ERROR; return; } + U_ASSERT(bcpKeyLen <= 2); uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen); bcpKeyBuf[bcpKeyLen] = 0; diff --git a/icu4c/source/common/unicode/locid.h b/icu4c/source/common/unicode/locid.h index 57c669b3343..1d031daabc5 100644 --- a/icu4c/source/common/unicode/locid.h +++ b/icu4c/source/common/unicode/locid.h @@ -448,7 +448,7 @@ public: /** * Creates a locale from the given string after canonicalizing - * the string by calling uloc_canonicalize(). + * the string according to CLDR by calling uloc_canonicalize(). * @param name the locale ID to create from. Must not be NULL. * @return a new locale object corresponding to the given name * @stable ICU 3.0 @@ -567,6 +567,16 @@ public: */ void minimizeSubtags(UErrorCode& status); +#ifndef U_HIDE_DRAFT_API + /** + * Canonicalize the locale ID of this object according to CLDR. + * @param status the status code + * @draft ICU 67 + * @see createCanonical + */ + void canonicalize(UErrorCode& status); +#endif // U_HIDE_DRAFT_API + /** * Gets the list of keywords for the specified locale. * diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index d7ad6722504..52cb19ed271 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -6,6 +6,7 @@ * others. All Rights Reserved. 
********************************************************************/ +#include #include #include #include @@ -275,6 +276,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c TESTCASE_AUTO(TestCapturingTagConvertingIterator); TESTCASE_AUTO(TestSetUnicodeKeywordValueInLongLocale); TESTCASE_AUTO(TestSetUnicodeKeywordValueNullInLongLocale); + TESTCASE_AUTO(TestCanonicalize); TESTCASE_AUTO_END; } @@ -2595,13 +2597,13 @@ void LocaleTest::TestCanonicalization(void) "ca_ES_WITH_EXTRA_STUFF_THAT REALLY DOESN'T MAKE ANY SENSE_UNLESS_YOU'RE TRYING TO INCREASE CODE COVERAGE", "ca_ES_WITH_EXTRA_STUFF_THAT REALLY DOESN'T MAKE ANY SENSE_UNLESS_YOU'RE TRYING TO INCREASE CODE COVERAGE"}, { "zh@collation=pinyin", "zh@collation=pinyin", "zh@collation=pinyin" }, - { "zh_CN@collation=pinyin", "zh_CN@collation=pinyin", "zh_CN@collation=pinyin" }, - { "zh_CN_CA@collation=pinyin", "zh_CN_CA@collation=pinyin", "zh_CN_CA@collation=pinyin" }, + { "zh_CN@collation=pinyin", "zh_CN@collation=pinyin", "zh_Hans_CN@collation=pinyin" }, + { "zh_CN_CA@collation=pinyin", "zh_CN_CA@collation=pinyin", "zh_Hans_CN_CA@collation=pinyin" }, { "en_US_POSIX", "en_US_POSIX", "en_US_POSIX" }, { "hy_AM_REVISED", "hy_AM_REVISED", "hy_AM_REVISED" }, - { "no_NO_NY", "no_NO_NY", "no_NO_NY" /* not: "nn_NO" [alan ICU3.0] */ }, - { "no@ny", "no@ny", "no__NY" /* not: "nn" [alan ICU3.0] */ }, /* POSIX ID */ - { "no-no.utf32@B", "no_NO.utf32@B", "no_NO_B" /* not: "nb_NO_B" [alan ICU3.0] */ }, /* POSIX ID */ + { "no_NO_NY", "no_NO_NY", "nb_NO_NY" /* not: "nn_NO" [alan ICU3.0] */ }, + { "no@ny", "no@ny", "nb__NY" /* not: "nn" [alan ICU3.0] */ }, /* POSIX ID */ + { "no-no.utf32@B", "no_NO.utf32@B", "nb_NO_B" /* formerly "no_NO_B" [alan ICU3.0] */ }, /* POSIX ID */ { "qz-qz@Euro", "qz_QZ@Euro", "qz_QZ_EURO" }, /* qz-qz uses private use iso codes */ // NOTE: uloc_getName() works on en-BOONT, but Locale() parser considers it BOGUS // TODO: unify this behavior @@ -2615,7 +2617,7 @@ void 
LocaleTest::TestCanonicalization(void) { "x-piglatin_ML.MBE", "x-piglatin_ML.MBE", "x-piglatin_ML" }, { "i-cherokee_US.utf7", "i-cherokee_US.utf7", "i-cherokee_US" }, { "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA" }, - { "no-no-ny.utf8@B", "no_NO_NY.utf8@B", "no_NO_NY_B" /* not: "nn_NO" [alan ICU3.0] */ }, /* @ ignored unless variant is empty */ + { "no-no-ny.utf8@B", "no_NO_NY.utf8@B", "nb_NO_NY_B" /* not: "nn_NO" [alan ICU3.0] */ }, /* @ ignored unless variant is empty */ /* fleshing out canonicalization */ /* trim space and sort keywords, ';' is separator so not present at end in canonical form */ @@ -2623,7 +2625,7 @@ void LocaleTest::TestCanonicalization(void) /* already-canonical ids are not changed */ { "en_Hant_IL_VALLEY_GIRL@calendar=Japanese;currency=EUR", "en_Hant_IL_VALLEY_GIRL@calendar=Japanese;currency=EUR", "en_Hant_IL_VALLEY_GIRL@calendar=Japanese;currency=EUR" }, /* norwegian is just too weird, if we handle things in their full generality */ - { "no-Hant-GB_NY@currency=$$$", "no_Hant_GB_NY@currency=$$$", "no_Hant_GB_NY@currency=$$$" /* not: "nn_Hant_GB@currency=$$$" [alan ICU3.0] */ }, + { "no-Hant-GB_NY@currency=$$$", "no_Hant_GB_NY@currency=$$$", "nb_Hant_GB_NY@currency=$$$" /* not: "nn_Hant_GB@currency=$$$" [alan ICU3.0] */ }, /* test cases reflecting internal resource bundle usage */ { "root@kw=foo", "root@kw=foo", "root@kw=foo" }, @@ -2662,13 +2664,13 @@ void LocaleTest::TestCanonicalization(void) { "hi__DIRECT", "hi__DIRECT", "hi__DIRECT" }, { "ja_JP_TRADITIONAL", "ja_JP_TRADITIONAL", "ja_JP_TRADITIONAL" }, { "th_TH_TRADITIONAL", "th_TH_TRADITIONAL", "th_TH_TRADITIONAL" }, - { "zh_TW_STROKE", "zh_TW_STROKE", "zh_TW_STROKE" }, + { "zh_TW_STROKE", "zh_TW_STROKE", "zh_Hant_TW_STROKE" }, { "zh__PINYIN", "zh__PINYIN", "zh__PINYIN" }, { "sr-SP-Cyrl", "sr_SP_CYRL", "sr_SP_CYRL" }, /* .NET name */ { "sr-SP-Latn", "sr_SP_LATN", "sr_SP_LATN" }, /* .NET name */ - { "sr_YU_CYRILLIC", "sr_YU_CYRILLIC", 
"sr_YU_CYRILLIC" }, /* Linux name */ - { "uz-UZ-Cyrl", "uz_UZ_CYRL", "uz_UZ_CYRL" }, /* .NET name */ - { "uz-UZ-Latn", "uz_UZ_LATN", "uz_UZ_LATN" }, /* .NET name */ + { "sr_YU_CYRILLIC", "sr_YU_CYRILLIC", "sr_RS_CYRILLIC" }, /* Linux name */ + { "uz-UZ-Cyrl", "uz_UZ_CYRL", "uz_Latn_UZ_CYRL" }, /* .NET name */ + { "uz-UZ-Latn", "uz_UZ_LATN", "uz_Latn_UZ_LATN" }, /* .NET name */ { "zh-CHS", "zh_CHS", "zh_CHS" }, /* .NET name */ { "zh-CHT", "zh_CHT", "zh_CHT" }, /* .NET name This may change back to zh_Hant */ /* PRE_EURO and EURO conversions don't affect other keywords */ @@ -2699,6 +2701,91 @@ void LocaleTest::TestCanonicalization(void) } } +void LocaleTest::TestCanonicalize(void) +{ + static const struct { + const char *localeID; /* input */ + const char *canonicalID; /* expected canonicalize() result */ + } testCases[] = { + // language _ variant -> language + { "no-BOKMAL", "nb" }, + // also test with script, country and extensions + { "no-Cyrl-ID-BOKMAL-u-ca-japanese", "nb-Cyrl-ID-u-ca-japanese" }, + { "no-Cyrl-ID-1901-BOKMAL-xsistemo-u-ca-japanese", "nb-Cyrl-ID-1901-xsistemo-u-ca-japanese" }, + { "no-Cyrl-ID-1901-BOKMAL-u-ca-japanese", "nb-Cyrl-ID-1901-u-ca-japanese" }, + { "no-Cyrl-ID-BOKMAL-xsistemo-u-ca-japanese", "nb-Cyrl-ID-xsistemo-u-ca-japanese" }, + { "no-NYNORSK", "nn" }, + { "no-Cyrl-ID-NYNORSK-u-ca-japanese", "nn-Cyrl-ID-u-ca-japanese" }, + { "aa-SAAHO", "ssy" }, + // also test with script, country and extensions + { "aa-Deva-IN-SAAHO-u-ca-japanese", "ssy-Deva-IN-u-ca-japanese" }, + + // language -> language + { "aam", "aas" }, + // also test with script, country, variants and extensions + { "aam-Cyrl-ID-3456-u-ca-japanese", "aas-Cyrl-ID-3456-u-ca-japanese" }, + + // language -> language _ Script + { "sh", "sr-Latn" }, + // also test with script + { "sh-Cyrl", "sr-Cyrl" }, + // also test with country, variants and extensions + { "sh-ID-3456-u-ca-roc", "sr-Latn-ID-3456-u-ca-roc" }, + + // language -> language _ country + { "prs", "fa-AF" }, + // also 
test with country + { "prs-RU", "fa-RU" }, + // also test with script, variants and extensions + { "prs-Cyrl-1009-u-ca-roc", "fa-Cyrl-AF-1009-u-ca-roc" }, + + // language _ country -> language _ script _ country + { "pa-IN", "pa-Guru-IN" }, + // also test with script + { "pa-Latn-IN", "pa-Latn-IN" }, + // also test with variants and extensions + { "pa-IN-5678-u-ca-hindi", "pa-Guru-IN-5678-u-ca-hindi" }, + + // language _ script _ country -> language _ country + { "ky-Cyrl-KG", "ky-KG" }, + // also test with variants and extensions + { "ky-Cyrl-KG-3456-u-ca-roc", "ky-KG-3456-u-ca-roc" }, + + // Test replacement of territoryAlias + // 554 has one replacement + { "en-554", "en-NZ" }, + { "en-554-u-nu-arab", "en-NZ-u-nu-arab" }, + // 172 has multiple replacements + // also test with variants + { "ru-172-1234", "ru-RU-1234" }, + // also test with extensions + { "ru-172-1234-u-nu-latn", "ru-RU-1234-u-nu-latn" }, + // also test with scripts + { "uz-172", "uz-UZ" }, + { "uz-Cyrl-172", "uz-Cyrl-UZ" }, + { "uz-Bopo-172", "uz-Bopo-UZ" }, + // also test with variants and scripts + { "uz-Cyrl-172-5678-u-nu-latn", "uz-Cyrl-UZ-5678-u-nu-latn" }, + // a language not used in this region + { "fr-172", "fr-RU" }, + }; + int32_t i; + for (i=0; i < UPRV_LENGTHOF(testCases); i++) { + UErrorCode status = U_ZERO_ERROR; + std::string otag = testCases[i].localeID; + Locale loc = Locale::forLanguageTag(otag.c_str(), status); + loc.canonicalize(status); + std::string tag = loc.toLanguageTag(status); + if (tag != testCases[i].canonicalID) { + errcheckln(status, "FAIL: %s should be canonicalized to %s but got %s - %s", + otag.c_str(), + testCases[i].canonicalID, + tag.c_str(), + u_errorName(status)); + } + } +} + void LocaleTest::TestCurrencyByDate(void) { #if !UCONFIG_NO_FORMATTING diff --git a/icu4c/source/test/intltest/loctest.h b/icu4c/source/test/intltest/loctest.h index 6c41b132bdd..b217ce26c6a 100644 --- a/icu4c/source/test/intltest/loctest.h +++ b/icu4c/source/test/intltest/loctest.h @@ 
-107,6 +107,8 @@ public: void TestCanonicalization(void); + void TestCanonicalize(void); + #if !UCONFIG_NO_FORMATTING static UDate date(int32_t y, int32_t m, int32_t d, int32_t hr = 0, int32_t min = 0, int32_t sec = 0); #endif diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/PluralRulesLoader.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/PluralRulesLoader.java index 8caff7c6cfa..e94d455724a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/PluralRulesLoader.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/PluralRulesLoader.java @@ -12,6 +12,7 @@ import java.text.ParseException; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedHashSet; import java.util.Map; import java.util.MissingResourceException; import java.util.Set; @@ -47,12 +48,11 @@ public class PluralRulesLoader extends PluralRules.Factory { */ public ULocale[] getAvailableULocales() { Set keys = getLocaleIdToRulesIdMap(PluralType.CARDINAL).keySet(); - ULocale[] locales = new ULocale[keys.size()]; - int n = 0; + Set locales = new LinkedHashSet(keys.size()); for (Iterator iter = keys.iterator(); iter.hasNext();) { - locales[n++] = ULocale.createCanonical(iter.next()); + locales.add(ULocale.createCanonical(iter.next())); } - return locales; + return locales.toArray(new ULocale[0]); } /** @@ -499,4 +499,4 @@ public class PluralRulesLoader extends PluralRules.Factory { // now make whole thing immutable localeIdToPluralRanges = Collections.unmodifiableMap(tempLocaleIdToPluralRanges); } -} \ No newline at end of file +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java index f37eade55a5..00659a381cf 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java @@ -498,7 +498,7 @@ public final class ULocale implements Serializable, Comparable { } /** - * {@icu} Creates a 
ULocale from the id by first canonicalizing the id. + * {@icu} Creates a ULocale from the id by first canonicalizing the id according to CLDR. * @param nonCanonicalID the locale id to canonicalize * @return the locale created from the canonical version of the ID. * @stable ICU 3.0 @@ -507,6 +507,16 @@ public final class ULocale implements Serializable, Comparable { return new ULocale(canonicalize(nonCanonicalID), (Locale)null); } + /** + * Creates a ULocale from the locale by first canonicalizing the locale according to CLDR. + * @param locale the ULocale to canonicalize + * @return the ULocale created from the canonical version of the ULocale. + * @draft ICU 67 + */ + public static ULocale createCanonical(ULocale locale) { + return createCanonical(locale.getName()); + } + private static String lscvToID(String lang, String script, String country, String variant) { StringBuilder buf = new StringBuilder(); @@ -1204,8 +1214,8 @@ public final class ULocale implements Serializable, Comparable { } /** - * {@icu} Returns the canonical name for the specified locale ID. This is used to - * convert POSIX and other grandfathered IDs to standard ICU form. + * {@icu} Returns the canonical name according to CLDR for the specified locale ID. + * This is used to convert POSIX and other grandfathered IDs to standard ICU form. * @param localeID the locale id * @return the canonicalized id * @stable ICU 3.0 @@ -1239,6 +1249,144 @@ public final class ULocale implements Serializable, Comparable { } } + // If the BCP 47 primary language subtag matches the type attribute of a languageAlias + // element in Supplemental Data, replace the language subtag with the replacement value. + // If there are additional subtags in the replacement value, add them to the result, but + // only if there is no corresponding subtag already in the tag. + // Five special deprecated grandfathered codes (such as i-default) are in type attributes, and are also replaced. 
+ try { + UResourceBundle languageAlias = UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME, + "metadata", ICUResourceBundle.ICU_DATA_CLASS_LOADER) + .get("alias") + .get("language"); + // language _ variant + if (!parser.getVariant().isEmpty()) { + String [] variants = parser.getVariant().split("_"); + for (String variant : variants) { + try { + // Note the key in the metadata.txt is formatted as language_variant + // instead of language__variant but lscvToID will generate + // language__variant so we have to build the string ourselves. + ULocale replaceLocale = new ULocale(languageAlias.get( + (new StringBuilder(parser.getLanguage().length() + 1 + parser.getVariant().length())) + .append(parser.getLanguage()) + .append("_") + .append(variant) + .toString()) + .get("replacement") + .getString()); + StringBuilder replacedVariant = new StringBuilder(parser.getVariant().length()); + for (String current : variants) { + if (current.equals(variant)) continue; + if (replacedVariant.length() > 0) replacedVariant.append("_"); + replacedVariant.append(current); + } + parser = new LocaleIDParser( + (new StringBuilder(localeID.length())) + .append(lscvToID(replaceLocale.getLanguage(), + !parser.getScript().isEmpty() ? parser.getScript() : replaceLocale.getScript(), + !parser.getCountry().isEmpty() ? 
parser.getCountry() : replaceLocale.getCountry(), + replacedVariant.toString())) + .append(parser.getName().substring(parser.getBaseName().length())) + .toString()); + } catch (MissingResourceException e) { + } + } + } + + // language _ script _ country + // ug_Arab_CN -> ug_CN + if (!parser.getScript().isEmpty() && !parser.getCountry().isEmpty()) { + try { + ULocale replaceLocale = new ULocale(languageAlias.get( + lscvToID(parser.getLanguage(), parser.getScript(), parser.getCountry(), null)) + .get("replacement") + .getString()); + parser = new LocaleIDParser((new StringBuilder(localeID.length())) + .append(lscvToID(replaceLocale.getLanguage(), + replaceLocale.getScript(), + replaceLocale.getCountry(), + parser.getVariant())) + .append(parser.getName().substring(parser.getBaseName().length())) + .toString()); + } catch (MissingResourceException e) { + } + } + // language _ country + // eg. az_AZ -> az_Latn_AZ + if (!parser.getCountry().isEmpty()) { + try { + ULocale replaceLocale = new ULocale(languageAlias.get( + lscvToID(parser.getLanguage(), null, parser.getCountry(), null)) + .get("replacement") + .getString()); + parser = new LocaleIDParser((new StringBuilder(localeID.length())) + .append(lscvToID(replaceLocale.getLanguage(), + parser.getScript().isEmpty() ? replaceLocale.getScript() : parser.getScript(), + replaceLocale.getCountry(), + parser.getVariant())) + .append(parser.getName().substring(parser.getBaseName().length())) + .toString()); + } catch (MissingResourceException e) { + } + } + // only language + // e.g. twi -> ak + try { + ULocale replaceLocale = new ULocale(languageAlias.get(parser.getLanguage()) + .get("replacement") + .getString()); + parser = new LocaleIDParser((new StringBuilder(localeID.length())) + .append(lscvToID(replaceLocale.getLanguage(), + parser.getScript().isEmpty() ? replaceLocale.getScript() : parser.getScript() , + parser.getCountry().isEmpty() ? 
replaceLocale.getCountry() : parser.getCountry() , + parser.getVariant())) + .append(parser.getName().substring(parser.getBaseName().length())) + .toString()); + } catch (MissingResourceException e) { + } + } catch (MissingResourceException e) { + } + + // If the BCP 47 region subtag matches the type attribute of a + // territoryAlias element in Supplemental Data, replace the language + // subtag with the replacement value, as follows: + if (!parser.getCountry().isEmpty()) { + try { + String replacements[] = UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME, + "metadata", ICUResourceBundle.ICU_DATA_CLASS_LOADER) + .get("alias") + .get("territory") + .get(parser.getCountry()) + .get("replacement") + .getString() + .split(" "); + String replacement = replacements[0]; + // If there is a single territory in the replacement, use it. + // If there are multiple territories: + // Look up the most likely territory for the base language code (and script, if there is one). + // If that likely territory is in the list, use it. + // Otherwise, use the first territory in the list. 
+ if (replacements.length > 1) { + String likelyCountry = ULocale.addLikelySubtags( + new ULocale(lscvToID(parser.getLanguage(), parser.getScript(), null, parser.getVariant()))) + .getCountry(); + for (String country : replacements) { + if (country.equals(likelyCountry)) { + replacement = likelyCountry; + break; + } + } + } + parser = new LocaleIDParser( + (new StringBuilder(localeID.length())) + .append(lscvToID(parser.getLanguage(), parser.getScript(), replacement, parser.getVariant())) + .append(parser.getName().substring(parser.getBaseName().length())) + .toString()); + } catch (MissingResourceException e) { + } + } + return parser.getName(); } diff --git a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/format/GlobalizationPreferencesTest.java b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/format/GlobalizationPreferencesTest.java index 28850ba302d..627d286f7e5 100644 --- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/format/GlobalizationPreferencesTest.java +++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/format/GlobalizationPreferencesTest.java @@ -239,7 +239,7 @@ public class GlobalizationPreferencesTest extends TestFmwk { {"fr_CA", "fr"}, {"fr", "fr_CA"}, {"es", "fr", "en_US"}, - {"zh_CN", "zh_Hans", "zh_Hans_CN"}, + {"zh_Hans", "zh_Hans_CN"}, {"en_US_123"}, {"es_US", "es"}, {"de_DE", "es", "fr_FR"}, @@ -261,7 +261,7 @@ public class GlobalizationPreferencesTest extends TestFmwk { {"fr_CA", "fr"}, {"fr_CA", "fr"}, {"es", "fr", "en_US", "en"}, - {"zh_Hans_CN", "zh_CN", "zh_Hans", "zh"}, + {"zh_Hans_CN", "zh_Hans", "zh"}, {"en_US_123", "en_US", "en"}, {"es_US", "es"}, {"de_DE", "de", "es", "fr_FR", "fr"}, @@ -347,10 +347,14 @@ public class GlobalizationPreferencesTest extends TestFmwk { gp.reset(); gp.setLocales(acceptLanguage); - List resultLocales = gp.getLocales(); + List resultLocales = gp.getLocales(); if (resultLocales.size() != RESULTS_LOCALEIDS[i].length) { + StringBuilder res = new StringBuilder(); + for (ULocale l : resultLocales) 
{ + res.append(l.toString()).append(","); + } errln("FAIL: Number of locales mismatch - GP:" + resultLocales.size() - + " Expected:" + RESULTS_LOCALEIDS[i].length); + + " Expected:" + RESULTS_LOCALEIDS[i].length + " index: " + i + " " + res.toString()); } else { for (int j = 0; j < RESULTS_LOCALEIDS[i].length; j++) { diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java index c35deceb050..0d00729578d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java @@ -673,10 +673,10 @@ public class ULocaleTest extends TestFmwk { {"x-piglatin", "", "ML", "", "x-piglatin_ML.MBE", "x-piglatin_ML.MBE", "x-piglatin_ML"}, /* Multibyte English */ {"i-cherokee", "","US", "", "i-Cherokee_US.utf7", "i-cherokee_US.utf7", "i-cherokee_US"}, {"x-filfli", "", "MT", "FILFLA", "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA"}, - {"no", "", "NO", "NY_B", "no-no-ny.utf32@B", "no_NO_NY.utf32@B", "no_NO_NY_B"}, - {"no", "", "NO", "B", "no-no.utf32@B", "no_NO.utf32@B", "no_NO_B"}, - {"no", "", "", "NY", "no__ny", "no__NY", null}, - {"no", "", "", "NY", "no@ny", "no@ny", "no__NY"}, + {"no", "", "NO", "NY_B", "no-no-ny.utf32@B", "no_NO_NY.utf32@B", "nb_NO_NY_B"}, + {"no", "", "NO", "B", "no-no.utf32@B", "no_NO.utf32@B", "nb_NO_B"}, + {"no", "", "", "NY", "no__ny", "no__NY", "nb__NY"}, + {"no", "", "", "NY", "no@ny", "no@ny", "nb__NY"}, {"el", "Latn", "", "", "el-latn", "el_Latn", null}, {"en", "Cyrl", "RU", "", "en-cyrl-ru", "en_Cyrl_RU", null}, {"qq", "Qqqq", "QQ", "QQ", "qq_Qqqq_QQ_QQ", "qq_Qqqq_QQ_QQ", null}, @@ -893,13 +893,13 @@ public class ULocaleTest extends TestFmwk { public void TestCanonicalization(){ final String[][]testCases = new String[][]{ { "zh@collation=pinyin", "zh@collation=pinyin", "zh@collation=pinyin" }, - { "zh_CN@collation=pinyin", 
"zh_CN@collation=pinyin", "zh_CN@collation=pinyin" }, - { "zh_CN_CA@collation=pinyin", "zh_CN_CA@collation=pinyin", "zh_CN_CA@collation=pinyin" }, + { "zh_CN@collation=pinyin", "zh_CN@collation=pinyin", "zh_Hans_CN@collation=pinyin" }, + { "zh_CN_CA@collation=pinyin", "zh_CN_CA@collation=pinyin", "zh_Hans_CN_CA@collation=pinyin" }, { "en_US_POSIX", "en_US_POSIX", "en_US_POSIX" }, { "hy_AM_REVISED", "hy_AM_REVISED", "hy_AM_REVISED" }, - { "no_NO_NY", "no_NO_NY", "no_NO_NY" /* not: "nn_NO" [alan ICU3.0] */ }, - { "no@ny", null, "no__NY" /* not: "nn" [alan ICU3.0] */ }, /* POSIX ID */ - { "no-no.utf32@B", null, "no_NO_B" /* not: "nb_NO_B" [alan ICU3.0] */ }, /* POSIX ID */ + { "no_NO_NY", "no_NO_NY", "nb_NO_NY" /* not: "nn_NO" [alan ICU3.0] */ }, + { "no@ny", null, "nb__NY" /* not: "nn" [alan ICU3.0] */ }, /* POSIX ID */ + { "no-no.utf32@B", null, "nb_NO_B" /* not: "nb_NO_B" [alan ICU3.0] */ }, /* POSIX ID */ { "en-BOONT", "en__BOONT", "en__BOONT" }, /* registered name */ { "de-1901", "de__1901", "de__1901" }, /* registered name */ { "de-1906", "de__1906", "de__1906" }, /* registered name */ @@ -910,7 +910,7 @@ public class ULocaleTest extends TestFmwk { { "x-piglatin_ML.MBE", null, "x-piglatin_ML" }, { "i-cherokee_US.utf7", null, "i-cherokee_US" }, { "x-filfli_MT_FILFLA.gb-18030", null, "x-filfli_MT_FILFLA" }, - { "no-no-ny.utf8@B", null, "no_NO_NY_B" /* not: "nn_NO" [alan ICU3.0] */ }, /* @ ignored unless variant is empty */ + { "no-no-ny.utf8@B", null, "nb_NO_NY_B" /* not: "nn_NO" [alan ICU3.0] */ }, /* @ ignored unless variant is empty */ /* fleshing out canonicalization */ /* sort keywords, ';' is separator so not present at end in canonical form */ @@ -919,7 +919,7 @@ public class ULocaleTest extends TestFmwk { { "en_Hant_IL_VALLEY_GIRL@calendar=Japanese;currency=EUR", "en_Hant_IL_VALLEY_GIRL@calendar=Japanese;currency=EUR", "en_Hant_IL_VALLEY_GIRL@calendar=Japanese;currency=EUR" }, /* norwegian is just too weird, if we handle things in their full generality */ 
/* this is a negative test to show that we DO NOT handle 'lang=no,var=NY' specially. */ - { "no-Hant-GB_NY@currency=$$$", "no_Hant_GB_NY@currency=$$$", "no_Hant_GB_NY@currency=$$$" /* not: "nn_Hant_GB@currency=$$$" [alan ICU3.0] */ }, + { "no-Hant-GB_NY@currency=$$$", "no_Hant_GB_NY@currency=$$$", "nb_Hant_GB_NY@currency=$$$" /* not: "nn_Hant_GB@currency=$$$" [alan ICU3.0] */ }, /* test cases reflecting internal resource bundle usage */ /* root is just a language */ @@ -957,14 +957,14 @@ public class ULocaleTest extends TestFmwk { { "hi__DIRECT", "hi__DIRECT", "hi__DIRECT" }, { "ja_JP_TRADITIONAL", "ja_JP_TRADITIONAL", "ja_JP_TRADITIONAL" }, { "th_TH_TRADITIONAL", "th_TH_TRADITIONAL", "th_TH_TRADITIONAL" }, - { "zh_TW_STROKE", "zh_TW_STROKE", "zh_TW_STROKE" }, + { "zh_TW_STROKE", "zh_TW_STROKE", "zh_Hant_TW_STROKE" }, { "zh__PINYIN", "zh__PINYIN", "zh__PINYIN" }, { "qz-qz@Euro", null, "qz_QZ_EURO" }, /* qz-qz uses private use iso codes */ { "sr-SP-Cyrl", "sr_SP_CYRL", "sr_SP_CYRL" }, /* .NET name */ { "sr-SP-Latn", "sr_SP_LATN", "sr_SP_LATN" }, /* .NET name */ - { "sr_YU_CYRILLIC", "sr_YU_CYRILLIC", "sr_YU_CYRILLIC" }, /* Linux name */ - { "uz-UZ-Cyrl", "uz_UZ_CYRL", "uz_UZ_CYRL" }, /* .NET name */ - { "uz-UZ-Latn", "uz_UZ_LATN", "uz_UZ_LATN" }, /* .NET name */ + { "sr_YU_CYRILLIC", "sr_YU_CYRILLIC", "sr_RS_CYRILLIC" }, /* Linux name */ + { "uz-UZ-Cyrl", "uz_UZ_CYRL", "uz_Latn_UZ_CYRL" }, /* .NET name */ + { "uz-UZ-Latn", "uz_UZ_LATN", "uz_Latn_UZ_LATN" }, /* .NET name */ { "zh-CHS", "zh_CHS", "zh_CHS" }, /* .NET name */ { "zh-CHT", "zh_CHT", "zh_CHT" }, /* .NET name This may change back to zh_Hant */ /* PRE_EURO and EURO conversions don't affect other keywords */ @@ -1590,7 +1590,7 @@ public class ULocaleTest extends TestFmwk { /*3*/ { null, "true" }, /*4*/ { "es", "false" }, /*5*/ { "de", "false" }, - /*6*/ { "zh_TW", "false" }, + /*6*/ { "zh_Hant_TW", "false" }, /*7*/ { "zh", "true" }, }; @@ -5154,4 +5154,84 @@ public class ULocaleTest extends TestFmwk { 
Assert.assertEquals(testData[row][1], loc.toLanguageTag()); } } + + // Helper function: parses languageTag with ULocale.forLanguageTag, canonicalizes it with ULocale.createCanonical, and returns the result of toLanguageTag() + private String canonicalTag(String languageTag) { + return ULocale.createCanonical(ULocale.forLanguageTag(languageTag)).toLanguageTag(); + } + + @Test + public void TestCanonical() { + // Test replacement of languageAlias + + // language _ variant -> language + Assert.assertEquals("nb", canonicalTag("no-BOKMAL")); + // also test with script, country and extensions + Assert.assertEquals("nb-Cyrl-ID-u-ca-japanese", canonicalTag("no-Cyrl-ID-BOKMAL-u-ca-japanese")); + // also test with other variants, script, country and extensions + Assert.assertEquals("nb-Cyrl-ID-1901-xsistemo-u-ca-japanese", + canonicalTag("no-Cyrl-ID-1901-BOKMAL-xsistemo-u-ca-japanese")); + Assert.assertEquals("nb-Cyrl-ID-1901-u-ca-japanese", + canonicalTag("no-Cyrl-ID-1901-BOKMAL-u-ca-japanese")); + Assert.assertEquals("nb-Cyrl-ID-xsistemo-u-ca-japanese", + canonicalTag("no-Cyrl-ID-BOKMAL-xsistemo-u-ca-japanese")); + + Assert.assertEquals("nn", canonicalTag("no-NYNORSK")); + // also test with script, country and extensions + Assert.assertEquals("nn-Cyrl-ID-u-ca-japanese", canonicalTag("no-Cyrl-ID-NYNORSK-u-ca-japanese")); + + Assert.assertEquals("ssy", canonicalTag("aa-SAAHO")); + // also test with script, country and extensions + Assert.assertEquals("ssy-Devn-IN-u-ca-japanese", canonicalTag("aa-Devn-IN-SAAHO-u-ca-japanese")); + + // language -> language + Assert.assertEquals("aas", canonicalTag("aam")); + // also test with script, country, variants and extensions + Assert.assertEquals("aas-Cyrl-ID-3456-u-ca-japanese", canonicalTag("aam-Cyrl-ID-3456-u-ca-japanese")); + + // language -> language _ Script + Assert.assertEquals("sr-Latn", canonicalTag("sh")); + // also test with script: an explicit script in the input is expected to win over the Latn from the replacement + Assert.assertEquals("sr-Cyrl", canonicalTag("sh-Cyrl")); + // also test with country, variants and extensions + Assert.assertEquals("sr-Latn-ID-3456-u-ca-roc", canonicalTag("sh-ID-3456-u-ca-roc")); + + // language -> language _
country + Assert.assertEquals("fa-AF", canonicalTag("prs")); + // also test with country: an explicit country in the input is expected to win over the AF from the replacement + Assert.assertEquals("fa-RU", canonicalTag("prs-RU")); + // also test with script, variants and extensions + Assert.assertEquals("fa-Cyrl-AF-1009-u-ca-roc", canonicalTag("prs-Cyrl-1009-u-ca-roc")); + + // language _ country -> language _ script _ country + Assert.assertEquals("pa-Guru-IN", canonicalTag("pa-IN")); + // also test with script: with an explicit script the tag is expected to stay unchanged + Assert.assertEquals("pa-Latn-IN", canonicalTag("pa-Latn-IN")); + // also test with variants and extensions + Assert.assertEquals("pa-Guru-IN-5678-u-ca-hindi", canonicalTag("pa-IN-5678-u-ca-hindi")); + + // language _ script _ country -> language _ country + Assert.assertEquals("ky-KG", canonicalTag("ky-Cyrl-KG")); + // also test with variants and extensions + Assert.assertEquals("ky-KG-3456-u-ca-roc", canonicalTag("ky-Cyrl-KG-3456-u-ca-roc")); + + // Test replacement of territoryAlias + // 554 has one replacement + Assert.assertEquals("en-NZ", canonicalTag("en-554")); + Assert.assertEquals("en-NZ-u-nu-arab", canonicalTag("en-554-u-nu-arab")); + + // 172 has multiple replacements + // also test with variants + Assert.assertEquals("ru-RU-1234", canonicalTag("ru-172-1234")); + // also test with variants and extensions + Assert.assertEquals("ru-RU-1234-u-nu-latn", canonicalTag("ru-172-1234-u-nu-latn")); + Assert.assertEquals("uz-UZ", canonicalTag("uz-172")); + // also test with scripts + Assert.assertEquals("uz-Cyrl-UZ", canonicalTag("uz-Cyrl-172")); + Assert.assertEquals("uz-Bopo-UZ", canonicalTag("uz-Bopo-172")); + // also test with variants, scripts and extensions + Assert.assertEquals("uz-Cyrl-UZ-5678-u-nu-latn", canonicalTag("uz-Cyrl-172-5678-u-nu-latn")); + // a language not used in this region + Assert.assertEquals("fr-RU", canonicalTag("fr-172")); + } }