From 84f6735fde76e44de2d4d1ad682eba1c77669a68 Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Wed, 23 Oct 2019 17:12:56 +0000 Subject: [PATCH] ICU-20478 Sort variant in (for|to)LanguageTag of icu::Locale and ULocale See #836 --- icu4c/source/common/uloc_tag.cpp | 17 +++++++++++++++++ icu4c/source/test/cintltst/cloctst.c | 16 ++++++++++++++-- icu4c/source/test/intltest/loctest.cpp | 12 ++++++++++++ .../impl/locale/InternalLocaleBuilder.java | 4 +++- .../core/src/com/ibm/icu/util/ULocale.java | 5 ++++- .../ibm/icu/dev/test/util/ULocaleTest.java | 19 ++++++++++++++++--- 6 files changed, 66 insertions(+), 7 deletions(-) diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index 1c10c48182c..2b76a9273c6 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ -1110,6 +1110,19 @@ _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool stri } } +static void _sortVariants(VariantListEntry* first) { + for (VariantListEntry* var1 = first; var1 != NULL; var1 = var1->next) { + for (VariantListEntry* var2 = var1->next; var2 != NULL; var2 = var2->next) { + // Swap var1->variant and var2->variant. + if (uprv_compareInvCharsAsAscii(var1->variant, var2->variant) > 0) { + const char* temp = var1->variant; + var1->variant = var2->variant; + var2->variant = temp; + } + } + } +} + static void _appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) { char buf[ULOC_FULLNAME_CAPACITY]; @@ -1199,6 +1212,9 @@ _appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool st if (varFirst != NULL) { int32_t varLen; + /* per UTS35, we should sort the variants */ + _sortVariants(varFirst); + /* write out validated/normalized variants to the target */ var = varFirst; while (var != NULL) { @@ -2822,6 +2838,7 @@ ulocimp_forLanguageTag(const char* langtag, } /* variants */ + _sortVariants(lt.getAlias()->variants); n = ultag_getVariantsSize(lt.getAlias()); if (n > 0) { if (noRegion) { diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 0a1a0ae9f0c..36cd7825c59 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -6029,7 +6029,10 @@ const char* const locale_to_langtag[][3] = { {"aa_BB_CYRL", "aa-BB-x-lvariant-cyrl", NULL}, {"en_US_1234", "en-US-1234", "en-US-1234"}, {"en_US_VARIANTA_VARIANTB", "en-US-varianta-variantb", "en-US-varianta-variantb"}, - {"ja__9876_5432", "ja-9876-5432", "ja-9876-5432"}, + {"en_US_VARIANTB_VARIANTA", "en-US-varianta-variantb", "en-US-varianta-variantb"}, /* ICU-20478 */ + {"ja__9876_5432", "ja-5432-9876", "ja-5432-9876"}, /* ICU-20478 */ + {"sl__ROZAJ_BISKE_1994", "sl-1994-biske-rozaj", "sl-1994-biske-rozaj"}, /* ICU-20478 */ + {"en__SCOUSE_FONIPA", "en-fonipa-scouse", "en-fonipa-scouse"}, /* ICU-20478 */ {"zh_Hant__VAR", "zh-Hant-x-lvariant-var", NULL}, {"es__BADVARIANT_GOODVAR", "es-goodvar", NULL}, {"en@calendar=gregorian", "en-u-ca-gregory", "en-u-ca-gregory"}, @@ -6187,7 +6190,16 @@ static const struct { {"bogus", "bogus", FULL_LENGTH}, {"boguslang", "", 0}, {"EN-lATN-us", "en_Latn_US", FULL_LENGTH}, - {"und-variant-1234", "__VARIANT_1234", FULL_LENGTH}, + {"und-variant-1234", "__1234_VARIANT", FULL_LENGTH}, /* ICU-20478 */ + {"ja-9876-5432", "ja__5432_9876", FULL_LENGTH}, /* ICU-20478 */ + {"en-US-varianta-variantb", "en_US_VARIANTA_VARIANTB", FULL_LENGTH}, /* ICU-20478 */ + {"en-US-variantb-varianta", "en_US_VARIANTA_VARIANTB", FULL_LENGTH}, /* ICU-20478 */ + {"sl-rozaj-1994-biske", "sl__1994_BISKE_ROZAJ", FULL_LENGTH}, /* ICU-20478 */ + {"sl-biske-1994-rozaj", "sl__1994_BISKE_ROZAJ", FULL_LENGTH}, /* ICU-20478 */ + {"sl-1994-rozaj-biske", "sl__1994_BISKE_ROZAJ", FULL_LENGTH}, /* ICU-20478 */ + {"sl-rozaj-biske-1994", "sl__1994_BISKE_ROZAJ", FULL_LENGTH}, /* ICU-20478 */ + {"en-fonipa-scouse", "en__FONIPA_SCOUSE", FULL_LENGTH}, /* ICU-20478 */ + {"en-scouse-fonipa", "en__FONIPA_SCOUSE", FULL_LENGTH}, /* ICU-20478 */ {"und-varzero-var1-vartwo", "__VARZERO", 11}, {"en-u-ca-gregory", "en@calendar=gregorian", FULL_LENGTH}, {"en-U-cu-USD", "en@currency=usd", FULL_LENGTH}, diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index 73f1b73794a..06fd6176094 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -3156,6 +3156,7 @@ void LocaleTest::TestForLanguageTag() { static const char tag_ill[] = "!"; static const char tag_no_nul[] = { 'e', 'n', '-', 'G', 'B' }; static const char tag_ext[] = "en-GB-1-abc-efg-a-xyz"; + static const char tag_var[] = "sl-rozaj-biske-1994"; static const Locale loc_en("en_US"); static const Locale loc_oed("en_GB_OXENDICT"); @@ -3163,6 +3164,7 @@ void LocaleTest::TestForLanguageTag() { static const Locale loc_null(""); static const Locale loc_gb("en_GB"); static const Locale loc_ext("en_GB@1=abc-efg;a=xyz"); + static const Locale loc_var("sl__1994_BISKE_ROZAJ"); Locale result_en = Locale::forLanguageTag(tag_en, status); status.errIfFailureAndReset("\"%s\"", tag_en); @@ -3176,6 +3178,10 @@ void LocaleTest::TestForLanguageTag() { status.errIfFailureAndReset("\"%s\"", tag_af); assertEquals(tag_af, loc_af.getName(), result_af.getName()); + Locale result_var = Locale::forLanguageTag(tag_var, status); + status.errIfFailureAndReset("\"%s\"", tag_var); + assertEquals(tag_var, loc_var.getName(), result_var.getName()); + Locale result_ill = Locale::forLanguageTag(tag_ill, status); assertEquals(tag_ill, U_ILLEGAL_ARGUMENT_ERROR, status.reset()); assertTrue(result_ill.getName(), result_ill.isBogus()); @@ -3210,12 +3216,14 @@ void LocaleTest::TestToLanguageTag() { static const Locale loc_ext("en@0=abc;a=xyz"); static const Locale loc_empty(""); static const Locale loc_ill("!"); + static const Locale loc_variant("sl__ROZAJ_BISKE_1994"); static const char tag_c[] = "en-US-u-va-posix"; static const char tag_en[] = "en-US"; static const char tag_af[] = "af-t-ar-i0-handwrit-u-ca-coptic-x-foo"; static const char tag_ext[] = "en-0-abc-a-xyz"; static const char tag_und[] = "und"; + static const char tag_variant[] = "sl-1994-biske-rozaj"; std::string result; StringByteSink sink(&result); @@ -3247,6 +3255,10 @@ void LocaleTest::TestToLanguageTag() { status.errIfFailureAndReset("\"%s\"", loc_ill.getName()); assertEquals(loc_ill.getName(), tag_und, result_ill.c_str()); + std::string result_variant = loc_variant.toLanguageTag(status); + status.errIfFailureAndReset("\"%s\"", loc_variant.getName()); + assertEquals(loc_variant.getName(), tag_variant, result_variant.c_str()); + Locale loc_bogus; loc_bogus.setToBogus(); std::string result_bogus = loc_bogus.toLanguageTag(status); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/InternalLocaleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/InternalLocaleBuilder.java index 615156b86f7..2a676a369be 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/InternalLocaleBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/InternalLocaleBuilder.java @@ -9,6 +9,7 @@ package com.ibm.icu.impl.locale; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -326,7 +327,8 @@ public final class InternalLocaleBuilder { _script = langtag.getScript(); _region = langtag.getRegion(); - List bcpVariants = langtag.getVariants(); + ArrayList bcpVariants = new ArrayList(langtag.getVariants()); + Collections.sort(bcpVariants); if (bcpVariants.size() > 0) { StringBuilder var = new StringBuilder(bcpVariants.get(0)); for (int i = 1; i < bcpVariants.size(); i++) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java index a8d8f7f115f..06de947bb09 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java @@ -3246,7 +3246,10 @@ public final class ULocale implements Serializable, Comparable { } Listsubtags = tag.getVariants(); - for (String s : subtags) { + // ICU-20478: Sort variants per UTS35. + ArrayList variants = new ArrayList(subtags); + Collections.sort(variants); + for (String s : variants) { buf.append(LanguageTag.SEP); buf.append(LanguageTag.canonicalizeVariant(s)); } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java index fc2dc4f98cb..4c4ba972a08 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java @@ -4106,8 +4106,8 @@ public class ULocaleTest extends TestFmwk { {"aa_BB_CYRL", "aa-BB-x-lvariant-cyrl"}, {"en_US_1234", "en-US-1234"}, {"en_US_VARIANTA_VARIANTB", "en-US-varianta-variantb"}, - {"en_US_VARIANTB_VARIANTA", "en-US-variantb-varianta"}, - {"ja__9876_5432", "ja-9876-5432"}, + {"en_US_VARIANTB_VARIANTA", "en-US-varianta-variantb"}, /* ICU-20478 */ + {"ja__9876_5432", "ja-5432-9876"}, /* ICU-20478 */ {"zh_Hant__VAR", "zh-Hant-x-lvariant-var"}, {"es__BADVARIANT_GOODVAR", "es"}, {"es__GOODVAR_BAD_BADVARIANT", "es-goodvar-x-lvariant-bad"}, @@ -4131,6 +4131,9 @@ public class ULocaleTest extends TestFmwk { {"en@a=bar;attribute=baz;calendar=islamic-civil;x=u-foo", "en-a-bar-u-baz-ca-islamic-civil-x-u-foo"}, /* ICU-20320*/ {"en@9=efg;a=baz", "en-9-efg-a-baz"}, + /* ICU-20478 */ + {"sl__ROZAJ_BISKE_1994", "sl-1994-biske-rozaj"}, + {"en__SCOUSE_FONIPA", "en-fonipa-scouse"}, }; for (int i = 0; i < locale_to_langtag.length; i++) { @@ -4228,7 +4231,7 @@ public class ULocaleTest extends TestFmwk { {"bogus", "bogus", NOERROR}, {"boguslang", "", Integer.valueOf(0)}, {"EN-lATN-us", "en_Latn_US", NOERROR}, - {"und-variant-1234", "__VARIANT_1234", NOERROR}, + {"und-variant-1234", "__1234_VARIANT", NOERROR}, /* ICU-20478 */ {"und-varzero-var1-vartwo", "__VARZERO", Integer.valueOf(12)}, {"en-u-ca-gregory", "en@calendar=gregorian", NOERROR}, {"en-U-cu-USD", "en@currency=usd", NOERROR}, @@ -4274,6 +4277,16 @@ public class ULocaleTest extends TestFmwk { /* #20410 */ {"art-lojban-x-0", "jbo@x=0", NOERROR}, {"zh-xiang-u-nu-thai-x-0", "hsn@numbers=thai;x=0", NOERROR}, + /* ICU-20478 */ + {"ja-9876-5432", "ja__5432_9876", NOERROR}, + {"en-US-variantb-varianta", "en_US_VARIANTA_VARIANTB", NOERROR}, + {"en-US-varianta-variantb", "en_US_VARIANTA_VARIANTB", NOERROR}, + {"sl-rozaj-biske-1994", "sl__1994_BISKE_ROZAJ", NOERROR}, + {"sl-biske-rozaj-1994", "sl__1994_BISKE_ROZAJ", NOERROR}, + {"sl-biske-1994-rozaj", "sl__1994_BISKE_ROZAJ", NOERROR}, + {"sl-1994-biske-rozaj", "sl__1994_BISKE_ROZAJ", NOERROR}, + {"en-fonipa-scouse", "en__FONIPA_SCOUSE", NOERROR}, + {"en-scouse-fonipa", "en__FONIPA_SCOUSE", NOERROR}, }; for (int i = 0; i < langtag_to_locale.length; i++) {