diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp index 19b61cd50e2..a2c38a95e16 100644 --- a/icu4c/source/common/locid.cpp +++ b/icu4c/source/common/locid.cpp @@ -1025,13 +1025,14 @@ Locale::forLanguageTag(StringPiece tag, UErrorCode& status) return result; } - // If a BCP-47 language tag is passed as the language parameter to the + // If a BCP 47 language tag is passed as the language parameter to the // normal Locale constructor, it will actually fall back to invoking // uloc_forLanguageTag() to parse it if it somehow is able to detect that - // the string actually is BCP-47. This works well for things like strings - // using BCP-47 extensions, but it does not at all work for things like - // BCP-47 grandfathered tags (eg. "en-GB-oed") which are possible to also - // interpret as ICU locale IDs and because of that won't trigger the BCP-47 + // the string actually is BCP 47. This works well for things like strings + // using BCP 47 extensions, but it does not at all work for things like + // legacy language tags (marked as “Type: grandfathered” in BCP 47, + // e.g., "en-GB-oed") which can also be + // interpreted as ICU locale IDs and because of that won't trigger the BCP 47 // parsing. Therefore the code here explicitly calls uloc_forLanguageTag() // and then Locale::init(), instead of just calling the normal constructor. diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index ad5dd6430c9..5eed02c6114 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ -53,7 +53,7 @@ typedef struct ULanguageTag { VariantListEntry *variants; ExtensionListEntry *extensions; const char *privateuse; - const char *grandfathered; + const char *legacy; } ULanguageTag; #define MINLEN 2 @@ -85,8 +85,9 @@ static const char LOCALE_TYPE_YES[] = "yes"; Updated on 2018-09-12 from https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . - This table has 2 parts. 
The parts for Grandfathered tags is generated by the - following scripts from the IANA language tag registry. + This table has 2 parts. The part for + legacy language tags (marked as “Type: grandfathered” in BCP 47) + is generated by the following scripts from the IANA language tag registry. curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\ egrep -A 7 'Type: grandfathered' | \ @@ -100,8 +101,8 @@ static const char LOCALE_TYPE_YES[] = "yes"; values. They may have to be removed for the strict BCP 47 compliance. */ -static const char* const GRANDFATHERED[] = { -/* grandfathered preferred */ +static const char* const LEGACY[] = { +/* legacy preferred */ "art-lojban", "jbo", "en-gb-oed", "en-gb-oxendict", "i-ami", "ami", @@ -124,7 +125,7 @@ static const char* const GRANDFATHERED[] = { "zh-min-nan", "nan", "zh-xiang", "hsn", - // Grandfathered tags with no preferred value in the IANA + // Legacy tags with no preferred value in the IANA // registry. Kept for now for the backward compatibility // because ICU has mapped them this way. 
"cel-gaulish", "xtg-x-cel-gaulish", @@ -346,7 +347,7 @@ ultag_getPrivateUse(const ULanguageTag* langtag); #if 0 static const char* -ultag_getGrandfathered(const ULanguageTag* langtag); +ultag_getLegacy(const ULanguageTag* langtag); #endif U_NAMESPACE_BEGIN @@ -986,7 +987,7 @@ _initializeULanguageTag(ULanguageTag* langtag) { langtag->variants = NULL; langtag->extensions = NULL; - langtag->grandfathered = EMPTY; + langtag->legacy = EMPTY; langtag->privateuse = EMPTY; } @@ -2042,7 +2043,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta char *pExtValueSubtag, *pExtValueSubtagEnd; int32_t i; UBool privateuseVar = FALSE; - int32_t grandfatheredLen = 0; + int32_t legacyLen = 0; if (parsedLen != NULL) { *parsedLen = 0; @@ -2082,25 +2083,25 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta } size_t parsedLenDelta = 0; - // Grandfathered tag will be consider together. Grandfathered tag with intervening + // Legacy tags will be considered together. Legacy tags with intervening // script and region such as art-DE-lojban or art-Latn-lojban won't be // matched. - /* check if the tag is grandfathered */ - for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) { - int32_t checkGrandfatheredLen = static_cast(uprv_strlen(GRANDFATHERED[i])); - if (tagLen < checkGrandfatheredLen) { + /* check if the tag is legacy */ + for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) { + int32_t checkLegacyLen = static_cast(uprv_strlen(LEGACY[i])); + if (tagLen < checkLegacyLen) { continue; } - if (tagLen > checkGrandfatheredLen && tagBuf[checkGrandfatheredLen] != '-') { + if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') { // make sure next char is '-'. 
continue; } - if (uprv_strnicmp(GRANDFATHERED[i], tagBuf, checkGrandfatheredLen) == 0) { + if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) { int32_t newTagLength; - grandfatheredLen = checkGrandfatheredLen; /* back up for output parsedLen */ - int32_t replacementLen = static_cast(uprv_strlen(GRANDFATHERED[i+1])); - newTagLength = replacementLen + tagLen - checkGrandfatheredLen; + legacyLen = checkLegacyLen; /* back up for output parsedLen */ + int32_t replacementLen = static_cast(uprv_strlen(LEGACY[i+1])); + newTagLength = replacementLen + tagLen - checkLegacyLen; if (tagLen < newTagLength) { uprv_free(tagBuf); tagBuf = (char*)uprv_malloc(newTagLength + 1); @@ -2111,16 +2112,16 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta t->buf = tagBuf; tagLen = newTagLength; } - parsedLenDelta = checkGrandfatheredLen - replacementLen; - uprv_strcpy(t->buf, GRANDFATHERED[i + 1]); - if (checkGrandfatheredLen != tagLen) { - uprv_strcpy(t->buf + replacementLen, tag + checkGrandfatheredLen); + parsedLenDelta = checkLegacyLen - replacementLen; + uprv_strcpy(t->buf, LEGACY[i + 1]); + if (checkLegacyLen != tagLen) { + uprv_strcpy(t->buf + replacementLen, tag + checkLegacyLen); } break; } } - if (grandfatheredLen == 0) { + if (legacyLen == 0) { for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) { const char* redundantTag = REDUNDANT[i]; size_t redundantTagLen = uprv_strlen(redundantTag); @@ -2608,8 +2609,8 @@ ultag_getPrivateUse(const ULanguageTag* langtag) { #if 0 static const char* -ultag_getGrandfathered(const ULanguageTag* langtag) { - return langtag->grandfathered; +ultag_getLegacy(const ULanguageTag* langtag) { + return langtag->legacy; } #endif diff --git a/icu4c/source/common/ulocimp.h b/icu4c/source/common/ulocimp.h index b4d64c55cfb..cd5a357f82e 100644 --- a/icu4c/source/common/ulocimp.h +++ b/icu4c/source/common/ulocimp.h @@ -109,13 +109,17 @@ ulocimp_toLanguageTag(const char* localeID, * If the specified language tag contains 
any ill-formed subtags, * the first such subtag and all following subtags are ignored. *

- * This implements the 'Language-Tag' production of BCP47, and so - * supports grandfathered (regular and irregular) as well as private - * use language tags. Private use tags are represented as 'x-whatever', - * and grandfathered tags are converted to their canonical replacements - * where they exist. Note that a few grandfathered tags have no modern - * replacement, these will be converted using the fallback described in + * This implements the 'Language-Tag' production of BCP 47, and so + * supports legacy language tags (marked as “Type: grandfathered” in BCP 47) + * (regular and irregular) as well as private use language tags. + * + * Private use tags are represented as 'x-whatever', + * and legacy tags are converted to their canonical replacements where they exist. + * + * Note that a few legacy tags have no modern replacement; + * these will be converted using the fallback described in * the first paragraph, so some information might be lost. + * * @param langtag the input BCP47 language tag. * @param tagLen the length of langtag, or -1 to call uprv_strlen(). * @param sink the output sink receiving a locale ID for the diff --git a/icu4c/source/common/unicode/localebuilder.h b/icu4c/source/common/unicode/localebuilder.h index c5836fe2702..664ee6a84c4 100644 --- a/icu4c/source/common/unicode/localebuilder.h +++ b/icu4c/source/common/unicode/localebuilder.h @@ -92,11 +92,12 @@ public: /** * Resets the LocaleBuilder to match the provided * [Unicode Locale Identifier](http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_id) . - * Discards the existing state. the empty string cause the builder to be - * reset, like {@link #clear}. Grandfathered tags are converted to their - * canonical form before being processed. Otherwise, the language - * tag must be well-formed, or else the build() method will later - * report an U_ILLEGAL_ARGUMENT_ERROR. + * Discards the existing state. + * The empty string causes the builder to be reset, like {@link #clear}. 
+ * Legacy language tags (marked as “Type: grandfathered” in BCP 47) + * are converted to their canonical form before being processed. + * Otherwise, the language tag must be well-formed, + * or else the build() method will later report a U_ILLEGAL_ARGUMENT_ERROR. * *

This method clears the internal UErrorCode. * diff --git a/icu4c/source/common/unicode/locid.h b/icu4c/source/common/unicode/locid.h index 1d031daabc5..f955743bce4 100644 --- a/icu4c/source/common/unicode/locid.h +++ b/icu4c/source/common/unicode/locid.h @@ -393,13 +393,17 @@ public: * If the specified language tag contains any ill-formed subtags, * the first such subtag and all following subtags are ignored. *

- * This implements the 'Language-Tag' production of BCP47, and so - * supports grandfathered (regular and irregular) as well as private - * use language tags. Private use tags are represented as 'x-whatever', - * and grandfathered tags are converted to their canonical replacements - * where they exist. Note that a few grandfathered tags have no modern - * replacement, these will be converted using the fallback described in + * This implements the 'Language-Tag' production of BCP 47, and so + * supports legacy language tags (marked as “Type: grandfathered” in BCP 47) + * (regular and irregular) as well as private use language tags. + * + * Private use tags are represented as 'x-whatever', + * and legacy tags are converted to their canonical replacements where they exist. + * + * Note that a few legacy tags have no modern replacement; + * these will be converted using the fallback described in * the first paragraph, so some information might be lost. + * * @param tag the input BCP47 language tag. * @param status error information if creating the Locale failed. * @return the Locale for the specified BCP47 language tag. diff --git a/icu4c/source/common/unicode/uloc.h b/icu4c/source/common/unicode/uloc.h index d3de89f993e..fa380929806 100644 --- a/icu4c/source/common/unicode/uloc.h +++ b/icu4c/source/common/unicode/uloc.h @@ -1237,14 +1237,18 @@ uloc_minimizeSubtags(const char* localeID, * Returns a locale ID for the specified BCP47 language tag string. * If the specified language tag contains any ill-formed subtags, * the first such subtag and all following subtags are ignored. - *

- * This implements the 'Language-Tag' production of BCP47, and so - * supports grandfathered (regular and irregular) as well as private - * use language tags. Private use tags are represented as 'x-whatever', - * and grandfathered tags are converted to their canonical replacements - * where they exist. Note that a few grandfathered tags have no modern - * replacement, these will be converted using the fallback described in + *

+ * This implements the 'Language-Tag' production of BCP 47, and so + * supports legacy language tags (marked as “Type: grandfathered” in BCP 47) + * (regular and irregular) as well as private use language tags. + * + * Private use tags are represented as 'x-whatever', + * and legacy tags are converted to their canonical replacements where they exist. + * + * Note that a few legacy tags have no modern replacement; + * these will be converted using the fallback described in * the first paragraph, so some information might be lost. + * * @param langtag the input BCP47 language tag. * @param localeID the output buffer receiving a locale ID for the * specified BCP47 language tag. diff --git a/icu4c/source/i18n/calendar.cpp b/icu4c/source/i18n/calendar.cpp index 9497a85fd05..c3e5e8c4f66 100644 --- a/icu4c/source/i18n/calendar.cpp +++ b/icu4c/source/i18n/calendar.cpp @@ -266,7 +266,7 @@ static ECalType getCalendarTypeForLocale(const char *locid) { //TODO: ULOC_FULL_NAME is out of date and too small.. char canonicalName[256]; - // canonicalize, so grandfathered variant will be transformed to keywords + // Canonicalize, so that an old-style variant will be transformed to keywords. // e.g ja_JP_TRADITIONAL -> ja_JP@calendar=japanese // NOTE: Since ICU-20187, ja_JP_TRADITIONAL no longer canonicalizes, and // the Gregorian calendar is returned instead. diff --git a/icu4c/source/i18n/fmtable_cnv.cpp b/icu4c/source/i18n/fmtable_cnv.cpp index 9a647927797..bc3847b6963 100644 --- a/icu4c/source/i18n/fmtable_cnv.cpp +++ b/icu4c/source/i18n/fmtable_cnv.cpp @@ -30,8 +30,6 @@ U_NAMESPACE_BEGIN // ------------------------------------- // Creates a formattable object with a char* string. // This API is useless. The API that takes a UnicodeString is actually just as good. -// This is just a grandfathered API. 
- Formattable::Formattable(const char* stringToCopy) { init(); diff --git a/icu4c/source/test/testdata/localeMatcherTest.txt b/icu4c/source/test/testdata/localeMatcherTest.txt index 7a10986737f..70afcd02344 100644 --- a/icu4c/source/test/testdata/localeMatcherTest.txt +++ b/icu4c/source/test/testdata/localeMatcherTest.txt @@ -279,7 +279,7 @@ und-TW >> zh-Hant zh-Hant >> und-TW zh >> und-TW -** test: testMatchGrandfatheredCode +** test: testMatchLegacyCode @supported=fr, i-klingon, en-Latn-US en-GB-oed >> en-Latn-US @@ -984,7 +984,7 @@ x-bork >> x-bork x-piglatin >> x-bork x-bork >> x-bork -** test: MatchGrandfatheredCode +** test: MatchLegacyCode @supported=fr, i-klingon, en-Latn-US en-GB-oed >> en-Latn-US i-klingon >> tlh @@ -1525,7 +1525,7 @@ en >> null x-piglatin >> fr x-bork >> x-bork -** test: grandfathered codes +** test: legacy codes @supported=fr, i-klingon, en-Latn-US en-GB-oed >> en-Latn-US i-klingon >> tlh diff --git a/icu4j/eclipse-build/plugins.template/com.ibm.icu.base/src/com/ibm/icu/impl/locale/LanguageTag.java b/icu4j/eclipse-build/plugins.template/com.ibm.icu.base/src/com/ibm/icu/impl/locale/LanguageTag.java index c5cf38ef12d..3adf08a557e 100644 --- a/icu4j/eclipse-build/plugins.template/com.ibm.icu.base/src/com/ibm/icu/impl/locale/LanguageTag.java +++ b/icu4j/eclipse-build/plugins.template/com.ibm.icu.base/src/com/ibm/icu/impl/locale/LanguageTag.java @@ -38,13 +38,13 @@ public class LanguageTag { private List _variants = Collections.emptyList(); // variant subtags private List _extensions = Collections.emptyList(); // extensions - // Map contains grandfathered tags and its preferred mappings from - // http://www.ietf.org/rfc/rfc5646.txt - private static final Map GRANDFATHERED = + // The Map contains legacy language tags (marked as “Type: grandfathered” in BCP 47) + // and their preferred mappings from BCP 47. 
+ private static final Map LEGACY = new HashMap(); static { - // grandfathered = irregular ; non-redundant tags registered + // legacy = irregular ; non-redundant tags registered // / regular ; during the RFC 3066 era // // irregular = "en-GB-oed" ; irregular tags do not match @@ -105,57 +105,17 @@ public class LanguageTag { {"zh-xiang", "hsn"}, }; for (String[] e : entries) { - GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e); + LEGACY.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e); } } private LanguageTag() { } - /* - * BNF in RFC5464 - * - * Language-Tag = langtag ; normal language tags - * / privateuse ; private use tag - * / grandfathered ; grandfathered tags - * - * - * langtag = language - * ["-" script] - * ["-" region] - * *("-" variant) - * *("-" extension) - * ["-" privateuse] - * - * language = 2*3ALPHA ; shortest ISO 639 code - * ["-" extlang] ; sometimes followed by - * ; extended language subtags - * / 4ALPHA ; or reserved for future use - * / 5*8ALPHA ; or registered language subtag - * - * extlang = 3ALPHA ; selected ISO 639 codes - * *2("-" 3ALPHA) ; permanently reserved - * - * script = 4ALPHA ; ISO 15924 code - * - * region = 2ALPHA ; ISO 3166-1 code - * / 3DIGIT ; UN M.49 code - * - * variant = 5*8alphanum ; registered variants - * / (DIGIT 3alphanum) - * - * extension = singleton 1*("-" (2*8alphanum)) - * - * ; Single alphanumerics - * ; "x" reserved for private use - * singleton = DIGIT ; 0 - 9 - * / %x41-57 ; A - W - * / %x59-5A ; Y - Z - * / %x61-77 ; a - w - * / %x79-7A ; y - z - * - * privateuse = "x" 1*("-" (1*8alphanum)) - * + /** + * See BCP 47 “Tags for Identifying Languages”: + * https://www.rfc-editor.org/info/bcp47 --> + * https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1 */ public static LanguageTag parse(String languageTag, ParseStatus sts) { if (sts == null) { @@ -166,8 +126,7 @@ public class LanguageTag { StringTokenIterator itr; - // Check if the tag is grandfathered - String[] gfmap = GRANDFATHERED.get(new 
AsciiUtil.CaseInsensitiveKey(languageTag)); + String[] gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag)); if (gfmap != null) { // use preferred mapping itr = new StringTokenIterator(gfmap[1], SEP); diff --git a/icu4j/eclipse-build/plugins.template/com.ibm.icu.base/src/com/ibm/icu/util/ULocale.java b/icu4j/eclipse-build/plugins.template/com.ibm.icu.base/src/com/ibm/icu/util/ULocale.java index 5081767b388..b811d50b659 100644 --- a/icu4j/eclipse-build/plugins.template/com.ibm.icu.base/src/com/ibm/icu/util/ULocale.java +++ b/icu4j/eclipse-build/plugins.template/com.ibm.icu.base/src/com/ibm/icu/util/ULocale.java @@ -70,7 +70,8 @@ import com.ibm.icu.impl.locale.UnicodeLocaleExtension; * Canonicalization additionally performs the following: *

    *
  • POSIX ids are converted to ICU format IDs
  • - *
  • 'grandfathered' 3066 ids are converted to ICU standard form
  • + *
  • Legacy language tags (marked as “Type: grandfathered” in BCP 47) + * are converted to ICU standard form
  • *
  • 'PREEURO' and 'EURO' variants are converted to currency keyword form, * with the currency * id appropriate to the country of the locale (for PREEURO) or EUR (for EURO). @@ -1033,7 +1034,7 @@ public final class ULocale implements Serializable { /** * {@icu} Returns the canonical name for the specified locale ID. This is used to - * convert POSIX and other grandfathered IDs to standard ICU form. + * convert POSIX and other legacy IDs to standard ICU form. * @param localeID the locale id * @return the canonicalized id * @stable ICU 3.0 @@ -2666,60 +2667,18 @@ public final class ULocale implements Serializable { * script to title case, country to upper case, variant to upper case, * and extensions to lower case. * - *

    This implements the 'Language-Tag' production of BCP47, and - * so supports grandfathered (regular and irregular) as well as - * private use language tags. Stand alone private use tags are - * represented as empty language and extension 'x-whatever', - * and grandfathered tags are converted to their canonical replacements - * where they exist. + *

    This implements the 'Language-Tag' production of BCP 47, and so + * supports legacy language tags (marked as “Type: grandfathered” in BCP 47) + * (regular and irregular) as well as private use language tags. * - *

    Grandfathered tags with canonical replacements are as follows: + *

    Stand-alone private use tags are represented as empty language and extension 'x-whatever', + * and legacy tags are converted to their canonical replacements where they exist. * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
    grandfathered tag modern replacement
    art-lojban jbo
    i-ami ami
    i-bnn bnn
    i-hak hak
    i-klingon tlh
    i-lux lb
    i-navajo nv
    i-pwn pwn
    i-tao tao
    i-tay tay
    i-tsu tsu
    no-bok nb
    no-nyn nn
    sgn-BE-FR sfb
    sgn-BE-NL vgt
    sgn-CH-DE sgg
    zh-guoyu cmn
    zh-hakka hak
    zh-min-nan nan
    zh-xiang hsn
    + *

    Note that a few legacy tags have no modern replacement; + * these will be converted using the fallback described in + * the first paragraph, so some information might be lost. * - *

    Grandfathered tags with no modern replacement will be - * converted as follows: - * - * - * - * - * - * - * - * - * - * - * - *
    grandfathered tag converts to
    cel-gaulish xtg-x-cel-gaulish
    en-GB-oed en-GB-x-oed
    i-default en-x-i-default
    i-enochian und-x-i-enochian
    i-mingo see-x-i-mingo
    zh-min nan-x-zh-min
    - * - *

    For a list of all grandfathered tags, see the - * IANA Language Subtag Registry (search for "Type: grandfathered"). - * - *

    Note: there is no guarantee that toLanguageTag + *

    Note: There is no guarantee that toLanguageTag * and forLanguageTag will round-trip. * * @param languageTag the language tag @@ -2821,7 +2780,7 @@ public final class ULocale implements Serializable { * Resets the Builder to match the provided IETF BCP 47 * language tag. Discards the existing state. Null and the * empty string cause the builder to be reset, like {@link - * #clear}. Grandfathered tags (see {@link + * #clear}. Legacy tags (see {@link * ULocale#forLanguageTag}) are converted to their canonical * form before being processed. Otherwise, the language tag * must be well-formed (see {@link ULocale}) or an exception is diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CalendarUtil.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CalendarUtil.java index 83da8ce778f..8ceb8d1f8c6 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CalendarUtil.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CalendarUtil.java @@ -46,7 +46,7 @@ public final class CalendarUtil { return calType.toLowerCase(Locale.ROOT); } - // Canonicalize, so grandfathered variant will be transformed to keywords + // Canonicalize, so that an old-style variant will be transformed to keywords. 
ULocale canonical = ULocale.createCanonical(loc.toString()); calType = canonical.getKeywordValue(CALKEY); if (calType != null) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java index 14d1a943b91..53f9879a8dd 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java @@ -38,13 +38,13 @@ public class LanguageTag { private List _variants = Collections.emptyList(); // variant subtags private List _extensions = Collections.emptyList(); // extensions - // Map contains grandfathered tags and its preferred mappings from - // http://www.ietf.org/rfc/rfc5646.txt - private static final Map GRANDFATHERED = + // The Map contains legacy language tags (marked as “Type: grandfathered” in BCP 47) + // and their preferred mappings from BCP 47. + private static final Map LEGACY = new HashMap(); static { - // grandfathered = irregular ; non-redundant tags registered + // legacy = irregular ; non-redundant tags registered // / regular ; during the RFC 3066 era // // irregular = "en-GB-oed" ; irregular tags do not match @@ -105,57 +105,17 @@ public class LanguageTag { {"zh-xiang", "hsn"}, }; for (String[] e : entries) { - GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e); + LEGACY.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e); } } private LanguageTag() { } - /* - * BNF in RFC5464 - * - * Language-Tag = langtag ; normal language tags - * / privateuse ; private use tag - * / grandfathered ; grandfathered tags - * - * - * langtag = language - * ["-" script] - * ["-" region] - * *("-" variant) - * *("-" extension) - * ["-" privateuse] - * - * language = 2*3ALPHA ; shortest ISO 639 code - * ["-" extlang] ; sometimes followed by - * ; extended language subtags - * / 4ALPHA ; or reserved for future use - * / 5*8ALPHA ; or registered language subtag - * - * extlang = 3ALPHA ; 
selected ISO 639 codes - * *2("-" 3ALPHA) ; permanently reserved - * - * script = 4ALPHA ; ISO 15924 code - * - * region = 2ALPHA ; ISO 3166-1 code - * / 3DIGIT ; UN M.49 code - * - * variant = 5*8alphanum ; registered variants - * / (DIGIT 3alphanum) - * - * extension = singleton 1*("-" (2*8alphanum)) - * - * ; Single alphanumerics - * ; "x" reserved for private use - * singleton = DIGIT ; 0 - 9 - * / %x41-57 ; A - W - * / %x59-5A ; Y - Z - * / %x61-77 ; a - w - * / %x79-7A ; y - z - * - * privateuse = "x" 1*("-" (1*8alphanum)) - * + /** + * See BCP 47 “Tags for Identifying Languages”: + * https://www.rfc-editor.org/info/bcp47 --> + * https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1 */ public static LanguageTag parse(String languageTag, ParseStatus sts) { if (sts == null) { @@ -165,14 +125,13 @@ public class LanguageTag { } StringTokenIterator itr; - boolean isGrandfathered = false; + boolean isLegacy = false; - // Check if the tag is grandfathered - String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag)); + String[] gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag)); // Language tag is at least 2 alpha so we can skip searching the first 2 chars. int dash = 2; while (gfmap == null && (dash = languageTag.indexOf('-', dash + 1)) != -1) { - gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag.substring(0, dash))); + gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag.substring(0, dash))); } if (gfmap != null) { @@ -183,7 +142,7 @@ public class LanguageTag { // append the rest of the tag. itr = new StringTokenIterator(gfmap[1] + languageTag.substring(dash), SEP); } - isGrandfathered = true; + isLegacy = true; } else { itr = new StringTokenIterator(languageTag, SEP); } @@ -202,8 +161,8 @@ public class LanguageTag { } tag.parsePrivateuse(itr, sts); - if (isGrandfathered) { - // Grandfathered tag is replaced with a well-formed tag above. 
+ if (isLegacy) { + // A legacy tag is replaced with a well-formed tag above. // However, the parsed length must be the original tag length. assert (itr.isDone()); assert (!sts.isError()); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java index ac8746f7061..8c3a4ebfa28 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java @@ -80,7 +80,8 @@ import com.ibm.icu.text.LocaleDisplayNames.DialectHandling; * Canonicalization additionally performs the following: *

      *
    • POSIX ids are converted to ICU format IDs
    • - *
    • 'grandfathered' 3066 ids are converted to ICU standard form
    • + *
    • Legacy language tags (marked as “Type: grandfathered” in BCP 47) + * are converted to ICU standard form
    • *
    * All ULocale constructors automatically normalize the locale id. To handle * POSIX ids, canonicalize can be called to convert the id @@ -1204,7 +1205,7 @@ public final class ULocale implements Serializable, Comparable { /** * {@icu} Returns the canonical name according to CLDR for the specified locale ID. - * This is used to convert POSIX and other grandfathered IDs to standard ICU form. + * This is used to convert POSIX and other legacy IDs to standard ICU form. * @param localeID the locale id * @return the canonicalized id * @stable ICU 3.0 @@ -1242,7 +1243,7 @@ public final class ULocale implements Serializable, Comparable { // element in Supplemental Data, replace the language subtag with the replacement value. // If there are additional subtags in the replacement value, add them to the result, but // only if there is no corresponding subtag already in the tag. - // Five special deprecated grandfathered codes (such as i-default) are in type attributes, and are also replaced. + // Five special deprecated codes (such as i-default) are in type attributes, and are also replaced. try { UResourceBundle languageAlias = UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME, "metadata", ICUResourceBundle.ICU_DATA_CLASS_LOADER) @@ -3201,58 +3202,16 @@ public final class ULocale implements Serializable, Comparable { * *
* - *

This implements the 'Language-Tag' production of BCP47, and - * so supports grandfathered (regular and irregular) as well as - * private use language tags. Stand alone private use tags are - * represented as empty language and extension 'x-whatever', - * and grandfathered tags are converted to their canonical replacements - * where they exist. + *

This implements the 'Language-Tag' production of BCP 47, and so + * supports legacy language tags (marked as “Type: grandfathered” in BCP 47) + * (regular and irregular) as well as private use language tags. * - *

Grandfathered tags with canonical replacements are as follows: + *

Stand-alone private use tags are represented as empty language and extension 'x-whatever', + * and legacy tags are converted to their canonical replacements where they exist. * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
grandfathered tag modern replacement
art-lojban jbo
i-ami ami
i-bnn bnn
i-hak hak
i-klingon tlh
i-lux lb
i-navajo nv
i-pwn pwn
i-tao tao
i-tay tay
i-tsu tsu
no-bok nb
no-nyn nn
sgn-BE-FR sfb
sgn-BE-NL vgt
sgn-CH-DE sgg
zh-guoyu cmn
zh-hakka hak
zh-min-nan nan
zh-xiang hsn
- * - *

Grandfathered tags with no modern replacement will be - * converted as follows: - * - * - * - * - * - * - * - * - * - * - * - *
grandfathered tag converts to
cel-gaulish xtg-x-cel-gaulish
en-GB-oed en-GB-x-oed
i-default en-x-i-default
i-enochian und-x-i-enochian
i-mingo see-x-i-mingo
zh-min nan-x-zh-min
- * - *

For a list of all grandfathered tags, see the - * IANA Language Subtag Registry (search for "Type: grandfathered"). + *

Note that a few legacy tags have no modern replacement; + * these will be converted using the fallback described in + * the first paragraph, so some information might be lost. * *

Note: there is no guarantee that toLanguageTag * and forLanguageTag will round-trip. @@ -3491,7 +3450,7 @@ public final class ULocale implements Serializable, Comparable { * Resets the Builder to match the provided IETF BCP 47 * language tag. Discards the existing state. Null and the * empty string cause the builder to be reset, like {@link - * #clear}. Grandfathered tags (see {@link + * #clear}. Legacy tags (see {@link * ULocale#forLanguageTag}) are converted to their canonical * form before being processed. Otherwise, the language tag * must be well-formed (see {@link ULocale}) or an exception is diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java index 97e2dab7d9b..deaf4863eaf 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java @@ -365,7 +365,7 @@ public class LocaleMatcherTest extends TestFmwk { } @Test - public void testMatchGrandfatheredCode() { + public void testMatchLegacyCode() { final LocaleMatcher matcher = newLocaleMatcher("fr, i_klingon, en_Latn_US"); assertEquals("en_Latn_US", matcher.getBestMatch("en_GB_oed").toString()); // assertEquals("tlh", matcher.getBestMatch("i_klingon").toString()); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt index 7a10986737f..70afcd02344 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt @@ -279,7 +279,7 @@ und-TW >> zh-Hant zh-Hant >> und-TW zh >> und-TW -** test: testMatchGrandfatheredCode +** test: testMatchLegacyCode @supported=fr, i-klingon, en-Latn-US en-GB-oed >> en-Latn-US @@ -984,7 +984,7 @@ x-bork >> x-bork x-piglatin >> x-bork x-bork >> 
x-bork -** test: MatchGrandfatheredCode +** test: MatchLegacyCode @supported=fr, i-klingon, en-Latn-US en-GB-oed >> en-Latn-US i-klingon >> tlh @@ -1525,7 +1525,7 @@ en >> null x-piglatin >> fr x-bork >> x-bork -** test: grandfathered codes +** test: legacy codes @supported=fr, i-klingon, en-Latn-US en-GB-oed >> en-Latn-US i-klingon >> tlh diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt index 696750a4e2f..26ba85bb0d6 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt @@ -38,14 +38,15 @@ $extension = $singleton (?: $s $alphanum{2,8} )+ ; # singleton 1*("-" (2*8alpha $privateUse = $x (?: $s $alphanum{1,8} )+ ; # "x" 1*("-" (1*8alphanum)) -# Define certain grandfathered codes, since otherwise the regex is pretty useless. +# Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47), +# since otherwise the regex is pretty useless. # Since these are limited, this is safe even later changes to the registry -- # the only oddity is that it might change the type of the tag, and thus # the results from the capturing groups. # http://www.iana.org/assignments/language-subtag-registry # Note that these have to be compared case insensitively, requiring (?i) below. -$grandfathered = en $s GB $s oed +$legacy = en $s GB $s oed | i $s (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu ) | no $s (?: bok | nyn ) | sgn $s (?: BE $s (?: fr | nl) | CH $s de ) @@ -55,7 +56,7 @@ $grandfathered = en $s GB $s oed # For well-formedness, we don't need the ones that would otherwise pass. # For validity, they need to be checked. 
-# $grandfatheredWellFormed = (?: +# $legacyWellFormed = (?: # art $s lojban # | cel $s gaulish # | zh $s (?: guoyu | hakka | xiang ) @@ -78,12 +79,12 @@ $langtag = (?: ( $language ) (?: $s ( $privateUse ) )? 5%); # Here is the final breakdown, with capturing groups for each of these components -# The variants, extensions, grandfathered, and private-use may have interior '-' +# The variants, extensions, legacy, and private-use may have interior '-' $root = (?i) # case-insensitive (?: $langtag 90% | ( $privateUse ) 5% - | ( $grandfathered ) 5%) + | ( $legacy ) 5%) # (?: \@ $keywords )? 5% ; diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java index ddc64786ecf..57c6df8cf6b 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java @@ -527,7 +527,8 @@ public final class SupplementalData { // ... // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur. // - // Note that this implementation does not need to handle "grandfathered" tags. + // Note that this implementation does not need to handle + // legacy language tags (marked as “Type: grandfathered” in BCP 47). private Optional addLikelySubtags(String localeId) { if (localeId.equals("root")) { return Optional.empty();