ICU-21184 rephrase docs/comments using the term grandfathered

This commit is contained in:
Markus Scherer 2020-08-18 15:05:22 -07:00
parent cde54fc5ba
commit 39da689d30
18 changed files with 138 additions and 287 deletions

View file

@ -1025,13 +1025,14 @@ Locale::forLanguageTag(StringPiece tag, UErrorCode& status)
return result;
}
// If a BCP-47 language tag is passed as the language parameter to the
// If a BCP 47 language tag is passed as the language parameter to the
// normal Locale constructor, it will actually fall back to invoking
// uloc_forLanguageTag() to parse it if it somehow is able to detect that
// the string actually is BCP-47. This works well for things like strings
// using BCP-47 extensions, but it does not at all work for things like
// BCP-47 grandfathered tags (eg. "en-GB-oed") which are possible to also
// interpret as ICU locale IDs and because of that won't trigger the BCP-47
// the string actually is BCP 47. This works well for things like strings
// using BCP 47 extensions, but it does not at all work for things like
// legacy language tags (marked as “Type: grandfathered” in BCP 47,
// e.g., "en-GB-oed") which are possible to also
// interpret as ICU locale IDs and because of that won't trigger the BCP 47
// parsing. Therefore the code here explicitly calls uloc_forLanguageTag()
// and then Locale::init(), instead of just calling the normal constructor.

View file

@ -53,7 +53,7 @@ typedef struct ULanguageTag {
VariantListEntry *variants;
ExtensionListEntry *extensions;
const char *privateuse;
const char *grandfathered;
const char *legacy;
} ULanguageTag;
#define MINLEN 2
@ -85,8 +85,9 @@ static const char LOCALE_TYPE_YES[] = "yes";
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
This table has 2 parts. The parts for Grandfathered tags is generated by the
following scripts from the IANA language tag registry.
This table has 2 parts. The part for
legacy language tags (marked as Type: grandfathered in BCP 47)
is generated by the following scripts from the IANA language tag registry.
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
egrep -A 7 'Type: grandfathered' | \
@ -100,8 +101,8 @@ static const char LOCALE_TYPE_YES[] = "yes";
values. They may have to be removed for the strict BCP 47 compliance.
*/
static const char* const GRANDFATHERED[] = {
/* grandfathered preferred */
static const char* const LEGACY[] = {
/* legacy preferred */
"art-lojban", "jbo",
"en-gb-oed", "en-gb-oxendict",
"i-ami", "ami",
@ -124,7 +125,7 @@ static const char* const GRANDFATHERED[] = {
"zh-min-nan", "nan",
"zh-xiang", "hsn",
// Grandfathered tags with no preferred value in the IANA
// Legacy tags with no preferred value in the IANA
// registry. Kept for now for the backward compatibility
// because ICU has mapped them this way.
"cel-gaulish", "xtg-x-cel-gaulish",
@ -346,7 +347,7 @@ ultag_getPrivateUse(const ULanguageTag* langtag);
#if 0
static const char*
ultag_getGrandfathered(const ULanguageTag* langtag);
ultag_getLegacy(const ULanguageTag* langtag);
#endif
U_NAMESPACE_BEGIN
@ -986,7 +987,7 @@ _initializeULanguageTag(ULanguageTag* langtag) {
langtag->variants = NULL;
langtag->extensions = NULL;
langtag->grandfathered = EMPTY;
langtag->legacy = EMPTY;
langtag->privateuse = EMPTY;
}
@ -2042,7 +2043,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
char *pExtValueSubtag, *pExtValueSubtagEnd;
int32_t i;
UBool privateuseVar = FALSE;
int32_t grandfatheredLen = 0;
int32_t legacyLen = 0;
if (parsedLen != NULL) {
*parsedLen = 0;
@ -2082,25 +2083,25 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
}
size_t parsedLenDelta = 0;
// Grandfathered tag will be consider together. Grandfathered tag with intervening
// A legacy tag will be considered as a whole. A legacy tag with intervening
// script and region such as art-DE-lojban or art-Latn-lojban won't be
// matched.
/* check if the tag is grandfathered */
for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) {
int32_t checkGrandfatheredLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i]));
if (tagLen < checkGrandfatheredLen) {
/* check if the tag is legacy */
for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) {
int32_t checkLegacyLen = static_cast<int32_t>(uprv_strlen(LEGACY[i]));
if (tagLen < checkLegacyLen) {
continue;
}
if (tagLen > checkGrandfatheredLen && tagBuf[checkGrandfatheredLen] != '-') {
if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') {
// make sure next char is '-'.
continue;
}
if (uprv_strnicmp(GRANDFATHERED[i], tagBuf, checkGrandfatheredLen) == 0) {
if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) {
int32_t newTagLength;
grandfatheredLen = checkGrandfatheredLen; /* back up for output parsedLen */
int32_t replacementLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i+1]));
newTagLength = replacementLen + tagLen - checkGrandfatheredLen;
legacyLen = checkLegacyLen; /* back up for output parsedLen */
int32_t replacementLen = static_cast<int32_t>(uprv_strlen(LEGACY[i+1]));
newTagLength = replacementLen + tagLen - checkLegacyLen;
if (tagLen < newTagLength) {
uprv_free(tagBuf);
tagBuf = (char*)uprv_malloc(newTagLength + 1);
@ -2111,16 +2112,16 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
t->buf = tagBuf;
tagLen = newTagLength;
}
parsedLenDelta = checkGrandfatheredLen - replacementLen;
uprv_strcpy(t->buf, GRANDFATHERED[i + 1]);
if (checkGrandfatheredLen != tagLen) {
uprv_strcpy(t->buf + replacementLen, tag + checkGrandfatheredLen);
parsedLenDelta = checkLegacyLen - replacementLen;
uprv_strcpy(t->buf, LEGACY[i + 1]);
if (checkLegacyLen != tagLen) {
uprv_strcpy(t->buf + replacementLen, tag + checkLegacyLen);
}
break;
}
}
if (grandfatheredLen == 0) {
if (legacyLen == 0) {
for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
const char* redundantTag = REDUNDANT[i];
size_t redundantTagLen = uprv_strlen(redundantTag);
@ -2608,8 +2609,8 @@ ultag_getPrivateUse(const ULanguageTag* langtag) {
#if 0
static const char*
ultag_getGrandfathered(const ULanguageTag* langtag) {
return langtag->grandfathered;
ultag_getLegacy(const ULanguageTag* langtag) {
return langtag->legacy;
}
#endif

View file

@ -109,13 +109,17 @@ ulocimp_toLanguageTag(const char* localeID,
* If the specified language tag contains any ill-formed subtags,
* the first such subtag and all following subtags are ignored.
* <p>
* This implements the 'Language-Tag' production of BCP47, and so
* supports grandfathered (regular and irregular) as well as private
* use language tags. Private use tags are represented as 'x-whatever',
* and grandfathered tags are converted to their canonical replacements
* where they exist. Note that a few grandfathered tags have no modern
* replacement, these will be converted using the fallback described in
* This implements the 'Language-Tag' production of BCP 47, and so
* supports legacy language tags (marked as Type: grandfathered in BCP 47)
* (regular and irregular) as well as private use language tags.
*
* Private use tags are represented as 'x-whatever',
* and legacy tags are converted to their canonical replacements where they exist.
*
* Note that a few legacy tags have no modern replacement;
* these will be converted using the fallback described in
* the first paragraph, so some information might be lost.
*
* @param langtag the input BCP47 language tag.
* @param tagLen the length of langtag, or -1 to call uprv_strlen().
* @param sink the output sink receiving a locale ID for the

View file

@ -92,11 +92,12 @@ public:
/**
* Resets the LocaleBuilder to match the provided
* [Unicode Locale Identifier](http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_id) .
* Discards the existing state. the empty string cause the builder to be
* reset, like {@link #clear}. Grandfathered tags are converted to their
* canonical form before being processed. Otherwise, the <code>language
* tag</code> must be well-formed, or else the build() method will later
* report an U_ILLEGAL_ARGUMENT_ERROR.
* Discards the existing state.
* The empty string causes the builder to be reset, like {@link #clear}.
* Legacy language tags (marked as Type: grandfathered in BCP 47)
* are converted to their canonical form before being processed.
* Otherwise, the <code>language tag</code> must be well-formed,
* or else the build() method will later report a U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>This method clears the internal UErrorCode.
*

View file

@ -393,13 +393,17 @@ public:
* If the specified language tag contains any ill-formed subtags,
* the first such subtag and all following subtags are ignored.
* <p>
* This implements the 'Language-Tag' production of BCP47, and so
* supports grandfathered (regular and irregular) as well as private
* use language tags. Private use tags are represented as 'x-whatever',
* and grandfathered tags are converted to their canonical replacements
* where they exist. Note that a few grandfathered tags have no modern
* replacement, these will be converted using the fallback described in
* This implements the 'Language-Tag' production of BCP 47, and so
* supports legacy language tags (marked as Type: grandfathered in BCP 47)
* (regular and irregular) as well as private use language tags.
*
* Private use tags are represented as 'x-whatever',
* and legacy tags are converted to their canonical replacements where they exist.
*
* Note that a few legacy tags have no modern replacement;
* these will be converted using the fallback described in
* the first paragraph, so some information might be lost.
*
* @param tag the input BCP47 language tag.
* @param status error information if creating the Locale failed.
* @return the Locale for the specified BCP47 language tag.

View file

@ -1237,14 +1237,18 @@ uloc_minimizeSubtags(const char* localeID,
* Returns a locale ID for the specified BCP47 language tag string.
* If the specified language tag contains any ill-formed subtags,
* the first such subtag and all following subtags are ignored.
* <p>
* This implements the 'Language-Tag' production of BCP47, and so
* supports grandfathered (regular and irregular) as well as private
* use language tags. Private use tags are represented as 'x-whatever',
* and grandfathered tags are converted to their canonical replacements
* where they exist. Note that a few grandfathered tags have no modern
* replacement, these will be converted using the fallback described in
* <p>
* This implements the 'Language-Tag' production of BCP 47, and so
* supports legacy language tags (marked as Type: grandfathered in BCP 47)
* (regular and irregular) as well as private use language tags.
*
* Private use tags are represented as 'x-whatever',
* and legacy tags are converted to their canonical replacements where they exist.
*
* Note that a few legacy tags have no modern replacement;
* these will be converted using the fallback described in
* the first paragraph, so some information might be lost.
*
* @param langtag the input BCP47 language tag.
* @param localeID the output buffer receiving a locale ID for the
* specified BCP47 language tag.

View file

@ -266,7 +266,7 @@ static ECalType getCalendarTypeForLocale(const char *locid) {
// TODO: ULOC_FULL_NAME is out of date and too small.
char canonicalName[256];
// canonicalize, so grandfathered variant will be transformed to keywords
// Canonicalize, so that an old-style variant will be transformed to keywords.
// e.g., ja_JP_TRADITIONAL -> ja_JP@calendar=japanese
// NOTE: Since ICU-20187, ja_JP_TRADITIONAL no longer canonicalizes, and
// the Gregorian calendar is returned instead.

View file

@ -30,8 +30,6 @@ U_NAMESPACE_BEGIN
// -------------------------------------
// Creates a formattable object with a char* string.
// This API is useless. The API that takes a UnicodeString is actually just as good.
// This is just a grandfathered API.
Formattable::Formattable(const char* stringToCopy)
{
init();

View file

@ -279,7 +279,7 @@ und-TW >> zh-Hant
zh-Hant >> und-TW
zh >> und-TW
** test: testMatchGrandfatheredCode
** test: testMatchLegacyCode
@supported=fr, i-klingon, en-Latn-US
en-GB-oed >> en-Latn-US
@ -984,7 +984,7 @@ x-bork >> x-bork
x-piglatin >> x-bork
x-bork >> x-bork
** test: MatchGrandfatheredCode
** test: MatchLegacyCode
@supported=fr, i-klingon, en-Latn-US
en-GB-oed >> en-Latn-US
i-klingon >> tlh
@ -1525,7 +1525,7 @@ en >> null
x-piglatin >> fr
x-bork >> x-bork
** test: grandfathered codes
** test: legacy codes
@supported=fr, i-klingon, en-Latn-US
en-GB-oed >> en-Latn-US
i-klingon >> tlh

View file

@ -38,13 +38,13 @@ public class LanguageTag {
private List<String> _variants = Collections.emptyList(); // variant subtags
private List<String> _extensions = Collections.emptyList(); // extensions
// Map contains grandfathered tags and its preferred mappings from
// http://www.ietf.org/rfc/rfc5646.txt
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED =
// The Map contains legacy language tags (marked as Type: grandfathered in BCP 47)
// and their preferred mappings from BCP 47.
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> LEGACY =
new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>();
static {
// grandfathered = irregular ; non-redundant tags registered
// legacy = irregular ; non-redundant tags registered
// / regular ; during the RFC 3066 era
//
// irregular = "en-GB-oed" ; irregular tags do not match
@ -105,57 +105,17 @@ public class LanguageTag {
{"zh-xiang", "hsn"},
};
for (String[] e : entries) {
GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
LEGACY.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
}
}
private LanguageTag() {
}
/*
* BNF in RFC 5646
*
* Language-Tag = langtag ; normal language tags
* / privateuse ; private use tag
* / grandfathered ; grandfathered tags
*
*
* langtag = language
* ["-" script]
* ["-" region]
* *("-" variant)
* *("-" extension)
* ["-" privateuse]
*
* language = 2*3ALPHA ; shortest ISO 639 code
* ["-" extlang] ; sometimes followed by
* ; extended language subtags
* / 4ALPHA ; or reserved for future use
* / 5*8ALPHA ; or registered language subtag
*
* extlang = 3ALPHA ; selected ISO 639 codes
* *2("-" 3ALPHA) ; permanently reserved
*
* script = 4ALPHA ; ISO 15924 code
*
* region = 2ALPHA ; ISO 3166-1 code
* / 3DIGIT ; UN M.49 code
*
* variant = 5*8alphanum ; registered variants
* / (DIGIT 3alphanum)
*
* extension = singleton 1*("-" (2*8alphanum))
*
* ; Single alphanumerics
* ; "x" reserved for private use
* singleton = DIGIT ; 0 - 9
* / %x41-57 ; A - W
* / %x59-5A ; Y - Z
* / %x61-77 ; a - w
* / %x79-7A ; y - z
*
* privateuse = "x" 1*("-" (1*8alphanum))
*
/**
* See BCP 47 Tags for Identifying Languages:
* https://www.rfc-editor.org/info/bcp47 -->
* https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1
*/
public static LanguageTag parse(String languageTag, ParseStatus sts) {
if (sts == null) {
@ -166,8 +126,7 @@ public class LanguageTag {
StringTokenIterator itr;
// Check if the tag is grandfathered
String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
String[] gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
if (gfmap != null) {
// use preferred mapping
itr = new StringTokenIterator(gfmap[1], SEP);

View file

@ -70,7 +70,8 @@ import com.ibm.icu.impl.locale.UnicodeLocaleExtension;
* Canonicalization additionally performs the following:
* <ul>
* <li>POSIX ids are converted to ICU format IDs</li>
* <li>'grandfathered' 3066 ids are converted to ICU standard form</li>
* <li>Legacy language tags (marked as Type: grandfathered in BCP 47)
* are converted to ICU standard form</li>
* <li>'PREEURO' and 'EURO' variants are converted to currency keyword form,
* with the currency
* id appropriate to the country of the locale (for PREEURO) or EUR (for EURO).
@ -1033,7 +1034,7 @@ public final class ULocale implements Serializable {
/**
* {@icu} Returns the canonical name for the specified locale ID. This is used to
* convert POSIX and other grandfathered IDs to standard ICU form.
* convert POSIX and other legacy IDs to standard ICU form.
* @param localeID the locale id
* @return the canonicalized id
* @stable ICU 3.0
@ -2666,60 +2667,18 @@ public final class ULocale implements Serializable {
* script to title case, country to upper case, variant to upper case,
* and extensions to lower case.
*
* <p>This implements the 'Language-Tag' production of BCP47, and
* so supports grandfathered (regular and irregular) as well as
* private use language tags. Stand alone private use tags are
* represented as empty language and extension 'x-whatever',
* and grandfathered tags are converted to their canonical replacements
* where they exist.
* <p>This implements the 'Language-Tag' production of BCP 47, and so
* supports legacy language tags (marked as Type: grandfathered in BCP 47)
* (regular and irregular) as well as private use language tags.
*
* <p>Grandfathered tags with canonical replacements are as follows:
* <p>Stand-alone private use tags are represented as empty language and extension 'x-whatever',
* and legacy tags are converted to their canonical replacements where they exist.
*
* <table>
* <tbody align="center">
* <tr><th>grandfathered tag</th><th>&nbsp;</th><th>modern replacement</th></tr>
* <tr><td>art-lojban</td><td>&nbsp;</td><td>jbo</td></tr>
* <tr><td>i-ami</td><td>&nbsp;</td><td>ami</td></tr>
* <tr><td>i-bnn</td><td>&nbsp;</td><td>bnn</td></tr>
* <tr><td>i-hak</td><td>&nbsp;</td><td>hak</td></tr>
* <tr><td>i-klingon</td><td>&nbsp;</td><td>tlh</td></tr>
* <tr><td>i-lux</td><td>&nbsp;</td><td>lb</td></tr>
* <tr><td>i-navajo</td><td>&nbsp;</td><td>nv</td></tr>
* <tr><td>i-pwn</td><td>&nbsp;</td><td>pwn</td></tr>
* <tr><td>i-tao</td><td>&nbsp;</td><td>tao</td></tr>
* <tr><td>i-tay</td><td>&nbsp;</td><td>tay</td></tr>
* <tr><td>i-tsu</td><td>&nbsp;</td><td>tsu</td></tr>
* <tr><td>no-bok</td><td>&nbsp;</td><td>nb</td></tr>
* <tr><td>no-nyn</td><td>&nbsp;</td><td>nn</td></tr>
* <tr><td>sgn-BE-FR</td><td>&nbsp;</td><td>sfb</td></tr>
* <tr><td>sgn-BE-NL</td><td>&nbsp;</td><td>vgt</td></tr>
* <tr><td>sgn-CH-DE</td><td>&nbsp;</td><td>sgg</td></tr>
* <tr><td>zh-guoyu</td><td>&nbsp;</td><td>cmn</td></tr>
* <tr><td>zh-hakka</td><td>&nbsp;</td><td>hak</td></tr>
* <tr><td>zh-min-nan</td><td>&nbsp;</td><td>nan</td></tr>
* <tr><td>zh-xiang</td><td>&nbsp;</td><td>hsn</td></tr>
* </tbody>
* </table>
* <p>Note that a few legacy tags have no modern replacement;
* these will be converted using the fallback described in
* the first paragraph, so some information might be lost.
*
* <p>Grandfathered tags with no modern replacement will be
* converted as follows:
*
* <table>
* <tbody align="center">
* <tr><th>grandfathered tag</th><th>&nbsp;</th><th>converts to</th></tr>
* <tr><td>cel-gaulish</td><td>&nbsp;</td><td>xtg-x-cel-gaulish</td></tr>
* <tr><td>en-GB-oed</td><td>&nbsp;</td><td>en-GB-x-oed</td></tr>
* <tr><td>i-default</td><td>&nbsp;</td><td>en-x-i-default</td></tr>
* <tr><td>i-enochian</td><td>&nbsp;</td><td>und-x-i-enochian</td></tr>
* <tr><td>i-mingo</td><td>&nbsp;</td><td>see-x-i-mingo</td></tr>
* <tr><td>zh-min</td><td>&nbsp;</td><td>nan-x-zh-min</td></tr>
* </tbody>
* </table>
*
* <p>For a list of all grandfathered tags, see the
* IANA Language Subtag Registry (search for "Type: grandfathered").
*
* <p><b>Note</b>: there is no guarantee that <code>toLanguageTag</code>
* <p><b>Note</b>: There is no guarantee that <code>toLanguageTag</code>
* and <code>forLanguageTag</code> will round-trip.
*
* @param languageTag the language tag
@ -2821,7 +2780,7 @@ public final class ULocale implements Serializable {
* Resets the Builder to match the provided IETF BCP 47
* language tag. Discards the existing state. Null and the
* empty string cause the builder to be reset, like {@link
* #clear}. Grandfathered tags (see {@link
* #clear}. Legacy tags (see {@link
* ULocale#forLanguageTag}) are converted to their canonical
* form before being processed. Otherwise, the language tag
* must be well-formed (see {@link ULocale}) or an exception is

View file

@ -46,7 +46,7 @@ public final class CalendarUtil {
return calType.toLowerCase(Locale.ROOT);
}
// Canonicalize, so grandfathered variant will be transformed to keywords
// Canonicalize, so that an old-style variant will be transformed to keywords.
ULocale canonical = ULocale.createCanonical(loc.toString());
calType = canonical.getKeywordValue(CALKEY);
if (calType != null) {

View file

@ -38,13 +38,13 @@ public class LanguageTag {
private List<String> _variants = Collections.emptyList(); // variant subtags
private List<String> _extensions = Collections.emptyList(); // extensions
// Map contains grandfathered tags and its preferred mappings from
// http://www.ietf.org/rfc/rfc5646.txt
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED =
// The Map contains legacy language tags (marked as Type: grandfathered in BCP 47)
// and their preferred mappings from BCP 47.
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> LEGACY =
new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>();
static {
// grandfathered = irregular ; non-redundant tags registered
// legacy = irregular ; non-redundant tags registered
// / regular ; during the RFC 3066 era
//
// irregular = "en-GB-oed" ; irregular tags do not match
@ -105,57 +105,17 @@ public class LanguageTag {
{"zh-xiang", "hsn"},
};
for (String[] e : entries) {
GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
LEGACY.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
}
}
private LanguageTag() {
}
/*
* BNF in RFC 5646
*
* Language-Tag = langtag ; normal language tags
* / privateuse ; private use tag
* / grandfathered ; grandfathered tags
*
*
* langtag = language
* ["-" script]
* ["-" region]
* *("-" variant)
* *("-" extension)
* ["-" privateuse]
*
* language = 2*3ALPHA ; shortest ISO 639 code
* ["-" extlang] ; sometimes followed by
* ; extended language subtags
* / 4ALPHA ; or reserved for future use
* / 5*8ALPHA ; or registered language subtag
*
* extlang = 3ALPHA ; selected ISO 639 codes
* *2("-" 3ALPHA) ; permanently reserved
*
* script = 4ALPHA ; ISO 15924 code
*
* region = 2ALPHA ; ISO 3166-1 code
* / 3DIGIT ; UN M.49 code
*
* variant = 5*8alphanum ; registered variants
* / (DIGIT 3alphanum)
*
* extension = singleton 1*("-" (2*8alphanum))
*
* ; Single alphanumerics
* ; "x" reserved for private use
* singleton = DIGIT ; 0 - 9
* / %x41-57 ; A - W
* / %x59-5A ; Y - Z
* / %x61-77 ; a - w
* / %x79-7A ; y - z
*
* privateuse = "x" 1*("-" (1*8alphanum))
*
/**
* See BCP 47 Tags for Identifying Languages:
* https://www.rfc-editor.org/info/bcp47 -->
* https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1
*/
public static LanguageTag parse(String languageTag, ParseStatus sts) {
if (sts == null) {
@ -165,14 +125,13 @@ public class LanguageTag {
}
StringTokenIterator itr;
boolean isGrandfathered = false;
boolean isLegacy = false;
// Check if the tag is grandfathered
String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
String[] gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
// Language tag is at least 2 alpha so we can skip searching the first 2 chars.
int dash = 2;
while (gfmap == null && (dash = languageTag.indexOf('-', dash + 1)) != -1) {
gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag.substring(0, dash)));
gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag.substring(0, dash)));
}
if (gfmap != null) {
@ -183,7 +142,7 @@ public class LanguageTag {
// append the rest of the tag.
itr = new StringTokenIterator(gfmap[1] + languageTag.substring(dash), SEP);
}
isGrandfathered = true;
isLegacy = true;
} else {
itr = new StringTokenIterator(languageTag, SEP);
}
@ -202,8 +161,8 @@ public class LanguageTag {
}
tag.parsePrivateuse(itr, sts);
if (isGrandfathered) {
// Grandfathered tag is replaced with a well-formed tag above.
if (isLegacy) {
// A legacy tag is replaced with a well-formed tag above.
// However, the parsed length must be the original tag length.
assert (itr.isDone());
assert (!sts.isError());

View file

@ -80,7 +80,8 @@ import com.ibm.icu.text.LocaleDisplayNames.DialectHandling;
* Canonicalization additionally performs the following:
* <ul>
* <li>POSIX ids are converted to ICU format IDs</li>
* <li>'grandfathered' 3066 ids are converted to ICU standard form</li>
* <li>Legacy language tags (marked as Type: grandfathered in BCP 47)
* are converted to ICU standard form</li>
* </ul>
* All ULocale constructors automatically normalize the locale id. To handle
* POSIX ids, <code>canonicalize</code> can be called to convert the id
@ -1204,7 +1205,7 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
/**
* {@icu} Returns the canonical name according to CLDR for the specified locale ID.
* This is used to convert POSIX and other grandfathered IDs to standard ICU form.
* This is used to convert POSIX and other legacy IDs to standard ICU form.
* @param localeID the locale id
* @return the canonicalized id
* @stable ICU 3.0
@ -1242,7 +1243,7 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
// element in Supplemental Data, replace the language subtag with the replacement value.
// If there are additional subtags in the replacement value, add them to the result, but
// only if there is no corresponding subtag already in the tag.
// Five special deprecated grandfathered codes (such as i-default) are in type attributes, and are also replaced.
// Five special deprecated codes (such as i-default) are in type attributes, and are also replaced.
try {
UResourceBundle languageAlias = UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME,
"metadata", ICUResourceBundle.ICU_DATA_CLASS_LOADER)
@ -3201,58 +3202,16 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
*
* </ul>
*
* <p>This implements the 'Language-Tag' production of BCP47, and
* so supports grandfathered (regular and irregular) as well as
* private use language tags. Stand alone private use tags are
* represented as empty language and extension 'x-whatever',
* and grandfathered tags are converted to their canonical replacements
* where they exist.
* <p>This implements the 'Language-Tag' production of BCP 47, and so
* supports legacy language tags (marked as Type: grandfathered in BCP 47)
* (regular and irregular) as well as private use language tags.
*
* <p>Grandfathered tags with canonical replacements are as follows:
* <p>Stand-alone private use tags are represented as empty language and extension 'x-whatever',
* and legacy tags are converted to their canonical replacements where they exist.
*
* <table>
* <tbody align="center">
* <tr><th>grandfathered tag</th><th>&nbsp;</th><th>modern replacement</th></tr>
* <tr><td>art-lojban</td><td>&nbsp;</td><td>jbo</td></tr>
* <tr><td>i-ami</td><td>&nbsp;</td><td>ami</td></tr>
* <tr><td>i-bnn</td><td>&nbsp;</td><td>bnn</td></tr>
* <tr><td>i-hak</td><td>&nbsp;</td><td>hak</td></tr>
* <tr><td>i-klingon</td><td>&nbsp;</td><td>tlh</td></tr>
* <tr><td>i-lux</td><td>&nbsp;</td><td>lb</td></tr>
* <tr><td>i-navajo</td><td>&nbsp;</td><td>nv</td></tr>
* <tr><td>i-pwn</td><td>&nbsp;</td><td>pwn</td></tr>
* <tr><td>i-tao</td><td>&nbsp;</td><td>tao</td></tr>
* <tr><td>i-tay</td><td>&nbsp;</td><td>tay</td></tr>
* <tr><td>i-tsu</td><td>&nbsp;</td><td>tsu</td></tr>
* <tr><td>no-bok</td><td>&nbsp;</td><td>nb</td></tr>
* <tr><td>no-nyn</td><td>&nbsp;</td><td>nn</td></tr>
* <tr><td>sgn-BE-FR</td><td>&nbsp;</td><td>sfb</td></tr>
* <tr><td>sgn-BE-NL</td><td>&nbsp;</td><td>vgt</td></tr>
* <tr><td>sgn-CH-DE</td><td>&nbsp;</td><td>sgg</td></tr>
* <tr><td>zh-guoyu</td><td>&nbsp;</td><td>cmn</td></tr>
* <tr><td>zh-hakka</td><td>&nbsp;</td><td>hak</td></tr>
* <tr><td>zh-min-nan</td><td>&nbsp;</td><td>nan</td></tr>
* <tr><td>zh-xiang</td><td>&nbsp;</td><td>hsn</td></tr>
* </tbody>
* </table>
*
* <p>Grandfathered tags with no modern replacement will be
* converted as follows:
*
* <table>
* <tbody align="center">
* <tr><th>grandfathered tag</th><th>&nbsp;</th><th>converts to</th></tr>
* <tr><td>cel-gaulish</td><td>&nbsp;</td><td>xtg-x-cel-gaulish</td></tr>
* <tr><td>en-GB-oed</td><td>&nbsp;</td><td>en-GB-x-oed</td></tr>
* <tr><td>i-default</td><td>&nbsp;</td><td>en-x-i-default</td></tr>
* <tr><td>i-enochian</td><td>&nbsp;</td><td>und-x-i-enochian</td></tr>
* <tr><td>i-mingo</td><td>&nbsp;</td><td>see-x-i-mingo</td></tr>
* <tr><td>zh-min</td><td>&nbsp;</td><td>nan-x-zh-min</td></tr>
* </tbody>
* </table>
*
* <p>For a list of all grandfathered tags, see the
* IANA Language Subtag Registry (search for "Type: grandfathered").
* <p>Note that a few legacy tags have no modern replacement;
* these will be converted using the fallback described in
* the first paragraph, so some information might be lost.
*
* <p><b>Note</b>: there is no guarantee that <code>toLanguageTag</code>
* and <code>forLanguageTag</code> will round-trip.
@ -3491,7 +3450,7 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
* Resets the Builder to match the provided IETF BCP 47
* language tag. Discards the existing state. Null and the
* empty string cause the builder to be reset, like {@link
* #clear}. Grandfathered tags (see {@link
* #clear}. Legacy tags (see {@link
* ULocale#forLanguageTag}) are converted to their canonical
* form before being processed. Otherwise, the language tag
* must be well-formed (see {@link ULocale}) or an exception is

View file

@ -365,7 +365,7 @@ public class LocaleMatcherTest extends TestFmwk {
}
@Test
public void testMatchGrandfatheredCode() {
public void testMatchLegacyCode() {
final LocaleMatcher matcher = newLocaleMatcher("fr, i_klingon, en_Latn_US");
assertEquals("en_Latn_US", matcher.getBestMatch("en_GB_oed").toString());
// assertEquals("tlh", matcher.getBestMatch("i_klingon").toString());

View file

@ -279,7 +279,7 @@ und-TW >> zh-Hant
zh-Hant >> und-TW
zh >> und-TW
** test: testMatchGrandfatheredCode
** test: testMatchLegacyCode
@supported=fr, i-klingon, en-Latn-US
en-GB-oed >> en-Latn-US
@ -984,7 +984,7 @@ x-bork >> x-bork
x-piglatin >> x-bork
x-bork >> x-bork
** test: MatchGrandfatheredCode
** test: MatchLegacyCode
@supported=fr, i-klingon, en-Latn-US
en-GB-oed >> en-Latn-US
i-klingon >> tlh
@ -1525,7 +1525,7 @@ en >> null
x-piglatin >> fr
x-bork >> x-bork
** test: grandfathered codes
** test: legacy codes
@supported=fr, i-klingon, en-Latn-US
en-GB-oed >> en-Latn-US
i-klingon >> tlh

View file

@ -38,14 +38,15 @@ $extension = $singleton (?: $s $alphanum{2,8} )+ ; # singleton 1*("-" (2*8alpha
$privateUse = $x (?: $s $alphanum{1,8} )+ ; # "x" 1*("-" (1*8alphanum))
# Define certain grandfathered codes, since otherwise the regex is pretty useless.
# Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47),
# since otherwise the regex is pretty useless.
# Since these are limited, this is safe even with later changes to the registry --
# the only oddity is that it might change the type of the tag, and thus
# the results from the capturing groups.
# http://www.iana.org/assignments/language-subtag-registry
# Note that these have to be compared case insensitively, requiring (?i) below.
$grandfathered = en $s GB $s oed
$legacy = en $s GB $s oed
| i $s (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )
| no $s (?: bok | nyn )
| sgn $s (?: BE $s (?: fr | nl) | CH $s de )
@ -55,7 +56,7 @@ $grandfathered = en $s GB $s oed
# For well-formedness, we don't need the ones that would otherwise pass.
# For validity, they need to be checked.
# $grandfatheredWellFormed = (?:
# $legacyWellFormed = (?:
# art $s lojban
# | cel $s gaulish
# | zh $s (?: guoyu | hakka | xiang )
@ -78,12 +79,12 @@ $langtag = (?: ( $language )
(?: $s ( $privateUse ) )? 5%);
# Here is the final breakdown, with capturing groups for each of these components
# The variants, extensions, grandfathered, and private-use may have interior '-'
# The variants, extensions, legacy, and private-use may have interior '-'
$root = (?i) # case-insensitive
(?:
$langtag 90%
| ( $privateUse ) 5%
| ( $grandfathered ) 5%)
| ( $legacy ) 5%)
# (?: \@ $keywords )? 5%
;

View file

@ -527,7 +527,8 @@ public final class SupplementalData {
// ...
// Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
//
// Note that this implementation does not need to handle "grandfathered" tags.
// Note that this implementation does not need to handle
// legacy language tags (marked as Type: grandfathered in BCP 47).
private Optional<LocaleId> addLikelySubtags(String localeId) {
if (localeId.equals("root")) {
return Optional.empty();