mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-21184 rephrase docs/comments using the term grandfathered
This commit is contained in:
parent
cde54fc5ba
commit
39da689d30
18 changed files with 138 additions and 287 deletions
|
@ -1025,13 +1025,14 @@ Locale::forLanguageTag(StringPiece tag, UErrorCode& status)
|
|||
return result;
|
||||
}
|
||||
|
||||
// If a BCP-47 language tag is passed as the language parameter to the
|
||||
// If a BCP 47 language tag is passed as the language parameter to the
|
||||
// normal Locale constructor, it will actually fall back to invoking
|
||||
// uloc_forLanguageTag() to parse it if it somehow is able to detect that
|
||||
// the string actually is BCP-47. This works well for things like strings
|
||||
// using BCP-47 extensions, but it does not at all work for things like
|
||||
// BCP-47 grandfathered tags (eg. "en-GB-oed") which are possible to also
|
||||
// interpret as ICU locale IDs and because of that won't trigger the BCP-47
|
||||
// the string actually is BCP 47. This works well for things like strings
|
||||
// using BCP 47 extensions, but it does not at all work for things like
|
||||
// legacy language tags (marked as “Type: grandfathered” in BCP 47,
|
||||
// e.g., "en-GB-oed") which are possible to also
|
||||
// interpret as ICU locale IDs and because of that won't trigger the BCP 47
|
||||
// parsing. Therefore the code here explicitly calls uloc_forLanguageTag()
|
||||
// and then Locale::init(), instead of just calling the normal constructor.
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ typedef struct ULanguageTag {
|
|||
VariantListEntry *variants;
|
||||
ExtensionListEntry *extensions;
|
||||
const char *privateuse;
|
||||
const char *grandfathered;
|
||||
const char *legacy;
|
||||
} ULanguageTag;
|
||||
|
||||
#define MINLEN 2
|
||||
|
@ -85,8 +85,9 @@ static const char LOCALE_TYPE_YES[] = "yes";
|
|||
Updated on 2018-09-12 from
|
||||
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
|
||||
|
||||
This table has 2 parts. The parts for Grandfathered tags is generated by the
|
||||
following scripts from the IANA language tag registry.
|
||||
This table has 2 parts. The part for
|
||||
legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
is generated by the following scripts from the IANA language tag registry.
|
||||
|
||||
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
|
||||
egrep -A 7 'Type: grandfathered' | \
|
||||
|
@ -100,8 +101,8 @@ static const char LOCALE_TYPE_YES[] = "yes";
|
|||
values. They may have to be removed for strict BCP 47 compliance.
|
||||
|
||||
*/
|
||||
static const char* const GRANDFATHERED[] = {
|
||||
/* grandfathered preferred */
|
||||
static const char* const LEGACY[] = {
|
||||
/* legacy preferred */
|
||||
"art-lojban", "jbo",
|
||||
"en-gb-oed", "en-gb-oxendict",
|
||||
"i-ami", "ami",
|
||||
|
@ -124,7 +125,7 @@ static const char* const GRANDFATHERED[] = {
|
|||
"zh-min-nan", "nan",
|
||||
"zh-xiang", "hsn",
|
||||
|
||||
// Grandfathered tags with no preferred value in the IANA
|
||||
// Legacy tags with no preferred value in the IANA
|
||||
// registry. Kept for now for backward compatibility
|
||||
// because ICU has mapped them this way.
|
||||
"cel-gaulish", "xtg-x-cel-gaulish",
|
||||
|
@ -346,7 +347,7 @@ ultag_getPrivateUse(const ULanguageTag* langtag);
|
|||
|
||||
#if 0
|
||||
static const char*
|
||||
ultag_getGrandfathered(const ULanguageTag* langtag);
|
||||
ultag_getLegacy(const ULanguageTag* langtag);
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
@ -986,7 +987,7 @@ _initializeULanguageTag(ULanguageTag* langtag) {
|
|||
langtag->variants = NULL;
|
||||
langtag->extensions = NULL;
|
||||
|
||||
langtag->grandfathered = EMPTY;
|
||||
langtag->legacy = EMPTY;
|
||||
langtag->privateuse = EMPTY;
|
||||
}
|
||||
|
||||
|
@ -2042,7 +2043,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
|
|||
char *pExtValueSubtag, *pExtValueSubtagEnd;
|
||||
int32_t i;
|
||||
UBool privateuseVar = FALSE;
|
||||
int32_t grandfatheredLen = 0;
|
||||
int32_t legacyLen = 0;
|
||||
|
||||
if (parsedLen != NULL) {
|
||||
*parsedLen = 0;
|
||||
|
@ -2082,25 +2083,25 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
|
|||
}
|
||||
|
||||
size_t parsedLenDelta = 0;
|
||||
// Grandfathered tag will be consider together. Grandfathered tag with intervening
|
||||
// A legacy tag will be considered as a whole. A legacy tag with intervening
|
||||
// script and region such as art-DE-lojban or art-Latn-lojban won't be
|
||||
// matched.
|
||||
/* check if the tag is grandfathered */
|
||||
for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) {
|
||||
int32_t checkGrandfatheredLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i]));
|
||||
if (tagLen < checkGrandfatheredLen) {
|
||||
/* check if the tag is legacy */
|
||||
for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) {
|
||||
int32_t checkLegacyLen = static_cast<int32_t>(uprv_strlen(LEGACY[i]));
|
||||
if (tagLen < checkLegacyLen) {
|
||||
continue;
|
||||
}
|
||||
if (tagLen > checkGrandfatheredLen && tagBuf[checkGrandfatheredLen] != '-') {
|
||||
if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') {
|
||||
// make sure next char is '-'.
|
||||
continue;
|
||||
}
|
||||
if (uprv_strnicmp(GRANDFATHERED[i], tagBuf, checkGrandfatheredLen) == 0) {
|
||||
if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) {
|
||||
int32_t newTagLength;
|
||||
|
||||
grandfatheredLen = checkGrandfatheredLen; /* back up for output parsedLen */
|
||||
int32_t replacementLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i+1]));
|
||||
newTagLength = replacementLen + tagLen - checkGrandfatheredLen;
|
||||
legacyLen = checkLegacyLen; /* back up for output parsedLen */
|
||||
int32_t replacementLen = static_cast<int32_t>(uprv_strlen(LEGACY[i+1]));
|
||||
newTagLength = replacementLen + tagLen - checkLegacyLen;
|
||||
if (tagLen < newTagLength) {
|
||||
uprv_free(tagBuf);
|
||||
tagBuf = (char*)uprv_malloc(newTagLength + 1);
|
||||
|
@ -2111,16 +2112,16 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
|
|||
t->buf = tagBuf;
|
||||
tagLen = newTagLength;
|
||||
}
|
||||
parsedLenDelta = checkGrandfatheredLen - replacementLen;
|
||||
uprv_strcpy(t->buf, GRANDFATHERED[i + 1]);
|
||||
if (checkGrandfatheredLen != tagLen) {
|
||||
uprv_strcpy(t->buf + replacementLen, tag + checkGrandfatheredLen);
|
||||
parsedLenDelta = checkLegacyLen - replacementLen;
|
||||
uprv_strcpy(t->buf, LEGACY[i + 1]);
|
||||
if (checkLegacyLen != tagLen) {
|
||||
uprv_strcpy(t->buf + replacementLen, tag + checkLegacyLen);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (grandfatheredLen == 0) {
|
||||
if (legacyLen == 0) {
|
||||
for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
|
||||
const char* redundantTag = REDUNDANT[i];
|
||||
size_t redundantTagLen = uprv_strlen(redundantTag);
|
||||
|
@ -2608,8 +2609,8 @@ ultag_getPrivateUse(const ULanguageTag* langtag) {
|
|||
|
||||
#if 0
|
||||
static const char*
|
||||
ultag_getGrandfathered(const ULanguageTag* langtag) {
|
||||
return langtag->grandfathered;
|
||||
ultag_getLegacy(const ULanguageTag* langtag) {
|
||||
return langtag->legacy;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -109,13 +109,17 @@ ulocimp_toLanguageTag(const char* localeID,
|
|||
* If the specified language tag contains any ill-formed subtags,
|
||||
* the first such subtag and all following subtags are ignored.
|
||||
* <p>
|
||||
* This implements the 'Language-Tag' production of BCP47, and so
|
||||
* supports grandfathered (regular and irregular) as well as private
|
||||
* use language tags. Private use tags are represented as 'x-whatever',
|
||||
* and grandfathered tags are converted to their canonical replacements
|
||||
* where they exist. Note that a few grandfathered tags have no modern
|
||||
* replacement, these will be converted using the fallback described in
|
||||
* This implements the 'Language-Tag' production of BCP 47, and so
|
||||
* supports legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
* (regular and irregular) as well as private use language tags.
|
||||
*
|
||||
* Private use tags are represented as 'x-whatever',
|
||||
* and legacy tags are converted to their canonical replacements where they exist.
|
||||
*
|
||||
* Note that a few legacy tags have no modern replacement;
|
||||
* these will be converted using the fallback described in
|
||||
* the first paragraph, so some information might be lost.
|
||||
*
|
||||
* @param langtag the input BCP47 language tag.
|
||||
* @param tagLen the length of langtag, or -1 to call uprv_strlen().
|
||||
* @param sink the output sink receiving a locale ID for the
|
||||
|
|
|
@ -92,11 +92,12 @@ public:
|
|||
/**
|
||||
* Resets the LocaleBuilder to match the provided
|
||||
* [Unicode Locale Identifier](http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_id) .
|
||||
* Discards the existing state. the empty string cause the builder to be
|
||||
* reset, like {@link #clear}. Grandfathered tags are converted to their
|
||||
* canonical form before being processed. Otherwise, the <code>language
|
||||
* tag</code> must be well-formed, or else the build() method will later
|
||||
* report an U_ILLEGAL_ARGUMENT_ERROR.
|
||||
* Discards the existing state.
|
||||
* The empty string causes the builder to be reset, like {@link #clear}.
|
||||
* Legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
* are converted to their canonical form before being processed.
|
||||
* Otherwise, the <code>language tag</code> must be well-formed,
|
||||
* or else the build() method will later report a U_ILLEGAL_ARGUMENT_ERROR.
|
||||
*
|
||||
* <p>This method clears the internal UErrorCode.
|
||||
*
|
||||
|
|
|
@ -393,13 +393,17 @@ public:
|
|||
* If the specified language tag contains any ill-formed subtags,
|
||||
* the first such subtag and all following subtags are ignored.
|
||||
* <p>
|
||||
* This implements the 'Language-Tag' production of BCP47, and so
|
||||
* supports grandfathered (regular and irregular) as well as private
|
||||
* use language tags. Private use tags are represented as 'x-whatever',
|
||||
* and grandfathered tags are converted to their canonical replacements
|
||||
* where they exist. Note that a few grandfathered tags have no modern
|
||||
* replacement, these will be converted using the fallback described in
|
||||
* This implements the 'Language-Tag' production of BCP 47, and so
|
||||
* supports legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
* (regular and irregular) as well as private use language tags.
|
||||
*
|
||||
* Private use tags are represented as 'x-whatever',
|
||||
* and legacy tags are converted to their canonical replacements where they exist.
|
||||
*
|
||||
* Note that a few legacy tags have no modern replacement;
|
||||
* these will be converted using the fallback described in
|
||||
* the first paragraph, so some information might be lost.
|
||||
*
|
||||
* @param tag the input BCP47 language tag.
|
||||
* @param status error information if creating the Locale failed.
|
||||
* @return the Locale for the specified BCP47 language tag.
|
||||
|
|
|
@ -1237,14 +1237,18 @@ uloc_minimizeSubtags(const char* localeID,
|
|||
* Returns a locale ID for the specified BCP47 language tag string.
|
||||
* If the specified language tag contains any ill-formed subtags,
|
||||
* the first such subtag and all following subtags are ignored.
|
||||
* <p>
|
||||
* This implements the 'Language-Tag' production of BCP47, and so
|
||||
* supports grandfathered (regular and irregular) as well as private
|
||||
* use language tags. Private use tags are represented as 'x-whatever',
|
||||
* and grandfathered tags are converted to their canonical replacements
|
||||
* where they exist. Note that a few grandfathered tags have no modern
|
||||
* replacement, these will be converted using the fallback described in
|
||||
* <p>
|
||||
* This implements the 'Language-Tag' production of BCP 47, and so
|
||||
* supports legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
* (regular and irregular) as well as private use language tags.
|
||||
*
|
||||
* Private use tags are represented as 'x-whatever',
|
||||
* and legacy tags are converted to their canonical replacements where they exist.
|
||||
*
|
||||
* Note that a few legacy tags have no modern replacement;
|
||||
* these will be converted using the fallback described in
|
||||
* the first paragraph, so some information might be lost.
|
||||
*
|
||||
* @param langtag the input BCP47 language tag.
|
||||
* @param localeID the output buffer receiving a locale ID for the
|
||||
* specified BCP47 language tag.
|
||||
|
|
|
@ -266,7 +266,7 @@ static ECalType getCalendarTypeForLocale(const char *locid) {
|
|||
//TODO: ULOC_FULL_NAME is out of date and too small..
|
||||
char canonicalName[256];
|
||||
|
||||
// canonicalize, so grandfathered variant will be transformed to keywords
|
||||
// Canonicalize, so that an old-style variant will be transformed to keywords.
|
||||
// e.g., ja_JP_TRADITIONAL -> ja_JP@calendar=japanese
|
||||
// NOTE: Since ICU-20187, ja_JP_TRADITIONAL no longer canonicalizes, and
|
||||
// the Gregorian calendar is returned instead.
|
||||
|
|
|
@ -30,8 +30,6 @@ U_NAMESPACE_BEGIN
|
|||
// -------------------------------------
|
||||
// Creates a formattable object with a char* string.
|
||||
// This API is useless. The API that takes a UnicodeString is actually just as good.
|
||||
// This is just a grandfathered API.
|
||||
|
||||
Formattable::Formattable(const char* stringToCopy)
|
||||
{
|
||||
init();
|
||||
|
|
|
@ -279,7 +279,7 @@ und-TW >> zh-Hant
|
|||
zh-Hant >> und-TW
|
||||
zh >> und-TW
|
||||
|
||||
** test: testMatchGrandfatheredCode
|
||||
** test: testMatchLegacyCode
|
||||
|
||||
@supported=fr, i-klingon, en-Latn-US
|
||||
en-GB-oed >> en-Latn-US
|
||||
|
@ -984,7 +984,7 @@ x-bork >> x-bork
|
|||
x-piglatin >> x-bork
|
||||
x-bork >> x-bork
|
||||
|
||||
** test: MatchGrandfatheredCode
|
||||
** test: MatchLegacyCode
|
||||
@supported=fr, i-klingon, en-Latn-US
|
||||
en-GB-oed >> en-Latn-US
|
||||
i-klingon >> tlh
|
||||
|
@ -1525,7 +1525,7 @@ en >> null
|
|||
x-piglatin >> fr
|
||||
x-bork >> x-bork
|
||||
|
||||
** test: grandfathered codes
|
||||
** test: legacy codes
|
||||
@supported=fr, i-klingon, en-Latn-US
|
||||
en-GB-oed >> en-Latn-US
|
||||
i-klingon >> tlh
|
||||
|
|
|
@ -38,13 +38,13 @@ public class LanguageTag {
|
|||
private List<String> _variants = Collections.emptyList(); // variant subtags
|
||||
private List<String> _extensions = Collections.emptyList(); // extensions
|
||||
|
||||
// Map contains grandfathered tags and its preferred mappings from
|
||||
// http://www.ietf.org/rfc/rfc5646.txt
|
||||
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED =
|
||||
// The Map contains legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
// and their preferred mappings from BCP 47.
|
||||
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> LEGACY =
|
||||
new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>();
|
||||
|
||||
static {
|
||||
// grandfathered = irregular ; non-redundant tags registered
|
||||
// legacy = irregular ; non-redundant tags registered
|
||||
// / regular ; during the RFC 3066 era
|
||||
//
|
||||
// irregular = "en-GB-oed" ; irregular tags do not match
|
||||
|
@ -105,57 +105,17 @@ public class LanguageTag {
|
|||
{"zh-xiang", "hsn"},
|
||||
};
|
||||
for (String[] e : entries) {
|
||||
GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
|
||||
LEGACY.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
|
||||
}
|
||||
}
|
||||
|
||||
private LanguageTag() {
|
||||
}
|
||||
|
||||
/*
|
||||
* BNF in RFC 5646
|
||||
*
|
||||
* Language-Tag = langtag ; normal language tags
|
||||
* / privateuse ; private use tag
|
||||
* / grandfathered ; grandfathered tags
|
||||
*
|
||||
*
|
||||
* langtag = language
|
||||
* ["-" script]
|
||||
* ["-" region]
|
||||
* *("-" variant)
|
||||
* *("-" extension)
|
||||
* ["-" privateuse]
|
||||
*
|
||||
* language = 2*3ALPHA ; shortest ISO 639 code
|
||||
* ["-" extlang] ; sometimes followed by
|
||||
* ; extended language subtags
|
||||
* / 4ALPHA ; or reserved for future use
|
||||
* / 5*8ALPHA ; or registered language subtag
|
||||
*
|
||||
* extlang = 3ALPHA ; selected ISO 639 codes
|
||||
* *2("-" 3ALPHA) ; permanently reserved
|
||||
*
|
||||
* script = 4ALPHA ; ISO 15924 code
|
||||
*
|
||||
* region = 2ALPHA ; ISO 3166-1 code
|
||||
* / 3DIGIT ; UN M.49 code
|
||||
*
|
||||
* variant = 5*8alphanum ; registered variants
|
||||
* / (DIGIT 3alphanum)
|
||||
*
|
||||
* extension = singleton 1*("-" (2*8alphanum))
|
||||
*
|
||||
* ; Single alphanumerics
|
||||
* ; "x" reserved for private use
|
||||
* singleton = DIGIT ; 0 - 9
|
||||
* / %x41-57 ; A - W
|
||||
* / %x59-5A ; Y - Z
|
||||
* / %x61-77 ; a - w
|
||||
* / %x79-7A ; y - z
|
||||
*
|
||||
* privateuse = "x" 1*("-" (1*8alphanum))
|
||||
*
|
||||
/**
|
||||
* See BCP 47 “Tags for Identifying Languages”:
|
||||
* https://www.rfc-editor.org/info/bcp47 -->
|
||||
* https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1
|
||||
*/
|
||||
public static LanguageTag parse(String languageTag, ParseStatus sts) {
|
||||
if (sts == null) {
|
||||
|
@ -166,8 +126,7 @@ public class LanguageTag {
|
|||
|
||||
StringTokenIterator itr;
|
||||
|
||||
// Check if the tag is grandfathered
|
||||
String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
|
||||
String[] gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
|
||||
if (gfmap != null) {
|
||||
// use preferred mapping
|
||||
itr = new StringTokenIterator(gfmap[1], SEP);
|
||||
|
|
|
@ -70,7 +70,8 @@ import com.ibm.icu.impl.locale.UnicodeLocaleExtension;
|
|||
* Canonicalization additionally performs the following:
|
||||
* <ul>
|
||||
* <li>POSIX ids are converted to ICU format IDs</li>
|
||||
* <li>'grandfathered' 3066 ids are converted to ICU standard form</li>
|
||||
* <li>Legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
* are converted to ICU standard form</li>
|
||||
* <li>'PREEURO' and 'EURO' variants are converted to currency keyword form,
|
||||
* with the currency
|
||||
* id appropriate to the country of the locale (for PREEURO) or EUR (for EURO).
|
||||
|
@ -1033,7 +1034,7 @@ public final class ULocale implements Serializable {
|
|||
|
||||
/**
|
||||
* {@icu} Returns the canonical name for the specified locale ID. This is used to
|
||||
* convert POSIX and other grandfathered IDs to standard ICU form.
|
||||
* convert POSIX and other legacy IDs to standard ICU form.
|
||||
* @param localeID the locale id
|
||||
* @return the canonicalized id
|
||||
* @stable ICU 3.0
|
||||
|
@ -2666,60 +2667,18 @@ public final class ULocale implements Serializable {
|
|||
* script to title case, country to upper case, variant to upper case,
|
||||
* and extensions to lower case.
|
||||
*
|
||||
* <p>This implements the 'Language-Tag' production of BCP47, and
|
||||
* so supports grandfathered (regular and irregular) as well as
|
||||
* private use language tags. Stand alone private use tags are
|
||||
* represented as empty language and extension 'x-whatever',
|
||||
* and grandfathered tags are converted to their canonical replacements
|
||||
* where they exist.
|
||||
* <p>This implements the 'Language-Tag' production of BCP 47, and so
|
||||
* supports legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
* (regular and irregular) as well as private use language tags.
|
||||
*
|
||||
* <p>Grandfathered tags with canonical replacements are as follows:
|
||||
* <p>Stand-alone private use tags are represented as empty language and extension 'x-whatever',
|
||||
* and legacy tags are converted to their canonical replacements where they exist.
|
||||
*
|
||||
* <table>
|
||||
* <tbody align="center">
|
||||
* <tr><th>grandfathered tag</th><th> </th><th>modern replacement</th></tr>
|
||||
* <tr><td>art-lojban</td><td> </td><td>jbo</td></tr>
|
||||
* <tr><td>i-ami</td><td> </td><td>ami</td></tr>
|
||||
* <tr><td>i-bnn</td><td> </td><td>bnn</td></tr>
|
||||
* <tr><td>i-hak</td><td> </td><td>hak</td></tr>
|
||||
* <tr><td>i-klingon</td><td> </td><td>tlh</td></tr>
|
||||
* <tr><td>i-lux</td><td> </td><td>lb</td></tr>
|
||||
* <tr><td>i-navajo</td><td> </td><td>nv</td></tr>
|
||||
* <tr><td>i-pwn</td><td> </td><td>pwn</td></tr>
|
||||
* <tr><td>i-tao</td><td> </td><td>tao</td></tr>
|
||||
* <tr><td>i-tay</td><td> </td><td>tay</td></tr>
|
||||
* <tr><td>i-tsu</td><td> </td><td>tsu</td></tr>
|
||||
* <tr><td>no-bok</td><td> </td><td>nb</td></tr>
|
||||
* <tr><td>no-nyn</td><td> </td><td>nn</td></tr>
|
||||
* <tr><td>sgn-BE-FR</td><td> </td><td>sfb</td></tr>
|
||||
* <tr><td>sgn-BE-NL</td><td> </td><td>vgt</td></tr>
|
||||
* <tr><td>sgn-CH-DE</td><td> </td><td>sgg</td></tr>
|
||||
* <tr><td>zh-guoyu</td><td> </td><td>cmn</td></tr>
|
||||
* <tr><td>zh-hakka</td><td> </td><td>hak</td></tr>
|
||||
* <tr><td>zh-min-nan</td><td> </td><td>nan</td></tr>
|
||||
* <tr><td>zh-xiang</td><td> </td><td>hsn</td></tr>
|
||||
* </tbody>
|
||||
* </table>
|
||||
* <p>Note that a few legacy tags have no modern replacement;
|
||||
* these will be converted using the fallback described in
|
||||
* the first paragraph, so some information might be lost.
|
||||
*
|
||||
* <p>Grandfathered tags with no modern replacement will be
|
||||
* converted as follows:
|
||||
*
|
||||
* <table>
|
||||
* <tbody align="center">
|
||||
* <tr><th>grandfathered tag</th><th> </th><th>converts to</th></tr>
|
||||
* <tr><td>cel-gaulish</td><td> </td><td>xtg-x-cel-gaulish</td></tr>
|
||||
* <tr><td>en-GB-oed</td><td> </td><td>en-GB-x-oed</td></tr>
|
||||
* <tr><td>i-default</td><td> </td><td>en-x-i-default</td></tr>
|
||||
* <tr><td>i-enochian</td><td> </td><td>und-x-i-enochian</td></tr>
|
||||
* <tr><td>i-mingo</td><td> </td><td>see-x-i-mingo</td></tr>
|
||||
* <tr><td>zh-min</td><td> </td><td>nan-x-zh-min</td></tr>
|
||||
* </tbody>
|
||||
* </table>
|
||||
*
|
||||
* <p>For a list of all grandfathered tags, see the
|
||||
* IANA Language Subtag Registry (search for "Type: grandfathered").
|
||||
*
|
||||
* <p><b>Note</b>: there is no guarantee that <code>toLanguageTag</code>
|
||||
* <p><b>Note</b>: There is no guarantee that <code>toLanguageTag</code>
|
||||
* and <code>forLanguageTag</code> will round-trip.
|
||||
*
|
||||
* @param languageTag the language tag
|
||||
|
@ -2821,7 +2780,7 @@ public final class ULocale implements Serializable {
|
|||
* Resets the Builder to match the provided IETF BCP 47
|
||||
* language tag. Discards the existing state. Null and the
|
||||
* empty string cause the builder to be reset, like {@link
|
||||
* #clear}. Grandfathered tags (see {@link
|
||||
* #clear}. Legacy tags (see {@link
|
||||
* ULocale#forLanguageTag}) are converted to their canonical
|
||||
* form before being processed. Otherwise, the language tag
|
||||
* must be well-formed (see {@link ULocale}) or an exception is
|
||||
|
|
|
@ -46,7 +46,7 @@ public final class CalendarUtil {
|
|||
return calType.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
// Canonicalize, so grandfathered variant will be transformed to keywords
|
||||
// Canonicalize, so that an old-style variant will be transformed to keywords.
|
||||
ULocale canonical = ULocale.createCanonical(loc.toString());
|
||||
calType = canonical.getKeywordValue(CALKEY);
|
||||
if (calType != null) {
|
||||
|
|
|
@ -38,13 +38,13 @@ public class LanguageTag {
|
|||
private List<String> _variants = Collections.emptyList(); // variant subtags
|
||||
private List<String> _extensions = Collections.emptyList(); // extensions
|
||||
|
||||
// Map contains grandfathered tags and its preferred mappings from
|
||||
// http://www.ietf.org/rfc/rfc5646.txt
|
||||
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED =
|
||||
// The Map contains legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
// and their preferred mappings from BCP 47.
|
||||
private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> LEGACY =
|
||||
new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>();
|
||||
|
||||
static {
|
||||
// grandfathered = irregular ; non-redundant tags registered
|
||||
// legacy = irregular ; non-redundant tags registered
|
||||
// / regular ; during the RFC 3066 era
|
||||
//
|
||||
// irregular = "en-GB-oed" ; irregular tags do not match
|
||||
|
@ -105,57 +105,17 @@ public class LanguageTag {
|
|||
{"zh-xiang", "hsn"},
|
||||
};
|
||||
for (String[] e : entries) {
|
||||
GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
|
||||
LEGACY.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
|
||||
}
|
||||
}
|
||||
|
||||
private LanguageTag() {
|
||||
}
|
||||
|
||||
/*
|
||||
* BNF in RFC 5646
|
||||
*
|
||||
* Language-Tag = langtag ; normal language tags
|
||||
* / privateuse ; private use tag
|
||||
* / grandfathered ; grandfathered tags
|
||||
*
|
||||
*
|
||||
* langtag = language
|
||||
* ["-" script]
|
||||
* ["-" region]
|
||||
* *("-" variant)
|
||||
* *("-" extension)
|
||||
* ["-" privateuse]
|
||||
*
|
||||
* language = 2*3ALPHA ; shortest ISO 639 code
|
||||
* ["-" extlang] ; sometimes followed by
|
||||
* ; extended language subtags
|
||||
* / 4ALPHA ; or reserved for future use
|
||||
* / 5*8ALPHA ; or registered language subtag
|
||||
*
|
||||
* extlang = 3ALPHA ; selected ISO 639 codes
|
||||
* *2("-" 3ALPHA) ; permanently reserved
|
||||
*
|
||||
* script = 4ALPHA ; ISO 15924 code
|
||||
*
|
||||
* region = 2ALPHA ; ISO 3166-1 code
|
||||
* / 3DIGIT ; UN M.49 code
|
||||
*
|
||||
* variant = 5*8alphanum ; registered variants
|
||||
* / (DIGIT 3alphanum)
|
||||
*
|
||||
* extension = singleton 1*("-" (2*8alphanum))
|
||||
*
|
||||
* ; Single alphanumerics
|
||||
* ; "x" reserved for private use
|
||||
* singleton = DIGIT ; 0 - 9
|
||||
* / %x41-57 ; A - W
|
||||
* / %x59-5A ; Y - Z
|
||||
* / %x61-77 ; a - w
|
||||
* / %x79-7A ; y - z
|
||||
*
|
||||
* privateuse = "x" 1*("-" (1*8alphanum))
|
||||
*
|
||||
/**
|
||||
* See BCP 47 “Tags for Identifying Languages”:
|
||||
* https://www.rfc-editor.org/info/bcp47 -->
|
||||
* https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1
|
||||
*/
|
||||
public static LanguageTag parse(String languageTag, ParseStatus sts) {
|
||||
if (sts == null) {
|
||||
|
@ -165,14 +125,13 @@ public class LanguageTag {
|
|||
}
|
||||
|
||||
StringTokenIterator itr;
|
||||
boolean isGrandfathered = false;
|
||||
boolean isLegacy = false;
|
||||
|
||||
// Check if the tag is grandfathered
|
||||
String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
|
||||
String[] gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
|
||||
// Language tag is at least 2 alpha so we can skip searching the first 2 chars.
|
||||
int dash = 2;
|
||||
while (gfmap == null && (dash = languageTag.indexOf('-', dash + 1)) != -1) {
|
||||
gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag.substring(0, dash)));
|
||||
gfmap = LEGACY.get(new AsciiUtil.CaseInsensitiveKey(languageTag.substring(0, dash)));
|
||||
}
|
||||
|
||||
if (gfmap != null) {
|
||||
|
@ -183,7 +142,7 @@ public class LanguageTag {
|
|||
// append the rest of the tag.
|
||||
itr = new StringTokenIterator(gfmap[1] + languageTag.substring(dash), SEP);
|
||||
}
|
||||
isGrandfathered = true;
|
||||
isLegacy = true;
|
||||
} else {
|
||||
itr = new StringTokenIterator(languageTag, SEP);
|
||||
}
|
||||
|
@ -202,8 +161,8 @@ public class LanguageTag {
|
|||
}
|
||||
tag.parsePrivateuse(itr, sts);
|
||||
|
||||
if (isGrandfathered) {
|
||||
// Grandfathered tag is replaced with a well-formed tag above.
|
||||
if (isLegacy) {
|
||||
// A legacy tag is replaced with a well-formed tag above.
|
||||
// However, the parsed length must be the original tag length.
|
||||
assert (itr.isDone());
|
||||
assert (!sts.isError());
|
||||
|
|
|
@ -80,7 +80,8 @@ import com.ibm.icu.text.LocaleDisplayNames.DialectHandling;
|
|||
* Canonicalization additionally performs the following:
|
||||
* <ul>
|
||||
* <li>POSIX ids are converted to ICU format IDs</li>
|
||||
* <li>'grandfathered' 3066 ids are converted to ICU standard form</li>
|
||||
* <li>Legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
* are converted to ICU standard form</li>
|
||||
* </ul>
|
||||
* All ULocale constructors automatically normalize the locale id. To handle
|
||||
* POSIX ids, <code>canonicalize</code> can be called to convert the id
|
||||
|
@ -1204,7 +1205,7 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
|
|||
|
||||
/**
|
||||
* {@icu} Returns the canonical name according to CLDR for the specified locale ID.
|
||||
* This is used to convert POSIX and other grandfathered IDs to standard ICU form.
|
||||
* This is used to convert POSIX and other legacy IDs to standard ICU form.
|
||||
* @param localeID the locale id
|
||||
* @return the canonicalized id
|
||||
* @stable ICU 3.0
|
||||
|
@ -1242,7 +1243,7 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
|
|||
// element in Supplemental Data, replace the language subtag with the replacement value.
|
||||
// If there are additional subtags in the replacement value, add them to the result, but
|
||||
// only if there is no corresponding subtag already in the tag.
|
||||
// Five special deprecated grandfathered codes (such as i-default) are in type attributes, and are also replaced.
|
||||
// Five special deprecated codes (such as i-default) are in type attributes, and are also replaced.
|
||||
try {
|
||||
UResourceBundle languageAlias = UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME,
|
||||
"metadata", ICUResourceBundle.ICU_DATA_CLASS_LOADER)
|
||||
|
@ -3201,58 +3202,16 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
|
|||
*
|
||||
* </ul>
|
||||
*
|
||||
* <p>This implements the 'Language-Tag' production of BCP47, and
|
||||
* so supports grandfathered (regular and irregular) as well as
|
||||
* private use language tags. Stand alone private use tags are
|
||||
* represented as empty language and extension 'x-whatever',
|
||||
* and grandfathered tags are converted to their canonical replacements
|
||||
* where they exist.
|
||||
* <p>This implements the 'Language-Tag' production of BCP 47, and so
|
||||
* supports legacy language tags (marked as “Type: grandfathered” in BCP 47)
|
||||
* (regular and irregular) as well as private use language tags.
|
||||
*
|
||||
* <p>Grandfathered tags with canonical replacements are as follows:
|
||||
* <p>Stand-alone private use tags are represented as empty language and extension 'x-whatever',
|
||||
* and legacy tags are converted to their canonical replacements where they exist.
|
||||
*
|
||||
* <table>
|
||||
* <tbody align="center">
|
||||
* <tr><th>grandfathered tag</th><th> </th><th>modern replacement</th></tr>
|
||||
* <tr><td>art-lojban</td><td> </td><td>jbo</td></tr>
|
||||
* <tr><td>i-ami</td><td> </td><td>ami</td></tr>
|
||||
* <tr><td>i-bnn</td><td> </td><td>bnn</td></tr>
|
||||
* <tr><td>i-hak</td><td> </td><td>hak</td></tr>
|
||||
* <tr><td>i-klingon</td><td> </td><td>tlh</td></tr>
|
||||
* <tr><td>i-lux</td><td> </td><td>lb</td></tr>
|
||||
* <tr><td>i-navajo</td><td> </td><td>nv</td></tr>
|
||||
* <tr><td>i-pwn</td><td> </td><td>pwn</td></tr>
|
||||
* <tr><td>i-tao</td><td> </td><td>tao</td></tr>
|
||||
* <tr><td>i-tay</td><td> </td><td>tay</td></tr>
|
||||
* <tr><td>i-tsu</td><td> </td><td>tsu</td></tr>
|
||||
* <tr><td>no-bok</td><td> </td><td>nb</td></tr>
|
||||
* <tr><td>no-nyn</td><td> </td><td>nn</td></tr>
|
||||
* <tr><td>sgn-BE-FR</td><td> </td><td>sfb</td></tr>
|
||||
* <tr><td>sgn-BE-NL</td><td> </td><td>vgt</td></tr>
|
||||
* <tr><td>sgn-CH-DE</td><td> </td><td>sgg</td></tr>
|
||||
* <tr><td>zh-guoyu</td><td> </td><td>cmn</td></tr>
|
||||
* <tr><td>zh-hakka</td><td> </td><td>hak</td></tr>
|
||||
* <tr><td>zh-min-nan</td><td> </td><td>nan</td></tr>
|
||||
* <tr><td>zh-xiang</td><td> </td><td>hsn</td></tr>
|
||||
* </tbody>
|
||||
* </table>
|
||||
*
|
||||
* <p>Grandfathered tags with no modern replacement will be
|
||||
* converted as follows:
|
||||
*
|
||||
* <table>
|
||||
* <tbody align="center">
|
||||
* <tr><th>grandfathered tag</th><th> </th><th>converts to</th></tr>
|
||||
* <tr><td>cel-gaulish</td><td> </td><td>xtg-x-cel-gaulish</td></tr>
|
||||
* <tr><td>en-GB-oed</td><td> </td><td>en-GB-x-oed</td></tr>
|
||||
* <tr><td>i-default</td><td> </td><td>en-x-i-default</td></tr>
|
||||
* <tr><td>i-enochian</td><td> </td><td>und-x-i-enochian</td></tr>
|
||||
* <tr><td>i-mingo</td><td> </td><td>see-x-i-mingo</td></tr>
|
||||
* <tr><td>zh-min</td><td> </td><td>nan-x-zh-min</td></tr>
|
||||
* </tbody>
|
||||
* </table>
|
||||
*
|
||||
* <p>For a list of all grandfathered tags, see the
|
||||
* IANA Language Subtag Registry (search for "Type: grandfathered").
|
||||
* <p>Note that a few legacy tags have no modern replacement;
|
||||
* these will be converted using the fallback described in
|
||||
* the first paragraph, so some information might be lost.
|
||||
*
|
||||
* <p><b>Note</b>: there is no guarantee that <code>toLanguageTag</code>
|
||||
* and <code>forLanguageTag</code> will round-trip.
|
||||
|
@ -3491,7 +3450,7 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
|
|||
* Resets the Builder to match the provided IETF BCP 47
|
||||
* language tag. Discards the existing state. Null and the
|
||||
* empty string cause the builder to be reset, like {@link
|
||||
* #clear}. Grandfathered tags (see {@link
|
||||
* #clear}. Legacy tags (see {@link
|
||||
* ULocale#forLanguageTag}) are converted to their canonical
|
||||
* form before being processed. Otherwise, the language tag
|
||||
* must be well-formed (see {@link ULocale}) or an exception is
|
||||
|
|
|
@ -365,7 +365,7 @@ public class LocaleMatcherTest extends TestFmwk {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testMatchGrandfatheredCode() {
|
||||
public void testMatchLegacyCode() {
|
||||
final LocaleMatcher matcher = newLocaleMatcher("fr, i_klingon, en_Latn_US");
|
||||
assertEquals("en_Latn_US", matcher.getBestMatch("en_GB_oed").toString());
|
||||
// assertEquals("tlh", matcher.getBestMatch("i_klingon").toString());
|
||||
|
|
|
@ -279,7 +279,7 @@ und-TW >> zh-Hant
|
|||
zh-Hant >> und-TW
|
||||
zh >> und-TW
|
||||
|
||||
** test: testMatchGrandfatheredCode
|
||||
** test: testMatchLegacyCode
|
||||
|
||||
@supported=fr, i-klingon, en-Latn-US
|
||||
en-GB-oed >> en-Latn-US
|
||||
|
@ -984,7 +984,7 @@ x-bork >> x-bork
|
|||
x-piglatin >> x-bork
|
||||
x-bork >> x-bork
|
||||
|
||||
** test: MatchGrandfatheredCode
|
||||
** test: MatchLegacyCode
|
||||
@supported=fr, i-klingon, en-Latn-US
|
||||
en-GB-oed >> en-Latn-US
|
||||
i-klingon >> tlh
|
||||
|
@ -1525,7 +1525,7 @@ en >> null
|
|||
x-piglatin >> fr
|
||||
x-bork >> x-bork
|
||||
|
||||
** test: grandfathered codes
|
||||
** test: legacy codes
|
||||
@supported=fr, i-klingon, en-Latn-US
|
||||
en-GB-oed >> en-Latn-US
|
||||
i-klingon >> tlh
|
||||
|
|
|
@ -38,14 +38,15 @@ $extension = $singleton (?: $s $alphanum{2,8} )+ ; # singleton 1*("-" (2*8alpha
|
|||
|
||||
$privateUse = $x (?: $s $alphanum{1,8} )+ ; # "x" 1*("-" (1*8alphanum))
|
||||
|
||||
# Define certain grandfathered codes, since otherwise the regex is pretty useless.
|
||||
# Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47),
|
||||
# since otherwise the regex is pretty useless.
|
||||
# Since these are limited, this is safe even with later changes to the registry --
|
||||
# the only oddity is that it might change the type of the tag, and thus
|
||||
# the results from the capturing groups.
|
||||
# http://www.iana.org/assignments/language-subtag-registry
|
||||
# Note that these have to be compared case insensitively, requiring (?i) below.
|
||||
|
||||
$grandfathered = en $s GB $s oed
|
||||
$legacy = en $s GB $s oed
|
||||
| i $s (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )
|
||||
| no $s (?: bok | nyn )
|
||||
| sgn $s (?: BE $s (?: fr | nl) | CH $s de )
|
||||
|
@ -55,7 +56,7 @@ $grandfathered = en $s GB $s oed
|
|||
# For well-formedness, we don't need the ones that would otherwise pass.
|
||||
# For validity, they need to be checked.
|
||||
|
||||
# $grandfatheredWellFormed = (?:
|
||||
# $legacyWellFormed = (?:
|
||||
# art $s lojban
|
||||
# | cel $s gaulish
|
||||
# | zh $s (?: guoyu | hakka | xiang )
|
||||
|
@ -78,12 +79,12 @@ $langtag = (?: ( $language )
|
|||
(?: $s ( $privateUse ) )? 5%);
|
||||
|
||||
# Here is the final breakdown, with capturing groups for each of these components.
|
||||
# The variants, extensions, grandfathered, and private-use may have interior '-'
|
||||
# The variants, extensions, legacy, and private-use may have interior '-'
|
||||
|
||||
$root = (?i) # case-insensitive
|
||||
(?:
|
||||
$langtag 90%
|
||||
| ( $privateUse ) 5%
|
||||
| ( $grandfathered ) 5%)
|
||||
| ( $legacy ) 5%)
|
||||
# (?: \@ $keywords )? 5%
|
||||
;
|
||||
|
|
|
@ -527,7 +527,8 @@ public final class SupplementalData {
|
|||
// ...
|
||||
// Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
|
||||
//
|
||||
// Note that this implementation does not need to handle "grandfathered" tags.
|
||||
// Note that this implementation does not need to handle
|
||||
// legacy language tags (marked as “Type: grandfathered” in BCP 47).
|
||||
private Optional<LocaleId> addLikelySubtags(String localeId) {
|
||||
if (localeId.equals("root")) {
|
||||
return Optional.empty();
|
||||
|
|
Loading…
Add table
Reference in a new issue