ICU-9562 Update language tag mapping per the latest IANA registry

uloc_forLanguageTag has a few mapping tables to map grandfathered
language tags and deprecated language subtags to their preferred or
modern values.

Update them based on the latest version of the IANA
language subtag registry. [1]

Five grandfathered tags without a preferred value are still mapped to
what ICU has mapped them to for backward compatibility until the
wisdom of continuing to do so is reviewed.

In addition, map redundant language tags to their preferred values
regardless of whether or not they're followed by other subtags (e.g.
zh-yue vs. zh-yue-u-co-pinyin).

Similarly, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 (the
variant subtag 'hepburn-heploc' with the prefix 'ja-latn' has the
preferred value 'alalc97').

Update the mapping for deprecated language subtags (e.g. 'jw' to
'jv' and a bunch of 3-letter language codes).

Add a new table for deprecated region subtags to map them to their
modern values. (e.g. 'DD' to 'DE').

Add a new test case for deprecated language and region mapping and
a few more cases for updated grandfathered and redundant tag mapping.

[1]
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
This commit is contained in:
Jungshik Shin 2018-09-26 06:43:43 -07:00 committed by Shane Carr
parent ff9876421e
commit c71a1b4b51
No known key found for this signature in database
GPG key ID: FCED3B24AAB18B5C
4 changed files with 281 additions and 14 deletions

View file

@ -79,19 +79,34 @@ static const char LOCALE_TYPE_YES[] = "yes";
#define LANG_UND_LEN 3
/*
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
This table has 2 parts. The part for grandfathered tags is generated by the
following script from the IANA language subtag registry.
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
egrep -A 7 'Type: grandfathered' | \
egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
tr 'A-Z' 'a-z'
The 2nd part is made of five ICU-specific entries. They're kept for
the backward compatibility for now, even though there are no preferred
values. They may have to be removed for the strict BCP 47 compliance.
*/
static const char* const GRANDFATHERED[] = {
/* grandfathered preferred */
"art-lojban", "jbo",
"cel-gaulish", "xtg-x-cel-gaulish",
"en-GB-oed", "en-GB-x-oed",
"en-gb-oed", "en-gb-oxendict",
"i-ami", "ami",
"i-bnn", "bnn",
"i-default", "en-x-i-default",
"i-enochian", "und-x-i-enochian",
"i-hak", "hak",
"i-klingon", "tlh",
"i-lux", "lb",
"i-mingo", "see-x-i-mingo",
"i-navajo", "nv",
"i-pwn", "pwn",
"i-tao", "tao",
@ -104,17 +119,175 @@ static const char* const GRANDFATHERED[] = {
"sgn-ch-de", "sgg",
"zh-guoyu", "cmn",
"zh-hakka", "hak",
"zh-min", "nan-x-zh-min",
"zh-min-nan", "nan",
"zh-xiang", "hsn",
NULL, NULL
// Grandfathered tags with no preferred value in the IANA
// registry. Kept for now for the backward compatibility
// because ICU has mapped them this way.
"cel-gaulish", "xtg-x-cel-gaulish",
"i-default", "en-x-i-default",
"i-enochian", "und-x-i-enochian",
"i-mingo", "see-x-i-mingo",
"zh-min", "nan-x-zh-min",
};
/*
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
The table lists redundant tags with a preferred value in the IANA language subtag registry.
It's generated with the following command:
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
tr 'A-Z' 'a-z'
In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
the variant tag 'hepburn-heploc' has the preferred value 'alalc97'.
*/
static const char *const REDUNDANT[] = {
    /*
     * Flat list of string pairs: the element at an even index is a redundant
     * tag (all lowercase) and the element right after it is the tag that
     * replaces it. There is no terminating sentinel; the number of elements
     * is taken from the array size.
     */

    /* "sgn-<region>" tags and their replacements */
    "sgn-br", "bzs",
    "sgn-co", "csn",
    "sgn-de", "gsg",
    "sgn-dk", "dsl",
    "sgn-es", "ssp",
    "sgn-fr", "fsl",
    "sgn-gb", "bfi",
    "sgn-gr", "gss",
    "sgn-ie", "isg",
    "sgn-it", "ise",
    "sgn-jp", "jsl",
    "sgn-mx", "mfs",
    "sgn-ni", "ncs",
    "sgn-nl", "dse",
    "sgn-no", "nsl",
    "sgn-pt", "psr",
    "sgn-se", "swl",
    "sgn-us", "ase",
    "sgn-za", "sfs",

    /* "zh-<extlang>" tags and their replacements */
    "zh-cmn", "cmn",
    "zh-cmn-hans", "cmn-hans",
    "zh-cmn-hant", "cmn-hant",
    "zh-gan", "gan",
    "zh-wuu", "wuu",
    "zh-yue", "yue",

    /* not a redundant tag: a variant sequence that has a preferred value */
    "ja-latn-hepburn-heploc", "ja-latn-alalc97",
};
/*
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \
grep -B1 'Preferred' | grep -v '^--' | \
awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
Make sure that 2-letter language subtags come before 3-letter subtags.
*/
static const char DEPRECATEDLANGS[][4] = {
    /*
     * Flat list of subtag pairs: the row at an even index is a deprecated
     * language subtag and the row right after it is its replacement. Each
     * row holds at most 3 characters plus the terminating NUL.
     *
     * Every row must be followed by a comma. Two adjacent string literals
     * are concatenated by the compiler, which silently merges two rows into
     * one and shifts every following deprecated/replacement pair by one.
     *
     * All 2-letter deprecated subtags must stay ahead of the 3-letter ones:
     * the lookup loop stops at the first deprecated subtag that is longer
     * than the input.
     */
/*  deprecated  new */
    "in", "id",
    "iw", "he",
    "ji", "yi",
    "jw", "jv",
    "mo", "ro",
    "aam", "aas",
    "adp", "dz",
    "aue", "ktz",
    "ayx", "nun",
    "bgm", "bcg",
    "bjd", "drl",
    "ccq", "rki",
    "cjr", "mom",
    "cka", "cmr",
    "cmk", "xch",
    "coy", "pij",
    "cqu", "quh",
    "drh", "khk",
    "drw", "prs",
    "gav", "dev",
    "gfx", "vaj",
    "ggn", "gvr",
    "gti", "nyc",
    "guv", "duz",
    "hrr", "jal",
    "ibi", "opa",
    "ilw", "gal",
    "jeg", "oyb",
    "kgc", "tdf",
    "kgh", "kml",
    "koj", "kwv",
    "krm", "bmf",
    "ktr", "dtp",
    "kvs", "gdj",
    "kwq", "yam",
    "kxe", "tvd",
    "kzj", "dtp",
    "kzt", "dtp",
    "lii", "raq",
    "lmm", "rmx",
    "meg", "cir",
    "mst", "mry",
    "mwj", "vaj",
    "myt", "mry",
    "nad", "xny",
    "ncp", "kdz",
    "nnx", "ngv",
    "nts", "pij",
    "oun", "vaj",
    "pcr", "adx",
    "pmc", "huw",
    "pmu", "phr",
    "ppa", "bfy",
    "ppr", "lcq",
    "pry", "prt",
    "puz", "pub",
    "sca", "hle",
    "skk", "oyb",
    "tdu", "dtp",
    "thc", "tpo",
    "thx", "oyb",
    "tie", "ras",
    "tkk", "twm",
    "tlw", "weo",
    "tmp", "tyj",
    "tne", "kak",
    "tnf", "prs",
    "tsf", "taj",
    "uok", "ema",
    "xba", "cax",
    "xia", "acn",
    "xkh", "waw",
    "xsj", "suj",
    "ybd", "rki",
    "yma", "lrr",
    "ymt", "mtm",
    "yos", "zom",
    "yuu", "yug",
};
/*
Updated on 2018-04-24 from
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
grep -B1 'Preferred' | \
awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
*/
static const char DEPRECATEDREGIONS[][3] = {
    /*
     * Flat list of 2-letter region subtag pairs (plus terminating NUL):
     * even index = deprecated subtag, following odd index = replacement.
     */
    "BU", "MM",     /* Burma -> Myanmar */
    "DD", "DE",     /* German Democratic Republic -> Germany */
    "FX", "FR",     /* Metropolitan France -> France */
    "TP", "TL",     /* East Timor -> Timor-Leste */
    "YD", "YE",     /* Democratic Yemen -> Yemen */
    "ZR", "CD",     /* Zaire -> Democratic Republic of the Congo */
};
/*
@ -717,6 +890,11 @@ _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capac
} else {
/* resolve deprecated */
for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
// 2-letter deprecated subtags are listed before 3-letter
// ones in DEPRECATEDLANGS[]. If the input is a 2-letter code,
// get out of the loop on coming across the 1st 3-letter subtag
// to avoid continuing to try when there's no match.
if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break;
if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) {
uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]);
len = (int32_t)uprv_strlen(buf);
@ -763,7 +941,6 @@ _appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
*(appendAt + reslen) = SEP;
}
reslen++;
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen));
}
@ -805,6 +982,14 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
*(appendAt + reslen) = SEP;
}
reslen++;
/* resolve deprecated */
for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
len = (int32_t)uprv_strlen(buf);
break;
}
}
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen));
@ -1916,7 +2101,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
}
/* check if the tag is grandfathered */
for (i = 0; GRANDFATHERED[i] != NULL; i += 2) {
for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) {
if (uprv_stricmp(GRANDFATHERED[i], tagBuf) == 0) {
int32_t newTagLength;
@ -1938,6 +2123,37 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
}
}
size_t parsedLenDelta = 0;
if (grandfatheredLen == 0) {
for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
const char* redundantTag = REDUNDANT[i];
size_t redundantTagLen = uprv_strlen(redundantTag);
// The preferred tag for a redundant tag is always shorter than redundant
// tag. A redundant tag may or may not be followed by other subtags.
// (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
if (uprv_strnicmp(redundantTag, tagBuf, redundantTagLen) == 0) {
const char* redundantTagEnd = tagBuf + redundantTagLen;
if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) {
const char* preferredTag = REDUNDANT[i + 1];
size_t preferredTagLen = uprv_strlen(preferredTag);
uprv_strncpy(t->buf, preferredTag, preferredTagLen);
if (*redundantTagEnd == SEP) {
uprv_memmove(tagBuf + preferredTagLen,
redundantTagEnd,
tagLen - redundantTagLen + 1);
} else {
tagBuf[preferredTagLen] = '\0';
}
// parsedLen should be the length of the input
// before redundantTag is replaced by preferredTag.
// Save the delta to add it back later.
parsedLenDelta = redundantTagLen - preferredTagLen;
break;
}
}
}
}
/*
* langtag = language
* ["-" script]
@ -1978,6 +2194,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
if (next & LANG) {
if (_isLanguageSubtag(pSubtag, subtagLen)) {
*pSep = 0; /* terminate */
// TODO: move deprecated language code handling here.
t->language = T_CString_toLowerCase(pSubtag);
pLastGoodPosition = pSep;
@ -2024,6 +2241,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
if (next & REGN) {
if (_isRegionSubtag(pSubtag, subtagLen)) {
*pSep = 0;
// TODO: move deprecated region code handling here.
t->region = T_CString_toUpperCase(pSubtag);
pLastGoodPosition = pSep;
@ -2220,7 +2438,8 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
}
if (parsedLen != NULL) {
*parsedLen = (grandfatheredLen > 0) ? grandfatheredLen : (int32_t)(pLastGoodPosition - t->buf);
*parsedLen = (grandfatheredLen > 0) ? grandfatheredLen :
(int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
}
return t;

View file

@ -253,6 +253,7 @@ void addLocaleTest(TestNode** root)
TESTCASE(TestToLanguageTag);
TESTCASE(TestForLanguageTag);
TESTCASE(TestInvalidLanguageTag);
TESTCASE(TestLangAndRegionCanonicalize);
TESTCASE(TestTrailingNull);
TESTCASE(TestUnicodeDefines);
TESTCASE(TestEnglishExemplarCharacters);
@ -6036,6 +6037,7 @@ static const struct {
{"art-lojban", "jbo", FULL_LENGTH},
{"zh-hakka", "hak", FULL_LENGTH},
{"zh-cmn-CH", "cmn_CH", FULL_LENGTH},
{"zh-cmn-CH-u-co-pinyin", "cmn_CH@collation=pinyin", FULL_LENGTH},
{"xxx-yy", "xxx_YY", FULL_LENGTH},
{"fr-234", "fr_234", FULL_LENGTH},
{"i-default", "en@x=i-default", FULL_LENGTH},
@ -6093,7 +6095,15 @@ static const struct {
{"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", FULL_LENGTH},
{"de-latn-DE-1901-u-co-phonebk-co-pinyin-ca-gregory", "de_Latn_DE_1901@calendar=gregorian;collation=phonebook", FULL_LENGTH},
{"th-u-kf-nu-thai-kf-false", "th@colcasefirst=yes;numbers=thai", FULL_LENGTH},
{NULL, NULL, 0}
/* #9562 IANA language tag data update */
{"en-gb-oed", "en_GB_OXENDICT", FULL_LENGTH},
{"i-navajo", "nv", FULL_LENGTH},
{"i-navajo-a-foo", "", 0},
{"i-navajo-latn-us", "", 0},
{"sgn-br", "bzs", FULL_LENGTH},
{"sgn-br-u-co-phonebk", "bzs@collation=phonebook", FULL_LENGTH},
{"ja-latn-hepburn-heploc", "ja_Latn__ALALC97", FULL_LENGTH},
{"ja-latn-hepburn-heploc-u-ca-japanese", "ja_Latn__ALALC97@calendar=japanese", FULL_LENGTH},
};
static void TestForLanguageTag(void) {
@ -6103,7 +6113,7 @@ static void TestForLanguageTag(void) {
int32_t parsedLen;
int32_t expParsedLen;
for (i = 0; langtag_to_locale[i].bcpID != NULL; i++) {
for (i = 0; i < UPRV_LENGTHOF(langtag_to_locale); i++) {
status = U_ZERO_ERROR;
locale[0] = 0;
expParsedLen = langtag_to_locale[i].len;
@ -6156,6 +6166,43 @@ static void TestInvalidLanguageTag(void) {
}
}
/*
 * Test data for TestLangAndRegionCanonicalize: each entry pairs an input
 * language tag with the tag expected back after it is converted to a locale
 * ID and then to a language tag again.
 */
static const struct {
    const char *input;      /* language tag fed to the round trip */
    const char *canonical;  /* language tag expected from the round trip */
} langtag_to_canonical[] = {
    /* region DD -> DE, without and with a -u- extension */
    { "de-DD",                    "de-DE" },
    { "de-DD-u-co-phonebk",       "de-DE-u-co-phonebk" },

    /* 2-letter language jw -> jv, without and with a -u- extension */
    { "jw-id",                    "jv-ID" },
    { "jw-id-u-ca-islamic-civil", "jv-ID-u-ca-islamic-civil" },

    /* 2-letter language mo -> ro */
    { "mo-md",                    "ro-MD" },

    /* region BU -> MM, followed by a -u- extension */
    { "my-bu-u-nu-mymr",          "my-MM-u-nu-mymr" },

    /* 3-letter language yuu -> yug */
    { "yuu-ru",                   "yug-RU" },
};
static void TestLangAndRegionCanonicalize(void) {
char locale[256];
char canonical[256];
int32_t i;
UErrorCode status;
for (i = 0; i < UPRV_LENGTHOF(langtag_to_canonical); i++) {
status = U_ZERO_ERROR;
const char* input = langtag_to_canonical[i].input;
uloc_forLanguageTag(input, locale, sizeof(locale), NULL, &status);
uloc_toLanguageTag(locale, canonical, sizeof(canonical), TRUE, &status);
if (U_FAILURE(status)) {
log_err_status(status, "Error returned by uloc_forLanguageTag or uloc_toLanguageTag "
"for language tag [%s] - error: %s\n", input, u_errorName(status));
} else {
const char* expected_canonical = langtag_to_canonical[i].canonical;
if (uprv_strcmp(expected_canonical, canonical) != 0) {
log_data_err("input language tag [%s] is canonicalized to [%s] - expected: [%s]\n",
input, canonical, expected_canonical);
}
}
}
}
static void TestToUnicodeLocaleKey(void)
{
/* $IN specifies the result should be the input pointer itself */

View file

@ -126,6 +126,7 @@ static void TestLikelySubtags(void);
static void TestForLanguageTag(void);
static void TestInvalidLanguageTag(void);
static void TestToLanguageTag(void);
static void TestLangAndRegionCanonicalize(void);
static void TestToUnicodeLocaleKey(void);
static void TestToLegacyKey(void);

View file

@ -2950,7 +2950,7 @@ void LocaleTest::TestForLanguageTag() {
static const char tag_no_nul[] = { 'e', 'n', '-', 'G', 'B' };
static const Locale loc_en("en_US");
static const Locale loc_oed("en_GB@x=oed");
static const Locale loc_oed("en_GB_OXENDICT");
static const Locale loc_af("af@calendar=coptic;t=ar-i0-handwrit;x=foo");
static const Locale loc_null("");
static const Locale loc_gb("en_GB");