diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index 2d6a9213c3d..c718de712cc 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ -79,19 +79,34 @@ static const char LOCALE_TYPE_YES[] = "yes"; #define LANG_UND_LEN 3 +/* + Updated on 2018-09-12 from + https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . + + This table has 2 parts. The parts for Grandfathered tags is generated by the + following scripts from the IANA language tag registry. + + curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\ + egrep -A 7 'Type: grandfathered' | \ + egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \ + awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\ + tr 'A-Z' 'a-z' + + + The 2nd part is made of five ICU-specific entries. They're kept for + the backward compatibility for now, even though there are no preferred + values. They may have to be removed for the strict BCP 47 compliance. + +*/ static const char* const GRANDFATHERED[] = { /* grandfathered preferred */ "art-lojban", "jbo", - "cel-gaulish", "xtg-x-cel-gaulish", - "en-GB-oed", "en-GB-x-oed", + "en-gb-oed", "en-gb-oxendict", "i-ami", "ami", "i-bnn", "bnn", - "i-default", "en-x-i-default", - "i-enochian", "und-x-i-enochian", "i-hak", "hak", "i-klingon", "tlh", "i-lux", "lb", - "i-mingo", "see-x-i-mingo", "i-navajo", "nv", "i-pwn", "pwn", "i-tao", "tao", @@ -104,17 +119,175 @@ static const char* const GRANDFATHERED[] = { "sgn-ch-de", "sgg", "zh-guoyu", "cmn", "zh-hakka", "hak", - "zh-min", "nan-x-zh-min", "zh-min-nan", "nan", "zh-xiang", "hsn", - NULL, NULL + + // Grandfathered tags with no preferred value in the IANA + // registry. Kept for now for the backward compatibility + // because ICU has mapped them this way. 
+ "cel-gaulish", "xtg-x-cel-gaulish", + "i-default", "en-x-i-default", + "i-enochian", "und-x-i-enochian", + "i-mingo", "see-x-i-mingo", + "zh-min", "nan-x-zh-min", }; +/* + Updated on 2018-09-12 from + https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . + + The table lists redundant tags with preferred value in the IANA language tag registry. + It's generated with the following command: + + curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\ + grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \ + awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \ + tr 'A-Z' 'a-z' + + In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because + a variant tag 'hepburn-heploc' has the preferred subtag, 'alalc97'. +*/ + +static const char* const REDUNDANT[] = { +// redundant preferred + "sgn-br", "bzs", + "sgn-co", "csn", + "sgn-de", "gsg", + "sgn-dk", "dsl", + "sgn-es", "ssp", + "sgn-fr", "fsl", + "sgn-gb", "bfi", + "sgn-gr", "gss", + "sgn-ie", "isg", + "sgn-it", "ise", + "sgn-jp", "jsl", + "sgn-mx", "mfs", + "sgn-ni", "ncs", + "sgn-nl", "dse", + "sgn-no", "nsl", + "sgn-pt", "psr", + "sgn-se", "swl", + "sgn-us", "ase", + "sgn-za", "sfs", + "zh-cmn", "cmn", + "zh-cmn-hans", "cmn-hans", + "zh-cmn-hant", "cmn-hant", + "zh-gan", "gan", + "zh-wuu", "wuu", + "zh-yue", "yue", + + // variant tag with preferred value + "ja-latn-hepburn-heploc", "ja-latn-alalc97", +}; + +/* + Updated on 2018-09-12 from + https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . + + grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \ + grep -B1 'Preferred' | grep -v '^--' | \ + awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' + + Make sure that 2-letter language subtags come before 3-letter subtags. 
+*/ static const char DEPRECATEDLANGS[][4] = { /* deprecated new */ + "in", "id", "iw", "he", "ji", "yi", - "in", "id" + "jw", "jv", + "mo", "ro", + "aam", "aas", + "adp", "dz", + "aue", "ktz", + "ayx", "nun", + "bgm", "bcg", + "bjd", "drl", + "ccq", "rki", + "cjr", "mom", + "cka", "cmr", + "cmk", "xch", + "coy", "pij", + "cqu", "quh", + "drh", "khk", + "drw", "prs", + "gav", "dev", + "gfx", "vaj", + "ggn", "gvr", + "gti", "nyc", + "guv", "duz", + "hrr", "jal", + "ibi", "opa", + "ilw", "gal", + "jeg", "oyb", + "kgc", "tdf", + "kgh", "kml", + "koj", "kwv", + "krm", "bmf", + "ktr", "dtp", + "kvs", "gdj", + "kwq", "yam", + "kxe", "tvd", + "kzj", "dtp", + "kzt", "dtp", + "lii", "raq", + "lmm", "rmx", + "meg", "cir", + "mst", "mry", + "mwj", "vaj", + "myt", "mry", + "nad", "xny", + "ncp", "kdz", + "nnx", "ngv", + "nts", "pij", + "oun", "vaj", + "pcr", "adx", + "pmc", "huw", + "pmu", "phr", + "ppa", "bfy", + "ppr", "lcq", + "pry", "prt", + "puz", "pub", + "sca", "hle", + "skk", "oyb", + "tdu", "dtp", + "thc", "tpo", + "thx", "oyb", + "tie", "ras", + "tkk", "twm", + "tlw", "weo", + "tmp", "tyj", + "tne", "kak", + "tnf", "prs", + "tsf", "taj", + "uok", "ema", + "xba", "cax", + "xia", "acn", + "xkh", "waw", + "xsj", "suj", + "ybd", "rki", + "yma", "lrr", + "ymt", "mtm", + "yos", "zom", + "yuu", "yug", +}; + +/* + Updated on 2018-04-24 from + + curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \ + grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \ + grep -B1 'Preferred' | \ + awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' +*/ +static const char DEPRECATEDREGIONS[][3] = { +/* deprecated new */ + "BU", "MM", + "DD", "DE", + "FX", "FR", + "TP", "TL", + "YD", "YE", + "ZR", "CD", }; /* @@ -717,6 +890,11 @@ _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capac } else { /* resolve deprecated */ for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) { + // 2-letter deprecated 
subtags are listed before 3-letter + // ones in DEPRECATEDLANGS[]. Get out of loop on coming + // across the 1st 3-letter subtag, if the input is a 2-letter code, + // to avoid continuing to try when there's no match. + if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break; if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) { uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]); len = (int32_t)uprv_strlen(buf); @@ -763,7 +941,6 @@ _appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacit *(appendAt + reslen) = SEP; } reslen++; - if (reslen < capacity) { uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); } @@ -805,6 +982,14 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit *(appendAt + reslen) = SEP; } reslen++; + /* resolve deprecated */ + for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) { + if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) { + uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]); + len = (int32_t)uprv_strlen(buf); + break; + } + } if (reslen < capacity) { uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); @@ -1916,7 +2101,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta } /* check if the tag is grandfathered */ - for (i = 0; GRANDFATHERED[i] != NULL; i += 2) { + for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) { if (uprv_stricmp(GRANDFATHERED[i], tagBuf) == 0) { int32_t newTagLength; @@ -1938,6 +2123,37 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta } } + size_t parsedLenDelta = 0; + if (grandfatheredLen == 0) { + for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) { + const char* redundantTag = REDUNDANT[i]; + size_t redundantTagLen = uprv_strlen(redundantTag); + // The preferred tag for a redundant tag is always shorter than redundant + // tag. A redundant tag may or may not be followed by other subtags. + // (i.e. 
"zh-yue" or "zh-yue-u-co-pinyin"). + if (uprv_strnicmp(redundantTag, tagBuf, redundantTagLen) == 0) { + const char* redundantTagEnd = tagBuf + redundantTagLen; + if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) { + const char* preferredTag = REDUNDANT[i + 1]; + size_t preferredTagLen = uprv_strlen(preferredTag); + uprv_strncpy(t->buf, preferredTag, preferredTagLen); + if (*redundantTagEnd == SEP) { + uprv_memmove(tagBuf + preferredTagLen, + redundantTagEnd, + tagLen - redundantTagLen + 1); + } else { + tagBuf[preferredTagLen] = '\0'; + } + // parsedLen should be the length of the input + // before redundantTag is replaced by preferredTag. + // Save the delta to add it back later. + parsedLenDelta = redundantTagLen - preferredTagLen; + break; + } + } + } + } + /* * langtag = language * ["-" script] @@ -1978,6 +2194,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta if (next & LANG) { if (_isLanguageSubtag(pSubtag, subtagLen)) { *pSep = 0; /* terminate */ + // TODO: move deprecated language code handling here. t->language = T_CString_toLowerCase(pSubtag); pLastGoodPosition = pSep; @@ -2024,6 +2241,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta if (next & REGN) { if (_isRegionSubtag(pSubtag, subtagLen)) { *pSep = 0; + // TODO: move deprecated region code handling here. t->region = T_CString_toUpperCase(pSubtag); pLastGoodPosition = pSep; @@ -2220,7 +2438,8 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta } if (parsedLen != NULL) { - *parsedLen = (grandfatheredLen > 0) ? grandfatheredLen : (int32_t)(pLastGoodPosition - t->buf); + *parsedLen = (grandfatheredLen > 0) ? 
grandfatheredLen : + (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta); } return t; diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 1d1805196f2..63e7269aa48 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -253,6 +253,7 @@ void addLocaleTest(TestNode** root) TESTCASE(TestToLanguageTag); TESTCASE(TestForLanguageTag); TESTCASE(TestInvalidLanguageTag); + TESTCASE(TestLangAndRegionCanonicalize); TESTCASE(TestTrailingNull); TESTCASE(TestUnicodeDefines); TESTCASE(TestEnglishExemplarCharacters); @@ -6036,6 +6037,7 @@ static const struct { {"art-lojban", "jbo", FULL_LENGTH}, {"zh-hakka", "hak", FULL_LENGTH}, {"zh-cmn-CH", "cmn_CH", FULL_LENGTH}, + {"zh-cmn-CH-u-co-pinyin", "cmn_CH@collation=pinyin", FULL_LENGTH}, {"xxx-yy", "xxx_YY", FULL_LENGTH}, {"fr-234", "fr_234", FULL_LENGTH}, {"i-default", "en@x=i-default", FULL_LENGTH}, @@ -6093,7 +6095,15 @@ static const struct { {"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", FULL_LENGTH}, {"de-latn-DE-1901-u-co-phonebk-co-pinyin-ca-gregory", "de_Latn_DE_1901@calendar=gregorian;collation=phonebook", FULL_LENGTH}, {"th-u-kf-nu-thai-kf-false", "th@colcasefirst=yes;numbers=thai", FULL_LENGTH}, - {NULL, NULL, 0} + /* #9562 IANA language tag data update */ + {"en-gb-oed", "en_GB_OXENDICT", FULL_LENGTH}, + {"i-navajo", "nv", FULL_LENGTH}, + {"i-navajo-a-foo", "", 0}, + {"i-navajo-latn-us", "", 0}, + {"sgn-br", "bzs", FULL_LENGTH}, + {"sgn-br-u-co-phonebk", "bzs@collation=phonebook", FULL_LENGTH}, + {"ja-latn-hepburn-heploc", "ja_Latn__ALALC97", FULL_LENGTH}, + {"ja-latn-hepburn-heploc-u-ca-japanese", "ja_Latn__ALALC97@calendar=japanese", FULL_LENGTH}, }; static void TestForLanguageTag(void) { @@ -6103,7 +6113,7 @@ static void TestForLanguageTag(void) { int32_t parsedLen; int32_t expParsedLen; - for (i = 0; langtag_to_locale[i].bcpID != NULL; i++) { + for (i = 0; i < UPRV_LENGTHOF(langtag_to_locale); i++) 
{ status = U_ZERO_ERROR; locale[0] = 0; expParsedLen = langtag_to_locale[i].len; @@ -6156,6 +6166,43 @@ static void TestInvalidLanguageTag(void) { } } +static const struct { + const char *input; + const char *canonical; +} langtag_to_canonical[] = { + {"de-DD", "de-DE"}, + {"de-DD-u-co-phonebk", "de-DE-u-co-phonebk"}, + {"jw-id", "jv-ID"}, + {"jw-id-u-ca-islamic-civil", "jv-ID-u-ca-islamic-civil"}, + {"mo-md", "ro-MD"}, + {"my-bu-u-nu-mymr", "my-MM-u-nu-mymr"}, + {"yuu-ru", "yug-RU"}, +}; + + +static void TestLangAndRegionCanonicalize(void) { + char locale[256]; + char canonical[256]; + int32_t i; + UErrorCode status; + for (i = 0; i < UPRV_LENGTHOF(langtag_to_canonical); i++) { + status = U_ZERO_ERROR; + const char* input = langtag_to_canonical[i].input; + uloc_forLanguageTag(input, locale, sizeof(locale), NULL, &status); + uloc_toLanguageTag(locale, canonical, sizeof(canonical), TRUE, &status); + if (U_FAILURE(status)) { + log_err_status(status, "Error returned by uloc_forLanguageTag or uloc_toLanguageTag " + "for language tag [%s] - error: %s\n", input, u_errorName(status)); + } else { + const char* expected_canonical = langtag_to_canonical[i].canonical; + if (uprv_strcmp(expected_canonical, canonical) != 0) { + log_data_err("input language tag [%s] is canonicalized to [%s] - expected: [%s]\n", + input, canonical, expected_canonical); + } + } + } +} + static void TestToUnicodeLocaleKey(void) { /* $IN specifies the result should be the input pointer itself */ diff --git a/icu4c/source/test/cintltst/cloctst.h b/icu4c/source/test/cintltst/cloctst.h index a2ce892ec23..411ff5c9fe5 100644 --- a/icu4c/source/test/cintltst/cloctst.h +++ b/icu4c/source/test/cintltst/cloctst.h @@ -126,6 +126,7 @@ static void TestLikelySubtags(void); static void TestForLanguageTag(void); static void TestInvalidLanguageTag(void); static void TestToLanguageTag(void); +static void TestLangAndRegionCanonicalize(void); static void TestToUnicodeLocaleKey(void); static void TestToLegacyKey(void); 
diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index 4c608febeba..ae841189784 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -2950,7 +2950,7 @@ void LocaleTest::TestForLanguageTag() { static const char tag_no_nul[] = { 'e', 'n', '-', 'G', 'B' }; static const Locale loc_en("en_US"); - static const Locale loc_oed("en_GB@x=oed"); + static const Locale loc_oed("en_GB_OXENDICT"); static const Locale loc_af("af@calendar=coptic;t=ar-i0-handwrit;x=foo"); static const Locale loc_null(""); static const Locale loc_gb("en_GB");