ICU-21406 canonicalize -T- extension

See #1491
This commit is contained in:
Frank Tang 2020-12-08 00:03:52 +00:00 committed by Frank Yung-Fong Tang
parent 32ba41e22f
commit 5f8df0d288
7 changed files with 251 additions and 22 deletions

View file

@ -1171,7 +1171,12 @@ private:
bool replaceVariant(UErrorCode& status);
// Replace by using subdivisionAlias.
bool replaceSubdivision(CharString& subdivision, UErrorCode& status);
bool replaceSubdivision(StringPiece subdivision,
CharString& output, UErrorCode& status);
// Replace transformed extensions.
bool replaceTransformedExtensions(
CharString& transformedExtensions, CharString& output, UErrorCode& status);
};
CharString&
@ -1503,7 +1508,8 @@ AliasReplacer::replaceVariant(UErrorCode& status)
}
bool
AliasReplacer::replaceSubdivision(CharString& subdivision, UErrorCode& status)
AliasReplacer::replaceSubdivision(
StringPiece subdivision, CharString& output, UErrorCode& status)
{
if (U_FAILURE(status)) {
return false;
@ -1516,13 +1522,84 @@ AliasReplacer::replaceSubdivision(CharString& subdivision, UErrorCode& status)
(firstSpace - replacement) : uprv_strlen(replacement);
// Ignore len == 2, see CLDR-14312
if (3 <= len && len <= 8) {
subdivision.clear().append(replacement, (int32_t)len, status);
output.append(replacement, (int32_t)len, status);
}
return true;
}
return false;
}
// Canonicalize the value of a "t" (transformed content) extension.
//
// The value is an optional tlang followed by zero or more "tkey-tvalue"
// tfields. The tlang part is canonicalized as a language tag and lowercased;
// the tfields are sorted by tkey and each tvalue is mapped through
// ulocimp_toBcpType() (kept unchanged when no mapping exists).
//
// @param transformedExtensions The extension value. Its buffer is modified
//        in place: separators are overwritten with NUL while splitting.
// @param output Receives the canonicalized value (appended to).
// @param status Set to U_ILLEGAL_ARGUMENT_ERROR if a tkey is not followed
//        by a tvalue; any failure from the called ICU services is passed on.
// @return true on success, false if status indicates a failure.
bool
AliasReplacer::replaceTransformedExtensions(
    CharString& transformedExtensions, CharString& output, UErrorCode& status)
{
    // The content of the transformedExtensions will be modified in this
    // function to NUL-terminated (tkey-tvalue) pairs.
    if (U_FAILURE(status)) {
        return false;
    }
    const int32_t len = transformedExtensions.length();
    const char* str = transformedExtensions.data();
    const char* tkey = ultag_getTKeyStart(str);
    // Length of the tlang part: everything before the first tkey (minus the
    // separator), or the whole value when there is no tkey at all.
    int32_t tlangLen = 0;
    if (tkey == nullptr) {
        tlangLen = len;
    } else if (tkey != str) {
        tlangLen = static_cast<int32_t>(tkey - str - 1);
    }
    CharStringByteSink sink(&output);
    if (tlangLen > 0) {
        Locale tlang = LocaleBuilder()
            .setLanguageTag(StringPiece(str, tlangLen))
            .build(status);
        tlang.canonicalize(status);
        tlang.toLanguageTag(sink, status);
        if (U_FAILURE(status)) {
            return false;
        }
        T_CString_toLowerCase(output.data());
    }
    if (tkey != nullptr) {
        // We need to sort the tfields by tkey
        UVector tfields(status);
        if (U_FAILURE(status)) {
            return false;
        }
        do {
            const char* tvalue = uprv_strchr(tkey, '-');
            if (tvalue == nullptr) {
                // A tkey without a tvalue is ill-formed. Bail out right away:
                // continuing would hand a null pointer to ultag_getTKeyStart().
                status = U_ILLEGAL_ARGUMENT_ERROR;
                return false;
            }
            const char* nextTKey = ultag_getTKeyStart(tvalue);
            if (nextTKey != nullptr) {
                // NUL terminate tvalue. The bytes belong to the non-const
                // transformedExtensions buffer, so writing through them is OK.
                *const_cast<char*>(nextTKey - 1) = '\0';
            }
            tfields.insertElementAt(
                const_cast<char*>(tkey), tfields.size(), status);
            if (U_FAILURE(status)) {
                return false;
            }
            tkey = nextTKey;
        } while (tkey != nullptr);
        tfields.sort([](UElement e1, UElement e2) -> int8_t {
            // strcmp only guarantees the sign of its result; reduce it to
            // -1/0/1 so the narrowing to int8_t cannot flip or lose the sign.
            const int cmp = uprv_strcmp(
                static_cast<const char*>(e1.pointer),
                static_cast<const char*>(e2.pointer));
            return static_cast<int8_t>((cmp > 0) - (cmp < 0));
        }, status);
        for (int32_t i = 0; i < tfields.size(); i++) {
            if (output.length() > 0) {
                output.append('-', status);
            }
            const char* tfield = static_cast<const char*>(tfields.elementAt(i));
            // Split the "tkey-tvalue" pair string so that we can canonicalize the tvalue.
            char* tvalue = const_cast<char*>(uprv_strchr(tfield, '-'));
            if (tvalue == nullptr) {
                // Happens when a tkey is immediately followed by another tkey:
                // the separator was already overwritten above, leaving a bare
                // tkey. Report it instead of writing through a null pointer.
                status = U_ILLEGAL_ARGUMENT_ERROR;
                return false;
            }
            *tvalue++ = '\0';  // NUL terminate tkey
            output.append(tfield, status).append('-', status);
            const char* bcpTValue = ulocimp_toBcpType(tfield, tvalue, nullptr, nullptr);
            output.append((bcpTValue == nullptr) ? tvalue : bcpTValue, status);
        }
    }
    return U_SUCCESS(status);
}
CharString&
AliasReplacer::outputToString(
CharString& out, UErrorCode status)
@ -1663,7 +1740,8 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
if (U_SUCCESS(status) && !iter.isNull()) {
const char* key;
while ((key = iter->next(nullptr, status)) != nullptr) {
if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0) {
if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0 ||
uprv_strcmp("t", key) == 0) {
CharString value;
CharStringByteSink valueSink(&value);
locale.getKeywordValue(key, valueSink, status);
@ -1671,10 +1749,19 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
status = U_ZERO_ERROR;
continue;
}
if (replaceSubdivision(value, status)) {
changed++;
CharString replacement;
if (uprv_strlen(key) == 2) {
if (replaceSubdivision(value.toStringPiece(), replacement, status)) {
changed++;
temp.setKeywordValue(key, replacement.data(), status);
}
} else {
U_ASSERT(uprv_strcmp(key, "t") == 0);
if (replaceTransformedExtensions(value, replacement, status)) {
changed++;
temp.setKeywordValue(key, replacement.data(), status);
}
}
temp.setKeywordValue(key, value.data(), status);
if (U_FAILURE(status)) {
return false;
}
@ -1691,7 +1778,6 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
}
// If the tag is not changed, return.
if (uprv_strcmp(out.data(), locale.getName()) == 0) {
U_ASSERT(changed == 0);
out.clear();
return false;
}

View file

@ -646,6 +646,22 @@ _isTKey(const char* s, int32_t len)
return FALSE;
}
/*
 * Returns a pointer to the first SEP-delimited subtag of localeID that
 * _isTKey() accepts, or nullptr if there is none (or localeID is nullptr).
 * The returned pointer points into localeID; nothing is copied or modified.
 */
U_CAPI const char * U_EXPORT2
ultag_getTKeyStart(const char *localeID) {
    if (localeID == nullptr) {
        // Callers may pass on the result of a failed search.
        return nullptr;
    }
    const char *subtag = localeID;
    const char *sep;
    while ((sep = uprv_strchr(subtag, SEP)) != nullptr) {
        if (_isTKey(subtag, static_cast<int32_t>(sep - subtag))) {
            return subtag;
        }
        subtag = sep + 1;
    }
    // Last (or only) subtag: it runs up to the terminating NUL.
    // NOTE(review): assumes _isTKey treats a negative length as
    // "measure the NUL-terminated string" - confirm against _isTKey.
    return _isTKey(subtag, -1) ? subtag : nullptr;
}
static UBool
_isTValue(const char* s, int32_t len)
{

View file

@ -286,6 +286,9 @@ ultag_isUnicodeLocaleType(const char* s, int32_t len);
U_CFUNC UBool
ultag_isVariantSubtags(const char* s, int32_t len);
/**
 * Scans the separator-delimited subtags of localeID from left to right and
 * returns a pointer (into localeID) to the first subtag that is a tkey,
 * or nullptr when no subtag qualifies.
 */
U_CAPI const char * U_EXPORT2
ultag_getTKeyStart(const char *localeID);
U_CFUNC const char*
ulocimp_toBcpKey(const char* key);

View file

@ -4935,6 +4935,36 @@ void LocaleTest::TestCanonicalize(void)
// ICU-21401
{ "cel-gaulish", "xtg"},
// ICU-21406
// Inside T extension
// Case of Script and Region
{ "ja-kana-jp-t-it-latn-it", "ja-Kana-JP-t-it-latn-it"},
{ "und-t-zh-hani-tw", "und-t-zh-hani-tw"},
{ "und-cyrl-t-und-Latn", "und-Cyrl-t-und-latn"},
// Order of singleton
{ "und-u-ca-roc-t-zh", "und-t-zh-u-ca-roc"},
// Variant subtags are alphabetically ordered.
{ "sl-t-sl-rozaj-biske-1994", "sl-t-sl-1994-biske-rozaj"},
// tfield subtags are alphabetically ordered.
// (Also tests subtag case normalisation.)
{ "DE-T-lv-M0-DIN", "de-t-lv-m0-din"},
{ "DE-T-M0-DIN-K0-QWERTZ", "de-t-k0-qwertz-m0-din"},
{ "DE-T-lv-M0-DIN-K0-QWERTZ", "de-t-lv-k0-qwertz-m0-din"},
// "true" tvalue subtags aren't removed.
// (UTS 35 version 36, §3.2.1 claims otherwise, but tkey must be followed by
// tvalue, so that's likely a spec bug in UTS 35.)
{ "en-t-m0-true", "en-t-m0-true"},
// tlang subtags are canonicalised.
{ "en-t-iw", "en-t-he"},
{ "en-t-hy-latn-SU", "en-t-hy-latn-am"},
{ "ru-t-ru-cyrl-SU", "ru-t-ru-cyrl-ru"},
{ "fr-t-fr-172", "fr-t-fr-ru"},
{ "und-t-no-latn-BOKMAL", "und-t-nb-latn" },
{ "und-t-sgn-qAAi-NL", "und-t-dse-zinh" },
// alias of tvalue should be replaced
{ "en-t-m0-NaMeS", "en-t-m0-prprname" },
{ "en-t-s0-ascii-d0-NaMe", "en-t-d0-charname-s0-ascii" },
};
int32_t i;
for (i=0; i < UPRV_LENGTHOF(testCases); i++) {

View file

@ -595,6 +595,12 @@ public class LanguageTag {
return false;
}
/**
 * Tells whether the given subtag has the shape of a tkey:
 * exactly two characters, a letter followed by a digit.
 */
public static boolean isTKey(String s) {
    // tkey = alpha digit ;
    if (s.length() != 2) {
        return false;
    }
    final char first = s.charAt(0);
    final char second = s.charAt(1);
    return AsciiUtil.isAlpha(first) && AsciiUtil.isNumeric(second);
}
public static boolean isExtensionSingleton(String s) {
// singleton = DIGIT ; 0 - 9
// / %x41-57 ; A - W
@ -657,18 +663,20 @@ public class LanguageTag {
public static String canonicalizeExtension(String s) {
s = AsciiUtil.toLowerString(s);
int found;
while (s.endsWith("-true")) {
s = s.substring(0, s.length() - 5); // length of "-true" is 5
}
while ((found = s.indexOf("-true-")) > 0) {
s = s.substring(0, found) + s.substring(found + 5); // length of "-true" is 5
}
while (s.endsWith("-yes")) {
s = s.substring(0, s.length() - 4); // length of "-yes" is 4
}
while ((found = s.indexOf("-yes-")) > 0) {
s = s.substring(0, found) + s.substring(found + 4); // length of "-yes" is 5
if (s.startsWith("u-")) {
int found;
while (s.endsWith("-true")) {
s = s.substring(0, s.length() - 5); // length of "-true" is 5
}
while ((found = s.indexOf("-true-")) > 0) {
s = s.substring(0, found) + s.substring(found + 5); // length of "-true" is 5
}
while (s.endsWith("-yes")) {
s = s.substring(0, s.length() - 4); // length of "-yes" is 4
}
while ((found = s.indexOf("-yes-")) > 0) {
s = s.substring(0, found) + s.substring(found + 4); // length of "-yes" is 5
}
}
return s;
}

View file

@ -1279,9 +1279,11 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
Iterator<String> keywords = temp.getKeywords();
while (keywords != null && keywords.hasNext()) {
String key = keywords.next();
if (key.equals("rg") || key.equals("sd")) {
if (key.equals("rg") || key.equals("sd") || key.equals("t")) {
String value = temp.getKeywordValue(key);
String replacement = replaceSubdivision(value);
String replacement = key.equals("t") ?
replaceTransformedExtensions(value) :
replaceSubdivision(value);
if (replacement != null) {
temp = temp.setKeywordValue(key, replacement);
keywordChanged = true;
@ -1636,6 +1638,58 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
return subdivisionAliasMap.get(subdivision);
}
/**
 * Canonicalizes the value of a "t" (transformed content) extension.
 *
 * The value is an optional tlang followed by zero or more tfields
 * (a tkey followed by its tvalue subtags). The tlang is canonicalized as
 * a language tag and lowercased; each tvalue subtag is mapped through
 * toUnicodeLocaleType; the tfields are emitted in sorted order.
 *
 * @param extensions the extension value, without the leading "t" singleton
 * @return the canonicalized value; never null
 */
private String replaceTransformedExtensions(String extensions) {
    StringBuilder builder = new StringBuilder();
    List<String> tfields = new ArrayList<>();
    int processedLength = 0;
    // 0 = no tkey seen yet; -1 = first subtag is a tkey (no tlang);
    // > 0 = length of the tlang that precedes the first tkey.
    int tlangLength = 0;
    String tkey = "";
    for (String subtag : extensions.split(LanguageTag.SEP)) {
        if (LanguageTag.isTKey(subtag)) {
            if (tlangLength == 0) {
                // Found the first tkey. Record the total length of the preceding
                // tlang subtags. -1 if there is no tlang before the first tkey.
                tlangLength = processedLength - 1;
            }
            if (builder.length() > 0) {
                // Finish & store the previous tkey with its tvalue subtags.
                tfields.add(builder.toString());
                builder.setLength(0);
            }
            // Start collecting subtags for this new tkey.
            tkey = subtag;
            builder.append(subtag);
        } else if (tlangLength != 0) {
            // A tvalue subtag. Keep it unchanged when there is no mapping,
            // rather than appending the literal text "null".
            String type = toUnicodeLocaleType(tkey, subtag);
            builder.append(LanguageTag.SEP).append(type != null ? type : subtag);
        }
        processedLength += subtag.length() + 1;
    }
    if (builder.length() > 0) {
        // Finish & store the previous=last tkey with its tvalue subtags.
        tfields.add(builder.toString());
        builder.setLength(0);
    }
    String tlang = (tlangLength > 0) ? extensions.substring(0, tlangLength) :
        (tfields.isEmpty() ? extensions : "");
    if (tlang.length() > 0) {
        // Canonicalize only the tlang part, not the whole extension value,
        // so the result does not depend on the tag parser skipping the tfields.
        String canonicalized = ULocale.createCanonical(
            ULocale.forLanguageTag(tlang)).toLanguageTag();
        builder.append(AsciiUtil.toLowerString(canonicalized));
    }
    if (!tfields.isEmpty()) {
        if (builder.length() > 0) {
            builder.append(LanguageTag.SEP);
        }
        // tfields are sorted by alphabetical order of their keys
        Collections.sort(tfields);
        builder.append(Utility.joinStrings(LanguageTag.SEP, tfields));
    }
    return builder.toString();
}
};
/**

View file

@ -5233,6 +5233,38 @@ public class ULocaleTest extends TestFmwk {
// ICU-21401
Assert.assertEquals("xtg", canonicalTag("cel-gaulish"));
// ICU-21406
// Inside T extension
// Case of Script and Region
Assert.assertEquals("ja-Kana-JP-t-it-latn-it", canonicalTag("ja-kana-jp-t-it-latn-it"));
Assert.assertEquals("und-t-zh-hani-tw", canonicalTag("und-t-zh-hani-tw"));
Assert.assertEquals("und-Cyrl-t-und-latn", canonicalTag("und-cyrl-t-und-Latn"));
// Order of singleton
Assert.assertEquals("und-t-zh-u-ca-roc", canonicalTag("und-u-ca-roc-t-zh"));
// Variant subtags are alphabetically ordered.
Assert.assertEquals("sl-1994-biske-rozaj", canonicalTag("sl-rozaj-biske-1994"));
Assert.assertEquals("sl-t-sl-1994-biske-rozaj", canonicalTag("sl-t-sl-rozaj-biske-1994"));
// tfield subtags are alphabetically ordered.
// (Also tests subtag case normalisation.)
Assert.assertEquals("de-t-lv-m0-din", canonicalTag("DE-T-lv-M0-DIN"));
Assert.assertEquals("de-t-k0-qwertz-m0-din", canonicalTag("DE-T-M0-DIN-K0-QWERTZ"));
Assert.assertEquals("de-t-lv-k0-qwertz-m0-din", canonicalTag("DE-T-lv-M0-DIN-K0-QWERTZ"));
// "true" tvalue subtags aren't removed.
// (UTS 35 version 36, §3.2.1 claims otherwise, but tkey must be followed by
// tvalue, so that's likely a spec bug in UTS 35.)
Assert.assertEquals("en-t-m0-true", canonicalTag("en-t-m0-true"));
// tlang subtags are canonicalised.
Assert.assertEquals("en-t-he", canonicalTag("en-t-iw"));
Assert.assertEquals("en-t-hy-latn-am", canonicalTag("en-t-hy-latn-SU"));
Assert.assertEquals("ru-t-ru-cyrl-ru", canonicalTag("ru-t-ru-cyrl-SU"));
Assert.assertEquals("fr-t-fr-ru", canonicalTag("fr-t-fr-172"));
Assert.assertEquals("und-t-nb-latn", canonicalTag("und-t-no-latn-BOKMAL"));
Assert.assertEquals("und-t-dse-zinh", canonicalTag("und-t-sgn-qAAi-NL"));
// alias of tvalue should be replaced
Assert.assertEquals("en-t-m0-prprname", canonicalTag("en-t-m0-NaMeS"));
Assert.assertEquals("en-t-d0-charname-s0-ascii", canonicalTag("en-t-s0-ascii-d0-nAmE"));
}
@Test