mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
parent
32ba41e22f
commit
5f8df0d288
7 changed files with 251 additions and 22 deletions
|
@ -1171,7 +1171,12 @@ private:
|
|||
bool replaceVariant(UErrorCode& status);
|
||||
|
||||
// Replace by using subdivisionAlias.
|
||||
bool replaceSubdivision(CharString& subdivision, UErrorCode& status);
|
||||
bool replaceSubdivision(StringPiece subdivision,
|
||||
CharString& output, UErrorCode& status);
|
||||
|
||||
// Replace transformed extensions.
|
||||
bool replaceTransformedExtensions(
|
||||
CharString& transformedExtensions, CharString& output, UErrorCode& status);
|
||||
};
|
||||
|
||||
CharString&
|
||||
|
@ -1503,7 +1508,8 @@ AliasReplacer::replaceVariant(UErrorCode& status)
|
|||
}
|
||||
|
||||
bool
|
||||
AliasReplacer::replaceSubdivision(CharString& subdivision, UErrorCode& status)
|
||||
AliasReplacer::replaceSubdivision(
|
||||
StringPiece subdivision, CharString& output, UErrorCode& status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return false;
|
||||
|
@ -1516,13 +1522,84 @@ AliasReplacer::replaceSubdivision(CharString& subdivision, UErrorCode& status)
|
|||
(firstSpace - replacement) : uprv_strlen(replacement);
|
||||
// Ignore len == 2, see CLDR-14312
|
||||
if (3 <= len && len <= 8) {
|
||||
subdivision.clear().append(replacement, (int32_t)len, status);
|
||||
output.append(replacement, (int32_t)len, status);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
AliasReplacer::replaceTransformedExtensions(
|
||||
CharString& transformedExtensions, CharString& output, UErrorCode& status)
|
||||
{
|
||||
// The content of the transformedExtensions will be modified in this
|
||||
// function to NULL-terminating (tkey-tvalue) pairs.
|
||||
if (U_FAILURE(status)) {
|
||||
return false;
|
||||
}
|
||||
int32_t len = transformedExtensions.length();
|
||||
const char* str = transformedExtensions.data();
|
||||
const char* tkey = ultag_getTKeyStart(str);
|
||||
int32_t tlangLen = (tkey == str) ? 0 :
|
||||
((tkey == nullptr) ? len : (tkey - str - 1));
|
||||
CharStringByteSink sink(&output);
|
||||
if (tlangLen > 0) {
|
||||
Locale tlang = LocaleBuilder()
|
||||
.setLanguageTag(StringPiece(str, tlangLen))
|
||||
.build(status);
|
||||
tlang.canonicalize(status);
|
||||
tlang.toLanguageTag(sink, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return false;
|
||||
}
|
||||
T_CString_toLowerCase(output.data());
|
||||
}
|
||||
if (tkey != nullptr) {
|
||||
// We need to sort the tfields by tkey
|
||||
UVector tfields(status);
|
||||
if (U_FAILURE(status)) {
|
||||
return false;
|
||||
}
|
||||
do {
|
||||
const char* tvalue = uprv_strchr(tkey, '-');
|
||||
if (tvalue == nullptr) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
const char* nextTKey = ultag_getTKeyStart(tvalue);
|
||||
if (nextTKey != nullptr) {
|
||||
*((char*)(nextTKey-1)) = '\0'; // NULL terminate tvalue
|
||||
}
|
||||
tfields.insertElementAt((void*)tkey, tfields.size(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return false;
|
||||
}
|
||||
tkey = nextTKey;
|
||||
} while (tkey != nullptr);
|
||||
tfields.sort([](UElement e1, UElement e2) -> int8_t {
|
||||
return uprv_strcmp(
|
||||
(const char*)e1.pointer, (const char*)e2.pointer);
|
||||
}, status);
|
||||
for (int32_t i = 0; i < tfields.size(); i++) {
|
||||
if (output.length() > 0) {
|
||||
output.append('-', status);
|
||||
}
|
||||
const char* tfield = (const char*) tfields.elementAt(i);
|
||||
const char* tvalue = uprv_strchr(tfield, '-');
|
||||
// Split the "tkey-tvalue" pair string so that we can canonicalize the tvalue.
|
||||
U_ASSERT(tvalue != nullptr);
|
||||
*((char*)tvalue++) = '\0'; // NULL terminate tkey
|
||||
output.append(tfield, status).append('-', status);
|
||||
const char* bcpTValue = ulocimp_toBcpType(tfield, tvalue, nullptr, nullptr);
|
||||
output.append((bcpTValue == nullptr) ? tvalue : bcpTValue, status);
|
||||
}
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
CharString&
|
||||
AliasReplacer::outputToString(
|
||||
CharString& out, UErrorCode status)
|
||||
|
@ -1663,7 +1740,8 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
|
|||
if (U_SUCCESS(status) && !iter.isNull()) {
|
||||
const char* key;
|
||||
while ((key = iter->next(nullptr, status)) != nullptr) {
|
||||
if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0) {
|
||||
if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0 ||
|
||||
uprv_strcmp("t", key) == 0) {
|
||||
CharString value;
|
||||
CharStringByteSink valueSink(&value);
|
||||
locale.getKeywordValue(key, valueSink, status);
|
||||
|
@ -1671,10 +1749,19 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
|
|||
status = U_ZERO_ERROR;
|
||||
continue;
|
||||
}
|
||||
if (replaceSubdivision(value, status)) {
|
||||
changed++;
|
||||
CharString replacement;
|
||||
if (uprv_strlen(key) == 2) {
|
||||
if (replaceSubdivision(value.toStringPiece(), replacement, status)) {
|
||||
changed++;
|
||||
temp.setKeywordValue(key, replacement.data(), status);
|
||||
}
|
||||
} else {
|
||||
U_ASSERT(uprv_strcmp(key, "t") == 0);
|
||||
if (replaceTransformedExtensions(value, replacement, status)) {
|
||||
changed++;
|
||||
temp.setKeywordValue(key, replacement.data(), status);
|
||||
}
|
||||
}
|
||||
temp.setKeywordValue(key, value.data(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -1691,7 +1778,6 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
|
|||
}
|
||||
// If the tag is not changed, return.
|
||||
if (uprv_strcmp(out.data(), locale.getName()) == 0) {
|
||||
U_ASSERT(changed == 0);
|
||||
out.clear();
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -646,6 +646,22 @@ _isTKey(const char* s, int32_t len)
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
// Scans the '-'-separated subtags of localeID from left to right and returns
// a pointer to the first subtag that _isTKey() accepts, or nullptr when no
// subtag qualifies. The returned pointer points into localeID itself.
U_CAPI const char * U_EXPORT2
ultag_getTKeyStart(const char *localeID) {
    const char *subtag = localeID;
    for (;;) {
        const char *dash = uprv_strchr(subtag, SEP);
        if (dash == nullptr) {
            break;
        }
        if (_isTKey(subtag, dash - subtag)) {
            return subtag;
        }
        subtag = dash + 1;
    }
    // Last subtag: no trailing separator, so it is checked with length -1
    // (presumably "NUL-terminated" for _isTKey -- confirm against its definition).
    return _isTKey(subtag, -1) ? subtag : nullptr;
}
|
||||
|
||||
static UBool
|
||||
_isTValue(const char* s, int32_t len)
|
||||
{
|
||||
|
|
|
@ -286,6 +286,9 @@ ultag_isUnicodeLocaleType(const char* s, int32_t len);
|
|||
U_CFUNC UBool
|
||||
ultag_isVariantSubtags(const char* s, int32_t len);
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
ultag_getTKeyStart(const char *localeID);
|
||||
|
||||
U_CFUNC const char*
|
||||
ulocimp_toBcpKey(const char* key);
|
||||
|
||||
|
|
|
@ -4935,6 +4935,36 @@ void LocaleTest::TestCanonicalize(void)
|
|||
// ICU-21401
|
||||
{ "cel-gaulish", "xtg"},
|
||||
|
||||
// ICU-21406
|
||||
// Inside T extension
|
||||
// Case of Script and Region
|
||||
{ "ja-kana-jp-t-it-latn-it", "ja-Kana-JP-t-it-latn-it"},
|
||||
{ "und-t-zh-hani-tw", "und-t-zh-hani-tw"},
|
||||
{ "und-cyrl-t-und-Latn", "und-Cyrl-t-und-latn"},
|
||||
// Order of singleton
|
||||
{ "und-u-ca-roc-t-zh", "und-t-zh-u-ca-roc"},
|
||||
// Variant subtags are alphabetically ordered.
|
||||
{ "sl-t-sl-rozaj-biske-1994", "sl-t-sl-1994-biske-rozaj"},
|
||||
// tfield subtags are alphabetically ordered.
|
||||
// (Also tests subtag case normalisation.)
|
||||
{ "DE-T-lv-M0-DIN", "de-t-lv-m0-din"},
|
||||
{ "DE-T-M0-DIN-K0-QWERTZ", "de-t-k0-qwertz-m0-din"},
|
||||
{ "DE-T-lv-M0-DIN-K0-QWERTZ", "de-t-lv-k0-qwertz-m0-din"},
|
||||
// "true" tvalue subtags aren't removed.
|
||||
// (UTS 35 version 36, §3.2.1 claims otherwise, but tkey must be followed by
|
||||
// tvalue, so that's likely a spec bug in UTS 35.)
|
||||
{ "en-t-m0-true", "en-t-m0-true"},
|
||||
// tlang subtags are canonicalised.
|
||||
{ "en-t-iw", "en-t-he"},
|
||||
{ "en-t-hy-latn-SU", "en-t-hy-latn-am"},
|
||||
{ "ru-t-ru-cyrl-SU", "ru-t-ru-cyrl-ru"},
|
||||
{ "fr-t-fr-172", "fr-t-fr-ru"},
|
||||
{ "und-t-no-latn-BOKMAL", "und-t-nb-latn" },
|
||||
{ "und-t-sgn-qAAi-NL", "und-t-dse-zinh" },
|
||||
// alias of tvalue should be replaced
|
||||
{ "en-t-m0-NaMeS", "en-t-m0-prprname" },
|
||||
{ "en-t-s0-ascii-d0-NaMe", "en-t-d0-charname-s0-ascii" },
|
||||
|
||||
};
|
||||
int32_t i;
|
||||
for (i=0; i < UPRV_LENGTHOF(testCases); i++) {
|
||||
|
|
|
@ -595,6 +595,12 @@ public class LanguageTag {
|
|||
return false;
|
||||
}
|
||||
|
||||
public static boolean isTKey(String s) {
|
||||
// tkey = = alpha digit ;
|
||||
return (s.length() == 2) && AsciiUtil.isAlpha(s.charAt(0))
|
||||
&& AsciiUtil.isNumeric(s.charAt(1));
|
||||
}
|
||||
|
||||
public static boolean isExtensionSingleton(String s) {
|
||||
// singleton = DIGIT ; 0 - 9
|
||||
// / %x41-57 ; A - W
|
||||
|
@ -657,18 +663,20 @@ public class LanguageTag {
|
|||
|
||||
public static String canonicalizeExtension(String s) {
|
||||
s = AsciiUtil.toLowerString(s);
|
||||
int found;
|
||||
while (s.endsWith("-true")) {
|
||||
s = s.substring(0, s.length() - 5); // length of "-true" is 5
|
||||
}
|
||||
while ((found = s.indexOf("-true-")) > 0) {
|
||||
s = s.substring(0, found) + s.substring(found + 5); // length of "-true" is 5
|
||||
}
|
||||
while (s.endsWith("-yes")) {
|
||||
s = s.substring(0, s.length() - 4); // length of "-yes" is 4
|
||||
}
|
||||
while ((found = s.indexOf("-yes-")) > 0) {
|
||||
s = s.substring(0, found) + s.substring(found + 4); // length of "-yes" is 5
|
||||
if (s.startsWith("u-")) {
|
||||
int found;
|
||||
while (s.endsWith("-true")) {
|
||||
s = s.substring(0, s.length() - 5); // length of "-true" is 5
|
||||
}
|
||||
while ((found = s.indexOf("-true-")) > 0) {
|
||||
s = s.substring(0, found) + s.substring(found + 5); // length of "-true" is 5
|
||||
}
|
||||
while (s.endsWith("-yes")) {
|
||||
s = s.substring(0, s.length() - 4); // length of "-yes" is 4
|
||||
}
|
||||
while ((found = s.indexOf("-yes-")) > 0) {
|
||||
s = s.substring(0, found) + s.substring(found + 4); // length of "-yes" is 5
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
|
|
@ -1279,9 +1279,11 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
|
|||
Iterator<String> keywords = temp.getKeywords();
|
||||
while (keywords != null && keywords.hasNext()) {
|
||||
String key = keywords.next();
|
||||
if (key.equals("rg") || key.equals("sd")) {
|
||||
if (key.equals("rg") || key.equals("sd") || key.equals("t")) {
|
||||
String value = temp.getKeywordValue(key);
|
||||
String replacement = replaceSubdivision(value);
|
||||
String replacement = key.equals("t") ?
|
||||
replaceTransformedExtensions(value) :
|
||||
replaceSubdivision(value);
|
||||
if (replacement != null) {
|
||||
temp = temp.setKeywordValue(key, replacement);
|
||||
keywordChanged = true;
|
||||
|
@ -1636,6 +1638,58 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
|
|||
return subdivisionAliasMap.get(subdivision);
|
||||
}
|
||||
|
||||
private String replaceTransformedExtensions(String extensions) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
List<String> subtags = new ArrayList<>(Arrays.asList(extensions.split(LanguageTag.SEP)));
|
||||
List<String> tfields = new ArrayList<>();
|
||||
int processedLength = 0;
|
||||
int tlangLength = 0;
|
||||
String tkey = "";
|
||||
for (String subtag : subtags) {
|
||||
if (LanguageTag.isTKey(subtag)) {
|
||||
if (tlangLength == 0) {
|
||||
// Found the first tkey. Record the total length of the preceding
|
||||
// tlang subtags. -1 if there is no tlang before the first tkey.
|
||||
tlangLength = processedLength-1;
|
||||
}
|
||||
if (builder.length() > 0) {
|
||||
// Finish & store the previous tkey with its tvalue subtags.
|
||||
tfields.add(builder.toString());
|
||||
builder.setLength(0);
|
||||
}
|
||||
// Start collecting subtags for this new tkey.
|
||||
tkey = subtag;
|
||||
builder.append(subtag);
|
||||
} else {
|
||||
if (tlangLength != 0) {
|
||||
builder.append(LanguageTag.SEP).append(toUnicodeLocaleType(tkey, subtag));
|
||||
}
|
||||
}
|
||||
processedLength += subtag.length() + 1;
|
||||
}
|
||||
if (builder.length() > 0) {
|
||||
// Finish & store the previous=last tkey with its tvalue subtags.
|
||||
tfields.add(builder.toString());
|
||||
builder.setLength(0);
|
||||
}
|
||||
String tlang = (tlangLength > 0) ? extensions.substring(0, tlangLength) :
|
||||
((tfields.size() == 0) ? extensions : "");
|
||||
if (tlang.length() > 0) {
|
||||
String canonicalized = ULocale.createCanonical(
|
||||
ULocale.forLanguageTag(extensions)).toLanguageTag();
|
||||
builder.append(AsciiUtil.toLowerString(canonicalized));
|
||||
}
|
||||
|
||||
if (tfields.size() > 0) {
|
||||
if (builder.length() > 0) {
|
||||
builder.append(LanguageTag.SEP);
|
||||
}
|
||||
// tfields are sorted by alphabetical order of their keys
|
||||
Collections.sort(tfields);
|
||||
builder.append(Utility.joinStrings(LanguageTag.SEP, tfields));
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
@ -5233,6 +5233,38 @@ public class ULocaleTest extends TestFmwk {
|
|||
|
||||
// ICU-21401
|
||||
Assert.assertEquals("xtg", canonicalTag("cel-gaulish"));
|
||||
|
||||
// ICU-21406
|
||||
// Inside T extension
|
||||
// Case of Script and Region
|
||||
Assert.assertEquals("ja-Kana-JP-t-it-latn-it", canonicalTag("ja-kana-jp-t-it-latn-it"));
|
||||
Assert.assertEquals("und-t-zh-hani-tw", canonicalTag("und-t-zh-hani-tw"));
|
||||
Assert.assertEquals("und-Cyrl-t-und-latn", canonicalTag("und-cyrl-t-und-Latn"));
|
||||
// Order of singleton
|
||||
Assert.assertEquals("und-t-zh-u-ca-roc", canonicalTag("und-u-ca-roc-t-zh"));
|
||||
// Variant subtags are alphabetically ordered.
|
||||
Assert.assertEquals("sl-1994-biske-rozaj", canonicalTag("sl-rozaj-biske-1994"));
|
||||
Assert.assertEquals("sl-t-sl-1994-biske-rozaj", canonicalTag("sl-t-sl-rozaj-biske-1994"));
|
||||
// tfield subtags are alphabetically ordered.
|
||||
// (Also tests subtag case normalisation.)
|
||||
Assert.assertEquals("de-t-lv-m0-din", canonicalTag("DE-T-lv-M0-DIN"));
|
||||
Assert.assertEquals("de-t-k0-qwertz-m0-din", canonicalTag("DE-T-M0-DIN-K0-QWERTZ"));
|
||||
Assert.assertEquals("de-t-lv-k0-qwertz-m0-din", canonicalTag("DE-T-lv-M0-DIN-K0-QWERTZ"));
|
||||
// "true" tvalue subtags aren't removed.
|
||||
// (UTS 35 version 36, §3.2.1 claims otherwise, but tkey must be followed by
|
||||
// tvalue, so that's likely a spec bug in UTS 35.)
|
||||
Assert.assertEquals("en-t-m0-true", canonicalTag("en-t-m0-true"));
|
||||
// tlang subtags are canonicalised.
|
||||
Assert.assertEquals("en-t-he", canonicalTag("en-t-iw"));
|
||||
Assert.assertEquals("en-t-hy-latn-am", canonicalTag("en-t-hy-latn-SU"));
|
||||
Assert.assertEquals("ru-t-ru-cyrl-ru", canonicalTag("ru-t-ru-cyrl-SU"));
|
||||
Assert.assertEquals("fr-t-fr-ru", canonicalTag("fr-t-fr-172"));
|
||||
Assert.assertEquals("und-t-nb-latn", canonicalTag("und-t-no-latn-BOKMAL"));
|
||||
Assert.assertEquals("und-t-dse-zinh", canonicalTag("und-t-sgn-qAAi-NL"));
|
||||
// alias of tvalue should be replaced
|
||||
Assert.assertEquals("en-t-m0-prprname", canonicalTag("en-t-m0-NaMeS"));
|
||||
Assert.assertEquals("en-t-d0-charname-s0-ascii", canonicalTag("en-t-s0-ascii-d0-nAmE"));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Add table
Reference in a new issue