ICU-22547 Fix addLikelySubtags for 4-character script codes

Also fix ICU-22546 to correct the comments in the API doc
and add additional unit tests
This commit is contained in:
Frank Tang 2023-10-26 16:24:36 -07:00 committed by Frank Yung-Fong Tang
parent e04f4427dc
commit 92eeb45811
7 changed files with 127 additions and 18 deletions

View file

@ -467,7 +467,14 @@ _uloc_addLikelySubtags(const char* localeID,
goto error;
}
if (langLength > 3) {
goto error;
if (langLength == 4 && scriptLength == 0) {
langLength = 0;
scriptLength = 4;
uprv_memcpy(script, lang, 4);
lang[0] = '\0';
} else {
goto error;
}
}
/* Find the length of the trailing portion. */

View file

@ -518,20 +518,20 @@ public:
* If this Locale is already in the maximal form, or not valid, or there is
* no data available for maximization, the Locale will be unchanged.
*
* For example, "und-Zzzz" cannot be maximized, since there is no
* For example, "sh" cannot be maximized, since there is no
* reasonable maximization.
*
* Examples:
*
* "und_Zzzz" maximizes to "en_Latn_US"
*
* "en" maximizes to "en_Latn_US"
*
* "de" maximizes to "de_Latn_US"
* "de" maximizes to "de_Latn_DE"
*
* "sr" maximizes to "sr_Cyrl_RS"
*
* "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.)
*
* "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.)
* "zh_Hani" maximizes to "zh_Hani_CN"
*
* @param status error information if maximizing this Locale failed.
* If this Locale is not well-formed, the error code is

View file

@ -1158,19 +1158,20 @@ uloc_getLocaleForLCID(uint32_t hostID, char *locale, int32_t localeCapacity,
*
* If localeID is already in the maximal form, or there is no data available
* for maximization, it will be copied to the output buffer. For example,
* "und-Zzzz" cannot be maximized, since there is no reasonable maximization.
* "sh" cannot be maximized, since there is no reasonable maximization.
*
* Examples:
*
* "und_Zzzz" maximizes to "en_Latn_US"
*
* "en" maximizes to "en_Latn_US"
*
* "de" maximizes to "de_Latn_US"
* "de" maximizes to "de_Latn_DE"
*
* "sr" maximizes to "sr_Cyrl_RS"
*
* "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.)
* "zh_Hani" maximizes to "zh_Hani_CN"
*
* "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.)
*
* @param localeID The locale to maximize
* @param maximizedLocaleID The maximized locale

View file

@ -3782,6 +3782,38 @@ const char* const basic_maximize_data[][2] = {
}, {
"_DE@em=emoji",
"de_Latn_DE@em=emoji"
}, {
// ICU-22547
// unicode_language_id = "root" |
// (unicode_language_subtag (sep unicode_script_subtag)? | unicode_script_subtag)
// (sep unicode_region_subtag)? (sep unicode_variant_subtag)* ;
// so "aaaa" is a well-formed unicode_language_id
"aaaa",
"aaaa",
}, {
// ICU-22546
"und-Zzzz",
"en_Latn_US" // If change, please also update common/unicode/uloc.h
}, {
// ICU-22546
"en",
"en_Latn_US" // If change, please also update common/unicode/uloc.h
}, {
// ICU-22546
"de",
"de_Latn_DE" // If change, please also update common/unicode/uloc.h
}, {
// ICU-22546
"sr",
"sr_Cyrl_RS" // If change, please also update common/unicode/uloc.h
}, {
// ICU-22546
"sh",
"sh" // If change, please also update common/unicode/uloc.h
}, {
// ICU-22546
"zh_Hani",
"zh_Hani_CN" // If change, please also update common/unicode/uloc.h
}
};
@ -6013,7 +6045,7 @@ static void TestLikelySubtags()
}
}
else if (uprv_stricmp(maximal, buffer) != 0) {
log_err(" maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %s\n", maximal, minimal, buffer);
log_err("1 maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %s\n", maximal, minimal, buffer);
}
}
@ -6066,7 +6098,7 @@ static void TestLikelySubtags()
}
}
else if (uprv_stricmp(maximal, buffer) != 0) {
log_err(" maximal doesn't match expected \"%s\" in uloc_addLikelySubtags(), minimal \"%s\" = \"%s\"\n", maximal, minimal, buffer);
log_err("2 maximal doesn't match expected \"%s\" in uloc_addLikelySubtags(), minimal \"%s\" = \"%s\"\n", maximal, minimal, buffer);
}
}
@ -6128,7 +6160,7 @@ static void TestLikelySubtags()
}
else if (status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
if (uprv_strnicmp(maximal, buffer, bufferSize) != 0) {
log_err(" maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %*s\n",
log_err("3 maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %*s\n",
maximal, minimal, (int)sizeof(buffer), buffer);
}
}

View file

@ -3842,6 +3842,45 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() {
"und_US",
"en_Latn_US",
"en"
}, {
// ICU-22547
// unicode_language_id = "root" |
// (unicode_language_subtag (sep unicode_script_subtag)? | unicode_script_subtag)
// (sep unicode_region_subtag)? (sep unicode_variant_subtag)* ;
// so "aaaa" is a well-formed unicode_language_id
"aaaa",
"aaaa",
"aaaa",
}, {
// ICU-22546
"und-Zzzz",
"en_Latn_US", // If change, please also update common/unicode/locid.h
"en"
}, {
// ICU-22546
"en",
"en_Latn_US", // If change, please also update common/unicode/locid.h
"en"
}, {
// ICU-22546
"de",
"de_Latn_DE", // If change, please also update common/unicode/locid.h
"de"
}, {
// ICU-22546
"sr",
"sr_Cyrl_RS", // If change, please also update common/unicode/locid.h
"sr"
}, {
// ICU-22546
"sh",
"sh",// If change, please also update common/unicode/locid.h
"sh"
}, {
// ICU-22546
"zh_Hani",
"zh_Hani_CN", // If change, please also update common/unicode/locid.h
"zh_Hani"
}
};

View file

@ -1917,6 +1917,38 @@ public class ULocaleTest extends TestFmwk {
}, {
"zzz",
""
}, {
// ICU-22547
// unicode_language_id = "root" |
// (unicode_language_subtag (sep unicode_script_subtag)? | unicode_script_subtag)
// (sep unicode_region_subtag)? (sep unicode_variant_subtag)* ;
// so "aaaa" is a well-formed unicode_language_id
"aaaa",
"aaaa",
}, {
// ICU-22546
"und-Zzzz",
"en_Latn_US" // If change, please also update ULocale.java
}, {
// ICU-22546
"en",
"en_Latn_US" // If change, please also update ULocale.java
}, {
// ICU-22546
"de",
"de_Latn_DE" // If change, please also update ULocale.java
}, {
// ICU-22546
"sr",
"sr_Cyrl_RS" // If change, please also update ULocale.java
}, {
// ICU-22546
"sh",
"sh" // If change, please also update ULocale.java
}, {
// ICU-22546
"zh_Hani",
"zh_Hani_CN" // If change, please also update ULocale.java
}
};

View file

@ -2694,20 +2694,18 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
*
* If the provided ULocale instance is already in the maximal form, or there is no
* data available for maximization, it will be returned. For example,
* "und-Zzzz" cannot be maximized, since there is no reasonable maximization.
* "sh" cannot be maximized, since there is no reasonable maximization.
* Otherwise, a new ULocale instance with the maximal form is returned.
*
* Examples:
*
* "en" maximizes to "en_Latn_US"
*
* "de" maximizes to "de_Latn_US"
* "de" maximizes to "de_Latn_DE"
*
* "sr" maximizes to "sr_Cyrl_RS"
*
* "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.)
*
* "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.)
* "zh_Hani" maximizes to "zh_Hani_CN"
*
* @param loc The ULocale to maximize
* @return The maximized ULocale instance.