ICU-22700 Fix hang caused by a long POSIX charset name

Fix a fuzzer-found hang that was caused by a long POSIX charset name.
Limit the POSIX charset name to at most 64 chars.
This commit is contained in:
Frank Tang 2024-03-19 17:42:14 -07:00 committed by Frank Yung-Fong Tang
parent fbc1f33e7e
commit d259da8118
2 changed files with 36 additions and 14 deletions

View file

@ -1795,6 +1795,9 @@ _canonicalize(const char* localeID,
&variant,
&tmpLocaleID,
err);
if (U_FAILURE(err)) {
return;
}
if (tag.length() == I_DEFAULT_LENGTH &&
uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) {
@ -1823,20 +1826,27 @@ _canonicalize(const char* localeID,
/* Copy POSIX-style charset specifier, if any [mr.utf8] */
if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
bool done = false;
do {
char c = *tmpLocaleID;
switch (c) {
case 0:
case '@':
done = true;
break;
default:
tag.append(c, err);
++tmpLocaleID;
break;
}
} while (!done);
tag.append('.', err);
++tmpLocaleID;
const char *atPos = nullptr;
size_t length;
if((atPos = uprv_strchr(tmpLocaleID, '@')) != nullptr) {
length = atPos - tmpLocaleID;
} else {
length = uprv_strlen(tmpLocaleID);
}
// The longest charset name we found in the IANA charset registry
// (https://www.iana.org/assignments/character-sets/) is
// "Extended_UNIX_Code_Packed_Format_for_Japanese", which is 45 chars long.
// We therefore restrict the length here to 64, the smallest power of 2
// that is greater than 45.
constexpr size_t kMaxCharsetLength = 64;
if (length > kMaxCharsetLength) {
err = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
return;
}
tag.append(tmpLocaleID, static_cast<int32_t>(length), err);
tmpLocaleID += length;
}
/* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'

View file

@ -4937,6 +4937,18 @@ void LocaleTest::TestCanonicalization()
{ "no@ny", "no@ny", "no__NY" /* not: "nn" [alan ICU3.0] */ }, /* POSIX ID */
{ "no-no.utf32@B", "no_NO.utf32@B", "no_NO_B" }, /* POSIX ID */
{ "qz-qz@Euro", "qz_QZ@Euro", "qz_QZ_EURO" }, /* qz-qz uses private use iso codes */
// A very long charset name in IANA charset
{ "ja_JP.Extended_UNIX_Code_Packed_Format_for_Japanese@B",
"ja_JP.Extended_UNIX_Code_Packed_Format_for_Japanese@B", "ja_JP_B" }, /* POSIX ID */
// A fake long charset name exactly at the length limit (64 chars)
{ "ja_JP.1234567890123456789012345678901234567890123456789012345678901234@B",
"ja_JP.1234567890123456789012345678901234567890123456789012345678901234@B",
"ja_JP_B" }, /* POSIX ID */
// A fake long charset name one char above the length limit (65 chars)
{ "ja_JP.12345678901234567890123456789012345678901234567890123456789012345@B",
"BOGUS",
"ja_JP_B" }, /* POSIX ID */
// NOTE: uloc_getName() works on en-BOONT, but Locale() parser considers it BOGUS
// TODO: unify this behavior
{ "en-BOONT", "en__BOONT", "en__BOONT" }, /* registered name */