From d259da81183bd2439e19dcb17cccf57cc31cf46c Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Tue, 19 Mar 2024 17:42:14 -0700 Subject: [PATCH] ICU-22700 Fix large POSIX charset name cause hang Fix a fuzzer-found hang that is caused by a very long POSIX charset name. Limit the POSIX charset name to at most 64 chars. --- icu4c/source/common/uloc.cpp | 38 ++++++++++++++++---------- icu4c/source/test/intltest/loctest.cpp | 12 ++++++++ 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/icu4c/source/common/uloc.cpp b/icu4c/source/common/uloc.cpp index 2caaccbdc5a..88fe7eaadce 100644 --- a/icu4c/source/common/uloc.cpp +++ b/icu4c/source/common/uloc.cpp @@ -1795,6 +1795,9 @@ _canonicalize(const char* localeID, &variant, &tmpLocaleID, err); + if (U_FAILURE(err)) { + return; + } if (tag.length() == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, I_DEFAULT_LENGTH) == 0) { @@ -1823,20 +1826,27 @@ _canonicalize(const char* localeID, /* Copy POSIX-style charset specifier, if any [mr.utf8] */ if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') { - bool done = false; - do { - char c = *tmpLocaleID; - switch (c) { - case 0: - case '@': - done = true; - break; - default: - tag.append(c, err); - ++tmpLocaleID; - break; - } - } while (!done); + tag.append('.', err); + ++tmpLocaleID; + const char *atPos = nullptr; + size_t length; + if((atPos = uprv_strchr(tmpLocaleID, '@')) != nullptr) { + length = atPos - tmpLocaleID; + } else { + length = uprv_strlen(tmpLocaleID); + } + // The longest charset name we found in IANA charset registry + // https://www.iana.org/assignments/character-sets/ is + // "Extended_UNIX_Code_Packed_Format_for_Japanese" in length 45. + // we therefore restrict the length here to be 64 which is a power of 2 + // number that is longer than 45. 
+ constexpr size_t kMaxCharsetLength = 64; + if (length > kMaxCharsetLength) { + err = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */ + return; + } + tag.append(tmpLocaleID, static_cast(length), err); + tmpLocaleID += length; } /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';' diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index c38a7152bd2..c6499ea04c5 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -4937,6 +4937,18 @@ void LocaleTest::TestCanonicalization() { "no@ny", "no@ny", "no__NY" /* not: "nn" [alan ICU3.0] */ }, /* POSIX ID */ { "no-no.utf32@B", "no_NO.utf32@B", "no_NO_B" }, /* POSIX ID */ { "qz-qz@Euro", "qz_QZ@Euro", "qz_QZ_EURO" }, /* qz-qz uses private use iso codes */ + + // A very long charset name in IANA charset + { "ja_JP.Extended_UNIX_Code_Packed_Format_for_Japanese@B", + "ja_JP.Extended_UNIX_Code_Packed_Format_for_Japanese@B", "ja_JP_B" }, /* POSIX ID */ + // A fake long charset name below the limitation + { "ja_JP.1234567890123456789012345678901234567890123456789012345678901234@B", + "ja_JP.1234567890123456789012345678901234567890123456789012345678901234@B", + "ja_JP_B" }, /* POSIX ID */ + // A fake long charset name one char above the limitation + { "ja_JP.12345678901234567890123456789012345678901234567890123456789012345@B", + "BOGUS", + "ja_JP_B" }, /* POSIX ID */ // NOTE: uloc_getName() works on en-BOONT, but Locale() parser considers it BOGUS // TODO: unify this behavior { "en-BOONT", "en__BOONT", "en__BOONT" }, /* registered name */