ICU-22661 Limit the size of variants in Locale

See #2821
This commit is contained in:
Frank Tang 2024-03-14 18:37:27 +00:00 committed by Frank Yung-Fong Tang
parent 104214aeae
commit de9910659d
11 changed files with 417 additions and 19 deletions

View file

@ -1339,14 +1339,30 @@ _getVariant(const char* localeID,
char prev,
ByteSink* sink,
const char** pEnd,
bool needSeparator) {
bool hasVariant = false;
bool needSeparator,
UErrorCode& status) {
if (U_FAILURE(status)) return;
if (pEnd != nullptr) { *pEnd = localeID; }
// Reasonable upper limit for variants
// The legacy locale format places no strict limit on the syntax of
// variants. If the locale is constructed from a unicode_locale_id
// as defined in UTS35, then each unicode_variant_subtag has a
// maximum length of 8: (alphanum{5,8} | digit alphanum{3}).
// A limit of 179 allows 20 unicode_variant_subtag values, with
// separators, in the unicode_locale_id:
// 8*20 + 1*(20-1) = 179
constexpr int32_t MAX_VARIANTS_LENGTH = 179;
/* get one or more variant tags and separate them with '_' */
if(_isIDSeparator(prev)) {
int32_t index = 0;
if (_isIDSeparator(prev)) {
/* get a variant string after a '-' or '_' */
while(!_isTerminator(*localeID)) {
for (index=0; !_isTerminator(localeID[index]); index++) {
if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (needSeparator) {
if (sink != nullptr) {
sink->Append("_", 1);
@ -1354,26 +1370,28 @@ _getVariant(const char* localeID,
needSeparator = false;
}
if (sink != nullptr) {
char c = (char)uprv_toupper(*localeID);
char c = (char)uprv_toupper(localeID[index]);
if (c == '-') c = '_';
sink->Append(&c, 1);
}
hasVariant = true;
localeID++;
}
if (pEnd != nullptr) { *pEnd = localeID; }
if (pEnd != nullptr) { *pEnd = localeID+index; }
}
/* if there is no variant tag after a '-' or '_' then look for '@' */
if(!hasVariant) {
if(prev=='@') {
if (index == 0) {
if (prev=='@') {
/* keep localeID */
} else if((localeID=locale_getKeywordsStart(localeID))!=nullptr) {
++localeID; /* point after the '@' */
} else {
return;
}
while(!_isTerminator(*localeID)) {
for(; !_isTerminator(localeID[index]); index++) {
if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (needSeparator) {
if (sink != nullptr) {
sink->Append("_", 1);
@ -1381,13 +1399,12 @@ _getVariant(const char* localeID,
needSeparator = false;
}
if (sink != nullptr) {
char c = (char)uprv_toupper(*localeID);
char c = (char)uprv_toupper(localeID[index]);
if (c == '-' || c == ',') c = '_';
sink->Append(&c, 1);
}
localeID++;
}
if (pEnd != nullptr) { *pEnd = localeID; }
if (pEnd != nullptr) { *pEnd = localeID + index; }
}
}
@ -1560,7 +1577,8 @@ ulocimp_getSubtags(
}
const char* begin = localeID + 1;
const char* end = nullptr;
_getVariant(begin, *localeID, variant, &end, false);
_getVariant(begin, *localeID, variant, &end, false, status);
if (U_FAILURE(status)) { return; }
U_ASSERT(end != nullptr);
if (end != begin && pEnd != nullptr) { *pEnd = end; }
}
@ -1853,7 +1871,8 @@ _canonicalize(const char* localeID,
}
CharStringByteSink s(&tag);
_getVariant(tmpLocaleID+1, '@', &s, nullptr, !variant.isEmpty());
_getVariant(tmpLocaleID+1, '@', &s, nullptr, !variant.isEmpty(), err);
if (U_FAILURE(err)) { return; }
}
/* Look up the ID in the canonicalization map */

View file

@ -1211,7 +1211,7 @@ Locale::getScript() const
inline const char *
Locale::getVariant() const
{
return &baseName[variantBegin];
return fIsBogus ? "" : &baseName[variantBegin];
}
inline const char *

View file

@ -510,6 +510,11 @@ enum AllowedHourFormat{
void
DateTimePatternGenerator::initData(const Locale& locale, UErrorCode &status, UBool skipStdPatterns) {
//const char *baseLangName = locale.getBaseName(); // unused
if (U_FAILURE(status)) { return; }
if (locale.isBogus()) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
skipMatcher = nullptr;
fAvailableFormatKeyHash=nullptr;

View file

@ -224,6 +224,7 @@ void addLocaleTest(TestNode** root)
TESTCASE(TestBasicGetters);
TESTCASE(TestNullDefault);
TESTCASE(TestPrefixes);
TESTCASE(TestVariantLengthLimit);
TESTCASE(TestSimpleResourceInfo);
TESTCASE(TestDisplayNames);
TESTCASE(TestGetDisplayScriptPreFlighting21160);
@ -568,6 +569,82 @@ static void TestPrefixes(void) {
}
}
/*
 * Exercises uloc_getVariant() with two inputs: one whose variant section
 * (everything after the leading "__") is 20 eight-character subtags joined by
 * '_', and one that is a single character longer. The first is expected to
 * round-trip unchanged; the second is expected to yield
 * U_ILLEGAL_ARGUMENT_ERROR.
 */
static void TestVariantLengthLimit(void) {
    // 5 rows x 4 subtags = 20 subtags after the leading "__".
    static const char withinLimit[] =
        "_"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678";

    // Same 20 subtags, but the last one carries an extra trailing character.
    static const char overLimit[] =
        "_"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678X";  // One character too long.

    // The expected variant string is the input without its initial "__".
    const char* const expectedVariants = withinLimit + 2;
    const int32_t expectedLength = uprv_strlen(expectedVariants);

    // Sized from the longer input so either result fits.
    char result[UPRV_LENGTHOF(overLimit)];
    UErrorCode errorCode = U_ZERO_ERROR;

    // Case 1: input at the limit; report only the first mismatch found.
    const int32_t actualLength =
        uloc_getVariant(withinLimit, result, UPRV_LENGTHOF(result), &errorCode);
    if (U_FAILURE(errorCode)) {
        log_err("Unexpected error in uloc_getVariant(): %s\n",
                myErrorName(errorCode));
    } else if (expectedLength != actualLength) {
        log_err("Expected length %d but got length %d.\n",
                expectedLength, actualLength);
    } else if (uprv_strcmp(expectedVariants, result) != 0) {
        log_err("Expected variants \"%s\" but got variants \"%s\"\n",
                expectedVariants, result);
    }

    // Case 2: input over the limit; anything other than
    // U_ILLEGAL_ARGUMENT_ERROR is a test failure.
    errorCode = U_ZERO_ERROR;
    uloc_getVariant(overLimit, result, UPRV_LENGTHOF(result), &errorCode);
    if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
        log_err("Unexpected error in uloc_getVariant(), expected "
                "U_ILLEGAL_ARGUMENT_ERROR but got %s.\n",
                myErrorName(errorCode));
    }
}
/* testing uloc_getISO3Language(), uloc_getISO3Country(), */
static void TestSimpleResourceInfo(void) {

View file

@ -25,6 +25,8 @@
**/
static void TestBasicGetters(void);
static void TestPrefixes(void);
static void TestVariantLengthLimit(void);
/**
* Use Locale to access Resource file data and compare against expected values
**/

View file

@ -212,6 +212,7 @@ static int32_t bundles_count = UPRV_LENGTHOF(param);
static void TestDecodedBundle(void);
static void TestGetKeywordValues(void);
static void TestGetFunctionalEquivalent(void);
static void TestGetFunctionalEquivalentVariantLengthLimit(void);
static void TestCLDRStyleAliases(void);
static void TestFallbackCodes(void);
static void TestGetUTF8String(void);
@ -249,7 +250,10 @@ void addNEWResourceBundleTest(TestNode** root)
addTest(root, &TestGetVersionColl, "tsutil/creststn/TestGetVersionColl");
addTest(root, &TestAliasConflict, "tsutil/creststn/TestAliasConflict");
addTest(root, &TestGetKeywordValues, "tsutil/creststn/TestGetKeywordValues");
addTest(root, &TestGetFunctionalEquivalent,"tsutil/creststn/TestGetFunctionalEquivalent");
addTest(root, &TestGetFunctionalEquivalent,
"tsutil/creststn/TestGetFunctionalEquivalent");
addTest(root, &TestGetFunctionalEquivalentVariantLengthLimit,
"tsutil/creststn/TestGetFunctionalEquivalentVariantLengthLimit");
addTest(root, &TestJB3763, "tsutil/creststn/TestJB3763");
}
@ -2777,6 +2781,100 @@ static void TestGetFunctionalEquivalent(void) {
#endif
}
/*
 * Exercises ures_getFunctionalEquivalent() for the "calendar" resource and
 * keyword with two locale IDs: one whose variant section is 20 eight-character
 * subtags joined by '_', and one that is a single character longer. The first
 * is expected to resolve to "_@calendar=gregorian"; the second is expected to
 * yield U_ILLEGAL_ARGUMENT_ERROR.
 */
static void TestGetFunctionalEquivalentVariantLengthLimit(void) {
    // 5 rows x 4 subtags = 20 subtags after the leading "__".
    static const char withinLimit[] =
        "_"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678";

    // Same 20 subtags, but the last one carries an extra trailing character.
    static const char overLimit[] =
        "_"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678"
        "_12345678" "_12345678" "_12345678" "_12345678X";  // One character too long.

    static const char expectedLocale[] = "_@calendar=gregorian";
    const int32_t expectedLength = uprv_strlen(expectedLocale);

    // Sized from the longer input.
    char result[UPRV_LENGTHOF(overLimit)];
    UErrorCode errorCode = U_ZERO_ERROR;

    // Case 1: input at the limit; report only the first mismatch found.
    const int32_t actualLength = ures_getFunctionalEquivalent(
        result, UPRV_LENGTHOF(result),
        NULL, "calendar", "calendar",
        withinLimit,
        NULL, false,
        &errorCode);
    if (U_FAILURE(errorCode)) {
        log_err("Unexpected error in ures_getFunctionalEquivalent(): %s\n",
                myErrorName(errorCode));
    } else if (expectedLength != actualLength) {
        log_err("Expected length %d but got length %d.\n",
                expectedLength, actualLength);
    } else if (uprv_strcmp(expectedLocale, result) != 0) {
        log_err("Expected locale \"%s\" but got locale \"%s\"\n",
                expectedLocale, result);
    }

    // Case 2: input over the limit; anything other than
    // U_ILLEGAL_ARGUMENT_ERROR is a test failure.
    errorCode = U_ZERO_ERROR;
    ures_getFunctionalEquivalent(
        result, UPRV_LENGTHOF(result),
        NULL, "calendar", "calendar",
        overLimit,
        NULL, false,
        &errorCode);
    if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
        log_err("Unexpected error in ures_getFunctionalEquivalent(), expected "
                "U_ILLEGAL_ARGUMENT_ERROR but got %s.\n",
                myErrorName(errorCode));
    }
}
static void TestXPath(void) {
UErrorCode status = U_ZERO_ERROR;
UResourceBundle *rb = NULL, *alias = NULL;

View file

@ -202,6 +202,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(TestBug11421); // Must run early in list to trigger failure.
TESTCASE_AUTO(TestBasicGetters);
TESTCASE_AUTO(TestVariantLengthLimit);
TESTCASE_AUTO(TestSimpleResourceInfo);
TESTCASE_AUTO(TestDisplayNames);
TESTCASE_AUTO(TestSimpleObjectStuff);
@ -405,6 +406,69 @@ void LocaleTest::TestBasicGetters() {
delete pb;
}
void LocaleTest::TestVariantLengthLimit() {
static constexpr char valid[] =
"_"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678";
static constexpr char invalid[] =
"_"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678"
"_12345678X"; // One character too long.
constexpr const char* variantsExpected = valid + 2; // Skip initial "__".
Locale validLocale(valid);
if (validLocale.isBogus()) {
errln("Valid locale is unexpectedly bogus.");
} else if (uprv_strcmp(variantsExpected, validLocale.getVariant()) != 0) {
errln("Expected variants \"%s\" but got variants \"%s\"\n",
variantsExpected, validLocale.getVariant());
}
Locale invalidLocale(invalid);
if (!invalidLocale.isBogus()) {
errln("Invalid locale is unexpectedly NOT bogus.");
}
}
void LocaleTest::TestParallelAPIValues() {
logln("Test synchronization between C and C++ API");
if (strcmp(Locale::getChinese().getName(), ULOC_CHINESE) != 0) {

View file

@ -23,6 +23,7 @@ public:
* Test methods to set and get data fields
**/
void TestBasicGetters();
void TestVariantLengthLimit();
/**
* Test methods to set and get data fields
**/

View file

@ -149,6 +149,70 @@ public final class ICUResourceBundleCollationTest extends TestFmwk {
}
}
@Test
public void TestGetFunctionalEquivalentVariantLengthWithinLimit() {
    // "__" followed by 20 eight-character subtags joined by '_'
    // (5 rows x 4 subtags).
    final String withinLimit =
            "_"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678";

    // Look up the functional equivalent for the "calendar" resource/keyword.
    final ULocale actual = ICUResourceBundle.getFunctionalEquivalent(
            ICUData.ICU_BASE_NAME,
            ICUResourceBundle.ICU_DATA_CLASS_LOADER,
            "calendar",
            "calendar",
            new ULocale(withinLimit),
            new boolean[1],
            false);

    final ULocale expected = new ULocale("_@calendar=gregorian");
    if (!actual.equals(expected)) {
        errln("Get unexpected locale:" + actual.toString()
                + " while expecting " + expected.toString());
    }
}
@Test(expected = IllegalArgumentException.class)
public void TestGetFunctionalEquivalentVariantLengthOverLimit() {
    // "__" followed by 20 subtags joined by '_', the last of which carries
    // an extra trailing character (5 rows x 4 subtags).
    final String overLimit =
            "_"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678X"; // One character too long.

    // The test passes only if an IllegalArgumentException escapes from the
    // statement below; the returned locale itself is not inspected.
    ICUResourceBundle.getFunctionalEquivalent(
            ICUData.ICU_BASE_NAME,
            ICUResourceBundle.ICU_DATA_CLASS_LOADER,
            "calendar",
            "calendar",
            new ULocale(overLimit),
            new boolean[1],
            false);
}
@Test
public void TestOpen(){
UResourceBundle bundle = UResourceBundle.getBundleInstance(ICUData.ICU_COLLATION_BASE_NAME, "en_US_POSIX");

View file

@ -5438,6 +5438,63 @@ public class ULocaleTest extends CoreTestFmwk {
}
@Test
public void TestVariantLengthWithinLimit() {
    // "__" followed by 20 eight-character subtags joined by '_'
    // (5 rows x 4 subtags).
    final String withinLimit =
            "_"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678";

    final ULocale parsed = new ULocale(withinLimit);

    // The expected variant is the input without its initial "__".
    final String expectedVariant = withinLimit.substring(2);
    Assert.assertEquals(expectedVariant, parsed.getVariant());
}
@Test(expected = IllegalArgumentException.class)
public void TestVariantLengthOverLimit() {
    // "__" followed by 20 subtags joined by '_', the last of which carries
    // an extra trailing character (5 rows x 4 subtags).
    final String overLimit =
            "_"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678"
            + "_12345678" + "_12345678" + "_12345678" + "_12345678X"; // One character too long.

    // The test passes only if constructing the locale throws
    // IllegalArgumentException; the instance itself is not used.
    new ULocale(overLimit);
}
@Test
public void TestLocaleCanonicalizationFromFile() throws IOException {
BufferedReader testFile = TestUtil.getDataReader("cldr/localeIdentifiers/localeCanonicalization.txt");

View file

@ -365,6 +365,15 @@ public final class LocaleIDParser {
}
}
// Upper bound on the length of the accumulated variants.
// The legacy locale format places no strict limit on the syntax of
// variants. If the locale is constructed from a unicode_locale_id
// as defined in UTS35, then each unicode_variant_subtag has a
// maximum length of 8: (alphanum{5,8} | digit alphanum{3}).
// A limit of 179 allows 20 unicode_variant_subtag values, with
// separators, in the unicode_locale_id:
// 8*20 + 1*(20-1) = 179
private static final int MAX_VARIANTS_LENGTH = 179;
/**
* Advance index past variant, and accumulate normalized variant in buffer. This ignores
* the codepage information from POSIX ids. Index must be immediately after the country
@ -432,10 +441,12 @@ public final class LocaleIDParser {
c = UNDERSCORE;
}
append(c);
if (buffer.length() - oldBlen > MAX_VARIANTS_LENGTH) {
throw new IllegalArgumentException("variants is too long");
}
}
}
--index; // unget
return oldBlen;
}