diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp index e0dcc8a88ec..90b4a4bc87b 100644 --- a/icu4c/source/common/locid.cpp +++ b/icu4c/source/common/locid.cpp @@ -945,59 +945,7 @@ Locale::toLanguageTag(ByteSink& sink, UErrorCode& status) const return; } - // All simple language tags will have the exact same length as BCP-47 - // strings as they have as ICU locale IDs (like "en-US" for "en_US"). - LocalMemory scratch; - int32_t scratch_capacity = static_cast(uprv_strlen(fullName)); - - if (scratch_capacity == 0) { - scratch_capacity = 3; // "und" - } - - char* buffer; - int32_t result_capacity, reslen; - - for (;;) { - if (scratch.allocateInsteadAndReset(scratch_capacity) == nullptr) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - - buffer = sink.GetAppendBuffer( - /*min_capacity=*/scratch_capacity, - /*desired_capacity_hint=*/scratch_capacity, - scratch.getAlias(), - scratch_capacity, - &result_capacity); - - reslen = uloc_toLanguageTag( - fullName, - buffer, - result_capacity, - /*strict=*/FALSE, - &status); - - if (status != U_BUFFER_OVERFLOW_ERROR) { - break; - } - - // For some very few edge cases a language tag will be longer as a - // BCP-47 string than it is as an ICU locale ID. Most notoriously "C" - // expands to the BCP-47 tag "en-US-u-va-posix", 16 times longer, and - // it'll take several calls to uloc_toLanguageTag() to figure that out. - // https://unicode-org.atlassian.net/browse/ICU-20132 - scratch_capacity = reslen; - status = U_ZERO_ERROR; - } - - if (U_FAILURE(status)) { - return; - } - - sink.Append(buffer, reslen); - if (status == U_STRING_NOT_TERMINATED_WARNING) { - status = U_ZERO_ERROR; // Terminators not used. 
- } + ulocimp_toLanguageTag(fullName, sink, /*strict=*/FALSE, &status); } Locale U_EXPORT2 diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index 83d79160e8f..8146ec3077a 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ -7,6 +7,7 @@ ********************************************************************** */ +#include "unicode/bytestream.h" #include "unicode/utypes.h" #include "unicode/ures.h" #include "unicode/putil.h" @@ -850,22 +851,21 @@ _initializeULanguageTag(ULanguageTag* langtag) { langtag->privateuse = EMPTY; } -static int32_t -_appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { +static void +_appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { char buf[ULOC_LANG_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len, i; - int32_t reslen = 0; if (U_FAILURE(*status)) { - return 0; + return; } len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; + return; } len = 0; } @@ -873,20 +873,14 @@ _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capac /* Note: returned language code is in lower case letters */ if (len == 0) { - if (reslen < capacity) { - uprv_memcpy(appendAt + reslen, LANG_UND, uprv_min(LANG_UND_LEN, capacity - reslen)); - } - reslen += LANG_UND_LEN; + sink.Append(LANG_UND, LANG_UND_LEN); } else if (!_isLanguageSubtag(buf, len)) { /* invalid language code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; + return; } - if (reslen < capacity) { - uprv_memcpy(appendAt + reslen, LANG_UND, uprv_min(LANG_UND_LEN, capacity - reslen)); - } - reslen += LANG_UND_LEN; + sink.Append(LANG_UND, LANG_UND_LEN); } else { /* resolve deprecated */ for (i = 0; i < 
UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) { @@ -901,24 +895,18 @@ _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capac break; } } - if (reslen < capacity) { - uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); - } - reslen += len; + sink.Append(buf, len); } - u_terminateChars(appendAt, capacity, reslen, status); - return reslen; } -static int32_t -_appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { +static void +_appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { char buf[ULOC_SCRIPT_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len; - int32_t reslen = 0; if (U_FAILURE(*status)) { - return 0; + return; } len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus); @@ -926,7 +914,7 @@ _appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacit if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } - return 0; + return; } if (len > 0) { @@ -935,31 +923,22 @@ _appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacit if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } - return 0; + return; } else { - if (reslen < capacity) { - *(appendAt + reslen) = SEP; - } - reslen++; - if (reslen < capacity) { - uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); - } - reslen += len; + sink.Append("-", 1); + sink.Append(buf, len); } } - u_terminateChars(appendAt, capacity, reslen, status); - return reslen; } -static int32_t -_appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { +static void +_appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) { char buf[ULOC_COUNTRY_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len; - int32_t reslen = 0; if (U_FAILURE(*status)) { - return 0; + return; } len = uloc_getCountry(localeID, buf, 
sizeof(buf), &tmpStatus); @@ -967,7 +946,7 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } - return 0; + return; } if (len > 0) { @@ -976,13 +955,10 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } - return 0; + return; } else { - if (reslen < capacity) { - *(appendAt + reslen) = SEP; - } - reslen++; - /* resolve deprecated */ + sink.Append("-", 1); + /* resolve deprecated */ for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) { if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) { uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]); @@ -990,26 +966,19 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit break; } } - - if (reslen < capacity) { - uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); - } - reslen += len; + sink.Append(buf, len); } } - u_terminateChars(appendAt, capacity, reslen, status); - return reslen; } -static int32_t -_appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool *hadPosix, UErrorCode* status) { +static void +_appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) { char buf[ULOC_FULLNAME_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len, i; - int32_t reslen = 0; if (U_FAILURE(*status)) { - return 0; + return; } len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus); @@ -1017,7 +986,7 @@ _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capac if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; } - return 0; + return; } if (len > 0) { @@ -1094,15 +1063,9 @@ _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capac /* write out validated/normalized variants to the target */ var = varFirst; while (var != NULL) { - if (reslen < 
capacity) { - *(appendAt + reslen) = SEP; - } - reslen++; + sink.Append("-", 1); varLen = (int32_t)uprv_strlen(var->variant); - if (reslen < capacity) { - uprv_memcpy(appendAt + reslen, var->variant, uprv_min(varLen, capacity - reslen)); - } - reslen += varLen; + sink.Append(var->variant, varLen); var = var->next; } } @@ -1117,25 +1080,21 @@ _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capac } if (U_FAILURE(*status)) { - return 0; + return; } } - - u_terminateChars(appendAt, capacity, reslen, status); - return reslen; } -static int32_t -_appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) { +static void +_appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) { char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 }; int32_t attrBufLength = 0; UEnumeration *keywordEnum = NULL; - int32_t reslen = 0; keywordEnum = uloc_openKeywords(localeID, status); if (U_FAILURE(*status) && !hadPosix) { uenum_close(keywordEnum); - return 0; + return; } if (keywordEnum != NULL || hadPosix) { /* reorder extensions */ @@ -1378,15 +1337,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac for (ext = firstExt; ext; ext = ext->next) { if (!startLDMLExtension && uprv_strlen(ext->key) > 1) { /* first LDML u singlton extension */ - if (reslen < capacity) { - *(appendAt + reslen) = SEP; - } - reslen++; - if (reslen < capacity) { - *(appendAt + reslen) = LDMLEXT; - } - reslen++; - + sink.Append("-u", 2); startLDMLExtension = TRUE; } @@ -1394,35 +1345,15 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) { /* write the value for the attributes */ for (attr = firstAttr; attr; attr = attr->next) { - if (reslen < capacity) { - *(appendAt + reslen) = SEP; - } - reslen++; - len = 
(int32_t)uprv_strlen(attr->attribute); - if (reslen < capacity) { - uprv_memcpy(appendAt + reslen, attr->attribute, uprv_min(len, capacity - reslen)); - } - reslen += len; + sink.Append("-", 1); + sink.Append( + attr->attribute, uprv_strlen(attr->attribute)); } } else { - if (reslen < capacity) { - *(appendAt + reslen) = SEP; - } - reslen++; - len = (int32_t)uprv_strlen(ext->key); - if (reslen < capacity) { - uprv_memcpy(appendAt + reslen, ext->key, uprv_min(len, capacity - reslen)); - } - reslen += len; - if (reslen < capacity) { - *(appendAt + reslen) = SEP; - } - reslen++; - len = (int32_t)uprv_strlen(ext->value); - if (reslen < capacity) { - uprv_memcpy(appendAt + reslen, ext->value, uprv_min(len, capacity - reslen)); - } - reslen += len; + sink.Append("-", 1); + sink.Append(ext->key, uprv_strlen(ext->key)); + sink.Append("-", 1); + sink.Append(ext->value, uprv_strlen(ext->value)); } } } @@ -1447,11 +1378,9 @@ cleanup: uenum_close(keywordEnum); if (U_FAILURE(*status)) { - return 0; + return; } } - - return u_terminateChars(appendAt, capacity, reslen, status); } /** @@ -1906,17 +1835,18 @@ _appendKeywords(ULanguageTag* langtag, char* appendAt, int32_t capacity, UErrorC return u_terminateChars(appendAt, capacity, reslen, status); } -static int32_t -_appendPrivateuseToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) { +static void +_appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) { (void)hadPosix; char buf[ULOC_FULLNAME_CAPACITY]; char tmpAppend[ULOC_FULLNAME_CAPACITY]; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t len, i; int32_t reslen = 0; + int32_t capacity = sizeof tmpAppend; if (U_FAILURE(*status)) { - return 0; + return; } len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus); @@ -1924,7 +1854,7 @@ _appendPrivateuseToLanguageTag(const char* localeID, char* appendAt, int32_t cap if (strict) { *status = 
U_ILLEGAL_ARGUMENT_ERROR; } - return 0; + return; } if (len > 0) { @@ -2008,20 +1938,14 @@ _appendPrivateuseToLanguageTag(const char* localeID, char* appendAt, int32_t cap } if (U_FAILURE(*status)) { - return 0; + return; } } if (U_SUCCESS(*status)) { len = reslen; - if (reslen < capacity) { - uprv_memcpy(appendAt, tmpAppend, uprv_min(len, capacity - reslen)); - } + sink.Append(tmpAppend, len); } - - u_terminateChars(appendAt, capacity, reslen, status); - - return reslen; } /* @@ -2637,6 +2561,34 @@ uloc_toLanguageTag(const char* localeID, int32_t langtagCapacity, UBool strict, UErrorCode* status) { + if (U_FAILURE(*status)) { + return 0; + } + + icu::CheckedArrayByteSink sink(langtag, langtagCapacity); + ulocimp_toLanguageTag(localeID, sink, strict, status); + + int32_t reslen = sink.NumberOfBytesAppended(); + + if (U_FAILURE(*status)) { + return reslen; + } + + if (sink.Overflowed()) { + *status = U_BUFFER_OVERFLOW_ERROR; + } else { + u_terminateChars(langtag, langtagCapacity, reslen, status); + } + + return reslen; +} + + +U_CAPI void U_EXPORT2 +ulocimp_toLanguageTag(const char* localeID, + icu::ByteSink& sink, + UBool strict, + UErrorCode* status) { icu::CharString canonical; int32_t reslen; UErrorCode tmpStatus = U_ZERO_ERROR; @@ -2657,7 +2609,7 @@ uloc_toLanguageTag(const char* localeID, if (U_FAILURE(tmpStatus)) { *status = tmpStatus; - return 0; + return; } reslen = @@ -2673,7 +2625,7 @@ uloc_toLanguageTag(const char* localeID, if (U_FAILURE(tmpStatus)) { *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; + return; } canonical.append(buffer, reslen, tmpStatus); @@ -2683,12 +2635,10 @@ uloc_toLanguageTag(const char* localeID, if (U_FAILURE(tmpStatus)) { *status = tmpStatus; - return 0; + return; } } - reslen = 0; - /* For handling special case - private use only tag */ pKeywordStart = locale_getKeywordsStart(canonical.data()); if (pKeywordStart == canonical.data()) { @@ -2712,9 +2662,7 @@ uloc_toLanguageTag(const char* localeID, if (U_SUCCESS(tmpStatus)) { if 
(_isPrivateuseValueSubtags(&buf[2], len)) { /* return private use only tag */ - reslen = len + 2; - uprv_memcpy(langtag, buf, uprv_min(reslen, langtagCapacity)); - u_terminateChars(langtag, langtagCapacity, reslen, status); + sink.Append(buf, len + 2); done = TRUE; } else if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; @@ -2729,19 +2677,17 @@ uloc_toLanguageTag(const char* localeID, } uenum_close(kwdEnum); if (done) { - return reslen; + return; } } } - reslen += _appendLanguageToLanguageTag(canonical.data(), langtag, langtagCapacity, strict, status); - reslen += _appendScriptToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status); - reslen += _appendRegionToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status); - reslen += _appendVariantsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, &hadPosix, status); - reslen += _appendKeywordsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status); - reslen += _appendPrivateuseToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status); - - return reslen; + _appendLanguageToLanguageTag(canonical.data(), sink, strict, status); + _appendScriptToLanguageTag(canonical.data(), sink, strict, status); + _appendRegionToLanguageTag(canonical.data(), sink, strict, status); + _appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status); + _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status); + _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status); } diff --git a/icu4c/source/common/ulocimp.h b/icu4c/source/common/ulocimp.h index aa2090ebba6..0e69a22e7aa 100644 --- a/icu4c/source/common/ulocimp.h +++ b/icu4c/source/common/ulocimp.h @@ -10,6 +10,7 @@ #ifndef ULOCIMP_H #define ULOCIMP_H +#include "unicode/bytestream.h" #include "unicode/uloc.h" /** @@ -61,6 +62,31 @@ 
ulocimp_getCountry(const char *localeID, char *country, int32_t countryCapacity, const char **pEnd); +/** + * Writes a well-formed language tag for this locale ID. + * + * **Note**: When `strict` is FALSE, any locale fields which do not satisfy the + * BCP47 syntax requirement will be omitted from the result. When `strict` is + * TRUE, this function sets U_ILLEGAL_ARGUMENT_ERROR to the `err` if any locale + * fields do not satisfy the BCP47 syntax requirement. + * + * @param localeID the input locale ID + * @param sink the output sink receiving the BCP47 language + * tag for this Locale. + * @param strict boolean value indicating if the function returns + * an error for an ill-formed input locale ID. + * @param err error information if receiving the language + * tag failed. + * @note Returns no value; the language tag is written to `sink`. + * + * @internal ICU 64 + */ +U_STABLE void U_EXPORT2 +ulocimp_toLanguageTag(const char* localeID, + icu::ByteSink& sink, + UBool strict, + UErrorCode* err); + /** * Returns a locale ID for the specified BCP47 language tag string. 
* If the specified language tag contains any ill-formed subtags, diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 63e7269aa48..0dcc1e97ffa 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -251,6 +251,7 @@ void addLocaleTest(TestNode** root) TESTCASE(TestOrientation); TESTCASE(TestLikelySubtags); TESTCASE(TestToLanguageTag); + TESTCASE(TestBug20132); TESTCASE(TestForLanguageTag); TESTCASE(TestInvalidLanguageTag); TESTCASE(TestLangAndRegionCanonicalize); @@ -6018,6 +6019,47 @@ static void TestToLanguageTag(void) { } } +static void TestBug20132(void) { + char langtag[256]; + UErrorCode status; + int32_t len; + + static const char inloc[] = "C"; + static const char expected[] = "en-US-u-va-posix"; + const int32_t expected_len = uprv_strlen(expected); + + /* Before ICU-20132 was fixed, calling uloc_toLanguageTag() with a too small + * buffer would not immediately return the buffer size actually needed, but + * instead require several iterations before getting the correct size. 
*/ + + status = U_ZERO_ERROR; + len = uloc_toLanguageTag(inloc, langtag, 1, FALSE, &status); + + if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { + log_data_err("Error returned by uloc_toLanguageTag for locale id [%s] - error: %s Are you missing data?\n", + inloc, u_errorName(status)); + } + + if (len != expected_len) { + log_err("Bad length returned by uloc_toLanguageTag for locale id [%s]: %i != %i\n", inloc, len, expected_len); + } + + status = U_ZERO_ERROR; + len = uloc_toLanguageTag(inloc, langtag, expected_len, FALSE, &status); + + if (U_FAILURE(status)) { + log_data_err("Error returned by uloc_toLanguageTag for locale id [%s] - error: %s Are you missing data?\n", + inloc, u_errorName(status)); + } + + if (len != expected_len) { + log_err("Bad length returned by uloc_toLanguageTag for locale id [%s]: %i != %i\n", inloc, len, expected_len); + } else if (uprv_strncmp(langtag, expected, expected_len) != 0) { + log_data_err("uloc_toLanguageTag returned language tag [%.*s] for input locale [%s] - expected: [%s]. Are you missing data?\n", + len, langtag, inloc, expected); + } +} + #define FULL_LENGTH -1 static const struct { const char *bcpID; diff --git a/icu4c/source/test/cintltst/cloctst.h b/icu4c/source/test/cintltst/cloctst.h index fa117fa081d..f415dbbe278 100644 --- a/icu4c/source/test/cintltst/cloctst.h +++ b/icu4c/source/test/cintltst/cloctst.h @@ -126,6 +126,7 @@ static void TestLikelySubtags(void); static void TestForLanguageTag(void); static void TestInvalidLanguageTag(void); static void TestToLanguageTag(void); +static void TestBug20132(void); static void TestLangAndRegionCanonicalize(void); static void TestToUnicodeLocaleKey(void);