ICU-20132 Pass ByteSink from Locale::toLanguageTag() to uloc_toLanguageTag().

This eliminates the need for a scratch buffer in Locale::toLanguageTag()
and also the need for counting the bytes required in uloc_toLanguageTag().
ByteSink now handles that counting correctly, which eliminates the bug
where the returned number of required bytes was too small.
This commit is contained in:
Fredrik Roubert 2018-10-22 22:28:20 +02:00 committed by Fredrik Roubert
parent 8b7fa6a03f
commit 59fe4f4be2
5 changed files with 159 additions and 196 deletions

View file

@ -945,59 +945,7 @@ Locale::toLanguageTag(ByteSink& sink, UErrorCode& status) const
return;
}
// All simple language tags will have the exact same length as BCP-47
// strings as they have as ICU locale IDs (like "en-US" for "en_US").
LocalMemory<char> scratch;
int32_t scratch_capacity = static_cast<int32_t>(uprv_strlen(fullName));
if (scratch_capacity == 0) {
scratch_capacity = 3; // "und"
}
char* buffer;
int32_t result_capacity, reslen;
for (;;) {
if (scratch.allocateInsteadAndReset(scratch_capacity) == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
buffer = sink.GetAppendBuffer(
/*min_capacity=*/scratch_capacity,
/*desired_capacity_hint=*/scratch_capacity,
scratch.getAlias(),
scratch_capacity,
&result_capacity);
reslen = uloc_toLanguageTag(
fullName,
buffer,
result_capacity,
/*strict=*/FALSE,
&status);
if (status != U_BUFFER_OVERFLOW_ERROR) {
break;
}
// For some very few edge cases a language tag will be longer as a
// BCP-47 string than it is as an ICU locale ID. Most notoriously "C"
// expands to the BCP-47 tag "en-US-u-va-posix", 16 times longer, and
// it'll take several calls to uloc_toLanguageTag() to figure that out.
// https://unicode-org.atlassian.net/browse/ICU-20132
scratch_capacity = reslen;
status = U_ZERO_ERROR;
}
if (U_FAILURE(status)) {
return;
}
sink.Append(buffer, reslen);
if (status == U_STRING_NOT_TERMINATED_WARNING) {
status = U_ZERO_ERROR; // Terminators not used.
}
ulocimp_toLanguageTag(fullName, sink, /*strict=*/FALSE, &status);
}
Locale U_EXPORT2

View file

@ -7,6 +7,7 @@
**********************************************************************
*/
#include "unicode/bytestream.h"
#include "unicode/utypes.h"
#include "unicode/ures.h"
#include "unicode/putil.h"
@ -850,22 +851,21 @@ _initializeULanguageTag(ULanguageTag* langtag) {
langtag->privateuse = EMPTY;
}
static int32_t
_appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) {
static void
_appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
char buf[ULOC_LANG_CAPACITY];
UErrorCode tmpStatus = U_ZERO_ERROR;
int32_t len, i;
int32_t reslen = 0;
if (U_FAILURE(*status)) {
return 0;
return;
}
len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus);
if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
return;
}
len = 0;
}
@ -873,20 +873,14 @@ _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capac
/* Note: returned language code is in lower case letters */
if (len == 0) {
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, LANG_UND, uprv_min(LANG_UND_LEN, capacity - reslen));
}
reslen += LANG_UND_LEN;
sink.Append(LANG_UND, LANG_UND_LEN);
} else if (!_isLanguageSubtag(buf, len)) {
/* invalid language code */
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
return;
}
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, LANG_UND, uprv_min(LANG_UND_LEN, capacity - reslen));
}
reslen += LANG_UND_LEN;
sink.Append(LANG_UND, LANG_UND_LEN);
} else {
/* resolve deprecated */
for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
@ -901,24 +895,18 @@ _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capac
break;
}
}
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen));
}
reslen += len;
sink.Append(buf, len);
}
u_terminateChars(appendAt, capacity, reslen, status);
return reslen;
}
static int32_t
_appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) {
static void
_appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
char buf[ULOC_SCRIPT_CAPACITY];
UErrorCode tmpStatus = U_ZERO_ERROR;
int32_t len;
int32_t reslen = 0;
if (U_FAILURE(*status)) {
return 0;
return;
}
len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus);
@ -926,7 +914,7 @@ _appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
return 0;
return;
}
if (len > 0) {
@ -935,31 +923,22 @@ _appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
return 0;
return;
} else {
if (reslen < capacity) {
*(appendAt + reslen) = SEP;
}
reslen++;
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen));
}
reslen += len;
sink.Append("-", 1);
sink.Append(buf, len);
}
}
u_terminateChars(appendAt, capacity, reslen, status);
return reslen;
}
static int32_t
_appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) {
static void
_appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
char buf[ULOC_COUNTRY_CAPACITY];
UErrorCode tmpStatus = U_ZERO_ERROR;
int32_t len;
int32_t reslen = 0;
if (U_FAILURE(*status)) {
return 0;
return;
}
len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus);
@ -967,7 +946,7 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
return 0;
return;
}
if (len > 0) {
@ -976,13 +955,10 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
return 0;
return;
} else {
if (reslen < capacity) {
*(appendAt + reslen) = SEP;
}
reslen++;
/* resolve deprecated */
sink.Append("-", 1);
/* resolve deprecated */
for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
@ -990,26 +966,19 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
break;
}
}
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen));
}
reslen += len;
sink.Append(buf, len);
}
}
u_terminateChars(appendAt, capacity, reslen, status);
return reslen;
}
static int32_t
_appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool *hadPosix, UErrorCode* status) {
static void
_appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) {
char buf[ULOC_FULLNAME_CAPACITY];
UErrorCode tmpStatus = U_ZERO_ERROR;
int32_t len, i;
int32_t reslen = 0;
if (U_FAILURE(*status)) {
return 0;
return;
}
len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
@ -1017,7 +986,7 @@ _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
return 0;
return;
}
if (len > 0) {
@ -1094,15 +1063,9 @@ _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
/* write out validated/normalized variants to the target */
var = varFirst;
while (var != NULL) {
if (reslen < capacity) {
*(appendAt + reslen) = SEP;
}
reslen++;
sink.Append("-", 1);
varLen = (int32_t)uprv_strlen(var->variant);
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, var->variant, uprv_min(varLen, capacity - reslen));
}
reslen += varLen;
sink.Append(var->variant, varLen);
var = var->next;
}
}
@ -1117,25 +1080,21 @@ _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
}
if (U_FAILURE(*status)) {
return 0;
return;
}
}
u_terminateChars(appendAt, capacity, reslen, status);
return reslen;
}
static int32_t
_appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) {
static void
_appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
int32_t attrBufLength = 0;
UEnumeration *keywordEnum = NULL;
int32_t reslen = 0;
keywordEnum = uloc_openKeywords(localeID, status);
if (U_FAILURE(*status) && !hadPosix) {
uenum_close(keywordEnum);
return 0;
return;
}
if (keywordEnum != NULL || hadPosix) {
/* reorder extensions */
@ -1378,15 +1337,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
for (ext = firstExt; ext; ext = ext->next) {
if (!startLDMLExtension && uprv_strlen(ext->key) > 1) {
/* first LDML u singleton extension */
if (reslen < capacity) {
*(appendAt + reslen) = SEP;
}
reslen++;
if (reslen < capacity) {
*(appendAt + reslen) = LDMLEXT;
}
reslen++;
sink.Append("-u", 2);
startLDMLExtension = TRUE;
}
@ -1394,35 +1345,15 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
/* write the value for the attributes */
for (attr = firstAttr; attr; attr = attr->next) {
if (reslen < capacity) {
*(appendAt + reslen) = SEP;
}
reslen++;
len = (int32_t)uprv_strlen(attr->attribute);
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, attr->attribute, uprv_min(len, capacity - reslen));
}
reslen += len;
sink.Append("-", 1);
sink.Append(
attr->attribute, uprv_strlen(attr->attribute));
}
} else {
if (reslen < capacity) {
*(appendAt + reslen) = SEP;
}
reslen++;
len = (int32_t)uprv_strlen(ext->key);
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, ext->key, uprv_min(len, capacity - reslen));
}
reslen += len;
if (reslen < capacity) {
*(appendAt + reslen) = SEP;
}
reslen++;
len = (int32_t)uprv_strlen(ext->value);
if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, ext->value, uprv_min(len, capacity - reslen));
}
reslen += len;
sink.Append("-", 1);
sink.Append(ext->key, uprv_strlen(ext->key));
sink.Append("-", 1);
sink.Append(ext->value, uprv_strlen(ext->value));
}
}
}
@ -1447,11 +1378,9 @@ cleanup:
uenum_close(keywordEnum);
if (U_FAILURE(*status)) {
return 0;
return;
}
}
return u_terminateChars(appendAt, capacity, reslen, status);
}
/**
@ -1906,17 +1835,18 @@ _appendKeywords(ULanguageTag* langtag, char* appendAt, int32_t capacity, UErrorC
return u_terminateChars(appendAt, capacity, reslen, status);
}
static int32_t
_appendPrivateuseToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) {
static void
_appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
(void)hadPosix;
char buf[ULOC_FULLNAME_CAPACITY];
char tmpAppend[ULOC_FULLNAME_CAPACITY];
UErrorCode tmpStatus = U_ZERO_ERROR;
int32_t len, i;
int32_t reslen = 0;
int32_t capacity = sizeof tmpAppend;
if (U_FAILURE(*status)) {
return 0;
return;
}
len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
@ -1924,7 +1854,7 @@ _appendPrivateuseToLanguageTag(const char* localeID, char* appendAt, int32_t cap
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
return 0;
return;
}
if (len > 0) {
@ -2008,20 +1938,14 @@ _appendPrivateuseToLanguageTag(const char* localeID, char* appendAt, int32_t cap
}
if (U_FAILURE(*status)) {
return 0;
return;
}
}
if (U_SUCCESS(*status)) {
len = reslen;
if (reslen < capacity) {
uprv_memcpy(appendAt, tmpAppend, uprv_min(len, capacity - reslen));
}
sink.Append(tmpAppend, len);
}
u_terminateChars(appendAt, capacity, reslen, status);
return reslen;
}
/*
@ -2637,6 +2561,34 @@ uloc_toLanguageTag(const char* localeID,
int32_t langtagCapacity,
UBool strict,
UErrorCode* status) {
if (U_FAILURE(*status)) {
return 0;
}
icu::CheckedArrayByteSink sink(langtag, langtagCapacity);
ulocimp_toLanguageTag(localeID, sink, strict, status);
int32_t reslen = sink.NumberOfBytesAppended();
if (U_FAILURE(*status)) {
return reslen;
}
if (sink.Overflowed()) {
*status = U_BUFFER_OVERFLOW_ERROR;
} else {
u_terminateChars(langtag, langtagCapacity, reslen, status);
}
return reslen;
}
U_CAPI void U_EXPORT2
ulocimp_toLanguageTag(const char* localeID,
icu::ByteSink& sink,
UBool strict,
UErrorCode* status) {
icu::CharString canonical;
int32_t reslen;
UErrorCode tmpStatus = U_ZERO_ERROR;
@ -2657,7 +2609,7 @@ uloc_toLanguageTag(const char* localeID,
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
return 0;
return;
}
reslen =
@ -2673,7 +2625,7 @@ uloc_toLanguageTag(const char* localeID,
if (U_FAILURE(tmpStatus)) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
return;
}
canonical.append(buffer, reslen, tmpStatus);
@ -2683,12 +2635,10 @@ uloc_toLanguageTag(const char* localeID,
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
return 0;
return;
}
}
reslen = 0;
/* For handling special case - private use only tag */
pKeywordStart = locale_getKeywordsStart(canonical.data());
if (pKeywordStart == canonical.data()) {
@ -2712,9 +2662,7 @@ uloc_toLanguageTag(const char* localeID,
if (U_SUCCESS(tmpStatus)) {
if (_isPrivateuseValueSubtags(&buf[2], len)) {
/* return private use only tag */
reslen = len + 2;
uprv_memcpy(langtag, buf, uprv_min(reslen, langtagCapacity));
u_terminateChars(langtag, langtagCapacity, reslen, status);
sink.Append(buf, len + 2);
done = TRUE;
} else if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
@ -2729,19 +2677,17 @@ uloc_toLanguageTag(const char* localeID,
}
uenum_close(kwdEnum);
if (done) {
return reslen;
return;
}
}
}
reslen += _appendLanguageToLanguageTag(canonical.data(), langtag, langtagCapacity, strict, status);
reslen += _appendScriptToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status);
reslen += _appendRegionToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status);
reslen += _appendVariantsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, &hadPosix, status);
reslen += _appendKeywordsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
reslen += _appendPrivateuseToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
return reslen;
_appendLanguageToLanguageTag(canonical.data(), sink, strict, status);
_appendScriptToLanguageTag(canonical.data(), sink, strict, status);
_appendRegionToLanguageTag(canonical.data(), sink, strict, status);
_appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status);
_appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
_appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
}

View file

@ -10,6 +10,7 @@
#ifndef ULOCIMP_H
#define ULOCIMP_H
#include "unicode/bytestream.h"
#include "unicode/uloc.h"
/**
@ -61,6 +62,31 @@ ulocimp_getCountry(const char *localeID,
char *country, int32_t countryCapacity,
const char **pEnd);
/**
* Writes a well-formed language tag for this locale ID.
*
* The tag is appended to `sink`; this function itself returns no value.
* Failures are reported through `err`.
*
* **Note**: When `strict` is FALSE, any locale fields which do not satisfy the
* BCP47 syntax requirement will be omitted from the result. When `strict` is
* TRUE, this function sets U_ILLEGAL_ARGUMENT_ERROR to the `err` if any locale
* fields do not satisfy the BCP47 syntax requirement.
*
* @param localeID the input locale ID
* @param sink the output sink receiving the BCP47 language
* tag for this Locale.
* @param strict boolean value indicating if the function returns
* an error for an ill-formed input locale ID.
* @param err error information if receiving the language
* tag failed.
*
* @internal ICU 64
*/
/* NOTE(review): declared with U_STABLE although tagged @internal, and the
 * definition uses U_CAPI -- confirm which export macro is intended. */
U_STABLE void U_EXPORT2
ulocimp_toLanguageTag(const char* localeID,
icu::ByteSink& sink,
UBool strict,
UErrorCode* err);
/**
* Returns a locale ID for the specified BCP47 language tag string.
* If the specified language tag contains any ill-formed subtags,

View file

@ -251,6 +251,7 @@ void addLocaleTest(TestNode** root)
TESTCASE(TestOrientation);
TESTCASE(TestLikelySubtags);
TESTCASE(TestToLanguageTag);
TESTCASE(TestBug20132);
TESTCASE(TestForLanguageTag);
TESTCASE(TestInvalidLanguageTag);
TESTCASE(TestLangAndRegionCanonicalize);
@ -6018,6 +6019,47 @@ static void TestToLanguageTag(void) {
}
}
static void TestBug20132(void) {
char langtag[256];
UErrorCode status;
int32_t len;
static const char inloc[] = "C";
static const char expected[] = "en-US-u-va-posix";
const int32_t expected_len = uprv_strlen(expected);
/* Before ICU-20132 was fixed, calling uloc_toLanguageTag() with a too small
* buffer would not immediately return the buffer size actually needed, but
* instead require several iterations before getting the correct size. */
status = U_ZERO_ERROR;
len = uloc_toLanguageTag(inloc, langtag, 1, FALSE, &status);
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
log_data_err("Error returned by uloc_toLanguageTag for locale id [%s] - error: %s Are you missing data?\n",
inloc, u_errorName(status));
}
if (len != expected_len) {
log_err("Bad length returned by uloc_toLanguageTag for locale id [%s]: %i != %i\n", inloc, len, expected_len);
}
status = U_ZERO_ERROR;
len = uloc_toLanguageTag(inloc, langtag, expected_len, FALSE, &status);
if (U_FAILURE(status)) {
log_data_err("Error returned by uloc_toLanguageTag for locale id [%s] - error: %s Are you missing data?\n",
inloc, u_errorName(status));
}
if (len != expected_len) {
log_err("Bad length returned by uloc_toLanguageTag for locale id [%s]: %i != %i\n", inloc, len, expected_len);
} else if (uprv_strncmp(langtag, expected, expected_len) != 0) {
log_data_err("uloc_toLanguageTag returned language tag [%.*s] for input locale [%s] - expected: [%s]. Are you missing data?\n",
len, langtag, inloc, expected);
}
}
#define FULL_LENGTH -1
static const struct {
const char *bcpID;

View file

@ -126,6 +126,7 @@ static void TestLikelySubtags(void);
static void TestForLanguageTag(void);
static void TestInvalidLanguageTag(void);
static void TestToLanguageTag(void);
static void TestBug20132(void);
static void TestLangAndRegionCanonicalize(void);
static void TestToUnicodeLocaleKey(void);