ICU-22520 Move all localeID parsing logic into new ulocimp_getSubtags().

The logic for parsing a localeID string into its constituent subtags is
currently repeated over and over again in each one of the uloc_get*()
functions, so that calling all these functions one after the other in
order to get all the subtags does the parsing all over again from the
beginning for each function call.

In order to avoid having to do this parsing over and over again, a lot
of code instead has its own copy of the parsing logic in order to call
the underlying ulocimp_get*() functions directly for lower runtime cost
at the price of increased code complexity and repetition.

This new ulocimp_getSubtags() function, which writes natively to
icu::ByteSink and has a convenience wrapper to write to icu::CharString,
removes the repeated code from the uloc_get*() functions and makes it
possible to update all code that calls the ulocimp_get*() functions.
This commit is contained in:
Fredrik Roubert 2024-01-03 10:02:59 +09:00 committed by Fredrik Roubert
parent 678d5c1273
commit dc70b5a056
4 changed files with 257 additions and 96 deletions

View file

@ -30,6 +30,8 @@
l = lang, C = ctry, M = charmap, V = variant
*/
#include <optional>
#include "unicode/bytestream.h"
#include "unicode/errorcode.h"
#include "unicode/stringpiece.h"
@ -1285,24 +1287,31 @@ ulocimp_getCountry(const char *localeID,
static void
_getVariant(const char *localeID,
char prev,
ByteSink& sink,
ByteSink* sink,
const char** pEnd,
UBool needSeparator) {
UBool hasVariant = false;
if (pEnd != nullptr) { *pEnd = localeID; }
/* get one or more variant tags and separate them with '_' */
if(_isIDSeparator(prev)) {
/* get a variant string after a '-' or '_' */
while(!_isTerminator(*localeID)) {
if (needSeparator) {
sink.Append("_", 1);
if (sink != nullptr) {
sink->Append("_", 1);
}
needSeparator = false;
}
char c = (char)uprv_toupper(*localeID);
if (c == '-') c = '_';
sink.Append(&c, 1);
if (sink != nullptr) {
char c = (char)uprv_toupper(*localeID);
if (c == '-') c = '_';
sink->Append(&c, 1);
}
hasVariant = true;
localeID++;
}
if (pEnd != nullptr) { *pEnd = localeID; }
}
/* if there is no variant tag after a '-' or '_' then look for '@' */
@ -1316,14 +1325,133 @@ _getVariant(const char *localeID,
}
while(!_isTerminator(*localeID)) {
if (needSeparator) {
sink.Append("_", 1);
if (sink != nullptr) {
sink->Append("_", 1);
}
needSeparator = false;
}
char c = (char)uprv_toupper(*localeID);
if (c == '-' || c == ',') c = '_';
sink.Append(&c, 1);
if (sink != nullptr) {
char c = (char)uprv_toupper(*localeID);
if (c == '-' || c == ',') c = '_';
sink->Append(&c, 1);
}
localeID++;
}
if (pEnd != nullptr) { *pEnd = localeID; }
}
}
/**
 * Convenience overload of ulocimp_getSubtags() that delivers the subtags to
 * icu::CharString objects instead of icu::ByteSink objects.
 *
 * Each non-null CharString is wrapped in a CharStringByteSink and the work is
 * delegated to the ByteSink overload; a null CharString is forwarded as a
 * null sink.
 *
 * @param localeID the locale ID to parse; forwarded unchanged.
 * @param language destination for the language subtag, or nullptr.
 * @param script   destination for the script subtag, or nullptr.
 * @param region   destination for the region subtag, or nullptr.
 * @param variant  destination for the variant subtag, or nullptr.
 * @param pEnd     forwarded unchanged to the ByteSink overload.
 * @param status   forwarded unchanged to the ByteSink overload.
 */
U_EXPORT void U_EXPORT2
ulocimp_getSubtags(
        const char* localeID,
        CharString* language,
        CharString* script,
        CharString* region,
        CharString* variant,
        const char** pEnd,
        UErrorCode& status) {
    // A sink is only constructed for the outputs the caller asked for.
    std::optional<CharStringByteSink> languageSink;
    std::optional<CharStringByteSink> scriptSink;
    std::optional<CharStringByteSink> regionSink;
    std::optional<CharStringByteSink> variantSink;

    if (language != nullptr) { languageSink.emplace(language); }
    if (script != nullptr) { scriptSink.emplace(script); }
    if (region != nullptr) { regionSink.emplace(region); }
    if (variant != nullptr) { variantSink.emplace(variant); }

    // The engaged state is tested explicitly, so operator* is used rather than
    // value(): value() contains a throwing path (std::bad_optional_access)
    // that can never be taken here.
    ulocimp_getSubtags(
            localeID,
            languageSink.has_value() ? &*languageSink : nullptr,
            scriptSink.has_value() ? &*scriptSink : nullptr,
            regionSink.has_value() ? &*regionSink : nullptr,
            variantSink.has_value() ? &*variantSink : nullptr,
            pEnd,
            status);
}
/**
 * Parses a locale ID into its language, script, region and variant subtags
 * in a single pass, appending each requested subtag to the matching sink.
 *
 * Any of the sink pointers may be nullptr; that subtag is then still parsed
 * when needed to locate the following ones, but it is not written anywhere.
 * Parsing stops as soon as no remaining output has been requested.
 *
 * @param localeID the locale ID to parse; nullptr selects uloc_getDefault().
 * @param language sink for the language subtag, or nullptr.
 * @param script   sink for the script subtag, or nullptr.
 * @param region   sink for the region subtag, or nullptr.
 * @param variant  sink for the variant subtag, or nullptr.
 * @param pEnd     if not nullptr, updated after each subtag that is found to
 *                 point just past it.
 * @param status   ICU error code; nothing is done if it already indicates a
 *                 failure on entry, and parsing is abandoned as soon as one
 *                 of the ulocimp_get*() helpers reports a failure.
 */
U_EXPORT void U_EXPORT2
ulocimp_getSubtags(
const char* localeID,
ByteSink* language,
ByteSink* script,
ByteSink* region,
ByteSink* variant,
const char** pEnd,
UErrorCode& status) {
if (U_FAILURE(status)) { return; }
// Note: *pEnd is initialised from the argument as passed in, i.e. before a
// nullptr localeID is replaced by the default locale below.
if (pEnd != nullptr) {
*pEnd = localeID;
} else if (language == nullptr &&
script == nullptr &&
region == nullptr &&
variant == nullptr) {
// No output of any kind was requested, so there is nothing to do.
return;
}
// Records whether a region subtag was found; consulted when looking for the
// variant further down.
bool hasRegion = false;
if (localeID == nullptr) {
localeID = uloc_getDefault();
}
{
// ulocimp_getLanguage() advances localeID past the language subtag.
CharString tmp = ulocimp_getLanguage(localeID, &localeID, status);
if (U_FAILURE(status)) { return; }
U_ASSERT(localeID != nullptr);
if (language != nullptr) { language->Append(tmp.data(), tmp.length()); }
}
if (pEnd != nullptr) {
*pEnd = localeID;
} else if (script == nullptr &&
region == nullptr &&
variant == nullptr) {
// Only the language was wanted.
return;
}
// Script: only attempted when a separator follows the language.
if (_isIDSeparator(*localeID)) {
const char* begin = localeID + 1;
const char* end = nullptr;
CharString tmp = ulocimp_getScript(begin, &end, status);
if (U_FAILURE(status)) { return; }
U_ASSERT(end != nullptr);
// end == begin means no script was found; localeID then stays on the
// separator so that the region step below examines the same position.
if (end != begin) {
localeID = end;
if (script != nullptr) { script->Append(tmp.data(), tmp.length()); }
if (pEnd != nullptr) { *pEnd = localeID; }
}
}
if (region == nullptr && variant == nullptr && pEnd == nullptr) { return; }
// Region: same pattern as for the script.
if (_isIDSeparator(*localeID)) {
const char* begin = localeID + 1;
const char* end = nullptr;
CharString tmp = ulocimp_getCountry(begin, &end, status);
if (U_FAILURE(status)) { return; }
U_ASSERT(end != nullptr);
if (end != begin) {
hasRegion = true;
localeID = end;
if (region != nullptr) { region->Append(tmp.data(), tmp.length()); }
if (pEnd != nullptr) { *pEnd = localeID; }
}
}
if (variant == nullptr && pEnd == nullptr) { return; }
// Variant: skipped when _isBCP47Extension() matches at this position
// (presumably a BCP 47 extension rather than a variant follows; confirm
// against the definition of _isBCP47Extension()).
if (_isIDSeparator(*localeID) && !_isBCP47Extension(localeID)) {
/* If there was no country ID, skip a possible extra IDSeparator */
if (!hasRegion && _isIDSeparator(localeID[1])) {
localeID++;
}
const char* begin = localeID + 1;
const char* end = nullptr;
// The separator character itself is passed as `prev`; `variant` may be
// nullptr, in which case _getVariant() only reports where it stopped.
_getVariant(begin, *localeID, variant, &end, false);
U_ASSERT(end != nullptr);
if (end != begin && pEnd != nullptr) { *pEnd = end; }
}
}
@ -1587,7 +1715,7 @@ _canonicalize(const char* localeID,
variantSize = -tag.length();
{
CharStringByteSink s(&tag);
_getVariant(tmpLocaleID+1, *tmpLocaleID, s, false);
_getVariant(tmpLocaleID+1, *tmpLocaleID, &s, nullptr, false);
}
variantSize += tag.length();
if (variantSize > 0) {
@ -1649,7 +1777,7 @@ _canonicalize(const char* localeID,
int32_t posixVariantSize = -tag.length();
{
CharStringByteSink s(&tag);
_getVariant(tmpLocaleID+1, '@', s, (UBool)(variantSize > 0));
_getVariant(tmpLocaleID+1, '@', &s, nullptr, (UBool)(variantSize > 0));
}
posixVariantSize += tag.length();
if (posixVariantSize > 0) {
@ -1755,11 +1883,28 @@ uloc_getLanguage(const char* localeID,
return 0;
}
if(localeID==nullptr) {
localeID=uloc_getDefault();
CheckedArrayByteSink sink(language, languageCapacity);
ulocimp_getSubtags(
localeID,
&sink,
nullptr,
nullptr,
nullptr,
nullptr,
*err);
int32_t length = sink.NumberOfBytesAppended();
if (U_FAILURE(*err)) {
return length;
}
return ulocimp_getLanguage(localeID, nullptr, *err).extract(language, languageCapacity, *err);
if (sink.Overflowed()) {
*err = U_BUFFER_OVERFLOW_ERROR;
return length;
}
return u_terminateChars(language, languageCapacity, length, err);
}
U_CAPI int32_t U_EXPORT2
@ -1772,20 +1917,28 @@ uloc_getScript(const char* localeID,
return 0;
}
if(localeID==nullptr) {
localeID=uloc_getDefault();
}
CheckedArrayByteSink sink(script, scriptCapacity);
ulocimp_getSubtags(
localeID,
nullptr,
&sink,
nullptr,
nullptr,
nullptr,
*err);
int32_t length = sink.NumberOfBytesAppended();
/* skip the language */
ulocimp_getLanguage(localeID, &localeID, *err);
if (U_FAILURE(*err)) {
return 0;
return length;
}
if(_isIDSeparator(*localeID)) {
return ulocimp_getScript(localeID+1, nullptr, *err).extract(script, scriptCapacity, *err);
if (sink.Overflowed()) {
*err = U_BUFFER_OVERFLOW_ERROR;
return length;
}
return u_terminateChars(script, scriptCapacity, 0, err);
return u_terminateChars(script, scriptCapacity, length, err);
}
U_CAPI int32_t U_EXPORT2
@ -1798,32 +1951,28 @@ uloc_getCountry(const char* localeID,
return 0;
}
if(localeID==nullptr) {
localeID=uloc_getDefault();
}
CheckedArrayByteSink sink(country, countryCapacity);
ulocimp_getSubtags(
localeID,
nullptr,
nullptr,
&sink,
nullptr,
nullptr,
*err);
int32_t length = sink.NumberOfBytesAppended();
/* Skip the language */
ulocimp_getLanguage(localeID, &localeID, *err);
if (U_FAILURE(*err)) {
return 0;
return length;
}
if(_isIDSeparator(*localeID)) {
const char *scriptID;
/* Skip the script if available */
ulocimp_getScript(localeID+1, &scriptID, *err);
if (U_FAILURE(*err)) {
return 0;
}
if(scriptID != localeID+1) {
/* Found optional script */
localeID = scriptID;
}
if(_isIDSeparator(*localeID)) {
return ulocimp_getCountry(localeID+1, nullptr, *err).extract(country, countryCapacity, *err);
}
if (sink.Overflowed()) {
*err = U_BUFFER_OVERFLOW_ERROR;
return length;
}
return u_terminateChars(country, countryCapacity, 0, err);
return u_terminateChars(country, countryCapacity, length, err);
}
U_CAPI int32_t U_EXPORT2
@ -1832,68 +1981,32 @@ uloc_getVariant(const char* localeID,
int32_t variantCapacity,
UErrorCode* err)
{
int32_t i=0;
if(err==nullptr || U_FAILURE(*err)) {
return 0;
}
if (localeID == nullptr) {
localeID = uloc_getDefault();
}
CheckedArrayByteSink sink(variant, variantCapacity);
ulocimp_getSubtags(
localeID,
nullptr,
nullptr,
nullptr,
&sink,
nullptr,
*err);
int32_t length = sink.NumberOfBytesAppended();
/* Skip the language */
ulocimp_getLanguage(localeID, &localeID, *err);
if (U_FAILURE(*err)) {
return 0;
return length;
}
if (_isIDSeparator(*localeID)) {
const char *scriptID;
/* Skip the script if available */
ulocimp_getScript(localeID+1, &scriptID, *err);
if (U_FAILURE(*err)) {
return 0;
}
if (scriptID != localeID+1) {
/* Found optional script */
localeID = scriptID;
}
/* Skip the Country */
if (_isIDSeparator(*localeID)) {
const char *cntryID;
ulocimp_getCountry(localeID+1, &cntryID, *err);
if (U_FAILURE(*err)) {
return 0;
}
if (cntryID != localeID+1) {
/* Found optional country */
localeID = cntryID;
}
if (_isIDSeparator(*localeID) && !_isBCP47Extension(localeID)) {
/* If there was no country ID, skip a possible extra IDSeparator */
if (localeID != cntryID && _isIDSeparator(localeID[1])) {
localeID++;
}
CheckedArrayByteSink sink(variant, variantCapacity);
_getVariant(localeID+1, *localeID, sink, false);
i = sink.NumberOfBytesAppended();
if (U_FAILURE(*err)) {
return i;
}
if (sink.Overflowed()) {
*err = U_BUFFER_OVERFLOW_ERROR;
return i;
}
}
}
if (sink.Overflowed()) {
*err = U_BUFFER_OVERFLOW_ERROR;
return length;
}
return u_terminateChars(variant, variantCapacity, i, err);
return u_terminateChars(variant, variantCapacity, length, err);
}
U_CAPI int32_t U_EXPORT2

View file

@ -10,6 +10,8 @@
#ifndef ULOCIMP_H
#define ULOCIMP_H
#include <cstddef>
#include "unicode/bytestream.h"
#include "unicode/uloc.h"
@ -105,6 +107,45 @@ ulocimp_setKeywordValue(const char* keywords,
icu::ByteSink& sink,
UErrorCode* status);
/**
 * Parses a locale ID into its subtags, writing each requested subtag to an
 * icu::CharString. Convenience overload of the icu::ByteSink variant.
 *
 * @param localeID the locale ID to parse.
 * @param language destination for the language subtag, or nullptr to skip it.
 * @param script   destination for the script subtag, or nullptr to skip it.
 * @param region   destination for the region subtag, or nullptr to skip it.
 * @param variant  destination for the variant subtag, or nullptr to skip it.
 * @param pEnd     if not nullptr, receives the position where parsing ended.
 * @param status   ICU error code.
 */
U_EXPORT void U_EXPORT2
ulocimp_getSubtags(
const char* localeID,
icu::CharString* language,
icu::CharString* script,
icu::CharString* region,
icu::CharString* variant,
const char** pEnd,
UErrorCode& status);
/**
 * Parses a locale ID into its subtags, appending each requested subtag to an
 * icu::ByteSink.
 *
 * @param localeID the locale ID to parse.
 * @param language sink for the language subtag, or nullptr to skip it.
 * @param script   sink for the script subtag, or nullptr to skip it.
 * @param region   sink for the region subtag, or nullptr to skip it.
 * @param variant  sink for the variant subtag, or nullptr to skip it.
 * @param pEnd     if not nullptr, receives the position where parsing ended.
 * @param status   ICU error code.
 */
U_EXPORT void U_EXPORT2
ulocimp_getSubtags(
const char* localeID,
icu::ByteSink* language,
icu::ByteSink* script,
icu::ByteSink* region,
icu::ByteSink* variant,
const char** pEnd,
UErrorCode& status);
/**
 * Overload chosen when all four subtag outputs are given as literal nullptr,
 * i.e. when the caller is only interested in pEnd.
 *
 * A bare nullptr converts equally well to icu::CharString* and to
 * icu::ByteSink*, so without this overload such a call would not pick one of
 * the two pointer overloads; it forwards to the icu::ByteSink one with typed
 * null pointers.
 *
 * @param localeID the locale ID to parse; forwarded unchanged.
 * @param pEnd     forwarded unchanged.
 * @param status   forwarded unchanged.
 */
inline void U_EXPORT2
ulocimp_getSubtags(
        const char* localeID,
        std::nullptr_t,
        std::nullptr_t,
        std::nullptr_t,
        std::nullptr_t,
        const char** pEnd,
        UErrorCode& status) {
    // One typed null pointer, reused for every subtag output.
    icu::ByteSink* const noSink = nullptr;
    ulocimp_getSubtags(localeID, noSink, noSink, noSink, noSink, pEnd, status);
}
U_CAPI void U_EXPORT2
ulocimp_getParent(const char* localeID,
icu::ByteSink& sink,

View file

@ -1201,6 +1201,7 @@
#define ulocimp_getParent U_ICU_ENTRY_POINT_RENAME(ulocimp_getParent)
#define ulocimp_getRegionForSupplementalData U_ICU_ENTRY_POINT_RENAME(ulocimp_getRegionForSupplementalData)
#define ulocimp_getScript U_ICU_ENTRY_POINT_RENAME(ulocimp_getScript)
#define ulocimp_getSubtags U_ICU_ENTRY_POINT_RENAME(ulocimp_getSubtags)
#define ulocimp_isCanonicalizedLocaleForTest U_ICU_ENTRY_POINT_RENAME(ulocimp_isCanonicalizedLocaleForTest)
#define ulocimp_minimizeSubtags U_ICU_ENTRY_POINT_RENAME(ulocimp_minimizeSubtags)
#define ulocimp_setKeywordValue U_ICU_ENTRY_POINT_RENAME(ulocimp_setKeywordValue)

View file

@ -139,6 +139,12 @@ group: cplusplus
# "Calls the current terminate handler."
std::terminate()
# ICU4C doesn't actually use C++ exceptions, but the standard library does,
# so these symbols can end up in debug builds.
"std::exception::~exception()"
"typeinfo for std::exception"
"vtable for std::exception"
group: iostream
"std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)"
"std::basic_ios<char, std::char_traits<char> >::eof() const"