From dc70b5a056b618c014c71e8bfd45f3dd9145e9fe Mon Sep 17 00:00:00 2001 From: Fredrik Roubert Date: Wed, 3 Jan 2024 10:02:59 +0900 Subject: [PATCH] ICU-22520 Move all localeID parsing logic into new ulocimp_getSubtags(). The logic for parsing a localeID string into its constituent subtags is currently repeated over and over again in each one of the uloc_get*() functions, so that calling all these functions one after the other in order to get all the subtags does the parsing all over again from the beginning for each function call. In order to avoid having to do this parsing over and over again, a lot of code instead has its own copy of the parsing logic in order to call the underlying ulocimp_get*() functions directly for lower runtime cost at the price of increased code complexity and repetition. This new ulocimp_getSubtags() function, which writes natively to icu::ByteSink and has a convenience wrapper to write to icu::CharString, removes the repeated code from the uloc_get*() functions and makes it possible to update all code that calls the ulocimp_get*() functions. 
--- icu4c/source/common/uloc.cpp | 305 ++++++++++++++------ icu4c/source/common/ulocimp.h | 41 +++ icu4c/source/common/unicode/urename.h | 1 + icu4c/source/test/depstest/dependencies.txt | 6 + 4 files changed, 257 insertions(+), 96 deletions(-) diff --git a/icu4c/source/common/uloc.cpp b/icu4c/source/common/uloc.cpp index 25b09641a28..97fe5d442f1 100644 --- a/icu4c/source/common/uloc.cpp +++ b/icu4c/source/common/uloc.cpp @@ -30,6 +30,8 @@ l = lang, C = ctry, M = charmap, V = variant */ +#include <optional> + #include "unicode/bytestream.h" #include "unicode/errorcode.h" #include "unicode/stringpiece.h" @@ -1285,24 +1287,31 @@ ulocimp_getCountry(const char *localeID, static void _getVariant(const char *localeID, char prev, - ByteSink& sink, + ByteSink* sink, + const char** pEnd, UBool needSeparator) { UBool hasVariant = false; + if (pEnd != nullptr) { *pEnd = localeID; } /* get one or more variant tags and separate them with '_' */ if(_isIDSeparator(prev)) { /* get a variant string after a '-' or '_' */ while(!_isTerminator(*localeID)) { if (needSeparator) { - sink.Append("_", 1); + if (sink != nullptr) { + sink->Append("_", 1); + } needSeparator = false; } - char c = (char)uprv_toupper(*localeID); - if (c == '-') c = '_'; - sink.Append(&c, 1); + if (sink != nullptr) { + char c = (char)uprv_toupper(*localeID); + if (c == '-') c = '_'; + sink->Append(&c, 1); + } hasVariant = true; localeID++; } + if (pEnd != nullptr) { *pEnd = localeID; } } /* if there is no variant tag after a '-' or '_' then look for '@' */ @@ -1316,14 +1325,133 @@ _getVariant(const char *localeID, } while(!_isTerminator(*localeID)) { if (needSeparator) { - sink.Append("_", 1); + if (sink != nullptr) { + sink->Append("_", 1); + } needSeparator = false; } - char c = (char)uprv_toupper(*localeID); - if (c == '-' || c == ',') c = '_'; - sink.Append(&c, 1); + if (sink != nullptr) { + char c = (char)uprv_toupper(*localeID); + if (c == '-' || c == ',') c = '_'; + sink->Append(&c, 1); + } localeID++; } + if (pEnd 
!= nullptr) { *pEnd = localeID; } + } +} + +U_EXPORT void U_EXPORT2 +ulocimp_getSubtags( + const char* localeID, + CharString* language, + CharString* script, + CharString* region, + CharString* variant, + const char** pEnd, + UErrorCode& status) { + std::optional<CharStringByteSink> languageSink; + std::optional<CharStringByteSink> scriptSink; + std::optional<CharStringByteSink> regionSink; + std::optional<CharStringByteSink> variantSink; + + if (language != nullptr) { languageSink.emplace(language); } + if (script != nullptr) { scriptSink.emplace(script); } + if (region != nullptr) { regionSink.emplace(region); } + if (variant != nullptr) { variantSink.emplace(variant); } + + ulocimp_getSubtags( + localeID, + languageSink.has_value() ? &languageSink.value() : nullptr, + scriptSink.has_value() ? &scriptSink.value() : nullptr, + regionSink.has_value() ? &regionSink.value() : nullptr, + variantSink.has_value() ? &variantSink.value() : nullptr, + pEnd, + status); +} + +U_EXPORT void U_EXPORT2 +ulocimp_getSubtags( + const char* localeID, + ByteSink* language, + ByteSink* script, + ByteSink* region, + ByteSink* variant, + const char** pEnd, + UErrorCode& status) { + if (U_FAILURE(status)) { return; } + + if (pEnd != nullptr) { + *pEnd = localeID; + } else if (language == nullptr && + script == nullptr && + region == nullptr && + variant == nullptr) { + return; + } + + bool hasRegion = false; + + if (localeID == nullptr) { + localeID = uloc_getDefault(); + } + + { + CharString tmp = ulocimp_getLanguage(localeID, &localeID, status); + if (U_FAILURE(status)) { return; } + U_ASSERT(localeID != nullptr); + if (language != nullptr) { language->Append(tmp.data(), tmp.length()); } + } + + if (pEnd != nullptr) { + *pEnd = localeID; + } else if (script == nullptr && + region == nullptr && + variant == nullptr) { + return; + } + + if (_isIDSeparator(*localeID)) { + const char* begin = localeID + 1; + const char* end = nullptr; + CharString tmp = ulocimp_getScript(begin, &end, status); + if (U_FAILURE(status)) { return; } + U_ASSERT(end != nullptr); + if (end != 
begin) { + localeID = end; + if (script != nullptr) { script->Append(tmp.data(), tmp.length()); } + if (pEnd != nullptr) { *pEnd = localeID; } + } + } + + if (region == nullptr && variant == nullptr && pEnd == nullptr) { return; } + + if (_isIDSeparator(*localeID)) { + const char* begin = localeID + 1; + const char* end = nullptr; + CharString tmp = ulocimp_getCountry(begin, &end, status); + if (U_FAILURE(status)) { return; } + U_ASSERT(end != nullptr); + if (end != begin) { + hasRegion = true; + localeID = end; + if (region != nullptr) { region->Append(tmp.data(), tmp.length()); } + if (pEnd != nullptr) { *pEnd = localeID; } + } + } + + if (variant == nullptr && pEnd == nullptr) { return; } + + if (_isIDSeparator(*localeID) && !_isBCP47Extension(localeID)) { + /* If there was no country ID, skip a possible extra IDSeparator */ + if (!hasRegion && _isIDSeparator(localeID[1])) { + localeID++; + } + const char* begin = localeID + 1; + const char* end = nullptr; + _getVariant(begin, *localeID, variant, &end, false); + U_ASSERT(end != nullptr); + if (end != begin && pEnd != nullptr) { *pEnd = end; } } } @@ -1587,7 +1715,7 @@ _canonicalize(const char* localeID, variantSize = -tag.length(); { CharStringByteSink s(&tag); - _getVariant(tmpLocaleID+1, *tmpLocaleID, s, false); + _getVariant(tmpLocaleID+1, *tmpLocaleID, &s, nullptr, false); } variantSize += tag.length(); if (variantSize > 0) { @@ -1649,7 +1777,7 @@ _canonicalize(const char* localeID, int32_t posixVariantSize = -tag.length(); { CharStringByteSink s(&tag); - _getVariant(tmpLocaleID+1, '@', s, (UBool)(variantSize > 0)); + _getVariant(tmpLocaleID+1, '@', &s, nullptr, (UBool)(variantSize > 0)); } posixVariantSize += tag.length(); if (posixVariantSize > 0) { @@ -1755,11 +1883,28 @@ uloc_getLanguage(const char* localeID, return 0; } - if(localeID==nullptr) { - localeID=uloc_getDefault(); + CheckedArrayByteSink sink(language, languageCapacity); + ulocimp_getSubtags( + localeID, + &sink, + nullptr, + nullptr, + 
nullptr, + nullptr, + *err); + + int32_t length = sink.NumberOfBytesAppended(); + + if (U_FAILURE(*err)) { + return length; } - return ulocimp_getLanguage(localeID, nullptr, *err).extract(language, languageCapacity, *err); + if (sink.Overflowed()) { + *err = U_BUFFER_OVERFLOW_ERROR; + return length; + } + + return u_terminateChars(language, languageCapacity, length, err); } U_CAPI int32_t U_EXPORT2 @@ -1772,20 +1917,28 @@ uloc_getScript(const char* localeID, return 0; } - if(localeID==nullptr) { - localeID=uloc_getDefault(); - } + CheckedArrayByteSink sink(script, scriptCapacity); + ulocimp_getSubtags( + localeID, + nullptr, + &sink, + nullptr, + nullptr, + nullptr, + *err); + + int32_t length = sink.NumberOfBytesAppended(); - /* skip the language */ - ulocimp_getLanguage(localeID, &localeID, *err); if (U_FAILURE(*err)) { - return 0; + return length; } - if(_isIDSeparator(*localeID)) { - return ulocimp_getScript(localeID+1, nullptr, *err).extract(script, scriptCapacity, *err); + if (sink.Overflowed()) { + *err = U_BUFFER_OVERFLOW_ERROR; + return length; } - return u_terminateChars(script, scriptCapacity, 0, err); + + return u_terminateChars(script, scriptCapacity, length, err); } U_CAPI int32_t U_EXPORT2 @@ -1798,32 +1951,28 @@ uloc_getCountry(const char* localeID, return 0; } - if(localeID==nullptr) { - localeID=uloc_getDefault(); - } + CheckedArrayByteSink sink(country, countryCapacity); + ulocimp_getSubtags( + localeID, + nullptr, + nullptr, + &sink, + nullptr, + nullptr, + *err); + + int32_t length = sink.NumberOfBytesAppended(); - /* Skip the language */ - ulocimp_getLanguage(localeID, &localeID, *err); if (U_FAILURE(*err)) { - return 0; + return length; } - if(_isIDSeparator(*localeID)) { - const char *scriptID; - /* Skip the script if available */ - ulocimp_getScript(localeID+1, &scriptID, *err); - if (U_FAILURE(*err)) { - return 0; - } - if(scriptID != localeID+1) { - /* Found optional script */ - localeID = scriptID; - } - if(_isIDSeparator(*localeID)) { - 
return ulocimp_getCountry(localeID+1, nullptr, *err).extract(country, countryCapacity, *err); - } + if (sink.Overflowed()) { + *err = U_BUFFER_OVERFLOW_ERROR; + return length; } - return u_terminateChars(country, countryCapacity, 0, err); + + return u_terminateChars(country, countryCapacity, length, err); } U_CAPI int32_t U_EXPORT2 @@ -1832,68 +1981,32 @@ uloc_getVariant(const char* localeID, int32_t variantCapacity, UErrorCode* err) { - int32_t i=0; - if(err==nullptr || U_FAILURE(*err)) { return 0; } - if (localeID == nullptr) { - localeID = uloc_getDefault(); - } + CheckedArrayByteSink sink(variant, variantCapacity); + ulocimp_getSubtags( + localeID, + nullptr, + nullptr, + nullptr, + &sink, + nullptr, + *err); + + int32_t length = sink.NumberOfBytesAppended(); - /* Skip the language */ - ulocimp_getLanguage(localeID, &localeID, *err); if (U_FAILURE(*err)) { - return 0; + return length; } - if (_isIDSeparator(*localeID)) { - const char *scriptID; - /* Skip the script if available */ - ulocimp_getScript(localeID+1, &scriptID, *err); - if (U_FAILURE(*err)) { - return 0; - } - if (scriptID != localeID+1) { - /* Found optional script */ - localeID = scriptID; - } - /* Skip the Country */ - if (_isIDSeparator(*localeID)) { - const char *cntryID; - ulocimp_getCountry(localeID+1, &cntryID, *err); - if (U_FAILURE(*err)) { - return 0; - } - if (cntryID != localeID+1) { - /* Found optional country */ - localeID = cntryID; - } - if (_isIDSeparator(*localeID) && !_isBCP47Extension(localeID)) { - /* If there was no country ID, skip a possible extra IDSeparator */ - if (localeID != cntryID && _isIDSeparator(localeID[1])) { - localeID++; - } - - CheckedArrayByteSink sink(variant, variantCapacity); - _getVariant(localeID+1, *localeID, sink, false); - - i = sink.NumberOfBytesAppended(); - - if (U_FAILURE(*err)) { - return i; - } - - if (sink.Overflowed()) { - *err = U_BUFFER_OVERFLOW_ERROR; - return i; - } - } - } + if (sink.Overflowed()) { + *err = U_BUFFER_OVERFLOW_ERROR; + 
return length; } - return u_terminateChars(variant, variantCapacity, i, err); + return u_terminateChars(variant, variantCapacity, length, err); } U_CAPI int32_t U_EXPORT2 diff --git a/icu4c/source/common/ulocimp.h b/icu4c/source/common/ulocimp.h index 37ca223e16f..a129e3ac91a 100644 --- a/icu4c/source/common/ulocimp.h +++ b/icu4c/source/common/ulocimp.h @@ -10,6 +10,8 @@ #ifndef ULOCIMP_H #define ULOCIMP_H +#include <cstddef> + #include "unicode/bytestream.h" #include "unicode/uloc.h" @@ -105,6 +107,45 @@ ulocimp_setKeywordValue(const char* keywords, icu::ByteSink& sink, UErrorCode* status); +U_EXPORT void U_EXPORT2 +ulocimp_getSubtags( + const char* localeID, + icu::CharString* language, + icu::CharString* script, + icu::CharString* region, + icu::CharString* variant, + const char** pEnd, + UErrorCode& status); + +U_EXPORT void U_EXPORT2 +ulocimp_getSubtags( + const char* localeID, + icu::ByteSink* language, + icu::ByteSink* script, + icu::ByteSink* region, + icu::ByteSink* variant, + const char** pEnd, + UErrorCode& status); + +inline void U_EXPORT2 +ulocimp_getSubtags( + const char* localeID, + std::nullptr_t, + std::nullptr_t, + std::nullptr_t, + std::nullptr_t, + const char** pEnd, + UErrorCode& status) { + ulocimp_getSubtags( + localeID, + static_cast<icu::ByteSink*>(nullptr), + static_cast<icu::ByteSink*>(nullptr), + static_cast<icu::ByteSink*>(nullptr), + static_cast<icu::ByteSink*>(nullptr), + pEnd, + status); +} + U_CAPI void U_EXPORT2 ulocimp_getParent(const char* localeID, icu::ByteSink& sink, diff --git a/icu4c/source/common/unicode/urename.h b/icu4c/source/common/unicode/urename.h index 9f042b9a261..6c8b32a344f 100644 --- a/icu4c/source/common/unicode/urename.h +++ b/icu4c/source/common/unicode/urename.h @@ -1201,6 +1201,7 @@ #define ulocimp_getParent U_ICU_ENTRY_POINT_RENAME(ulocimp_getParent) #define ulocimp_getRegionForSupplementalData U_ICU_ENTRY_POINT_RENAME(ulocimp_getRegionForSupplementalData) #define ulocimp_getScript U_ICU_ENTRY_POINT_RENAME(ulocimp_getScript) +#define ulocimp_getSubtags 
U_ICU_ENTRY_POINT_RENAME(ulocimp_getSubtags) #define ulocimp_isCanonicalizedLocaleForTest U_ICU_ENTRY_POINT_RENAME(ulocimp_isCanonicalizedLocaleForTest) #define ulocimp_minimizeSubtags U_ICU_ENTRY_POINT_RENAME(ulocimp_minimizeSubtags) #define ulocimp_setKeywordValue U_ICU_ENTRY_POINT_RENAME(ulocimp_setKeywordValue) diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt index e0484f76329..ebe8bcecffb 100644 --- a/icu4c/source/test/depstest/dependencies.txt +++ b/icu4c/source/test/depstest/dependencies.txt @@ -139,6 +139,12 @@ group: cplusplus # "Calls the current terminate handler." std::terminate() + # ICU4C doesn't actually use C++ exceptions, but the standard library does, + # so these symbols can end up in debug builds. + "std::exception::~exception()" + "typeinfo for std::exception" + "vtable for std::exception" + group: iostream "std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)" "std::basic_ios<char, std::char_traits<char> >::eof() const"