From 6942013a389b5b9436663997df07274f3ad6261d Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Tue, 23 Oct 2018 09:17:55 +0800 Subject: [PATCH] ICU-20328 Implement LocaleBuilder Design Doc: https://goo.gl/Qf12p3 --- icu4c/source/common/Makefile.in | 1 + icu4c/source/common/common.vcxproj | 2 + icu4c/source/common/common.vcxproj.filters | 6 + icu4c/source/common/common_uwp.vcxproj | 2 + icu4c/source/common/localebuilder.cpp | 436 +++++ icu4c/source/common/uloc_tag.cpp | 359 +++- icu4c/source/common/ulocimp.h | 29 + icu4c/source/common/unicode/localebuilder.h | 288 +++ icu4c/source/common/unicode/urename.h | 10 + icu4c/source/test/depstest/dependencies.txt | 6 + icu4c/source/test/intltest/Makefile.in | 2 +- icu4c/source/test/intltest/intltest.vcxproj | 4 +- .../test/intltest/intltest.vcxproj.filters | 7 +- icu4c/source/test/intltest/itutil.cpp | 2 + .../test/intltest/localebuildertest.cpp | 1627 +++++++++++++++++ .../source/test/intltest/localebuildertest.h | 51 + 16 files changed, 2728 insertions(+), 104 deletions(-) create mode 100644 icu4c/source/common/localebuilder.cpp create mode 100644 icu4c/source/common/unicode/localebuilder.h create mode 100644 icu4c/source/test/intltest/localebuildertest.cpp create mode 100644 icu4c/source/test/intltest/localebuildertest.h diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index e10d3a27d3a..e663cb8e04b 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -88,6 +88,7 @@ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \ ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o ucnv_ct.o \ resource.o uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \ ucurr.o \ +localebuilder.o \ messagepattern.o ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o locdspnm.o loclikely.o locresdata.o \ bytestream.o stringpiece.o bytesinkutil.o \ stringtriebuilder.o bytestriebuilder.o \ diff --git 
a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index eb9b456ce55..14d6e6ca617 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -256,6 +256,7 @@ + @@ -445,6 +446,7 @@ + diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 85d0d9b5eef..72fef1bd718 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -361,6 +361,9 @@ locales & resources + + locales & resources + normalization @@ -1225,5 +1228,8 @@ strings + + locales & resources + diff --git a/icu4c/source/common/common_uwp.vcxproj b/icu4c/source/common/common_uwp.vcxproj index 1265b67d6bb..af030c48ef2 100644 --- a/icu4c/source/common/common_uwp.vcxproj +++ b/icu4c/source/common/common_uwp.vcxproj @@ -383,6 +383,7 @@ + @@ -572,6 +573,7 @@ + diff --git a/icu4c/source/common/localebuilder.cpp b/icu4c/source/common/localebuilder.cpp new file mode 100644 index 00000000000..fe931fcf759 --- /dev/null +++ b/icu4c/source/common/localebuilder.cpp @@ -0,0 +1,436 @@ +// © 2019 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +#include + +#include "bytesinkutil.h" // CharStringByteSink +#include "charstr.h" +#include "cstring.h" +#include "ulocimp.h" +#include "unicode/localebuilder.h" +#include "unicode/locid.h" + +U_NAMESPACE_BEGIN + +#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9')) +#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) ) + +const char* kAttributeKey = "attribute"; + +static bool _isExtensionSubtags(char key, const char* s, int32_t len) { + switch (uprv_tolower(key)) { + case 'u': + return ultag_isUnicodeExtensionSubtags(s, len); + case 't': + return ultag_isTransformedExtensionSubtags(s, len); + case 'x': + return ultag_isPrivateuseValueSubtags(s, len); + default: + return ultag_isExtensionSubtags(s, len); + } +} + +LocaleBuilder::LocaleBuilder() : UObject(), status_(U_ZERO_ERROR), language_(), + script_(), region_(), variant_(nullptr), extensions_(nullptr) +{ + language_[0] = 0; + script_[0] = 0; + region_[0] = 0; +} + +LocaleBuilder::~LocaleBuilder() +{ + delete variant_; + delete extensions_; +} + +LocaleBuilder& LocaleBuilder::setLocale(const Locale& locale) +{ + clear(); + setLanguage(locale.getLanguage()); + setScript(locale.getScript()); + setRegion(locale.getCountry()); + setVariant(locale.getVariant()); + extensions_ = locale.clone(); + if (extensions_ == nullptr) { + status_ = U_MEMORY_ALLOCATION_ERROR; + } + return *this; +} + +LocaleBuilder& LocaleBuilder::setLanguageTag(StringPiece tag) +{ + Locale l = Locale::forLanguageTag(tag, status_); + if (U_FAILURE(status_)) { return *this; } + // Because setLocale will reset status_ we need to return + // first if we have error in forLanguageTag. 
+ setLocale(l); + return *this; +} + +static void setField(StringPiece input, char* dest, UErrorCode& errorCode, + UBool (*test)(const char*, int32_t)) { + if (U_FAILURE(errorCode)) { return; } + if (input.empty()) { + dest[0] = '\0'; + } else if (test(input.data(), input.length())) { + uprv_memcpy(dest, input.data(), input.length()); + dest[input.length()] = '\0'; + } else { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + } +} + +LocaleBuilder& LocaleBuilder::setLanguage(StringPiece language) +{ + setField(language, language_, status_, &ultag_isLanguageSubtag); + return *this; +} + +LocaleBuilder& LocaleBuilder::setScript(StringPiece script) +{ + setField(script, script_, status_, &ultag_isScriptSubtag); + return *this; +} + +LocaleBuilder& LocaleBuilder::setRegion(StringPiece region) +{ + setField(region, region_, status_, &ultag_isRegionSubtag); + return *this; +} + +static void transform(char* data, int32_t len) { + for (int32_t i = 0; i < len; i++, data++) { + if (*data == '_') { + *data = '-'; + } else { + *data = uprv_tolower(*data); + } + } +} + +LocaleBuilder& LocaleBuilder::setVariant(StringPiece variant) +{ + if (U_FAILURE(status_)) { return *this; } + if (variant.empty()) { + delete variant_; + variant_ = nullptr; + return *this; + } + CharString* new_variant = new CharString(variant, status_); + if (U_FAILURE(status_)) { return *this; } + if (new_variant == nullptr) { + status_ = U_MEMORY_ALLOCATION_ERROR; + return *this; + } + transform(new_variant->data(), new_variant->length()); + if (!ultag_isVariantSubtags(new_variant->data(), new_variant->length())) { + delete new_variant; + status_ = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + delete variant_; + variant_ = new_variant; + return *this; +} + +static bool +_isKeywordValue(const char* key, const char* value, int32_t value_len) +{ + if (key[1] == '\0') { + // one char key + return (UPRV_ISALPHANUM(uprv_tolower(key[0])) && + _isExtensionSubtags(key[0], value, value_len)); + } else if (uprv_strcmp(key, 
kAttributeKey) == 0) { + // unicode attributes + return ultag_isUnicodeLocaleAttributes(value, value_len); + } + // otherwise: unicode extension value + // We need to convert from legacy key/value to unicode + // key/value + const char* unicode_locale_key = uloc_toUnicodeLocaleKey(key); + const char* unicode_locale_type = uloc_toUnicodeLocaleType(key, value); + + return unicode_locale_key && unicode_locale_type && + ultag_isUnicodeLocaleKey(unicode_locale_key, -1) && + ultag_isUnicodeLocaleType(unicode_locale_type, -1); +} + +static void +_copyExtensions(const Locale& from, Locale* to, bool validate, UErrorCode& errorCode) +{ + if (U_FAILURE(errorCode)) { return; } + LocalPointer iter(from.createKeywords(errorCode)); + if (U_FAILURE(errorCode) || iter.isNull()) { return; } + const char* key; + while ((key = iter->next(nullptr, errorCode)) != nullptr) { + CharString value; + CharStringByteSink sink(&value); + from.getKeywordValue(key, sink, errorCode); + if (U_FAILURE(errorCode)) { return; } + if (uprv_strcmp(key, kAttributeKey) == 0) { + transform(value.data(), value.length()); + } + if (validate && + !_isKeywordValue(key, value.data(), value.length())) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + to->setKeywordValue(key, value.data(), errorCode); + if (U_FAILURE(errorCode)) { return; } + } +} + +void static +_clearUAttributesAndKeyType(Locale* locale, UErrorCode& errorCode) +{ + // Clear Unicode attributes + locale->setKeywordValue(kAttributeKey, "", errorCode); + + // Clear all Unicode keyword values + LocalPointer iter(locale->createUnicodeKeywords(errorCode)); + if (U_FAILURE(errorCode) || iter.isNull()) { return; } + const char* key; + while ((key = iter->next(nullptr, errorCode)) != nullptr) { + locale->setUnicodeKeywordValue(key, nullptr, errorCode); + } +} + +static void +_setUnicodeExtensions(Locale* locale, const CharString& value, UErrorCode& errorCode) +{ + // Add the unicode extensions to extensions_ + CharString locale_str("und-u-", 
errorCode); + locale_str.append(value, errorCode); + _copyExtensions( + Locale::forLanguageTag(locale_str.data(), errorCode), + locale, false, errorCode); +} + +LocaleBuilder& LocaleBuilder::setExtension(char key, StringPiece value) +{ + if (U_FAILURE(status_)) { return *this; } + if (!UPRV_ISALPHANUM(key)) { + status_ = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + CharString value_str(value, status_); + if (U_FAILURE(status_)) { return *this; } + transform(value_str.data(), value_str.length()); + if (!value_str.isEmpty() && + !_isExtensionSubtags(key, value_str.data(), value_str.length())) { + status_ = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + if (extensions_ == nullptr) { + extensions_ = new Locale(); + if (extensions_ == nullptr) { + status_ = U_MEMORY_ALLOCATION_ERROR; + return *this; + } + } + if (uprv_tolower(key) != 'u') { + // for t, x and others extension. + extensions_->setKeywordValue(StringPiece(&key, 1), value_str.data(), + status_); + return *this; + } + _clearUAttributesAndKeyType(extensions_, status_); + if (U_FAILURE(status_)) { return *this; } + if (!value.empty()) { + _setUnicodeExtensions(extensions_, value_str, status_); + } + return *this; +} + +LocaleBuilder& LocaleBuilder::setUnicodeLocaleKeyword( + StringPiece key, StringPiece type) +{ + if (U_FAILURE(status_)) { return *this; } + if (!ultag_isUnicodeLocaleKey(key.data(), key.length()) || + (!type.empty() && + !ultag_isUnicodeLocaleType(type.data(), type.length()))) { + status_ = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + if (extensions_ == nullptr) { + extensions_ = new Locale(); + } + if (extensions_ == nullptr) { + status_ = U_MEMORY_ALLOCATION_ERROR; + return *this; + } + extensions_->setUnicodeKeywordValue(key, type, status_); + return *this; +} + +LocaleBuilder& LocaleBuilder::addUnicodeLocaleAttribute( + StringPiece value) +{ + CharString value_str(value, status_); + if (U_FAILURE(status_)) { return *this; } + transform(value_str.data(), value_str.length()); + if 
(!ultag_isUnicodeLocaleAttribute(value_str.data(), value_str.length())) { + status_ = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + if (extensions_ == nullptr) { + extensions_ = new Locale(); + if (extensions_ == nullptr) { + status_ = U_MEMORY_ALLOCATION_ERROR; + return *this; + } + extensions_->setKeywordValue(kAttributeKey, value_str.data(), status_); + return *this; + } + + CharString attributes; + CharStringByteSink sink(&attributes); + UErrorCode localErrorCode = U_ZERO_ERROR; + extensions_->getKeywordValue(kAttributeKey, sink, localErrorCode); + if (U_FAILURE(localErrorCode)) { + CharString new_attributes(value_str.data(), status_); + // No attributes, set the attribute. + extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_); + return *this; + } + + transform(attributes.data(),attributes.length()); + const char* start = attributes.data(); + const char* limit = attributes.data() + attributes.length(); + CharString new_attributes; + bool inserted = false; + while (start < limit) { + if (!inserted) { + int cmp = uprv_strcmp(start, value_str.data()); + if (cmp == 0) { return *this; } // Found it in attributes: Just return + if (cmp > 0) { + if (!new_attributes.isEmpty()) new_attributes.append('_', status_); + new_attributes.append(value_str.data(), status_); + inserted = true; + } + } + if (!new_attributes.isEmpty()) { + new_attributes.append('_', status_); + } + new_attributes.append(start, status_); + start += uprv_strlen(start) + 1; + } + if (!inserted) { + if (!new_attributes.isEmpty()) { + new_attributes.append('_', status_); + } + new_attributes.append(value_str.data(), status_); + } + // Not yet in the attributes, set the attribute. 
+ extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_); + return *this; +} + +LocaleBuilder& LocaleBuilder::removeUnicodeLocaleAttribute( + StringPiece value) +{ + CharString value_str(value, status_); + if (U_FAILURE(status_)) { return *this; } + transform(value_str.data(), value_str.length()); + if (!ultag_isUnicodeLocaleAttribute(value_str.data(), value_str.length())) { + status_ = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + if (extensions_ == nullptr) { return *this; } + UErrorCode localErrorCode = U_ZERO_ERROR; + CharString attributes; + CharStringByteSink sink(&attributes); + extensions_->getKeywordValue(kAttributeKey, sink, localErrorCode); + // get failure, just return + if (U_FAILURE(localErrorCode)) { return *this; } + // Do not have any attributes, just return. + if (attributes.isEmpty()) { return *this; } + + char* p = attributes.data(); + // Replace null terminiator in place for _ and - so later + // we can use uprv_strcmp to compare. + for (int32_t i = 0; i < attributes.length(); i++, p++) { + *p = (*p == '_' || *p == '-') ? '\0' : uprv_tolower(*p); + } + + const char* start = attributes.data(); + const char* limit = attributes.data() + attributes.length(); + CharString new_attributes; + bool found = false; + while (start < limit) { + if (uprv_strcmp(start, value_str.data()) == 0) { + found = true; + } else { + if (!new_attributes.isEmpty()) { + new_attributes.append('_', status_); + } + new_attributes.append(start, status_); + } + start += uprv_strlen(start) + 1; + } + // Found the value in attributes, set the attribute. 
+ if (found) { + extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_); + } + return *this; +} + +LocaleBuilder& LocaleBuilder::clear() +{ + status_ = U_ZERO_ERROR; + language_[0] = 0; + script_[0] = 0; + region_[0] = 0; + delete variant_; + variant_ = nullptr; + clearExtensions(); + return *this; +} + +LocaleBuilder& LocaleBuilder::clearExtensions() +{ + delete extensions_; + extensions_ = nullptr; + return *this; +} + +Locale makeBogusLocale() { + Locale bogus; + bogus.setToBogus(); + return bogus; +} + +Locale LocaleBuilder::build(UErrorCode& errorCode) +{ + if (U_FAILURE(errorCode)) { + return makeBogusLocale(); + } + if (U_FAILURE(status_)) { + errorCode = status_; + return makeBogusLocale(); + } + CharString locale_str(language_, errorCode); + if (uprv_strlen(script_) > 0) { + locale_str.append('-', errorCode).append(StringPiece(script_), errorCode); + } + if (uprv_strlen(region_) > 0) { + locale_str.append('-', errorCode).append(StringPiece(region_), errorCode); + } + if (variant_ != nullptr) { + locale_str.append('-', errorCode).append(StringPiece(variant_->data()), errorCode); + } + if (U_FAILURE(errorCode)) { + return makeBogusLocale(); + } + Locale product(locale_str.data()); + if (extensions_ != nullptr) { + _copyExtensions(*extensions_, &product, true, errorCode); + } + if (U_FAILURE(errorCode)) { + return makeBogusLocale(); + } + return product; +} + +U_NAMESPACE_END diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index 9b5de7f1d9c..063efd45578 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ -406,13 +406,22 @@ _isAlphaNumericString(const char* s, int32_t len) { } static UBool -_isLanguageSubtag(const char* s, int32_t len) { +_isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) { + if (len < 0) { + len = (int32_t)uprv_strlen(s); + } + if (len >= min && len <= max && _isAlphaNumericString(s, len)) { + return TRUE; + } + 
return FALSE; +} + +U_CFUNC UBool +ultag_isLanguageSubtag(const char* s, int32_t len) { /* - * language = 2*3ALPHA ; shortest ISO 639 code - * ["-" extlang] ; sometimes followed by - * ; extended language subtags - * / 4ALPHA ; or reserved for future use - * / 5*8ALPHA ; or registered language subtag + * unicode_language_subtag = alpha{2,3} | alpha{5,8}; + * NOTE: Per ICUTC 2019/01/23- accepting alpha 4 + * See ICU-20372 */ if (len < 0) { len = (int32_t)uprv_strlen(s); @@ -438,8 +447,8 @@ _isExtlangSubtag(const char* s, int32_t len) { return FALSE; } -static UBool -_isScriptSubtag(const char* s, int32_t len) { +U_CFUNC UBool +ultag_isScriptSubtag(const char* s, int32_t len) { /* * script = 4ALPHA ; ISO 15924 code */ @@ -452,8 +461,8 @@ _isScriptSubtag(const char* s, int32_t len) { return FALSE; } -static UBool -_isRegionSubtag(const char* s, int32_t len) { +U_CFUNC UBool +ultag_isRegionSubtag(const char* s, int32_t len) { /* * region = 2ALPHA ; ISO 3166-1 code * / 3DIGIT ; UN M.49 code @@ -479,7 +488,7 @@ _isVariantSubtag(const char* s, int32_t len) { if (len < 0) { len = (int32_t)uprv_strlen(s); } - if (len >= 5 && len <= 8 && _isAlphaNumericString(s, len)) { + if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) { return TRUE; } if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) { @@ -488,19 +497,48 @@ _isVariantSubtag(const char* s, int32_t len) { return FALSE; } +static UBool +_isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) { + const char *p = s; + const char *pSubtag = NULL; + + if (len < 0) { + len = (int32_t)uprv_strlen(s); + } + + while ((p - s) < len) { + if (*p == SEP) { + if (pSubtag == NULL) { + return FALSE; + } + if (!test(pSubtag, (int32_t)(p - pSubtag))) { + return FALSE; + } + pSubtag = NULL; + } else if (pSubtag == NULL) { + pSubtag = p; + } + p++; + } + if (pSubtag == NULL) { + return FALSE; + } + return test(pSubtag, (int32_t)(p - pSubtag)); +} + +U_CFUNC UBool +ultag_isVariantSubtags(const char* s, 
int32_t len) { + return _isSepListOf(&_isVariantSubtag, s, len); +} + +// This is for the ICU-specific "lvariant" handling. static UBool _isPrivateuseVariantSubtag(const char* s, int32_t len) { /* * variant = 1*8alphanum ; registered variants * / (DIGIT 3alphanum) */ - if (len < 0) { - len = (int32_t)uprv_strlen(s); - } - if (len >= 1 && len <= 8 && _isAlphaNumericString(s, len)) { - return TRUE; - } - return FALSE; + return _isAlphaNumericStringLimitedLength(s, len , 1, 8); } static UBool @@ -528,42 +566,12 @@ _isExtensionSubtag(const char* s, int32_t len) { /* * extension = singleton 1*("-" (2*8alphanum)) */ - if (len < 0) { - len = (int32_t)uprv_strlen(s); - } - if (len >= 2 && len <= 8 && _isAlphaNumericString(s, len)) { - return TRUE; - } - return FALSE; + return _isAlphaNumericStringLimitedLength(s, len, 2, 8); } -static UBool -_isExtensionSubtags(const char* s, int32_t len) { - const char *p = s; - const char *pSubtag = NULL; - - if (len < 0) { - len = (int32_t)uprv_strlen(s); - } - - while ((p - s) < len) { - if (*p == SEP) { - if (pSubtag == NULL) { - return FALSE; - } - if (!_isExtensionSubtag(pSubtag, (int32_t)(p - pSubtag))) { - return FALSE; - } - pSubtag = NULL; - } else if (pSubtag == NULL) { - pSubtag = p; - } - p++; - } - if (pSubtag == NULL) { - return FALSE; - } - return _isExtensionSubtag(pSubtag, (int32_t)(p - pSubtag)); +U_CFUNC UBool +ultag_isExtensionSubtags(const char* s, int32_t len) { + return _isSepListOf(&_isExtensionSubtag, s, len); } static UBool @@ -571,46 +579,32 @@ _isPrivateuseValueSubtag(const char* s, int32_t len) { /* * privateuse = "x" 1*("-" (1*8alphanum)) */ - if (len < 0) { - len = (int32_t)uprv_strlen(s); - } - if (len >= 1 && len <= 8 && _isAlphaNumericString(s, len)) { - return TRUE; - } - return FALSE; + return _isAlphaNumericStringLimitedLength(s, len, 1, 8); } -static UBool -_isPrivateuseValueSubtags(const char* s, int32_t len) { - const char *p = s; - const char *pSubtag = NULL; +U_CFUNC UBool 
+ultag_isPrivateuseValueSubtags(const char* s, int32_t len) { + return _isSepListOf(&_isPrivateuseValueSubtag, s, len); +} - if (len < 0) { - len = (int32_t)uprv_strlen(s); - } +U_CFUNC UBool +ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) { + /* + * attribute = alphanum{3,8} ; + */ + return _isAlphaNumericStringLimitedLength(s, len , 3, 8); +} - while ((p - s) < len) { - if (*p == SEP) { - if (pSubtag == NULL) { - return FALSE; - } - if (!_isPrivateuseValueSubtag(pSubtag, (int32_t)(p - pSubtag))) { - return FALSE; - } - pSubtag = NULL; - } else if (pSubtag == NULL) { - pSubtag = p; - } - p++; - } - if (pSubtag == NULL) { - return FALSE; - } - return _isPrivateuseValueSubtag(pSubtag, (int32_t)(p - pSubtag)); +U_CFUNC UBool +ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) { + return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len); } U_CFUNC UBool ultag_isUnicodeLocaleKey(const char* s, int32_t len) { + /* + * key = alphanum alpha ; + */ if (len < 0) { len = (int32_t)uprv_strlen(s); } @@ -620,9 +614,160 @@ ultag_isUnicodeLocaleKey(const char* s, int32_t len) { return FALSE; } +U_CFUNC UBool +_isUnicodeLocaleTypeSubtag(const char*s, int32_t len) { + /* + * alphanum{3,8} + */ + return _isAlphaNumericStringLimitedLength(s, len , 3, 8); +} + U_CFUNC UBool ultag_isUnicodeLocaleType(const char*s, int32_t len) { + /* + * type = alphanum{3,8} (sep alphanum{3,8})* ; + */ + return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len); +} + +static UBool +_isTKey(const char* s, int32_t len) +{ + /* + * tkey = alpha digit ; + */ + if (len < 0) { + len = (int32_t)uprv_strlen(s); + } + if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) { + return TRUE; + } + return FALSE; +} + +static UBool +_isTValue(const char* s, int32_t len) +{ + /* + * tvalue = (sep alphanum{3,8})+ ; + */ + return _isAlphaNumericStringLimitedLength(s, len , 3, 8); +} + +static UBool +_isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len) +{ + const int32_t kStart 
= 0; // Start, wait for unicode_language_subtag, tkey or end + const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag, + // unicode_region_subtag, unicode_variant_subtag, tkey or end + const int32_t kGotScript = 2; // Got unicode_script_subtag, wait for unicode_region_subtag, + // unicode_variant_subtag, tkey, or end + const int32_t kGotRegion = 3; // Got unicode_region_subtag, wait for unicode_variant_subtag, + // tkey, or end. + const int32_t kGotVariant = 4; // Got unicode_variant_subtag, wait for unicode_variant_subtag + // tkey or end. + const int32_t kGotTKey = -1; // Got tkey, wait for tvalue. ERROR if stop here. + const int32_t kGotTValue = 6; // Got tvalue, wait for tkey, tvalue or end + + switch (state) { + case kStart: + if (ultag_isLanguageSubtag(s, len)) { + state = kGotLanguage; + return TRUE; + } + if (_isTKey(s, len)) { + state = kGotTKey; + return TRUE; + } + return FALSE; + case kGotLanguage: + if (ultag_isScriptSubtag(s, len)) { + state = kGotScript; + return TRUE; + } + U_FALLTHROUGH; + case kGotScript: + if (ultag_isRegionSubtag(s, len)) { + state = kGotRegion; + return TRUE; + } + U_FALLTHROUGH; + case kGotRegion: + U_FALLTHROUGH; + case kGotVariant: + if (_isVariantSubtag(s, len)) { + state = kGotVariant; + return TRUE; + } + if (_isTKey(s, len)) { + state = kGotTKey; + return TRUE; + } + return FALSE; + case kGotTKey: + if (_isTValue(s, len)) { + state = kGotTValue; + return TRUE; + } + return FALSE; + case kGotTValue: + if (_isTKey(s, len)) { + state = kGotTKey; + return TRUE; + } + if (_isTValue(s, len)) { + return TRUE; + } + return FALSE; + } + return FALSE; +} + +static UBool +_isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len) +{ + const int32_t kStart = 0; // Start, wait for a key or attribute or end + const int32_t kGotKey = 1; // Got a key, wait for type or key or end + const int32_t kGotType = 2; // Got a type, wait for key or end + + switch (state) { + case kStart: + if 
(ultag_isUnicodeLocaleKey(s, len)) { + state = kGotKey; + return TRUE; + } + if (ultag_isUnicodeLocaleAttribute(s, len)) { + return TRUE; + } + return FALSE; + case kGotKey: + if (ultag_isUnicodeLocaleKey(s, len)) { + return TRUE; + } + if (_isUnicodeLocaleTypeSubtag(s, len)) { + state = kGotType; + return TRUE; + } + return FALSE; + case kGotType: + if (ultag_isUnicodeLocaleKey(s, len)) { + state = kGotKey; + return TRUE; + } + if (_isUnicodeLocaleTypeSubtag(s, len)) { + return TRUE; + } + return FALSE; + } + return FALSE; +} + +static UBool +_isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len) +{ + int32_t state = 0; const char* p; + const char* start = s; int32_t subtagLen = 0; if (len < 0) { @@ -631,22 +776,34 @@ ultag_isUnicodeLocaleType(const char*s, int32_t len) { for (p = s; len > 0; p++, len--) { if (*p == SEP) { - if (subtagLen < 3) { + if (!test(state, start, subtagLen)) { return FALSE; } subtagLen = 0; - } else if (ISALPHA(*p) || ISNUMERIC(*p)) { - subtagLen++; - if (subtagLen > 8) { - return FALSE; - } + start = p + 1; } else { - return FALSE; + subtagLen++; } } - return (subtagLen >= 3); + if (test(state, start, subtagLen) && state >= 0) { + return TRUE; + } + return FALSE; } + +U_CFUNC UBool +ultag_isTransformedExtensionSubtags(const char* s, int32_t len) +{ + return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len); +} + +U_CFUNC UBool +ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) { + return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len); +} + + /* * ------------------------------------------------- * @@ -856,7 +1013,7 @@ _appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool st if (len == 0) { sink.Append(LANG_UND, LANG_UND_LEN); - } else if (!_isLanguageSubtag(buf, len)) { + } else if (!ultag_isLanguageSubtag(buf, len)) { /* invalid language code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; @@ -900,7 +1057,7 @@ 
_appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool stri } if (len > 0) { - if (!_isScriptSubtag(buf, len)) { + if (!ultag_isScriptSubtag(buf, len)) { /* invalid script code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; @@ -932,7 +1089,7 @@ _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool stri } if (len > 0) { - if (!_isRegionSubtag(buf, len)) { + if (!ultag_isRegionSubtag(buf, len)) { /* invalid region code */ if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; @@ -1252,7 +1409,7 @@ _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool st } } else { if (*key == PRIVATEUSE) { - if (!_isPrivateuseValueSubtags(buf.data(), len)) { + if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; @@ -1260,7 +1417,7 @@ _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool st continue; } } else { - if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf.data(), len)) { + if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; @@ -1997,7 +2154,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta subtagLen = (int32_t)(pSep - pSubtag); if (next & LANG) { - if (_isLanguageSubtag(pSubtag, subtagLen)) { + if (ultag_isLanguageSubtag(pSubtag, subtagLen)) { *pSep = 0; /* terminate */ // TODO: move deprecated language code handling here. 
t->language = T_CString_toLowerCase(pSubtag); @@ -2024,7 +2181,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta } } if (next & SCRT) { - if (_isScriptSubtag(pSubtag, subtagLen)) { + if (ultag_isScriptSubtag(pSubtag, subtagLen)) { char *p = pSubtag; *pSep = 0; @@ -2044,7 +2201,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta } } if (next & REGN) { - if (_isRegionSubtag(pSubtag, subtagLen)) { + if (ultag_isRegionSubtag(pSubtag, subtagLen)) { *pSep = 0; // TODO: move deprecated region code handling here. t->region = T_CString_toUpperCase(pSubtag); @@ -2535,7 +2692,7 @@ ulocimp_toLanguageTag(const char* localeID, buf[1] = SEP; len = uloc_getKeywordValue(localeID, key, &buf[2], sizeof(buf) - 2, &tmpStatus); if (U_SUCCESS(tmpStatus)) { - if (_isPrivateuseValueSubtags(&buf[2], len)) { + if (ultag_isPrivateuseValueSubtags(&buf[2], len)) { /* return private use only tag */ sink.Append(buf, len + 2); done = TRUE; diff --git a/icu4c/source/common/ulocimp.h b/icu4c/source/common/ulocimp.h index f268f8995aa..fd16af5ae52 100644 --- a/icu4c/source/common/ulocimp.h +++ b/icu4c/source/common/ulocimp.h @@ -148,6 +148,32 @@ ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion, U_CAPI const char * U_EXPORT2 locale_getKeywordsStart(const char *localeID); +U_CFUNC UBool +ultag_isExtensionSubtags(const char* s, int32_t len); + +U_CFUNC UBool +ultag_isLanguageSubtag(const char* s, int32_t len); + +U_CFUNC UBool +ultag_isPrivateuseValueSubtags(const char* s, int32_t len); + +U_CFUNC UBool +ultag_isRegionSubtag(const char* s, int32_t len); + +U_CFUNC UBool +ultag_isScriptSubtag(const char* s, int32_t len); + +U_CFUNC UBool +ultag_isTransformedExtensionSubtags(const char* s, int32_t len); + +U_CFUNC UBool +ultag_isUnicodeExtensionSubtags(const char* s, int32_t len); + +U_CFUNC UBool +ultag_isUnicodeLocaleAttribute(const char* s, int32_t len); + +U_CFUNC UBool 
+ultag_isUnicodeLocaleAttributes(const char* s, int32_t len); U_CFUNC UBool ultag_isUnicodeLocaleKey(const char* s, int32_t len); @@ -155,6 +181,9 @@ ultag_isUnicodeLocaleKey(const char* s, int32_t len); U_CFUNC UBool ultag_isUnicodeLocaleType(const char* s, int32_t len); +U_CFUNC UBool +ultag_isVariantSubtags(const char* s, int32_t len); + U_CFUNC const char* ulocimp_toBcpKey(const char* key); diff --git a/icu4c/source/common/unicode/localebuilder.h b/icu4c/source/common/unicode/localebuilder.h new file mode 100644 index 00000000000..8cd2039b3c5 --- /dev/null +++ b/icu4c/source/common/unicode/localebuilder.h @@ -0,0 +1,288 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +#ifndef __LOCALEBUILDER_H__ +#define __LOCALEBUILDER_H__ + +#include "unicode/locid.h" +#include "unicode/stringpiece.h" +#include "unicode/uobject.h" +#include "unicode/utypes.h" + + +/** + * \file + * \brief C++ API: Builder API for Locale + */ + +U_NAMESPACE_BEGIN +class CharString; + +#ifndef U_HIDE_DRAFT_API +/** + * LocaleBuilder is used to build instances of Locale + * from values configured by the setters. Unlike the Locale + * constructors, the LocaleBuilder checks if a value configured by a + * setter satisfies the syntax requirements defined by the Locale + * class. A Locale object created by a LocaleBuilder is + * well-formed and can be transformed to a well-formed IETF BCP 47 language tag + * without losing information. + * + *

The following example shows how to create a Locale object + * with the LocaleBuilder. + *

+ *
+ *     UErrorCode status = U_ZERO_ERROR;
+ *     Locale aLocale = LocaleBuilder()
+ *                          .setLanguage("sr")
+ *                          .setScript("Latn")
+ *                          .setRegion("RS")
+ *                          .build(status);
+ *     if (U_SUCCESS(status)) {
+ *       // ...
+ *     }
+ * 
+ *
+ * + *

LocaleBuilders can be reused; clear() resets all + * fields to their default values. + * + *

LocaleBuilder tracks errors in an internal UErrorCode. For all setters, + * except setLanguageTag and setLocale, LocaleBuilder will return immediately + * if the internal UErrorCode is in error state. + * To reset the internal state and error code, call the clear() method. + * The setLanguageTag and setLocale methods will first clear the internal + * UErrorCode, then record any error from validating the input parameter + * in the internal UErrorCode. + * + * @draft ICU 64 + */ +class U_COMMON_API LocaleBuilder : public UObject { +public: + /** + * Constructs an empty LocaleBuilder. The default value of all + * fields, extensions, and private use information is the + * empty string. + * + * @draft ICU 64 + */ + LocaleBuilder(); + + virtual ~LocaleBuilder(); + + /** + * Resets the LocaleBuilder to match the provided + * locale. Existing state is discarded. + * + *

All fields of the locale must be well-formed. + *

This method clears the internal UErrorCode. + * + * @param locale the locale + * @return This builder. + * + * @draft ICU 64 + */ + LocaleBuilder& setLocale(const Locale& locale); + + /** + * Resets the LocaleBuilder to match the provided + * [Unicode Locale Identifier](http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_id). + * Discards the existing state. The empty string causes the builder to be + * reset, like {@link #clear}. Grandfathered tags are converted to their + * canonical form before being processed. Otherwise, the language + * tag must be well-formed, or else the build() method will later + * report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

This method clears the internal UErrorCode. + * + * @param tag the language tag, defined as + * [unicode_locale_id](http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_id). + * @return This builder. + * @draft ICU 64 + */ + LocaleBuilder& setLanguageTag(StringPiece tag); + + /** + * Sets the language. If language is the empty string, the + * language in this LocaleBuilder is removed. Otherwise, the + * language must be well-formed, or else the build() method will + * later report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

The syntax of language value is defined as + * [unicode_language_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_language_subtag). + * + * @param language the language + * @return This builder. + * @draft ICU 64 + */ + LocaleBuilder& setLanguage(StringPiece language); + + /** + * Sets the script. If script is the empty string, the script in + * this LocaleBuilder is removed. + * Otherwise, the script must be well-formed, or else the build() + * method will later report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

The script value is a four-letter script code as + * [unicode_script_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_script_subtag) + * defined by ISO 15924 + * + * @param script the script + * @return This builder. + * @draft ICU 64 + */ + LocaleBuilder& setScript(StringPiece script); + + /** + * Sets the region. If region is the empty string, the region in this + * LocaleBuilder is removed. Otherwise, the region + * must be well-formed, or else the build() method will later report an + * U_ILLEGAL_ARGUMENT_ERROR. + * + *

The region value is defined by + * [unicode_region_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_region_subtag) + * as a two-letter ISO 3166 code or a three-digit UN M.49 area code. + * + *

The region value in the Locale created by the + * LocaleBuilder is always normalized to upper case. + * + * @param region the region + * @return This builder. + * @draft ICU 64 + */ + LocaleBuilder& setRegion(StringPiece region); + + /** + * Sets the variant. If variant is the empty string, the variant in this + * LocaleBuilder is removed. Otherwise, the variant + * must be well-formed, or else the build() method will later report an + * U_ILLEGAL_ARGUMENT_ERROR. + * + *

Note: This method checks if variant + * satisfies the + * [unicode_variant_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_variant_subtag) + * syntax requirements, and normalizes the value to lowercase letters. However, + * the Locale class does not impose any syntactic + * restriction on variant. To set an ill-formed variant, use a Locale constructor. + * If there are multiple unicode_variant_subtag, the caller must concatenate + * them with '-' as separator (ex: "foobar-fibar"). + * + * @param variant the variant + * @return This builder. + * @draft ICU 64 + */ + LocaleBuilder& setVariant(StringPiece variant); + + /** + * Sets the extension for the given key. If the value is the empty string, + * the extension is removed. Otherwise, the key and + * value must be well-formed, or else the build() method will + * later report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

Note: The key ('u') is used for the Unicode locale extension. + * Setting a value for this key replaces any existing Unicode locale key/type + * pairs with those defined in the extension. + * + *

Note: The key ('x') is used for the private use code. To be + * well-formed, the value for this key needs only to have subtags of one to + * eight alphanumeric characters, not two to eight as in the general case. + * + * @param key the extension key + * @param value the extension value + * @return This builder. + * @draft ICU 64 + */ + LocaleBuilder& setExtension(char key, StringPiece value); + + /** + * Sets the Unicode locale keyword type for the given key. If the type + * StringPiece is constructed with a nullptr, the keyword is removed. + * If the type is the empty string, the keyword is set without type subtags. + * Otherwise, the key and type must be well-formed, or else the build() + * method will later report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

Keys and types are converted to lower case. + * + *

Note: Setting the 'u' extension via {@link #setExtension} + * replaces all Unicode locale keywords with those defined in the + * extension. + * + * @param key the Unicode locale key + * @param type the Unicode locale type + * @return This builder. + * @draft ICU 64 + */ + LocaleBuilder& setUnicodeLocaleKeyword( + StringPiece key, StringPiece type); + + /** + * Adds a Unicode locale attribute, if not already present, otherwise + * has no effect. The attribute must not be the empty string and must be + * well-formed or U_ILLEGAL_ARGUMENT_ERROR will be set to status + * during the build() call. + * + * @param attribute the attribute + * @return This builder. + * @draft ICU 64 + */ + LocaleBuilder& addUnicodeLocaleAttribute(StringPiece attribute); + + /** + * Removes a Unicode locale attribute, if present, otherwise has no + * effect. The attribute must not be the empty string and must be well-formed + * or U_ILLEGAL_ARGUMENT_ERROR will be set to status during the build() call. + * + *

Attribute comparison for removal is case-insensitive. + * + * @param attribute the attribute + * @return This builder. + * @draft ICU 64 + */ + LocaleBuilder& removeUnicodeLocaleAttribute(StringPiece attribute); + + /** + * Resets the builder to its initial, empty state. + *

This method clears the internal UErrorCode. + * + * @return this builder + * @draft ICU 64 + */ + LocaleBuilder& clear(); + + /** + * Resets the extensions to their initial, empty state. + * Language, script, region and variant are unchanged. + * + * @return this builder + * @draft ICU 64 + */ + LocaleBuilder& clearExtensions(); + + /** + * Returns an instance of Locale created from the fields set + * on this builder. + * If a memory allocation required by any of the setters or by the build() + * call itself fails, status will be set to U_MEMORY_ALLOCATION_ERROR. + * If any of the fields set by the setters are not well-formed, the status + * will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will + * not change after the build() call and the caller is free to keep using + * the same builder to build more locales. + * + * @return a new Locale + * @draft ICU 64 + */ + Locale build(UErrorCode& status); + +private: + UErrorCode status_; + char language_[9]; + char script_[5]; + char region_[4]; + CharString *variant_; // Pointer not object so we need not #include internal charstr.h. + icu::Locale *extensions_; // Pointer not object. Storage for all other fields. 
+ +}; +#endif // U_HIDE_DRAFT_API + +U_NAMESPACE_END + +#endif // __LOCALEBUILDER_H__ diff --git a/icu4c/source/common/unicode/urename.h b/icu4c/source/common/unicode/urename.h index 0512be3b6e5..cea3be4430e 100644 --- a/icu4c/source/common/unicode/urename.h +++ b/icu4c/source/common/unicode/urename.h @@ -1109,6 +1109,16 @@ #define ulocimp_toLegacyType U_ICU_ENTRY_POINT_RENAME(ulocimp_toLegacyType) #define ultag_isUnicodeLocaleKey U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleKey) #define ultag_isUnicodeLocaleType U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleType) +#define ultag_isExtensionSubtags U_ICU_ENTRY_POINT_RENAME(ultag_isExtensionSubtags) +#define ultag_isLanguageSubtag U_ICU_ENTRY_POINT_RENAME(ultag_isLanguageSubtag) +#define ultag_isPrivateuseValueSubtags U_ICU_ENTRY_POINT_RENAME(ultag_isPrivateuseValueSubtags) +#define ultag_isRegionSubtag U_ICU_ENTRY_POINT_RENAME(ultag_isRegionSubtag) +#define ultag_isScriptSubtag U_ICU_ENTRY_POINT_RENAME(ultag_isScriptSubtag) +#define ultag_isTransformedExtensionSubtags U_ICU_ENTRY_POINT_RENAME(ultag_isTransformedExtensionSubtags) +#define ultag_isUnicodeExtensionSubtags U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeExtensionSubtags) +#define ultag_isUnicodeLocaleAttribute U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleAttribute) +#define ultag_isUnicodeLocaleAttributes U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleAttributes) +#define ultag_isVariantSubtags U_ICU_ENTRY_POINT_RENAME(ultag_isVariantSubtags) #define umsg_applyPattern U_ICU_ENTRY_POINT_RENAME(umsg_applyPattern) #define umsg_autoQuoteApostrophe U_ICU_ENTRY_POINT_RENAME(umsg_autoQuoteApostrophe) #define umsg_clone U_ICU_ENTRY_POINT_RENAME(umsg_clone) diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt index d2682abf473..1e519808e75 100644 --- a/icu4c/source/test/depstest/dependencies.txt +++ b/icu4c/source/test/depstest/dependencies.txt @@ -188,6 +188,7 @@ library: common uinit utypes errorcode icuplug platform 
+ localebuilder group: pluralmap # TODO: Move to i18n library, ticket #11926. @@ -643,6 +644,11 @@ group: resourcebundle uscript_props propname bytesinkutil +group: localebuilder + localebuilder.o + deps + resourcebundle + group: udata udata.o ucmndata.o udatamem.o umapfile.o diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in index c049a5c691d..b4cf918ec48 100644 --- a/icu4c/source/test/intltest/Makefile.in +++ b/icu4c/source/test/intltest/Makefile.in @@ -44,7 +44,7 @@ caltztst.o canittst.o citrtest.o colldata.o convtest.o currcoll.o collationtest. fldset.o dadrfmt.o dadrcal.o dcfmapts.o decoll.o dtfmapts.o dtfmrgts.o dtfmtrtts.o dtfmttst.o \ dtptngts.o encoll.o escoll.o ficoll.o frcoll.o g7coll.o intltest.o \ itercoll.o itformat.o itmajor.o itutil.o jacoll.o lcukocol.o \ -loctest.o miscdtfm.o mnkytst.o msfmrgts.o nmfmapts.o nmfmtrt.o \ +loctest.o localebuildertest.o miscdtfm.o mnkytst.o msfmrgts.o nmfmapts.o nmfmtrt.o \ numfmtst.o numrgts.o plurults.o plurfmts.o pptest.o regcoll.o restest.o restsnew.o \ sdtfmtts.o svccoll.o tchcfmt.o selfmts.o \ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \ diff --git a/icu4c/source/test/intltest/intltest.vcxproj b/icu4c/source/test/intltest/intltest.vcxproj index 298a7f5c07d..5e82ef3bdc7 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj +++ b/icu4c/source/test/intltest/intltest.vcxproj @@ -364,6 +364,7 @@ + @@ -494,8 +495,9 @@ + - \ No newline at end of file + diff --git a/icu4c/source/test/intltest/intltest.vcxproj.filters b/icu4c/source/test/intltest/intltest.vcxproj.filters index d707727e34c..bed26bcbbde 100644 --- a/icu4c/source/test/intltest/intltest.vcxproj.filters +++ b/icu4c/source/test/intltest/intltest.vcxproj.filters @@ -540,6 +540,8 @@ formatting + + locales & resources @@ -927,5 +929,8 @@ formatting + + locales & resources + - \ No newline at end of file + diff --git a/icu4c/source/test/intltest/itutil.cpp 
b/icu4c/source/test/intltest/itutil.cpp index 91d81d0a86c..3cda39d1778 100644 --- a/icu4c/source/test/intltest/itutil.cpp +++ b/icu4c/source/test/intltest/itutil.cpp @@ -19,6 +19,7 @@ #include "itutil.h" #include "strtest.h" #include "loctest.h" +#include "localebuildertest.h" #include "citrtest.h" #include "ustrtest.h" #include "ucdtest.h" @@ -149,6 +150,7 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* & } #endif break; + CASE(25, LocaleBuilderTest); default: name = ""; break; //needed to end loop } } diff --git a/icu4c/source/test/intltest/localebuildertest.cpp b/icu4c/source/test/intltest/localebuildertest.cpp new file mode 100644 index 00000000000..f99057fa11f --- /dev/null +++ b/icu4c/source/test/intltest/localebuildertest.cpp @@ -0,0 +1,1627 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include + +#include "cmemory.h" +#include "cstring.h" +#include "localebuildertest.h" +#include "unicode/localebuilder.h" +#include "unicode/strenum.h" + +LocaleBuilderTest::LocaleBuilderTest() +{ +} + +LocaleBuilderTest::~LocaleBuilderTest() +{ +} + +void LocaleBuilderTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) +{ + TESTCASE_AUTO_BEGIN; + TESTCASE_AUTO(TestAddRemoveUnicodeLocaleAttribute); + TESTCASE_AUTO(TestAddRemoveUnicodeLocaleAttributeWellFormed); + TESTCASE_AUTO(TestAddUnicodeLocaleAttributeIllFormed); + TESTCASE_AUTO(TestLocaleBuilder); + TESTCASE_AUTO(TestLocaleBuilderBasic); + TESTCASE_AUTO(TestPosixCases); + TESTCASE_AUTO(TestSetExtensionOthers); + TESTCASE_AUTO(TestSetExtensionPU); + TESTCASE_AUTO(TestSetExtensionT); + TESTCASE_AUTO(TestSetExtensionU); + TESTCASE_AUTO(TestSetExtensionValidateOthersIllFormed); + TESTCASE_AUTO(TestSetExtensionValidateOthersWellFormed); + TESTCASE_AUTO(TestSetExtensionValidatePUIllFormed); + TESTCASE_AUTO(TestSetExtensionValidatePUWellFormed); + 
TESTCASE_AUTO(TestSetExtensionValidateTIllFormed); + TESTCASE_AUTO(TestSetExtensionValidateTWellFormed); + TESTCASE_AUTO(TestSetExtensionValidateUIllFormed); + TESTCASE_AUTO(TestSetExtensionValidateUWellFormed); + TESTCASE_AUTO(TestSetLanguageIllFormed); + TESTCASE_AUTO(TestSetLanguageWellFormed); + TESTCASE_AUTO(TestSetLocale); + TESTCASE_AUTO(TestSetRegionIllFormed); + TESTCASE_AUTO(TestSetRegionWellFormed); + TESTCASE_AUTO(TestSetScriptIllFormed); + TESTCASE_AUTO(TestSetScriptWellFormed); + TESTCASE_AUTO(TestSetUnicodeLocaleKeywordIllFormedKey); + TESTCASE_AUTO(TestSetUnicodeLocaleKeywordIllFormedValue); + TESTCASE_AUTO(TestSetUnicodeLocaleKeywordWellFormed); + TESTCASE_AUTO(TestSetVariantIllFormed); + TESTCASE_AUTO(TestSetVariantWellFormed); + TESTCASE_AUTO_END; +} + +void LocaleBuilderTest::Verify(LocaleBuilder& bld, const char* expected, const char* msg) { + UErrorCode status = U_ZERO_ERROR; + Locale loc = bld.build(status); + if (U_FAILURE(status)) { + errln(msg, u_errorName(status)); + } + std::string tag = loc.toLanguageTag(status); + if (U_FAILURE(status)) { + errln("loc.toLanguageTag() got Error: %s\n", + u_errorName(status)); + } + if (tag != expected) { + errln("should get \"%s\", but got \"%s\"\n", expected, tag.c_str()); + } +} + +void LocaleBuilderTest::TestLocaleBuilder() { + // The following test data are copy from + // icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleBuilderTest.java + // "L": +1 = language + // "S": +1 = script + // "R": +1 = region + // "V": +1 = variant + // "K": +1 = Unicode locale key / +2 = Unicode locale type + // "A": +1 = Unicode locale attribute + // "E": +1 = extension letter / +2 = extension value + // "P": +1 = private use + // "U": +1 = ULocale + // "B": +1 = BCP47 language tag + // "C": Clear all + // "N": Clear extensions + // "D": +1 = Unicode locale attribute to be removed + // "X": indicates an exception must be thrown + // "T": +1 = expected language tag / +2 = expected locale string + const char* 
TESTCASES[][14] = { + {"L", "en", "R", "us", "T", "en-US", "en_US"}, + {"L", "en", "R", "CA", "L", nullptr, "T", "und-CA", "_CA"}, + {"L", "en", "R", "CA", "L", "", "T", "und-CA", "_CA"}, + {"L", "en", "R", "FR", "L", "fr", "T", "fr-FR", "fr_FR"}, + {"L", "123", "X"}, + {"R", "us", "T", "und-US", "_US"}, + {"R", "usa", "X"}, + {"R", "123", "L", "it", "R", nullptr, "T", "it", "it"}, + {"R", "123", "L", "it", "R", "", "T", "it", "it"}, + {"R", "123", "L", "en", "T", "en-123", "en_123"}, + {"S", "LATN", "L", "DE", "T", "de-Latn", "de_Latn"}, + {"L", "De", "S", "latn", "R", "de", "S", "", "T", "de-DE", "de_DE"}, + {"L", "De", "S", "Arab", "R", "de", "S", nullptr, "T", "de-DE", "de_DE"}, + {"S", "latin", "X"}, + {"V", "1234", "L", "en", "T", "en-1234", "en__1234"}, + {"V", "1234", "L", "en", "V", "5678", "T", "en-5678", "en__5678"}, + {"V", "1234", "L", "en", "V", nullptr, "T", "en", "en"}, + {"V", "1234", "L", "en", "V", "", "T", "en", "en"}, + {"V", "123", "X"}, + {"U", "en_US", "T", "en-US", "en_US"}, + {"U", "en_US_WIN", "X"}, + {"B", "fr-FR-1606nict-u-ca-gregory-x-test", "T", + "fr-FR-1606nict-u-ca-gregory-x-test", + "fr_FR_1606NICT@calendar=gregorian;x=test"}, + {"B", "ab-cde-fghij", "T", "cde-fghij", "cde__FGHIJ"}, + {"B", "und-CA", "T", "und-CA", "_CA"}, + // Blocked by ICU-20327 + // {"B", "en-US-x-test-lvariant-var", "T", "en-US-x-test-lvariant-var", + // "en_US_VAR@x=test"}, + {"B", "en-US-VAR", "X"}, + {"U", "ja_JP@calendar=japanese;currency=JPY", "L", "ko", "T", + "ko-JP-u-ca-japanese-cu-jpy", "ko_JP@calendar=japanese;currency=JPY"}, + {"U", "ja_JP@calendar=japanese;currency=JPY", "K", "ca", nullptr, "T", + "ja-JP-u-cu-jpy", "ja_JP@currency=JPY"}, + {"U", "ja_JP@calendar=japanese;currency=JPY", "E", "u", + "attr1-ca-gregory", "T", "ja-JP-u-attr1-ca-gregory", + "ja_JP@attribute=attr1;calendar=gregorian"}, + {"U", "en@colnumeric=yes", "K", "kn", "true", "T", "en-u-kn-true", + "en@colnumeric=yes"}, + {"L", "th", "R", "th", "K", "nu", "thai", "T", 
"th-TH-u-nu-thai", + "th_TH@numbers=thai"}, + {"U", "zh_Hans", "R", "sg", "K", "ca", "badcalendar", "X"}, + {"U", "zh_Hans", "R", "sg", "K", "cal", "gregory", "X"}, + {"E", "z", "ExtZ", "L", "en", "T", "en-z-extz", "en@z=extz"}, + {"E", "z", "ExtZ", "L", "en", "E", "z", "", "T", "en", "en"}, + {"E", "z", "ExtZ", "L", "en", "E", "z", nullptr, "T", "en", "en"}, + {"E", "a", "x", "X"}, + {"E", "a", "abc_def", "T", "und-a-abc-def", "@a=abc-def"}, + // Design limitation - typeless u extension keyword 0a below is interpreted as a boolean value true/yes. + // With the legacy keyword syntax, "yes" is used for such boolean value instead of "true". + // However, once the legacy keyword is translated back to BCP 47 u extension, key "0a" is unknown, + // so "yes" is preserved - not mapped to "true". We could change the code to automatically transform + // key = alphanum alpha + {"L", "en", "E", "u", "bbb-aaa-0a", "T", "en-u-aaa-bbb-0a-yes", + "en@0a=yes;attribute=aaa-bbb"}, + {"L", "fr", "R", "FR", "P", "Yoshito-ICU", "T", "fr-FR-x-yoshito-icu", + "fr_FR@x=yoshito-icu"}, + {"L", "ja", "R", "jp", "K", "ca", "japanese", "T", "ja-JP-u-ca-japanese", + "ja_JP@calendar=japanese"}, + {"K", "co", "PHONEBK", "K", "ca", "gregory", "L", "De", "T", + "de-u-ca-gregory-co-phonebk", "de@calendar=gregorian;collation=phonebook"}, + {"E", "o", "OPQR", "E", "a", "aBcD", "T", "und-a-abcd-o-opqr", "@a=abcd;o=opqr"}, + {"E", "u", "nu-thai-ca-gregory", "L", "TH", "T", "th-u-ca-gregory-nu-thai", + "th@calendar=gregorian;numbers=thai"}, + {"L", "en", "K", "tz", "usnyc", "R", "US", "T", "en-US-u-tz-usnyc", + "en_US@timezone=America/New_York"}, + {"L", "de", "K", "co", "phonebk", "K", "ks", "level1", "K", "kk", + "true", "T", "de-u-co-phonebk-kk-true-ks-level1", + "de@collation=phonebook;colnormalization=yes;colstrength=primary"}, + {"L", "en", "R", "US", "K", "ca", "gregory", "T", "en-US-u-ca-gregory", + "en_US@calendar=gregorian"}, + {"L", "en", "R", "US", "K", "cal", "gregory", "X"}, + {"L", "en", 
"R", "US", "K", "ca", "gregorian", "X"}, + {"L", "en", "R", "US", "K", "kn", "true", "T", "en-US-u-kn-true", + "en_US@colnumeric=yes"}, + {"B", "de-DE-u-co-phonebk", "C", "L", "pt", "T", "pt", "pt"}, + {"B", "ja-jp-u-ca-japanese", "N", "T", "ja-JP", "ja_JP"}, + {"B", "es-u-def-abc-co-trad", "A", "hij", "D", "def", "T", + "es-u-abc-hij-co-trad", "es@attribute=abc-hij;collation=traditional"}, + {"B", "es-u-def-abc-co-trad", "A", "hij", "D", "def", "D", "def", "T", + "es-u-abc-hij-co-trad", "es@attribute=abc-hij;collation=traditional"}, + {"L", "en", "A", "aa", "X"}, + {"B", "fr-u-attr1-cu-eur", "D", "attribute1", "X"}, + }; + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + for (int tidx = 0; tidx < UPRV_LENGTHOF(TESTCASES); tidx++) { + const char* (&testCase)[14] = TESTCASES[tidx]; + std::string actions; + for (int p = 0; p < UPRV_LENGTHOF(testCase); p++) { + if (testCase[p] == nullptr) { + actions += " (nullptr)"; + break; + } + if (p > 0) actions += " "; + actions += testCase[p]; + } + int i = 0; + const char* method; + status = U_ZERO_ERROR; + bld.clear(); + while (true) { + method = testCase[i++]; + if (strcmp("L", method) == 0) { + bld.setLanguage(testCase[i++]).build(status); + } else if (strcmp("S", method) == 0) { + bld.setScript(testCase[i++]).build(status); + } else if (strcmp("R", method) == 0) { + bld.setRegion(testCase[i++]).build(status); + } else if (strcmp("V", method) == 0) { + bld.setVariant(testCase[i++]).build(status); + } else if (strcmp("K", method) == 0) { + const char* key = testCase[i++]; + const char* type = testCase[i++]; + bld.setUnicodeLocaleKeyword(key, type).build(status); + } else if (strcmp("A", method) == 0) { + bld.addUnicodeLocaleAttribute(testCase[i++]).build(status); + } else if (strcmp("E", method) == 0) { + const char* key = testCase[i++]; + const char* value = testCase[i++]; + bld.setExtension(key[0], value).build(status); + } else if (strcmp("P", method) == 0) { + bld.setExtension('x', testCase[i++]).build(status); + 
} else if (strcmp("U", method) == 0) { + bld.setLocale(Locale(testCase[i++])).build(status); + } else if (strcmp("B", method) == 0) { + bld.setLanguageTag(testCase[i++]).build(status); + } + // clear / remove + else if (strcmp("C", method) == 0) { + bld.clear().build(status); + } else if (strcmp("N", method) == 0) { + bld.clearExtensions().build(status); + } else if (strcmp("D", method) == 0) { + bld.removeUnicodeLocaleAttribute(testCase[i++]).build(status); + } + // result + else if (strcmp("X", method) == 0) { + if (U_SUCCESS(status)) { + errln("FAIL: No error return - test case: %s", actions.c_str()); + } + } else if (strcmp("T", method) == 0) { + status = U_ZERO_ERROR; + Locale loc = bld.build(status); + if (U_FAILURE(status) || + strcmp(loc.getName(), testCase[i + 1]) != 0) { + errln("FAIL: Wrong locale ID - %s %s %s", loc.getName(), + " for test case: ", actions.c_str()); + } + std::string langtag = loc.toLanguageTag(status); + if (U_FAILURE(status) || langtag != testCase[i]) { + errln("FAIL: Wrong language tag - %s %s %s", langtag.c_str(), + " for test case: ", actions.c_str()); + } + break; + } else { + // Unknow test method + errln("Unknown test case method: There is an error in the test case data."); + break; + } + if (U_FAILURE(status)) { + if (strcmp("X", testCase[i]) == 0) { + // This failure is expected + break; + } else { + errln("FAIL: U_ILLEGAL_ARGUMENT_ERROR at offset %d %s %s", i, + " in test case: ", actions.c_str()); + break; + } + } + if (strcmp("T", method) == 0) { + break; + } + } // while(true) + } // for TESTCASES +} + +void LocaleBuilderTest::TestLocaleBuilderBasic() { + LocaleBuilder bld; + bld.setLanguage("zh"); + Verify(bld, "zh", "setLanguage('zh') got Error: %s\n"); + + bld.setScript("Hant"); + Verify(bld, "zh-Hant", "setScript('Hant') got Error: %s\n"); + + bld.setRegion("SG"); + Verify(bld, "zh-Hant-SG", "setRegion('SG') got Error: %s\n"); + + bld.setRegion("HK"); + bld.setScript("Hans"); + Verify(bld, "zh-Hans-HK", + 
"setRegion('HK') and setScript('Hans') got Error: %s\n"); + + bld.setVariant("revised"); + Verify(bld, "zh-Hans-HK-revised", + "setVariant('revised') got Error: %s\n"); + + bld.setUnicodeLocaleKeyword("nu", "thai"); + Verify(bld, "zh-Hans-HK-revised-u-nu-thai", + "setUnicodeLocaleKeyword('nu', 'thai'') got Error: %s\n"); + + bld.setUnicodeLocaleKeyword("co", "pinyin"); + Verify(bld, "zh-Hans-HK-revised-u-co-pinyin-nu-thai", + "setUnicodeLocaleKeyword('co', 'pinyin'') got Error: %s\n"); + + bld.setUnicodeLocaleKeyword("nu", "latn"); + Verify(bld, "zh-Hans-HK-revised-u-co-pinyin-nu-latn", + "setUnicodeLocaleKeyword('nu', 'latn'') got Error: %s\n"); + + bld.setUnicodeLocaleKeyword("nu", nullptr); + Verify(bld, "zh-Hans-HK-revised-u-co-pinyin", + "setUnicodeLocaleKeyword('nu', ''') got Error: %s\n"); + + bld.setUnicodeLocaleKeyword("co", nullptr); + Verify(bld, "zh-Hans-HK-revised", + "setUnicodeLocaleKeyword('nu', nullptr) got Error: %s\n"); + + bld.setScript(""); + Verify(bld, "zh-HK-revised", + "setScript('') got Error: %s\n"); + + bld.setVariant(""); + Verify(bld, "zh-HK", + "setVariant('') got Error: %s\n"); + + bld.setRegion(""); + Verify(bld, "zh", + "setRegion('') got Error: %s\n"); +} + +void LocaleBuilderTest::TestSetLanguageWellFormed() { + // http://www.unicode.org/reports/tr35/tr35.html#unicode_language_subtag + // unicode_language_subtag = alpha{2,3} | alpha{5,8}; + // ICUTC decided also support alpha{4} + static const char* wellFormedLanguages[] = { + "", + + // alpha{2} + "en", + "NE", + "eN", + "Ne", + + // alpha{3} + "aNe", + "zzz", + "AAA", + + // alpha{4} + "ABCD", + "abcd", + + // alpha{5} + "efgij", + "AbCAD", + "ZAASD", + + // alpha{6} + "efgijk", + "AADGFE", + "AkDfFz", + + // alpha{7} + "asdfads", + "ADSFADF", + "piSFkDk", + + // alpha{8} + "oieradfz", + "IADSFJKR", + "kkDSFJkR", + }; + for (const char* lang : wellFormedLanguages) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setLanguage(lang); + Locale loc = 
bld.build(status); + if (U_FAILURE(status)) { + errln("setLanguage(\"%s\") got Error: %s\n", + lang, u_errorName(status)); + } + } +} + +void LocaleBuilderTest::TestSetLanguageIllFormed() { + static const char* illFormed[] = { + "a", + "z", + "A", + "F", + "2", + "0", + "9" + "{", + ".", + "[", + "]", + "\\", + + "e1", + "N2", + "3N", + "4e", + "e:", + "43", + "a9", + + "aN0", + "z1z", + "2zz", + "3A3", + "456", + "af)", + + // Per 2019-01-23 ICUTC, we still accept 4alpha as tlang. see ICU-20321. + // "latn", + // "Arab", + // "LATN", + + "e)gij", + "Ab3AD", + "ZAAS8", + + "efgi[]", + "AA9GFE", + "7kD3Fz", + "as8fads", + "0DSFADF", + "'iSFkDk", + + "oieradf+", + "IADSFJK-", + "kkDSFJk0", + + // alpha{9} + "oieradfab", + "IADSFJKDE", + "kkDSFJkzf", + }; + for (const char* ill : illFormed) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setLanguage(ill); + Locale loc = bld.build(status); + if (status != U_ILLEGAL_ARGUMENT_ERROR) { + errln("setLanguage(\"%s\") should fail but has no Error\n", ill); + } + } +} + +void LocaleBuilderTest::TestSetScriptWellFormed() { + // http://www.unicode.org/reports/tr35/tr35.html#unicode_script_subtag + // unicode_script_subtag = alpha{4} ; + static const char* wellFormedScripts[] = { + "", + + "Latn", + "latn", + "lATN", + "laTN", + "arBN", + "ARbn", + "adsf", + "aADF", + "BSVS", + "LATn", + }; + for (const char* script : wellFormedScripts) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setScript(script); + Locale loc = bld.build(status); + if (U_FAILURE(status)) { + errln("setScript(\"%s\") got Error: %s\n", + script, u_errorName(status)); + } + } +} + +void LocaleBuilderTest::TestSetScriptIllFormed() { + static const char* illFormed[] = { + "a", + "z", + "A", + "F", + "2", + "0", + "9" + "{", + ".", + "[", + "]", + "\\", + + "e1", + "N2", + "3N", + "4e", + "e:", + "43", + "a9", + + "aN0", + "z1z", + "2zz", + "3A3", + "456", + "af)", + + "0atn", + "l1tn", + "lA2N", + "la4N", + "arB5", + "1234", + + 
"e)gij", + "Ab3AD", + "ZAAS8", + + "efgi[]", + "AA9GFE", + "7kD3Fz", + + "as8fads", + "0DSFADF", + "'iSFkDk", + + "oieradf+", + "IADSFJK-", + "kkDSFJk0", + + // alpha{9} + "oieradfab", + "IADSFJKDE", + "kkDSFJkzf", + }; + for (const char* ill : illFormed) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setScript(ill); + Locale loc = bld.build(status); + if (status != U_ILLEGAL_ARGUMENT_ERROR) { + errln("setScript(\"%s\") should fail but has no Error\n", ill); + } + } +} + +void LocaleBuilderTest::TestSetRegionWellFormed() { + // http://www.unicode.org/reports/tr35/tr35.html#unicode_region_subtag + // unicode_region_subtag = (alpha{2} | digit{3}) + static const char* wellFormedRegions[] = { + "", + + // alpha{2} + "en", + "NE", + "eN", + "Ne", + + // digit{3} + "000", + "999", + "123", + "987" + }; + for (const char* region : wellFormedRegions) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setRegion(region); + Locale loc = bld.build(status); + if (U_FAILURE(status)) { + errln("setRegion(\"%s\") got Error: %s\n", + region, u_errorName(status)); + } + } +} + +void LocaleBuilderTest::TestSetRegionIllFormed() { + static const char* illFormed[] = { + "a", + "z", + "A", + "F", + "2", + "0", + "9" + "{", + ".", + "[", + "]", + "\\", + + "e1", + "N2", + "3N", + "4e", + "e:", + "43", + "a9", + + "aN0", + "z1z", + "2zz", + "3A3", + "4.6", + "af)", + + "0atn", + "l1tn", + "lA2N", + "la4N", + "arB5", + "1234", + + "e)gij", + "Ab3AD", + "ZAAS8", + + "efgi[]", + "AA9GFE", + "7kD3Fz", + + "as8fads", + "0DSFADF", + "'iSFkDk", + + "oieradf+", + "IADSFJK-", + "kkDSFJk0", + + // alpha{9} + "oieradfab", + "IADSFJKDE", + "kkDSFJkzf", + }; + for (const char* ill : illFormed) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setRegion(ill); + Locale loc = bld.build(status); + if (status != U_ILLEGAL_ARGUMENT_ERROR) { + errln("setRegion(\"%s\") should fail but has no Error\n", ill); + } + } +} + +void 
LocaleBuilderTest::TestSetVariantWellFormed() { + // http://www.unicode.org/reports/tr35/tr35.html#unicode_variant_subtag + // (sep unicode_variant_subtag)* + // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; + static const char* wellFormedVariants[] = { + "", + + // alphanum{5} + "efgij", + "AbCAD", + "ZAASD", + "0AASD", + "A1CAD", + "ef2ij", + "ads3X", + "owqF4", + + // alphanum{6} + "efgijk", + "AADGFE", + "AkDfFz", + "0ADGFE", + "A9DfFz", + "AADG7E", + + // alphanum{7} + "asdfads", + "ADSFADF", + "piSFkDk", + "a0dfads", + "ADSF3DF", + "piSFkD9", + + // alphanum{8} + "oieradfz", + "IADSFJKR", + "kkDSFJkR", + "0ADSFJKR", + "12345679", + + // digit alphanum{3} + "0123", + "1abc", + "20EF", + "30EF", + "8A03", + "3Ax3", + "9Axy", + + // (sep unicode_variant_subtag)* + "0123-4567", + "0ab3-ABCDE", + "9ax3-xByD9", + "9ax3-xByD9-adfk934a", + + "0123_4567", + "0ab3_ABCDE", + "9ax3_xByD9", + "9ax3_xByD9_adfk934a", + + "9ax3-xByD9_adfk934a", + "9ax3_xByD9-adfk934a", + }; + for (const char* variant : wellFormedVariants) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setVariant(variant); + Locale loc = bld.build(status); + if (U_FAILURE(status)) { + errln("setVariant(\"%s\") got Error: %s\n", + variant, u_errorName(status)); + } + } +} + +void LocaleBuilderTest::TestSetVariantIllFormed() { + static const char* illFormed[] = { + "a", + "z", + "A", + "F", + "2", + "0", + "9" + "{", + ".", + "[", + "]", + "\\", + + "e1", + "N2", + "3N", + "4e", + "e:", + "43", + "a9", + "en", + "NE", + "eN", + "Ne", + + "aNe", + "zzz", + "AAA", + "aN0", + "z1z", + "2zz", + "3A3", + "4.6", + "af)", + "345", + "923", + + "Latn", + "latn", + "lATN", + "laTN", + "arBN", + "ARbn", + "adsf", + "aADF", + "BSVS", + "LATn", + "l1tn", + "lA2N", + "la4N", + "arB5", + "abc3", + "A3BC", + + "e)gij", + "A+3AD", + "ZAA=8", + + "efgi[]", + "AA9]FE", + "7k[3Fz", + + "as8f/ds", + "0DSFAD{", + "'iSFkDk", + + "oieradf+", + "IADSFJK-", + "k}DSFJk0", + + // alpha{9} + "oieradfab", 
+ "IADSFJKDE", + "kkDSFJkzf", + "123456789", + + "-0123", + "-0123-4567", + "0123-4567-", + "-123-4567", + "_0123", + "_0123_4567", + "0123_4567_", + "_123_4567", + + "-abcde-figjk", + "abcde-figjk-", + "-abcde-figjk-", + "_abcde_figjk", + "abcde_figjk_", + "_abcde_figjk_", + }; + for (const char* ill : illFormed) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setVariant(ill); + Locale loc = bld.build(status); + if (status != U_ILLEGAL_ARGUMENT_ERROR) { + errln("setVariant(\"%s\") should fail but has no Error\n", ill); + } + } +} + +void LocaleBuilderTest::TestSetUnicodeLocaleKeywordWellFormed() { + // http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_extensions + // keyword = key (sep type)? ; + // key = alphanum alpha ; + // type = alphanum{3,8} (sep alphanum{3,8})* ; + static const char* wellFormed_key_value[] = { + "aa", "123", + "3b", "zyzbcdef", + "0Z", "1ZB30zk9-abc", + "cZ", "2ck30zfZ-adsf023-234kcZ", + "ZZ", "Lant", + "ko", "", + }; + for (int i = 0; i < UPRV_LENGTHOF(wellFormed_key_value); i += 2) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setUnicodeLocaleKeyword(wellFormed_key_value[i], + wellFormed_key_value[i + 1]); + Locale loc = bld.build(status); + if (U_FAILURE(status)) { + errln("setUnicodeLocaleKeyword(\"%s\", \"%s\") got Error: %s\n", + wellFormed_key_value[i], + wellFormed_key_value[i + 1], + u_errorName(status)); + } + } +} + +void LocaleBuilderTest::TestSetUnicodeLocaleKeywordIllFormedKey() { + static const char* illFormed[] = { + "34", + "ab-cde", + "123", + "b3", + "zyzabcdef", + "Z0", + }; + for (const char* ill : illFormed) { + UErrorCode status = U_ZERO_ERROR; + LocaleBuilder bld; + bld.setUnicodeLocaleKeyword(ill, "abc"); + Locale loc = bld.build(status); + if (status != U_ILLEGAL_ARGUMENT_ERROR) { + errln("setUnicodeLocaleKeyword(\"%s\", \"abc\") should fail but has no Error\n", + ill); + } + } +} + +void LocaleBuilderTest::TestSetUnicodeLocaleKeywordIllFormedValue() { + static const 
char* illFormed[] = {  // candidate types; each is passed with the key "ab" below
+        "34",
+        "ab-",
+        "-cd",
+        "-ef-",
+        "zyzabcdef",
+        "ab-abc",
+        "1ZB30zfk9-abc",
+        "2ck30zfk9-adsf023-234kcZ",
+    };
+    for (const char* ill : illFormed) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setUnicodeLocaleKeyword("ab", ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {  // success or any other status is a test failure
+            errln("setUnicodeLocaleKeyword(\"ab\", \"%s\") should fail but has no Error\n",
+                  ill);
+        }
+    }
+}
+
+void LocaleBuilderTest::TestAddRemoveUnicodeLocaleAttribute() {  // add attributes, then remove them one by one, checking the language tag each time
+    LocaleBuilder bld;
+    UErrorCode status = U_ZERO_ERROR;  // never reset in this function: once it holds a failure, every later check reports it too
+    Locale loc = bld.setLanguage("fr")
+                    .addUnicodeLocaleAttribute("abc")
+                    .addUnicodeLocaleAttribute("aBc")
+                    .addUnicodeLocaleAttribute("EFG")
+                    .addUnicodeLocaleAttribute("efghi")
+                    .addUnicodeLocaleAttribute("efgh")
+                    .addUnicodeLocaleAttribute("efGhi")
+                    .addUnicodeLocaleAttribute("EFg")
+                    .addUnicodeLocaleAttribute("hijk")
+                    .addUnicodeLocaleAttribute("EFG")
+                    .addUnicodeLocaleAttribute("HiJK")
+                    .addUnicodeLocaleAttribute("aBc")
+                    .build(status);
+    if (U_FAILURE(status)) {
+        errln("addUnicodeLocaleAttribute() got Error: %s\n",
+              u_errorName(status));
+    }
+    std::string expected("fr-u-abc-efg-efgh-efghi-hijk");  // the inputs above, lower-cased, de-duplicated and in sorted order
+    std::string actual = loc.toLanguageTag(status);
+    if (U_FAILURE(status) || expected != actual) {
+        errln("Should get \"%s\" but get \"%s\"\n", expected.c_str(), actual.c_str());
+    }
+
+    // remove "efgh" in the middle with different casing.
+    loc = bld.removeUnicodeLocaleAttribute("eFgH").build(status);
+    if (U_FAILURE(status)) {
+        errln("removeUnicodeLocaleAttribute() got Error: %s\n",
+              u_errorName(status));
+    }
+    expected = "fr-u-abc-efg-efghi-hijk";
+    actual = loc.toLanguageTag(status);
+    if (U_FAILURE(status) || expected != actual) {
+        errln("Should get \"%s\" but get \"%s\"\n", expected.c_str(), actual.c_str());
+    }
+
+    // remove non-existing attributes.
+    loc = bld.removeUnicodeLocaleAttribute("efgh").build(status);  // "efgh" is no longer present at this point
+    if (U_FAILURE(status)) {
+        errln("removeUnicodeLocaleAttribute() got Error: %s\n",
+              u_errorName(status));
+    }
+    actual = loc.toLanguageTag(status);  // expected is deliberately left unchanged: the tag must stay the same
+    if (U_FAILURE(status) || expected != actual) {
+        errln("Should get \"%s\" but get \"%s\"\n", expected.c_str(), actual.c_str());
+    }
+
+    // remove "abc" in the beginning with different casing.
+    loc = bld.removeUnicodeLocaleAttribute("ABC").build(status);
+    if (U_FAILURE(status)) {
+        errln("removeUnicodeLocaleAttribute() got Error: %s\n",
+              u_errorName(status));
+    }
+    expected = "fr-u-efg-efghi-hijk";
+    actual = loc.toLanguageTag(status);
+    if (U_FAILURE(status) || expected != actual) {
+        errln("Should get \"%s\" but get \"%s\"\n", expected.c_str(), actual.c_str());
+    }
+
+    // remove "hij", which is only a prefix of the last attribute "hijk": nothing may change.
+    loc = bld.removeUnicodeLocaleAttribute("hij").build(status);
+    if (U_FAILURE(status)) {
+        errln("removeUnicodeLocaleAttribute() got Error: %s\n",
+              u_errorName(status));
+    }
+    actual = loc.toLanguageTag(status);  // expected is deliberately left unchanged here as well
+    if (U_FAILURE(status) || expected != actual) {
+        errln("Should get \"%s\" but get \"%s\"\n", expected.c_str(), actual.c_str());
+    }
+
+    // remove "hijk" in the end with different casing.
+    loc = bld.removeUnicodeLocaleAttribute("hIJK").build(status);
+    if (U_FAILURE(status)) {
+        errln("removeUnicodeLocaleAttribute() got Error: %s\n",
+              u_errorName(status));
+    }
+    expected = "fr-u-efg-efghi";
+    actual = loc.toLanguageTag(status);
+    if (U_FAILURE(status) || expected != actual) {
+        errln("Should get \"%s\" but get \"%s\"\n", expected.c_str(), actual.c_str());
+    }
+
+    // remove "efghi" in the end with different casing.
+    loc = bld.removeUnicodeLocaleAttribute("EFGhi").build(status);
+    if (U_FAILURE(status)) {
+        errln("removeUnicodeLocaleAttribute() got Error: %s\n",
+              u_errorName(status));
+    }
+    expected = "fr-u-efg";
+    actual = loc.toLanguageTag(status);
+    if (U_FAILURE(status) || expected != actual) {
+        errln("Should get \"%s\" but get \"%s\"\n", expected.c_str(), actual.c_str());
+    }
+
+    // remove "efg", the only attribute left, with different casing.
+    loc = bld.removeUnicodeLocaleAttribute("EFG").build(status);
+    if (U_FAILURE(status)) {
+        errln("removeUnicodeLocaleAttribute() got Error: %s\n",
+              u_errorName(status));
+    }
+    expected = "fr";  // no attribute left, so the "-u-" part is expected to disappear
+    actual = loc.toLanguageTag(status);
+    if (U_FAILURE(status) || expected != actual) {
+        errln("Should get \"%s\" but get \"%s\"\n", expected.c_str(), actual.c_str());
+    }
+
+}
+
+void LocaleBuilderTest::TestAddRemoveUnicodeLocaleAttributeWellFormed() {  // adds every attribute below and interleaves removals; no step may report an error
+    // http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_extensions
+    // attribute = alphanum{3,8} ;
+    static const char* wellFormedAttributes[] = {
+        // alphanum{3}
+        "AbC",
+        "ZAA",
+        "0AA",
+        "x3A",
+        "xa8",
+
+        // alphanum{4}
+        "AbCA",
+        "ZASD",
+        "0ASD",
+        "A3a4",
+        "zK90",
+
+        // alphanum{5}
+        "efgij",
+        "AbCAD",
+        "ZAASD",
+        "0AASD",
+        "A1CAD",
+        "ef2ij",
+        "ads3X",
+        "owqF4",
+
+        // alphanum{6}
+        "efgijk",
+        "AADGFE",
+        "AkDfFz",
+        "0ADGFE",
+        "A9DfFz",
+        "AADG7E",
+
+        // alphanum{7}
+        "asdfads",
+        "ADSFADF",
+        "piSFkDk",
+        "a0dfads",
+        "ADSF3DF",
+        "piSFkD9",
+
+        // alphanum{8}
+        "oieradfz",
+        "IADSFJKR",
+        "kkDSFJkR",
+    };
+    LocaleBuilder bld;  // shared across iterations, unlike the per-entry builders of the other table-driven tests
+    for (int i = 0; i < UPRV_LENGTHOF(wellFormedAttributes); i++) {
+        if (i % 5 == 0) {  // start from an empty builder every fifth entry
+            bld.clear();
+        }
+        UErrorCode status = U_ZERO_ERROR;
+        bld.addUnicodeLocaleAttribute(wellFormedAttributes[i]);
+        Locale loc = bld.build(status);
+        if (U_FAILURE(status)) {
+            errln("addUnicodeLocaleAttribute(\"%s\") got Error: %s\n",
+                  wellFormedAttributes[i], u_errorName(status));
+        }
+        if (i > 2) {  // i - 3 is a valid index only from the fourth entry on
+
bld.removeUnicodeLocaleAttribute(wellFormedAttributes[i - 1]);
+            loc = bld.build(status);
+            if (U_FAILURE(status)) {
+                errln("removeUnicodeLocaleAttribute(\"%s\") got Error: %s\n",
+                      wellFormedAttributes[i - 1], u_errorName(status));
+            }
+            bld.removeUnicodeLocaleAttribute(wellFormedAttributes[i - 3]);
+            loc = bld.build(status);
+            if (U_FAILURE(status)) {
+                errln("removeUnicodeLocaleAttribute(\"%s\") got Error: %s\n",
+                      wellFormedAttributes[i - 3], u_errorName(status));
+            }
+        }
+    }
+}
+
+void LocaleBuilderTest::TestAddUnicodeLocaleAttributeIllFormed() {  // every entry must make build() report U_ILLEGAL_ARGUMENT_ERROR
+    static const char* illFormed[] = {
+        "aa",
+        "34",
+        "ab-",
+        "-cd",
+        "-ef-",
+        "zyzabcdef",
+        "123456789",
+        "ab-abc",
+        "1ZB30zfk9-abc",
+        "2ck30zfk9-adsf023-234kcZ",
+    };
+    for (const char* ill : illFormed) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.addUnicodeLocaleAttribute(ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            errln("addUnicodeLocaleAttribute(\"%s\") should fail but has no Error\n",
+                  ill);
+        }
+    }
+}
+
+void LocaleBuilderTest::TestSetExtensionU() {  // drives one builder through 'u'/'U' extension updates; Verify() gets the expected tag after each step
+    LocaleBuilder bld;
+    bld.setLanguage("zh");
+    Verify(bld, "zh",
+           "setLanguage(\"zh\") got Error: %s\n");
+
+    bld.setExtension('u', "co-stroke");
+    Verify(bld, "zh-u-co-stroke",
+           "setExtension('u', \"co-stroke\") got Error: %s\n");
+
+    bld.setExtension('U', "ca-islamic");  // expected tag: the new value replaces "co-stroke" instead of being merged with it
+    Verify(bld, "zh-u-ca-islamic",
+           "setExtension('U', \"ca-islamic\") got Error: %s\n");
+
+    bld.setExtension('u', "ca-chinese");
+    Verify(bld, "zh-u-ca-chinese",
+           "setExtension('u', \"ca-chinese\") got Error: %s\n");
+
+    bld.setExtension('U', "co-pinyin");
+    Verify(bld, "zh-u-co-pinyin",
+           "setExtension('U', \"co-pinyin\") got Error: %s\n");
+
+    bld.setRegion("TW");
+    Verify(bld, "zh-TW-u-co-pinyin",
+           "setRegion(\"TW\") got Error: %s\n");
+
+    bld.setExtension('U', "");  // expected tag: an empty value removes the extension
+    Verify(bld, "zh-TW",
+           "setExtension('U', \"\") got Error: %s\n");
+
+    bld.setExtension('u', "abc-defg-kr-face");
+    Verify(bld, "zh-TW-u-abc-defg-kr-face",
+           "setExtension('u', \"abc-defg-kr-face\") got Error: %s\n");
+
+    bld.setExtension('U', "ca-japanese");
+    Verify(bld, "zh-TW-u-ca-japanese",
+           "setExtension('U', \"ca-japanese\") got Error: %s\n");
+
+}
+
+void LocaleBuilderTest::TestSetExtensionValidateUWellFormed() {  // every entry must be accepted as a 'u' extension value
+    static const char* wellFormedExtensions[] = {
+        // keyword
+        //   keyword = key (sep type)? ;
+        //   key = alphanum alpha ;
+        //   type = alphanum{3,8} (sep alphanum{3,8})* ;
+        "3A",
+        "ZA",
+        "az-abc",
+        "zz-123",
+        "7z-12345678",
+        "kb-A234567Z",
+        // (sep keyword)+
+        "1z-ZZ",
+        "2z-ZZ-123",
+        "3z-ZZ-123-cd",
+        "0z-ZZ-123-cd-efghijkl",
+        // attribute
+        "abc",
+        "456",
+        "87654321",
+        "ZABADFSD",
+        // (sep attribute)+
+        "abc-ZABADFSD",
+        "123-ZABADFSD",
+        "K2K-12345678",
+        "K2K-12345678-zzz",
+        // (sep attribute)+ (sep keyword)*
+        "K2K-12345678-zz",
+        "K2K-12345678-zz-0z",
+        "K2K-12345678-9z-AZ-abc",
+        "K2K-12345678-zz-9A-234",
+        "K2K-12345678-zk0-abc-efg-zz-9k-234",
+    };
+    for (const char* extension : wellFormedExtensions) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setExtension('u', extension);
+        Locale loc = bld.build(status);
+        if (U_FAILURE(status)) {
+            errln("setExtension('u', \"%s\") got Error: %s\n",
+                  extension, u_errorName(status));
+        }
+    };
+}
+
+void LocaleBuilderTest::TestSetExtensionValidateUIllFormed() {  // every entry must make build() report U_ILLEGAL_ARGUMENT_ERROR
+    static const char* illFormed[] = {
+        // bad key
+        "-",
+        "-ab",
+        "ab-",
+        "abc-",
+        "-abc",
+        "0",
+        "a",
+        "A0",
+        "z9",
+        "09",
+        "90",
+        // bad keyword
+        "AB-A0",
+        "AB-efg-A0",
+        "xy-123456789",
+        "AB-Aa-",
+        "AB-Aac-",
+        // bad attribute
+        "abcdefghi",
+        "abcdefgh-",
+        "abcdefgh-abcdefghi",
+        "abcdefgh-1",
+        "abcdefgh-a",
+        "abcdefgh-a2345678z",
+    };
+    for (const char* ill : illFormed) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setExtension('u', ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            errln("setExtension('u', \"%s\") should fail but has no Error\n",
+                  ill);
+        }
+    }
+}
+
+void LocaleBuilderTest::TestSetExtensionT() {  // drives one builder through 't'/'T' extension updates; Verify() gets the expected tag after each step
+    LocaleBuilder bld;
+    bld.setLanguage("fr");
+    Verify(bld, "fr",
+           "setLanguage(\"fr\") got Error: %s\n");
+
+    bld.setExtension('T', "zh");
+    Verify(bld, "fr-t-zh",
+           "setExtension('T', \"zh\") got Error: %s\n");
+
+    bld.setExtension('t', "zh-Hant-TW-1234-A9-123-456ABCDE");  // expected tag carries this value entirely in lower case
+    Verify(bld, "fr-t-zh-hant-tw-1234-a9-123-456abcde",
+           "setExtension('t', \"zh-Hant-TW-1234-A9-123-456ABCDE\") got Error: %s\n");
+
+    bld.setExtension('T', "a9-123");
+    Verify(bld, "fr-t-a9-123",
+           "setExtension('T', \"a9-123\") got Error: %s\n");
+
+    bld.setRegion("MX");
+    Verify(bld, "fr-MX-t-a9-123",
+           "setRegion(\"MX\") got Error: %s\n");
+
+    bld.setScript("Hans");
+    Verify(bld, "fr-Hans-MX-t-a9-123",
+           "setScript(\"Hans\") got Error: %s\n");
+
+    bld.setVariant("9abc-abcde");
+    Verify(bld, "fr-Hans-MX-9abc-abcde-t-a9-123",
+           "setVariant(\"9abc-abcde\") got Error: %s\n");
+
+    bld.setExtension('T', "");  // expected tag: an empty value removes the extension
+    Verify(bld, "fr-Hans-MX-9abc-abcde",
+           "bld.setExtension('T', \"\") got Error: %s\n");
+}
+
+void LocaleBuilderTest::TestSetExtensionValidateTWellFormed() {  // every entry must be accepted as a 't' extension value
+    // ((sep tlang (sep tfield)*) | (sep tfield)+)
+    static const char* wellFormedExtensions[] = {
+        // tlang
+        //  tlang = unicode_language_subtag (sep unicode_script_subtag)?
+        //          (sep unicode_region_subtag)?
(sep unicode_variant_subtag)* ;
+        // unicode_language_subtag
+        "en",
+        "abc",
+        "abcde",
+        "ABCDEFGH",
+        // unicode_language_subtag sep unicode_script_subtag
+        "en-latn",
+        "abc-arab",
+        "ABCDEFGH-Thai",
+        // unicode_language_subtag sep unicode_script_subtag sep unicode_region_subtag
+        "en-latn-ME",
+        "abc-arab-RU",
+        "ABCDEFGH-Thai-TH",
+        "en-latn-409",
+        "abc-arab-123",
+        "ABCDEFGH-Thai-456",
+        // unicode_language_subtag sep unicode_region_subtag
+        "en-ME",
+        "abc-RU",
+        "ABCDEFGH-TH",
+        "en-409",
+        "abc-123",
+        "ABCDEFGH-456",
+        // unicode_language_subtag sep unicode_script_subtag sep unicode_region_subtag
+        // (sep unicode_variant_subtag)*
+        "en-latn-ME-abcde",
+        "abc-arab-RU-3abc-abcdef",
+        "ABCDEFGH-Thai-TH-ADSFS-9xyz-abcdef",
+        "en-latn-409-xafsa",
+        "abc-arab-123-ADASDF",
+        "ABCDEFGH-Thai-456-9sdf-ADASFAS",
+        // (sep tfield)+
+        "A0-abcde",
+        "z9-abcde123",
+        "z9-abcde123-a1-abcde",
+        // tlang (sep tfield)*
+        "fr-A0-abcde",
+        "fr-FR-A0-abcde",
+        "fr-123-z9-abcde123-a1-abcde",
+        "fr-Latn-FR-z9-abcde123-a1-abcde",
+        "gab-Thai-TH-abcde-z9-abcde123-a1-abcde",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-abcde",
+    };
+    for (const char* extension : wellFormedExtensions) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;  // fresh builder per entry
+        bld.setExtension('t', extension);
+        Locale loc = bld.build(status);
+        if (U_FAILURE(status)) {
+            errln("setExtension('t', \"%s\") got Error: %s\n",
+                  extension, u_errorName(status));
+        }
+    };
+}
+
+void LocaleBuilderTest::TestSetExtensionValidateTIllFormed() {  // every active entry must make build() report U_ILLEGAL_ARGUMENT_ERROR
+    static const char* illFormed[] = {
+        "a",
+        "a-",
+        "0",
+        "9-",
+        "-9",
+        "-z",
+        // "Latn", // Per 2019-01-23 ICUTC, still accept 4alpha.
See ICU-20321
+        "Latn-",
+        "en-",
+        "nob-",
+        "-z9",
+        "a3",
+        "a3-",
+        "3a",
+        "0z-",
+        "en-123-a1",
+        "en-TH-a1",
+        "gab-TH-a1",
+        "gab-Thai-a1",
+        "gab-Thai-TH-a1",
+        "gab-Thai-TH-0bde-a1",
+        "gab-Thai-TH-0bde-3b",
+        "gab-Thai-TH-0bde-z9-a1",
+        "gab-Thai-TH-0bde-z9-3b",
+        "gab-Thai-TH-0bde-z9-abcde123-3b",
+        "gab-Thai-TH-0bde-z9-abcde123-ab",
+        "gab-Thai-TH-0bde-z9-abcde123-ab",  // repeats the previous entry
+        "gab-Thai-TH-0bde-z9-abcde123-a1",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-a",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-ab",
+    };
+    for (const char* ill : illFormed) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setExtension('t', ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            errln("setExtension('t', \"%s\") should fail but has no Error\n",
+                  ill);
+        }
+    }
+}
+
+void LocaleBuilderTest::TestSetExtensionPU() {  // drives one builder through private-use ('x'/'X') updates; Verify() gets the expected tag after each step
+    LocaleBuilder bld;
+    bld.setLanguage("ar");
+    Verify(bld, "ar",
+           "setLanguage(\"ar\") got Error: %s\n");
+
+    bld.setExtension('X', "a-b-c-d-e");
+    Verify(bld, "ar-x-a-b-c-d-e",
+           "setExtension('X', \"a-b-c-d-e\") got Error: %s\n");
+
+    bld.setExtension('x', "0-1-2-3");
+    Verify(bld, "ar-x-0-1-2-3",
+           "setExtension('x', \"0-1-2-3\") got Error: %s\n");
+
+    bld.setExtension('X', "0-12345678-x-x");
+    Verify(bld, "ar-x-0-12345678-x-x",
+           "setExtension('X', \"0-12345678-x-x\") got Error: %s\n");
+
+    bld.setRegion("TH");
+    Verify(bld, "ar-TH-x-0-12345678-x-x",
+           "setRegion(\"TH\") got Error: %s\n");
+
+    bld.setExtension('X', "");  // expected tag: an empty value removes the private-use part
+    Verify(bld, "ar-TH",
+           "setExtension('X', \"\") got Error: %s\n");
+}
+
+void LocaleBuilderTest::TestSetExtensionValidatePUWellFormed() {  // every entry must be accepted as an 'x' extension value
+    // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
+    static const char* wellFormedExtensions[] = {
+        "a",  // Short subtag
+        "z",  // Short subtag
+        "0",  // Short subtag, digit
+        "9",  // Short subtag, digit
+        "a-0",  // Two short subtag, alpha and digit
+        "9-z",  // Two short subtag, digit and alpha
+        "ab",
+        "abc",
+        "abcefghi",  // Long subtag
+        "87654321",
+        "01",
+        "234",
+        "0a-ab-87654321",  // Three subtags
+        "87654321-ab-00-3A",  // Four subtags
+        "a-9-87654321",  // Three subtags with short and long subtags
+        "87654321-ab-0-3A",
+    };
+    for (const char* extension : wellFormedExtensions) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setExtension('x', extension);
+        Locale loc = bld.build(status);
+        if (U_FAILURE(status)) {
+            errln("setExtension('x', \"%s\") got Error: %s\n",
+                  extension, u_errorName(status));
+        }
+    };
+}
+
+void LocaleBuilderTest::TestSetExtensionValidatePUIllFormed() {  // every entry must make build() report U_ILLEGAL_ARGUMENT_ERROR
+    static const char* illFormed[] = {
+        "123456789",  // Too long
+        "abcdefghi",  // Too long
+        "ab-123456789",  // Second subtag too long
+        "abcdefghi-12",  // First subtag too long
+        "a-ab-987654321",  // Third subtag too long
+        "987654321-a-0-3",  // First subtag too long
+    };
+    for (const char* ill : illFormed) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setExtension('x', ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            errln("setExtension('x', \"%s\") should fail but has no Error\n",
+                  ill);
+        }
+    }
+}
+
+void LocaleBuilderTest::TestSetExtensionOthers() {  // same walk-through for extension letters other than t, u and x
+    LocaleBuilder bld;
+    bld.setLanguage("fr");
+    Verify(bld, "fr",
+           "setLanguage(\"fr\") got Error: %s\n");
+
+    bld.setExtension('Z', "ab");
+    Verify(bld, "fr-z-ab",
+           "setExtension('Z', \"ab\") got Error: %s\n");
+
+    bld.setExtension('0', "xyz12345-abcdefg");  // expected tag lists the extensions in singleton order: 0, then a, then z
+    Verify(bld, "fr-0-xyz12345-abcdefg-z-ab",
+           "setExtension('0', \"xyz12345-abcdefg\") got Error: %s\n");
+
+    bld.setExtension('a', "01-12345678-ABcdef");
+    Verify(bld, "fr-0-xyz12345-abcdefg-a-01-12345678-abcdef-z-ab",
+           "setExtension('a', \"01-12345678-ABcdef\") got Error: %s\n");
+
+    bld.setRegion("TH");
+    Verify(bld, "fr-TH-0-xyz12345-abcdefg-a-01-12345678-abcdef-z-ab",
+           "setRegion(\"TH\") got Error: %s\n");
+
+    bld.setScript("Arab");
+    Verify(bld, "fr-Arab-TH-0-xyz12345-abcdefg-a-01-12345678-abcdef-z-ab",
+           "setScript(\"Arab\") got
Error: %s\n");
+
+    bld.setExtension('A', "97");  // expected tag: upper-case 'A' addresses the same extension as 'a' and replaces its value
+    Verify(bld, "fr-Arab-TH-0-xyz12345-abcdefg-a-97-z-ab",
+           "setExtension('A', \"97\") got Error: %s\n");
+
+    bld.setExtension('a', "");
+    Verify(bld, "fr-Arab-TH-0-xyz12345-abcdefg-z-ab",
+           "setExtension('a', \"\") got Error: %s\n");
+
+    bld.setExtension('0', "");
+    Verify(bld, "fr-Arab-TH-z-ab",
+           "setExtension('0', \"\") got Error: %s\n");
+}
+
+void LocaleBuilderTest::TestSetExtensionValidateOthersWellFormed() {  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
+    static const char* wellFormedExtensions[] = {
+        "ab",
+        "abc",
+        "abcefghi",
+        "01",
+        "234",
+        "87654321",
+        "0a-ab-87654321",
+        "87654321-ab-00-3A",
+    };
+
+    const char * aToZ = "abcdefghijklmnopqrstuvwxyz";
+    const int32_t aToZLen = uprv_strlen(aToZ);
+    int32_t i = 0;  // cycles through aToZ so each value is paired with a different extension letter
+    for (const char* extension : wellFormedExtensions) {
+        char ch = aToZ[i];
+        i = (i + 1) % aToZLen;
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setExtension(ch, extension);
+        Locale loc = bld.build(status);
+        if (U_FAILURE(status)) {
+            errln("setExtension('%c', \"%s\") got Error: %s\n",
+                  ch, extension, u_errorName(status));
+        }
+    };
+
+    const char* someChars =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789`~!@#$%^&*()-_=+;:,.<>?";
+    const int32_t someCharsLen = uprv_strlen(someChars);
+    for (int32_t j = 0; j < someCharsLen; j++) {  // second pass: try each of these characters as the extension letter
+        char ch = someChars[j];
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setExtension(ch, wellFormedExtensions[ch % UPRV_LENGTHOF(wellFormedExtensions)]);  // ch only selects which well-formed value is used
+        Locale loc = bld.build(status);
+        if (uprv_isASCIILetter(ch) || ('0' <= ch && ch <= '9')) {
+            if (ch != 't' && ch != 'T' && ch != 'u' && ch != 'U' && ch != 'x' && ch != 'X') {  // the result for t/u/x is not checked here
+                if (U_FAILURE(status)) {
+                    errln("setExtension('%c', \"%s\") got Error: %s\n",
+                          ch, wellFormedExtensions[ch % UPRV_LENGTHOF(wellFormedExtensions)], u_errorName(status));
+                }
+            }
+        } else {  // anything that is not an ASCII letter or digit must be rejected
+            if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+                errln("setExtension('%c', \"%s\") should fail but has no Error\n",
+                      ch,
wellFormedExtensions[ch % UPRV_LENGTHOF(wellFormedExtensions)]);
+            }
+        }
+
+    }
+}
+
+void LocaleBuilderTest::TestSetExtensionValidateOthersIllFormed() {  // every entry must make build() report U_ILLEGAL_ARGUMENT_ERROR
+    static const char* illFormed[] = {
+        "0",  // Too short
+        "a",  // Too short
+        "123456789",  // Too long
+        "abcdefghi",  // Too long
+        "ab-123456789",  // Second subtag too long
+        "abcdefghi-12",  // First subtag too long
+        "a-ab-87654321",  // First subtag too short
+        "87654321-a-0-3",  // Single-character subtags too short
+    };
+    const char * aToZ = "abcdefghijklmnopqrstuvwxyz";
+    const int32_t aToZLen = uprv_strlen(aToZ);
+    int32_t i = 0;  // cycles through aToZ so each value is paired with a different extension letter
+    for (const char* ill : illFormed) {
+        char ch = aToZ[i];
+        i = (i + 1) % aToZLen;
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setExtension(ch, ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            errln("setExtension('%c', \"%s\") should fail but has no Error\n",
+                  ch, ill);
+        }
+    }
+}
+
+void LocaleBuilderTest::TestSetLocale() {  // a locale built field by field must survive a setLocale() round trip through a second builder
+    LocaleBuilder bld1, bld2;
+    UErrorCode status = U_ZERO_ERROR;
+    Locale l1 = bld1.setLanguage("en")
+                    .setScript("Latn")
+                    .setRegion("MX")
+                    .setVariant("3456-abcde")
+                    .addUnicodeLocaleAttribute("456")
+                    .addUnicodeLocaleAttribute("123")
+                    .setUnicodeLocaleKeyword("nu", "thai")
+                    .setUnicodeLocaleKeyword("co", "stroke")
+                    .setUnicodeLocaleKeyword("ca", "chinese")
+                    .build(status);
+    if (U_FAILURE(status) || l1.isBogus()) {
+        errln("build got Error: %s\n", u_errorName(status));
+    }
+    status = U_ZERO_ERROR;  // report the second build independently of the first
+    Locale l2 = bld2.setLocale(l1).build(status);  // untouched builder: l2 can only come from what setLocale() copied
+    if (U_FAILURE(status) || l2.isBogus()) {
+        errln("build got Error: %s\n", u_errorName(status));
+    }
+
+    if (l1 != l2) {
+        errln("Two locales should be the same, but one is '%s' and the other is '%s'",
+              l1.getName(), l2.getName());
+    }
+}
+
+void LocaleBuilderTest::TestPosixCases() {
+    UErrorCode status = U_ZERO_ERROR;
+    Locale l1 = Locale::forLanguageTag("en-US-u-va-posix", status);
+    if (U_FAILURE(status) || l1.isBogus()) {
+        errln("build got Error: %s\n",
u_errorName(status));
+    }
+    LocaleBuilder bld;
+    bld.setLanguage("en")
+        .setRegion("MX")
+        .setScript("Arab")
+        .setUnicodeLocaleKeyword("nu", "Thai")
+        .setExtension('x', "1");
+    // All of above should be cleared by the setLocale call.
+    Locale l2 = bld.setLocale(l1).build(status);
+    if (U_FAILURE(status) || l2.isBogus()) {
+        errln("build got Error: %s\n", u_errorName(status));
+    }
+    if (l1 != l2) {  // nothing set on bld before setLocale() may leak into the result
+        errln("The result locale should be the set as the setLocale %s but got %s\n",
+              l1.toLanguageTag(status).c_str(),
+              l2.toLanguageTag(status).c_str());
+    }
+    Locale posix("en-US-POSIX");  // same locale spelled with a POSIX variant instead of "-u-va-posix"
+    if (posix != l2) {
+        errln("The result locale should be the set as %s but got %s\n",
+              posix.getName(), l2.getName());
+    }
+}
diff --git a/icu4c/source/test/intltest/localebuildertest.h b/icu4c/source/test/intltest/localebuildertest.h
new file mode 100644
index 00000000000..41f3730ff24
--- /dev/null
+++ b/icu4c/source/test/intltest/localebuildertest.h
@@ -0,0 +1,51 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "intltest.h"  // NOTE(review): this header has no include guard or #pragma once
+#include "unicode/localebuilder.h"
+
+
+/**
+ * Tests for the LocaleBuilder class: one Test* method per scenario, plus the private helper Verify().
+ **/
+class LocaleBuilderTest: public IntlTest {
+ public:
+    LocaleBuilderTest();
+    virtual ~LocaleBuilderTest();
+
+    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
+
+    void TestAddRemoveUnicodeLocaleAttribute(void);
+    void TestAddRemoveUnicodeLocaleAttributeWellFormed(void);
+    void TestAddUnicodeLocaleAttributeIllFormed(void);
+    void TestLocaleBuilder(void);
+    void TestLocaleBuilderBasic(void);
+    void TestPosixCases(void);
+    void TestSetExtensionOthers(void);
+    void TestSetExtensionPU(void);
+    void TestSetExtensionT(void);
+    void TestSetExtensionU(void);
+    void TestSetExtensionValidateOthersIllFormed(void);
+    void TestSetExtensionValidateOthersWellFormed(void);
+    void TestSetExtensionValidatePUIllFormed(void);
+    void TestSetExtensionValidatePUWellFormed(void);
+    void TestSetExtensionValidateTIllFormed(void);
+    void TestSetExtensionValidateTWellFormed(void);
+    void TestSetExtensionValidateUIllFormed(void);
+    void TestSetExtensionValidateUWellFormed(void);
+    void TestSetLanguageIllFormed(void);
+    void TestSetLanguageWellFormed(void);
+    void TestSetLocale(void);
+    void TestSetRegionIllFormed(void);
+    void TestSetRegionWellFormed(void);
+    void TestSetScriptIllFormed(void);
+    void TestSetScriptWellFormed(void);
+    void TestSetUnicodeLocaleKeywordIllFormedKey(void);
+    void TestSetUnicodeLocaleKeywordIllFormedValue(void);
+    void TestSetUnicodeLocaleKeywordWellFormed(void);
+    void TestSetVariantIllFormed(void);
+    void TestSetVariantWellFormed(void);
+
+ private:
+    void Verify(LocaleBuilder& bld, const char* expected, const char* msg);  // the tests pass the builder, the expected language tag and a message containing one %s
+};