diff --git a/.gitattributes b/.gitattributes index 2aa831b9bdc..ca69ef34e5e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -53,6 +53,7 @@ icu4c/source/aclocal.m4 -text icu4c/source/allinone/icucheck.bat -text icu4c/source/common/common.vcxproj -text icu4c/source/common/common.vcxproj.filters -text +icu4c/source/common/uloc_keytype.cpp -text icu4c/source/common/unifiedcache.cpp -text icu4c/source/common/unifiedcache.h -text icu4c/source/data/coll/dsb.txt -text diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index 2d8b12c0ab0..635b3f2630f 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -105,7 +105,7 @@ serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \ uidna.o usprep.o uts46.o punycode.o \ util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o dtintrv.o ucnvsel.o propsvec.o \ ulist.o uloc_tag.o icudataver.o icuplug.o listformatter.o \ -sharedobject.o simplepatternformatter.o unifiedcache.o +sharedobject.o simplepatternformatter.o unifiedcache.o uloc_keytype.o ## Header files to install HEADERS = $(srcdir)/unicode/*.h diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index 12f993feb41..5bf22962f6f 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -1,4 +1,4 @@ - + @@ -236,6 +236,7 @@ + @@ -1754,4 +1755,4 @@ - + \ No newline at end of file diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index a4571cf92e8..9befe71777f 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -421,9 +421,6 @@ properties & sets - - registration - registration @@ -568,6 +565,10 @@ collections + + + locales & resources + @@ -1112,4 +1113,4 @@ collections - + \ No newline at end of file diff --git a/icu4c/source/common/ucln_cmn.h b/icu4c/source/common/ucln_cmn.h index 0e2abc6a520..2290de868ba 100644 --- 
a/icu4c/source/common/ucln_cmn.h +++ b/icu4c/source/common/ucln_cmn.h @@ -37,6 +37,7 @@ typedef enum ECleanupCommonType { UCLN_COMMON_BREAKITERATOR, UCLN_COMMON_BREAKITERATOR_DICT, UCLN_COMMON_SERVICE, + UCLN_COMMON_LOCALE_KEY_TYPE, UCLN_COMMON_LOCALE, UCLN_COMMON_LOCALE_AVAILABLE, UCLN_COMMON_ULOC, diff --git a/icu4c/source/common/uloc.cpp b/icu4c/source/common/uloc.cpp index d8d8ad28cdf..13083b0c8e3 100644 --- a/icu4c/source/common/uloc.cpp +++ b/icu4c/source/common/uloc.cpp @@ -2524,4 +2524,103 @@ uloc_acceptLanguage(char *result, int32_t resultAvailable, return -1; } +U_CAPI const char* U_EXPORT2 +uloc_toUnicodeLocaleKey(const char* keyword) +{ + const char* bcpKey = ulocimp_toBcpKey(keyword); + if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) { + // unknown keyword, but syntax is fine.. + return keyword; + } + return bcpKey; +} + +U_CAPI const char* U_EXPORT2 +uloc_toUnicodeLocaleType(const char* keyword, const char* value) +{ + const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL); + if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) { + // unknown keyword, but syntax is fine.. 
+ return value; + } + return bcpType; +} + +#define ISALPHANUM(c) ( (c) >= '0' && (c) <= '9' || (c) >= 'A' && (c) <= 'Z' || (c) >= 'a' && (c) <= 'z' ) + +static UBool +isWellFormedLegacyKey(const char* legacyKey) +{ + const char* p = legacyKey; + while (*p) { + if (!ISALPHANUM(*p)) { + return FALSE; + } + p++; + } + return TRUE; +} + +static UBool +isWellFormedLegacyType(const char* legacyType) +{ + const char* p = legacyType; + int32_t alphaNumLen = 0; + while (*p) { + if (*p == '_' || *p == '/' || *p == '-') { + if (alphaNumLen == 0) { + return FALSE; + } + alphaNumLen = 0; + } else if (ISALPHANUM(*p)) { + alphaNumLen++; + } else { + return FALSE; + } + p++; + } + return (alphaNumLen != 0); +} + +U_CAPI const char* U_EXPORT2 +uloc_toLegacyKey(const char* keyword) +{ + const char* legacyKey = ulocimp_toLegacyKey(keyword); + if (legacyKey == NULL) { + // Checks if the specified locale key is well-formed with the legacy locale syntax. + // + // Note: + // Neither ICU nor LDML/CLDR provides the definition of keyword syntax. + // However, a key should not contain '=' obviously. For now, all existing + // keys are using ASCII alphabetic letters only. We won't add any new key + // that is not compatible with the BCP 47 syntax. Therefore, we assume + // a valid key consist from [0-9a-zA-Z], no symbols. + if (isWellFormedLegacyKey(keyword)) { + return keyword; + } + } + return legacyKey; +} + +U_CAPI const char* U_EXPORT2 +uloc_toLegacyType(const char* keyword, const char* value) +{ + const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL); + if (legacyType == NULL) { + // Checks if the specified locale type is well-formed with the legacy locale syntax. + // + // Note: + // Neither ICU nor LDML/CLDR provides the definition of keyword syntax. + // However, a type should not contain '=' obviously. For now, all existing + // types are using ASCII alphabetic letters with a few symbol letters. 
We won't + // add any new type that is not compatible with the BCP 47 syntax except timezone + // IDs. For now, we assume a valid type start with [0-9a-zA-Z], but may contain + // '-' '_' '/' in the middle. + if (isWellFormedLegacyType(value)) { + return value; + } + } + return legacyType; +} + /*eof*/ diff --git a/icu4c/source/common/uloc_keytype.cpp b/icu4c/source/common/uloc_keytype.cpp new file mode 100644 index 00000000000..896ea9ee794 --- /dev/null +++ b/icu4c/source/common/uloc_keytype.cpp @@ -0,0 +1,577 @@ +/* +********************************************************************** +* Copyright (C) 2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +*/ +#include "unicode/utypes.h" + +#include "cstring.h" +#include "uassert.h" +#include "ucln_cmn.h" +#include "uhash.h" +#include "umutex.h" +#include "uresimp.h" +#include "uvector.h" + +static UHashtable* gLocExtKeyMap = NULL; +static icu::UInitOnce gLocExtKeyMapInitOnce = U_INITONCE_INITIALIZER; +static icu::UVector* gKeyTypeStringPool = NULL; +static icu::UVector* gLocExtKeyDataEntries = NULL; +static icu::UVector* gLocExtTypeEntries = NULL; + +// bit flags for special types +typedef enum { + SPECIALTYPE_NONE = 0, + SPECIALTYPE_CODEPOINTS = 1, + SPECIALTYPE_REORDER_CODE = 2 +} SpecialType; + +typedef struct LocExtKeyData { + const char* legacyId; + const char* bcpId; + UHashtable* typeMap; + uint32_t specialTypes; +} LocExtKeyData; + +typedef struct LocExtType { + const char* legacyId; + const char* bcpId; +} LocExtType; + +U_CDECL_BEGIN + +static UBool U_CALLCONV +uloc_key_type_cleanup(void) { + if (gLocExtKeyMap != NULL) { + uhash_close(gLocExtKeyMap); + gLocExtKeyMap = NULL; + } + + delete gLocExtKeyDataEntries; + gLocExtKeyDataEntries = NULL; + + delete gLocExtTypeEntries; + gLocExtTypeEntries = NULL; + + delete gKeyTypeStringPool; + gKeyTypeStringPool = NULL; + + gLocExtKeyMapInitOnce.reset(); + 
return TRUE; +} + +static void U_CALLCONV +uloc_deleteKeyTypeStringPoolEntry(void* obj) { + uprv_free(obj); +} + +static void U_CALLCONV +uloc_deleteKeyDataEntry(void* obj) { + LocExtKeyData* keyData = (LocExtKeyData*)obj; + if (keyData->typeMap != NULL) { + uhash_close(keyData->typeMap); + } + uprv_free(keyData); +} + +static void U_CALLCONV +uloc_deleteTypeEntry(void* obj) { + uprv_free(obj); +} + +U_CDECL_END + + +static void U_CALLCONV +initFromResourceBundle(UErrorCode& sts) { + ucln_common_registerCleanup(UCLN_COMMON_LOCALE_KEY_TYPE, uloc_key_type_cleanup); + + gLocExtKeyMap = uhash_open(uhash_hashIChars, uhash_compareIChars, NULL, &sts); + if (U_FAILURE(sts)) { + return; + } + + UResourceBundle *keyTypeDataRes = NULL; + UResourceBundle *keyMapRes = NULL; + UResourceBundle *typeMapRes = NULL; + UResourceBundle *typeAliasRes = NULL; + UResourceBundle *bcpTypeAliasRes = NULL; + + keyTypeDataRes = ures_openDirect(NULL, "keyTypeData", &sts); + keyMapRes = ures_getByKey(keyTypeDataRes, "keyMap", NULL, &sts); + typeMapRes = ures_getByKey(keyTypeDataRes, "typeMap", NULL, &sts); + + UErrorCode tmpSts = U_ZERO_ERROR; + typeAliasRes = ures_getByKey(keyTypeDataRes, "typeAlias", NULL, &tmpSts); + if (U_FAILURE(tmpSts)) { + typeAliasRes = NULL; + tmpSts = U_ZERO_ERROR; + } + bcpTypeAliasRes = ures_getByKey(keyTypeDataRes, "bcpTypeAlias", NULL, &tmpSts); + if (U_FAILURE(tmpSts)) { + bcpTypeAliasRes = NULL; + tmpSts = U_ZERO_ERROR; + } + + // initialize vectors storing dynamically allocated objects + gKeyTypeStringPool = new UVector(uloc_deleteKeyTypeStringPoolEntry, NULL, sts); + if (gKeyTypeStringPool == NULL || U_FAILURE(sts)) { + goto close_bundles; + } + gLocExtKeyDataEntries = new UVector(uloc_deleteKeyDataEntry, NULL, sts); + if (gLocExtKeyDataEntries == NULL || U_FAILURE(sts)) { + goto close_bundles; + } + gLocExtTypeEntries = new UVector(uloc_deleteTypeEntry, NULL, sts); + if (gLocExtTypeEntries == NULL || U_FAILURE(sts)) { + goto close_bundles; + } + + // iterate 
through keyMap resource + UResourceBundle keyMapEntry; + ures_initStackObject(&keyMapEntry); + + while (ures_hasNext(keyMapRes)) { + ures_getNextResource(keyMapRes, &keyMapEntry, &sts); + if (U_FAILURE(sts)) { + break; + } + const char* legacyKeyId = ures_getKey(&keyMapEntry); + int32_t bcpKeyIdLen = 0; + const UChar* uBcpKeyId = ures_getString(&keyMapEntry, &bcpKeyIdLen, &sts); + if (U_FAILURE(sts)) { + break; + } + + // empty value indicates that BCP key is same with the legacy key. + const char* bcpKeyId = legacyKeyId; + if (bcpKeyIdLen > 0) { + char* bcpKeyIdBuf = (char*)uprv_malloc(bcpKeyIdLen + 1); + if (bcpKeyIdBuf == NULL) { + sts = U_MEMORY_ALLOCATION_ERROR; + break; + } + u_UCharsToChars(uBcpKeyId, bcpKeyIdBuf, bcpKeyIdLen); + bcpKeyIdBuf[bcpKeyIdLen] = 0; + gKeyTypeStringPool->addElement(bcpKeyIdBuf, sts); + if (U_FAILURE(sts)) { + break; + } + bcpKeyId = bcpKeyIdBuf; + } + + UBool isTZ = uprv_strcmp(legacyKeyId, "timezone") == 0; + + UHashtable* typeDataMap = uhash_open(uhash_hashIChars, uhash_compareIChars, NULL, &sts); + if (U_FAILURE(sts)) { + break; + } + uint32_t specialTypes = SPECIALTYPE_NONE; + + UResourceBundle* typeAliasResByKey = NULL; + UResourceBundle* bcpTypeAliasResByKey = NULL; + + if (typeAliasRes != NULL) { + typeAliasResByKey = ures_getByKey(typeAliasRes, legacyKeyId, NULL, &tmpSts); + if (U_FAILURE(tmpSts)) { + // only a few keys have type alias mapping + typeAliasResByKey = NULL; + tmpSts = U_ZERO_ERROR; + } + } + if (bcpTypeAliasRes != NULL) { + bcpTypeAliasResByKey = ures_getByKey(bcpTypeAliasRes, bcpKeyId, NULL, &tmpSts); + if (U_FAILURE(tmpSts)) { + // only a few keys have BCP type alias mapping + bcpTypeAliasResByKey = NULL; + tmpSts = U_ZERO_ERROR; + } + } + + // look up type map for the key, and walk through the mapping data + UResourceBundle* typeMapResByKey = ures_getByKey(typeMapRes, legacyKeyId, NULL, &tmpSts); + if (U_FAILURE(tmpSts)) { + // type map for each key must exist + U_ASSERT(FALSE); + tmpSts = U_ZERO_ERROR; + } 
else { + UResourceBundle typeMapEntry; + ures_initStackObject(&typeMapEntry); + + while (ures_hasNext(typeMapResByKey)) { + ures_getNextResource(typeMapResByKey, &typeMapEntry, &sts); + if (U_FAILURE(sts)) { + break; + } + const char* legacyTypeId = ures_getKey(&typeMapEntry); + + // special types + if (uprv_strcmp(legacyTypeId, "CODEPOINTS") == 0) { + specialTypes |= SPECIALTYPE_CODEPOINTS; + continue; + } + if (uprv_strcmp(legacyTypeId, "REORDER_CODE") == 0) { + specialTypes |= SPECIALTYPE_REORDER_CODE; + continue; + } + + if (isTZ) { + // a timezone key uses a colon instead of a slash in the resource. + // e.g. America:Los_Angeles + if (uprv_strchr(legacyTypeId, ':') != NULL) { + int32_t legacyTypeIdLen = uprv_strlen(legacyTypeId); + char* legacyTypeIdBuf = (char*)uprv_malloc(legacyTypeIdLen + 1); + if (legacyTypeIdBuf == NULL) { + sts = U_MEMORY_ALLOCATION_ERROR; + break; + } + const char* p = legacyTypeId; + char* q = legacyTypeIdBuf; + while (*p) { + if (*p == ':') { + *q++ = '/'; + } else { + *q++ = *p; + } + p++; + } + *q = 0; + + gKeyTypeStringPool->addElement(legacyTypeIdBuf, sts); + if (U_FAILURE(sts)) { + break; + } + legacyTypeId = legacyTypeIdBuf; + } + } + + int32_t bcpTypeIdLen = 0; + const UChar* uBcpTypeId = ures_getString(&typeMapEntry, &bcpTypeIdLen, &sts); + if (U_FAILURE(sts)) { + break; + } + + // empty value indicates that BCP type is same with the legacy type. + const char* bcpTypeId = legacyTypeId; + if (bcpTypeIdLen > 0) { + char* bcpTypeIdBuf = (char*)uprv_malloc(bcpTypeIdLen + 1); + if (bcpTypeIdBuf == NULL) { + sts = U_MEMORY_ALLOCATION_ERROR; + break; + } + u_UCharsToChars(uBcpTypeId, bcpTypeIdBuf, bcpTypeIdLen); + bcpTypeIdBuf[bcpTypeIdLen] = 0; + gKeyTypeStringPool->addElement(bcpTypeIdBuf, sts); + if (U_FAILURE(sts)) { + break; + } + bcpTypeId = bcpTypeIdBuf; + } + + // Note: legacy type value should never be + // equivalent to bcp type value of a different + // type under the same key. So we use a single + // map for lookup. 
+ LocExtType* t = (LocExtType*)uprv_malloc(sizeof(LocExtType)); + if (t == NULL) { + sts = U_MEMORY_ALLOCATION_ERROR; + break; + } + t->bcpId = bcpTypeId; + t->legacyId = legacyTypeId; + gLocExtTypeEntries->addElement((void*)t, sts); + if (U_FAILURE(sts)) { + break; + } + + uhash_put(typeDataMap, (void*)legacyTypeId, t, &sts); + if (bcpTypeId != legacyTypeId) { + // different type value + uhash_put(typeDataMap, (void*)bcpTypeId, t, &sts); + } + if (U_FAILURE(sts)) { + break; + } + + // also put aliases in the map + if (typeAliasResByKey != NULL) { + UResourceBundle typeAliasDataEntry; + ures_initStackObject(&typeAliasDataEntry); + + ures_resetIterator(typeAliasResByKey); + while (ures_hasNext(typeAliasResByKey) && U_SUCCESS(sts)) { + int32_t toLen; + ures_getNextResource(typeAliasResByKey, &typeAliasDataEntry, &sts); + const UChar* to = ures_getString(&typeAliasDataEntry, &toLen, &sts); + if (U_FAILURE(sts)) { + break; + } + // check if this is an alias of canoncal legacy type + if (uprv_compareInvAscii(NULL, legacyTypeId, -1, to, toLen) == 0) { + const char* from = ures_getKey(&typeAliasDataEntry); + if (isTZ) { + // replace colon with slash if necessary + if (uprv_strchr(from, ':') != NULL) { + int32_t fromLen = uprv_strlen(from); + char* fromBuf = (char*)uprv_malloc(fromLen + 1); + if (fromBuf == NULL) { + sts = U_MEMORY_ALLOCATION_ERROR; + break; + } + const char* p = from; + char* q = fromBuf; + while (*p) { + if (*p == ':') { + *q++ = '/'; + } else { + *q++ = *p; + } + p++; + } + *q = 0; + + gKeyTypeStringPool->addElement(fromBuf, sts); + if (U_FAILURE(sts)) { + break; + } + from = fromBuf; + } + } + uhash_put(typeDataMap, (void*)from, t, &sts); + } + } + ures_close(&typeAliasDataEntry); + if (U_FAILURE(sts)) { + break; + } + } + + if (bcpTypeAliasResByKey != NULL) { + UResourceBundle bcpTypeAliasDataEntry; + ures_initStackObject(&bcpTypeAliasDataEntry); + + ures_resetIterator(bcpTypeAliasResByKey); + while (ures_hasNext(bcpTypeAliasResByKey) && 
U_SUCCESS(sts)) { + int32_t toLen; + ures_getNextResource(bcpTypeAliasResByKey, &bcpTypeAliasDataEntry, &sts); + const UChar* to = ures_getString(&bcpTypeAliasDataEntry, &toLen, &sts); + if (U_FAILURE(sts)) { + break; + } + // check if this is an alias of bcp type + if (uprv_compareInvAscii(NULL, bcpTypeId, -1, to, toLen) == 0) { + const char* from = ures_getKey(&bcpTypeAliasDataEntry); + uhash_put(typeDataMap, (void*)from, t, &sts); + } + } + ures_close(&bcpTypeAliasDataEntry); + if (U_FAILURE(sts)) { + break; + } + } + } + ures_close(&typeMapEntry); + } + ures_close(typeMapResByKey); + ures_close(typeAliasResByKey); + ures_close(bcpTypeAliasResByKey); + if (U_FAILURE(sts)) { + break; + } + + LocExtKeyData* keyData = (LocExtKeyData*)uprv_malloc(sizeof(LocExtKeyData)); + if (keyData == NULL) { + sts = U_MEMORY_ALLOCATION_ERROR; + break; + } + keyData->bcpId = bcpKeyId; + keyData->legacyId = legacyKeyId; + keyData->specialTypes = specialTypes; + keyData->typeMap = typeDataMap; + + gLocExtKeyDataEntries->addElement((void*)keyData, sts); + if (U_FAILURE(sts)) { + break; + } + + uhash_put(gLocExtKeyMap, (void*)legacyKeyId, keyData, &sts); + if (legacyKeyId != bcpKeyId) { + // different key value + uhash_put(gLocExtKeyMap, (void*)bcpKeyId, keyData, &sts); + } + if (U_FAILURE(sts)) { + break; + } + } + + ures_close(&keyMapEntry); + +close_bundles: + ures_close(bcpTypeAliasRes); + ures_close(typeAliasRes); + ures_close(typeMapRes); + ures_close(keyMapRes); + ures_close(keyTypeDataRes); +} + +static UBool +init() { + UErrorCode sts = U_ZERO_ERROR; + umtx_initOnce(gLocExtKeyMapInitOnce, &initFromResourceBundle, sts); + if (U_FAILURE(sts)) { + return FALSE; + } + return TRUE; +} + +static UBool +isSpecialTypeCodepoints(const char* val) { + int32_t subtagLen = 0; + const char* p = val; + while (*p) { + if (*p == '-') { + if (subtagLen < 4 || subtagLen > 6) { + return FALSE; + } + subtagLen = 0; + } else if (('0' <= *p && *p <= '9') || + ('A' <= *p && *p <= 'F') || ('a' <= *p 
&& *p <= 'f')) { + subtagLen++; + } else { + return FALSE; + } + p++; + } + return (subtagLen >= 4 && subtagLen <= 6); +} + +static UBool +isSpecialTypeReorderCode(const char* val) { + int32_t subtagLen = 0; + const char* p = val; + while (*p) { + if (*p == '-') { + if (subtagLen < 3 || subtagLen > 8) { + return FALSE; + } + subtagLen = 0; + } else if (('A' <= *p && *p <= 'Z') || ('a' <= *p && *p <= 'z')) { + subtagLen++; + } else { + return FALSE; + } + p++; + } + return (subtagLen >=3 && subtagLen <=8); +} + +U_CFUNC const char* +ulocimp_toBcpKey(const char* key) { + if (!init()) { + return NULL; + } + + LocExtKeyData* keyData = (LocExtKeyData*)uhash_get(gLocExtKeyMap, key); + if (keyData != NULL) { + return keyData->bcpId; + } + return NULL; +} + +U_CFUNC const char* +ulocimp_toLegacyKey(const char* key) { + if (!init()) { + return NULL; + } + + LocExtKeyData* keyData = (LocExtKeyData*)uhash_get(gLocExtKeyMap, key); + if (keyData != NULL) { + return keyData->legacyId; + } + return NULL; +} + +U_CFUNC const char* +ulocimp_toBcpType(const char* key, const char* type, UBool* isKnownKey, UBool* isSpecialType) { + if (isKnownKey != NULL) { + *isKnownKey = FALSE; + } + if (isSpecialType != NULL) { + *isSpecialType = FALSE; + } + + if (!init()) { + return NULL; + } + + LocExtKeyData* keyData = (LocExtKeyData*)uhash_get(gLocExtKeyMap, key); + if (keyData != NULL) { + if (isKnownKey != NULL) { + *isKnownKey = TRUE; + } + LocExtType* t = (LocExtType*)uhash_get(keyData->typeMap, type); + if (t != NULL) { + return t->bcpId; + } + if (keyData->specialTypes != SPECIALTYPE_NONE) { + UBool matched = FALSE; + if (keyData->specialTypes & SPECIALTYPE_CODEPOINTS) { + matched = isSpecialTypeCodepoints(type); + } + if (!matched && keyData->specialTypes & SPECIALTYPE_REORDER_CODE) { + matched = isSpecialTypeReorderCode(type); + } + if (matched) { + if (isSpecialType != NULL) { + *isSpecialType = TRUE; + } + return type; + } + } + } + return NULL; +} + + +U_CFUNC const char* 
+ulocimp_toLegacyType(const char* key, const char* type, UBool* isKnownKey, UBool* isSpecialType) { + if (isKnownKey != NULL) { + *isKnownKey = FALSE; + } + if (isSpecialType != NULL) { + *isSpecialType = FALSE; + } + + if (!init()) { + return NULL; + } + + LocExtKeyData* keyData = (LocExtKeyData*)uhash_get(gLocExtKeyMap, key); + if (keyData != NULL) { + if (isKnownKey != NULL) { + *isKnownKey = TRUE; + } + LocExtType* t = (LocExtType*)uhash_get(keyData->typeMap, type); + if (t != NULL) { + return t->legacyId; + } + if (keyData->specialTypes != SPECIALTYPE_NONE) { + UBool matched = FALSE; + if (keyData->specialTypes & SPECIALTYPE_CODEPOINTS) { + matched = isSpecialTypeCodepoints(type); + } + if (!matched && keyData->specialTypes & SPECIALTYPE_REORDER_CODE) { + matched = isSpecialTypeReorderCode(type); + } + if (matched) { + if (isSpecialType != NULL) { + *isSpecialType = TRUE; + } + return type; + } + } + } + return NULL; +} + diff --git a/icu4c/source/common/uloc_tag.c b/icu4c/source/common/uloc_tag.c index 3725955b361..c038026790d 100644 --- a/icu4c/source/common/uloc_tag.c +++ b/icu4c/source/common/uloc_tag.c @@ -408,8 +408,8 @@ _isPrivateuseValueSubtags(const char* s, int32_t len) { return _isPrivateuseValueSubtag(pSubtag, (int32_t)(p - pSubtag)); } -static UBool -_isLDMLKey(const char* s, int32_t len) { +U_CFUNC UBool +ultag_isUnicodeLocaleKey(const char* s, int32_t len) { if (len < 0) { len = (int32_t)uprv_strlen(s); } @@ -419,17 +419,33 @@ _isLDMLKey(const char* s, int32_t len) { return FALSE; } -static UBool -_isLDMLType(const char* s, int32_t len) { +U_CFUNC UBool +ultag_isUnicodeLocaleType(const char*s, int32_t len) { + const char* p; + int32_t subtagLen = 0; + if (len < 0) { len = (int32_t)uprv_strlen(s); } - if (len >= 3 && len <= 8 && _isAlphaNumericString(s, len)) { - return TRUE; - } - return FALSE; -} + for (p = s; len > 0; p++, len--) { + if (*p == SEP) { + if (subtagLen < 3) { + return FALSE; + } + subtagLen = 0; + } else if (ISALPHA(*p) || 
ISNUMERIC(*p)) { + subtagLen++; + if (subtagLen > 8) { + return FALSE; + } + } else { + return FALSE; + } + } + + return (subtagLen >= 3); +} /* * ------------------------------------------------- * @@ -608,417 +624,6 @@ _initializeULanguageTag(ULanguageTag* langtag) { langtag->privateuse = EMPTY; } -#define KEYTYPEDATA "keyTypeData" -#define KEYMAP "keyMap" -#define TYPEMAP "typeMap" -#define TYPEALIAS "typeAlias" -#define MAX_BCP47_SUBTAG_LEN 9 /* including null terminator */ -#define MAX_LDML_KEY_LEN 22 -#define MAX_LDML_TYPE_LEN 32 - -static int32_t -_ldmlKeyToBCP47(const char* key, int32_t keyLen, - char* bcpKey, int32_t bcpKeyCapacity, - UErrorCode *status) { - UResourceBundle *rb; - char keyBuf[MAX_LDML_KEY_LEN]; - char bcpKeyBuf[MAX_BCP47_SUBTAG_LEN]; - int32_t resultLen = 0; - int32_t i; - UErrorCode tmpStatus = U_ZERO_ERROR; - const UChar *uBcpKey; - int32_t bcpKeyLen; - - if (keyLen < 0) { - keyLen = (int32_t)uprv_strlen(key); - } - - if (keyLen >= sizeof(keyBuf)) { - /* no known valid LDML key exceeding 21 */ - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - uprv_memcpy(keyBuf, key, keyLen); - keyBuf[keyLen] = 0; - - /* to lower case */ - for (i = 0; i < keyLen; i++) { - keyBuf[i] = uprv_tolower(keyBuf[i]); - } - - rb = ures_openDirect(NULL, KEYTYPEDATA, status); - ures_getByKey(rb, KEYMAP, rb, status); - - if (U_FAILURE(*status)) { - ures_close(rb); - return 0; - } - - uBcpKey = ures_getStringByKey(rb, keyBuf, &bcpKeyLen, &tmpStatus); - if (U_SUCCESS(tmpStatus)) { - if (bcpKeyLen == 0) { - /* empty value indicates the BCP47 key is same with the legacy key */ - uprv_memcpy(bcpKeyBuf, key, keyLen); - bcpKeyBuf[keyLen] = 0; - resultLen = keyLen; - } else { - u_UCharsToChars(uBcpKey, bcpKeyBuf, bcpKeyLen); - bcpKeyBuf[bcpKeyLen] = 0; - resultLen = bcpKeyLen; - } - } else { - if (_isLDMLKey(key, keyLen)) { - uprv_memcpy(bcpKeyBuf, key, keyLen); - bcpKeyBuf[keyLen] = 0; - resultLen = keyLen; - } else { - /* mapping not availabe */ - *status = 
U_ILLEGAL_ARGUMENT_ERROR; - } - } - ures_close(rb); - - if (U_FAILURE(*status)) { - return 0; - } - - uprv_memcpy(bcpKey, bcpKeyBuf, uprv_min(resultLen, bcpKeyCapacity)); - return u_terminateChars(bcpKey, bcpKeyCapacity, resultLen, status); -} - -static int32_t -_bcp47ToLDMLKey(const char* bcpKey, int32_t bcpKeyLen, - char* key, int32_t keyCapacity, - UErrorCode *status) { - UResourceBundle *rb; - char bcpKeyBuf[MAX_BCP47_SUBTAG_LEN]; - int32_t resultLen = 0; - int32_t i; - const char *resKey = NULL; - UResourceBundle *mapData; - - if (bcpKeyLen < 0) { - bcpKeyLen = (int32_t)uprv_strlen(bcpKey); - } - - if (bcpKeyLen >= sizeof(bcpKeyBuf)) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - uprv_memcpy(bcpKeyBuf, bcpKey, bcpKeyLen); - bcpKeyBuf[bcpKeyLen] = 0; - - /* to lower case */ - for (i = 0; i < bcpKeyLen; i++) { - bcpKeyBuf[i] = uprv_tolower(bcpKeyBuf[i]); - } - - rb = ures_openDirect(NULL, KEYTYPEDATA, status); - ures_getByKey(rb, KEYMAP, rb, status); - if (U_FAILURE(*status)) { - ures_close(rb); - return 0; - } - - mapData = ures_getNextResource(rb, NULL, status); - while (U_SUCCESS(*status)) { - const UChar *uBcpKey; - char tmpBcpKeyBuf[MAX_BCP47_SUBTAG_LEN]; - int32_t tmpBcpKeyLen; - const char *tmpBcpKey = tmpBcpKeyBuf; - - uBcpKey = ures_getString(mapData, &tmpBcpKeyLen, status); - if (U_FAILURE(*status)) { - break; - } - if (tmpBcpKeyLen == 0) { - /* empty value indicates the BCP47 key is same with the legacy key */ - tmpBcpKey = ures_getKey(mapData); - } else { - u_UCharsToChars(uBcpKey, tmpBcpKeyBuf, tmpBcpKeyLen); - tmpBcpKeyBuf[tmpBcpKeyLen] = 0; - } - if (uprv_compareInvCharsAsAscii(bcpKeyBuf, tmpBcpKey) == 0) { - /* found a matching BCP47 key */ - resKey = ures_getKey(mapData); - resultLen = (int32_t)uprv_strlen(resKey); - break; - } - if (!ures_hasNext(rb)) { - break; - } - ures_getNextResource(rb, mapData, status); - } - ures_close(mapData); - ures_close(rb); - - if (U_FAILURE(*status)) { - return 0; - } - - if (resKey == NULL) { - 
resKey = bcpKeyBuf; - resultLen = bcpKeyLen; - } - - uprv_memcpy(key, resKey, uprv_min(resultLen, keyCapacity)); - return u_terminateChars(key, keyCapacity, resultLen, status); -} - -static int32_t -_ldmlTypeToBCP47(const char* key, int32_t keyLen, - const char* type, int32_t typeLen, - char* bcpType, int32_t bcpTypeCapacity, - UErrorCode *status) { - UResourceBundle *rb, *keyTypeData, *typeMapForKey; - char keyBuf[MAX_LDML_KEY_LEN]; - char typeBuf[MAX_LDML_TYPE_LEN]; - char bcpTypeBuf[MAX_BCP47_SUBTAG_LEN]; - int32_t resultLen = 0; - int32_t i; - UErrorCode tmpStatus = U_ZERO_ERROR; - const UChar *uBcpType, *uCanonicalType; - int32_t bcpTypeLen, canonicalTypeLen; - UBool isTimezone = FALSE; - - if (keyLen < 0) { - keyLen = (int32_t)uprv_strlen(key); - } - if (keyLen >= sizeof(keyBuf)) { - /* no known valid LDML key exceeding 21 */ - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - uprv_memcpy(keyBuf, key, keyLen); - keyBuf[keyLen] = 0; - - /* to lower case */ - for (i = 0; i < keyLen; i++) { - keyBuf[i] = uprv_tolower(keyBuf[i]); - } - if (uprv_compareInvCharsAsAscii(keyBuf, "timezone") == 0) { - isTimezone = TRUE; - } - - if (typeLen < 0) { - typeLen = (int32_t)uprv_strlen(type); - } - if (typeLen >= sizeof(typeBuf)) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - if (isTimezone) { - /* replace '/' with ':' */ - for (i = 0; i < typeLen; i++) { - if (*(type + i) == '/') { - typeBuf[i] = ':'; - } else { - typeBuf[i] = *(type + i); - } - } - typeBuf[typeLen] = 0; - type = &typeBuf[0]; - } - - keyTypeData = ures_openDirect(NULL, KEYTYPEDATA, status); - rb = ures_getByKey(keyTypeData, TYPEMAP, NULL, status); - if (U_FAILURE(*status)) { - ures_close(rb); - ures_close(keyTypeData); - return 0; - } - - typeMapForKey = ures_getByKey(rb, keyBuf, NULL, &tmpStatus); - uBcpType = ures_getStringByKey(typeMapForKey, type, &bcpTypeLen, &tmpStatus); - if (U_SUCCESS(tmpStatus)) { - if (bcpTypeLen == 0) { - /* empty value indicates the BCP47 type is same with the 
legacy type */ - uprv_memcpy(bcpTypeBuf, type, typeLen); - resultLen = typeLen; - } else { - u_UCharsToChars(uBcpType, bcpTypeBuf, bcpTypeLen); - resultLen = bcpTypeLen; - } - } else if (tmpStatus == U_MISSING_RESOURCE_ERROR) { - /* is this type alias? */ - tmpStatus = U_ZERO_ERROR; - ures_getByKey(keyTypeData, TYPEALIAS, rb, &tmpStatus); - ures_getByKey(rb, keyBuf, rb, &tmpStatus); - uCanonicalType = ures_getStringByKey(rb, type, &canonicalTypeLen, &tmpStatus); - if (U_SUCCESS(tmpStatus)) { - u_UCharsToChars(uCanonicalType, typeBuf, canonicalTypeLen); - if (isTimezone) { - /* replace '/' with ':' */ - for (i = 0; i < canonicalTypeLen; i++) { - if (typeBuf[i] == '/') { - typeBuf[i] = ':'; - } - } - } - typeBuf[canonicalTypeLen] = 0; - - /* look up the canonical type */ - uBcpType = ures_getStringByKey(typeMapForKey, typeBuf, &bcpTypeLen, &tmpStatus); - if (U_SUCCESS(tmpStatus)) { - if (bcpTypeLen == 0) { - /* empty value indicates the BCP47 type is same with the legacy type */ - uprv_memcpy(bcpTypeBuf, typeBuf, canonicalTypeLen); - resultLen = canonicalTypeLen; - } else { - u_UCharsToChars(uBcpType, bcpTypeBuf, bcpTypeLen); - resultLen = bcpTypeLen; - } - } - } - if (tmpStatus == U_MISSING_RESOURCE_ERROR) { - if (_isLDMLType(type, typeLen)) { - uprv_memcpy(bcpTypeBuf, type, typeLen); - resultLen = typeLen; - } else { - /* mapping not availabe */ - *status = U_ILLEGAL_ARGUMENT_ERROR; - } - } - } else { - *status = tmpStatus; - } - ures_close(rb); - ures_close(typeMapForKey); - ures_close(keyTypeData); - - if (U_FAILURE(*status)) { - return 0; - } - - uprv_memcpy(bcpType, bcpTypeBuf, uprv_min(resultLen, bcpTypeCapacity)); - return u_terminateChars(bcpType, bcpTypeCapacity, resultLen, status); -} - -static int32_t -_bcp47ToLDMLType(const char* key, int32_t keyLen, - const char* bcpType, int32_t bcpTypeLen, - char* type, int32_t typeCapacity, - UErrorCode *status) { - UResourceBundle *rb; - char keyBuf[MAX_LDML_KEY_LEN]; - char bcpTypeBuf[ULOC_KEYWORDS_CAPACITY]; /* 
ensure buffter is large enough for multiple values (e.g. buddhist-greg) */ - int32_t resultLen = 0; - int32_t i, typeSize; - const char *resType = NULL; - UResourceBundle *mapData; - UErrorCode tmpStatus = U_ZERO_ERROR; - int32_t copyLen; - - if (keyLen < 0) { - keyLen = (int32_t)uprv_strlen(key); - } - - if (keyLen >= sizeof(keyBuf)) { - /* no known valid LDML key exceeding 21 */ - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - uprv_memcpy(keyBuf, key, keyLen); - keyBuf[keyLen] = 0; - - /* to lower case */ - for (i = 0; i < keyLen; i++) { - keyBuf[i] = uprv_tolower(keyBuf[i]); - } - - - if (bcpTypeLen < 0) { - bcpTypeLen = (int32_t)uprv_strlen(bcpType); - } - - typeSize = 0; - for (i = 0; i < bcpTypeLen; i++) { - if (bcpType[i] == SEP) { - if (typeSize >= MAX_BCP47_SUBTAG_LEN) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - typeSize = 0; - } else { - typeSize++; - } - } - - uprv_memcpy(bcpTypeBuf, bcpType, bcpTypeLen); - bcpTypeBuf[bcpTypeLen] = 0; - - /* to lower case */ - for (i = 0; i < bcpTypeLen; i++) { - bcpTypeBuf[i] = uprv_tolower(bcpTypeBuf[i]); - } - - rb = ures_openDirect(NULL, KEYTYPEDATA, status); - ures_getByKey(rb, TYPEMAP, rb, status); - if (U_FAILURE(*status)) { - ures_close(rb); - return 0; - } - - ures_getByKey(rb, keyBuf, rb, &tmpStatus); - mapData = ures_getNextResource(rb, NULL, &tmpStatus); - while (U_SUCCESS(tmpStatus)) { - const UChar *uBcpType; - char tmpBcpTypeBuf[MAX_BCP47_SUBTAG_LEN]; - int32_t tmpBcpTypeLen; - const char *tmpBcpType = tmpBcpTypeBuf; - - uBcpType = ures_getString(mapData, &tmpBcpTypeLen, &tmpStatus); - if (U_FAILURE(tmpStatus)) { - break; - } - if (tmpBcpTypeLen == 0) { - /* empty value indicates the BCP47 type is same with the legacy type */ - tmpBcpType = ures_getKey(mapData); - } else { - u_UCharsToChars(uBcpType, tmpBcpTypeBuf, tmpBcpTypeLen); - tmpBcpTypeBuf[tmpBcpTypeLen] = 0; - } - if (uprv_compareInvCharsAsAscii(bcpTypeBuf, tmpBcpType) == 0) { - /* found a matching BCP47 type */ - resType = 
ures_getKey(mapData); - resultLen = (int32_t)uprv_strlen(resType); - break; - } - if (!ures_hasNext(rb)) { - break; - } - ures_getNextResource(rb, mapData, &tmpStatus); - } - ures_close(mapData); - ures_close(rb); - - if (U_FAILURE(tmpStatus) && tmpStatus != U_MISSING_RESOURCE_ERROR) { - *status = tmpStatus; - return 0; - } - - if (resType == NULL) { - resType = bcpTypeBuf; - resultLen = bcpTypeLen; - } - - copyLen = uprv_min(resultLen, typeCapacity); - uprv_memcpy(type, resType, copyLen); - - if (uprv_compareInvCharsAsAscii(keyBuf, "timezone") == 0) { - for (i = 0; i < copyLen; i++) { - if (*(type + i) == ':') { - *(type + i) = '/'; - } - } - } - - return u_terminateChars(type, typeCapacity, resultLen, status); -} - static int32_t _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { char buf[ULOC_LANG_CAPACITY]; @@ -1311,7 +916,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac const char *bcpKey, *bcpValue; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t keylen; - UBool isLDMLKeyword; + UBool isBcpUExt; while (TRUE) { isAttribute = FALSE; @@ -1320,7 +925,8 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac break; } len = uloc_getKeywordValue(localeID, key, buf, sizeof(buf), &tmpStatus); - if (U_FAILURE(tmpStatus)) { + /* buf must be null-terminated */ + if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; @@ -1331,7 +937,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac } keylen = (int32_t)uprv_strlen(key); - isLDMLKeyword = (keylen > 1); + isBcpUExt = (keylen > 1); /* special keyword used for representing Unicode locale attributes */ if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) { @@ -1379,36 +985,49 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac } } } - } else if 
(isLDMLKeyword) { - int32_t modKeyLen; - - /* transform key and value to bcp47 style */ - modKeyLen = _ldmlKeyToBCP47(key, keylen, pExtBuf, extBufCapacity, &tmpStatus); - if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + } else if (isBcpUExt) { + bcpKey = uloc_toUnicodeLocaleKey(key); + if (bcpKey == NULL) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } - tmpStatus = U_ZERO_ERROR; continue; } - bcpKey = pExtBuf; - pExtBuf += (modKeyLen + 1); - extBufCapacity -= (modKeyLen + 1); - - len = _ldmlTypeToBCP47(key, keylen, buf, len, pExtBuf, extBufCapacity, &tmpStatus); - if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + /* we've checked buf is null-terminated above */ + bcpValue = uloc_toUnicodeLocaleType(key, buf); + if (bcpValue == NULL) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; } - tmpStatus = U_ZERO_ERROR; continue; } - bcpValue = pExtBuf; - pExtBuf += (len + 1); - extBufCapacity -= (len + 1); + if (bcpValue == buf) { + /* + When uloc_toUnicodeLocaleType(key, buf) returns the + input value as is, the value is well-formed, but has + no known mapping. 
This implementation normalizes + the value to lower case + */ + int32_t bcpValueLen = uprv_strlen(bcpValue); + if (bcpValueLen < extBufCapacity) { + uprv_strcpy(pExtBuf, bcpValue); + T_CString_toLowerCase(pExtBuf); + + bcpValue = pExtBuf; + + pExtBuf += (bcpValueLen + 1); + extBufCapacity -= (bcpValueLen + 1); + } else { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + continue; + } + } } else { if (*key == PRIVATEUSE) { if (!_isPrivateuseValueSubtags(buf, len)) { @@ -1600,7 +1219,7 @@ _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendT /* locate next separator char */ for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); - if (_isLDMLKey(pTag, len)) { + if (ultag_isUnicodeLocaleKey(pTag, len)) { pKwds = pTag; break; } @@ -1708,7 +1327,7 @@ _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendT /* locate next separator char */ for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); - if (_isLDMLKey(pTag, len)) { + if (ultag_isUnicodeLocaleKey(pTag, len)) { if (pBcpKey) { emitKeyword = TRUE; pNextBcpKey = pTag; @@ -1744,28 +1363,78 @@ _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendT const char *pKey = NULL; /* LDML key */ const char *pType = NULL; /* LDML type */ + char bcpKeyBuf[9]; /* BCP key length is always 2 for now */ + U_ASSERT(pBcpKey != NULL); - /* u extension key to LDML key */ - len = _bcp47ToLDMLKey(pBcpKey, bcpKeyLen, buf + bufIdx, bufSize - bufIdx - 1, status); - if (U_FAILURE(*status)) { + if (bcpKeyLen >= sizeof(bcpKeyBuf)) { + /* the BCP key is invalid */ + *status = U_ILLEGAL_ARGUMENT_ERROR; goto cleanup; } - pKey = buf + bufIdx; - bufIdx += len; - *(buf + bufIdx) = 0; - bufIdx++; - if (pBcpType) { - /* BCP type to locale type */ - len = _bcp47ToLDMLType(pKey, -1, pBcpType, bcpTypeLen, buf + bufIdx, bufSize - bufIdx - 1, status); - if (U_FAILURE(*status)) { + uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen); + bcpKeyBuf[bcpKeyLen] = 
0; + + /* u extension key to LDML key */ + pKey = uloc_toLegacyKey(bcpKeyBuf); + if (pKey == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + goto cleanup; + } + if (pKey == bcpKeyBuf) { + /* + The key returned by toLegacyKey points to the input buffer. + We normalize the result key to lower case. + */ + T_CString_toLowerCase(bcpKeyBuf); + if (bufSize - bufIdx - 1 >= bcpKeyLen) { + uprv_memcpy(buf + bufIdx, bcpKeyBuf, bcpKeyLen); + pKey = buf + bufIdx; + bufIdx += bcpKeyLen; + *(buf + bufIdx) = 0; + bufIdx++; + } else { + *status = U_BUFFER_OVERFLOW_ERROR; goto cleanup; } - pType = buf + bufIdx; - bufIdx += len; - *(buf + bufIdx) = 0; - bufIdx++; + } + + if (pBcpType) { + char bcpTypeBuf[128]; /* practically long enough even considering multiple subtag type */ + if (bcpTypeLen >= sizeof(bcpTypeBuf)) { + /* the BCP type is too long */ + *status = U_ILLEGAL_ARGUMENT_ERROR; + goto cleanup; + } + + uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen); + bcpTypeBuf[bcpTypeLen] = 0; + + /* BCP type to locale type */ + pType = uloc_toLegacyType(pKey, bcpTypeBuf); + if (pType == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + goto cleanup; + } + if (pType == bcpTypeBuf) { + /* + The type returned by toLegacyType points to the input buffer. + We normalize the result type to lower case. 
+ */ + /* normalize to lower case */ + T_CString_toLowerCase(bcpTypeBuf); + if (bufSize - bufIdx - 1 >= bcpTypeLen) { + uprv_memcpy(buf + bufIdx, bcpTypeBuf, bcpTypeLen); + pType = buf + bufIdx; + bufIdx += bcpTypeLen; + *(buf + bufIdx) = 0; + bufIdx++; + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + goto cleanup; + } + } } else { /* typeless - default type value is "yes" */ pType = LOCALE_TYPE_YES; diff --git a/icu4c/source/common/ulocimp.h b/icu4c/source/common/ulocimp.h index ebc525ef9de..164a730c005 100644 --- a/icu4c/source/common/ulocimp.h +++ b/icu4c/source/common/ulocimp.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2004-2010, International Business Machines +* Copyright (C) 2004-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -62,4 +62,23 @@ ulocimp_getCountry(const char *localeID, U_CAPI const char * U_EXPORT2 locale_getKeywordsStart(const char *localeID); + +U_CFUNC UBool +ultag_isUnicodeLocaleKey(const char* s, int32_t len); + +U_CFUNC UBool +ultag_isUnicodeLocaleType(const char* s, int32_t len); + +U_CFUNC const char* +ulocimp_toBcpKey(const char* key); + +U_CFUNC const char* +ulocimp_toLegacyKey(const char* key); + +U_CFUNC const char* +ulocimp_toBcpType(const char* key, const char* type, UBool* isKnownKey, UBool* isSpecialType); + +U_CFUNC const char* +ulocimp_toLegacyType(const char* key, const char* type, UBool* isKnownKey, UBool* isSpecialType); + #endif diff --git a/icu4c/source/common/unicode/uloc.h b/icu4c/source/common/unicode/uloc.h index 5d9e1e7dc88..f16380f0378 100644 --- a/icu4c/source/common/unicode/uloc.h +++ b/icu4c/source/common/unicode/uloc.h @@ -1149,4 +1149,106 @@ uloc_toLanguageTag(const char* localeID, UBool strict, UErrorCode* err); +#ifndef U_HIDE_DRAFT_API +/** + * Converts the specified keyword (legacy key, or BCP 47 Unicode locale + * extension key) 
to the equivalent BCP 47 Unicode locale extension key. + * For example, BCP 47 Unicode locale extension key "co" is returned for + * the input keyword "collation". + *

+ * When the specified keyword is unknown, but satisfies the BCP syntax, + * then the pointer to the input keyword itself will be returned. + * For example, + * uloc_toUnicodeLocaleKey("ZZ") returns "ZZ". + * + * @param keyword the input locale keyword (either legacy key + * such as "collation" or BCP 47 Unicode locale extension + * key such as "co"). + * @return the well-formed BCP 47 Unicode locale extension key, + * or NULL if the specified locale keyword cannot be + * mapped to a well-formed BCP 47 Unicode locale extension + * key. + * @see uloc_toLegacyKey + * @draft ICU 54 + */ +U_DRAFT const char* U_EXPORT2 +uloc_toUnicodeLocaleKey(const char* keyword); + +/** + * Converts the specified keyword value (legacy type, or BCP 47 + * Unicode locale extension type) to the well-formed BCP 47 Unicode locale + * extension type for the specified keyword (category). For example, BCP 47 + * Unicode locale extension type "phonebk" is returned for the input + * keyword value "phonebook", with the keyword "collation" (or "co"). + *

+ * When the specified keyword is not recognized, but the specified value + * satisfies the syntax of the BCP 47 Unicode locale extension type, + * or when the specified keyword allows 'variable' type and the specified + * value satisfies the syntax, then the pointer to the input type value itself + * will be returned. + * For example, + * uloc_toUnicodeLocaleType("Foo", "Bar") returns "Bar", + * uloc_toUnicodeLocaleType("variableTop", "00A4") returns "00A4". + * + * @param keyword the locale keyword (either legacy key such as + * "collation" or BCP 47 Unicode locale extension + * key such as "co"). + * @param value the locale keyword value (either legacy type + * such as "phonebook" or BCP 47 Unicode locale extension + * type such as "phonebk"). + * @return the well-formed BCP 47 Unicode locale extension type, + * or NULL if the locale keyword value cannot be mapped to + * a well-formed BCP 47 Unicode locale extension type. + * @see uloc_toLegacyType + * @draft ICU 54 + */ +U_DRAFT const char* U_EXPORT2 +uloc_toUnicodeLocaleType(const char* keyword, const char* value); + +/** + * Converts the specified keyword (BCP 47 Unicode locale extension key, or + * legacy key) to the legacy key. For example, legacy key "collation" is + * returned for the input BCP 47 Unicode locale extension key "co". + * + * @param keyword the input locale keyword (either BCP 47 Unicode locale + * extension key or legacy key). + * @return the well-formed legacy key, or NULL if the specified + * keyword cannot be mapped to a well-formed legacy key. + * @see uloc_toUnicodeLocaleKey + * @draft ICU 54 + */ +U_DRAFT const char* U_EXPORT2 +uloc_toLegacyKey(const char* keyword); + +/** + * Converts the specified keyword value (BCP 47 Unicode locale extension type, + * or legacy type or type alias) to the canonical legacy type. For example, + * the legacy type "phonebook" is returned for the input BCP 47 Unicode + * locale extension type "phonebk" with the keyword "collation" (or "co"). + *

+ * When the specified keyword is not recognized, but the specified value + * satisfies the syntax of legacy key, or when the specified keyword + * allows 'variable' type and the specified value satisfies the syntax, + * then the pointer to the input type value itself will be returned. + * For example, + * uloc_toLegacyType("Foo", "Bar") returns "Bar", + * uloc_toLegacyType("vt", "00A4") returns "00A4". + * + * @param keyword the locale keyword (either legacy keyword such as + * "collation" or BCP 47 Unicode locale extension + * key such as "co"). + * @param value the locale keyword value (either BCP 47 Unicode locale + * extension type such as "phonebk" or legacy keyword value + * such as "phonebook"). + * @return the well-formed legacy type, or NULL if the specified + * keyword value cannot be mapped to a well-formed legacy + * type. + * @see uloc_toUnicodeLocaleType + * @draft ICU 54 + */ +U_DRAFT const char* U_EXPORT2 +uloc_toLegacyType(const char* keyword, const char* value); + +#endif /* U_HIDE_DRAFT_API */ + #endif /*_ULOC*/ diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 3c06c40e7d1..f3691867716 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -250,6 +250,10 @@ void addLocaleTest(TestNode** root) TESTCASE(TestEnglishExemplarCharacters); TESTCASE(TestDisplayNameBrackets); TESTCASE(TestIsRightToLeft); + TESTCASE(TestToUnicodeLocaleKey); + TESTCASE(TestToLegacyKey); + TESTCASE(TestToUnicodeLocaleType); + TESTCASE(TestToLegacyType); } @@ -5673,7 +5677,6 @@ static void TestLikelySubtags() } const char* const locale_to_langtag[][3] = { - {"@x=elmer", "x-elmer", "x-elmer"}, {"", "und", "und"}, {"en", "en", "en"}, {"en_US", "en-US", "en-US"}, @@ -5707,9 +5710,9 @@ const char* const locale_to_langtag[][3] = { {"en@timezone=America/New_York;calendar=japanese", "en-u-ca-japanese-tz-usnyc", "en-u-ca-japanese-tz-usnyc"}, {"en@timezone=US/Eastern", "en-u-tz-usnyc", "en-u-tz-usnyc"}, 
{"en@x=x-y-z;a=a-b-c", "en-x-x-y-z", NULL}, - {"it@collation=badcollationtype;colStrength=identical;cu=usd-eur", "it-u-ks-identic", NULL}, + {"it@collation=badcollationtype;colStrength=identical;cu=usd-eur", "it-u-cu-usd-eur-ks-identic", NULL}, {"en_US_POSIX", "en-US-u-va-posix", "en-US-u-va-posix"}, - {"en_US_POSIX@calendar=japanese;currency=EUR","en-US-u-ca-japanese-cu-EUR-va-posix", "en-US-u-ca-japanese-cu-EUR-va-posix"}, + {"en_US_POSIX@calendar=japanese;currency=EUR","en-US-u-ca-japanese-cu-eur-va-posix", "en-US-u-ca-japanese-cu-eur-va-posix"}, {"@x=elmer", "x-elmer", "x-elmer"}, {"en@x=elmer", "en-x-elmer", "en-x-elmer"}, {"@x=elmer;a=exta", "und-a-exta-x-elmer", "und-a-exta-x-elmer"}, @@ -5779,6 +5782,7 @@ static const struct { const char *locID; int32_t len; } langtag_to_locale[] = { + {"ja-u-ijkl-efgh-abcd-ca-japanese-xx-yyy-zzz-kn", "ja@attribute=abcd-efgh-ijkl;calendar=japanese;colnumeric=yes;xx=yyy-zzz", FULL_LENGTH}, {"en", "en", FULL_LENGTH}, {"en-us", "en_US", FULL_LENGTH}, {"und-US", "_US", FULL_LENGTH}, @@ -5859,6 +5863,187 @@ static void TestForLanguageTag(void) { } } +static void TestToUnicodeLocaleKey(void) +{ + /* $IN specifies the result should be the input pointer itself */ + static const char* DATA[][2] = { + {"calendar", "ca"}, + {"CALEndar", "ca"}, /* different casing */ + {"ca", "ca"}, /* bcp key itself */ + {"kv", "kv"}, /* no difference between legacy and bcp */ + {"foo", NULL}, /* unknown, bcp ill-formed */ + {"ZZ", "$IN"}, /* unknown, bcp well-formed */ + {NULL, NULL} + }; + + int32_t i; + for (i = 0; DATA[i][0] != NULL; i++) { + const char* keyword = DATA[i][0]; + const char* expected = DATA[i][1]; + const char* bcpKey = NULL; + + bcpKey = uloc_toUnicodeLocaleKey(keyword); + if (expected == NULL) { + if (bcpKey != NULL) { + log_err("toUnicodeLocaleKey: keyword=%s => %s, expected=NULL\n", keyword, bcpKey); + } + } else if (bcpKey == NULL) { + log_err("toUnicodeLocaleKey: keyword=%s => NULL, expected=%s\n", keyword, expected); + } 
else if (uprv_strcmp(expected, "$IN") == 0) { + if (bcpKey != keyword) { + log_err("toUnicodeLocaleKey: keyword=%s => %s, expected=%s(input pointer)\n", keyword, bcpKey, keyword); + } + } else if (uprv_strcmp(bcpKey, expected) != 0) { + log_err("toUnicodeLocaleKey: keyword=%s => %s, expected=%s\n", keyword, bcpKey, expected); + } + } +} + +static void TestToLegacyKey(void) +{ + /* $IN specifies the result should be the input pointer itself */ + static const char* DATA[][2] = { + {"kb", "colbackwards"}, + {"kB", "colbackwards"}, /* different casing */ + {"Collation", "collation"}, /* keyword itself with different casing */ + {"kv", "kv"}, /* no difference between legacy and bcp */ + {"foo", "$IN"}, /* unknown, bcp ill-formed */ + {"ZZ", "$IN"}, /* unknown, bcp well-formed */ + {"e=mc2", NULL}, /* unknown, bcp/legacy ill-formed */ + {NULL, NULL} + }; + + int32_t i; + for (i = 0; DATA[i][0] != NULL; i++) { + const char* keyword = DATA[i][0]; + const char* expected = DATA[i][1]; + const char* legacyKey = NULL; + + legacyKey = uloc_toLegacyKey(keyword); + if (expected == NULL) { + if (legacyKey != NULL) { + log_err("toLegacyKey: keyword=%s => %s, expected=NULL\n", keyword, legacyKey); + } + } else if (legacyKey == NULL) { + log_err("toLegacyKey: keyword=%s => NULL, expected=%s\n", keyword, expected); + } else if (uprv_strcmp(expected, "$IN") == 0) { + if (legacyKey != keyword) { + log_err("toLegacyKey: keyword=%s => %s, expected=%s(input pointer)\n", keyword, legacyKey, keyword); + } + } else if (uprv_strcmp(legacyKey, expected) != 0) { + log_err("toUnicodeLocaleKey: keyword=%s, %s, expected=%s\n", keyword, legacyKey, expected); + } + } +} + +static void TestToUnicodeLocaleType(void) +{ + /* $IN specifies the result should be the input pointer itself */ + static const char* DATA[][3] = { + {"tz", "Asia/Kolkata", "inccu"}, + {"calendar", "gregorian", "gregory"}, + {"ca", "gregorian", "gregory"}, + {"ca", "Gregorian", "gregory"}, + {"ca", "buddhist", "buddhist"}, + 
{"Calendar", "Japanese", "japanese"}, + {"calendar", "Islamic-Civil", "islamic-civil"}, + {"calendar", "islamicc", "islamic-civil"}, /* bcp type alias */ + {"colalternate", "NON-IGNORABLE", "noignore"}, + {"colcaselevel", "yes", "true"}, + {"tz", "america/new_york", "usnyc"}, + {"tz", "Asia/Kolkata", "inccu"}, + {"timezone", "navajo", "usden"}, + {"ca", "aaaa", "$IN"}, /* unknown type, well-formed type */ + {"ca", "gregory-japanese-islamic", "$IN"}, /* unknown type, well-formed type */ + {"zz", "gregorian", NULL}, /* unknown key, ill-formed type */ + {"co", "foo-", NULL}, /* unknown type, ill-formed type */ + {"variableTop", "00A0", "$IN"}, /* valid codepoints type */ + {"variableTop", "wxyz", "$IN"}, /* invalid codepoints type - return as is for now */ + {"kr", "space-punct", "space-punct"}, /* valid reordercode type */ + {"kr", "digit-spacepunct", NULL}, /* invalid (bcp ill-formed) reordercode type */ + {NULL, NULL, NULL} + }; + + int32_t i; + for (i = 0; DATA[i][0] != NULL; i++) { + const char* keyword = DATA[i][0]; + const char* value = DATA[i][1]; + const char* expected = DATA[i][2]; + const char* bcpType = NULL; + + bcpType = uloc_toUnicodeLocaleType(keyword, value); + if (expected == NULL) { + if (bcpType != NULL) { + log_err("toUnicodeLocaleType: keyword=%s, value=%s => %s, expected=NULL\n", keyword, value, bcpType); + } + } else if (bcpType == NULL) { + log_err("toUnicodeLocaleType: keyword=%s, value=%s => NULL, expected=%s\n", keyword, value, expected); + } else if (uprv_strcmp(expected, "$IN") == 0) { + if (bcpType != value) { + log_err("toUnicodeLocaleType: keyword=%s, value=%s => %s, expected=%s(input pointer)\n", keyword, value, bcpType, value); + } + } else if (uprv_strcmp(bcpType, expected) != 0) { + log_err("toUnicodeLocaleType: keyword=%s, value=%s => %s, expected=%s\n", keyword, value, bcpType, expected); + } + } +} + +static void TestToLegacyType(void) +{ + /* $IN specifies the result should be the input pointer itself */ + static const char* 
DATA[][3] = { + {"calendar", "gregory", "gregorian"}, + {"ca", "gregory", "gregorian"}, + {"ca", "Gregory", "gregorian"}, + {"ca", "buddhist", "buddhist"}, + {"Calendar", "Japanese", "japanese"}, + {"calendar", "Islamic-Civil", "islamic-civil"}, + {"calendar", "islamicc", "islamic-civil"}, /* bcp type alias */ + {"colalternate", "noignore", "non-ignorable"}, + {"colcaselevel", "true", "yes"}, + {"tz", "usnyc", "America/New_York"}, + {"tz", "inccu", "Asia/Calcutta"}, + {"timezone", "usden", "America/Denver"}, + {"timezone", "usnavajo", "America/Denver"}, /* bcp type alias */ + {"colstrength", "quarternary", "quaternary"}, /* type alias */ + {"ca", "aaaa", "$IN"}, /* unknown type */ + {"calendar", "gregory-japanese-islamic", "$IN"}, /* unknown type, well-formed type */ + {"zz", "gregorian", "$IN"}, /* unknown key, bcp ill-formed type */ + {"ca", "gregorian-calendar", "$IN"}, /* known key, bcp ill-formed type */ + {"co", "e=mc2", NULL}, /* known key, ill-formed bcp/legacy type */ + {"variableTop", "00A0", "$IN"}, /* valid codepoints type */ + {"variableTop", "wxyz", "$IN"}, /* invalid codepoints type - return as is for now */ + {"kr", "space-punct", "space-punct"}, /* valid reordercode type */ + {"kr", "digit-spacepunct", "digit-spacepunct"}, /* invalid reordercode type, but ok for legacy syntax */ + {NULL, NULL, NULL} + }; + + int32_t i; + for (i = 0; DATA[i][0] != NULL; i++) { + const char* keyword = DATA[i][0]; + const char* value = DATA[i][1]; + const char* expected = DATA[i][2]; + const char* legacyType = NULL; + + legacyType = uloc_toLegacyType(keyword, value); + if (expected == NULL) { + if (legacyType != NULL) { + log_err("toLegacyType: keyword=%s, value=%s => %s, expected=NULL\n", keyword, value, legacyType); + } + } else if (legacyType == NULL) { + log_err("toLegacyType: keyword=%s, value=%s => NULL, expected=%s\n", keyword, value, expected); + } else if (uprv_strcmp(expected, "$IN") == 0) { + if (legacyType != value) { + log_err("toLegacyType: keyword=%s, 
value=%s => %s, expected=%s(input pointer)\n", keyword, value, legacyType, value); + } + } else if (uprv_strcmp(legacyType, expected) != 0) { + log_err("toLegacyType: keyword=%s, value=%s => %s, expected=%s\n", keyword, value, legacyType, expected); + } + } +} + + + static void test_unicode_define(const char *namech, char ch, const char *nameu, UChar uch) { UChar asUch[1]; diff --git a/icu4c/source/test/cintltst/cloctst.h b/icu4c/source/test/cintltst/cloctst.h index ac313b3eb9a..8ae243f9dba 100644 --- a/icu4c/source/test/cintltst/cloctst.h +++ b/icu4c/source/test/cintltst/cloctst.h @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2013, International Business Machines Corporation and + * Copyright (c) 1997-2014, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /******************************************************************************** @@ -123,6 +123,11 @@ static void TestLikelySubtags(void); static void TestForLanguageTag(void); static void TestToLanguageTag(void); +static void TestToUnicodeLocaleKey(void); +static void TestToLegacyKey(void); +static void TestToUnicodeLocaleType(void); +static void TestToLegacyType(void); + /** * locale data */