diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index 45bb9813d00..b46e2c645a0 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -87,7 +87,7 @@ rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb. serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \ uidna.o usprep.o punycode.o \ util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o mutex.o dtintrv.o ucnvsel.o propsvec.o \ -ulist.o +ulist.o ultag.o ## Header files to install HEADERS = $(srcdir)/unicode/*.h unicode/*.h diff --git a/icu4c/source/common/common.vcproj b/icu4c/source/common/common.vcproj index 3ef8763211b..eb6863d3d83 100644 --- a/icu4c/source/common/common.vcproj +++ b/icu4c/source/common/common.vcproj @@ -2677,6 +2677,14 @@ RelativePath=".\ulocimp.h" > + + + + diff --git a/icu4c/source/common/uloc.c b/icu4c/source/common/uloc.c index 8b31bd800d6..692287fcb53 100644 --- a/icu4c/source/common/uloc.c +++ b/icu4c/source/common/uloc.c @@ -45,6 +45,7 @@ #include "uarrsort.h" #include "uenumimp.h" #include "uassert.h" +#include "ultag.h" #include /* for sprintf */ @@ -4381,8 +4382,7 @@ uloc_forLanguageTag(const char* langtag, int32_t* parsedLength, UErrorCode* err) { - /* TODO */ - return 0; + return ultag_languageTagToLocale(langtag, localeID, localeIDCapacity, parsedLength, err); } U_DRAFT int32_t U_EXPORT2 @@ -4392,8 +4392,7 @@ uloc_toLanguageTag(const char* localeID, UBool strict, UErrorCode* err) { - /* TODO */ - return 0; + return ultag_localeToLanguageTag(localeID, langtag, langtagCapacity, strict, err); } /*eof*/ diff --git a/icu4c/source/common/ultag.c b/icu4c/source/common/ultag.c new file mode 100644 index 00000000000..6c28986186d --- /dev/null +++ b/icu4c/source/common/ultag.c @@ -0,0 +1,2190 @@ +/* +********************************************************************** +* Copyright (C) 2009, International Business Machines +* Corporation and others. All Rights Reserved. 
+********************************************************************** +*/ + +#include "unicode/utypes.h" +#include "unicode/ures.h" +#include "unicode/putil.h" +#include "ustr_imp.h" +#include "cmemory.h" +#include "cstring.h" +#include "putilimp.h" +#include "ultag.h" + +/* struct holding a single variant */ +typedef struct VariantListEntry { + const char *variant; + struct VariantListEntry *next; +} VariantListEntry; + +/* struct holding a single extension */ +typedef struct ExtensionListEntry { + const char *key; + const char *value; + struct ExtensionListEntry *next; +} ExtensionListEntry; + +#define MAXEXTLANG 3 +struct ULanguageTag { + char *buf; /* holding parsed subtags */ + const char *language; + const char *extlang[MAXEXTLANG]; + const char *script; + const char *region; + VariantListEntry *variants; + ExtensionListEntry *extensions; + const char *privateuse; + const char *grandfathered; +}; + +#define MINLEN 2 +#define SEP '-' +#define PRIVATEUSE 'x' +#define LDMLEXT 'u' + +#define LOCALE_SEP '_' +#define LOCALE_EXT_SEP '@' +#define LOCALE_KEYWORD_SEP ';' +#define LOCALE_KEY_TYPE_SEP '=' + +#define ISALPHA(c) (((c)>='A' && (c)<='Z') || ((c)>='a' && (c)<='z')) +#define ISNUMERIC(c) ((c)>='0' && (c)<='9') + +static const char* EMPTY = ""; +static const char* LANG_UND = "und"; +static const char* PRIVATEUSE_KEY = "x"; + +#define LANG_UND_LEN 3 + +static const char* GRANDFATHERED[] = { +/* grandfathered preferred */ + "art-lojban", "jbo", + "cel-gaulish", "", + "en-GB-oed", "", + "i-ami", "ami", + "i-bnn", "bnn", + "i-default", "", + "i-enochian", "", + "i-hak", "hak", + "i-klingon", "tlh", + "i-lux", "lb", + "i-mingo", "", + "i-navajo", "nv", + "i-pwn", "pwn", + "i-tao", "tao", + "i-tay", "tay", + "i-tsu", "tsu", + "no-bok", "nb", + "no-nyn", "nn", + "sgn-be-fr", "sfb", + "sgn-be-nl", "vgt", + "sgn-ch-de", "sgg", + "zh-guoyu", "cmn", + "zh-hakka", "hak", + "zh-min", "", + "zh-min-nan", "nan", + "zh-xiang", "hsn", + NULL, NULL +}; + +static const char* 
DEPRECATEDLANGS[] = { +/* deprecated new */ + "iw", "he", + "ji", "yi", + "in", "id", + NULL, NULL +}; + +/* +* ------------------------------------------------- +* +* Language subtag syntax validation functions +* +* ------------------------------------------------- +*/ + +static UBool +_isAlphaString(const char* s, int32_t len) { + int32_t i; + for (i = 0; i < len; i++) { + if (!ISALPHA(*(s + i))) { + return FALSE; + } + } + return TRUE; +} + +static UBool +_isNumericString(const char* s, int32_t len) { + int32_t i; + for (i = 0; i < len; i++) { + if (!ISNUMERIC(*(s + i))) { + return FALSE; + } + } + return TRUE; +} + +static UBool +_isAlphaNumericString(const char* s, int32_t len) { + int32_t i; + for (i = 0; i < len; i++) { + if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) { + return FALSE; + } + } + return TRUE; +} + +static UBool +_isLanguageSubtag(const char* s, int32_t len) { + /* + * language = 2*3ALPHA ; shortest ISO 639 code + * ["-" extlang] ; sometimes followed by + * ; extended language subtags + * / 4ALPHA ; or reserved for future use + * / 5*8ALPHA ; or registered language subtag + */ + if (len < 0) { + len = uprv_strlen(s); + } + if (len >= 2 && len <= 8 && _isAlphaString(s, len)) { + return TRUE; + } + return FALSE; +} + +static UBool +_isExtlangSubtag(const char* s, int32_t len) { + /* + * extlang = 3ALPHA ; selected ISO 639 codes + * *2("-" 3ALPHA) ; permanently reserved + */ + if (len < 0) { + len = uprv_strlen(s); + } + if (len == 3 && _isAlphaString(s, len)) { + return TRUE; + } + return FALSE; +} + +static UBool +_isScriptSubtag(const char* s, int32_t len) { + /* + * script = 4ALPHA ; ISO 15924 code + */ + if (len < 0) { + len = uprv_strlen(s); + } + if (len == 4 && _isAlphaString(s, len)) { + return TRUE; + } + return FALSE; +} + +static UBool +_isRegionSubtag(const char* s, int32_t len) { + /* + * region = 2ALPHA ; ISO 3166-1 code + * / 3DIGIT ; UN M.49 code + */ + if (len < 0) { + len = uprv_strlen(s); + } + if (len == 2 && 
_isAlphaString(s, len)) { + return TRUE; + } + if (len == 3 && _isNumericString(s, len)) { + return TRUE; + } + return FALSE; +} + +static UBool +_isVariantSubtag(const char* s, int32_t len) { + /* + * variant = 5*8alphanum ; registered variants + * / (DIGIT 3alphanum) + */ + if (len < 0) { + len = uprv_strlen(s); + } + if (len >= 5 && len <= 8 && _isAlphaString(s, len)) { + return TRUE; + } + if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) { + return TRUE; + } + return FALSE; +} + +static UBool +_isExtensionSingleton(const char* s, int32_t len) { + /* + * extension = singleton 1*("-" (2*8alphanum)) + */ + if (len < 0) { + len = uprv_strlen(s); + } + if (len == 1 && ISALPHA(*s) && (uprv_tolower(*s) != PRIVATEUSE)) { + return TRUE; + } + return FALSE; +} + +static UBool +_isExtensionSubtag(const char* s, int32_t len) { + /* + * extension = singleton 1*("-" (2*8alphanum)) + */ + if (len < 0) { + len = uprv_strlen(s); + } + if (len >= 2 && len <= 8 && _isAlphaNumericString(s, len)) { + return TRUE; + } + return FALSE; +} + +static UBool +_isExtensionSubtags(const char* s, int32_t len) { + const char *p = s; + const char *pSubtag = NULL; + + if (len < 0) { + len = uprv_strlen(s); + } + + while ((p - s) < len) { + if (*p == SEP) { + if (pSubtag == NULL) { + return FALSE; + } + if (!_isExtensionSubtag(pSubtag, p - pSubtag)) { + return FALSE; + } + pSubtag = NULL; + } else if (pSubtag == NULL) { + pSubtag = p; + } + p++; + } + if (pSubtag == NULL) { + return FALSE; + } + return _isExtensionSubtag(pSubtag, p - pSubtag); +} + +static UBool +_isPrivateuseValueSubtag(const char* s, int32_t len) { + /* + * privateuse = "x" 1*("-" (1*8alphanum)) + */ + if (len < 0) { + len = uprv_strlen(s); + } + if (len >= 1 && len <= 8 && _isAlphaNumericString(s, len)) { + return TRUE; + } + return FALSE; +} + +static UBool +_isPrivateuseValueSubtags(const char* s, int32_t len) { + const char *p = s; + const char *pSubtag = NULL; + + if (len < 0) { + len = uprv_strlen(s); + 
} + + while ((p - s) < len) { + if (*p == SEP) { + if (pSubtag == NULL) { + return FALSE; + } + if (!_isPrivateuseValueSubtag(pSubtag, p - pSubtag)) { + return FALSE; + } + pSubtag = NULL; + } else if (pSubtag == NULL) { + pSubtag = p; + } + p++; + } + if (pSubtag == NULL) { + return FALSE; + } + return _isPrivateuseValueSubtag(pSubtag, p - pSubtag); +} + +static UBool +_isLDMLKey(const char* s, int32_t len) { + if (len < 0) { + len = uprv_strlen(s); + } + if (len == 2 && _isAlphaNumericString(s, len)) { + return TRUE; + } + return FALSE; +} + +static UBool +_isLDMLType(const char* s, int32_t len) { + if (len < 0) { + len = uprv_strlen(s); + } + if (len >= 3 && len <= 8 && _isAlphaNumericString(s, len)) { + return TRUE; + } + return FALSE; +} + +/* +* ------------------------------------------------- +* +* Helper functions +* +* ------------------------------------------------- +*/ + +static UBool +_addVariantToList(VariantListEntry **first, VariantListEntry *var) { + UBool bAdded = TRUE; + + if (*first == NULL) { + var->next = NULL; + *first = var; + } else { + VariantListEntry *prev, *cur; + int32_t cmp; + + /* reorder variants in alphabetical order */ + prev = NULL; + cur = *first; + while (TRUE) { + if (cur == NULL) { + prev->next = var; + var->next = NULL; + break; + } + cmp = uprv_strcmp(var->variant, cur->variant); + if (cmp < 0) { + if (prev == NULL) { + *first = var; + } else { + prev->next = var; + } + var->next = cur; + break; + } + if (cmp == 0) { + /* duplicated variant */ + bAdded = FALSE; + break; + } + prev = cur; + cur = cur->next; + } + } + + return bAdded; +} + + +static UBool +_addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) { + UBool bAdded = TRUE; + + if (*first == NULL) { + ext->next = NULL; + *first = ext; + } else { + ExtensionListEntry *prev, *cur; + int32_t cmp; + + /* reorder variants in alphabetical order */ + prev = NULL; + cur = *first; + while (TRUE) { + if (cur == NULL) { + prev->next = 
ext; + ext->next = NULL; + break; + } + if (localeToBCP) { + /* special handling for locale to bcp conversion */ + int32_t len, curlen; + + len = uprv_strlen(ext->key); + curlen = uprv_strlen(cur->key); + + if (len == 1 && curlen == 1) { + if (*(ext->key) == *(cur->key)) { + cmp = 0; + } else if (*(ext->key) == PRIVATEUSE) { + cmp = 1; + } else if (*(cur->key) == PRIVATEUSE) { + cmp = -1; + } else { + cmp = *(ext->key) - *(cur->key); + } + } else if (len == 1) { + cmp = *(ext->key) - LDMLEXT; + } else if (curlen == 1) { + cmp = LDMLEXT - *(cur->key); + } else { + cmp = uprv_strcmp(ext->key, cur->key); + } + } else { + cmp = uprv_strcmp(ext->key, cur->key); + } + if (cmp < 0) { + if (prev == NULL) { + *first = ext; + } else { + prev->next = ext; + } + ext->next = cur; + break; + } + if (cmp == 0) { + /* duplicated extension key */ + bAdded = FALSE; + break; + } + prev = cur; + cur = cur->next; + } + } + + return bAdded; +} + +static void +_initializeULanguageTag(ULanguageTag* langtag) { + int32_t i; + + langtag->buf = NULL; + + langtag->language = EMPTY; + for (i = 0; i < MAXEXTLANG; i++) { + langtag->extlang[i] = NULL; + } + + langtag->script = EMPTY; + langtag->region = EMPTY; + + langtag->variants = NULL; + langtag->extensions = NULL; + + langtag->grandfathered = EMPTY; + langtag->privateuse = EMPTY; +} + +#define SUPPLEMENTAL "supplementalData" +#define BCP47MAPPINGS "bcp47KeywordMappings" +#define MAX_BCP47_SUBTAG_LEN 9 /* including null terminator */ +#define MAX_LDML_KEY_LEN 22 +#define MAX_LDML_TYPE_LEN 32 + +static int32_t +_ldmlKeyToBCP47(const char* key, int32_t keyLen, + char* bcpKey, int32_t bcpKeyCapacity, + UErrorCode *status) { + UResourceBundle *rb; + char keyBuf[MAX_LDML_KEY_LEN]; + char bcpKeyBuf[MAX_BCP47_SUBTAG_LEN]; + int32_t resultLen = 0; + int32_t i; + UErrorCode tmpStatus = U_ZERO_ERROR; + const UChar *uBcpKey; + int32_t bcpKeyLen; + + if (keyLen < 0) { + keyLen = uprv_strlen(key); + } + + if (keyLen >= sizeof(keyBuf)) { + /* no known valid 
LDML key exceeding 21 */ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + uprv_memcpy(keyBuf, key, keyLen); + keyBuf[keyLen] = 0; + + /* to lower case */ + for (i = 0; i < keyLen; i++) { + keyBuf[i] = uprv_tolower(keyBuf[i]); + } + + rb = ures_openDirect(NULL, SUPPLEMENTAL, status); + ures_getByKey(rb, BCP47MAPPINGS, rb, status); + ures_getByKey(rb, "key", rb, status); + + if (U_FAILURE(*status)) { + ures_close(rb); + return 0; + } + + uBcpKey = ures_getStringByKey(rb, keyBuf, &bcpKeyLen, &tmpStatus); + if (U_SUCCESS(tmpStatus)) { + u_UCharsToChars(uBcpKey, bcpKeyBuf, bcpKeyLen); + bcpKeyBuf[bcpKeyLen] = 0; + resultLen = bcpKeyLen; + } else { + if (_isLDMLKey(key, keyLen)) { + uprv_memcpy(bcpKeyBuf, key, keyLen); + bcpKeyBuf[keyLen] = 0; + resultLen = keyLen; + } else { + /* mapping not availabe */ + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + } + ures_close(rb); + + if (U_FAILURE(*status)) { + return 0; + } + + uprv_memcpy(bcpKey, bcpKeyBuf, uprv_min(resultLen, bcpKeyCapacity)); + return u_terminateChars(bcpKey, bcpKeyCapacity, resultLen, status); +} + +static int32_t +_bcp47ToLDMLKey(const char* bcpKey, int32_t bcpKeyLen, + char* key, int32_t keyCapacity, + UErrorCode *status) { + UResourceBundle *rb; + char bcpKeyBuf[MAX_BCP47_SUBTAG_LEN]; + int32_t resultLen = 0; + int32_t i; + const char *resKey = NULL; + UResourceBundle *keyMap; + + if (bcpKeyLen < 0) { + bcpKeyLen = uprv_strlen(bcpKey); + } + + if (bcpKeyLen >= sizeof(bcpKeyBuf)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + uprv_memcpy(bcpKeyBuf, bcpKey, bcpKeyLen); + bcpKeyBuf[bcpKeyLen] = 0; + + /* to lower case */ + for (i = 0; i < bcpKeyLen; i++) { + bcpKeyBuf[i] = uprv_tolower(bcpKeyBuf[i]); + } + + rb = ures_openDirect(NULL, SUPPLEMENTAL, status); + ures_getByKey(rb, BCP47MAPPINGS, rb, status); + ures_getByKey(rb, "key", rb, status); + if (U_FAILURE(*status)) { + ures_close(rb); + return 0; + } + + keyMap = ures_getNextResource(rb, NULL, status); + while (U_SUCCESS(*status)) { + 
const UChar *uBcpKey; + char tmpBcpKeyBuf[MAX_BCP47_SUBTAG_LEN]; + int32_t tmpBcpKeyLen; + + uBcpKey = ures_getString(keyMap, &tmpBcpKeyLen, status); + if (U_FAILURE(*status)) { + break; + } + u_UCharsToChars(uBcpKey, tmpBcpKeyBuf, tmpBcpKeyLen); + tmpBcpKeyBuf[tmpBcpKeyLen] = 0; + if (uprv_strcmp(bcpKeyBuf, tmpBcpKeyBuf) == 0) { + /* found a matching BCP47 key */ + resKey = ures_getKey(keyMap); + resultLen = uprv_strlen(resKey); + break; + } + if (!ures_hasNext(rb)) { + break; + } + ures_getNextResource(rb, keyMap, status); + } + ures_close(keyMap); + ures_close(rb); + + if (U_FAILURE(*status)) { + return 0; + } + + if (resKey == NULL) { + resKey = bcpKeyBuf; + resultLen = bcpKeyLen; + } + + uprv_memcpy(key, resKey, uprv_min(resultLen, keyCapacity)); + return u_terminateChars(key, keyCapacity, resultLen, status); +} + +static int32_t +_ldmlTypeToBCP47(const char* key, int32_t keyLen, + const char* type, int32_t typeLen, + char* bcpType, int32_t bcpTypeCapacity, + UErrorCode *status) { + + UResourceBundle *rb; + char keyBuf[MAX_LDML_KEY_LEN]; + char typeBuf[MAX_LDML_TYPE_LEN]; + char bcpTypeBuf[MAX_BCP47_SUBTAG_LEN]; + int32_t resultLen = 0; + int32_t i; + UErrorCode tmpStatus = U_ZERO_ERROR; + const UChar *uBcpType; + int32_t bcpTypeLen; + UBool isTimezone = FALSE; + + if (keyLen < 0) { + keyLen = uprv_strlen(key); + } + if (keyLen >= sizeof(keyBuf)) { + /* no known valid LDML key exceeding 21 */ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + uprv_memcpy(keyBuf, key, keyLen); + keyBuf[keyLen] = 0; + + /* to lower case */ + for (i = 0; i < keyLen; i++) { + keyBuf[i] = uprv_tolower(keyBuf[i]); + } + if (uprv_strcmp(keyBuf, "timezone") == 0) { + isTimezone = TRUE; + } + + if (typeLen < 0) { + typeLen = uprv_strlen(type); + } + if (typeLen >= sizeof(typeBuf)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + uprv_memcpy(typeBuf, type, typeLen); + typeBuf[typeLen] = 0; + + for (i = 0; i < typeLen; i++) { + if (isTimezone && typeBuf[i] == '/') { + 
typeBuf[i] = ':'; + } else { + typeBuf[i] = uprv_tolower(typeBuf[i]); + } + } + + rb = ures_openDirect(NULL, SUPPLEMENTAL, status); + ures_getByKey(rb, BCP47MAPPINGS, rb, status); + if (U_FAILURE(*status)) { + ures_close(rb); + return 0; + } + + ures_getByKey(rb, keyBuf, rb, &tmpStatus); + uBcpType = ures_getStringByKey(rb, typeBuf, &bcpTypeLen, &tmpStatus); + if (U_SUCCESS(tmpStatus)) { + u_UCharsToChars(uBcpType, bcpTypeBuf, bcpTypeLen); + resultLen = bcpTypeLen; + } else if (tmpStatus == U_MISSING_RESOURCE_ERROR) { + if (_isLDMLType(type, typeLen)) { + uprv_memcpy(bcpTypeBuf, type, typeLen); + resultLen = typeLen; + } else { + /* mapping not availabe */ + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + } else { + *status = tmpStatus; + } + ures_close(rb); + + if (U_FAILURE(*status)) { + return 0; + } + + uprv_memcpy(bcpType, bcpTypeBuf, uprv_min(resultLen, bcpTypeCapacity)); + return u_terminateChars(bcpType, bcpTypeCapacity, resultLen, status); +} + +static int32_t +_bcp47ToLDMLType(const char* key, int32_t keyLen, + const char* bcpType, int32_t bcpTypeLen, + char* type, int32_t typeCapacity, + UErrorCode *status) { + UResourceBundle *rb; + char keyBuf[MAX_LDML_KEY_LEN]; + char bcpTypeBuf[MAX_BCP47_SUBTAG_LEN]; + int32_t resultLen = 0; + int32_t i; + const char *resType = NULL; + UResourceBundle *typeMap; + UErrorCode tmpStatus = U_ZERO_ERROR; + int32_t copyLen; + + if (keyLen < 0) { + keyLen = uprv_strlen(key); + } + + if (keyLen >= sizeof(keyBuf)) { + /* no known valid LDML key exceeding 21 */ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + uprv_memcpy(keyBuf, key, keyLen); + keyBuf[keyLen] = 0; + + /* to lower case */ + for (i = 0; i < keyLen; i++) { + keyBuf[i] = uprv_tolower(keyBuf[i]); + } + + + if (bcpTypeLen < 0) { + bcpTypeLen = uprv_strlen(bcpType); + } + + if (bcpTypeLen >= sizeof(bcpTypeBuf)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + uprv_memcpy(bcpTypeBuf, bcpType, bcpTypeLen); + bcpTypeBuf[bcpTypeLen] = 0; + + /* to lower case 
*/ + for (i = 0; i < bcpTypeLen; i++) { + bcpTypeBuf[i] = uprv_tolower(bcpTypeBuf[i]); + } + + rb = ures_openDirect(NULL, SUPPLEMENTAL, status); + ures_getByKey(rb, BCP47MAPPINGS, rb, status); + if (U_FAILURE(*status)) { + ures_close(rb); + return 0; + } + + ures_getByKey(rb, keyBuf, rb, &tmpStatus); + typeMap = ures_getNextResource(rb, NULL, &tmpStatus); + while (U_SUCCESS(tmpStatus)) { + const UChar *uBcpType; + char tmpBcpTypeBuf[MAX_BCP47_SUBTAG_LEN]; + int32_t tmpBcpTypeLen; + + uBcpType = ures_getString(typeMap, &tmpBcpTypeLen, &tmpStatus); + if (U_FAILURE(tmpStatus)) { + break; + } + u_UCharsToChars(uBcpType, tmpBcpTypeBuf, tmpBcpTypeLen); + tmpBcpTypeBuf[tmpBcpTypeLen] = 0; + if (uprv_strcmp(bcpTypeBuf, tmpBcpTypeBuf) == 0) { + /* found a matching BCP47 type */ + resType = ures_getKey(typeMap); + resultLen = uprv_strlen(resType); + break; + } + if (!ures_hasNext(rb)) { + break; + } + ures_getNextResource(rb, typeMap, &tmpStatus); + } + ures_close(typeMap); + ures_close(rb); + + if (U_FAILURE(tmpStatus) && tmpStatus != U_MISSING_RESOURCE_ERROR) { + *status = tmpStatus; + return 0; + } + + if (resType == NULL) { + resType = bcpTypeBuf; + resultLen = bcpTypeLen; + } + + copyLen = uprv_min(resultLen, typeCapacity); + uprv_memcpy(type, resType, copyLen); + + if (uprv_strcmp(keyBuf, "timezone") == 0) { + for (i = 0; i < copyLen; i++) { + if (*(type + i) == ':') { + *(type + i) = '/'; + } + } + } + + return u_terminateChars(type, typeCapacity, resultLen, status); +} + +static int32_t +_appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { + char buf[ULOC_LANG_CAPACITY]; + UErrorCode tmpStatus = U_ZERO_ERROR; + int32_t len, i; + int32_t reslen = 0; + + if (U_FAILURE(*status)) { + return 0; + } + + len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus); + if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 
0; + } + len = 0; + } + + /* Note: returned language code is in lower case letters */ + + if (len == 0) { + if (reslen < capacity) { + uprv_memcpy(appendAt + reslen, LANG_UND, uprv_min(LANG_UND_LEN, capacity - reslen)); + } + reslen += LANG_UND_LEN; + } else if (!_isLanguageSubtag(buf, len)) { + /* invalid language code */ + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + if (reslen < capacity) { + uprv_memcpy(appendAt + reslen, LANG_UND, uprv_min(LANG_UND_LEN, capacity - reslen)); + } + reslen += LANG_UND_LEN; + } else { + /* resolve deprecated */ + for (i = 0; DEPRECATEDLANGS[i] != NULL; i += 2) { + if (uprv_strcmp(buf, DEPRECATEDLANGS[i]) == 0) { + uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]); + len = uprv_strlen(buf); + break; + } + } + if (reslen < capacity) { + uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); + } + reslen += len; + } + u_terminateChars(appendAt, capacity, reslen, status); + return reslen; +} + +static int32_t +_appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { + char buf[ULOC_SCRIPT_CAPACITY]; + UErrorCode tmpStatus = U_ZERO_ERROR; + int32_t len, i; + int32_t reslen = 0; + + if (U_FAILURE(*status)) { + return 0; + } + + len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus); + if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + return 0; + } + + if (len > 0) { + if (!_isScriptSubtag(buf, len)) { + /* invalid script code */ + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + return 0; + } else { + /* to lowercase */ + for (i = 0; i < len; i++) { + buf[i] = uprv_tolower(buf[i]); + } + if (reslen < capacity) { + *(appendAt + reslen) = SEP; + } + reslen++; + + if (reslen < capacity) { + uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); + } + reslen += len; + } + } + u_terminateChars(appendAt, capacity, reslen, status); + 
return reslen; +} + +static int32_t +_appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { + char buf[ULOC_COUNTRY_CAPACITY]; + UErrorCode tmpStatus = U_ZERO_ERROR; + int32_t len, i; + int32_t reslen = 0; + + if (U_FAILURE(*status)) { + return 0; + } + + len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus); + if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + return 0; + } + + if (len > 0) { + if (!_isRegionSubtag(buf, len)) { + /* invalid region code */ + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + return 0; + } else { + /* to lowercase */ + for (i = 0; i < len; i++) { + buf[i] = uprv_tolower(buf[i]); + } + if (reslen < capacity) { + *(appendAt + reslen) = SEP; + } + reslen++; + + if (reslen < capacity) { + uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen)); + } + reslen += len; + } + } + u_terminateChars(appendAt, capacity, reslen, status); + return reslen; +} + +static int32_t +_appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { + char buf[ULOC_FULLNAME_CAPACITY]; + UErrorCode tmpStatus = U_ZERO_ERROR; + int32_t len, i; + int32_t reslen = 0; + + if (U_FAILURE(*status)) { + return 0; + } + + len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus); + if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + return 0; + } + + if (len > 0) { + char *p, *pVar; + UBool bNext = TRUE; + VariantListEntry *var; + VariantListEntry *varFirst = NULL; + + pVar = NULL; + p = buf; + while (bNext) { + if (*p == SEP || *p == LOCALE_SEP || *p == 0) { + if (*p == 0) { + bNext = FALSE; + } else { + *p = 0; /* terminate */ + } + if (pVar == NULL) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + /* ignore empty variant */ + 
} else { + /* to lowercase */ + for (i = 0; *(pVar + i) != 0; i++) { + *(pVar + i) = uprv_tolower(*(pVar + i)); + } + + /* validate */ + if (_isVariantSubtag(pVar, -1)) { + /* emit the variant to the list */ + var = uprv_malloc(sizeof(VariantListEntry)); + if (var == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + break; + } + var->variant = pVar; + if (!_addVariantToList(&varFirst, var)) { + /* duplicated variant */ + uprv_free(var); + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + } + } else if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + } + /* reset variant starting position */ + pVar = NULL; + } else if (pVar == NULL) { + pVar = p; + } + p++; + } + + if (U_SUCCESS(*status)) { + if (varFirst != NULL) { + int32_t varLen; + + /* write out sorted/validated/normalized variants to the target */ + var = varFirst; + while (var != NULL) { + if (reslen < capacity) { + *(appendAt + reslen) = SEP; + } + reslen++; + varLen = uprv_strlen(var->variant); + if (reslen < capacity) { + uprv_memcpy(appendAt + reslen, var->variant, uprv_min(varLen, capacity - reslen)); + } + reslen += varLen; + var = var->next; + } + } + } + + /* clean up */ + var = varFirst; + while (var != NULL) { + VariantListEntry *tmpVar = var->next; + uprv_free(var); + var = tmpVar; + } + + if (U_FAILURE(*status)) { + return 0; + } + } + + u_terminateChars(appendAt, capacity, reslen, status); + return reslen; +} + +static int32_t +_appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UErrorCode* status) { + char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; + UEnumeration *keywordEnum = NULL; + int32_t reslen = 0; + + keywordEnum = uloc_openKeywords(localeID, status); + if (U_FAILURE(*status)) { + uenum_close(keywordEnum); + return 0; + } + if (keywordEnum != NULL) { + /* reorder extensions */ + int32_t len; + const char *key; + ExtensionListEntry *firstExt = NULL; + ExtensionListEntry *ext; + char 
extBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; + char *pExtBuf = extBuf; + int32_t extBufCapacity = sizeof(extBuf); + const char *bcpKey, *bcpValue; + UErrorCode tmpStatus = U_ZERO_ERROR; + int32_t keylen; + UBool isLDMLKeyword; + + while (TRUE) { + key = uenum_next(keywordEnum, NULL, status); + if (key == NULL) { + break; + } + len = uloc_getKeywordValue(localeID, key, buf, sizeof(buf), &tmpStatus); + if (U_FAILURE(tmpStatus)) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + /* ignore this keyword */ + tmpStatus = U_ZERO_ERROR; + continue; + } + + keylen = uprv_strlen(key); + isLDMLKeyword = (keylen > 1); + + if (isLDMLKeyword) { + int32_t modKeyLen; + + /* transform key and value to bcp47 style */ + modKeyLen = _ldmlKeyToBCP47(key, keylen, pExtBuf, extBufCapacity, &tmpStatus); + if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + tmpStatus = U_ZERO_ERROR; + continue; + } + + bcpKey = pExtBuf; + pExtBuf += (modKeyLen + 1); + extBufCapacity -= (modKeyLen + 1); + + len = _ldmlTypeToBCP47(key, keylen, buf, len, pExtBuf, extBufCapacity, &tmpStatus); + if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + tmpStatus = U_ZERO_ERROR; + continue; + } + bcpValue = pExtBuf; + pExtBuf += (len + 1); + extBufCapacity -= (len + 1); + } else { + if (*key == PRIVATEUSE) { + if (!_isPrivateuseValueSubtags(buf, len)) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + continue; + } + } else { + if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf, len)) { + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + continue; + } + } + bcpKey = key; + if ((len + 1) < extBufCapacity) { + uprv_memcpy(pExtBuf, buf, len); + bcpValue = pExtBuf; + + pExtBuf += len; + + *pExtBuf = 0; + pExtBuf++; + + extBufCapacity -= (len + 1); + } else { + *status = 
U_ILLEGAL_ARGUMENT_ERROR; + break; + } + } + + /* create ExtensionListEntry */ + ext = uprv_malloc(sizeof(ExtensionListEntry)); + if (ext == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + break; + } + ext->key = bcpKey; + ext->value = bcpValue; + + if (!_addExtensionToList(&firstExt, ext, TRUE)) { + uprv_free(ext); + if (strict) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + } + } + if (U_SUCCESS(*status) && (firstExt != NULL)) { + UBool startLDMLExtension = FALSE; + + /* write out the sorted BCP47 extensions and private use */ + ext = firstExt; + while (ext != NULL) { + if (uprv_strlen(ext->key) > 1 && !startLDMLExtension) { + /* write LDML singleton extension */ + if (reslen < capacity) { + *(appendAt + reslen) = SEP; + } + reslen++; + if (reslen < capacity) { + *(appendAt + reslen) = LDMLEXT; + } + reslen++; + startLDMLExtension = TRUE; + } + + if (reslen < capacity) { + *(appendAt + reslen) = SEP; + } + reslen++; + len = uprv_strlen(ext->key); + if (reslen < capacity) { + uprv_memcpy(appendAt + reslen, ext->key, uprv_min(len, capacity - reslen)); + } + reslen += len; + if (reslen < capacity) { + *(appendAt + reslen) = SEP; + } + reslen++; + len = uprv_strlen(ext->value); + if (reslen < capacity) { + uprv_memcpy(appendAt + reslen, ext->value, uprv_min(len, capacity - reslen)); + } + reslen += len; + + ext = ext->next; + } + } + /* clean up */ + ext = firstExt; + while (ext != NULL) { + ExtensionListEntry *tmpExt = ext->next; + uprv_free(ext); + ext = tmpExt; + } + + uenum_close(keywordEnum); + + if (U_FAILURE(*status)) { + return 0; + } + } + + return u_terminateChars(appendAt, capacity, reslen, status); +} + +/* + * Append keywords parsed from LDML extension value + * e.g. 
"u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional}
+ * Note: char* buf is used for storing keywords
+ *
+ * The subtags of ldmlext are consumed as (key, type) pairs. Each pair is
+ * converted with _bcp47ToLDMLKey / _bcp47ToLDMLType and the NUL-terminated
+ * results are packed back to back into buf; the created entries point into
+ * buf, so buf must outlive them. On failure *status is set, every entry
+ * created by this call is released and *appendTo is left untouched. A
+ * trailing key without a type is reported as U_ILLEGAL_ARGUMENT_ERROR.
+ */
+static void
+_appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, char* buf, int32_t bufSize, UErrorCode *status) {
+    const char *p, *pNext, *pSep;
+    const char *pBcpKey, *pBcpType;
+    const char *pKey, *pType;
+    int32_t bcpKeyLen = 0, bcpTypeLen;
+    ExtensionListEntry *kwd, *nextKwd;
+    ExtensionListEntry *kwdFirst = NULL;
+    int32_t bufIdx = 0;
+    int32_t len;
+
+    pNext = ldmlext;
+    pBcpKey = pBcpType = NULL;
+    while (pNext) {
+        p = pSep = pNext;
+
+        /* locate next separator char */
+        while (*pSep) {
+            if (*pSep == SEP) {
+                break;
+            }
+            pSep++;
+        }
+        if (*pSep == 0) {
+            /* last subtag */
+            pNext = NULL;
+        } else {
+            pNext = pSep + 1;
+        }
+
+        if (pBcpKey == NULL) {
+            /* first subtag of a pair - remember it and wait for the type */
+            pBcpKey = p;
+            bcpKeyLen = (int32_t)(pSep - p);
+        } else {
+            pBcpType = p;
+            bcpTypeLen = (int32_t)(pSep - p);
+
+            /* BCP key to locale key */
+            len = _bcp47ToLDMLKey(pBcpKey, bcpKeyLen, buf + bufIdx, bufSize - bufIdx - 1, status);
+            if (U_FAILURE(*status)) {
+                goto cleanup;
+            }
+            pKey = buf + bufIdx;
+            bufIdx += len;
+            *(buf + bufIdx) = 0;
+            bufIdx++;
+
+            /* BCP type to locale type */
+            len = _bcp47ToLDMLType(pKey, -1, pBcpType, bcpTypeLen, buf + bufIdx, bufSize - bufIdx - 1, status);
+            if (U_FAILURE(*status)) {
+                goto cleanup;
+            }
+            pType = buf + bufIdx;
+            bufIdx += len;
+            *(buf + bufIdx) = 0;
+            bufIdx++;
+
+            /* create an ExtensionListEntry for this keyword */
+            kwd = uprv_malloc(sizeof(ExtensionListEntry));
+            if (kwd == NULL) {
+                *status = U_MEMORY_ALLOCATION_ERROR;
+                goto cleanup;
+            }
+
+            kwd->key = pKey;
+            kwd->value = pType;
+
+            if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
+                *status = U_ILLEGAL_ARGUMENT_ERROR;
+                uprv_free(kwd);
+                goto cleanup;
+            }
+
+            /* for next pair */
+            pBcpKey = NULL;
+            pBcpType = NULL;
+        }
+    }
+
+    if (pBcpKey != NULL) {
+        /* a key was seen without a matching type */
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        goto cleanup;
+    }
+
+    /* hand the locally collected entries over to the caller's list */
+    kwd = kwdFirst;
+    while (kwd != NULL) {
+        nextKwd = kwd->next;
+        if (!_addExtensionToList(appendTo, kwd, FALSE)) {
+            /* rejected by the target list: nobody else owns this entry,
+               so release it here instead of leaking it */
+            uprv_free(kwd);
+        }
+        kwd = nextKwd;
+    }
+
+    return;
+
+cleanup:
+    kwd = kwdFirst;
+    while (kwd != NULL) {
+        nextKwd = kwd->next;
+        uprv_free(kwd);
+        kwd = nextKwd;
+    }
+}
+
+
+/*
+ * Writes the extensions and the private use value of langtag as locale
+ * keywords ("@key=type;key=type...") at appendAt.
+ * Nothing is written at or beyond capacity, but the full length is still
+ * counted. Returns 0 when *status is (or becomes) a failure, otherwise the
+ * value returned by u_terminateChars for the counted length.
+ */
+static int32_t
+_appendKeywords(ULanguageTag* langtag, char* appendAt, int32_t capacity, UErrorCode* status) {
+    int32_t reslen = 0;
+    int32_t i, n;
+    int32_t len;
+    ExtensionListEntry *kwdFirst = NULL;
+    ExtensionListEntry *kwd;
+    const char *key, *type;
+    char kwdBuf[ULOC_KEYWORDS_CAPACITY];
+
+    if (U_FAILURE(*status)) {
+        return 0;
+    }
+
+    n = ultag_getExtensionsSize(langtag);
+
+    /* resolve locale keywords and reordering keys */
+    for (i = 0; i < n; i++) {
+        key = ultag_getExtensionKey(langtag, i);
+        type = ultag_getExtensionValue(langtag, i);
+        if (*key == LDMLEXT) {
+            /* the 'u' extension expands to one keyword per key/type pair */
+            _appendLDMLExtensionAsKeywords(type, &kwdFirst, kwdBuf, sizeof(kwdBuf), status);
+            if (U_FAILURE(*status)) {
+                break;
+            }
+        } else {
+            /* any other extension becomes a single keyword */
+            kwd = uprv_malloc(sizeof(ExtensionListEntry));
+            if (kwd == NULL) {
+                *status = U_MEMORY_ALLOCATION_ERROR;
+                break;
+            }
+            kwd->key = key;
+            kwd->value = type;
+            if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
+                uprv_free(kwd);
+                *status = U_ILLEGAL_ARGUMENT_ERROR;
+                break;
+            }
+        }
+    }
+
+    if (U_SUCCESS(*status)) {
+        type = ultag_getPrivateUse(langtag);
+        if (uprv_strlen(type) > 0) {
+            /* add private use as a keyword */
+            kwd = uprv_malloc(sizeof(ExtensionListEntry));
+            if (kwd == NULL) {
+                *status = U_MEMORY_ALLOCATION_ERROR;
+            } else {
+                kwd->key = PRIVATEUSE_KEY;
+                kwd->value = type;
+                if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
+                    uprv_free(kwd);
+                    *status = U_ILLEGAL_ARGUMENT_ERROR;
+                }
+            }
+        }
+    }
+
+    if (U_SUCCESS(*status) && kwdFirst != NULL) {
+        /* write out the sorted keywords */
+        kwd = kwdFirst;
+        while (kwd != NULL) {
+            if (reslen < capacity) {
+                if (kwd == kwdFirst) {
+                    /* '@' */
+                    *(appendAt + reslen) = LOCALE_EXT_SEP;
+                } else {
+                    /* ';' */
+                    *(appendAt + reslen) = LOCALE_KEYWORD_SEP;
+                }
+            }
+            reslen++;
+
+            /* key */
+            len = uprv_strlen(kwd->key);
+            if (reslen < capacity) {
+                uprv_memcpy(appendAt + reslen, kwd->key, uprv_min(len, capacity - reslen));
+            }
+            reslen += len;
+
+            /* '=' */
+            if (reslen < capacity) {
+                *(appendAt + reslen) = LOCALE_KEY_TYPE_SEP;
+            }
+            reslen++;
+
+            /* type */
+            len = uprv_strlen(kwd->value);
+            if (reslen < capacity) {
+                uprv_memcpy(appendAt + reslen, kwd->value, uprv_min(len, capacity - reslen));
+            }
+            reslen += len;
+
+            kwd = kwd->next;
+        }
+    }
+
+    /* clean up */
+    kwd = kwdFirst;
+    while (kwd != NULL) {
+        ExtensionListEntry *tmpKwd = kwd->next;
+        uprv_free(kwd);
+        kwd = tmpKwd;
+    }
+
+    if (U_FAILURE(*status)) {
+        return 0;
+    }
+
+    return u_terminateChars(appendAt, capacity, reslen, status);
+}
+
+/*
+* -------------------------------------------------
+*
+* ultag_ APIs
+*
+* -------------------------------------------------
+*/
+
+/* Bit flags used by the parser */
+#define LANG 0x0001
+#define EXTL 0x0002
+#define SCRT 0x0004
+#define REGN 0x0008
+#define VART 0x0010
+#define EXTS 0x0020
+#define EXTV 0x0040
+#define PRIV 0x0080
+
+/*
+ * Parses a language tag into a newly allocated ULanguageTag.
+ *
+ * tag       - the language tag; it is copied and lower-cased internally
+ * tagLen    - length of tag, or a negative value if tag is NUL-terminated
+ * parsedLen - if not NULL, receives the number of leading characters of
+ *             tag that were successfully parsed
+ * status    - set to U_MEMORY_ALLOCATION_ERROR when an allocation fails
+ *
+ * Parsing stops quietly at the first subtag that does not fit; such input
+ * is reported through parsedLen, not through status.
+ * Returns NULL on error, otherwise an object to be released with
+ * ultag_close.
+ */
+U_CFUNC ULanguageTag*
+ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) {
+    ULanguageTag *t;
+    char *tagBuf;
+    int16_t next;
+    char *pSubtag, *pNext, *pLastGoodPosition;
+    int32_t subtagLen;
+    int32_t extlangIdx;
+    ExtensionListEntry *pExtension;
+    char *pExtValueSubtagEnd;
+    int32_t i;
+    UBool isLDMLExtension, reqLDMLType;
+
+    if (parsedLen != NULL) {
+        *parsedLen = 0;
+    }
+
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+
+    if (tagLen < 0) {
+        tagLen = uprv_strlen(tag);
+    }
+
+    /* copy the entire string */
+    tagBuf = (char*)uprv_malloc(tagLen + 1);
+    if (tagBuf == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    uprv_memcpy(tagBuf, tag, tagLen);
+    *(tagBuf + tagLen) = 0;
+
+    /* to lower case */
+    for (i = 0; i < tagLen; i++) {
+        tagBuf[i] = uprv_tolower(tagBuf[i]);
+    }
+
+    /* create a ULanguageTag */
+    t = (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag));
+    if (t == NULL) {
+        /* must be checked before t is touched */
+        uprv_free(tagBuf);
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    _initializeULanguageTag(t);
+    t->buf = tagBuf;
+
+    if (tagLen < MINLEN) {
+        /* the input tag is too short - return empty ULanguageTag */
+        return t;
+    }
+
+    /* check if the tag is grandfathered */
+    for (i = 0; GRANDFATHERED[i] != NULL; i += 2) {
+        if (uprv_strcmp(GRANDFATHERED[i], tagBuf) == 0) {
+            /* a grandfathered tag is always longer than its preferred mapping */
+            uprv_strcpy(t->buf, GRANDFATHERED[i + 1]);
+            t->language = t->buf;
+            if (parsedLen != NULL) {
+                *parsedLen = tagLen;
+            }
+            return t;
+        }
+    }
+
+    /*
+     * langtag      =   language
+     *                  ["-" script]
+     *                  ["-" region]
+     *                  *("-" variant)
+     *                  *("-" extension)
+     *                  ["-" privateuse]
+     */
+
+    next = LANG | PRIV;
+    pNext = pLastGoodPosition = tagBuf;
+    extlangIdx = 0;
+    pExtension = NULL;
+    pExtValueSubtagEnd = NULL;
+    isLDMLExtension = FALSE;
+    reqLDMLType = FALSE;
+
+    while (pNext) {
+        char *pSep;
+
+        pSubtag = pNext;
+
+        /* locate next separator char */
+        pSep = pSubtag;
+        while (*pSep) {
+            if (*pSep == SEP) {
+                break;
+            }
+            pSep++;
+        }
+        if (*pSep == 0) {
+            /* last subtag */
+            pNext = NULL;
+        } else {
+            pNext = pSep + 1;
+        }
+        subtagLen = (int32_t)(pSep - pSubtag);
+
+        if (next & LANG) {
+            if (_isLanguageSubtag(pSubtag, subtagLen)) {
+                *pSep = 0;  /* terminate */
+                t->language = pSubtag;
+
+                pLastGoodPosition = pSep;
+                next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
+                continue;
+            }
+        }
+        if (next & EXTL) {
+            if (_isExtlangSubtag(pSubtag, subtagLen)) {
+                *pSep = 0;
+                t->extlang[extlangIdx++] = pSubtag;
+
+                pLastGoodPosition = pSep;
+                if (extlangIdx < MAXEXTLANG) {
+                    next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
+                } else {
+                    /* all extlang slots are used */
+                    next = SCRT | REGN | VART | EXTS | PRIV;
+                }
+                continue;
+            }
+        }
+        if (next & SCRT) {
+            if (_isScriptSubtag(pSubtag, subtagLen)) {
+                *pSep = 0;
+                t->script = pSubtag;
+
+                pLastGoodPosition = pSep;
+                next = REGN | VART | EXTS | PRIV;
+                continue;
+            }
+        }
+        if (next & REGN) {
+            if (_isRegionSubtag(pSubtag, subtagLen)) {
+                *pSep = 0;
+                t->region = pSubtag;
+
+                pLastGoodPosition = pSep;
+                next = VART | EXTS | PRIV;
+                continue;
+            }
+        }
+        if (next & VART) {
+            if (_isVariantSubtag(pSubtag, subtagLen)) {
+                VariantListEntry *var;
+                UBool isAdded;
+
+                var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
+                if (var == NULL) {
+                    *status = U_MEMORY_ALLOCATION_ERROR;
+                    goto error;
+                }
+                *pSep = 0;
+                var->variant = pSubtag;
+                isAdded = _addVariantToList(&(t->variants), var);
+                if (!isAdded) {
+                    /* duplicated variant entry */
+                    uprv_free(var);
+                    break;
+                }
+                pLastGoodPosition = pSep;
+                next = VART | EXTS | PRIV;
+                continue;
+            }
+        }
+        if (next & EXTS) {
+            if (_isExtensionSingleton(pSubtag, subtagLen)) {
+                if (pExtension != NULL) {
+                    if (pExtValueSubtagEnd == NULL) {
+                        /* the previous extension is incomplete */
+                        uprv_free(pExtension);
+                        /* cleared so that the code after the loop does not free it again */
+                        pExtension = NULL;
+                        break;
+                    }
+
+                    /* terminate the previous extension value */
+                    *pExtValueSubtagEnd = 0;
+
+                    /* insert the extension to the list */
+                    if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
+                        pLastGoodPosition = pExtValueSubtagEnd;
+                    } else {
+                        /* stop parsing here */
+                        uprv_free(pExtension);
+                        pExtension = NULL;
+                        break;
+                    }
+
+                    if (isLDMLExtension && reqLDMLType) {
+                        /* incomplete LDML extension key and type pair */
+                        pExtension = NULL;
+                        break;
+                    }
+                }
+
+                isLDMLExtension = (*pSubtag == LDMLEXT);
+
+                /* create a new extension */
+                pExtension = uprv_malloc(sizeof(ExtensionListEntry));
+                if (pExtension == NULL) {
+                    *status = U_MEMORY_ALLOCATION_ERROR;
+                    goto error;
+                }
+                *pSep = 0;
+                pExtension->key = pSubtag;
+                pExtension->value = NULL;   /* will be set later */
+
+                /*
+                 * reset the end location of extension value
+                 * subtags for this extension
+                 */
+                pExtValueSubtagEnd = NULL;
+
+                next = EXTV;
+                continue;
+            }
+        }
+        if (next & EXTV) {
+            if (_isExtensionSubtag(pSubtag, subtagLen)) {
+                if (isLDMLExtension) {
+                    if (reqLDMLType) {
+                        /* already saw an LDML key */
+                        if (!_isLDMLType(pSubtag, subtagLen)) {
+                            /* stop parsing here and let the valid LDML extension key/type
+                               pairs be processed by the code after this while loop */
+                            break;
+                        }
+                        pExtValueSubtagEnd = pSep;
+                        reqLDMLType = FALSE;
+                        next = EXTS | EXTV | PRIV;
+                    } else {
+                        /* LDML key */
+                        if (!_isLDMLKey(pSubtag, subtagLen)) {
+                            /* stop parsing here and let the valid LDML extension key/type
+                               pairs be processed by the code after this while loop */
+                            break;
+                        }
+                        reqLDMLType = TRUE;
+                        next = EXTV;
+                    }
+                } else {
+                    /* Mark the end of this subtag */
+                    pExtValueSubtagEnd = pSep;
+                    next = EXTS | EXTV | PRIV;
+                }
+
+                if (pExtension->value == NULL) {
+                    /* if the start position of this extension's value is not set yet,
+                       this one is the first value subtag */
+                    pExtension->value = pSubtag;
+                }
+                continue;
+            }
+        }
+        if (next & PRIV) {
+            /* the private use singleton is exactly "x", not any subtag starting with 'x' */
+            if (subtagLen == 1 && *pSubtag == PRIVATEUSE) {
+                char *pPrivuseVal;
+
+                if (pExtension != NULL) {
+                    /* Process the last extension */
+                    if (pExtValueSubtagEnd == NULL) {
+                        /* the previous extension is incomplete */
+                        uprv_free(pExtension);
+                        /* cleared so that the code after the loop does not free it again */
+                        pExtension = NULL;
+                        break;
+                    } else {
+                        /* terminate the previous extension value */
+                        *pExtValueSubtagEnd = 0;
+
+                        /* insert the extension to the list */
+                        if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
+                            pLastGoodPosition = pExtValueSubtagEnd;
+                            pExtension = NULL;
+                            pExtValueSubtagEnd = NULL;
+                        } else {
+                            /* stop parsing here */
+                            uprv_free(pExtension);
+                            pExtension = NULL;
+                            pExtValueSubtagEnd = NULL;
+                            break;
+                        }
+                    }
+                }
+
+                /* The rest of the tag consists of private use value subtags */
+                if (pNext == NULL) {
+                    /* empty private use subtag */
+                    break;
+                }
+                /* back up the private use value start position */
+                pPrivuseVal = pNext;
+
+                /* validate private use value subtags */
+                while (pNext) {
+                    pSubtag = pNext;
+                    pSep = pSubtag;
+                    while (*pSep) {
+                        if (*pSep == SEP) {
+                            break;
+                        }
+                        pSep++;
+                    }
+                    if (*pSep == 0) {
+                        /* last subtag */
+                        pNext = NULL;
+                    } else {
+                        pNext = pSep + 1;
+                    }
+                    subtagLen = (int32_t)(pSep - pSubtag);
+
+                    if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) {
+                        pLastGoodPosition = pSep;
+                    } else {
+                        break;
+                    }
+                }
+                if (pLastGoodPosition - pPrivuseVal > 0) {
+                    *pLastGoodPosition = 0;
+                    t->privateuse = pPrivuseVal;
+                }
+                /* No more subtags, exiting the parse loop */
+                break;
+            }
+            break;
+        }
+        /* If we fell through here, it means this subtag is illegal - quit parsing */
+        break;
+    }
+
+    if (pExtension != NULL) {
+        /* Process the last extension */
+        if (pExtValueSubtagEnd == NULL) {
+            /* the previous extension is incomplete */
+            uprv_free(pExtension);
+        } else {
+            /* terminate the previous extension value */
+            *pExtValueSubtagEnd = 0;
+            /* insert the extension to the list */
+            if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
+                pLastGoodPosition = pExtValueSubtagEnd;
+            } else {
+                uprv_free(pExtension);
+            }
+        }
+    }
+
+    if (parsedLen != NULL) {
+        *parsedLen = (int32_t)(pLastGoodPosition - t->buf);
+    }
+
+    return t;
+
+error:
+    /* t owns tagBuf and every variant/extension entry linked so far;
+       ultag_close releases all of them together with t itself */
+    ultag_close(t);
+    return NULL;
+}
+
+/*
+ * Releases a ULanguageTag: its subtag buffer, the variant and extension
+ * list entries and the object itself. A NULL argument is ignored.
+ */
+U_CFUNC void
+ultag_close(ULanguageTag* langtag) {
+
+    if (langtag == NULL) {
+        return;
+    }
+
+    uprv_free(langtag->buf);
+
+    if (langtag->variants) {
+        VariantListEntry *curVar = langtag->variants;
+        while (curVar) {
+            VariantListEntry *nextVar = curVar->next;
+            uprv_free(curVar);
+            curVar = nextVar;
+        }
+    }
+
+    if (langtag->extensions) {
+        ExtensionListEntry *curExt = langtag->extensions;
+        while (curExt) {
+            ExtensionListEntry *nextExt = curExt->next;
+            uprv_free(curExt);
+            curExt = nextExt;
+        }
+    }
+
+    uprv_free(langtag);
+}
+
+/* Returns the language subtag stored in langtag. */
+U_CFUNC const char*
+ultag_getLanguage(const ULanguageTag* langtag) {
+    return langtag->language;
+}
+
+/*
+ * Returns the language subtag, replaced by its partner from the
+ * DEPRECATEDLANGS table when the table has an entry for it.
+ */
+U_CFUNC const char*
+ultag_getJDKLanguage(const ULanguageTag* langtag) {
+    int32_t i;
+    for (i = 0; DEPRECATEDLANGS[i] != NULL; i += 2) {
+        if (uprv_strcmp(DEPRECATEDLANGS[i], langtag->language) == 0) {
+            return DEPRECATEDLANGS[i + 1];
+        }
+    }
+    return langtag->language;
+}
+
+/* Returns the extlang subtag at idx, or NULL if idx is outside [0, MAXEXTLANG). */
+U_CFUNC const char*
+ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) {
+    if (idx >= 0 && idx < MAXEXTLANG) {
+        return langtag->extlang[idx];
+    }
+    return NULL;
+}
+
+U_CFUNC int32_t
+ultag_getExtlangSize(const ULanguageTag* langtag) {
+    /* count the extlang slots that hold a subtag */
+    int32_t filled = 0;
+    int32_t slot;
+    for (slot = MAXEXTLANG - 1; slot >= 0; slot--) {
+        if (langtag->extlang[slot] != NULL) {
+            filled++;
+        }
+    }
+    return filled;
+}
+
+/* Script subtag accessor. */
+U_CFUNC const char*
+ultag_getScript(const ULanguageTag* langtag) {
+    const char *script = langtag->script;
+    return script;
+}
+
+/* Region subtag accessor. */
+U_CFUNC const char*
+ultag_getRegion(const ULanguageTag* langtag) {
+    const char *region = langtag->region;
+    return region;
+}
+
+/* Variant at list position idx; NULL when the list has no such position. */
+U_CFUNC const char*
+ultag_getVariant(const ULanguageTag* langtag, int32_t idx) {
+    const VariantListEntry *entry;
+    int32_t pos;
+    for (entry = langtag->variants, pos = 0; entry != NULL; entry = entry->next, pos++) {
+        if (pos == idx) {
+            return entry->variant;
+        }
+    }
+    return NULL;
+}
+
+/* Number of entries in the variant list. */
+U_CFUNC int32_t
+ultag_getVariantsSize(const ULanguageTag* langtag) {
+    const VariantListEntry *entry;
+    int32_t count = 0;
+    for (entry = langtag->variants; entry != NULL; entry = entry->next) {
+        count++;
+    }
+    return count;
+}
+
+/* Key of the extension at list position idx; NULL when there is no such position. */
+U_CFUNC const char*
+ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) {
+    const ExtensionListEntry *entry;
+    int32_t pos;
+    for (entry = langtag->extensions, pos = 0; entry != NULL; entry = entry->next, pos++) {
+        if (pos == idx) {
+            return entry->key;
+        }
+    }
+    return NULL;
+}
+
+/* Value of the extension at list position idx; NULL when there is no such position. */
+U_CFUNC const char*
+ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) {
+    const ExtensionListEntry *entry;
+    int32_t pos;
+    for (entry = langtag->extensions, pos = 0; entry != NULL; entry = entry->next, pos++) {
+        if (pos == idx) {
+            return entry->value;
+        }
+    }
+    return NULL;
+}
+
+/* Number of entries in the extension list. */
+U_CFUNC int32_t
+ultag_getExtensionsSize(const ULanguageTag* langtag) {
+    const ExtensionListEntry *entry;
+    int32_t count = 0;
+    for (entry = langtag->extensions; entry != NULL; entry = entry->next) {
+        count++;
+    }
+    return count;
+}
+
+/* Private use value accessor. */
+U_CFUNC const char*
+ultag_getPrivateUse(const ULanguageTag* langtag) {
+    const char *privuse = langtag->privateuse;
+    return privuse;
+}
+
+/* Grandfathered tag accessor. */
+U_CFUNC const char*
+ultag_getGrandfathered(const ULanguageTag* langtag) {
+    const char *grandfathered = langtag->grandfathered;
+    return grandfathered;
+}
+
+U_CFUNC int32_t
+ultag_localeToLanguageTag(const char* localeID,
+                          char* langtag,
+                          int32_t langtagCapacity,
+                          UBool strict,
+                          UErrorCode* status) { /* builds langtag from the canonicalized localeID; returns the summed helper lengths */
+    /* char canonical[ULOC_FULLNAME_CAPACITY]; */ /* See #6822 */
+    char canonical[256];
+    int32_t reslen = 0;
+    UErrorCode tmpStatus = U_ZERO_ERROR;
+
+    /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "".  See #6835 */
+    canonical[0] = 0;
+    if (uprv_strlen(localeID) > 0) {
+        uloc_canonicalize(localeID, canonical, sizeof(canonical), &tmpStatus);
+        if (tmpStatus != U_ZERO_ERROR) { /* any status other than U_ZERO_ERROR is rejected */
+            *status = U_ILLEGAL_ARGUMENT_ERROR;
+            return 0;
+        }
+    }
+    /* each helper is handed the position and capacity left after the previous ones; presumably each returns the length it contributes - confirm against the helpers */
+    reslen += _appendLanguageToLanguageTag(canonical, langtag, langtagCapacity, strict, status);
+    reslen += _appendScriptToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
+    reslen += _appendRegionToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
+    reslen += _appendVariantsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
+    reslen += _appendKeywordsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
+
+    return reslen;
+}
+/* Parses langtag and writes the corresponding locale ID to localeID; characters beyond localeIDCapacity are counted but not written. */
+U_CFUNC int32_t
+ultag_languageTagToLocale(const char* langtag,
+                          char* localeID,
+                          int32_t localeIDCapacity,
+                          int32_t* parsedLength,
+                          UErrorCode* status) {
+    ULanguageTag *lt;
+    int32_t reslen = 0;
+    const char *subtag, *p;
+    int32_t len;
+    int32_t i, n;
+    UBool noRegion = TRUE;
+
+    lt = ultag_parse(langtag, -1, parsedLength, status);
+    if (U_FAILURE(*status)) {
+        return 0;
+    }
+
+    /* language - "und" is not written to the locale ID */
+    subtag = ultag_getLanguage(lt);
+    if (uprv_strcmp(subtag, LANG_UND) != 0) {
+        len = uprv_strlen(subtag);
+        if (len > 0) {
+            if (reslen < localeIDCapacity) {
+                uprv_memcpy(localeID, subtag, uprv_min(len, localeIDCapacity - reslen));
+            }
+            reslen += len;
+        }
+    }
+
+    /* script */
+    subtag = ultag_getScript(lt);
+    len = uprv_strlen(subtag);
+    if (len > 0) {
+        if (reslen < localeIDCapacity) {
+            *(localeID + reslen) = LOCALE_SEP;
+        }
+        reslen++;
+
+        /* write out the script in title case */
+        p = subtag;
+        while (*p) {
+            if (reslen < localeIDCapacity) {
+                if (p == subtag) {
+                    *(localeID + reslen) = uprv_toupper(*p);
+                } else {
+                    *(localeID + reslen) = *p;
+                }
+            }
+            reslen++;
+            p++;
+        }
+    }
+
+    /* region */
+    subtag = ultag_getRegion(lt);
+    len = uprv_strlen(subtag);
+    if (len > 0) {
+        if (reslen < localeIDCapacity) {
+            *(localeID + reslen) = LOCALE_SEP;
+        }
+        reslen++;
+        /* write out the region in upper case */
+        p = subtag;
+        while (*p) {
+            if (reslen < localeIDCapacity) {
+                *(localeID + reslen) = uprv_toupper(*p);
+            }
+            reslen++;
+            p++;
+        }
+        noRegion = FALSE;
+    }
+
+    /* variants */
+    n = ultag_getVariantsSize(lt);
+    if (n > 0) {
+        if (noRegion) { /* extra separator stands in for the missing region field */
+            if (reslen < localeIDCapacity) {
+                *(localeID + reslen) = LOCALE_SEP;
+            }
+            reslen++;
+        }
+
+        for (i = 0; i < n; i++) {
+            subtag = ultag_getVariant(lt, i);
+            if (reslen < localeIDCapacity) {
+                *(localeID + reslen) = LOCALE_SEP;
+            }
+            reslen++;
+            /* write out the variant in upper case */
+            p = subtag;
+            while (*p) {
+                if (reslen < localeIDCapacity) {
+                    *(localeID + reslen) = uprv_toupper(*p);
+                }
+                reslen++;
+                p++;
+            }
+        }
+    }
+
+    /* keywords - built from the extensions and the private use value */
+    n = ultag_getExtensionsSize(lt);
+    subtag = ultag_getPrivateUse(lt);
+    if (n > 0 || uprv_strlen(subtag) > 0) {
+        if (reslen == 0) {
+            /* need a language */
+            if (reslen < localeIDCapacity) {
+                uprv_memcpy(localeID + reslen, LANG_UND, uprv_min(LANG_UND_LEN, localeIDCapacity - reslen));
+            }
+            reslen += LANG_UND_LEN;
+        }
+        len = _appendKeywords(lt, localeID + reslen, localeIDCapacity - reslen, status);
+        reslen += len;
+    }
+
+    ultag_close(lt);
+    return u_terminateChars(localeID, localeIDCapacity, reslen, status);
+}
+
+
diff --git a/icu4c/source/common/ultag.h b/icu4c/source/common/ultag.h
new file mode 100644
index 00000000000..eacaca2eade
--- /dev/null
+++ b/icu4c/source/common/ultag.h
@@ -0,0 +1,74 @@
+/*
+**********************************************************************
+*   Copyright (C) 2009,
International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*/
+#ifndef ULTAG_H
+#define ULTAG_H
+
+#include "unicode/utypes.h"
+
+typedef struct ULanguageTag ULanguageTag;
+/* Parses tag (tagLen < 0: NUL-terminated) into a new ULanguageTag; *parsedLen, if not NULL, receives the parsed length. */
+U_CFUNC ULanguageTag*
+ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);
+/* Releases an object returned by ultag_parse; NULL is ignored. */
+U_CFUNC void
+ultag_close(ULanguageTag* langtag);
+/* Accessors for the parsed subtags; the returned strings belong to langtag. */
+U_CFUNC const char*
+ultag_getLanguage(const ULanguageTag* langtag);
+
+U_CFUNC const char*
+ultag_getJDKLanguage(const ULanguageTag* langtag);
+
+U_CFUNC const char*
+ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);
+
+U_CFUNC int32_t
+ultag_getExtlangSize(const ULanguageTag* langtag);
+
+U_CFUNC const char*
+ultag_getScript(const ULanguageTag* langtag);
+
+U_CFUNC const char*
+ultag_getRegion(const ULanguageTag* langtag);
+/* Indexed accessors return NULL when idx does not name an existing entry. */
+U_CFUNC const char*
+ultag_getVariant(const ULanguageTag* langtag, int32_t idx);
+
+U_CFUNC int32_t
+ultag_getVariantsSize(const ULanguageTag* langtag);
+
+U_CFUNC const char*
+ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);
+
+U_CFUNC const char*
+ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);
+
+U_CFUNC int32_t
+ultag_getExtensionsSize(const ULanguageTag* langtag);
+
+U_CFUNC const char*
+ultag_getPrivateUse(const ULanguageTag* langtag);
+
+U_CFUNC const char*
+ultag_getGrandfathered(const ULanguageTag* langtag);
+/* Conversions between language tags and locale IDs; these back uloc_forLanguageTag and uloc_toLanguageTag. */
+U_CFUNC int32_t
+ultag_languageTagToLocale(const char* langtag,
+                          char* localeID,
+                          int32_t localeIDCapacity,
+                          int32_t* parsedLength,
+                          UErrorCode* status);
+
+U_CFUNC int32_t
+ultag_localeToLanguageTag(const char* localeID,
+                          char* langtag,
+                          int32_t langtagCapacity,
+                          UBool strict,
+                          UErrorCode* status);
+
+
+#endif /* ULTAG_H */
diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c
index 23313d2f5cc..96b6ed3d25a 100644
--- a/icu4c/source/test/cintltst/cloctst.c
+++ b/icu4c/source/test/cintltst/cloctst.c
@@ -233,6 +233,8 @@ void addLocaleTest(TestNode** root)
     TESTCASE(TestGetLocaleForLCID);
     TESTCASE(TestOrientation);
     TESTCASE(TestLikelySubtags);
+    TESTCASE(TestToLanguageTag);
+    TESTCASE(TestForLanguageTag);
 }
 
 
@@ -5343,3 +5345,158 @@ static void TestLikelySubtags()
         }
     }
 }
+/* Columns: input locale ID, expected tag in non-strict mode, expected tag in strict mode (NULL: an error is expected). */
+const char* const locale_to_langtag[][3] = {
+    {"", "und", "und"},
+    {"en", "en", "en"},
+    {"en_US", "en-us", "en-us"},
+    {"iw_IL", "he-il", "he-il"},
+    {"sr_Latn_SR", "sr-latn-sr", "sr-latn-sr"},
+    {"en__POSIX", "en-posix", "en-posix"},
+    {"en_POSIX", "en", NULL},
+    {"und_555", "und-555", "und-555"},
+    {"123", "und", NULL},
+    {"%$#&", "und", NULL},
+    {"_Latn", "und-latn", "und-latn"},
+    {"_DE", "und-de", "und-de"},
+    {"und_FR", "und-fr", "und-fr"},
+    {"th_TH_TH", "th-th", NULL},
+    {"bogus", "bogus", "bogus"},
+    {"foooobarrr", "und", NULL},
+    {"az_AZ_CYRL", "az-cyrl-az", "az-cyrl-az"},
+    {"aa_BB_CYRL", "aa-bb", NULL},
+    {"en_US_1234", "en-us-1234", "en-us-1234"},
+    {"en_US_VARIANTA_VARIANTB", "en-us-varianta-variantb", "en-us-varianta-variantb"},
+    {"en_US_VARIANTB_VARIANTA", "en-us-varianta-variantb", "en-us-varianta-variantb"},
+    {"ja__9876_5432", "ja-5432-9876", "ja-5432-9876"},
+    {"zh_Hant__VAR", "zh-hant", NULL},
+    {"es__BADVARIANT_GOODVAR", "es-goodvar", NULL},
+    {"en@calendar=gregorian", "en-u-ca-gregory", "en-u-ca-gregory"},
+    {"de@collation=phonebook;calendar=gregorian", "de-u-ca-gregory-co-phonebk", "de-u-ca-gregory-co-phonebk"},
+    {"th@numbers=thai;z=extz;x=priv-use;a=exta", "th-a-exta-u-nu-thai-z-extz-x-priv-use", "th-a-exta-u-nu-thai-z-extz-x-priv-use"},
+    {"en@timezone=America/New_York;calendar=japanese", "en-u-ca-japanese-tz-usnyc", "en-u-ca-japanese-tz-usnyc"},
+    {"en@x=x-y-z;a=a-b-c", "en-x-x-y-z", NULL},
+    {"it@collation=badcollationtype;colStrength=identical;cu=usd-eur", "it-u-ks-identic", NULL},
+    {NULL, NULL, NULL}
+};
+/* Runs uloc_toLanguageTag over locale_to_langtag, once in non-strict and once in strict mode per row. */
+static void TestToLanguageTag(void) {
+    char langtag[256];
+    int32_t i;
+    UErrorCode status;
+    int32_t len;
+    const char *inloc;
+    const char *expected;
+
+    for (i = 0; locale_to_langtag[i][0] != NULL; i++) {
+        inloc = locale_to_langtag[i][0];
+
+        /* testing non-strict mode */
+        status = U_ZERO_ERROR;
+        langtag[0] = 0;
+        expected = locale_to_langtag[i][1];
+
+        len = uloc_toLanguageTag(inloc, langtag, sizeof(langtag), FALSE, &status);
+        if (U_FAILURE(status)) {
+            if (expected != NULL) {
+                log_err("Error returned by uloc_toLanguageTag for locale id [%s] - error: %s\n",
+                    inloc, u_errorName(status));
+            }
+        } else {
+            if (expected == NULL) {
+                log_err("Error should be returned by uloc_toLanguageTag for locale id [%s], but [%s] is returned without errors\n",
+                    inloc, langtag);
+            } else if (uprv_strcmp(langtag, expected) != 0) {
+                log_err("uloc_toLanguageTag returned language tag [%s] for input locale [%s] - expected: [%s]\n",
+                    langtag, inloc, expected);
+            }
+        }
+
+        /* testing strict mode */
+        status = U_ZERO_ERROR;
+        langtag[0] = 0;
+        expected = locale_to_langtag[i][2];
+
+        len = uloc_toLanguageTag(inloc, langtag, sizeof(langtag), TRUE, &status);
+        if (U_FAILURE(status)) {
+            if (expected != NULL) {
+                log_err("Error returned by uloc_toLanguageTag {strict} for locale id [%s] - error: %s\n",
+                    inloc, u_errorName(status));
+            }
+        } else {
+            if (expected == NULL) {
+                log_err("Error should be returned by uloc_toLanguageTag {strict} for locale id [%s], but [%s] is returned without errors\n",
+                    inloc, langtag);
+            } else if (uprv_strcmp(langtag, expected) != 0) {
+                log_err("uloc_toLanguageTag {strict} returned language tag [%s] for input locale [%s] - expected: [%s]\n",
+                    langtag, inloc, expected);
+            }
+        }
+    }
+}
+/* Fields: input language tag, expected locale ID, expected parsed length reported by uloc_forLanguageTag. */
+static const struct {
+    const char *bcpID;
+    const char *locID;
+    int32_t len;
+} langtag_to_locale[] = {
+    {"en", "en", 2},
+    {"en-us", "en_US", 5},
+    {"und-us", "_US", 6},
+    {"und-latn", "_Latn", 8},
+    {"en-us-posix", "en_US_POSIX", 11},
+    {"de-de_euro", "de", 2},
+    {"kok-in", "kok_IN", 6},
+    {"123", "", 0},
+    {"en_us", "", 0},
+    {"en-latn-x", "en_Latn", 7},
+    {"art-lojban", "jbo", 10},
+    {"zh-hakka", "hak", 8},
+    {"xxx-yy", "xxx_YY", 6},
+    {"fr-234", "fr_234", 6},
+    {"i-default", "", 9},
+    {"i-test", "", 0},
+    {"ja-jp-jp", "ja_JP", 5},
+    {"bogus", "bogus", 5},
+    {"boguslang", "", 0},
+    {"EN-lATN-us", "en_Latn_US", 10},
+    {"und-variant-1234", "__1234_VARIANT", 16},
+    {"und-varzero-var1-vartwo", "__VARZERO", 11},
+    {"en-u-ca-gregory", "en@calendar=gregorian", 15},
+    {"en-U-cu-USD", "en@currency=usd", 11},
+    {"ar-x-1-2-3", "ar@x=1-2-3", 10},
+    {"fr-u-nu-latn-cu-eur", "fr@currency=eur;numbers=latn", 19},
+    {"de-k-kext-u-co-phonebk-nu-latn", "de@collation=phonebook;k=kext;numbers=latn", 30},
+    {"ja-u-cu-jpy-ca-jp", "ja@currency=jpy", 11},
+    {"en-us-u-tz-usnyc", "en_US@timezone=america/new_york", 16},
+    {"und-a-abc-def", "und@a=abc-def", 13},
+    {"zh-u-ca-chinese-x-u-ca-chinese", "zh@calendar=chinese;x=u-ca-chinese", 30},
+    {NULL, NULL, 0}
+};
+/* Runs uloc_forLanguageTag over langtag_to_locale and checks both the locale ID and the parsed length. */
+static void TestForLanguageTag(void) {
+    char locale[256];
+    int32_t i;
+    UErrorCode status;
+    int32_t parsedLen;
+
+    for (i = 0; langtag_to_locale[i].bcpID != NULL; i++) {
+        status = U_ZERO_ERROR;
+        locale[0] = 0;
+        uloc_forLanguageTag(langtag_to_locale[i].bcpID, locale, sizeof(locale), &parsedLen, &status);
+        if (U_FAILURE(status)) {
+            log_err("Error returned by uloc_forLanguageTag for language tag [%s] - error: %s\n",
+                langtag_to_locale[i].bcpID, u_errorName(status));
+        } else {
+            if (uprv_strcmp(langtag_to_locale[i].locID, locale) != 0) {
+                log_err("uloc_forLanguageTag returned locale [%s] for input language tag [%s] - expected: [%s]\n",
+                    locale, langtag_to_locale[i].bcpID, langtag_to_locale[i].locID);
+            }
+            if (parsedLen != langtag_to_locale[i].len) {
+                log_err("uloc_forLanguageTag parsed length of %d for input language tag [%s] - expected parsed length: %d\n",
+                    parsedLen, langtag_to_locale[i].bcpID, langtag_to_locale[i].len);
+            }
+        }
+    }
+}
+
diff --git a/icu4c/source/test/cintltst/cloctst.h b/icu4c/source/test/cintltst/cloctst.h
index 52d6c45b32d..8c5e73c5386 100644
--- a/icu4c/source/test/cintltst/cloctst.h
+++ 
b/icu4c/source/test/cintltst/cloctst.h @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2008, International Business Machines Corporation and + * Copyright (c) 1997-2009, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /******************************************************************************** @@ -115,4 +115,9 @@ static void TestOrientation(void); static void TestLikelySubtags(void); +/** + * language tag + */ +static void TestForLanguageTag(void); +static void TestToLanguageTag(void); #endif