diff --git a/icu4c/source/common/putil.c b/icu4c/source/common/putil.c index bc0e7acbd88..d43962b36af 100644 --- a/icu4c/source/common/putil.c +++ b/icu4c/source/common/putil.c @@ -1724,7 +1724,9 @@ _uErrorInfoName[U_ERROR_WARNING_LIMIT-U_ERROR_WARNING_START]={ "U_USING_DEFAULT_WARNING", "U_SAFECLONE_ALLOCATED_WARNING", "U_STATE_OLD_WARNING", - "U_STRING_NOT_TERMINATED_WARNING" + "U_STRING_NOT_TERMINATED_WARNING", + "U_SORT_KEY_TOO_SHORT_WARNING", + "U_AMBIGUOUS_ALIAS_WARNING" }; static const char * const diff --git a/icu4c/source/common/ucnv.c b/icu4c/source/common/ucnv.c index c32bb2475a9..cd7393ccc89 100644 --- a/icu4c/source/common/ucnv.c +++ b/icu4c/source/common/ucnv.c @@ -283,8 +283,7 @@ ucnv_countAvailable () U_CAPI uint16_t U_EXPORT2 ucnv_countAliases(const char *alias, UErrorCode *pErrorCode) { - const char *p; - return ucnv_io_getAliases(alias, &p, pErrorCode); + return ucnv_io_countAliases(alias, pErrorCode); } @@ -297,14 +296,7 @@ ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) U_CAPI void U_EXPORT2 ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode) { - const char *p; - uint16_t count=ucnv_io_getAliases(alias, &p, pErrorCode); - while(count>0) { - *aliases++=p; - /* skip a name, first the canonical converter name */ - p+=uprv_strlen(p)+1; - --count; - } + ucnv_io_getAliases(alias, 0, aliases, pErrorCode); } U_CAPI uint16_t U_EXPORT2 diff --git a/icu4c/source/common/ucnv_io.c b/icu4c/source/common/ucnv_io.c index 1081fea4c37..f219cb2bf84 100644 --- a/icu4c/source/common/ucnv_io.c +++ b/icu4c/source/common/ucnv_io.c @@ -31,67 +31,140 @@ #include "unicode/udata.h" #include "ucln_cmn.h" -/* Format of cnvalias.dat ------------------------------------------------------ +/* Format of cnvalias.icu ----------------------------------------------------- * - * cnvalias.dat is a binary, memory-mappable form of convrtrs.txt . - * It contains two sorted tables and a block of zero-terminated strings. 
- * Each table is preceded by the number of table entries. + * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt. + * This binary form contains several tables. All indexes are to uint16_t + * units, and not to the bytes (uint8_t units). Addressing everything on + * 16-bit boundaries allows us to store more information with small index + * numbers, which are also 16-bit in size. The majority of the tables (except + * the string table) consist of 16-bit numbers. * - * The first table maps from aliases to converter indexes. - * The converter names themselves are listed as aliases in this table. - * Each entry in this table has an offset to the alias and - * an index of the converter in the converter table. + * First there is the size of the Table of Contents (TOC). The TOC + * entries contain the size of each section. In order to find the offset + * of a section, sum up the sizes of the sections before it (after the TOC). * - * The second table lists only the converters themselves. - * Each entry in this table has an offset to the converter name and - * the number of aliases, including the converter itself. - * A count of 1 means that there is no alias, only the converter name. + * 1) This section contains a list of converters. This list contains indexes + * into the string table for the converter name. The index of this list is + * also used by other sections, which are mentioned later on. * - * In the block of strings after the tables, each converter name is directly - * followed by its aliases. All offsets to strings are offsets from the - * beginning of the data. + * 2) This section contains a list of tags. This list contains indexes + * into the string table for the tag name. The index of this list is + * also used by other sections, which are mentioned later on. * - * More formal file data structure (data format 2.1): + * 3) This section contains a sorted list of unique aliases. This + * list contains indexes into the string table for the alias name. 
The + * index of this list is also used by other sections, which are mentioned + * later on. * - * uint16_t aliasCount; - * uint16_t aliasOffsets[aliasCount]; - * uint16_t converterIndexes[aliasCount]; + * 4) This section contains a list of mapped converter names. Consider this + * as a table that maps the 3rd section to the 1st section. This list contains + * indexes into the 1st section. The index of this list is the same index in + * the 3rd section. There is also some extra information in the high bits of + * each converter index in this table. Currently it's only used to say that + * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK + * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is + * the predigested form of the 5th section so that an alias lookup can be fast. + * + * 5) This section contains a 2D array with indexes to the 6th section. This + * section is the full form of all alias mappings. The column index is the + * index into the converter list (column header). The row index is the index + * into the tag list (row header). This 2D array is the top part of a 3D array. The + * third dimension is in the 6th section. * - * uint16_t converterCount; - * struct { - * uint16_t converterOffset; - * uint16_t aliasCount; - * } converters[converterCount]; + * 6) This is a blob of variable-length arrays. Each array starts with a size, + * and is followed by indexes to alias names in the string table. This is + * the third dimension of section 5. No other section should be referencing + * this section. * - * uint16_t tagCount; - * uint16_t taggedAliasesOffsets[tagCount][converterCount]; - * char tags[] = { "Tag0\Tag1\0..." }; + * 7) Reserved at this time (There is no information). This _usually_ has a + * size of 0. Future versions may add more information here. * - * char strings[]={ - * "Converter0\0Alias1\0Alias2\0...Converter1\0Converter2\0Alias0\Alias1\0..." - * }; + * 8) This is the string table. 
All strings are indexed on an even address. + * There are two reasons for this. First many chip architectures locate strings + * faster on even address boundaries. Second, since all indexes are 16-bit + * numbers, this string table can be 128KB in size instead of 64KB when we + * only have strings starting on an even address. * - * The code included here can read versions 2 and 2.1 of the data format. - * Version 2 does not have tag information, but since the code never refers - * to strings[] by its base offset, it's okay. * + * Here is the concept of section 5 and 6. It's a 3D cube. Each tag + * has a unique alias among all converters. That same alias can + * be mentioned in other standards on different converters, + * but only one alias per tag can be unique. + * + * + * Converter Names (Usually in TR22 form) + * -------------------------------------------. + * T / /| + * a / / | + * g / / | + * s / / | + * / / | + * ------------------------------------------/ | + * A | | | + * l | | | + * i | | / + * a | | / + * s | | / + * e | | / + * s | |/ + * ------------------------------------------- + * + * + * + * Here is what it really looks like. It's like swiss cheese. + * There are holes. Some converters aren't recognized by + * a standard, or they are really old converters that the + * standard doesn't recognize anymore. + * + * Converter Names (Usually in TR22 form) + * -------------------------------------------. 
+ * T /##########################################/| + * a / # # /# + * g / # ## ## ### # ### ### ### #/ + * s / # ##### #### ## ## #/# + * / ### # # ## # # # ### # # #/## + * ------------------------------------------/# # + * A |### # # ## # # # ### # # #|# # + * l |# # # # # ## # #|# # + * i |# # # # # # #|# + * a |# #|# + * s | #|# + * e + * s + * */ static const char DATA_NAME[] = "cnvalias"; -static const char DATA_TYPE[] = "dat"; +static const char DATA_TYPE[] = "icu"; static UDataMemory *aliasData=NULL; -static const uint16_t *aliasTable=NULL; + +static const uint16_t *converterList = NULL; +static const uint16_t *tagList = NULL; +static const uint16_t *aliasList = NULL; +static const uint16_t *untaggedConvArray = NULL; +static const uint16_t *taggedAliasArray = NULL; +static const uint16_t *taggedAliasLists = NULL; +static const uint16_t *stringTable = NULL; + +static uint32_t converterListNum; +static uint32_t tagListNum; +static uint32_t aliasListNum; +static uint32_t untaggedConvArraySize; +static uint32_t taggedAliasArraySize; +static uint32_t taggedAliasListsSize; +static uint32_t stringTableSize; static const char **availableConverters = NULL; static uint16_t availableConverterCount = 0; -static const uint16_t *converterTable = NULL; -static const uint16_t *tagTable = NULL; - -static char defaultConverterNameBuffer[100]; +static char defaultConverterNameBuffer[UCNV_MAX_CONVERTER_NAME_LENGTH + 1]; /* +1 for NULL */ static const char *defaultConverterName = NULL; +#define GET_STRING(idx) (const char *)(stringTable + (idx)) +#define NUM_RESERVED_TAGS 2 + static UBool isAcceptable(void *context, const char *type, const char *name, @@ -104,7 +177,7 @@ isAcceptable(void *context, pInfo->dataFormat[1]==0x76 && pInfo->dataFormat[2]==0x41 && pInfo->dataFormat[3]==0x6c && - pInfo->formatVersion[0]==2); + pInfo->formatVersion[0]==3); } static UBool @@ -115,32 +188,64 @@ haveAliasData(UErrorCode *pErrorCode) { /* load converter alias data from file if necessary */ 
if(aliasData==NULL) { - UDataMemory *data; - UDataInfo info; - const uint16_t *table=NULL; + UDataMemory *data = NULL; + const uint16_t *table = NULL; + uint32_t tableStart; + uint32_t currOffset; + uint32_t reservedSize1; - /* open the data outside the mutex block */ - data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); + data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); if(U_FAILURE(*pErrorCode)) { return FALSE; } - table=(const uint16_t *)udata_getMemory(data); - info.size=sizeof(UDataInfo); - udata_getInfo(data, &info); + table = (const uint16_t *)udata_getMemory(data); + + tableStart = ((const uint32_t *)(table))[0]; + if (tableStart < 8) { + *pErrorCode = U_INVALID_FORMAT_ERROR; + return FALSE; + } - /* in the mutex block, set the data for this process */ umtx_lock(NULL); if(aliasData==NULL) { - aliasData=data; + aliasData = data; data=NULL; - aliasTable=table; - table=NULL; - converterTable = aliasTable + 1 + 2 * *aliasTable; - if (info.formatVersion[0] == 2 && info.formatVersion[1] > 0) { - tagTable = converterTable + 1 + 2 * *converterTable; - } + converterListNum = ((const uint32_t *)(table))[1]; + tagListNum = ((const uint32_t *)(table))[2]; + aliasListNum = ((const uint32_t *)(table))[3]; + untaggedConvArraySize = ((const uint32_t *)(table))[4]; + taggedAliasArraySize = ((const uint32_t *)(table))[5]; + taggedAliasListsSize = ((const uint32_t *)(table))[6]; + reservedSize1 = ((const uint32_t *)(table))[7]; /* reserved */ + stringTableSize = ((const uint32_t *)(table))[8]; + + currOffset = tableStart * (sizeof(uint32_t)/sizeof(uint16_t)) + (sizeof(uint32_t)/sizeof(uint16_t)); + converterList = table + currOffset; + + currOffset += converterListNum; + tagList = table + currOffset; + + currOffset += tagListNum; + aliasList = table + currOffset; + + currOffset += aliasListNum; + untaggedConvArray = table + currOffset; + + currOffset += untaggedConvArraySize; + taggedAliasArray = table + 
currOffset; + + /* aliasLists is a 1's based array, but it has a padding character */ + currOffset += taggedAliasArraySize; + taggedAliasLists = table + currOffset; + + currOffset += taggedAliasListsSize; + /* reserved */ + + currOffset += reservedSize1; + stringTable = table + currOffset; + } umtx_unlock(NULL); @@ -175,49 +280,40 @@ ucnv_io_cleanup() ucnv_io_flushAvailableConverterCache(); - aliasData = NULL; - aliasTable = NULL; + converterListNum = 0; + tagListNum = 0; + aliasListNum = 0; + untaggedConvArraySize = 0; + taggedAliasArraySize = 0; + taggedAliasListsSize = 0; + stringTableSize = 0; - converterTable = NULL; - tagTable = NULL; + converterList = NULL; + tagList = NULL; + aliasList = NULL; + untaggedConvArray = NULL; + taggedAliasArray = NULL; + taggedAliasLists = NULL; + stringTable = NULL; defaultConverterName = NULL; + defaultConverterNameBuffer[0] = 0; return TRUE; /* Everything was cleaned up */ } -static int16_t getTagNumber(const char *tagname) { - if (tagTable) { - int16_t tag, count = (int16_t) *tagTable; - const char *tags = (const char *) (tagTable + 1 + count * *converterTable); - -#if 0 - - char name[100]; - int i; - - /* convert the tag name to lowercase to do case-insensitive comparisons */ - for(i = 0; i < sizeof(name) - 1 && *tagname; ++i) { - name[i] = (char)uprv_tolower(*tagname++); - } - name[i] = 0; - -#else - - const char *name = tagname; - -#endif - - for (tag = 0; count--; ++tag) { - if (!uprv_stricmp(name, tags)) { - return tag; +static uint32_t getTagNumber(const char *tagname) { + if (tagList) { + uint32_t tagNum; + for (tagNum = 0; tagNum < tagListNum; tagNum++) { + if (!uprv_stricmp(GET_STRING(tagList[tagNum]), tagname)) { + return tagNum; } - tags += strlen(tags) + 1; } } - return -1; + return UINT32_MAX; } /** @@ -240,14 +336,16 @@ static int16_t getTagNumber(const char *tagname) { U_CAPI int U_EXPORT2 ucnv_compareNames(const char *name1, const char *name2) { int rc; - unsigned char c1, c2; + char c1, c2; for (;;) { /* 
Ignore delimiters '-', '_', and ' ' */ - while ((c1 = (unsigned char)*name1) == '-' - || c1 == '_' || c1 == ' ') ++name1; - while ((c2 = (unsigned char)*name2) == '-' - || c2 == '_' || c2 == ' ') ++name2; + while ((c1 = *name1) == '-' || c1 == '_' || c1 == ' ') { + ++name1; + } + while ((c2 = *name2) == '-' || c2 == '_' || c2 == ' ') { + ++name2; + } /* If we reach the ends of both strings then they match */ if ((c1|c2)==0) { @@ -257,7 +355,7 @@ ucnv_compareNames(const char *name1, const char *name2) { /* Case-insensitive comparison */ rc = (int)(unsigned char)uprv_tolower(c1) - (int)(unsigned char)uprv_tolower(c2); - if (rc!=0) { + if (rc != 0) { return rc; } ++name1; @@ -267,69 +365,87 @@ ucnv_compareNames(const char *name1, const char *name2) { /* * search for an alias - * return NULL or a pointer to the converter table entry + * return the converter number index for converterList */ -static const uint16_t * -findAlias(const char *alias) { - char name[100]; - const uint16_t *p=aliasTable; - uint16_t i, start, limit; - - limit=*p++; - if(limit==0) { - /* there are no aliases */ - return NULL; - } - - /* convert the alias name to lowercase to do case-insensitive comparisons */ - for(i=0; i 0) { + start = mid+1; } else { - start=i; + /* Since the gencnval tool folds duplicates into one entry, + * this alias in aliasList is unique, but different standards + * may map an alias to different converters. + */ + if (untaggedConvArray[mid] & UCNV_AMBIGUOUS_ALIAS_MAP_BIT) { + *pErrorCode = U_AMBIGUOUS_ALIAS_WARNING; + } + return untaggedConvArray[mid] & UCNV_CONVERTER_INDEX_MASK; } } - /* did we really find it? 
*/ - if(ucnv_compareNames(name, (const char *)aliasTable+p[start])==0) { - limit=*(p-1); /* aliasCount */ - p+=limit; /* advance to the second column of the alias table */ - i=p[start]; /* converter index */ - return - p+limit+ /* beginning of converter table */ - 1+ /* skip its count */ - 2*i; /* go to this converter's entry and return a pointer to it */ - } else { - return NULL; - } + return UINT32_MAX; } U_CFUNC const char * ucnv_io_getConverterName(const char *alias, UErrorCode *pErrorCode) { if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { - const uint16_t *p=findAlias(alias); - if(p!=NULL) { - return (const char *)aliasTable+*p; + uint32_t convNum = findConverter(alias, pErrorCode); + if (convNum < converterListNum) { + return GET_STRING(converterList[convNum]); } } return NULL; } U_CFUNC uint16_t -ucnv_io_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode) { +ucnv_io_countAliases(const char *alias, UErrorCode *pErrorCode) { if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { - const uint16_t *p=findAlias(alias); - if(p!=NULL) { - *aliases=(const char *)aliasTable+*p; - return *(p+1); + uint32_t convNum = findConverter(alias, pErrorCode); + if (convNum < converterListNum) { + /* tagListNum - 1 is the ALL tag */ + int32_t listOffset = taggedAliasArray[(tagListNum - 1)*converterListNum + convNum]; + + if (listOffset) { + return taggedAliasLists[listOffset]; + } + } + } + return 0; +} + +U_CFUNC uint16_t +ucnv_io_getAliases(const char *alias, uint16_t start, const char **aliases, UErrorCode *pErrorCode) { + if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { + uint32_t currAlias; + uint32_t convNum = findConverter(alias, pErrorCode); + if (convNum < converterListNum) { + /* tagListNum - 1 is the ALL tag */ + int32_t listOffset = taggedAliasArray[(tagListNum - 1)*converterListNum + convNum]; + + if (listOffset) { + uint32_t listCount = taggedAliasLists[listOffset]; + /* +1 to skip listCount */ + const 
uint16_t *currList = taggedAliasLists + listOffset + 1; + + for (currAlias = start; currAlias < listCount; currAlias++) { + aliases[currAlias] = GET_STRING(currList[currAlias]); + } + } } } return 0; @@ -338,17 +454,20 @@ ucnv_io_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCo U_CFUNC const char * ucnv_io_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) { if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { - const uint16_t *p=findAlias(alias); - if(p!=NULL) { - uint16_t count=*(p+1); - if(n0) { - /* skip a name, first the canonical converter name */ - aliases+=uprv_strlen(aliases)+1; - --n; + uint32_t convNum = findConverter(alias, pErrorCode); + if (convNum < converterListNum) { + /* tagListNum - 1 is the ALL tag */ + int32_t listOffset = taggedAliasArray[(tagListNum - 1)*converterListNum + convNum]; + + if (listOffset) { + uint32_t listCount = taggedAliasLists[listOffset]; + /* +1 to skip listCount */ + const uint16_t *currList = taggedAliasLists + listOffset + 1; + + if (n < listCount) { + return GET_STRING(currList[n]); } - return aliases; + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; } } } @@ -358,12 +477,8 @@ ucnv_io_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) { U_CFUNC uint16_t ucnv_io_countStandards(UErrorCode *pErrorCode) { if (haveAliasData(pErrorCode)) { - if (!tagTable) { - *pErrorCode = U_INVALID_FORMAT_ERROR; - return 0; - } - - return *tagTable; + /* Don't include the empty list */ + return (uint16_t)(tagListNum - NUM_RESERVED_TAGS); } return 0; @@ -371,15 +486,11 @@ ucnv_io_countStandards(UErrorCode *pErrorCode) { U_CAPI const char * U_EXPORT2 ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) { - if (haveAliasData(pErrorCode) && tagTable) { - int16_t count = (int16_t) *tagTable; - const char *tags = (const char *) (tagTable + 1 + count * *converterTable); - - while (n-- && count--) { - tags += strlen(tags) + 1; + if (haveAliasData(pErrorCode)) { + if (n < tagListNum - 
NUM_RESERVED_TAGS) { + return GET_STRING(tagList[n]); } - - return count ? tags : NULL; + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; } return NULL; @@ -388,18 +499,56 @@ ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) { U_CFUNC const char * U_EXPORT2 ucnv_getStandardName(const char *alias, const char *standard, UErrorCode *pErrorCode) { if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { - const uint16_t *p = findAlias(alias); - if(p != NULL) { - int16_t tag = getTagNumber(standard); + uint32_t idx; + uint32_t listOffset; + uint32_t convNum; + uint32_t tagNum = getTagNumber(standard); + UErrorCode myErr = U_ZERO_ERROR; - if (tag > -1) { - uint16_t offset = tagTable[1 + tag * *converterTable + (p - converterTable) / 2]; - return offset ? (const char *) aliasTable + offset : NULL; + /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ + convNum = findConverter(alias, &myErr); + + if (tagNum < (tagListNum - NUM_RESERVED_TAGS) && convNum < converterListNum) { + if (myErr == U_AMBIGUOUS_ALIAS_WARNING) { + /* Uh Oh! They used an ambiguous alias. + Hopefully the standard knows the alias. + This may take a while. + */ + for (idx = 0; idx < converterListNum; idx++) { + listOffset = taggedAliasArray[tagNum*converterListNum + idx]; + if (listOffset) { + uint32_t currAlias; + uint32_t listCount = taggedAliasLists[listOffset]; + /* +1 to skip listCount */ + const uint16_t *currList = taggedAliasLists + listOffset + 1; + for (currAlias = 0; currAlias < listCount; currAlias++) { + if (currList[currAlias] + && ucnv_compareNames(alias, GET_STRING(currList[currAlias]))==0) + { + if (currList[0]) { + return GET_STRING(currList[0]); + } + else { + /* Someone screwed up the alias table. 
*/ + return NULL; + } + } + } + } + } + /* The standard doesn't know about the alias */ + *pErrorCode = U_AMBIGUOUS_ALIAS_WARNING; } + listOffset = taggedAliasArray[tagNum*converterListNum + convNum]; + if (listOffset && taggedAliasLists[listOffset + 1]) { + return GET_STRING(taggedAliasLists[listOffset + 1]); + } + /* else no default name */ } + /* else converter or tag not found */ } - return NULL; + return NULL; } void @@ -413,41 +562,52 @@ ucnv_io_flushAvailableConverterCache() { availableConverterCount = 0; } -static void ucnv_io_loadAvailableConverterList(void) { - uint16_t idx = 0; - uint16_t localConverterCount = 0; - UErrorCode status; - char *converterName; - - /* We can't have more than "*converterTable" converters to open */ - char **localConverterList = (char **) uprv_malloc(*converterTable * sizeof(char*)); - - for (; idx < *converterTable; idx++) { - status = U_ZERO_ERROR; - converterName = (char *)aliasTable+converterTable[1+2*idx]; - ucnv_close(ucnv_open(converterName, &status)); - if (U_SUCCESS(status)) { - localConverterList[localConverterCount++] = converterName; - } - } - - umtx_lock(NULL); +static UBool haveAvailableConverterList(UErrorCode *pErrorCode) { if (availableConverters == NULL) { - availableConverters = (const char **)localConverterList; - availableConverterCount = localConverterCount; + uint16_t idx; + uint16_t localConverterCount; + UErrorCode status; + const char *converterName; + const char **localConverterList; + + if (!haveAliasData(pErrorCode)) { + return FALSE; + } + + /* We can't have more than "*converterTable" converters to open */ + localConverterList = (const char **) uprv_malloc(converterListNum * sizeof(char*)); + if (!localConverterList) { + *pErrorCode = U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + + localConverterCount = 0; + + for (idx = 0; idx < converterListNum; idx++) { + status = U_ZERO_ERROR; + converterName = GET_STRING(converterList[idx]); + ucnv_close(ucnv_open(converterName, &status)); + if 
(U_SUCCESS(status)) { + localConverterList[localConverterCount++] = converterName; + } + } + + umtx_lock(NULL); + if (availableConverters == NULL) { + availableConverters = localConverterList; + availableConverterCount = localConverterCount; + } + else { + uprv_free((char **)localConverterList); + } + umtx_unlock(NULL); } - else { - uprv_free(localConverterList); - } - umtx_unlock(NULL); + return TRUE; } U_CFUNC uint16_t ucnv_io_countAvailableConverters(UErrorCode *pErrorCode) { - if(haveAliasData(pErrorCode)) { - if (availableConverters == NULL) { - ucnv_io_loadAvailableConverterList(); - } + if (haveAvailableConverterList(pErrorCode)) { return availableConverterCount; } return 0; @@ -455,20 +615,18 @@ ucnv_io_countAvailableConverters(UErrorCode *pErrorCode) { U_CFUNC const char * ucnv_io_getAvailableConverter(uint16_t n, UErrorCode *pErrorCode) { - if(haveAliasData(pErrorCode)) { - if (availableConverters == NULL) { - ucnv_io_loadAvailableConverterList(); - } - if(n < availableConverterCount) { + if (haveAvailableConverterList(pErrorCode)) { + if (n < availableConverterCount) { return availableConverters[n]; } + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; } return NULL; } U_CFUNC void ucnv_io_fillAvailableConverters(const char **aliases, UErrorCode *pErrorCode) { - if(haveAliasData(pErrorCode)) { + if (haveAvailableConverterList(pErrorCode)) { uint16_t count = 0; while (count < availableConverterCount) { *aliases++=availableConverters[count++]; @@ -478,42 +636,12 @@ ucnv_io_fillAvailableConverters(const char **aliases, UErrorCode *pErrorCode) { U_CFUNC uint16_t ucnv_io_countAvailableAliases(UErrorCode *pErrorCode) { - if(haveAliasData(pErrorCode)) { - return *aliasTable; + if (haveAliasData(pErrorCode)) { + return (uint16_t)aliasListNum; } return 0; } -#if 0 -/* - * We are not currently using these functions, so I am commenting them out - * to reduce the binary file size and improve the code coverage; - * I do not currently want to remove this entirely because it 
may be useful - * in the future and also serves to some degree as another piece of - * documentation of the data structure. - */ -U_CFUNC const char * -ucnv_io_getAvailableAlias(uint16_t n, UErrorCode *pErrorCode) { - if(haveAliasData(pErrorCode) && n<*aliasTable) { - return (const char *)aliasTable+*(aliasTable+1+n); - } - return NULL; -} - -U_CFUNC void -ucnv_io_fillAvailableAliases(const char **aliases, UErrorCode *pErrorCode) { - if(haveAliasData(pErrorCode)) { - const uint16_t *p=aliasTable; - uint16_t count=*p++; - while(count>0) { - *aliases++=(const char *)aliasTable+*p; - ++p; - --count; - } - } -} -#endif - /* default converter name --------------------------------------------------- */ /* @@ -529,10 +657,7 @@ ucnv_io_getDefaultConverterName() { /* local variable to be thread-safe */ const char *name=defaultConverterName; if(name==NULL) { - const char *codepage=0; - umtx_lock(NULL); - codepage = uprv_getDefaultCodepage(); - umtx_unlock(NULL); + const char *codepage = uprv_getDefaultCodepage(); if(codepage!=NULL) { UErrorCode errorCode=U_ZERO_ERROR; name=ucnv_io_getConverterName(codepage, &errorCode); @@ -543,26 +668,27 @@ ucnv_io_getDefaultConverterName() { /* if the name is there, test it out */ if(name != NULL) { - UErrorCode errorCode = U_ZERO_ERROR; - UConverter *cnv; - cnv = ucnv_open(name, &errorCode); - if(U_FAILURE(errorCode) || (cnv == NULL)) { - /* Panic time, let's use a fallback. */ + UErrorCode errorCode = U_ZERO_ERROR; + UConverter *cnv = ucnv_open(name, &errorCode); + if(U_FAILURE(errorCode) || (cnv == NULL)) { + /* Panic time, let's use a fallback. 
*/ #if (U_CHARSET_FAMILY == U_ASCII_FAMILY) - name = "US-ASCII"; - /* there is no 'algorithmic' converter for EBCDIC */ + name = "US-ASCII"; + /* there is no 'algorithmic' converter for EBCDIC */ #elif defined(OS390) - name = "ibm-1047-s390"; + name = "ibm-1047-s390"; #else - name = "ibm-37"; + name = "ibm-37"; #endif - } - ucnv_close(cnv); + } + ucnv_close(cnv); } if(name != NULL) { - /* Did find a name. And it works.*/ - defaultConverterName=name; + umtx_lock(NULL); + /* Did find a name. And it works.*/ + defaultConverterName=name; + umtx_unlock(NULL); } } diff --git a/icu4c/source/common/ucnv_io.h b/icu4c/source/common/ucnv_io.h index 500718dee06..6dc2798ea3d 100644 --- a/icu4c/source/common/ucnv_io.h +++ b/icu4c/source/common/ucnv_io.h @@ -15,6 +15,9 @@ #include "unicode/utypes.h" +#define UCNV_AMBIGUOUS_ALIAS_MAP_BIT 0x8000 +#define UCNV_CONVERTER_INDEX_MASK 0x7FF + /** * Map a converter alias name to a canonical converter name. * The alias is searched for case-insensitively, the converter name @@ -24,6 +27,12 @@ U_CFUNC const char * ucnv_io_getConverterName(const char *alias, UErrorCode *pErrorCode); +/** + * The count for ucnv_io_getAliases and ucnv_io_getAlias + */ +U_CFUNC uint16_t +ucnv_io_countAliases(const char *alias, UErrorCode *pErrorCode); + /** * Search case-insensitively for a converter alias and set aliases to * a pointer to the list of aliases for the actual converter. @@ -34,7 +43,7 @@ ucnv_io_getConverterName(const char *alias, UErrorCode *pErrorCode); * or 0 if the alias is not found. 
*/ U_CFUNC uint16_t -ucnv_io_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode); +ucnv_io_getAliases(const char *alias, uint16_t start, const char **aliases, UErrorCode *pErrorCode); /** * Search case-insensitively for a converter alias and return @@ -85,21 +94,6 @@ ucnv_io_flushAvailableConverterCache(void); U_CFUNC uint16_t ucnv_io_countAvailableAliases(UErrorCode *pErrorCode); -/** - * Return the (n)th alias or converter name in mixed case, or NULL - * if there is none (typically, if the data cannot be loaded). - * 0<=indexucnv_io_getConverterName(). diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h index ab0ded6bc02..9c2a816ddb8 100644 --- a/icu4c/source/common/unicode/ucnv.h +++ b/icu4c/source/common/unicode/ucnv.h @@ -40,6 +40,7 @@ U_CDECL_BEGIN /* maximum length of the converter names */ #define UCNV_MAX_CONVERTER_NAME_LENGTH 60 +/* maximum length of the converter name including path */ #define UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH) #define UCNV_SI 0x0F /*Shift in for EBDCDIC_STATEFUL and iso2022 states */ @@ -979,7 +980,6 @@ ucnv_getAvailableName (int32_t n); /** * Gives the number of aliases for a given converter or alias name. - * Note that additional aliases are recognized by ucnv_open(). * This method only enumerates the listed entries in the alias file. * @param alias alias name * @param pErrorCode error status @@ -991,7 +991,6 @@ ucnv_countAliases(const char *alias, UErrorCode *pErrorCode); /** * Gives the name of the alias at given index of alias list. - * Note that additional aliases are recognized by ucnv_open(). * This method only enumerates the listed entries in the alias file. * @param alias alias name * @param n index in alias list @@ -1005,7 +1004,6 @@ ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode); /** * Fill-up the list of alias names for the given alias. - * Note that additional aliases are recognized by ucnv_open(). 
* This method only enumerates the listed entries in the alias file. * @param alias alias name * @param aliases fill-in list, aliases is a pointer to an array of diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index 68b01b5b72d..aa6ca5e5a78 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -379,8 +379,11 @@ enum UErrorCode { U_STATE_OLD_WARNING = -125, /**< ICU has to use compatibility layer to construct the service. Expect performance/memory usage degradation. Consider upgrading */ U_STRING_NOT_TERMINATED_WARNING = -124,/**< An output string could not be NUL-terminated because output length==destCapacity. */ + U_SORT_KEY_TOO_SHORT_WARNING = -123, + U_AMBIGUOUS_ALIAS_WARNING = -122, + U_ERROR_WARNING_LIMIT, /**< This must always be the last warning value to indicate the limit for UErrorCode warnings (last warning code +1) */ /** @deprecated use the enum that ends in _WARNING */ @@ -476,18 +479,18 @@ enum UErrorCode { /* * the error code range 0x10200 0x10300 are reserved for Break Iterator related error */ - U_BRK_ERROR_START=0x10200, - U_BRK_INTERNAL_ERROR, - U_BRK_HEX_DIGITS_EXPECTED, - U_BRK_SEMICOLON_EXPECTED, - U_BRK_RULE_SYNTAX, - U_BRK_UNCLOSED_SET, - U_BRK_ASSIGN_ERROR, - U_BRK_VARIABLE_REDFINITION, - U_BRK_MISMATCHED_PAREN, - U_BRK_NEW_LINE_IN_QUOTED_STRING, - U_BRK_UNDEFINED_VARIABLE, - U_BRK_ERROR_LIMIT, + U_BRK_ERROR_START=0x10200, + U_BRK_INTERNAL_ERROR, + U_BRK_HEX_DIGITS_EXPECTED, + U_BRK_SEMICOLON_EXPECTED, + U_BRK_RULE_SYNTAX, + U_BRK_UNCLOSED_SET, + U_BRK_ASSIGN_ERROR, + U_BRK_VARIABLE_REDFINITION, + U_BRK_MISMATCHED_PAREN, + U_BRK_NEW_LINE_IN_QUOTED_STRING, + U_BRK_UNDEFINED_VARIABLE, + U_BRK_ERROR_LIMIT, U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ }; diff --git a/icu4c/source/data/makedata.mak b/icu4c/source/data/makedata.mak index 27d864ddad5..0963b000e46 100644 --- 
a/icu4c/source/data/makedata.mak +++ b/icu4c/source/data/makedata.mak @@ -35,6 +35,8 @@ ICUOUT=$(ICUMAKE)\out # ICUP=$(ICUMAKE)\..\.. ICUP=$(ICUP:\source\data\..\..=) +# In case the first one didn't do it, try this one. .NET would do the second one. +ICUP=$(ICUP:\source\data\\..\..=) !MESSAGE ICU root path is $(ICUP) @@ -238,14 +240,14 @@ BRK_FILES = "$(ICUBLD)\sent.brk" "$(ICUBLD)\char.brk" "$(ICUBLD)\line.brk" "$(IC # move the .dll and .lib files to their final destination afterwards. # The $(U_ICUDATA_NAME).lib and $(U_ICUDATA_NAME).exp should already be in the right place due to stubdata. # -"$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll" : "$(ICUTOOLS)\pkgdata\$(CFG)\pkgdata.exe" $(CNV_FILES) $(BRK_FILES) "$(ICUBLD)\uprops.dat" "$(ICUBLD)\unames.dat" "$(ICUBLD)\unorm.dat" "$(ICUBLD)\cnvalias.dat" "$(ICUBLD)\tz.dat" "$(ICUBLD)\ucadata.dat" "$(ICUBLD)\invuca.dat" $(ALL_RES) "$(ICUBLD)\icudata.res" "$(ICUP)\source\stubdata\stubdatabuilt.txt" +"$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll" : "$(ICUTOOLS)\pkgdata\$(CFG)\pkgdata.exe" $(CNV_FILES) $(BRK_FILES) "$(ICUBLD)\uprops.dat" "$(ICUBLD)\unames.dat" "$(ICUBLD)\unorm.dat" "$(ICUBLD)\cnvalias.icu" "$(ICUBLD)\tz.dat" "$(ICUBLD)\ucadata.dat" "$(ICUBLD)\invuca.dat" $(ALL_RES) "$(ICUBLD)\icudata.res" "$(ICUP)\source\stubdata\stubdatabuilt.txt" @echo Building icu data @cd "$(ICUBLD)" "$(ICUTOOLS)\pkgdata\$(CFG)\pkgdata" -e $(U_ICUDATA_NAME) -v -m dll -c -p $(U_ICUDATA_NAME) -O "$(PKGOPT)" -d "$(ICUBLD)" -s . < Yen mapping -ibm-33722_P12A-2000 ibm-33722_VPUA EUC-JP { MIME } ibm-eucJP eucjis Extended_UNIX_Code_Packed_Format_for_Japanese { IANA } cseucpkdfmtjapanese X-EUC-JP # Japan EUC. x-euc-jp is a MIME name -ibm-970 EUC-KR { IANA MIME } ibm-eucKR csEUCKR # Korean EUC. x-euc-kr is a MIME name +ibm-33722_P12A-2000 ibm-33722_VPUA EUC-JP { MIME* } ibm-eucJP eucjis Extended_UNIX_Code_Packed_Format_for_Japanese { IANA* } cseucpkdfmtjapanese X-EUC-JP # Japan EUC. 
x-euc-jp is a MIME name +ibm-970 EUC-KR { IANA* MIME* } ibm-eucKR csEUCKR # Korean EUC. x-euc-kr is a MIME name ibm-964 EUC-TW ibm-eucTW cns11643 # Taiwan EUC. x-euc-tw is a MIME name -ibm-1383_P110-2000 ibm-1383_VPUA ibm-1383 EUC-CN ibm-eucCN GB_2312-80 { IANA } chinese gb iso-ir-58 csISO58GB231280 GB2312 { MIME } gb2312-1980 cp1383 1383 csGB2312# China EUC. x-euc-cn is a MIME name -ibm-1162 tis-620 { IANA } cp874 windows-874 ms874 cp9066 874 # Thai (w/ euro update) + +ibm-1383_P110-2000 ibm-1383_VPUA + ibm-1383 + EUC-CN + ibm-eucCN + GB_2312-80 { IANA* } + chinese { IANA } + gb # This is not an IANA name. gb in IANA means Great Britain. + iso-ir-58 { IANA } + csISO58GB231280 { IANA } + GB2312 { MIME* } + gb2312-1980 + cp1383 + 1383 + csGB2312 # China EUC. x-euc-cn is a MIME name + +ibm-1162 tis-620 { IANA* } cp874 windows-874 ms874 cp9066 874 # Thai (w/ euro update) ibm-874 ibm-1161 # Same as 1162 (w/o euro update) # Platform codepages ibm-437 cp437 csPC8CodePage437 437 # PC US # HSYS: -ibm-850 IBM850 { IANA } cp850 { MIME } 850 csPC850Multilingual # PC latin1 -ibm-851 IBM851 { IANA } cp851 { MIME } 851 csPC851 # PC DOS Greek (w/o euro) -ibm-858 cp858 { MIME } IBM00858 { IANA } # PC latin1 with Euro cp850 removed +ibm-850 IBM850 { IANA* } cp850 { MIME* } 850 csPC850Multilingual # PC latin1 +ibm-851 IBM851 { IANA* } cp851 { MIME* } 851 csPC851 # PC DOS Greek (w/o euro) +ibm-858 cp858 { MIME* } IBM00858 { IANA* } # PC latin1 with Euro cp850 removed ibm-9044 852 csPCp852 cp852 # PC latin2 (w/ euro update) cp852 is a MIME name for IBM-852 -ibm-852 IBM852 { IANA } # PC latin2 (w/o euro update) +ibm-852 IBM852 { IANA* } # PC latin2 (w/o euro update) ibm-872 855 csIBM855 cp855 csPCp855 # PC cyrillic (w/ euro update) cp855 is a MIME name for IBM-855 -ibm-855 IBM855 { IANA } # PC cyrillic (w/o euro update) -ibm-856 cp856 { MIME } 856 # PC Hebrew (old) -ibm-9049 857 csIBM857 cp857 { MIME } # PC Latin 5 (Turkish) (w/ euro update) -ibm-857 IBM857 { IANA } # PC Latin 5 (w/o 
euro update) -ibm-859 cp859 { MIME } # PC Latin 9 (w/ euro update) -ibm-860 IBM860 { IANA } cp860 { MIME } 860 csIBM860 # PC Portugal -ibm-861 IBM861 { IANA } cp861 { MIME } 861 cp-is csIBM861 # PC Iceland -ibm-867 cp867 862 cp862 { MIME } cspc862latinhebrew # PC Hebrew (w/ euro update) -ibm-862 IBM862 { IANA } # PC Hebrew (w/o euro update) -ibm-863 IBM863 { IANA } cp863 { MIME } 863 csIBM863 # PC Canadian French -ibm-17248 cp864 { MIME } csIBM864 # PC Arabic (w/ euro update) -ibm-864 IBM864 { IANA } # PC Arabic (w/o euro update) -ibm-865 IBM865 { IANA } cp865 { MIME } 865 csIBM865 # PC Nordic -ibm-808 cp866 { MIME } 866 csIBM866 # PC Russian (w/ euro update) +ibm-855 IBM855 { IANA* } # PC cyrillic (w/o euro update) +ibm-856 cp856 { MIME* } 856 # PC Hebrew (old) +ibm-9049 857 csIBM857 cp857 { MIME* } # PC Latin 5 (Turkish) (w/ euro update) +ibm-857 IBM857 { IANA* } # PC Latin 5 (w/o euro update) +ibm-859 cp859 { MIME* } # PC Latin 9 (w/ euro update) +ibm-860 IBM860 { IANA* } cp860 { MIME* } 860 csIBM860 # PC Portugal +ibm-861 IBM861 { IANA* } cp861 { MIME* } 861 cp-is csIBM861 # PC Iceland +ibm-867 cp867 862 cp862 { MIME* } cspc862latinhebrew # PC Hebrew (w/ euro update) +ibm-862 IBM862 { IANA* } # PC Hebrew (w/o euro update) +ibm-863 IBM863 { IANA* } cp863 { MIME* } 863 csIBM863 # PC Canadian French +ibm-17248 cp864 { MIME* } csIBM864 # PC Arabic (w/ euro update) +ibm-864 IBM864 { IANA* } # PC Arabic (w/o euro update) +ibm-865 IBM865 { IANA* } cp865 { MIME* } 865 csIBM865 # PC Nordic +ibm-808 cp866 { MIME* } 866 csIBM866 # PC Russian (w/ euro update) ibm-866 # PC Russian (w/o euro update) -ibm-868 IBM868 { IANA } cp868 { MIME } cp-ar csIBM868 868 # PC Urdu -ibm-9061 cp869 { MIME } 869 cp-gr csIBM869 # PC Greek (w/ euro update) -ibm-869 IBM869 { IANA } # PC Greek (w/o euro update) -ibm-878 KOI8-R { IANA MIME } cp878 koi8 cskoi8r # Russian internet -ibm-901 cp921 { MIME } 921 # PC Baltic (w/ euro update) +ibm-868 IBM868 { IANA* } cp868 { MIME* } cp-ar csIBM868 868 # 
PC Urdu +ibm-9061 cp869 { MIME* } 869 cp-gr csIBM869 # PC Greek (w/ euro update) +ibm-869 IBM869 { IANA* } # PC Greek (w/o euro update) +ibm-878 KOI8-R { IANA* MIME* } cp878 koi8 cskoi8r # Russian internet +ibm-901 cp921 { MIME* } 921 # PC Baltic (w/ euro update) ibm-921 # PC Baltic (w/o euro update) -ibm-902 cp922 { MIME } 922 # PC Estonian (w/ euro update) +ibm-902 cp922 { MIME* } 922 # PC Estonian (w/ euro update) ibm-922 # PC Estonian (w/o euro update) #ibm-941 jis-208 jisx-208 # Pure DBCS jisx-208 # ibm-941 is not JISX 208 code page #ibm-1038 Adobe-Symbol-Encoding csHPPSMath symbol -ibm-5346 windows-1250 { IANA } cp1250 # Windows Latin2 (w/ euro update) -ibm-5347 windows-1251 { IANA } cp1251 # Windows Cyrillic (w/ euro update) -ibm-5348 windows-1252 { IANA } cp1252 # Windows Latin1 (w/ euro update) -ibm-5349 windows-1253 { IANA } cp1253 # Windows Greek (w/ euro update) -ibm-5350 windows-1254 { IANA } cp1254 # Windows Turkish (w/ euro update) -ibm-5351 windows-1255 { IANA } cp1255 # Windows Hebrew (w/ euro update) -ibm-5352 windows-1256 { IANA } cp1256 # Windows Arabic (w/ euro update) -ibm-5353 windows-1257 { IANA } cp1257 # Windows Baltic (w/ euro update) -ibm-5354 windows-1258 { IANA } cp1258 # Windows Vietnamese (w/ euro update) +ibm-5346 windows-1250 { IANA* } cp1250 # Windows Latin2 (w/ euro update) +ibm-5347 windows-1251 { IANA* } cp1251 # Windows Cyrillic (w/ euro update) +ibm-5348 windows-1252 { IANA* } cp1252 # Windows Latin1 (w/ euro update) +ibm-5349 windows-1253 { IANA* } cp1253 # Windows Greek (w/ euro update) +ibm-5350 windows-1254 { IANA* } cp1254 # Windows Turkish (w/ euro update) +ibm-5351 windows-1255 { IANA* } cp1255 # Windows Hebrew (w/ euro update) +ibm-5352 windows-1256 { IANA* } cp1256 # Windows Arabic (w/ euro update) +ibm-5353 windows-1257 { IANA* } cp1257 # Windows Baltic (w/ euro update) +ibm-5354 windows-1258 { IANA* } cp1258 # Windows Vietnamese (w/ euro update) ibm-1250 # Windows Latin2 (w/o euro update) ibm-1251 # Windows 
Cyrillic (w/o euro update) ibm-1253 # Windows Greek (w/o euro update) @@ -240,15 +405,15 @@ ibm-1256 # Windows Arabic (w/o euro update) ibm-1257 # Windows Baltic (w/o euro update) ibm-1258 # Windows Vietnamese (w/o euro update) -ibm-1275 macintosh { IANA } mac { MIME } csMacintosh # Apple latin 1 -ibm-1276 Adobe-Standard-Encoding { IANA } csAdobeStandardEncoding # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276) +ibm-1275 macintosh { IANA* } mac { MIME* } csMacintosh # Apple latin 1 +ibm-1276 Adobe-Standard-Encoding { IANA* } csAdobeStandardEncoding # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276) ibm-1277 Adobe-Latin1-Encoding ibm-1280 macgr # Apple Greek ibm-1281 mactr # Apple Turkish ibm-1282 macce # Apple Central Europe ibm-1283 maccy # Apple Cyrillic -ibm-1051 hp-roman8 { IANA } roman8 r8 csHPRoman8 # HP Latin1 +ibm-1051 hp-roman8 { IANA* } roman8 r8 csHPRoman8 # HP Latin1 ibm-806_P100-2000 ibm-806 ibm-806_VSUB # PC ISCII-91: Indian Script Code ibm-1006_P100-2000 ibm-1006 ibm-1006_VPUA # Urdu @@ -265,120 +430,127 @@ ibm-9066_P100-2000 ibm-9066 ibm-9066_VSUB # Thai PC # Added for more euro support -ibm-849 cp1131 # PC Belarus (w/ euro update) -ibm-848 cp1125 # PC Ukraine (w/ euro update) -ibm-5104 cp1008 # 8-bit Arabic (w/ euro update) -ibm-9238 cp1046 # PC Arabic Extended (w/ euro update) -ibm-1363_P110-2000 ibm-1363 ibm-1363_VASCII_VSUB_VPUA ibm-1362 # Korean KSC Korean Windows MBCS -ibm-1363_P11B-2000 ibm-1363_VSUB_VPUA windows-949 cp949 cp1363 ksc korean -ibm-5210 cp1114 # PC SBCS Big-5 (w/ euro update) -ibm-21427 cp947 # PC DBCS Big-5 (w/ euro update) +ibm-849 cp1131 # PC Belarus (w/ euro update) +ibm-848 cp1125 # PC Ukraine (w/ euro update) +ibm-5104 cp1008 # 8-bit Arabic (w/ euro update) +ibm-9238 cp1046 # PC Arabic Extended (w/ euro update) +ibm-1363_P110-2000 ibm-1363 ibm-1363_VASCII_VSUB_VPUA ibm-1362 # Korean KSC Korean Windows MBCS + +ibm-1363_P11B-2000 ibm-1363_VSUB_VPUA + windows-949 + cp949 + cp1363 + ksc + # korean # The korean alias from 
IANA goes to ibm-949_P11A-2000 + +ibm-5210 cp1114 # PC SBCS Big-5 (w/ euro update) +ibm-21427 cp947 # PC DBCS Big-5 (w/ euro update) # EBCDIC codepages according to the CDRA # without Euro -ibm-37 IBM037 { IANA } ibm-037 cpibm37 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 cp37 cp037 037 # EBCDIC US -ibm-273 IBM273 { IANA } csIBM273 ebcdic-de cp273 cpibm273 273 # EBCDIC Germanay, Austria... -ibm-277 IBM277 { IANA } EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 ebcdic-dk cp277 cpibm277 277 # EBCDIC Denmark... -ibm-278 IBM278 { IANA } ebcdic-cp-fi ebcdic-cp-se csIBM278 ebcdic-sv cp278 cpibm278 278 # EBCDIC Sweden -ibm-280 IBM280 { IANA } ebcdic-cp-it csIBM280 cp280 cpibm280 280 # EBCDIC Italy -ibm-284 IBM284 { IANA } ebcdic-cp-es csIBM284 cp284 cpibm284 284 # EBCDIC Spain -ibm-285 IBM285 { IANA } ebcdic-cp-gb csIBM285 ebcdic-gb cp285 cpibm285 285 # EBCDIC UK Ireland -ibm-290 IBM290 { IANA } EBCDIC-JP-kana csIBM290 cp290 # host SBCS (Katakana) -ibm-297 IBM297 { IANA } ebcdic-cp-fr csIBM297 cp297 cpibm297 297 # EBCDIC France -ibm-420 IBM420 { IANA } ebcdic-cp-ar1 csIBM420 cp420 420 -ibm-424 IBM424 { IANA } ebcdic-cp-he csIBM424 cp424 424 -ibm-500 IBM500 { IANA } cpibm500 csIBM500 cp500 ebcdic-cp-be ebcdic-cp-ch 500 # EBCDIC International Latin1 -ibm-803 cp803 # Old EBCDIC Hebrew -ibm-834 cp834 # Korean DBCS Host -ibm-835 cp835 # DBCS T-Ch Host -ibm-870_P100-2000 IBM870 { IANA } ibm-870 CP870 ibm-870_STD ebcdic-cp-roece ebcdic-cp-yu csIBM870 -ibm-871 IBM871 { IANA } ebcdic-cp-is csIBM871 cpibm871 cp871 871 # EBCDIC Iceland -ibm-875_P100-2000 ibm-875 cp875 ibm-875 875 ibm-875_STD -ibm-918_P100-2000 IBM918 { IANA } ibm-918 CP918 ibm-918_VPUA ebcdic-cp-ar2 csIBM918 -ibm-918_X100-2000 ibm-918_STD -ibm-930 cp930 cpibm930 930 # Japan EBCDIC MIXED -ibm-933 cp933 cpibm933 933 # Korea EBCDIC MIXED -ibm-935 cp935 cpibm935 935 # China EBCDIC MIXED -ibm-937 cp937 cpibm937 937 # Taiwan EBCDIC MIXED -ibm-939 cp939 939 # Host MBCS (Latin-Kanji) EBCDIC -ibm-1025_P100-2000 ibm-1025 
ibm-1025_STD -ibm-1026_P100-2000 IBM1026 { IANA } ibm-1026 CP1026 csIBM1026 ibm-1026_STD -ibm-1047 cpibm1047 # EBCDIC Open systems Latin1 -ibm-1097_P100-2000 ibm-1097 ibm-1097_VPUA -ibm-1097_X100-2000 ibm-1097_STD -ibm-1112_P100-2000 ibm-1112 cp1112 1112 ibm-1112_STD -ibm-1122_P100-2000 ibm-1122 cp1122 ibm-1122 1122 ibm-1122_STD -ibm-1130_P100-2000 ibm-1130 ibm-1130_STD -ibm-1132_P100-2000 ibm-1132 ibm-1132_STD -ibm-1137_P100-2000 ibm-1137 ibm-1137_STD -ibm-1388_P103-2001 ibm-1388 # S-Ch DBCS-Host Data GBK mixed MBCS -ibm-9030_P100-2000 ibm-9030 ibm-9030_STD +ibm-37 IBM037 { IANA* } ibm-037 cpibm37 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 cp37 cp037 037 # EBCDIC US +ibm-273 IBM273 { IANA* } csIBM273 ebcdic-de cp273 cpibm273 273 # EBCDIC Germanay, Austria... +ibm-277 IBM277 { IANA* } EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 ebcdic-dk cp277 cpibm277 277 # EBCDIC Denmark... +ibm-278 IBM278 { IANA* } ebcdic-cp-fi ebcdic-cp-se csIBM278 ebcdic-sv cp278 cpibm278 278 # EBCDIC Sweden +ibm-280 IBM280 { IANA* } ebcdic-cp-it csIBM280 cp280 cpibm280 280 # EBCDIC Italy +ibm-284 IBM284 { IANA* } ebcdic-cp-es csIBM284 cp284 cpibm284 284 # EBCDIC Spain +ibm-285 IBM285 { IANA* } ebcdic-cp-gb csIBM285 ebcdic-gb cp285 cpibm285 285 # EBCDIC UK Ireland +ibm-290 IBM290 { IANA* } EBCDIC-JP-kana csIBM290 cp290 # host SBCS (Katakana) +ibm-297 IBM297 { IANA* } ebcdic-cp-fr csIBM297 cp297 cpibm297 297 # EBCDIC France +ibm-420 IBM420 { IANA* } ebcdic-cp-ar1 csIBM420 cp420 420 +ibm-424 IBM424 { IANA* } ebcdic-cp-he csIBM424 cp424 424 +ibm-500 IBM500 { IANA* } cpibm500 csIBM500 cp500 ebcdic-cp-be ebcdic-cp-ch 500 # EBCDIC International Latin1 +ibm-803 cp803 # Old EBCDIC Hebrew +ibm-834 cp834 # Korean DBCS Host +ibm-835 cp835 # DBCS T-Ch Host +ibm-870_P100-2000 IBM870 { IANA* } ibm-870 CP870 ibm-870_STD ebcdic-cp-roece ebcdic-cp-yu csIBM870 +ibm-871 IBM871 { IANA* } ebcdic-cp-is csIBM871 cpibm871 cp871 871 # EBCDIC Iceland +ibm-875_P100-2000 ibm-875 cp875 875 ibm-875_STD 
+ibm-918_P100-2000 IBM918 { IANA* } ibm-918 CP918 ibm-918_VPUA ebcdic-cp-ar2 csIBM918 +ibm-918_X100-2000 ibm-918_STD +ibm-930 cp930 cpibm930 930 # Japan EBCDIC MIXED +ibm-933 cp933 cpibm933 933 # Korea EBCDIC MIXED +ibm-935 cp935 cpibm935 935 # China EBCDIC MIXED +ibm-937 cp937 cpibm937 937 # Taiwan EBCDIC MIXED +ibm-939 cp939 939 # Host MBCS (Latin-Kanji) EBCDIC +ibm-1025_P100-2000 ibm-1025 ibm-1025_STD +ibm-1026_P100-2000 IBM1026 { IANA* } ibm-1026 CP1026 csIBM1026 ibm-1026_STD +ibm-1047 cpibm1047 # EBCDIC Open systems Latin1 +ibm-1097_P100-2000 ibm-1097 ibm-1097_VPUA +ibm-1097_X100-2000 ibm-1097_STD +ibm-1112_P100-2000 ibm-1112 cp1112 1112 ibm-1112_STD +ibm-1122_P100-2000 ibm-1122 cp1122 1122 ibm-1122_STD +ibm-1130_P100-2000 ibm-1130 ibm-1130_STD +ibm-1132_P100-2000 ibm-1132 ibm-1132_STD +ibm-1137_P100-2000 ibm-1137 ibm-1137_STD +ibm-1388_P103-2001 ibm-1388 # S-Ch DBCS-Host Data GBK mixed MBCS +ibm-9030_P100-2000 ibm-9030 ibm-9030_STD -#ibm-1046 # PC Arabic without EURO +#ibm-1046 # PC Arabic without EURO # with Euro -ibm-1123 cpibm1123 # EBCDIC Cyrillic Ukraine -ibm-1140 cpibm1140 IBM01140 { IANA } # EBCDIC US... -ibm-1141 cpibm1141 IBM01141 { IANA } # EBCDIC Germanay, Austria... -ibm-1142 cpibm1142 IBM01142 { IANA } # EBCDIC Denmark... 
-ibm-1143 cpibm1143 IBM01143 { IANA } # EBCDIC Sweden -ibm-1144 cpibm1144 # EBCDIC Italy -ibm-1145 cpibm1145 # EBCDIC Spain -ibm-1146 cpibm1146 # EBCDIC UK Ireland -ibm-1147 cpibm1147 # EBCDIC France -ibm-1148 cpibm1148 # EBCDIC International Latin1 -ibm-1149 cpibm1149 ebcdic-is # EBCDIC Iceland -ibm-1153 cpibm1153 # EBCDIC latin 2 -ibm-1154 cp1025 cpibm1154 # EBCDIC Cyrillic Multilingual -ibm-1155 cpibm1155 # EBCDIC Turkey -ibm-1156 cpibm1156 # EBCDIC Baltic Multilingual -ibm-1157 cpibm1157 # EBCDIC Estonia -ibm-1158 cp1123 cpibm1158 1123 # EBCDIC Cyrillic Ukraine -ibm-1159 cp28709 # SBCS T-Ch Host -ibm-1160 cp9030 cpibm1160 # EBCDIC Thailand -ibm-1164 cp1130 cpibm1164 # EBCDIC Viet Nam -ibm-1364_P110-2000 ibm-1364_VPUA ibm-1364 cp1364 # Korean Host Mixed -ibm-1371 cpibm1371 # Taiwan EBCDIC MIXED -ibm-1390 cpibm1390 # Japan EBCDIC MIXED -ibm-1399 # Host MBCS (Latin-Kanji) -ibm-4899 cpibm4899 # Old EBCDIC Hebrew -ibm-4971 cpibm4971 # EBCDIC Greek -ibm-5123 cp1027 # Host Roman Jis -ibm-8482 # host SBCS (Katakana) -ibm-9027 # DBCS T-Ch Host -ibm-12712 cpibm12712 ebcdic-he # EBCDIC Hebrew (new sheqel, control charaters update) -ibm-16684 cp300 # Jis + Roman Jis Host -ibm-16804 cpibm16804 ebcdic-ar # EBCDIC Arabic +ibm-1123 cpibm1123 # EBCDIC Cyrillic Ukraine +ibm-1140 cpibm1140 IBM01140 { IANA* } # EBCDIC US... +ibm-1141 cpibm1141 IBM01141 { IANA* } # EBCDIC Germanay, Austria... +ibm-1142 cpibm1142 IBM01142 { IANA* } # EBCDIC Denmark... 
+ibm-1143 cpibm1143 IBM01143 { IANA* } # EBCDIC Sweden +ibm-1144 cpibm1144 # EBCDIC Italy +ibm-1145 cpibm1145 # EBCDIC Spain +ibm-1146 cpibm1146 # EBCDIC UK Ireland +ibm-1147 cpibm1147 # EBCDIC France +ibm-1148 cpibm1148 # EBCDIC International Latin1 +ibm-1149 cpibm1149 ebcdic-is # EBCDIC Iceland +ibm-1153 cpibm1153 # EBCDIC latin 2 +ibm-1154 cp1025 cpibm1154 # EBCDIC Cyrillic Multilingual +ibm-1155 cpibm1155 # EBCDIC Turkey +ibm-1156 cpibm1156 # EBCDIC Baltic Multilingual +ibm-1157 cpibm1157 # EBCDIC Estonia +ibm-1158 cp1123 cpibm1158 1123 # EBCDIC Cyrillic Ukraine +ibm-1159 cp28709 # SBCS T-Ch Host +ibm-1160 cp9030 cpibm1160 # EBCDIC Thailand +ibm-1164 cp1130 cpibm1164 # EBCDIC Viet Nam +ibm-1364_P110-2000 ibm-1364_VPUA ibm-1364 cp1364 # Korean Host Mixed +ibm-1371 cpibm1371 # Taiwan EBCDIC MIXED +ibm-1390 cpibm1390 # Japan EBCDIC MIXED +ibm-1399 # Host MBCS (Latin-Kanji) +ibm-4899 cpibm4899 # Old EBCDIC Hebrew +ibm-4971 cpibm4971 # EBCDIC Greek +ibm-5123 cp1027 # Host Roman Jis +ibm-8482 # host SBCS (Katakana) +ibm-9027 # DBCS T-Ch Host +ibm-12712 cpibm12712 ebcdic-he # EBCDIC Hebrew (new sheqel, control charaters update) +ibm-16684 cp300 # Jis + Roman Jis Host +ibm-16804 cpibm16804 ebcdic-ar # EBCDIC Arabic # unsupported IANA names # ebcdic-it csEBCDICIT # ebcdic-es csEBCDICES # csEBCDICFR ebcdic-fr -# ibm-274 IBM274 { IANA } cp274 csIBM274 ebcdic-be -# ibm-870 IBM870 { IANA } ebcdic-cp-roece ebcdic-cp-yu csIBM870 cp870 870 +# ibm-274 IBM274 { IANA* } cp274 csIBM274 ebcdic-be +# ibm-870 IBM870 { IANA* } ebcdic-cp-roece ebcdic-cp-yu csIBM870 cp870 870 # EBCDIC codepages for S/390, with LF and NL codes swapped ebcdic-xml-us # without Euro -ibm-37-s390 ibm037-s390 # EBCDIC US -ibm-1047-s390 # EBCDIC for S/390 Open Edition +ibm-37-s390 ibm037-s390 # EBCDIC US +ibm-1047-s390 # EBCDIC for S/390 Open Edition # with Euro -ibm-1140-s390 # EBCDIC US -ibm-1142-s390 # EBCDIC Denmark -ibm-1143-s390 # EBCDIC Sweden -ibm-1144-s390 # EBCDIC Italy -ibm-1145-s390 # EBCDIC Spain 
-ibm-1146-s390 # EBCDIC UK Ireland -ibm-1147-s390 # EBCDIC France -ibm-1148-s390 # EBCDIC International Latin1 -ibm-1149-s390 # EBCDIC Iceland -ibm-1153-s390 # EBCDIC latin 2 -ibm-12712-s390 # EBCDIC Hebrew -ibm-16804-s390 # EBCDIC Arabic +ibm-1140-s390 # EBCDIC US +ibm-1142-s390 # EBCDIC Denmark +ibm-1143-s390 # EBCDIC Sweden +ibm-1144-s390 # EBCDIC Italy +ibm-1145-s390 # EBCDIC Spain +ibm-1146-s390 # EBCDIC UK Ireland +ibm-1147-s390 # EBCDIC France +ibm-1148-s390 # EBCDIC International Latin1 +ibm-1149-s390 # EBCDIC Iceland +ibm-1153-s390 # EBCDIC latin 2 +ibm-12712-s390 # EBCDIC Hebrew +ibm-16804-s390 # EBCDIC Arabic # GB 18030 is partly algorithmic, using the MBCS converter -gb18030 { IANA } ibm-1392 +gb18030 { IANA* } ibm-1392 diff --git a/icu4c/source/test/cintltst/ccapitst.c b/icu4c/source/test/cintltst/ccapitst.c index bd770a47109..68cdecb02d9 100644 --- a/icu4c/source/test/cintltst/ccapitst.c +++ b/icu4c/source/test/cintltst/ccapitst.c @@ -240,6 +240,8 @@ static void TestConvert() /*Testing ucnv_openU()*/ { UChar converterName[]={ 0x0069, 0x0062, 0x006d, 0x002d, 0x0039, 0x0034, 0x0033, 0x0000}; /*ibm-943*/ + UChar firstSortedName[]={ 0x0021, 0x0000}; /* ! */ + UChar lastSortedName[]={ 0x007E, 0x0000}; /* ~ */ const char *illegalNameChars={ "ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943"}; UChar illegalName[100]; UConverter *converter=NULL; @@ -269,8 +271,20 @@ static void TestConvert() if(!(err==U_ILLEGAL_ARGUMENT_ERROR)){ log_err("FAILURE! ucnv_openU(illegalName, err) is expected to fail\n"); } + + err=U_ZERO_ERROR; + ucnv_openU(firstSortedName, &err); + if(err!=U_FILE_ACCESS_ERROR){ + log_err("FAILURE! ucnv_openU(firstSortedName, err) is expected to fail\n"); + } + + err=U_ZERO_ERROR; + ucnv_openU(lastSortedName, &err); + if(err!=U_FILE_ACCESS_ERROR){ + log_err("FAILURE! 
ucnv_openU(lastSortedName, err) is expected to fail\n"); + } + err=U_ZERO_ERROR; - } log_verbose("Testing ucnv_open() with converter name greater than 7 characters\n"); { @@ -455,6 +469,11 @@ static void TestConvert() char* index = NULL; strcpy(ucs_file_name, loadTestData(&err)); + if(U_FAILURE(err)){ + log_err("Couldn't get the test data directory... Exiting...Error:%s\n", u_errorName(err)); + return; + } + index=strrchr(ucs_file_name,(char)U_FILE_SEP_CHAR); if((unsigned int)(index-ucs_file_name) != (strlen(ucs_file_name)-1)){ @@ -462,11 +481,6 @@ static void TestConvert() } strcat(ucs_file_name,".."U_FILE_SEP_STRING); - - if(U_FAILURE(err)){ - log_err("Couldn't get the test data directory... Exiting...Error:%s\n", u_errorName(err)); - return; - } strcat(ucs_file_name, CodePagesTestFiles[codepage_index]); ucs_file_in = fopen(ucs_file_name,"rb"); diff --git a/icu4c/source/test/cintltst/udatatst.c b/icu4c/source/test/cintltst/udatatst.c index 03dc13f9fbe..173d200fb36 100644 --- a/icu4c/source/test/cintltst/udatatst.c +++ b/icu4c/source/test/cintltst/udatatst.c @@ -67,7 +67,7 @@ static void TestUDataOpen(){ UErrorCode status=U_ZERO_ERROR; const char* memMap[][2]={ {"tz", "dat"}, - {"cnvalias", "dat"}, + {"cnvalias", "icu"}, {"unames", "dat"}, {"ibm-1141", "cnv"} }; @@ -388,7 +388,7 @@ isAcceptable1(void *context, pInfo->dataFormat[1]==0x76 && pInfo->dataFormat[2]==0x41 && pInfo->dataFormat[3]==0x6c && - pInfo->formatVersion[0]==2 ) + pInfo->formatVersion[0]==3 ) { log_verbose("The data from \"%s.%s\" IS acceptable using the verifing function isAcceptable1()\n", name, type); return TRUE; @@ -473,7 +473,7 @@ static void TestUDataOpenChoiceDemo1() { strcat(strcpy(testPath, u_getDataDirectory()), "testdata"); - result=udata_openChoice(NULL, type, name[0], isAcceptable1, NULL, &status); + result=udata_openChoice(NULL, "icu", name[0], isAcceptable1, NULL, &status); if(U_FAILURE(status)){ log_err("FAIL: udata_openChoice() failed name=%s, type=%s, \n errorcode=%s\n", 
name[0], type, myErrorName(status)); } else { @@ -624,7 +624,7 @@ static void TestUDataGetInfo() { log_verbose("Testing udata_getInfo() for cnvalias.dat\n"); - result=udata_open(NULL, type, name, &status); + result=udata_open(NULL, "icu", name, &status); if(U_FAILURE(status)){ log_err("FAIL: udata_open() failed for path = NULL, name=%s, type=%s, \n errorcode=%s\n", path, name, type, myErrorName(status)); return; @@ -677,32 +677,34 @@ static void TestUDataGetInfo() { static void TestUDataGetMemory() { UDataMemory *result; - const uint16_t *table=NULL; + const int32_t *table=NULL; uint16_t* intValue=0; UErrorCode status=U_ZERO_ERROR; const char* name="cnvalias"; - const char* type="dat"; + const char* type; const char* name2="test"; - char* testPath=(char*)malloc(sizeof(char) * (strlen(u_getDataDirectory()) + strlen("testdata") +1 ) ); + char* testPath=(char*)malloc(sizeof(char) * (strlen(u_getDataDirectory()) + strlen("testdata") +1 ) ); - strcat(strcpy(testPath, u_getDataDirectory()), "testdata"); + strcat(strcpy(testPath, u_getDataDirectory()), "testdata"); + type="icu"; log_verbose("Testing udata_getMemory for \"cnvalias.dat()\"\n"); result=udata_openChoice(NULL, type, name, isAcceptable1, NULL, &status); if(U_FAILURE(status)){ log_err("FAIL: udata_openChoice() failed for name=%s, type=%s, \n errorcode=%s\n", name, type, myErrorName(status)); return; } - table=(const uint16_t *)udata_getMemory(result); + table=(const uint32_t *)udata_getMemory(result); /* The alias table may list more converters than what's actually available now. 
[grhoten] */ - if(ucnv_countAvailable() > table[1+2*(*table)]) /*???*/ + if(ucnv_countAvailable() > table[1]) /*???*/ log_err("FAIL: udata_getMemory() failed ucnv_countAvailable returned = %d, expected = %d\n", ucnv_countAvailable(), table[1+2*(*table)]); udata_close(result); + type="dat"; log_verbose("Testing udata_getMemory for \"test.dat\"()\n"); result=udata_openChoice(testPath, type, name2, isAcceptable3, NULL, &status); if(U_FAILURE(status)){ diff --git a/icu4c/source/tools/gencnval/gencnval.c b/icu4c/source/tools/gencnval/gencnval.c index a4d75d5517e..1737d4261c3 100644 --- a/icu4c/source/tools/gencnval/gencnval.c +++ b/icu4c/source/tools/gencnval/gencnval.c @@ -25,6 +25,7 @@ #include "unicode/utypes.h" #include "unicode/putil.h" #include "unicode/ucnv.h" /* ucnv_compareNames() */ +#include "ucnv_io.h" #include "cmemory.h" #include "cstring.h" #include "filestrm.h" @@ -35,18 +36,41 @@ #include #include -/* TODO: Need to specify the maximum alias name length in a header (see ucnv_io.c::findalias()) */ +/* TODO: Need to check alias name length is less than UCNV_MAX_CONVERTER_NAME_LENGTH */ -#define STRING_STORE_SIZE 100000 -#define MAX_ALIAS_COUNT 2000 +/* STRING_STORE_SIZE + TAG_STORE_SIZE <= ((2^16 - 1) * 2) + That is the maximum size for the string stores combined + because the strings are index at 16-bit boundries by a + 16-bit index, and there is only one section for the + strings. + */ +#define STRING_STORE_SIZE 0x1FBFE /* 130046 */ +#define TAG_STORE_SIZE 0x400 /* 1024 */ -#define TAG_STORE_SIZE 20000 -#define MAX_TAG_COUNT 200 +/* The combined tag and converter count can affect the number of lists + created. The size of all lists must be less than (2^17 - 1) + because the lists are indexed as a 16-bit array with a 16-bit index. 
+ */ +#define MAX_TAG_COUNT 0x3F /* 63 */ +#define MAX_CONV_COUNT UCNV_CONVERTER_INDEX_MASK +#define MAX_ALIAS_COUNT 0xFFFF /* 65535 */ -#define MAX_LINE_SIZE 32767 +/* The maximum number of aliases that a standard tag/converter combination can have. + At this moment 6/18/2002, IANA has 12 names for ASCII. Don't go below 15 for + this value. I don't recommend more than 31 for this value. + */ +#define MAX_TC_ALIAS_COUNT 0x1F /* 31 */ + +#define MAX_LINE_SIZE 0x7FFF /* 32767 */ +#define MAX_LIST_SIZE 0xFFFF /* 65535 */ #define DATA_NAME "cnvalias" -#define DATA_TYPE "dat" +#define DATA_TYPE "icu" /* ICU alias table */ + +#define ALL_TAG_STR "ALL" +#define ALL_TAG_NUM 1 +#define EMPTY_TAG_NUM 0 +#define USER_TAG_NUM_START 2 /* UDataInfo cf. udata.h */ static const UDataInfo dataInfo={ @@ -59,7 +83,7 @@ static const UDataInfo dataInfo={ 0, {0x43, 0x76, 0x41, 0x6c}, /* dataFormat="CvAl" */ - {2, 1, 0, 0}, /* formatVersion */ + {3, 0, 0, 0}, /* formatVersion */ {1, 4, 2, 0} /* dataVersion */ }; @@ -73,34 +97,44 @@ static char stringStore[STRING_STORE_SIZE]; static StringBlock stringBlock = { stringStore, 0, STRING_STORE_SIZE }; typedef struct { - const char *alias; - uint16_t converter; -} Alias; - -static Alias aliases[MAX_ALIAS_COUNT]; -static uint16_t aliasCount=0; + uint16_t aliasCount; + uint16_t *aliases; /* Index into stringStore */ +} AliasList; typedef struct { - const char *converter; - uint16_t aliasCount; + uint16_t converter; /* Index into stringStore */ + uint16_t totalAliasCount; /* Total aliases in this column */ } Converter; -static Converter converters[MAX_ALIAS_COUNT]; +static Converter converters[MAX_CONV_COUNT]; static uint16_t converterCount=0; static char tagStore[TAG_STORE_SIZE]; static StringBlock tagBlock = { tagStore, 0, TAG_STORE_SIZE }; typedef struct { - const char *tag; - const char *aliases[MAX_ALIAS_COUNT]; + uint16_t tag; /* Index into tagStore */ + uint16_t totalAliasCount; /* Total aliases in this row */ + AliasList 
aliasList[MAX_CONV_COUNT]; } Tag; +/* Think of this as a 3D array. It's tagCount by converterCount by aliasCount */ static Tag tags[MAX_TAG_COUNT]; static uint16_t tagCount = 0; +/* Used for storing all aliases */ +static uint16_t knownAliases[MAX_ALIAS_COUNT]; +static uint16_t knownAliasesCount = 0; +/*static uint16_t duplicateKnownAliasesCount = 0;*/ + +/* Used for storing the lists section that point to aliases */ +static uint16_t aliasLists[MAX_LIST_SIZE]; +static uint16_t aliasListsSize = 0; + /* Were the standard tags declared before the aliases. */ -UBool standardTagsUsed = FALSE; +static UBool standardTagsUsed = FALSE; +static UBool verbose = FALSE; +static int32_t lineNum = 1; /* prototypes --------------------------------------------------------------- */ @@ -117,7 +151,7 @@ static void addOfficialTaggedStandards(char *line, int32_t lineLen); static uint16_t -addAlias(const char *alias, uint16_t converter); +addAlias(const char *alias, uint16_t standard, uint16_t converter, UBool defaultName); static uint16_t addConverter(const char *converter); @@ -125,20 +159,45 @@ addConverter(const char *converter); static char * allocString(StringBlock *block, uint32_t length); +static uint16_t +addToKnownAliases(const char *alias); + static int compareAliases(const void *alias1, const void *alias2); static uint16_t getTagNumber(const char *tag, uint16_t tagLen); +/*static void +addTaggedAlias(uint16_t tag, const char *alias, uint16_t converter);*/ + static void -addTaggedAlias(uint16_t tag, const char *alias, uint16_t converter); +writeAliasTable(UNewDataMemory *out); /* -------------------------------------------------------------------------- */ +/* Presumes that you used allocString() */ +#define GET_ALIAS_STR(index) (stringStore + ((size_t)(index) << 1)) +#define GET_TAG_STR(index) (tagStore + ((size_t)(index) << 1)) + +/* Presumes that you used allocString() */ +#define GET_ALIAS_NUM(str) ((uint16_t)((str - stringStore) >> 1)) +#define GET_TAG_NUM(str) 
((uint16_t)((str - tagStore) >> 1)) + +enum +{ + HELP1, + HELP2, + VERBOSE, + COPYRIGHT, + DESTDIR, + SOURCEDIR +}; + static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, UOPTION_COPYRIGHT, UOPTION_DESTDIR, UOPTION_SOURCEDIR @@ -151,8 +210,6 @@ main(int argc, char* argv[]) { FileStream *in; UNewDataMemory *out; UErrorCode errorCode=U_ZERO_ERROR; - int i; - uint16_t tagOffset, stringOffset; U_MAIN_INIT_ARGS(argc, argv); @@ -166,12 +223,13 @@ main(int argc, char* argv[]) { "error in command line argument \"%s\"\n", argv[-argc]); } - if(argc<0 || options[0].doesOccur || options[1].doesOccur) { + if(argc<0 || options[HELP1].doesOccur || options[HELP2].doesOccur) { fprintf(stderr, "usage: %s [-options] [convrtrs.txt]\n" "\tread convrtrs.txt and create " DATA_NAME "." DATA_TYPE "\n" "options:\n" "\t-h or -? or --help this usage text\n" + "\t-v or --verbose prints out extra information about the alias table\n" "\t-c or --copyright include a copyright notice\n" "\t-d or --destdir destination directory, followed by the path\n" "\t-s or --sourcedir source directory, followed by the path\n", @@ -179,6 +237,10 @@ main(int argc, char* argv[]) { return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } + if(options[VERBOSE].doesOccur) { + verbose = TRUE; + } + if(argc>=2) { path=argv[1]; } else { @@ -195,19 +257,23 @@ main(int argc, char* argv[]) { path = "convrtrs.txt"; } } + + uprv_memset(stringStore, 0, sizeof(stringStore)); + uprv_memset(tagStore, 0, sizeof(tagStore)); + uprv_memset(converters, 0, sizeof(converters)); + uprv_memset(tags, 0, sizeof(tags)); + uprv_memset(aliasLists, 0, sizeof(aliasLists)); + uprv_memset(knownAliases, 0, sizeof(aliasLists)); + + in=T_FileStream_open(path, "r"); if(in==NULL) { fprintf(stderr, "gencnval: unable to open input file convrtrs.txt\n"); exit(U_FILE_ACCESS_ERROR); } - - parseFile(in); T_FileStream_close(in); - /* sort the aliases */ - qsort(aliases, aliasCount, sizeof(Alias), compareAliases); - /* create the output file */ out=udata_create(options[3].value, DATA_TYPE, DATA_NAME, &dataInfo, options[2].doesOccur ? U_COPYRIGHT_STRING : NULL, &errorCode); @@ -216,47 +282,8 @@ main(int argc, char* argv[]) { exit(errorCode); } - /* determine the length of tables for the data offset of the strings */ - tagOffset = (uint16_t)(2 + 4 * aliasCount + 2 + 4 * converterCount); - stringOffset = (uint16_t)(tagOffset + 2 + (2 * tagCount) * converterCount + tagBlock.top); - - /* write the table of aliases */ - udata_write16(out, aliasCount); - for(i=0; i 0 && isspace(*lastLine))) { uprv_strcpy(line + lineSize, lastLine); @@ -292,21 +324,28 @@ parseFile(FileStream *in) { validParse = TRUE; break; } + lineNum++; } - if (validParse) { + if (validParse || lineSize > 0) { if (isspace(*line)) { - fprintf(stderr, "error: line %d: cannot start an alias with a space\n", lineNum-2); - exit(1); + fprintf(stderr, "error(line %d): cannot start an alias with a space\n", lineNum-1); + exit(U_PARSE_ERROR); } else if (line[0] == '{') { if (!standardTagsUsed && line[lineSize - 1] != '}') { - fprintf(stderr, "error: line %d: alias needs to start with a converter name\n", lineNum); - exit(1); + fprintf(stderr, 
"error(line %d): alias needs to start with a converter name\n", lineNum); + exit(U_PARSE_ERROR); } addOfficialTaggedStandards(line, lineSize); standardTagsUsed = TRUE; } else { - parseLine(line); + if (standardTagsUsed) { + parseLine(line); + } + else { + fprintf(stderr, "error(line %d): alias table needs to start a list of standard tags\n", lineNum); + exit(U_PARSE_ERROR); + } } /* Was the last line consumed */ if (lastLineSize > 0) { @@ -317,6 +356,7 @@ parseFile(FileStream *in) { lineSize = 0; } } + lineNum++; } } @@ -351,9 +391,11 @@ parseLine(const char *line) { char *converter, *alias; /* skip leading white space */ - while(line[pos]!=0 && isspace((unsigned char)line[pos])) { + /* There is no whitespace at the beginning anymore */ +/* while(line[pos]!=0 && isspace(line[pos])) { ++pos; } +*/ /* is there nothing on this line? */ if(line[pos]==0) { @@ -362,7 +404,7 @@ parseLine(const char *line) { /* get the converter name */ start=pos; - while(line[pos]!=0 && !isspace((unsigned char)line[pos])) { + while(line[pos]!=0 && !isspace(line[pos])) { ++pos; } limit=pos; @@ -376,75 +418,49 @@ parseLine(const char *line) { /* add the converter to the converter table */ cnv=addConverter(converter); - /* add the converter as its own alias to the alias table */ - addAlias(alias = converter, cnv); - - /* skip white space */ - while(line[pos]!=0 && isspace((unsigned char)line[pos])) { - ++pos; - } - - /* handle tags if they are present; sloppy, shouldn't copy/paste this */ - if (line[pos] == '{') { - ++pos; - do { - start = pos; - while (line[pos] && line[pos] != '}' && line[pos] != '#' && !isspace((unsigned char) line[pos])) { - ++pos; - } - limit = pos; - - if (start != limit) { - uint16_t tag; - - /* add the tag to the tag table */ - tag = getTagNumber(line + start, (uint16_t)(limit - start)); - addTaggedAlias(tag, alias, cnv); - } - - while (line[pos] && isspace((unsigned char)line[pos])) { - ++pos; - } - } while (line[pos] && line[pos] != '}' && line[pos] != '#'); - - if 
(line[pos] == '}') { - ++pos; - } else { - fprintf(stderr, "unterminated tag list in: %s\n", line); - exit(U_PARSE_ERROR); - } - } + /* The name itself may be tagged, so let's added it to the aliases list properly */ + pos = start; /* get all the real aliases */ for(;;) { + /* skip white space */ - while(line[pos]!=0 && isspace((unsigned char)line[pos])) { + while(line[pos]!=0 && isspace(line[pos])) { ++pos; } /* is there no more alias name on this line? */ - if(line[pos]==0 || line[pos]=='#') { + if(line[pos]==0) { break; } /* get an alias name */ start=pos; - while(line[pos]!=0 && line[pos]!='{' && line[pos]!='#' && !isspace((unsigned char)line[pos])) { + while(line[pos]!=0 && line[pos]!='{' && !isspace(line[pos])) { ++pos; } limit=pos; /* store the alias name */ length=(uint16_t)(limit-start); - alias=allocString(&stringBlock, length+1); - uprv_memcpy(alias, line+start, length); - alias[length]=0; + if (start == 0) { + /* add the converter as its own alias to the alias table */ + alias = converter; + addAlias(alias, ALL_TAG_NUM, cnv, TRUE); + } + else { + alias=allocString(&stringBlock, length+1); + uprv_memcpy(alias, line+start, length); + alias[length]=0; + addAlias(alias, ALL_TAG_NUM, cnv, FALSE); + } + addToKnownAliases(alias); /* add the alias/converter pair to the alias table */ - addAlias(alias, cnv); + /* addAlias(alias, 0, cnv, FALSE);*/ /* skip whitespace */ - while (line[pos] && isspace((unsigned char)line[pos])) { + while (line[pos] && isspace(line[pos])) { ++pos; } @@ -453,30 +469,30 @@ parseLine(const char *line) { ++pos; do { start = pos; - while (line[pos] && line[pos] != '}' && line[pos] != '#' && !isspace((unsigned char) line[pos])) { + while (line[pos] && line[pos] != '}' && !isspace( line[pos])) { ++pos; } limit = pos; if (start != limit) { - uint16_t tag; - /* add the tag to the tag table */ - tag = getTagNumber(line + start, (uint16_t)(limit - start)); - addTaggedAlias(tag, alias, cnv); + uint16_t tag = getTagNumber(line + start, 
(uint16_t)(limit - start)); + addAlias(alias, tag, cnv, (UBool)(line[limit-1] == '*')); } - while (line[pos] && isspace((unsigned char)line[pos])) { + while (line[pos] && isspace(line[pos])) { ++pos; } - } while (line[pos] && line[pos] != '}' && line[pos] != '#'); + } while (line[pos] && line[pos] != '}'); if (line[pos] == '}') { ++pos; } else { - fprintf(stderr, "unterminated tag list in: %s\n", line); - exit(U_PARSE_ERROR); + fprintf(stderr, "error(line %d): Unterminated tag list\n", lineNum); + exit(U_UNMATCHED_BRACES); } + } else { + addAlias(alias, EMPTY_TAG_NUM, cnv, (UBool)(tags[0].aliasList[cnv].aliasCount == 0)); } } } @@ -485,10 +501,10 @@ static uint16_t getTagNumber(const char *tag, uint16_t tagLen) { char *atag; uint16_t t; - UBool preferredName = (tag[tagLen - 1] == '*'); + UBool preferredName = ((tagLen > 0) ? (tag[tagLen - 1] == '*') : (FALSE)); if (tagCount >= MAX_TAG_COUNT) { - fprintf(stderr, "gencnval: too many tags\n"); + fprintf(stderr, "error(line %d): too many tags\n", lineNum); exit(U_BUFFER_OVERFLOW_ERROR); } @@ -498,14 +514,15 @@ getTagNumber(const char *tag, uint16_t tagLen) { } for (t = 0; t < tagCount; ++t) { - if (uprv_strlen(tags[t].tag) == tagLen && !uprv_strnicmp(tags[t].tag, tag, tagLen)) { + const char *currTag = GET_TAG_STR(tags[t].tag); + if (uprv_strlen(currTag) == tagLen && !uprv_strnicmp(currTag, tag, tagLen)) { return t; } } /* we need to add this tag */ if (tagCount >= MAX_TAG_COUNT) { - fprintf(stderr, "gencnval: too many tags\n"); + fprintf(stderr, "error(line %d): too many tags\n", lineNum); exit(U_BUFFER_OVERFLOW_ERROR); } @@ -515,25 +532,27 @@ getTagNumber(const char *tag, uint16_t tagLen) { atag[tagLen] = 0; if (standardTagsUsed) { - fprintf(stderr, "error: Tag \"%s\" is not declared at the beginning of the alias table.\n", atag); + fprintf(stderr, "error(line %d): Tag \"%s\" is not declared at the beginning of the alias table.\n", + lineNum, atag); exit(1); } - else { - fprintf(stderr, "warning: Tag \"%s\" was added 
to the list of standards because it was not declared at beginning of the alias table.\n", atag); + else if (tagLen > 0 && strcmp(tag, ALL_TAG_STR) != 0) { + fprintf(stderr, "warning(line %d): Tag \"%s\" was added to the list of standards because it was not declared at beginning of the alias table.\n", + lineNum, atag); } /* add the tag to the tag table */ - tags[tagCount].tag = atag; - /* Set the array of pointers to NULL */ - uprv_memset((void *)&tags[tagCount].aliases, 0, sizeof(tags[tagCount].aliases)); + tags[tagCount].tag = GET_TAG_NUM(atag); + /* The aliasList should be set to 0's already */ return tagCount++; } -static void +/*static void addTaggedAlias(uint16_t tag, const char *alias, uint16_t converter) { tags[tag].aliases[converter] = alias; } +*/ static void addOfficialTaggedStandards(char *line, int32_t lineLen) { @@ -542,8 +561,8 @@ addOfficialTaggedStandards(char *line, int32_t lineLen) { uint16_t tagSize; static const char WHITESPACE[] = " \t"; - if (tagCount >= MAX_TAG_COUNT) { - fprintf(stderr, "gencnval: too many tags\n"); + if (tagCount > USER_TAG_NUM_START) { + fprintf(stderr, "error(line %d): official tags already added\n", lineNum); exit(U_BUFFER_OVERFLOW_ERROR); } strchr(tag, '}')[0] = 0; @@ -559,10 +578,9 @@ addOfficialTaggedStandards(char *line, int32_t lineLen) { uprv_memcpy(atag, tag, tagSize); /* add the tag to the tag table */ - tags[tagCount].tag = atag; - /* Set the array of pointers to NULL */ - uprv_memset((void *)&tags[tagCount].aliases, 0, sizeof(tags[tagCount].aliases)); - tagCount++; + tags[tagCount++].tag = (uint16_t)((atag - tagStore) >> 1); + + /* The aliasList should already be set to 0's */ /* Get next tag */ tag = strtok(NULL, WHITESPACE); @@ -570,42 +588,396 @@ addOfficialTaggedStandards(char *line, int32_t lineLen) { } static uint16_t -addAlias(const char *alias, uint16_t converter) { - if(aliasCount>=MAX_ALIAS_COUNT) { - fprintf(stderr, "gencnval: too many aliases\n"); +addToKnownAliases(const char *alias) { +/* 
uint32_t idx; */ + /* strict matching */ +/* for (idx = 0; idx < knownAliasesCount; idx++) { + uint16_t num = GET_ALIAS_NUM(alias); + if (knownAliases[idx] != num + && uprv_strcmp(alias, GET_ALIAS_STR(knownAliases[idx])) == 0) + { + fprintf(stderr, "warning(line %d): duplicate alias %s and %s found\n", + lineNum, alias, GET_ALIAS_STR(knownAliases[idx])); + duplicateKnownAliasesCount++; + break; + } + else if (knownAliases[idx] != num + && ucnv_compareNames(alias, GET_ALIAS_STR(knownAliases[idx])) == 0) + { + if (verbose) { + fprintf(stderr, "information(line %d): duplicate alias %s and %s found\n", + lineNum, alias, GET_ALIAS_STR(knownAliases[idx])); + } + duplicateKnownAliasesCount++; + break; + } + } +*/ + if (knownAliasesCount >= MAX_ALIAS_COUNT) { + fprintf(stderr, "warning(line %d): Too many aliases defined for all converters\n", + lineNum); + exit(U_BUFFER_OVERFLOW_ERROR); + } + /* TODO: We could try to unlist exact duplicates. */ + return knownAliases[knownAliasesCount++] = GET_ALIAS_NUM(alias); +} + +/* +@param When standard is 0, then it's the default tag. 
+*/ +static uint16_t +addAlias(const char *alias, uint16_t standard, uint16_t converter, UBool defaultName) { + uint32_t idx, idx2; + UBool dupFound = FALSE; + UBool startEmptyWithoutDefault = FALSE; + AliasList *aliasList; + + if(standard>=MAX_TAG_COUNT) { + fprintf(stderr, "error(line %d): too many standard tags\n", lineNum); + exit(U_BUFFER_OVERFLOW_ERROR); + } + if(converter>=MAX_CONV_COUNT) { + fprintf(stderr, "error(line %d): too many converter names\n", lineNum); + exit(U_BUFFER_OVERFLOW_ERROR); + } + aliasList = &tags[standard].aliasList[converter]; + + if (strchr(alias, '}')) { + fprintf(stderr, "error(line %d): unmatched } found\n", + lineNum); + } + + if(aliasList->aliasCount + 1 >= MAX_TC_ALIAS_COUNT) { + fprintf(stderr, "error(line %d): too many aliases for alias %s and converter %s\n", + lineNum, alias, GET_ALIAS_STR(converters[converter].converter)); exit(U_BUFFER_OVERFLOW_ERROR); } - /* TODO: Check for duplicates */ - aliases[aliasCount].alias = alias; - aliases[aliasCount].converter = converter; + /* Check for duplicates in a tag/converter combination */ + for (idx = 0; idx < aliasList->aliasCount; idx++) { + uint16_t aliasNum = tags[standard].aliasList[converter].aliases[idx]; + if (aliasNum && ucnv_compareNames(alias, GET_ALIAS_STR(aliasNum)) == 0 && standard != ALL_TAG_NUM) + { + fprintf(stderr, "warning(line %d): duplicate alias %s and %s found for standard %s\n", + lineNum, alias, GET_ALIAS_STR(aliasNum), GET_TAG_STR(tags[standard].tag)); + dupFound = TRUE; + break; + } + } - converters[converter].aliasCount++; + if (!dupFound && standard != ALL_TAG_NUM) { + /* Check for duplicate aliases for this tag on all converters */ + for (idx = 0; idx < converterCount; idx++) { + for (idx2 = 0; idx2 < tags[standard].aliasList[idx].aliasCount; idx2++) { + uint16_t aliasNum = tags[standard].aliasList[idx].aliases[idx2]; + if (aliasNum + && ucnv_compareNames(alias, GET_ALIAS_STR(aliasNum)) == 0) + { + fprintf(stderr, "warning(line %d): duplicate alias %s 
found for standard tag %s between converter %s and converter %s\n", + lineNum, alias, GET_TAG_STR(tags[standard].tag), GET_ALIAS_STR(converters[converter].converter), GET_ALIAS_STR(converters[idx].converter)); + dupFound = TRUE; + break; + } + } + } - return aliasCount++; + /* Check for duplicate default aliases for this converter on all tags */ + /* It's okay to have multiple standards prefer the same name */ +/* if (verbose && !dupFound) { + for (idx = 0; idx < tagCount; idx++) { + if (tags[idx].aliasList[converter].aliases) { + uint16_t aliasNum = tags[idx].aliasList[converter].aliases[0]; + if (aliasNum + && ucnv_compareNames(alias, GET_ALIAS_STR(aliasNum)) == 0) + { + fprintf(stderr, "warning(line %d): duplicate alias %s found for converter %s and standard tag %s\n", + lineNum, alias, GET_ALIAS_STR(converters[converter].converter), GET_TAG_STR(tags[standard].tag)); + break; + } + } + } + }*/ + } + + if (aliasList->aliasCount <= 0) { + aliasList->aliasCount++; + startEmptyWithoutDefault = TRUE; + } + aliasList->aliases = uprv_realloc(aliasList->aliases, (aliasList->aliasCount + 1) * sizeof(aliasList->aliases[0])); + if (startEmptyWithoutDefault) { + aliasList->aliases[0] = 0; + } + if (defaultName) { + if (aliasList->aliases[0] != 0) { + fprintf(stderr, "error(line %d): Alias %s and %s cannot both be the default alias for standard tag %s and converter %s\n", + lineNum, + alias, + GET_ALIAS_STR(aliasList->aliases[0]), + GET_TAG_STR(tags[standard].tag), + GET_ALIAS_STR(converters[converter].converter)); + exit(U_PARSE_ERROR); + } + aliasList->aliases[0] = GET_ALIAS_NUM(alias); + } else { + aliasList->aliases[aliasList->aliasCount++] = GET_ALIAS_NUM(alias); + } +/* aliasList->converter = converter;*/ + + converters[converter].totalAliasCount++; /* One more to the column */ + tags[standard].totalAliasCount++; /* One more to the row */ + + return aliasList->aliasCount; } static uint16_t addConverter(const char *converter) { - if(converterCount>=MAX_ALIAS_COUNT) { - 
fprintf(stderr, "gencnval: too many converters\n"); + uint32_t idx; + if(converterCount>=MAX_CONV_COUNT) { + fprintf(stderr, "error(line %d): too many converters\n", lineNum); exit(U_BUFFER_OVERFLOW_ERROR); } - /* TODO: Check for duplicates */ - converters[converterCount].converter = converter; - converters[converterCount].aliasCount = 0; + for (idx = 0; idx < converterCount; idx++) { + if (ucnv_compareNames(converter, GET_ALIAS_STR(converters[idx].converter)) == 0) { + fprintf(stderr, "error(line %d): duplicate converter %s found!\n", lineNum, converter); + exit(U_PARSE_ERROR); + break; + } + } + + converters[converterCount].converter = GET_ALIAS_NUM(converter); + converters[converterCount].totalAliasCount = 0; return converterCount++; } +/* resolve this alias based on the prioritization of the standard tags. */ +static void +resolveAliasToConverter(uint16_t alias, uint16_t *tagNum, uint16_t *converterNum) { + uint16_t idx, idx2, idx3; + + for (idx = USER_TAG_NUM_START; idx < tagCount; idx++) { + for (idx2 = 0; idx2 < converterCount; idx2++) { + for (idx3 = 0; idx3 < tags[idx].aliasList[idx2].aliasCount; idx3++) { + uint16_t aliasNum = tags[idx].aliasList[idx2].aliases[idx3]; + if (aliasNum == alias) { + *tagNum = idx; + *converterNum = idx2; + return; + } + } + } + } + /* Do the leftovers last, just in case */ + /* There is no need to do the ALL tag */ + idx = 0; + for (idx2 = 0; idx2 < converterCount; idx2++) { + for (idx3 = 0; idx3 < tags[idx].aliasList[idx2].aliasCount; idx3++) { + uint16_t aliasNum = tags[idx].aliasList[idx2].aliases[idx3]; + if (aliasNum == alias) { + *tagNum = idx; + *converterNum = idx2; + return; + } + } + } + *tagNum = UINT16_MAX; + *converterNum = UINT16_MAX; + fprintf(stderr, "warning: alias %s not found\n", + GET_ALIAS_STR(alias)); + return; +} + +/* The knownAliases should be sorted before calling this function */ +static uint32_t +resolveAliases(uint16_t *uniqueAliasArr, uint16_t *uniqueAliasToConverterArr, uint16_t aliasOffset) { + 
uint32_t uniqueAliasIdx = 0; + uint32_t idx; + uint16_t currTagNum, oldTagNum; + uint16_t currConvNum; + const char *lastName; + + resolveAliasToConverter(knownAliases[0], &oldTagNum, &currConvNum); + uniqueAliasToConverterArr[uniqueAliasIdx] = currConvNum; + uniqueAliasArr[uniqueAliasIdx] = knownAliases[0] + aliasOffset; + uniqueAliasIdx++; + lastName = GET_ALIAS_STR(knownAliases[0]); + + for (idx = 1; idx < knownAliasesCount; idx++) { + resolveAliasToConverter(knownAliases[idx], &currTagNum, &currConvNum); + if (ucnv_compareNames(lastName, GET_ALIAS_STR(knownAliases[idx])) == 0) { + /* duplicate found */ + if (currTagNum > oldTagNum) { + oldTagNum = currTagNum; + uniqueAliasToConverterArr[uniqueAliasIdx - 1] = currConvNum; + uniqueAliasArr[uniqueAliasIdx - 1] = knownAliases[idx] + aliasOffset; + if (verbose) { + printf("using %s instead of %s -> %s", + GET_ALIAS_STR(knownAliases[idx]), + lastName, + GET_ALIAS_STR(converters[currConvNum].converter)); + if (uniqueAliasToConverterArr[uniqueAliasIdx - 1] != currConvNum) { + printf(" (alias conflict)"); + } + puts(""); + } + } + else { + /* else ignore it */ + if (verbose) { + printf("folding %s into %s -> %s", + GET_ALIAS_STR(knownAliases[idx]), + lastName, + GET_ALIAS_STR(converters[currConvNum].converter)); + if (uniqueAliasToConverterArr[uniqueAliasIdx - 1] != currConvNum) { + printf(" (alias conflict)"); + } + puts(""); + } + } + if (uniqueAliasToConverterArr[uniqueAliasIdx - 1] != currConvNum) { + uniqueAliasToConverterArr[uniqueAliasIdx - 1] |= UCNV_AMBIGUOUS_ALIAS_MAP_BIT; + } + } + else { + uniqueAliasToConverterArr[uniqueAliasIdx] = currConvNum; + uniqueAliasArr[uniqueAliasIdx] = knownAliases[idx] + aliasOffset; + uniqueAliasIdx++; + lastName = GET_ALIAS_STR(knownAliases[idx]); + /*printf("%s -> %s\n", GET_ALIAS_STR(knownAliases[idx]), GET_ALIAS_STR(converters[currConvNum].converter));*/ + } + } + return uniqueAliasIdx; +} + +static void +createOneAliasList(uint16_t *aliasArrLists, uint32_t tag, uint32_t 
converter, uint16_t offset) { + uint32_t aliasNum; + AliasList *aliasList = &tags[tag].aliasList[converter]; + + if (aliasList->aliasCount == 0) { + aliasArrLists[tag*converterCount + converter] = 0; + } + else { + aliasLists[aliasListsSize++] = aliasList->aliasCount; + + /* write into the array area a 1's based index. */ + aliasArrLists[tag*converterCount + converter] = aliasListsSize; + +/* printf("tag %s converter %s\n", + GET_TAG_STR(tags[tag].tag), + GET_ALIAS_STR(converters[converter].converter));*/ + for (aliasNum = 0; aliasNum < aliasList->aliasCount; aliasNum++) { + uint16_t value; +/* printf(" %s\n", + GET_ALIAS_STR(aliasList->aliases[aliasNum]));*/ + if (aliasList->aliases[aliasNum]) { + value = aliasList->aliases[aliasNum] + offset; + } else { + value = 0; + if (tag != 0) { /* Only show the warning when it's not the leftover tag. */ + printf("warning: tag %s does not have a default alias for %s\n", + GET_TAG_STR(tags[tag].tag), + GET_ALIAS_STR(converters[converter].converter)); + } + } + aliasLists[aliasListsSize++] = value; + if (aliasListsSize >= MAX_LIST_SIZE) { + fprintf(stderr, "error: Too many alias lists\n", lineNum); + exit(U_BUFFER_OVERFLOW_ERROR); + } + + } + } +} + +static void +writeAliasTable(UNewDataMemory *out) { + UBool dupFound = FALSE; + uint32_t i, j; + uint32_t uniqueAliasesSize; + uint16_t aliasOffset = (uint16_t)(tagBlock.top/sizeof(uint16_t)); + uint16_t *aliasArrLists = (uint16_t *)uprv_malloc(tagCount * converterCount * sizeof(uint16_t)); + uint16_t *uniqueAliases = (uint16_t *)uprv_malloc(knownAliasesCount * sizeof(uint16_t)); + uint16_t *uniqueAliasesToConverter = (uint16_t *)uprv_malloc(knownAliasesCount * sizeof(uint16_t)); + + qsort(knownAliases, knownAliasesCount, sizeof(knownAliases[0]), compareAliases); + uniqueAliasesSize = resolveAliases(uniqueAliases, uniqueAliasesToConverter, aliasOffset); + + /* Array index starts at 1. aliasLists[0] is the size of the lists section. 
*/ + aliasListsSize = 0; + + /* write the offsets of all the aliases lists in a 2D array, and create the lists. */ + for (i = 0; i < tagCount; ++i) { + for (j = 0; j < converterCount; ++j) { + createOneAliasList(aliasArrLists, i, j, aliasOffset); + } + } + + /* Write the size of the TOC */ + udata_write32(out, 8); + + /* Write the sizes of each section */ + /* All sizes are the number of uint16_t units, not bytes */ + udata_write32(out, converterCount); + udata_write32(out, tagCount); + udata_write32(out, uniqueAliasesSize); /* list of aliases */ + udata_write32(out, uniqueAliasesSize); /* The preresolved form of mapping an untagged the alias to a converter */ + udata_write32(out, tagCount * converterCount); + udata_write32(out, aliasListsSize + 1); + udata_write32(out, 0); /* Reserved space. */ + udata_write32(out, (tagBlock.top + stringBlock.top) / sizeof(uint16_t)); + + /* write the table of converters */ + /* Think of this as the column headers */ + for(i=0; itop+length; + /* The (length&1) is used to keep the addresses on a 16-bit boundary */ + uint32_t top=block->top + length + (length&1); char *p; - if(top > block->max) { - fprintf(stderr, "gencnval: out of memory\n"); + if(top >= block->max) { + fprintf(stderr, "error(line %d): out of memory\n", lineNum); exit(U_MEMORY_ALLOCATION_ERROR); } p = block->store + block->top; @@ -615,7 +987,8 @@ allocString(StringBlock *block, uint32_t length) { static int compareAliases(const void *alias1, const void *alias2) { - return ucnv_compareNames(((Alias*)alias1)->alias, ((Alias*)alias2)->alias); + /* Names like IBM850 and ibm-850 need to be sorted together */ + return ucnv_compareNames(GET_ALIAS_STR(*(uint16_t*)alias1), GET_ALIAS_STR(*(uint16_t*)alias2)); } /*