diff --git a/icu4c/source/common/ucnv_io.c b/icu4c/source/common/ucnv_io.c index ca9050a032f..a047ca1ae4d 100644 --- a/icu4c/source/common/ucnv_io.c +++ b/icu4c/source/common/ucnv_io.c @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 1999-2005, International Business Machines +* Copyright (C) 1999-2006, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -94,8 +94,10 @@ * the third dimension to the section 5. No other section should be referencing * this section. * - * 7) Reserved at this time (There is no information). This _usually_ has a - * size of 0. Future versions may add more information here. + * 7) Starting in ICU 3.6, this can be a UConverterAliasOptions struct. Its + * presence indicates that a section 9 exists. UConverterAliasOptions specifies + * what type of string normalization is used among other potential things in the + * future. * * 8) This is the string table. All strings are indexed on an even address. * There are two reasons for this. First many chip architectures locate strings @@ -103,6 +105,10 @@ * numbers, this string table can be 128KB in size instead of 64KB when we * only have strings starting on an even address. * + * 9) When present this is a set of prenormalized strings from section 8. This + * table contains normalized strings with the dashes and spaces stripped out, + * and all strings lowercased. In the future, the options in section 7 may state + * other types of normalization. * * Here is the concept of section 5 and 6. It's a 3D cube. Each tag * has a unique alias among all converters. That same alias can @@ -173,15 +179,20 @@ enum { untaggedConvArrayIndex=4, taggedAliasArrayIndex=5, taggedAliasListsIndex=6, - reservedIndex1=7, + tableOptions=7, stringTableIndex=8, - minTocLength=8, /* min. tocLength in the file, does not count the tocLengthIndex! 
*/ - offsetsCount /* length of the swapper's temporary offsets[] */ + normalizedStringTableIndex=9, + offsetsCount, /* length of the swapper's temporary offsets[] */ + minTocLength=8 /* min. tocLength in the file, does not count the tocLengthIndex! */ }; +static const UConverterAliasOptions defaultTableOptions = { + UCNV_IO_UNNORMALIZED +}; static UConverterAlias gMainTable; #define GET_STRING(idx) (const char *)(gMainTable.stringTable + (idx)) +#define GET_NORMALIZED_STRING(idx) (const char *)(gMainTable.normalizedStringTable + (idx)) static UBool U_CALLCONV isAcceptable(void *context, @@ -226,7 +237,6 @@ haveAliasData(UErrorCode *pErrorCode) { const uint16_t *table = NULL; uint32_t tableStart; uint32_t currOffset; - uint32_t reservedSize1; data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); if(U_FAILURE(*pErrorCode)) { @@ -253,8 +263,12 @@ haveAliasData(UErrorCode *pErrorCode) { gMainTable.untaggedConvArraySize = ((const uint32_t *)(table))[4]; gMainTable.taggedAliasArraySize = ((const uint32_t *)(table))[5]; gMainTable.taggedAliasListsSize = ((const uint32_t *)(table))[6]; - reservedSize1 = ((const uint32_t *)(table))[7]; /* reserved */ - /*gStringTableSize = ((const uint32_t *)(table))[8];*/ + gMainTable.optionTableSize = ((const uint32_t *)(table))[7]; + gMainTable.stringTableSize = ((const uint32_t *)(table))[8]; + + if (((const uint32_t *)(table))[0] > 8) { + gMainTable.normalizedStringTableSize = ((const uint32_t *)(table))[9]; + } currOffset = tableStart * (sizeof(uint32_t)/sizeof(uint16_t)) + (sizeof(uint32_t)/sizeof(uint16_t)); gMainTable.converterList = table + currOffset; @@ -276,11 +290,25 @@ haveAliasData(UErrorCode *pErrorCode) { gMainTable.taggedAliasLists = table + currOffset; currOffset += gMainTable.taggedAliasListsSize; - /* reserved */ + if (gMainTable.optionTableSize > 0 + && ((const UConverterAliasOptions *)(table + currOffset))->stringNormalizationType < UCNV_IO_NORM_TYPE_COUNT) + { + /* Faster table */ + 
gMainTable.optionTable = (const UConverterAliasOptions *)(table + currOffset); + } + else { + /* Smaller table, or I can't handle this normalization mode! + Use the original slower table lookup. */ + gMainTable.optionTable = &defaultTableOptions; + } - currOffset += reservedSize1; + currOffset += gMainTable.optionTableSize; gMainTable.stringTable = table + currOffset; + currOffset += gMainTable.stringTableSize; /* step over the string table; the normalized strings, if present, follow it */ + gMainTable.normalizedStringTable = ((gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED) + ? gMainTable.stringTable : (table + currOffset)); + ucln_common_registerCleanup(UCLN_COMMON_UCNV_IO, ucnv_io_cleanup); } umtx_unlock(NULL); @@ -411,6 +439,15 @@ findConverter(const char *alias, UErrorCode *pErrorCode) { uint32_t mid, start, limit; uint32_t lastMid; int result; + char strippedName[UCNV_MAX_CONVERTER_NAME_LENGTH]; /* receives the stripped copy of alias */ + + if (uprv_strlen(alias) >= UCNV_MAX_CONVERTER_NAME_LENGTH) { /* alias plus its terminator would not fit in strippedName */ + *pErrorCode = U_BUFFER_OVERFLOW_ERROR; + return UINT32_MAX; + } + + /* Lower case and remove ignorable characters. */ + ucnv_io_stripForCompare(strippedName, alias); /* do a binary search for the alias */ start = 0; @@ -424,7 +461,12 @@ findConverter(const char *alias, UErrorCode *pErrorCode) { break; /* We haven't moved, and it wasn't found.
*/ } lastMid = mid; - result = ucnv_compareNames(alias, GET_STRING(gMainTable.aliasList[mid])); + if (gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED) { + result = ucnv_compareNames(strippedName, GET_STRING(gMainTable.aliasList[mid])); + } + else { + result = uprv_strcmp(strippedName, GET_NORMALIZED_STRING(gMainTable.aliasList[mid])); + } if (result < 0) { limit = mid; @@ -981,22 +1023,23 @@ ucnv_swapAliases(const UDataSwapper *ds, } inTable=(const uint16_t *)((const char *)inData+headerSize); + uprv_memset(toc, 0, sizeof(toc)); toc[tocLengthIndex]=tocLength=ds->readUInt32(((const uint32_t *)inTable)[tocLengthIndex]); - if(tocLengthreadUInt32(((const uint32_t *)inTable)[i]); } /* compute offsets */ - offsets[tocLengthIndex]=0; + uprv_memset(offsets, 0, sizeof(offsets)); offsets[converterListIndex]=2*(1+tocLength); /* count two 16-bit units per toc entry */ - for(i=tagListIndex; i<=stringTableIndex; ++i) { + for(i=tagListIndex; i<=tocLength; ++i) { offsets[i]=offsets[i-1]+toc[i-1]; } @@ -1024,6 +1067,11 @@ ucnv_swapAliases(const UDataSwapper *ds, /* swap strings */ ds->swapInvChars(ds, inTable+offsets[stringTableIndex], 2*(int32_t)toc[stringTableIndex], outTable+offsets[stringTableIndex], pErrorCode); + /* swap normalized strings */ + if (toc[normalizedStringTableIndex] > 0) { + ds->swapInvChars(ds, inTable+offsets[normalizedStringTableIndex], 2*(int32_t)toc[normalizedStringTableIndex], + outTable+offsets[normalizedStringTableIndex], pErrorCode); + } if(U_FAILURE(*pErrorCode)) { udata_printError(ds, "ucnv_swapAliases().swapInvChars(charset names) failed\n"); return 0; diff --git a/icu4c/source/common/ucnv_io.h b/icu4c/source/common/ucnv_io.h index 95db9a61fde..49e6eba854c 100644 --- a/icu4c/source/common/ucnv_io.h +++ b/icu4c/source/common/ucnv_io.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 1999-2005, International Business Machines + * Copyright (C) 1999-2006, International 
Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * @@ -24,6 +24,16 @@ #define UCNV_NUM_RESERVED_TAGS 2 #define UCNV_NUM_HIDDEN_TAGS 1 +typedef enum { + UCNV_IO_UNNORMALIZED, + UCNV_IO_STD_NORMALIZED, + UCNV_IO_NORM_TYPE_COUNT +}; + +typedef struct { + uint16_t stringNormalizationType; +} UConverterAliasOptions; + typedef struct UConverterAlias { const uint16_t *converterList; const uint16_t *tagList; @@ -31,7 +41,9 @@ typedef struct UConverterAlias { const uint16_t *untaggedConvArray; const uint16_t *taggedAliasArray; const uint16_t *taggedAliasLists; + const UConverterAliasOptions *optionTable; const uint16_t *stringTable; + const uint16_t *normalizedStringTable; uint32_t converterListSize; uint32_t tagListSize; @@ -39,7 +51,9 @@ typedef struct UConverterAlias { uint32_t untaggedConvArraySize; uint32_t taggedAliasArraySize; uint32_t taggedAliasListsSize; - /*uint32_t stringTableSize;*/ + uint32_t optionTableSize; + uint32_t stringTableSize; + uint32_t normalizedStringTableSize; } UConverterAlias; /** @@ -59,10 +73,10 @@ typedef struct UConverterAlias { # error U_CHARSET_FAMILY is not valid #endif -U_CFUNC char * U_EXPORT2 +U_CAPI char * U_EXPORT2 ucnv_io_stripASCIIForCompare(char *dst, const char *name); -U_CFUNC char * U_EXPORT2 +U_CAPI char * U_EXPORT2 ucnv_io_stripEBCDICForCompare(char *dst, const char *name); /** diff --git a/icu4c/source/tools/gencnval/gencnval.c b/icu4c/source/tools/gencnval/gencnval.c index bc38e9ca69b..a56d7ef4ba1 100644 --- a/icu4c/source/tools/gencnval/gencnval.c +++ b/icu4c/source/tools/gencnval/gencnval.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2004, International Business Machines +* Copyright (C) 1999-2006, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -84,7 +84,7 @@ static const UDataInfo dataInfo={ 0, {0x43, 0x76, 0x41, 0x6c}, /* dataFormat="CvAl" */ - {3, 0, 0, 0}, /* formatVersion */ + {3, 0, 1, 0}, /* formatVersion */ {1, 4, 2, 0} /* dataVersion */ }; @@ -137,6 +137,10 @@ static UBool standardTagsUsed = FALSE; static UBool verbose = FALSE; static int lineNum = 1; +static UConverterAliasOptions tableOptions = { /* default: unnormalized table; main() changes this for --optimize */ + UCNV_IO_UNNORMALIZED +}; + /* prototypes --------------------------------------------------------------- */ static void @@ -192,7 +196,8 @@ enum VERBOSE, COPYRIGHT, DESTDIR, - SOURCEDIR + SOURCEDIR, + OPTIMIZE }; static UOption options[]={ @@ -201,7 +206,8 @@ static UOption options[]={ UOPTION_VERBOSE, UOPTION_COPYRIGHT, UOPTION_DESTDIR, - UOPTION_SOURCEDIR + UOPTION_SOURCEDIR, + UOPTION_DEF( "optimize", 'O', UOPT_REQUIRES_ARG), }; extern int @@ -234,6 +240,7 @@ main(int argc, char* argv[]) { "\t-c or --copyright include a copyright notice\n" "\t-d or --destdir destination directory, followed by the path\n" - "\t-s or --sourcedir source directory, followed by the path\n", + "\t-s or --sourcedir source directory, followed by the path\n" /* no comma: the next literal must stay part of the format string */ + "\t-O or --optimize optimize the table for \"size\" or \"speed\"\n", argv[0]); return argc<0 ?
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } @@ -242,6 +249,19 @@ main(int argc, char* argv[]) { verbose = TRUE; } + if(options[OPTIMIZE].doesOccur) { + if (strcmp(options[OPTIMIZE].value, "size") == 0) { + tableOptions.stringNormalizationType = UCNV_IO_UNNORMALIZED; + } + else if (strcmp(options[OPTIMIZE].value, "speed") == 0) { + tableOptions.stringNormalizationType = UCNV_IO_STD_NORMALIZED; + } + else { + fprintf(stderr, "Invalid value for optimization\n"); + return -1; + } + } + if(argc>=2) { path=argv[1]; } else { @@ -905,6 +925,26 @@ createOneAliasList(uint16_t *aliasArrLists, uint32_t tag, uint32_t converter, ui } } +static void +createNormalizedAliasStrings(char *normalizedStrings, const char *origStringBlock, int32_t stringBlockLength) { /* Copies the NUL-separated strings of origStringBlock (stringBlockLength bytes) into normalizedStrings, replacing each non-empty string with its ucnv_io_stripForCompare() form; each string keeps its original offset. */ + int32_t currStrLen; + uprv_memcpy(normalizedStrings, origStringBlock, stringBlockLength); + while (stringBlockLength > 0 && (currStrLen = (int32_t)uprv_strlen(origStringBlock)) < stringBlockLength) { /* test the remaining length first so strlen never reads past the end of the block */ + int32_t currStrSize = currStrLen + 1; + if (currStrLen > 0) { + int32_t normStrLen; + ucnv_io_stripForCompare(normalizedStrings, origStringBlock); + normStrLen = (int32_t)uprv_strlen(normalizedStrings); + if (normStrLen > 0) { + uprv_memset(normalizedStrings + normStrLen, 0, currStrSize - normStrLen); /* zero the tail left over from the longer original string */ + } + } + stringBlockLength -= currStrSize; + normalizedStrings += currStrSize; + origStringBlock += currStrSize; + } +} + static void writeAliasTable(UNewDataMemory *out) { uint32_t i, j; @@ -928,7 +968,12 @@ writeAliasTable(UNewDataMemory *out) { } /* Write the size of the TOC */ - udata_write32(out, 8); + if (tableOptions.stringNormalizationType == UCNV_IO_UNNORMALIZED) { + udata_write32(out, 8); + } + else { + udata_write32(out, 9); + } /* Write the sizes of each section */ /* All sizes are the number of uint16_t units, not bytes */ @@ -938,8 +983,16 @@ writeAliasTable(UNewDataMemory *out) { udata_write32(out, uniqueAliasesSize); /* The preresolved form of mapping an untagged the alias to a converter */ udata_write32(out, tagCount * converterCount);
udata_write32(out, aliasListsSize + 1); - udata_write32(out, 0); /* Reserved space. */ + if (tableOptions.stringNormalizationType == UCNV_IO_UNNORMALIZED) { + udata_write32(out, 0); + } + else { + udata_write32(out, sizeof(tableOptions) / sizeof(uint16_t)); + } udata_write32(out, (tagBlock.top + stringBlock.top) / sizeof(uint16_t)); + if (tableOptions.stringNormalizationType != UCNV_IO_UNNORMALIZED) { + udata_write32(out, (tagBlock.top + stringBlock.top) / sizeof(uint16_t)); + } /* write the table of converters */ /* Think of this as the column headers */ @@ -973,12 +1026,28 @@ writeAliasTable(UNewDataMemory *out) { /* Write the lists */ udata_writeBlock(out, (const void *)aliasLists, aliasListsSize * sizeof(uint16_t)); + /* Write any options for the alias table. */ + if (tableOptions.stringNormalizationType != UCNV_IO_UNNORMALIZED) { + udata_writeBlock(out, (const void *)&tableOptions, sizeof(tableOptions)); + } + /* write the tags strings */ udata_writeString(out, tagBlock.store, tagBlock.top); /* write the aliases strings */ udata_writeString(out, stringBlock.store, stringBlock.top); + /* write the normalized aliases strings */ + if (tableOptions.stringNormalizationType != UCNV_IO_UNNORMALIZED) { + char *normalizedStrings = (char *)uprv_malloc(tagBlock.top + stringBlock.top); + createNormalizedAliasStrings(normalizedStrings, tagBlock.store, tagBlock.top); + createNormalizedAliasStrings(normalizedStrings + tagBlock.top, stringBlock.store, stringBlock.top); + + /* Write out the complete normalized array. */ + udata_writeString(out, normalizedStrings, tagBlock.top + stringBlock.top); + uprv_free(normalizedStrings); + } + uprv_free(aliasArrLists); uprv_free(uniqueAliases); }