From a11fc0d1422682c8f8e1e08dcffabb128fde05ce Mon Sep 17 00:00:00 2001
From: Markus Scherer
Date: Tue, 12 Aug 2003 00:26:07 +0000
Subject: [PATCH] ICU-2235 swap converter alias tables

X-SVN-Rev: 12807
---
 icu4c/source/common/ucnv_io.c          | 294 ++++++++++++++++++++++++-
 icu4c/source/common/ucnv_io.h          |  13 ++
 icu4c/source/tools/icuswap/icuswap.cpp |   5 +-
 3 files changed, 309 insertions(+), 3 deletions(-)

diff --git a/icu4c/source/common/ucnv_io.c b/icu4c/source/common/ucnv_io.c
index dc381b961aa..dd61af7041f 100644
--- a/icu4c/source/common/ucnv_io.c
+++ b/icu4c/source/common/ucnv_io.c
@@ -33,6 +33,8 @@
 
 #include "unicode/udata.h"
 #include "umutex.h"
+#include "uarrsort.h"
+#include "udataswp.h"
 #include "cstring.h"
 #include "cmemory.h"
 #include "ucnv_io.h"
@@ -51,14 +53,18 @@
  * First there is the size of the Table of Contents (TOC). The TOC
  * entries contain the size of each section. In order to find the offset
  * you just need to sum up the previous offsets.
+ * The TOC length and entries are an array of uint32_t values.
+ * The first section after the TOC starts immediately after the TOC.
  *
  * 1) This section contains a list of converters. This list contains indexes
  *    into the string table for the converter name. The index of this list is
  *    also used by other sections, which are mentioned later on.
+ *    This list is not sorted.
  *
  * 2) This section contains a list of tags. This list contains indexes
  *    into the string table for the tag name. The index of this list is
  *    also used by other sections, which are mentioned later on.
+ *    This list is in priority order of standards.
  *
  * 3) This section contains a list of sorted unique aliases. This
  *    list contains indexes into the string table for the alias name. The
@@ -157,6 +163,19 @@ static const char DATA_TYPE[] = "icu";
 
 static UDataMemory *gAliasData=NULL;
 
+enum {
+    tocLengthIndex=0,
+    converterListIndex=1,
+    tagListIndex=2,
+    aliasListIndex=3,
+    untaggedConvArrayIndex=4,
+    taggedAliasArrayIndex=5,
+    taggedAliasListsIndex=6,
+    reservedIndex1=7,
+    stringTableIndex=8,
+    minTocLength=8 /* does not count the tocLengthIndex! */
+};
+
 static const uint16_t *gConverterList = NULL;
 static const uint16_t *gTagList = NULL;
 static const uint16_t *gAliasList = NULL;
@@ -224,7 +243,7 @@ haveAliasData(UErrorCode *pErrorCode) {
         table = (const uint16_t *)udata_getMemory(data);
 
         tableStart = ((const uint32_t *)(table))[0];
-        if (tableStart < 8) {
+        if (tableStart < minTocLength) {
             *pErrorCode = U_INVALID_FORMAT_ERROR;
             udata_close(data);
             return FALSE;
@@ -1061,6 +1080,278 @@ ucnv_io_setDefaultConverterName(const char *converterName) {
     }
 }
 
+/* alias table swapping ----------------------------------------------------- */
+
+typedef char * U_CALLCONV StripForCompareFn(char *dst, const char *name);
+
+/*
+ * row of a temporary array
+ *
+ * gets platform-endian charset string indexes and sorting indexes;
+ * after sorting this array by strings, the actual arrays are permutated
+ * according to the sorting indexes
+ */
+typedef struct Row {
+    uint16_t strIndex, sortIndex;
+} Row;
+
+typedef struct TempTable {
+    const char *chars;
+    Row *rows;
+    uint16_t *resort;
+    StripForCompareFn *stripForCompare;
+} TempTable;
+
+enum {
+    STACK_ROW_CAPACITY=500
+};
+
+/*
+ * Comparison function for uprv_sortArray(): orders two Row items by their
+ * alias strings after stripping both with tempTable->stripForCompare.
+ * context is the TempTable; strIndex counts 16-bit units from chars.
+ */
+static int32_t
+io_compareRows(const void *context, const void *left, const void *right) {
+    char strippedLeft[UCNV_MAX_CONVERTER_NAME_LENGTH],
+         strippedRight[UCNV_MAX_CONVERTER_NAME_LENGTH];
+
+    TempTable *tempTable=(TempTable *)context;
+    const char *chars=tempTable->chars;
+
+    return (int32_t)uprv_strcmp(tempTable->stripForCompare(strippedLeft, chars+2*((const Row *)left)->strIndex),
+                                tempTable->stripForCompare(strippedRight, chars+2*((const Row *)right)->strIndex));
+}
+
+/*
+ * Swap a converter alias table (data format "CvAl", format version 3).
+ *
+ * If length<0 (preflighting), only the table of contents is read to
+ * compute the size; the table itself is not swapped.
+ * Otherwise the table of contents, the string table and the 16-bit arrays
+ * are swapped, and the alias list and the untagged converter array are
+ * re-sorted by the outCharset strings. inData==outData (in-place) is handled.
+ *
+ * Returns the number of bytes of header plus table, or 0 on failure with
+ * *pErrorCode set (0 is also returned if pErrorCode==NULL).
+ */
+U_CAPI int32_t U_EXPORT2
+ucnv_swapAliases(const UDataSwapper *ds,
+                 const void *inData, int32_t length, void *outData,
+                 UErrorCode *pErrorCode) {
+    const UDataInfo *pInfo;
+    int32_t headerSize;
+
+    const uint16_t *inTable;
+    uint32_t toc[1+minTocLength];
+    uint32_t offsets[1+minTocLength]; /* 16-bit-addressed offsets from inTable/outTable */
+    uint32_t i, count, tocLength, topOffset;
+
+    Row rows[STACK_ROW_CAPACITY];
+    uint16_t resort[STACK_ROW_CAPACITY];
+    TempTable tempTable;
+
+    /* udata_swapDataHeader checks the arguments */
+    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    /* check data format and format version */
+    pInfo=(const UDataInfo *)((const char *)inData+4);
+    if(!(
+        pInfo->dataFormat[0]==0x43 &&   /* dataFormat="CvAl" */
+        pInfo->dataFormat[1]==0x76 &&
+        pInfo->dataFormat[2]==0x41 &&
+        pInfo->dataFormat[3]==0x6c &&
+        pInfo->formatVersion[0]==3
+    )) {
+        udata_printError(ds, "ucnv_swapAliases(): data format %02x.%02x.%02x.%02x (format version %02x) is not an alias table\n",
+                         pInfo->dataFormat[0], pInfo->dataFormat[1],
+                         pInfo->dataFormat[2], pInfo->dataFormat[3],
+                         pInfo->formatVersion[0]);
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    /* an alias table must contain at least the table of contents array */
+    if(length>=0 && (length-headerSize)<4*(1+minTocLength)) {
+        udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n",
+                         length-headerSize);
+        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0;
+    }
+
+    inTable=(const uint16_t *)((const char *)inData+headerSize);
+    toc[tocLengthIndex]=tocLength=ds->readUInt32(((const uint32_t *)inTable)[tocLengthIndex]);
+    if(tocLength<minTocLength) {
+        udata_printError(ds, "ucnv_swapAliases(): table of contents too short (%u sections)\n", tocLength);
+        *pErrorCode=U_INVALID_FORMAT_ERROR;
+        return 0;
+    }
+
+    /* read the known part of the table of contents */
+    for(i=converterListIndex; i<=minTocLength; ++i) {
+        toc[i]=ds->readUInt32(((const uint32_t *)inTable)[i]);
+    }
+
+    /* compute offsets */
+    offsets[tocLengthIndex]=0;
+    offsets[converterListIndex]=2*(1+tocLength); /* count two 16-bit units per toc entry */
+    for(i=tagListIndex; i<=stringTableIndex; ++i) {
+        offsets[i]=offsets[i-1]+toc[i-1];
+    }
+
+    /* compute the overall size of the after-header data, in numbers of 16-bit units */
+    topOffset=offsets[i-1]+toc[i-1]; /* i is one past the last offsets[] element here: do not store */
+
+    if(length>=0) {
+        uint16_t *outTable;
+        const uint16_t *p, *p2;
+        uint16_t *q, *q2;
+        uint16_t oldIndex;
+
+        /* the input must contain everything that the table of contents announces */
+        if((length-headerSize)<(2*(int32_t)topOffset)) {
+            udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n",
+                             length-headerSize);
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+
+        outTable=(uint16_t *)((char *)outData+headerSize);
+
+        /* swap the entire table of contents */
+        ds->swapArray32(ds, inTable, 4*(1+tocLength), outTable, pErrorCode);
+
+        /* swap strings */
+        ds->swapInvChars(ds, inTable+offsets[stringTableIndex], 2*(int32_t)toc[stringTableIndex],
+                         outTable+offsets[stringTableIndex], pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            udata_printError(ds, "ucnv_swapAliases().swapInvChars(charset names) failed - %s\n",
+                             u_errorName(*pErrorCode));
+            return 0;
+        }
+
+        /*
+         * ### TODO optimize
+         * After some testing, add a test
+         * if(inCharset==outCharset) {
+         *   only swap 16-bit units, do not sort;
+            -- swap all 16-bit values --
+            ds->swapArray16(ds,
+                inTable+offsets[converterListIndex],
+                2*(int32_t)(offsets[stringTableIndex]-offsets[converterListIndex]),
+                outTable+offsets[converterListIndex],
+                pErrorCode);
+         * } else { sort/copy/swap/permutate as below; }
+         */
+
+        /* allocate the temporary table for sorting */
+        count=toc[aliasListIndex];
+
+        tempTable.chars=(const char *)(outTable+offsets[stringTableIndex]); /* sort by outCharset */
+
+        if(count<=STACK_ROW_CAPACITY) {
+            tempTable.rows=rows;
+            tempTable.resort=resort;
+        } else {
+            tempTable.rows=(Row *)uprv_malloc(count*sizeof(Row)+count*2);
+            if(tempTable.rows==NULL) {
+                udata_printError(ds, "ucnv_swapAliases(): unable to allocate memory for sorting tables (max length: %u)\n",
+                                 count);
+                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+                return 0;
+            }
+            tempTable.resort=(uint16_t *)(tempTable.rows+count);
+        }
+
+        if(ds->outCharset==U_ASCII_FAMILY) {
+            tempTable.stripForCompare=ucnv_io_stripASCIIForCompare;
+        } else /* U_EBCDIC_FAMILY */ {
+            tempTable.stripForCompare=ucnv_io_stripEBCDICForCompare;
+        }
+
+        /*
+         * Sort unique aliases+mapped names.
+         *
+         * We need to sort the list again by outCharset strings because they
+         * sort differently for different charset families.
+         * First we set up a temporary table with the string indexes and
+         * sorting indexes and sort that.
+         * Then we permutate and copy/swap the actual values.
+         */
+        p=inTable+offsets[aliasListIndex];
+        q=outTable+offsets[aliasListIndex];
+
+        p2=inTable+offsets[untaggedConvArrayIndex];
+        q2=outTable+offsets[untaggedConvArrayIndex];
+
+        for(i=0; i<count; ++i) {
+            tempTable.rows[i].strIndex=ds->readUInt16(p[i]);
+            tempTable.rows[i].sortIndex=(uint16_t)i;
+        }
+
+        uprv_sortArray(tempTable.rows, (int32_t)count, sizeof(Row),
+                       io_compareRows, &tempTable,
+                       FALSE, pErrorCode);
+
+        if(U_SUCCESS(*pErrorCode)) {
+            /* copy/swap/permutate items */
+            if(p!=q) {
+                for(i=0; i<count; ++i) {
+                    oldIndex=tempTable.rows[i].sortIndex;
+                    ds->swapArray16(ds, p+oldIndex, 2, q+i, pErrorCode);
+                    ds->swapArray16(ds, p2+oldIndex, 2, q2+i, pErrorCode);
+                }
+            } else {
+                /*
+                 * If we swap in-place, then the permutation must use another
+                 * temporary array (tempTable.resort)
+                 * before the results are copied to the outBundle.
+                 */
+                uint16_t *r=tempTable.resort;
+
+                for(i=0; i<count; ++i) {
+                    oldIndex=tempTable.rows[i].sortIndex;
+                    ds->swapArray16(ds, p+oldIndex, 2, r+i, pErrorCode);
+                }
+                uprv_memcpy(q, r, 2*count);
+
+                for(i=0; i<count; ++i) {
+                    oldIndex=tempTable.rows[i].sortIndex;
+                    ds->swapArray16(ds, p2+oldIndex, 2, r+i, pErrorCode);
+                }
+                uprv_memcpy(q2, r, 2*count);
+            }
+        }
+
+        if(tempTable.rows!=rows) {
+            uprv_free(tempTable.rows);
+        }
+
+        if(U_FAILURE(*pErrorCode)) {
+            udata_printError(ds, "ucnv_swapAliases().uprv_sortArray(%u items) failed - %s\n",
+                             count, u_errorName(*pErrorCode));
+            return 0;
+        }
+
+        /* swap remaining 16-bit values */
+        ds->swapArray16(ds,
+                        inTable+offsets[converterListIndex],
+                        2*(int32_t)(offsets[aliasListIndex]-offsets[converterListIndex]),
+                        outTable+offsets[converterListIndex],
+                        pErrorCode);
+        ds->swapArray16(ds,
+                        inTable+offsets[taggedAliasArrayIndex],
+                        2*(int32_t)(offsets[stringTableIndex]-offsets[taggedAliasArrayIndex]),
+                        outTable+offsets[taggedAliasArrayIndex],
+                        pErrorCode);
+    }
+
+    return headerSize+2*(int32_t)topOffset;
+}
+
 /*
  * Hey, Emacs, please set the following:
  *
@@ -1069,4 +1360,3 @@ ucnv_io_setDefaultConverterName(const char *converterName) {
  * End:
  *
  */
-
diff --git a/icu4c/source/common/ucnv_io.h b/icu4c/source/common/ucnv_io.h
index 7597155b27c..2790be4272f 100644
--- a/icu4c/source/common/ucnv_io.h
+++ b/icu4c/source/common/ucnv_io.h
@@ -149,6 +149,19 @@ ucnv_io_getDefaultConverterName(void);
 U_CFUNC void
 ucnv_io_setDefaultConverterName(const char *name);
 
+/* forward declaration */
+struct UDataSwapper;
+typedef struct UDataSwapper UDataSwapper;
+
+/**
+ * Swap an ICU converter alias table. See udataswp.h.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+ucnv_swapAliases(const UDataSwapper *ds,
+                 const void *inData, int32_t length, void *outData,
+                 UErrorCode *pErrorCode);
+
 #endif /* _UCNV_IO */
 
 /*
diff --git a/icu4c/source/tools/icuswap/icuswap.cpp b/icu4c/source/tools/icuswap/icuswap.cpp
index 728f0c57af3..fdf810f8eeb 100644
--- a/icu4c/source/tools/icuswap/icuswap.cpp
+++ b/icu4c/source/tools/icuswap/icuswap.cpp
@@ -29,6 +29,7 @@
 #include "unicode/udata.h"
 #include "udataswp.h"
 #include "uresdata.h"
+#include "ucnv_io.h"
 #include "uoptions.h"
 
 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
@@ -245,7 +246,9 @@ static const struct {
     uint8_t dataFormat[4];
     UDataSwapFn *swapFn;
 } swapFns[]={
-    { { 0x52, 0x65, 0x73, 0x42 }, ures_swap } /* dataFormat="ResB" */
+    { { 0x52, 0x65, 0x73, 0x42 }, ures_swap }, /* dataFormat="ResB" */
+    /* insert data formats here, descending by expected frequency of occurrence */
+    { { 0x43, 0x76, 0x41, 0x6c }, ucnv_swapAliases } /* dataFormat="CvAl" */
 };
 
 static int32_t