ICU-5225 Provide optimization option for size or speed of the alias table.

X-SVN-Rev: 19703
This commit is contained in:
George Rhoten 2006-06-11 16:57:02 +00:00
parent 6a1f5ae01a
commit 5e15aecbe2
3 changed files with 158 additions and 27 deletions

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 1999-2005, International Business Machines
* Copyright (C) 1999-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -94,8 +94,10 @@
* the third dimension to the section 5. No other section should be referencing
* this section.
*
* 7) Reserved at this time (There is no information). This _usually_ has a
* size of 0. Future versions may add more information here.
* 7) Starting in ICU 3.6, this can be a UConverterAliasOptions struct. Its
* presence indicates that a section 9 exists. UConverterAliasOptions specifies
* which type of string normalization is used; other options may be added to it
* in the future.
*
* 8) This is the string table. All strings are indexed on an even address.
* There are two reasons for this. First many chip architectures locate strings
@ -103,6 +105,10 @@
* numbers, this string table can be 128KB in size instead of 64KB when we
* only have strings starting on an even address.
*
* 9) When present, this is a set of prenormalized strings from section 8. This
* table contains the normalized strings: dashes and spaces are stripped out
* and all letters are lowercased. In the future, the options in section 7 may
* specify other types of normalization.
*
* Here is the concept of section 5 and 6. It's a 3D cube. Each tag
* has a unique alias among all converters. That same alias can
@ -173,15 +179,20 @@ enum {
untaggedConvArrayIndex=4,
taggedAliasArrayIndex=5,
taggedAliasListsIndex=6,
reservedIndex1=7,
tableOptions=7,
stringTableIndex=8,
minTocLength=8, /* min. tocLength in the file, does not count the tocLengthIndex! */
offsetsCount /* length of the swapper's temporary offsets[] */
normalizedStringTableIndex=9,
offsetsCount, /* length of the swapper's temporary offsets[] */
minTocLength=8 /* min. tocLength in the file, does not count the tocLengthIndex! */
};
/*
 * Fallback options: strings are not prenormalized. haveAliasData() points
 * gMainTable.optionTable here when the data file carries no option table or
 * names a normalization type this code does not know.
 */
static const UConverterAliasOptions defaultTableOptions = {
UCNV_IO_UNNORMALIZED
};
static UConverterAlias gMainTable;
/*
 * Turn an index into the (normalized) string table into a C string pointer.
 * The tables are arrays of uint16_t, so idx counts 16-bit units, not bytes.
 *
 * The whole expansion is parenthesized so that postfix operators applied to
 * the macro result (for example GET_STRING(i)[0]) act on the resulting
 * const char * and are not absorbed into the operand of the cast.
 */
#define GET_STRING(idx) ((const char *)(gMainTable.stringTable + (idx)))
#define GET_NORMALIZED_STRING(idx) ((const char *)(gMainTable.normalizedStringTable + (idx)))
static UBool U_CALLCONV
isAcceptable(void *context,
@ -226,7 +237,6 @@ haveAliasData(UErrorCode *pErrorCode) {
const uint16_t *table = NULL;
uint32_t tableStart;
uint32_t currOffset;
uint32_t reservedSize1;
data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
@ -253,8 +263,12 @@ haveAliasData(UErrorCode *pErrorCode) {
gMainTable.untaggedConvArraySize = ((const uint32_t *)(table))[4];
gMainTable.taggedAliasArraySize = ((const uint32_t *)(table))[5];
gMainTable.taggedAliasListsSize = ((const uint32_t *)(table))[6];
reservedSize1 = ((const uint32_t *)(table))[7]; /* reserved */
/*gStringTableSize = ((const uint32_t *)(table))[8];*/
gMainTable.optionTableSize = ((const uint32_t *)(table))[7];
gMainTable.stringTableSize = ((const uint32_t *)(table))[8];
if (((const uint32_t *)(table))[0] > 8) {
gMainTable.normalizedStringTableSize = ((const uint32_t *)(table))[9];
}
currOffset = tableStart * (sizeof(uint32_t)/sizeof(uint16_t)) + (sizeof(uint32_t)/sizeof(uint16_t));
gMainTable.converterList = table + currOffset;
@ -276,11 +290,25 @@ haveAliasData(UErrorCode *pErrorCode) {
gMainTable.taggedAliasLists = table + currOffset;
currOffset += gMainTable.taggedAliasListsSize;
/* reserved */
if (gMainTable.optionTableSize > 0
&& ((const UConverterAliasOptions *)(table + currOffset))->stringNormalizationType < UCNV_IO_NORM_TYPE_COUNT)
{
/* Faster table */
gMainTable.optionTable = (const UConverterAliasOptions *)(table + currOffset);
}
else {
/* Smaller table, or I can't handle this normalization mode!
Use the original slower table lookup. */
gMainTable.optionTable = &defaultTableOptions;
}
currOffset += reservedSize1;
currOffset += gMainTable.optionTableSize;
gMainTable.stringTable = table + currOffset;
currOffset += gMainTable.stringTableSize;
gMainTable.normalizedStringTable = ((gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED)
? gMainTable.stringTable : (table + currOffset));
ucln_common_registerCleanup(UCLN_COMMON_UCNV_IO, ucnv_io_cleanup);
}
umtx_unlock(NULL);
@ -411,6 +439,15 @@ findConverter(const char *alias, UErrorCode *pErrorCode) {
uint32_t mid, start, limit;
uint32_t lastMid;
int result;
char strippedName[UCNV_MAX_CONVERTER_NAME_LENGTH];
if (uprv_strlen(alias) >= UCNV_MAX_CONVERTER_NAME_LENGTH) {
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
return UINT32_MAX;
}
/* Lowercase and remove ignorable characters. */
ucnv_io_stripForCompare(strippedName, alias);
/* do a binary search for the alias */
start = 0;
@ -424,7 +461,12 @@ findConverter(const char *alias, UErrorCode *pErrorCode) {
break; /* We haven't moved, and it wasn't found. */
}
lastMid = mid;
result = ucnv_compareNames(alias, GET_STRING(gMainTable.aliasList[mid]));
if (gMainTable.optionTable->stringNormalizationType == UCNV_IO_UNNORMALIZED) {
result = ucnv_compareNames(strippedName, GET_STRING(gMainTable.aliasList[mid]));
}
else {
result = uprv_strcmp(strippedName, GET_NORMALIZED_STRING(gMainTable.aliasList[mid]));
}
if (result < 0) {
limit = mid;
@ -981,22 +1023,23 @@ ucnv_swapAliases(const UDataSwapper *ds,
}
inTable=(const uint16_t *)((const char *)inData+headerSize);
uprv_memset(toc, 0, sizeof(toc));
toc[tocLengthIndex]=tocLength=ds->readUInt32(((const uint32_t *)inTable)[tocLengthIndex]);
if(tocLength<minTocLength) {
udata_printError(ds, "ucnv_swapAliases(): table of contents too short (%u sections)\n", tocLength);
if(tocLength<minTocLength || offsetsCount<=tocLength) {
udata_printError(ds, "ucnv_swapAliases(): table of contents contains unsupported number of sections (%u sections)\n", tocLength);
*pErrorCode=U_INVALID_FORMAT_ERROR;
return 0;
}
/* read the known part of the table of contents */
for(i=converterListIndex; i<=minTocLength; ++i) {
for(i=converterListIndex; i<=tocLength; ++i) {
toc[i]=ds->readUInt32(((const uint32_t *)inTable)[i]);
}
/* compute offsets */
offsets[tocLengthIndex]=0;
uprv_memset(offsets, 0, sizeof(offsets));
offsets[converterListIndex]=2*(1+tocLength); /* count two 16-bit units per toc entry */
for(i=tagListIndex; i<=stringTableIndex; ++i) {
for(i=tagListIndex; i<=tocLength; ++i) {
offsets[i]=offsets[i-1]+toc[i-1];
}
@ -1024,6 +1067,11 @@ ucnv_swapAliases(const UDataSwapper *ds,
/* swap strings */
ds->swapInvChars(ds, inTable+offsets[stringTableIndex], 2*(int32_t)toc[stringTableIndex],
outTable+offsets[stringTableIndex], pErrorCode);
/* swap normalized strings */
if (toc[normalizedStringTableIndex] > 0) {
ds->swapInvChars(ds, inTable+offsets[normalizedStringTableIndex], 2*(int32_t)toc[normalizedStringTableIndex],
outTable+offsets[normalizedStringTableIndex], pErrorCode);
}
if(U_FAILURE(*pErrorCode)) {
udata_printError(ds, "ucnv_swapAliases().swapInvChars(charset names) failed\n");
return 0;

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2005, International Business Machines
* Copyright (C) 1999-2006, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -24,6 +24,16 @@
#define UCNV_NUM_RESERVED_TAGS 2
#define UCNV_NUM_HIDDEN_TAGS 1
/*
 * String normalization types that can be stored in
 * UConverterAliasOptions.stringNormalizationType.
 *
 * This is a plain anonymous enum: the former "typedef enum { ... };" declared
 * no typedef name, which made the typedef keyword useless and triggered
 * compiler diagnostics.
 */
enum {
    UCNV_IO_UNNORMALIZED,    /* no prenormalized string table is used */
    UCNV_IO_STD_NORMALIZED,  /* a prenormalized string table is used */
    UCNV_IO_NORM_TYPE_COUNT  /* number of known types; must stay last */
};
/*
 * Options for the converter alias table. The data generator writes this
 * struct into the data file as raw bytes and the reader casts the table
 * memory back to it.
 */
typedef struct {
/* One of the UCNV_IO_* normalization type constants. */
uint16_t stringNormalizationType;
} UConverterAliasOptions;
typedef struct UConverterAlias {
const uint16_t *converterList;
const uint16_t *tagList;
@ -31,7 +41,9 @@ typedef struct UConverterAlias {
const uint16_t *untaggedConvArray;
const uint16_t *taggedAliasArray;
const uint16_t *taggedAliasLists;
const UConverterAliasOptions *optionTable;
const uint16_t *stringTable;
const uint16_t *normalizedStringTable;
uint32_t converterListSize;
uint32_t tagListSize;
@ -39,7 +51,9 @@ typedef struct UConverterAlias {
uint32_t untaggedConvArraySize;
uint32_t taggedAliasArraySize;
uint32_t taggedAliasListsSize;
/*uint32_t stringTableSize;*/
uint32_t optionTableSize;
uint32_t stringTableSize;
uint32_t normalizedStringTableSize;
} UConverterAlias;
/**
@ -59,10 +73,10 @@ typedef struct UConverterAlias {
# error U_CHARSET_FAMILY is not valid
#endif
U_CFUNC char * U_EXPORT2
U_CAPI char * U_EXPORT2
ucnv_io_stripASCIIForCompare(char *dst, const char *name);
U_CFUNC char * U_EXPORT2
U_CAPI char * U_EXPORT2
ucnv_io_stripEBCDICForCompare(char *dst, const char *name);
/**

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -84,7 +84,7 @@ static const UDataInfo dataInfo={
0,
{0x43, 0x76, 0x41, 0x6c}, /* dataFormat="CvAl" */
{3, 0, 0, 0}, /* formatVersion */
{3, 0, 1, 0}, /* formatVersion */
{1, 4, 2, 0} /* dataVersion */
};
@ -137,6 +137,10 @@ static UBool standardTagsUsed = FALSE;
static UBool verbose = FALSE;
static int lineNum = 1;
/*
 * Options for the alias table being generated. Defaults to no prenormalized
 * strings; main() changes stringNormalizationType according to the
 * --optimize argument ("size" or "speed").
 */
static UConverterAliasOptions tableOptions = {
UCNV_IO_UNNORMALIZED
};
/* prototypes --------------------------------------------------------------- */
static void
@ -192,7 +196,8 @@ enum
VERBOSE,
COPYRIGHT,
DESTDIR,
SOURCEDIR
SOURCEDIR,
OPTIMIZE
};
static UOption options[]={
@ -201,7 +206,8 @@ static UOption options[]={
UOPTION_VERBOSE,
UOPTION_COPYRIGHT,
UOPTION_DESTDIR,
UOPTION_SOURCEDIR
UOPTION_SOURCEDIR,
UOPTION_DEF( "optimize", 'O', UOPT_REQUIRES_ARG),
};
extern int
@ -234,6 +240,7 @@ main(int argc, char* argv[]) {
"\t-c or --copyright include a copyright notice\n"
"\t-d or --destdir destination directory, followed by the path\n"
"\t-s or --sourcedir source directory, followed by the path\n",
"\t-O or --optimize optimize the table for \"size\" or \"speed\"\n",
argv[0]);
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
@ -242,6 +249,19 @@ main(int argc, char* argv[]) {
verbose = TRUE;
}
if(options[OPTIMIZE].doesOccur) {
if (strcmp(options[OPTIMIZE].value, "size") == 0) {
tableOptions.stringNormalizationType = UCNV_IO_UNNORMALIZED;
}
else if (strcmp(options[OPTIMIZE].value, "speed") == 0) {
tableOptions.stringNormalizationType = UCNV_IO_STD_NORMALIZED;
}
else {
fprintf(stderr, "Invalid value for optimization\n");
return -1;
}
}
if(argc>=2) {
path=argv[1];
} else {
@ -905,6 +925,26 @@ createOneAliasList(uint16_t *aliasArrLists, uint32_t tag, uint32_t converter, ui
}
}
/**
 * Build the normalized counterpart of a block of NUL-terminated strings.
 *
 * The input block is first copied verbatim, so the output keeps the same
 * layout as the input and every string starts at the same offset. Each
 * non-empty string is then overwritten in place with the result of
 * ucnv_io_stripForCompare(), and the unused tail of its slot is zeroed.
 *
 * @param normalizedStrings Output buffer; must hold at least
 *                          stringBlockLength bytes.
 * @param origStringBlock   Input block of consecutive NUL-terminated strings.
 * @param stringBlockLength Size of the input block in bytes.
 */
static void
createNormalizedAliasStrings(char *normalizedStrings, const char *origStringBlock, int32_t stringBlockLength) {
    int32_t currStrLen;

    uprv_memcpy(normalizedStrings, origStringBlock, stringBlockLength);

    /*
     * Test the remaining length before measuring the next string so that
     * uprv_strlen() is never called on the address just past the block.
     */
    while (stringBlockLength > 0
        && (currStrLen = (int32_t)uprv_strlen(origStringBlock)) < stringBlockLength)
    {
        /* Size of this string's slot, including its terminating NUL. */
        int32_t currStrSize = currStrLen + 1;

        if (currStrLen > 0) {
            int32_t normStrLen;

            ucnv_io_stripForCompare(normalizedStrings, origStringBlock);
            normStrLen = (int32_t)uprv_strlen(normalizedStrings);
            if (normStrLen > 0) {
                /*
                 * Zero the rest of the slot. This assumes the stripped form
                 * is never longer than the original string.
                 */
                uprv_memset(normalizedStrings + normStrLen, 0, currStrSize - normStrLen);
            }
            /*
             * When stripping leaves an empty string, the bytes after the
             * first one keep their copied values, exactly as before.
             */
        }

        stringBlockLength -= currStrSize;
        normalizedStrings += currStrSize;
        origStringBlock += currStrSize;
    }
}
static void
writeAliasTable(UNewDataMemory *out) {
uint32_t i, j;
@ -928,7 +968,12 @@ writeAliasTable(UNewDataMemory *out) {
}
/* Write the size of the TOC */
udata_write32(out, 8);
if (tableOptions.stringNormalizationType == UCNV_IO_UNNORMALIZED) {
udata_write32(out, 8);
}
else {
udata_write32(out, 9);
}
/* Write the sizes of each section */
/* All sizes are the number of uint16_t units, not bytes */
@ -938,8 +983,16 @@ writeAliasTable(UNewDataMemory *out) {
udata_write32(out, uniqueAliasesSize); /* The preresolved form of mapping an untagged the alias to a converter */
udata_write32(out, tagCount * converterCount);
udata_write32(out, aliasListsSize + 1);
udata_write32(out, 0); /* Reserved space. */
if (tableOptions.stringNormalizationType == UCNV_IO_UNNORMALIZED) {
udata_write32(out, 0);
}
else {
udata_write32(out, sizeof(tableOptions) / sizeof(uint16_t));
}
udata_write32(out, (tagBlock.top + stringBlock.top) / sizeof(uint16_t));
if (tableOptions.stringNormalizationType != UCNV_IO_UNNORMALIZED) {
udata_write32(out, (tagBlock.top + stringBlock.top) / sizeof(uint16_t));
}
/* write the table of converters */
/* Think of this as the column headers */
@ -973,12 +1026,28 @@ writeAliasTable(UNewDataMemory *out) {
/* Write the lists */
udata_writeBlock(out, (const void *)aliasLists, aliasListsSize * sizeof(uint16_t));
/* Write any options for the alias table. */
if (tableOptions.stringNormalizationType != UCNV_IO_UNNORMALIZED) {
udata_writeBlock(out, (const void *)&tableOptions, sizeof(tableOptions));
}
/* write the tags strings */
udata_writeString(out, tagBlock.store, tagBlock.top);
/* write the aliases strings */
udata_writeString(out, stringBlock.store, stringBlock.top);
/* write the normalized aliases strings */
if (tableOptions.stringNormalizationType != UCNV_IO_UNNORMALIZED) {
char *normalizedStrings = (char *)uprv_malloc(tagBlock.top + stringBlock.top);
createNormalizedAliasStrings(normalizedStrings, tagBlock.store, tagBlock.top);
createNormalizedAliasStrings(normalizedStrings + tagBlock.top, stringBlock.store, stringBlock.top);
/* Write out the complete normalized array. */
udata_writeString(out, normalizedStrings, tagBlock.top + stringBlock.top);
uprv_free(normalizedStrings);
}
uprv_free(aliasArrLists);
uprv_free(uniqueAliases);
}