ICU-2949 define and fix ucnv_getMaxCharSize() behavior

X-SVN-Rev: 13538
This commit is contained in:
Markus Scherer 2003-10-30 23:03:18 +00:00
parent cefe1b98ab
commit a4d1270530
5 changed files with 197 additions and 26 deletions

View file

@ -580,19 +580,81 @@ U_CAPI void U_EXPORT2
ucnv_resetFromUnicode(UConverter *converter);
/**
* Returns the maximum length of bytes used by a character. This varies
* between 1 and 4
* @param converter the Unicode converter
* @return the maximum number of bytes allowed by this particular converter
* Returns the maximum number of bytes that are output per UChar in conversion
* from Unicode using this converter.
* The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING
* to calculate the size of a target buffer for conversion from Unicode.
*
* Note: Before ICU 2.8, this function did not return reliable numbers for
* some stateful converters (EBCDIC_STATEFUL, ISO-2022) and LMBCS.
*
* This number may not be the same as the maximum number of bytes per
* "conversion unit". In other words, it may not be the intuitively expected
* number of bytes per character that would be published for a charset,
* and may not fulfill any other purpose than the allocation of an output
* buffer of guaranteed sufficient size for a given input length and converter.
*
* Examples for special cases that are taken into account:
* - Supplementary code points may convert to more bytes than BMP code points.
* This function returns bytes per UChar (UTF-16 code unit), not per
* Unicode code point, for efficient buffer allocation.
* - State-shifting output (SI/SO, escapes, etc.) from stateful converters.
* - When m input UChars are converted to n output bytes, then the maximum n/m
* (output bytes per input UChar) is taken into account.
*
* The number returned here does not take into account
* (see UCNV_GET_MAX_BYTES_FOR_STRING):
* - callbacks which output more than one charset character sequence per call,
* like escape callbacks
* - initial and final non-character bytes that are output by some converters
* (automatic BOMs, initial escape sequence, final SI, etc.)
*
* Examples for returned values:
* - SBCS charsets: 1
* - Shift-JIS: 2
* - UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted)
* - UTF-8: 3 (3 per BMP, 4 per surrogate _pair_)
* - EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS)
* - ISO-2022: 3 (always outputs UTF-8)
* - ISO-2022-JP: 6 (4-byte escape sequences + DBCS)
* - ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3 + DBCS)
*
* @param converter The Unicode converter.
* @return The maximum number of bytes per UChar that are output by ucnv_fromUnicode(),
* to be used together with UCNV_GET_MAX_BYTES_FOR_STRING for buffer allocation.
*
* @see UCNV_GET_MAX_BYTES_FOR_STRING
* @see ucnv_getMinCharSize
* @stable ICU 2.0
*/
U_CAPI int8_t U_EXPORT2
ucnv_getMaxCharSize(const UConverter *converter);
/**
 * Computes a target buffer size, in bytes, for converting a string from
 * Unicode to a charset. A buffer of the computed size is guaranteed to be
 * large enough for that conversion.
 *
 * Covered by the result:
 * - initial and final non-character bytes that some converters output
 * - the default (substitution) callback, which outputs only one charset
 *   character sequence
 *
 * Not covered by the result:
 * - callbacks that output more than one charset character sequence per call,
 *   like escape callbacks
 *
 * Both arguments are cast to int32_t and each is evaluated exactly once.
 * The length is padded by 10 UChars before it is multiplied by maxCharSize
 * (presumably this padding is the allowance for the initial/final bytes).
 *
 * @param length Number of UChars to be converted.
 * @param maxCharSize Return value from ucnv_getMaxCharSize() for the converter
 *                    that will be used.
 * @return Size of a buffer that will be large enough to hold the output bytes of
 *         converting length UChars with the converter that returned the maxCharSize.
 *
 * @see ucnv_getMaxCharSize
 * @draft ICU 2.8
 */
#define UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) \
    ((int32_t)(maxCharSize)*(10+(int32_t)(length)))
/**
* Returns the minimum byte length for characters in this codepage.
* This is either 1 or 2 for all supported codepages.
* This is usually either 1 or 2.
* @param converter the Unicode converter
* @return the minimum number of bytes allowed by this particular converter
* @see ucnv_getMaxCharSize
@ -970,7 +1032,7 @@ ucnv_toUnicode(UConverter *converter,
* It is only useful for whole strings, not for streaming conversion.
*
* The maximum output buffer capacity required (barring output from callbacks) will be
* srcLength*ucnv_getMaxCharSize(cnv).
* UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(cnv)).
*
* @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called)
* @param src the input Unicode string
@ -986,6 +1048,7 @@ ucnv_toUnicode(UConverter *converter,
* and a buffer of the indicated length would need to be passed in
* @see ucnv_fromUnicode
* @see ucnv_convert
* @see UCNV_GET_MAX_BYTES_FOR_STRING
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2

View file

@ -82,6 +82,15 @@ T_CString_stricmp(const char *str1, const char *str2) {
static UConverterFromUCallback otherUnicodeAction(UConverterFromUCallback MIA);
static UConverterToUCallback otherCharAction(UConverterToUCallback MIA);
/*
 * Opens a converter by name for the tests in this file.
 *
 * A name whose first character is '*' is opened with ucnv_openPackage(),
 * passing the result of loadTestData() as the package and the remainder of
 * the name (after the '*') as the converter name.
 * Every other name, including NULL, is passed unchanged to ucnv_open().
 *
 * pErrorCode is handed through to the functions called here.
 */
static UConverter *
cnv_open(const char *name, UErrorCode *pErrorCode) {
    /* no '*' prefix: open the converter directly by its name */
    if(name==NULL || name[0]!='*') {
        return ucnv_open(name, pErrorCode);
    }

    /* '*' prefix: skip the marker and open from the package */
    return ucnv_openPackage(loadTestData(pErrorCode), name+1, pErrorCode);
}
static void ListNames(void);
void TestFlushCache(void); /* defined in cctest.c */
@ -2307,33 +2316,69 @@ static void TestLMBCSMaxChar(void) {
int8_t maxSize;
const char *name;
} converter[] = {
{ 2, "LMBCS-1"},
{ 2, "LMBCS-2"},
{ 2, "LMBCS-3"},
{ 2, "LMBCS-4"},
{ 2, "LMBCS-5"},
{ 2, "LMBCS-6"},
{ 2, "LMBCS-8"},
{ 2, "LMBCS-11"},
{ 2, "LMBCS-16"},
{ 2, "LMBCS-17"},
{ 2, "LMBCS-18"},
{ 2, "LMBCS-19"}
/* some non-LMBCS converters - perfect test setup here */
{ 1, "US-ASCII"},
{ 1, "ISO-8859-1"},
{ 2, "UTF-16"},
{ 2, "UTF-16BE"},
{ 3, "UTF-8"},
{ 3, "CESU-8"},
{ 3, "SCSU"},
{ 4, "UTF-32"},
{ 4, "UTF-7"},
{ 4, "IMAP-mailbox-name"},
{ 4, "BOCU-1"},
{ 1, "windows-1256"},
{ 2, "Shift-JIS"},
{ 2, "ibm-16684"},
{ 3, "ibm-930"},
{ 3, "ibm-1390"},
{ 4, "*test3"},
{ 16,"*test4"},
{ 4, "ISCII"},
{ 4, "HZ"},
{ 3, "ISO-2022"},
{ 3, "ISO-2022-KR"},
{ 6, "ISO-2022-JP"},
{ 8, "ISO-2022-CN"},
/* LMBCS */
{ 3, "LMBCS-1"},
{ 3, "LMBCS-2"},
{ 3, "LMBCS-3"},
{ 3, "LMBCS-4"},
{ 3, "LMBCS-5"},
{ 3, "LMBCS-6"},
{ 3, "LMBCS-8"},
{ 3, "LMBCS-11"},
{ 3, "LMBCS-16"},
{ 3, "LMBCS-17"},
{ 3, "LMBCS-18"},
{ 3, "LMBCS-19"}
};
int32_t idx;
for (idx = 0; idx < LENGTHOF(converter); idx++) {
UErrorCode status = U_ZERO_ERROR;
UConverter *cnv = ucnv_open(converter[idx].name, &status);
UConverter *cnv = cnv_open(converter[idx].name, &status);
if (U_FAILURE(status)) {
continue;
}
if (converter[idx].maxSize != ucnv_getMaxCharSize(cnv)) {
log_data_err("error: for %s expected %d, got %d\n",
log_err("error: ucnv_getMaxCharSize(%s) expected %d, got %d\n",
converter[idx].name, converter[idx].maxSize, ucnv_getMaxCharSize(cnv));
}
ucnv_close(cnv);
}
/* mostly test that the macro compiles */
if(UCNV_GET_MAX_BYTES_FOR_STRING(1, 2)<10) {
log_err("error UCNV_GET_MAX_BYTES_FOR_STRING(1, 2)<10\n");
}
}
@ -2576,6 +2621,3 @@ TestEBCDICSwapLFNL() {
testSwap(tests[i].name, tests[i].swap);
}
}

View file

@ -55,4 +55,10 @@ CHARMAP
# add a mapping that turns the above's Unicode side into a prefix
<U50005><U60006> \x06 |1
# many bytes, and bytes per UChar
<U30ab><U309a> \x01\x02\x03\x0a\x01\x02\x03\x0b\x01\x02\x03\x0c\x01\x02\x03\x0d\x01\x02\x03\x0e\x01\x02\x03\x0f\x01\x02\x03\x0a\x05\x06\x07 |0
# many UChars, and UChars per byte
<U304b><U309a><U304d><U309a><U304f><U309a><U3051><U309a><U3053><U309a><U30ab><U309a><U30ad><U309a><U30af><U309a><U30b1><U309a><U0300> \x08\x09 |0
END CHARMAP

View file

@ -63,6 +63,11 @@ typedef struct CnvExtData {
/* for stage3 compaction of <subchar1> |2 mappings */
uint16_t stage3Sub1Block;
/* statistics */
int32_t
maxInBytes, maxOutBytes, maxBytesPerUChar,
maxInUChars, maxOutUChars, maxUCharsPerByte;
} CnvExtData;
NewConverter *
@ -199,6 +204,16 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
indexes[UCNV_EXT_SIZE]=top;
/* statistics */
indexes[UCNV_EXT_COUNT_BYTES]=
(extData->maxInBytes<<16)|
(extData->maxOutBytes<<8)|
extData->maxBytesPerUChar;
indexes[UCNV_EXT_COUNT_UCHARS]=
(extData->maxInUChars<<16)|
(extData->maxOutUChars<<8)|
extData->maxUCharsPerByte;
/* write the extension data */
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4);
@ -307,11 +322,12 @@ getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
UChar32 *u32;
UChar *u;
uint32_t value;
int32_t u16Length;
int32_t u16Length, ratio;
UErrorCode errorCode;
/* write the Unicode result code point or string index */
if(m->uLen==1) {
u16Length=U16_LENGTH(m->u);
value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u);
} else {
/* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */
@ -340,6 +356,20 @@ getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
if(m->f==0) {
value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG;
}
/* update statistics */
if(m->bLen>extData->maxInBytes) {
extData->maxInBytes=m->bLen;
}
if(u16Length>extData->maxOutUChars) {
extData->maxOutUChars=u16Length;
}
ratio=(u16Length+(m->bLen-1))/m->bLen;
if(ratio>extData->maxUCharsPerByte) {
extData->maxUCharsPerByte=ratio;
}
return value;
}
@ -586,9 +616,18 @@ static uint32_t
getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
uint8_t *bytes, *resultBytes;
uint32_t value;
int32_t u16Length, ratio;
if(m->f==2) {
return UCNV_EXT_FROM_U_SUBCHAR1; /* <subchar1> SUB mapping */
/*
* no mapping, <subchar1> preferred
*
* no need to count in statistics because the subchars are already
* counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData,
* and this non-mapping does not count for maxInUChars which are always
* trivially at least two if counting unmappable supplementary code points
*/
return UCNV_EXT_FROM_U_SUBCHAR1;
}
bytes=UCM_GET_BYTES(table, m);
@ -614,6 +653,27 @@ getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
if(m->f==0) {
value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
}
/* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */
if(m->uLen==1) {
u16Length=U16_LENGTH(m->u);
} else {
u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2);
}
/* update statistics */
if(u16Length>extData->maxInUChars) {
extData->maxInUChars=u16Length;
}
if(m->bLen>extData->maxOutBytes) {
extData->maxOutBytes=m->bLen;
}
ratio=(m->bLen+(u16Length-1))/u16Length;
if(ratio>extData->maxBytesPerUChar) {
extData->maxBytesPerUChar=ratio;
}
return value;
}

View file

@ -597,7 +597,7 @@ static void
readTable(ConvData *data, FileStream* convFile,
UBool forBase, UCMStates *baseStates,
UErrorCode *pErrorCode) {
char line[200];
char line[500];
char *end;
UBool isOK;