mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-2949 define and fix ucnv_getMaxCharSize() behavior
X-SVN-Rev: 13538
This commit is contained in:
parent
cefe1b98ab
commit
a4d1270530
5 changed files with 197 additions and 26 deletions
|
@ -580,19 +580,81 @@ U_CAPI void U_EXPORT2
|
|||
ucnv_resetFromUnicode(UConverter *converter);
|
||||
|
||||
/**
|
||||
* Returns the maximum length of bytes used by a character. This varies
|
||||
* between 1 and 4
|
||||
* @param converter the Unicode converter
|
||||
* @return the maximum number of bytes allowed by this particular converter
|
||||
* Returns the maximum number of bytes that are output per UChar in conversion
|
||||
* from Unicode using this converter.
|
||||
* The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING
|
||||
* to calculate the size of a target buffer for conversion from Unicode.
|
||||
*
|
||||
* Note: Before ICU 2.8, this function did not return reliable numbers for
|
||||
* some stateful converters (EBCDIC_STATEFUL, ISO-2022) and LMBCS.
|
||||
*
|
||||
* This number may not be the same as the maximum number of bytes per
|
||||
* "conversion unit". In other words, it may not be the intuitively expected
|
||||
* number of bytes per character that would be published for a charset,
|
||||
* and may not fulfill any other purpose than the allocation of an output
|
||||
* buffer of guaranteed sufficient size for a given input length and converter.
|
||||
*
|
||||
* Examples for special cases that are taken into account:
|
||||
* - Supplementary code points may convert to more bytes than BMP code points.
|
||||
* This function returns bytes per UChar (UTF-16 code unit), not per
|
||||
* Unicode code point, for efficient buffer allocation.
|
||||
* - State-shifting output (SI/SO, escapes, etc.) from stateful converters.
|
||||
* - When m input UChars are converted to n output bytes, then the maximum n/m
|
||||
* is taken into account.
|
||||
*
|
||||
* The number returned here does not take into account
|
||||
* (see UCNV_GET_MAX_BYTES_FOR_STRING):
|
||||
* - callbacks which output more than one charset character sequence per call,
|
||||
* like escape callbacks
|
||||
* - initial and final non-character bytes that are output by some converters
|
||||
* (automatic BOMs, initial escape sequence, final SI, etc.)
|
||||
*
|
||||
* Examples for returned values:
|
||||
* - SBCS charsets: 1
|
||||
* - Shift-JIS: 2
|
||||
* - UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted)
|
||||
* - UTF-8: 3 (3 per BMP, 4 per surrogate _pair_)
|
||||
* - EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS)
|
||||
* - ISO-2022: 3 (always outputs UTF-8)
|
||||
* - ISO-2022-JP: 6 (4-byte escape sequences + DBCS)
|
||||
* - ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3 + DBCS)
|
||||
*
|
||||
* @param converter The Unicode converter.
|
||||
* @return The maximum number of bytes per UChar that are output by ucnv_fromUnicode(),
|
||||
* to be used together with UCNV_GET_MAX_BYTES_FOR_STRING for buffer allocation.
|
||||
*
|
||||
* @see UCNV_GET_MAX_BYTES_FOR_STRING
|
||||
* @see ucnv_getMinCharSize
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI int8_t U_EXPORT2
|
||||
ucnv_getMaxCharSize(const UConverter *converter);
|
||||
|
||||
/**
|
||||
* Calculates the size of a buffer for conversion from Unicode to a charset.
|
||||
* The calculated size is guaranteed to be sufficient for this conversion.
|
||||
*
|
||||
* It takes into account initial and final non-character bytes that are output
|
||||
* by some converters.
|
||||
* It does not take into account callbacks which output more than one charset
|
||||
* character sequence per call, like escape callbacks.
|
||||
* The default (substitution) callback only outputs one charset character sequence.
|
||||
*
|
||||
* @param length Number of UChars to be converted.
|
||||
* @param maxCharSize Return value from ucnv_getMaxCharSize() for the converter
|
||||
* that will be used.
|
||||
* @return Size of a buffer that will be large enough to hold the output bytes of
|
||||
* converting length UChars with the converter that returned the maxCharSize.
|
||||
*
|
||||
* @see ucnv_getMaxCharSize
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
#define UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) \
|
||||
(((int32_t)(length)+10)*(int32_t)(maxCharSize))
|
||||
|
||||
/**
|
||||
* Returns the minimum byte length for characters in this codepage.
|
||||
* This is either 1 or 2 for all supported codepages.
|
||||
* This is usually either 1 or 2.
|
||||
* @param converter the Unicode converter
|
||||
* @return the minimum number of bytes allowed by this particular converter
|
||||
* @see ucnv_getMaxCharSize
|
||||
|
@ -970,7 +1032,7 @@ ucnv_toUnicode(UConverter *converter,
|
|||
* It is only useful for whole strings, not for streaming conversion.
|
||||
*
|
||||
* The maximum output buffer capacity required (barring output from callbacks) will be
|
||||
* srcLength*ucnv_getMaxCharSize(cnv).
|
||||
* UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(cnv)).
|
||||
*
|
||||
* @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called)
|
||||
* @param src the input Unicode string
|
||||
|
@ -986,6 +1048,7 @@ ucnv_toUnicode(UConverter *converter,
|
|||
* and a buffer of the indicated length would need to be passed in
|
||||
* @see ucnv_fromUnicode
|
||||
* @see ucnv_convert
|
||||
* @see UCNV_GET_MAX_BYTES_FOR_STRING
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
|
|
|
@ -82,6 +82,15 @@ T_CString_stricmp(const char *str1, const char *str2) {
|
|||
static UConverterFromUCallback otherUnicodeAction(UConverterFromUCallback MIA);
|
||||
static UConverterToUCallback otherCharAction(UConverterToUCallback MIA);
|
||||
|
||||
/*
 * Test helper that opens a converter by name.
 * If name is non-NULL and starts with '*', the rest of the name (name+1) is
 * opened with ucnv_openPackage() from the package path returned by
 * loadTestData(); otherwise name (which may be NULL) is passed unchanged to
 * ucnv_open(). pErrorCode is handed through to whichever function is called,
 * and that function's result is returned as is.
 */
static UConverter *
|
||||
cnv_open(const char *name, UErrorCode *pErrorCode) {
|
||||
if(name!=NULL && name[0]=='*') {
|
||||
return ucnv_openPackage(loadTestData(pErrorCode), name+1, pErrorCode);
|
||||
} else {
|
||||
return ucnv_open(name, pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void ListNames(void);
|
||||
void TestFlushCache(void); /* defined in cctest.c */
|
||||
|
@ -2307,33 +2316,69 @@ static void TestLMBCSMaxChar(void) {
|
|||
int8_t maxSize;
|
||||
const char *name;
|
||||
} converter[] = {
|
||||
{ 2, "LMBCS-1"},
|
||||
{ 2, "LMBCS-2"},
|
||||
{ 2, "LMBCS-3"},
|
||||
{ 2, "LMBCS-4"},
|
||||
{ 2, "LMBCS-5"},
|
||||
{ 2, "LMBCS-6"},
|
||||
{ 2, "LMBCS-8"},
|
||||
{ 2, "LMBCS-11"},
|
||||
{ 2, "LMBCS-16"},
|
||||
{ 2, "LMBCS-17"},
|
||||
{ 2, "LMBCS-18"},
|
||||
{ 2, "LMBCS-19"}
|
||||
/* some non-LMBCS converters - perfect test setup here */
|
||||
{ 1, "US-ASCII"},
|
||||
{ 1, "ISO-8859-1"},
|
||||
|
||||
{ 2, "UTF-16"},
|
||||
{ 2, "UTF-16BE"},
|
||||
{ 3, "UTF-8"},
|
||||
{ 3, "CESU-8"},
|
||||
{ 3, "SCSU"},
|
||||
{ 4, "UTF-32"},
|
||||
{ 4, "UTF-7"},
|
||||
{ 4, "IMAP-mailbox-name"},
|
||||
{ 4, "BOCU-1"},
|
||||
|
||||
{ 1, "windows-1256"},
|
||||
{ 2, "Shift-JIS"},
|
||||
{ 2, "ibm-16684"},
|
||||
{ 3, "ibm-930"},
|
||||
{ 3, "ibm-1390"},
|
||||
{ 4, "*test3"},
|
||||
{ 16,"*test4"},
|
||||
|
||||
{ 4, "ISCII"},
|
||||
{ 4, "HZ"},
|
||||
|
||||
{ 3, "ISO-2022"},
|
||||
{ 3, "ISO-2022-KR"},
|
||||
{ 6, "ISO-2022-JP"},
|
||||
{ 8, "ISO-2022-CN"},
|
||||
|
||||
/* LMBCS */
|
||||
{ 3, "LMBCS-1"},
|
||||
{ 3, "LMBCS-2"},
|
||||
{ 3, "LMBCS-3"},
|
||||
{ 3, "LMBCS-4"},
|
||||
{ 3, "LMBCS-5"},
|
||||
{ 3, "LMBCS-6"},
|
||||
{ 3, "LMBCS-8"},
|
||||
{ 3, "LMBCS-11"},
|
||||
{ 3, "LMBCS-16"},
|
||||
{ 3, "LMBCS-17"},
|
||||
{ 3, "LMBCS-18"},
|
||||
{ 3, "LMBCS-19"}
|
||||
};
|
||||
int32_t idx;
|
||||
|
||||
for (idx = 0; idx < LENGTHOF(converter); idx++) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UConverter *cnv = ucnv_open(converter[idx].name, &status);
|
||||
UConverter *cnv = cnv_open(converter[idx].name, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
continue;
|
||||
}
|
||||
if (converter[idx].maxSize != ucnv_getMaxCharSize(cnv)) {
|
||||
log_data_err("error: for %s expected %d, got %d\n",
|
||||
log_err("error: ucnv_getMaxCharSize(%s) expected %d, got %d\n",
|
||||
converter[idx].name, converter[idx].maxSize, ucnv_getMaxCharSize(cnv));
|
||||
}
|
||||
ucnv_close(cnv);
|
||||
}
|
||||
|
||||
/* mostly test that the macro compiles */
|
||||
if(UCNV_GET_MAX_BYTES_FOR_STRING(1, 2)<10) {
|
||||
log_err("error UCNV_GET_MAX_BYTES_FOR_STRING(1, 2)<10\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -2576,6 +2621,3 @@ TestEBCDICSwapLFNL() {
|
|||
testSwap(tests[i].name, tests[i].swap);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
6
icu4c/source/test/testdata/test4.ucm
vendored
6
icu4c/source/test/testdata/test4.ucm
vendored
|
@ -55,4 +55,10 @@ CHARMAP
|
|||
# add a mapping that turns the above's Unicode side into a prefix
|
||||
<U50005><U60006> \x06 |1
|
||||
|
||||
# many bytes, and bytes per UChar
|
||||
<U30ab><U309a> \x01\x02\x03\x0a\x01\x02\x03\x0b\x01\x02\x03\x0c\x01\x02\x03\x0d\x01\x02\x03\x0e\x01\x02\x03\x0f\x01\x02\x03\x0a\x05\x06\x07 |0
|
||||
|
||||
# many UChars, and UChars per byte
|
||||
<U304b><U309a><U304d><U309a><U304f><U309a><U3051><U309a><U3053><U309a><U30ab><U309a><U30ad><U309a><U30af><U309a><U30b1><U309a><U0300> \x08\x09 |0
|
||||
|
||||
END CHARMAP
|
||||
|
|
|
@ -63,6 +63,11 @@ typedef struct CnvExtData {
|
|||
|
||||
/* for stage3 compaction of <subchar1> |2 mappings */
|
||||
uint16_t stage3Sub1Block;
|
||||
|
||||
/* statistics */
|
||||
int32_t
|
||||
maxInBytes, maxOutBytes, maxBytesPerUChar,
|
||||
maxInUChars, maxOutUChars, maxUCharsPerByte;
|
||||
} CnvExtData;
|
||||
|
||||
NewConverter *
|
||||
|
@ -199,6 +204,16 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
|||
|
||||
indexes[UCNV_EXT_SIZE]=top;
|
||||
|
||||
/* statistics */
|
||||
indexes[UCNV_EXT_COUNT_BYTES]=
|
||||
(extData->maxInBytes<<16)|
|
||||
(extData->maxOutBytes<<8)|
|
||||
extData->maxBytesPerUChar;
|
||||
indexes[UCNV_EXT_COUNT_UCHARS]=
|
||||
(extData->maxInUChars<<16)|
|
||||
(extData->maxOutUChars<<8)|
|
||||
extData->maxUCharsPerByte;
|
||||
|
||||
/* write the extension data */
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4);
|
||||
|
@ -307,11 +322,12 @@ getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
|
|||
UChar32 *u32;
|
||||
UChar *u;
|
||||
uint32_t value;
|
||||
int32_t u16Length;
|
||||
int32_t u16Length, ratio;
|
||||
UErrorCode errorCode;
|
||||
|
||||
/* write the Unicode result code point or string index */
|
||||
if(m->uLen==1) {
|
||||
u16Length=U16_LENGTH(m->u);
|
||||
value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u);
|
||||
} else {
|
||||
/* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */
|
||||
|
@ -340,6 +356,20 @@ getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
|
|||
if(m->f==0) {
|
||||
value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG;
|
||||
}
|
||||
|
||||
/* update statistics */
|
||||
if(m->bLen>extData->maxInBytes) {
|
||||
extData->maxInBytes=m->bLen;
|
||||
}
|
||||
if(u16Length>extData->maxOutUChars) {
|
||||
extData->maxOutUChars=u16Length;
|
||||
}
|
||||
|
||||
ratio=(u16Length+(m->bLen-1))/m->bLen;
|
||||
if(ratio>extData->maxUCharsPerByte) {
|
||||
extData->maxUCharsPerByte=ratio;
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
|
@ -586,9 +616,18 @@ static uint32_t
|
|||
getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
|
||||
uint8_t *bytes, *resultBytes;
|
||||
uint32_t value;
|
||||
int32_t u16Length, ratio;
|
||||
|
||||
if(m->f==2) {
|
||||
return UCNV_EXT_FROM_U_SUBCHAR1; /* <subchar1> SUB mapping */
|
||||
/*
|
||||
* no mapping, <subchar1> preferred
|
||||
*
|
||||
* no need to count in statistics because the subchars are already
|
||||
* counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData,
|
||||
* and this non-mapping does not count for maxInUChars which are always
|
||||
* trivially at least two if counting unmappable supplementary code points
|
||||
*/
|
||||
return UCNV_EXT_FROM_U_SUBCHAR1;
|
||||
}
|
||||
|
||||
bytes=UCM_GET_BYTES(table, m);
|
||||
|
@ -614,6 +653,27 @@ getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
|
|||
if(m->f==0) {
|
||||
value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
|
||||
}
|
||||
|
||||
/* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */
|
||||
if(m->uLen==1) {
|
||||
u16Length=U16_LENGTH(m->u);
|
||||
} else {
|
||||
u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2);
|
||||
}
|
||||
|
||||
/* update statistics */
|
||||
if(u16Length>extData->maxInUChars) {
|
||||
extData->maxInUChars=u16Length;
|
||||
}
|
||||
if(m->bLen>extData->maxOutBytes) {
|
||||
extData->maxOutBytes=m->bLen;
|
||||
}
|
||||
|
||||
ratio=(m->bLen+(u16Length-1))/u16Length;
|
||||
if(ratio>extData->maxBytesPerUChar) {
|
||||
extData->maxBytesPerUChar=ratio;
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
|
|
|
@ -597,7 +597,7 @@ static void
|
|||
readTable(ConvData *data, FileStream* convFile,
|
||||
UBool forBase, UCMStates *baseStates,
|
||||
UErrorCode *pErrorCode) {
|
||||
char line[200];
|
||||
char line[500];
|
||||
char *end;
|
||||
UBool isOK;
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue