ICU-2949 define and fix ucnv_getMaxCharSize() behavior

X-SVN-Rev: 13538
2025-04-07 22:44:49 +00:00 · 2003-10-30 23:03:18 +00:00 · 2003-10-30 23:03:18 +00:00 · a4d1270530
commit a4d1270530
parent cefe1b98ab
5 changed files with 197 additions and 26 deletions
--- a/icu4c/source/common/unicode/ucnv.h
+++ b/icu4c/source/common/unicode/ucnv.h
@ -580,19 +580,81 @@ U_CAPI void U_EXPORT2
 ucnv_resetFromUnicode(UConverter *converter);

 /**
- * Returns the maximum length of bytes used by a character. This varies 
- * between 1 and 4
- * @param converter the Unicode converter
- * @return the maximum number of bytes allowed by this particular converter
+ * Returns the maximum number of bytes that are output per UChar in conversion
+ * from Unicode using this converter.
+ * The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING
+ * to calculate the size of a target buffer for conversion from Unicode.
+ *
+ * Note: Before ICU 2.8, this function did not return reliable numbers for
+ * some stateful converters (EBCDIC_STATEFUL, ISO-2022) and LMBCS.
+ *
+ * This number may not be the same as the maximum number of bytes per
+ * "conversion unit". In other words, it may not be the intuitively expected
+ * number of bytes per character that would be published for a charset,
+ * and may not fulfill any other purpose than the allocation of an output
+ * buffer of guaranteed sufficient size for a given input length and converter.
+ *
+ * Examples for special cases that are taken into account:
+ * - Supplementary code points may convert to more bytes than BMP code points.
+ *   This function returns bytes per UChar (UTF-16 code unit), not per
+ *   Unicode code point, for efficient buffer allocation.
+ * - State-shifting output (SI/SO, escapes, etc.) from stateful converters.
+ * - When m input UChars are converted to n output bytes, then the maximum n/m
+ *   (bytes per UChar) is taken into account.
+ *
+ * The number returned here does not take into account
+ * (see UCNV_GET_MAX_BYTES_FOR_STRING):
+ * - callbacks which output more than one charset character sequence per call,
+ *   like escape callbacks
+ * - initial and final non-character bytes that are output by some converters
+ *   (automatic BOMs, initial escape sequence, final SI, etc.)
+ *
+ * Examples for returned values:
+ * - SBCS charsets: 1
+ * - Shift-JIS: 2
+ * - UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted)
+ * - UTF-8: 3 (3 per BMP, 4 per surrogate _pair_)
+ * - EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS)
+ * - ISO-2022: 3 (always outputs UTF-8)
+ * - ISO-2022-JP: 6 (4-byte escape sequences + DBCS)
+ * - ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3 + DBCS)
+ *
+ * @param converter The Unicode converter.
+ * @return The maximum number of bytes per UChar that are output by ucnv_fromUnicode(),
+ *         to be used together with UCNV_GET_MAX_BYTES_FOR_STRING for buffer allocation.
+ *
+ * @see UCNV_GET_MAX_BYTES_FOR_STRING
 * @see ucnv_getMinCharSize
 * @stable ICU 2.0
 */
 U_CAPI int8_t U_EXPORT2
 ucnv_getMaxCharSize(const UConverter *converter);

+/**
+ * Calculates the size of a buffer for conversion from Unicode to a charset.
+ * The calculated size is guaranteed to be sufficient for this conversion.
+ *
+ * It takes into account initial and final non-character bytes that are output
+ * by some converters.
+ * It does not take into account callbacks which output more than one charset
+ * character sequence per call, like escape callbacks.
+ * The default (substitution) callback only outputs one charset character sequence.
+ *
+ * @param length Number of UChars to be converted.
+ * @param maxCharSize Return value from ucnv_getMaxCharSize() for the converter
+ *                    that will be used.
+ * @return Size of a buffer that will be large enough to hold the output bytes of
+ *         converting length UChars with the converter that returned the maxCharSize.
+ *
+ * @see ucnv_getMaxCharSize
+ * @draft ICU 2.8
+ */
+#define UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) \
+     (((int32_t)(length)+10)*(int32_t)(maxCharSize))
+
 /**
 * Returns the minimum byte length for characters in this codepage. 
- * This is either 1 or 2 for all supported codepages.
+ * This is usually either 1 or 2.
 * @param converter the Unicode converter
 * @return the minimum number of bytes allowed by this particular converter
 * @see ucnv_getMaxCharSize
@ -970,7 +1032,7 @@ ucnv_toUnicode(UConverter *converter,
 * It is only useful for whole strings, not for streaming conversion.
 *
 * The maximum output buffer capacity required (barring output from callbacks) will be
- * srcLength*ucnv_getMaxCharSize(cnv).
+ * UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(cnv)).
 *
 * @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called)
 * @param src the input Unicode string
@ -986,6 +1048,7 @@ ucnv_toUnicode(UConverter *converter,
 *         and a buffer of the indicated length would need to be passed in
 * @see ucnv_fromUnicode
 * @see ucnv_convert
+ * @see UCNV_GET_MAX_BYTES_FOR_STRING
 * @stable ICU 2.0
 */
 U_CAPI int32_t U_EXPORT2
--- a/icu4c/source/test/cintltst/ccapitst.c
+++ b/icu4c/source/test/cintltst/ccapitst.c
@ -82,6 +82,15 @@ T_CString_stricmp(const char *str1, const char *str2) {
 static UConverterFromUCallback otherUnicodeAction(UConverterFromUCallback MIA);
 static UConverterToUCallback otherCharAction(UConverterToUCallback MIA);

+static UConverter *
+cnv_open(const char *name, UErrorCode *pErrorCode) {
+    if(name!=NULL && name[0]=='*') {
+        return ucnv_openPackage(loadTestData(pErrorCode), name+1, pErrorCode);
+    } else {
+        return ucnv_open(name, pErrorCode);
+    }
+}
+

 static void ListNames(void);
       void TestFlushCache(void);	/* defined in cctest.c */
@ -2307,33 +2316,69 @@ static void TestLMBCSMaxChar(void) {
        int8_t maxSize;
        const char *name;
    } converter[] = {
-        { 2, "LMBCS-1"},
-        { 2, "LMBCS-2"},
-        { 2, "LMBCS-3"},
-        { 2, "LMBCS-4"},
-        { 2, "LMBCS-5"},
-        { 2, "LMBCS-6"},
-        { 2, "LMBCS-8"},
-        { 2, "LMBCS-11"},
-        { 2, "LMBCS-16"},
-        { 2, "LMBCS-17"},
-        { 2, "LMBCS-18"},
-        { 2, "LMBCS-19"}
+        /* some non-LMBCS converters - perfect test setup here */
+        { 1, "US-ASCII"},
+        { 1, "ISO-8859-1"},
+
+        { 2, "UTF-16"},
+        { 2, "UTF-16BE"},
+        { 3, "UTF-8"},
+        { 3, "CESU-8"},
+        { 3, "SCSU"},
+        { 4, "UTF-32"},
+        { 4, "UTF-7"},
+        { 4, "IMAP-mailbox-name"},
+        { 4, "BOCU-1"},
+
+        { 1, "windows-1256"},
+        { 2, "Shift-JIS"},
+        { 2, "ibm-16684"},
+        { 3, "ibm-930"},
+        { 3, "ibm-1390"},
+        { 4, "*test3"},
+        { 16,"*test4"},
+
+        { 4, "ISCII"},
+        { 4, "HZ"},
+
+        { 3, "ISO-2022"},
+        { 3, "ISO-2022-KR"},
+        { 6, "ISO-2022-JP"},
+        { 8, "ISO-2022-CN"},
+
+        /* LMBCS */
+        { 3, "LMBCS-1"},
+        { 3, "LMBCS-2"},
+        { 3, "LMBCS-3"},
+        { 3, "LMBCS-4"},
+        { 3, "LMBCS-5"},
+        { 3, "LMBCS-6"},
+        { 3, "LMBCS-8"},
+        { 3, "LMBCS-11"},
+        { 3, "LMBCS-16"},
+        { 3, "LMBCS-17"},
+        { 3, "LMBCS-18"},
+        { 3, "LMBCS-19"}
    };
    int32_t idx;

    for (idx = 0; idx < LENGTHOF(converter); idx++) {
        UErrorCode status = U_ZERO_ERROR;
-        UConverter *cnv = ucnv_open(converter[idx].name, &status);
+        UConverter *cnv = cnv_open(converter[idx].name, &status);
        if (U_FAILURE(status)) {
            continue;
        }
        if (converter[idx].maxSize != ucnv_getMaxCharSize(cnv)) {
-            log_data_err("error: for %s expected %d, got %d\n",
+            log_err("error: ucnv_getMaxCharSize(%s) expected %d, got %d\n",
                converter[idx].name, converter[idx].maxSize, ucnv_getMaxCharSize(cnv));
        }
        ucnv_close(cnv);
    }
+
+    /* mostly test that the macro compiles */
+    if(UCNV_GET_MAX_BYTES_FOR_STRING(1, 2)<10) {
+        log_err("error UCNV_GET_MAX_BYTES_FOR_STRING(1, 2)<10\n");
+    }
 }


@ -2576,6 +2621,3 @@ TestEBCDICSwapLFNL() {
        testSwap(tests[i].name, tests[i].swap);
    }
 }
-
-
-
--- a/icu4c/source/test/testdata/test4.ucm
+++ b/icu4c/source/test/testdata/test4.ucm
@ -55,4 +55,10 @@ CHARMAP
 # add a mapping that turns the above's Unicode side into a prefix
 <U50005><U60006> \x06 |1

+# many bytes, and bytes per UChar
+<U30ab><U309a> \x01\x02\x03\x0a\x01\x02\x03\x0b\x01\x02\x03\x0c\x01\x02\x03\x0d\x01\x02\x03\x0e\x01\x02\x03\x0f\x01\x02\x03\x0a\x05\x06\x07 |0
+
+# many UChars, and UChars per byte
+<U304b><U309a><U304d><U309a><U304f><U309a><U3051><U309a><U3053><U309a><U30ab><U309a><U30ad><U309a><U30af><U309a><U30b1><U309a><U0300> \x08\x09 |0
+
 END CHARMAP
--- a/icu4c/source/tools/makeconv/gencnvex.c
+++ b/icu4c/source/tools/makeconv/gencnvex.c
@ -63,6 +63,11 @@ typedef struct CnvExtData {

    /* for stage3 compaction of <subchar1> |2 mappings */
    uint16_t stage3Sub1Block;
+
+    /* statistics */
+    int32_t
+        maxInBytes, maxOutBytes, maxBytesPerUChar,
+        maxInUChars, maxOutUChars, maxUCharsPerByte;
 } CnvExtData;

 NewConverter *
@ -199,6 +204,16 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,

    indexes[UCNV_EXT_SIZE]=top;

+    /* statistics */
+    indexes[UCNV_EXT_COUNT_BYTES]=
+        (extData->maxInBytes<<16)|
+        (extData->maxOutBytes<<8)|
+        extData->maxBytesPerUChar;
+    indexes[UCNV_EXT_COUNT_UCHARS]=
+        (extData->maxInUChars<<16)|
+        (extData->maxOutUChars<<8)|
+        extData->maxUCharsPerByte;
+
    /* write the extension data */
    udata_writeBlock(pData, indexes, sizeof(indexes));
    udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4);
@ -307,11 +322,12 @@ getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    UChar32 *u32;
    UChar *u;
    uint32_t value;
-    int32_t u16Length;
+    int32_t u16Length, ratio;
    UErrorCode errorCode;

    /* write the Unicode result code point or string index */
    if(m->uLen==1) {
+        u16Length=U16_LENGTH(m->u);
        value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u);
    } else {
        /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */
@ -340,6 +356,20 @@ getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    if(m->f==0) {
        value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG;
    }
+
+    /* update statistics */
+    if(m->bLen>extData->maxInBytes) {
+        extData->maxInBytes=m->bLen;
+    }
+    if(u16Length>extData->maxOutUChars) {
+        extData->maxOutUChars=u16Length;
+    }
+
+    ratio=(u16Length+(m->bLen-1))/m->bLen;
+    if(ratio>extData->maxUCharsPerByte) {
+        extData->maxUCharsPerByte=ratio;
+    }
+
    return value;
 }

@ -586,9 +616,18 @@ static uint32_t
 getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    uint8_t *bytes, *resultBytes;
    uint32_t value;
+    int32_t u16Length, ratio;

    if(m->f==2) {
-        return UCNV_EXT_FROM_U_SUBCHAR1; /* <subchar1> SUB mapping */
+        /*
+         * no mapping, <subchar1> preferred
+         *
+         * no need to count in statistics because the subchars are already
+         * counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData,
+         * and this non-mapping does not count for maxInUChars which are always
+         * trivially at least two if counting unmappable supplementary code points
+         */
+        return UCNV_EXT_FROM_U_SUBCHAR1;
    }

    bytes=UCM_GET_BYTES(table, m);
@ -614,6 +653,27 @@ getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    if(m->f==0) {
        value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
    }
+
+    /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */
+    if(m->uLen==1) {
+        u16Length=U16_LENGTH(m->u);
+    } else {
+        u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2);
+    }
+
+    /* update statistics */
+    if(u16Length>extData->maxInUChars) {
+        extData->maxInUChars=u16Length;
+    }
+    if(m->bLen>extData->maxOutBytes) {
+        extData->maxOutBytes=m->bLen;
+    }
+
+    ratio=(m->bLen+(u16Length-1))/u16Length;
+    if(ratio>extData->maxBytesPerUChar) {
+        extData->maxBytesPerUChar=ratio;
+    }
+
    return value;
 }

--- a/icu4c/source/tools/makeconv/makeconv.c
+++ b/icu4c/source/tools/makeconv/makeconv.c
@ -597,7 +597,7 @@ static void
 readTable(ConvData *data, FileStream* convFile,
          UBool forBase, UCMStates *baseStates,
          UErrorCode *pErrorCode) {
-    char line[200];
+    char line[500];
    char *end;
    UBool isOK;