ICU-705 add algorithmic US-ASCII converter

X-SVN-Rev: 3286
2025-04-06 22:15:31 +00:00 · 2000-12-20 02:08:39 +00:00 · 2000-12-20 02:08:39 +00:00 · 66544551d6
commit 66544551d6
parent 8d9bdf7a1e
6 changed files with 235 additions and 9 deletions
--- a/icu4c/data/convrtrs.txt
+++ b/icu4c/data/convrtrs.txt
@ -75,6 +75,7 @@ UTF32_PlatformEndian     ISO-10646-UCS-4 { IANA } csUCS4 utf-32 { MIME } ucs-4
 UTF32_OppositeEndian
 SCSU { IANA }
 LATIN_1                  iso-8859-1     { MIME } ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 cp367 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983   #!!!!! There's whole lot of names for this
+US-ASCII { MIME }        ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6
 ISO_2022                         ISO-2022 { MIME } 2022 cp2022
 ISO_2022,locale=ja,version=0     ISO-2022-JP { IANA MIME } csISO2022JP
 ISO_2022,locale=ja,version=1     ISO-2022-JP-1
@ -98,7 +99,7 @@ LMBCS-19

 # Table-based

-ibm-367                 us-ascii { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6
+ibm-367

 # Special mapping for S/390 new line characters
 ebcdic-xml-us
--- a/icu4c/source/common/ucnv_bld.c
+++ b/icu4c/source/common/ucnv_bld.c
@ -51,7 +51,7 @@ converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={
    &_EBCDICStatefulData, &_ISO2022Data, 
    &_LMBCSData1,&_LMBCSData2, &_LMBCSData3, &_LMBCSData4, &_LMBCSData5, &_LMBCSData6,
    &_LMBCSData8,&_LMBCSData11,&_LMBCSData16,&_LMBCSData17,&_LMBCSData18,&_LMBCSData19,
-    &_HZData, &_SCSUData
+    &_HZData, &_SCSUData, &_ASCIIData
 };

 static struct {
@ -92,7 +92,8 @@ static struct {
  { "LMBCS-18",UCNV_LMBCS_18 },
  { "LMBCS-19",UCNV_LMBCS_19 },
  { "HZ",UCNV_HZ },
-  { "SCSU", UCNV_SCSU }
+  { "SCSU", UCNV_SCSU },
+  { "US-ASCII", UCNV_US_ASCII }
 };


--- a/icu4c/source/common/ucnv_cnv.h
+++ b/icu4c/source/common/ucnv_cnv.h
@ -215,7 +215,7 @@ extern const UConverterSharedData
    _EBCDICStatefulData, _ISO2022Data, 
    _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6,
    _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19,
-    _HZData, _SCSUData;
+    _HZData, _SCSUData, _ASCIIData;

 U_CDECL_END

--- a/icu4c/source/common/ucnvlat1.c
+++ b/icu4c/source/common/ucnvlat1.c
@ -110,7 +110,11 @@ _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    targetCapacity=pArgs->targetLimit-pArgs->target;
    offsets=pArgs->offsets;

-    max=0xff; /* ### 0x7f for US-ASCII */
+    if(cnv->sharedData==&_Latin1Data) {
+        max=0xff; /* Latin-1 */
+    } else {
+        max=0x7f; /* US-ASCII */
+    }

    /* get the converter state from UConverter */
    c=cnv->fromUSurrogateLead;
@ -302,18 +306,236 @@ static const UConverterImpl _Latin1Impl={
    NULL
 };

-const UConverterStaticData _Latin1StaticData={
+static const UConverterStaticData _Latin1StaticData={
    sizeof(UConverterStaticData),
    "LATIN_1",
    819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
-    { 0x1a, 0, 0, 0 },1,FALSE, FALSE,
+    { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };

-
 const UConverterSharedData _Latin1Data={
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl, 
    0
 };
+
+/* US-ASCII ----------------------------------------------------------------- */
+
+/* Table-less version of _MBCSSingleToBMPWithOffsets(): copies bytes <=0x7f to UChars 1:1; any other byte goes to the fromCharErrorBehaviour callback as UCNV_ILLEGAL. */
+U_CFUNC void
+_ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+                           UErrorCode *pErrorCode) {
+    const uint8_t *source, *sourceLimit, *lastSource;
+    UChar *target;
+    int32_t targetCapacity, length;
+    int32_t *offsets;
+
+    int32_t sourceIndex;
+    uint8_t b;
+
+    /* set up the local pointers */
+    source=(const uint8_t *)pArgs->source;
+    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
+    target=pArgs->target;
+    targetCapacity=pArgs->targetLimit-pArgs->target;
+    offsets=pArgs->offsets;
+
+    /* sourceIndex is the next offset value to write; lastSource marks the first byte whose offset is not yet written */
+    sourceIndex=0;
+    lastSource=source;
+
+    /*
+     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter:
+     * targetCapacity becomes the minimum of the source length and the target capacity
+     */
+    length=sourceLimit-source;
+    if(length<targetCapacity) {
+        targetCapacity=length;
+    }
+
+    /* conversion loop */
+    while(targetCapacity>0) {
+        b=*source++;
+        if(b<=0x7f) {
+            *target++=b;
+            --targetCapacity;
+        } else {
+            /* call the callback function with all the preparations and post-processing */
+            UConverter *cnv=pArgs->converter;
+
+            /* a byte above 0x7f is reported as illegal */
+            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+
+            /* set offsets since the start or the last callback */
+            if(offsets!=NULL) {
+                int32_t count=(int32_t)(source-lastSource);
+
+                /* predecrement: do not set the offset for the callback-causing character */
+                while(--count>0) {
+                    *offsets++=sourceIndex++;
+                }
+                /* offset and sourceIndex are now set for the current character */
+            }
+
+            /* update the arguments structure */
+            pArgs->source=(const char *)source;
+            pArgs->target=target;
+            pArgs->offsets=offsets;
+
+            /* copy the offending byte to invalidCharBuffer */
+            cnv->invalidCharBuffer[0]=b;
+            cnv->invalidCharLength=1;
+
+            /* call the callback function */
+            cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
+
+            /* update target and deal with offsets if necessary */
+            offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
+            target=pArgs->target;
+
+            /* update the source pointer and index: 1 for the illegal byte plus whatever the callback moved pArgs->source by */
+            sourceIndex+=1+((const uint8_t *)pArgs->source-source);
+            source=lastSource=(const uint8_t *)pArgs->source;
+            targetCapacity=pArgs->targetLimit-target;
+            length=sourceLimit-source;
+            if(length<targetCapacity) {
+                targetCapacity=length;
+            }
+
+            /*
+             * If the callback overflowed the target, then we need to
+             * stop here with an overflow indication.
+             */
+            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
+                break;
+            } else if(U_FAILURE(*pErrorCode)) {
+                /* break on error */
+                break;
+            } else if(cnv->UCharErrorBufferLength>0) {
+                /* UCharErrorBuffer is non-empty: treat the target as full */
+                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+                break;
+            }
+        }
+    }
+
+    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
+        /* input remains but the target is full */
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+    }
+
+    /* set offsets since the start or the last callback */
+    if(offsets!=NULL) {
+        size_t count=source-lastSource;
+        while(count>0) {
+            *offsets++=sourceIndex++;
+            --count;
+        }
+    }
+
+    /* write back the updated pointers */
+    pArgs->source=(const char *)source;
+    pArgs->target=target;
+    pArgs->offsets=offsets;
+}
+
+/* Table-less version of _MBCSSingleGetNextUChar(): returns the next byte if it is <=0x7f, else invokes the fromCharErrorBehaviour callback; returns 0xffff on error or when no output was produced. */
+U_CFUNC UChar32
+_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
+                   UErrorCode *pErrorCode) {
+    UChar buffer[UTF_MAX_CHAR_LENGTH];
+    const uint8_t *source;
+    uint8_t b;
+
+    /* set up the local pointers */
+    source=(const uint8_t *)pArgs->source;
+
+    /* loop until one character is returned; a callback that writes nothing just moves on to the next byte */
+    while(source<(const uint8_t *)pArgs->sourceLimit) {
+        b=*source++;
+        pArgs->source=(const char *)source;
+        if(b<=0x7f) {
+            return b;
+        } else {
+            /* call the callback function with all the preparations and post-processing */
+            UConverter *cnv=pArgs->converter;
+
+            /* a byte above 0x7f is reported as illegal */
+            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+
+            /* redirect callback output into the local buffer (pArgs->target/targetLimit are not restored afterwards) */
+            pArgs->target=buffer;
+            pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
+
+            /* copy the current byte to invalidCharBuffer */
+            cnv->invalidCharBuffer[0]=(char)b;
+            cnv->invalidCharLength=1;
+
+            /* call the callback function */
+            cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
+
+            /* update the source pointer */
+            source=(const uint8_t *)pArgs->source;
+
+            /*
+             * return the first character if the callback wrote some
+             * (the written UChars go to ucnv_getUChar32KeepOverflow(); presumably the rest is kept as overflow - confirm)
+             */
+            if(U_SUCCESS(*pErrorCode)) {
+                int32_t length=pArgs->target-buffer;
+                if(length>0) {
+                    return ucnv_getUChar32KeepOverflow(cnv, buffer, length);
+                }
+                /* else (callback did not write anything) continue */
+            } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
+                *pErrorCode=U_ZERO_ERROR; /* the local buffer filled up: clear the error and deliver the full buffer */
+                return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
+            } else {
+                /* break on error */
+                /* ### what if a callback set an error but _also_ generated output?! */
+                return 0xffff;
+            }
+        }
+    }
+
+    /* no output because of empty input or only skipping callbacks */
+    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+    return 0xffff;
+}
+
+static const UConverterImpl _ASCIIImpl={ /* function table for the algorithmic US-ASCII converter */
+    UCNV_US_ASCII,
+
+    NULL, /* NOTE(review): the NULL slots are presumably optional hooks - confirm against the UConverterImpl declaration */
+    NULL,
+
+    NULL,
+    NULL,
+    NULL,
+
+    _ASCIIToUnicodeWithOffsets, /* the same routine is used for both consecutive to-Unicode slots */
+    _ASCIIToUnicodeWithOffsets,
+    _Latin1FromUnicodeWithOffsets, /* shared with Latin-1: it uses max=0x7f when sharedData is not &_Latin1Data */
+    _Latin1FromUnicodeWithOffsets,
+    _ASCIIGetNextUChar,
+
+    NULL,
+    NULL
+};
+
+static const UConverterStaticData _ASCIIStaticData={ /* static description of the US-ASCII converter */
+    sizeof(UConverterStaticData),
+    "US-ASCII", /* same name as the { "US-ASCII", UCNV_US_ASCII } entry in the ucnv_bld.c name table */
+    367, UCNV_IBM, UCNV_US_ASCII, 1, 1, /* NOTE(review): 367 is presumably the IBM codepage number (cf. ibm-367) and 1, 1 the min/max bytes per char - confirm */
+    { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, /* NOTE(review): presumably substitution byte 0x1a with length 1 - confirm against UConverterStaticData */
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+
+const UConverterSharedData _ASCIIData={ /* listed in converterData[] (ucnv_bld.c) and declared extern in ucnv_cnv.h */
+    sizeof(UConverterSharedData), ~((uint32_t) 0), /* NOTE(review): the all-ones value is presumably a reference counter for static data - confirm */
+    NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl, 
+    0
+};
--- a/icu4c/source/common/unicode/ucnv.h
+++ b/icu4c/source/common/unicode/ucnv.h
@ -77,6 +77,7 @@ typedef enum {
    UCNV_LMBCS_LAST = UCNV_LMBCS_19,
    UCNV_HZ,
    UCNV_SCSU,
+    UCNV_US_ASCII,

    /* Number of converter types for which we have conversion routines. */
    UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES
--- a/icu4c/source/data/mappings/convrtrs.txt
+++ b/icu4c/source/data/mappings/convrtrs.txt
@ -75,6 +75,7 @@ UTF32_PlatformEndian     ISO-10646-UCS-4 { IANA } csUCS4 utf-32 { MIME } ucs-4
 UTF32_OppositeEndian
 SCSU { IANA }
 LATIN_1                  iso-8859-1     { MIME } ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 cp367 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983   #!!!!! There's whole lot of names for this
+US-ASCII { MIME }        ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6
 ISO_2022                         ISO-2022 { MIME } 2022 cp2022
 ISO_2022,locale=ja,version=0     ISO-2022-JP { IANA MIME } csISO2022JP
 ISO_2022,locale=ja,version=1     ISO-2022-JP-1
@ -98,7 +99,7 @@ LMBCS-19

 # Table-based

-ibm-367                 us-ascii { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6
+ibm-367

 # Special mapping for S/390 new line characters
 ebcdic-xml-us