diff --git a/icu4c/data/convrtrs.txt b/icu4c/data/convrtrs.txt index 359c7259229..74178b77c66 100644 --- a/icu4c/data/convrtrs.txt +++ b/icu4c/data/convrtrs.txt @@ -75,6 +75,7 @@ UTF32_PlatformEndian ISO-10646-UCS-4 { IANA } csUCS4 utf-32 { MIME } ucs-4 UTF32_OppositeEndian SCSU { IANA } LATIN_1 iso-8859-1 { MIME } ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 cp367 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983 #!!!!! There's whole lot of names for this +US-ASCII { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6 ISO_2022 ISO-2022 { MIME } 2022 cp2022 ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA MIME } csISO2022JP ISO_2022,locale=ja,version=1 ISO-2022-JP-1 @@ -98,7 +99,7 @@ LMBCS-19 # Table-based -ibm-367 us-ascii { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6 +ibm-367 # Special mapping for S/390 new line characters ebcdic-xml-us diff --git a/icu4c/source/common/ucnv_bld.c b/icu4c/source/common/ucnv_bld.c index f65d2027952..617e2eebef2 100644 --- a/icu4c/source/common/ucnv_bld.c +++ b/icu4c/source/common/ucnv_bld.c @@ -51,7 +51,7 @@ converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={ &_EBCDICStatefulData, &_ISO2022Data, &_LMBCSData1,&_LMBCSData2, &_LMBCSData3, &_LMBCSData4, &_LMBCSData5, &_LMBCSData6, &_LMBCSData8,&_LMBCSData11,&_LMBCSData16,&_LMBCSData17,&_LMBCSData18,&_LMBCSData19, - &_HZData, &_SCSUData + &_HZData, &_SCSUData, &_ASCIIData }; static struct { @@ -92,7 +92,8 @@ static struct { { "LMBCS-18",UCNV_LMBCS_18 }, { "LMBCS-19",UCNV_LMBCS_19 }, { "HZ",UCNV_HZ }, - { "SCSU", UCNV_SCSU } + { "SCSU", UCNV_SCSU }, + { "US-ASCII", UCNV_US_ASCII } }; diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h index 876c51ca132..91a462f428f 100644 --- a/icu4c/source/common/ucnv_cnv.h +++ b/icu4c/source/common/ucnv_cnv.h @@ -215,7 +215,7 @@ extern const UConverterSharedData 
_EBCDICStatefulData, _ISO2022Data, _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6, _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19, - _HZData, _SCSUData; + _HZData, _SCSUData, _ASCIIData; U_CDECL_END diff --git a/icu4c/source/common/ucnvlat1.c b/icu4c/source/common/ucnvlat1.c index 6f35361efff..3c4a15528b9 100644 --- a/icu4c/source/common/ucnvlat1.c +++ b/icu4c/source/common/ucnvlat1.c @@ -110,7 +110,11 @@ _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, targetCapacity=pArgs->targetLimit-pArgs->target; offsets=pArgs->offsets; - max=0xff; /* ### 0x7f for US-ASCII */ + if(cnv->sharedData==&_Latin1Data) { + max=0xff; /* Latin-1 */ + } else { + max=0x7f; /* US-ASCII */ + } /* get the converter state from UConverter */ c=cnv->fromUSurrogateLead; @@ -302,18 +306,236 @@ static const UConverterImpl _Latin1Impl={ NULL }; -const UConverterStaticData _Latin1StaticData={ +static const UConverterStaticData _Latin1StaticData={ sizeof(UConverterStaticData), "LATIN_1", 819, UCNV_IBM, UCNV_LATIN_1, 1, 1, - { 0x1a, 0, 0, 0 },1,FALSE, FALSE, + { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, 0, { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ }; - const UConverterSharedData _Latin1Data={ sizeof(UConverterSharedData), ~((uint32_t) 0), NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl, 0 }; + +/* US-ASCII ----------------------------------------------------------------- */ + +/* This is a table-less version of _MBCSSingleToBMPWithOffsets(). 
*/ +U_CFUNC void +_ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + const uint8_t *source, *sourceLimit, *lastSource; + UChar *target; + int32_t targetCapacity, length; + int32_t *offsets; + + int32_t sourceIndex; + uint8_t b; + + /* set up the local pointers */ + source=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + target=pArgs->target; + targetCapacity=pArgs->targetLimit-pArgs->target; + offsets=pArgs->offsets; + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex=0; + lastSource=source; + + /* + * since the conversion here is 1:1 UChar:uint8_t, we need only one counter + * for the minimum of the sourceLength and targetCapacity + */ + length=sourceLimit-source; + if(length<targetCapacity) { + targetCapacity=length; + } + + /* conversion loop */ + while(targetCapacity>0) { + b=*source++; + if(b<=0x7f) { + *target++=b; + --targetCapacity; + } else { + /* call the callback function with all the preparations and post-processing */ + UConverter *cnv=pArgs->converter; + + /* callback(illegal) */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + + /* set offsets since the start or the last callback */ + if(offsets!=NULL) { + int32_t count=(int32_t)(source-lastSource); + + /* predecrement: do not set the offset for the callback-causing character */ + while(--count>0) { + *offsets++=sourceIndex++; + } + /* offset and sourceIndex are now set for the current character */ + } + + /* update the arguments structure */ + pArgs->source=(const char *)source; + pArgs->target=target; + pArgs->offsets=offsets; + + /* copy the current bytes to invalidCharBuffer */ + cnv->invalidCharBuffer[0]=b; + cnv->invalidCharLength=1; + + /* call the callback function */ + cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode); + + /* update target and deal with offsets if necessary */ + offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex); + target=pArgs->target; + + /* update the source pointer and index */ + 
sourceIndex+=1+((const uint8_t *)pArgs->source-source); + source=lastSource=(const uint8_t *)pArgs->source; + targetCapacity=pArgs->targetLimit-target; + length=sourceLimit-source; + if(length<targetCapacity) { + targetCapacity=length; + } + + /* + * If the callback overflowed the target, then we need to + * stop here with an overflow indication. + */ + if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { + break; + } else if(U_FAILURE(*pErrorCode)) { + /* break on error */ + break; + } else if(cnv->UCharErrorBufferLength>0) { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + break; + } + } + } + + if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) { + /* target is full */ + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + + /* set offsets since the start or the last callback */ + if(offsets!=NULL) { + size_t count=source-lastSource; + while(count>0) { + *offsets++=sourceIndex++; + --count; + } + } + + /* write back the updated pointers */ + pArgs->source=(const char *)source; + pArgs->target=target; + pArgs->offsets=offsets; +} + +/* This is a table-less version of _MBCSSingleGetNextUChar(). */ +U_CFUNC UChar32 +_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UChar buffer[UTF_MAX_CHAR_LENGTH]; + const uint8_t *source; + uint8_t b; + + /* set up the local pointers */ + source=(const uint8_t *)pArgs->source; + + /* conversion loop */ + while(source<(const uint8_t *)pArgs->sourceLimit) { + b=*source++; + pArgs->source=(const char *)source; + if(b<=0x7f) { + return b; + } else { + /* call the callback function with all the preparations and post-processing */ + UConverter *cnv=pArgs->converter; + + /* callback(illegal) */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + + /* update the arguments structure */ + pArgs->target=buffer; + pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH; + + /* copy the current byte to invalidCharBuffer */ + cnv->invalidCharBuffer[0]=(char)b; + cnv->invalidCharLength=1; + + /* call the callback function */ + cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode); + + /* update the source pointer */ + source=(const uint8_t *)pArgs->source; + + /* + * return the first character if the callback wrote some + * we do not need to goto finish because the converter state is 
already set + */ + if(U_SUCCESS(*pErrorCode)) { + int32_t length=pArgs->target-buffer; + if(length>0) { + return ucnv_getUChar32KeepOverflow(cnv, buffer, length); + } + /* else (callback did not write anything) continue */ + } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { + *pErrorCode=U_ZERO_ERROR; + return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH); + } else { + /* break on error */ + /* ### what if a callback set an error but _also_ generated output?! */ + return 0xffff; + } + } + } + + /* no output because of empty input or only skipping callbacks */ + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0xffff; +} + +static const UConverterImpl _ASCIIImpl={ + UCNV_US_ASCII, + + NULL, + NULL, + + NULL, + NULL, + NULL, + + _ASCIIToUnicodeWithOffsets, + _ASCIIToUnicodeWithOffsets, + _Latin1FromUnicodeWithOffsets, + _Latin1FromUnicodeWithOffsets, + _ASCIIGetNextUChar, + + NULL, + NULL +}; + +static const UConverterStaticData _ASCIIStaticData={ + sizeof(UConverterStaticData), + "US-ASCII", + 367, UCNV_IBM, UCNV_US_ASCII, 1, 1, + { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +const UConverterSharedData _ASCIIData={ + sizeof(UConverterSharedData), ~((uint32_t) 0), + NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl, + 0 +}; diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h index baccd622a7b..e3756573811 100644 --- a/icu4c/source/common/unicode/ucnv.h +++ b/icu4c/source/common/unicode/ucnv.h @@ -77,6 +77,7 @@ typedef enum { UCNV_LMBCS_LAST = UCNV_LMBCS_19, UCNV_HZ, UCNV_SCSU, + UCNV_US_ASCII, /* Number of converter types for which we have conversion routines. 
*/ UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES diff --git a/icu4c/source/data/mappings/convrtrs.txt b/icu4c/source/data/mappings/convrtrs.txt index 359c7259229..74178b77c66 100644 --- a/icu4c/source/data/mappings/convrtrs.txt +++ b/icu4c/source/data/mappings/convrtrs.txt @@ -75,6 +75,7 @@ UTF32_PlatformEndian ISO-10646-UCS-4 { IANA } csUCS4 utf-32 { MIME } ucs-4 UTF32_OppositeEndian SCSU { IANA } LATIN_1 iso-8859-1 { MIME } ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 cp367 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983 #!!!!! There's whole lot of names for this +US-ASCII { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6 ISO_2022 ISO-2022 { MIME } 2022 cp2022 ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA MIME } csISO2022JP ISO_2022,locale=ja,version=1 ISO-2022-JP-1 @@ -98,7 +99,7 @@ LMBCS-19 # Table-based -ibm-367 us-ascii { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6 +ibm-367 # Special mapping for S/390 new line characters ebcdic-xml-us