mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
ICU-705 add algorithmic US-ASCII converter
X-SVN-Rev: 3286
This commit is contained in:
parent
8d9bdf7a1e
commit
66544551d6
6 changed files with 235 additions and 9 deletions
|
@ -75,6 +75,7 @@ UTF32_PlatformEndian ISO-10646-UCS-4 { IANA } csUCS4 utf-32 { MIME } ucs-4
|
|||
UTF32_OppositeEndian
|
||||
SCSU { IANA }
|
||||
LATIN_1 iso-8859-1 { MIME } ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 cp367 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983 #!!!!! There's a whole lot of names for this
|
||||
US-ASCII { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6
|
||||
ISO_2022 ISO-2022 { MIME } 2022 cp2022
|
||||
ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA MIME } csISO2022JP
|
||||
ISO_2022,locale=ja,version=1 ISO-2022-JP-1
|
||||
|
@ -98,7 +99,7 @@ LMBCS-19
|
|||
|
||||
# Table-based
|
||||
|
||||
ibm-367 us-ascii { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6
|
||||
ibm-367
|
||||
|
||||
# Special mapping for S/390 new line characters
|
||||
ebcdic-xml-us
|
||||
|
|
|
@ -51,7 +51,7 @@ converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={
|
|||
&_EBCDICStatefulData, &_ISO2022Data,
|
||||
&_LMBCSData1,&_LMBCSData2, &_LMBCSData3, &_LMBCSData4, &_LMBCSData5, &_LMBCSData6,
|
||||
&_LMBCSData8,&_LMBCSData11,&_LMBCSData16,&_LMBCSData17,&_LMBCSData18,&_LMBCSData19,
|
||||
&_HZData, &_SCSUData
|
||||
&_HZData, &_SCSUData, &_ASCIIData
|
||||
};
|
||||
|
||||
static struct {
|
||||
|
@ -92,7 +92,8 @@ static struct {
|
|||
{ "LMBCS-18",UCNV_LMBCS_18 },
|
||||
{ "LMBCS-19",UCNV_LMBCS_19 },
|
||||
{ "HZ",UCNV_HZ },
|
||||
{ "SCSU", UCNV_SCSU }
|
||||
{ "SCSU", UCNV_SCSU },
|
||||
{ "US-ASCII", UCNV_US_ASCII }
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -215,7 +215,7 @@ extern const UConverterSharedData
|
|||
_EBCDICStatefulData, _ISO2022Data,
|
||||
_LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6,
|
||||
_LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19,
|
||||
_HZData, _SCSUData;
|
||||
_HZData, _SCSUData, _ASCIIData;
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
|
|
|
@ -110,7 +110,11 @@ _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
|||
targetCapacity=pArgs->targetLimit-pArgs->target;
|
||||
offsets=pArgs->offsets;
|
||||
|
||||
max=0xff; /* ### 0x7f for US-ASCII */
|
||||
if(cnv->sharedData==&_Latin1Data) {
|
||||
max=0xff; /* Latin-1 */
|
||||
} else {
|
||||
max=0x7f; /* US-ASCII */
|
||||
}
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
c=cnv->fromUSurrogateLead;
|
||||
|
@ -302,18 +306,236 @@ static const UConverterImpl _Latin1Impl={
|
|||
NULL
|
||||
};
|
||||
|
||||
const UConverterStaticData _Latin1StaticData={
|
||||
static const UConverterStaticData _Latin1StaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"LATIN_1",
|
||||
819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
|
||||
{ 0x1a, 0, 0, 0 },1,FALSE, FALSE,
|
||||
{ 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
|
||||
const UConverterSharedData _Latin1Data={
|
||||
sizeof(UConverterSharedData), ~((uint32_t) 0),
|
||||
NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl,
|
||||
0
|
||||
};
|
||||
|
||||
/* US-ASCII ----------------------------------------------------------------- */
|
||||
|
||||
/* This is a table-less version of _MBCSSingleToBMPWithOffsets(). */
|
||||
U_CFUNC void
|
||||
_ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
const uint8_t *source, *sourceLimit, *lastSource;
|
||||
UChar *target;
|
||||
int32_t targetCapacity, length;
|
||||
int32_t *offsets;
|
||||
|
||||
int32_t sourceIndex;
|
||||
uint8_t b;
|
||||
|
||||
/* set up the local pointers */
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
||||
target=pArgs->target;
|
||||
targetCapacity=pArgs->targetLimit-pArgs->target;
|
||||
offsets=pArgs->offsets;
|
||||
|
||||
/* sourceIndex=-1 if the current character began in the previous buffer */
|
||||
sourceIndex=0;
|
||||
lastSource=source;
|
||||
|
||||
/*
|
||||
* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
|
||||
* for the minimum of the sourceLength and targetCapacity
|
||||
*/
|
||||
length=sourceLimit-source;
|
||||
if(length<targetCapacity) {
|
||||
targetCapacity=length;
|
||||
}
|
||||
|
||||
/* conversion loop */
|
||||
while(targetCapacity>0) {
|
||||
b=*source++;
|
||||
if(b<=0x7f) {
|
||||
*target++=b;
|
||||
--targetCapacity;
|
||||
} else {
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
UConverter *cnv=pArgs->converter;
|
||||
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
|
||||
/* set offsets since the start or the last callback */
|
||||
if(offsets!=NULL) {
|
||||
int32_t count=(int32_t)(source-lastSource);
|
||||
|
||||
/* predecrement: do not set the offset for the callback-causing character */
|
||||
while(--count>0) {
|
||||
*offsets++=sourceIndex++;
|
||||
}
|
||||
/* offset and sourceIndex are now set for the current character */
|
||||
}
|
||||
|
||||
/* update the arguments structure */
|
||||
pArgs->source=(const char *)source;
|
||||
pArgs->target=target;
|
||||
pArgs->offsets=offsets;
|
||||
|
||||
/* copy the current bytes to invalidCharBuffer */
|
||||
cnv->invalidCharBuffer[0]=b;
|
||||
cnv->invalidCharLength=1;
|
||||
|
||||
/* call the callback function */
|
||||
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
|
||||
|
||||
/* update target and deal with offsets if necessary */
|
||||
offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
|
||||
target=pArgs->target;
|
||||
|
||||
/* update the source pointer and index */
|
||||
sourceIndex+=1+((const uint8_t *)pArgs->source-source);
|
||||
source=lastSource=(const uint8_t *)pArgs->source;
|
||||
targetCapacity=pArgs->targetLimit-target;
|
||||
length=sourceLimit-source;
|
||||
if(length<targetCapacity) {
|
||||
targetCapacity=length;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the callback overflowed the target, then we need to
|
||||
* stop here with an overflow indication.
|
||||
*/
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* break on error */
|
||||
break;
|
||||
} else if(cnv->UCharErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
|
||||
/* set offsets since the start or the last callback */
|
||||
if(offsets!=NULL) {
|
||||
size_t count=source-lastSource;
|
||||
while(count>0) {
|
||||
*offsets++=sourceIndex++;
|
||||
--count;
|
||||
}
|
||||
}
|
||||
|
||||
/* write back the updated pointers */
|
||||
pArgs->source=(const char *)source;
|
||||
pArgs->target=target;
|
||||
pArgs->offsets=offsets;
|
||||
}
|
||||
|
||||
/* This is a table-less version of _MBCSSingleGetNextUChar(). */
|
||||
U_CFUNC UChar32
|
||||
_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH];
|
||||
const uint8_t *source;
|
||||
uint8_t b;
|
||||
|
||||
/* set up the local pointers */
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
|
||||
/* conversion loop */
|
||||
while(source<(const uint8_t *)pArgs->sourceLimit) {
|
||||
b=*source++;
|
||||
pArgs->source=(const char *)source;
|
||||
if(b<=0x7f) {
|
||||
return b;
|
||||
} else {
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
UConverter *cnv=pArgs->converter;
|
||||
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
|
||||
/* update the arguments structure */
|
||||
pArgs->target=buffer;
|
||||
pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
|
||||
|
||||
/* copy the current byte to invalidCharBuffer */
|
||||
cnv->invalidCharBuffer[0]=(char)b;
|
||||
cnv->invalidCharLength=1;
|
||||
|
||||
/* call the callback function */
|
||||
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
|
||||
|
||||
/* update the source pointer */
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
|
||||
/*
|
||||
* return the first character if the callback wrote some
|
||||
* we do not need to goto finish because the converter state is already set
|
||||
*/
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
int32_t length=pArgs->target-buffer;
|
||||
if(length>0) {
|
||||
return ucnv_getUChar32KeepOverflow(cnv, buffer, length);
|
||||
}
|
||||
/* else (callback did not write anything) continue */
|
||||
} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
|
||||
} else {
|
||||
/* break on error */
|
||||
/* ### what if a callback set an error but _also_ generated output?! */
|
||||
return 0xffff;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* no output because of empty input or only skipping callbacks */
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
static const UConverterImpl _ASCIIImpl={
|
||||
UCNV_US_ASCII,
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
|
||||
_ASCIIToUnicodeWithOffsets,
|
||||
_ASCIIToUnicodeWithOffsets,
|
||||
_Latin1FromUnicodeWithOffsets,
|
||||
_Latin1FromUnicodeWithOffsets,
|
||||
_ASCIIGetNextUChar,
|
||||
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
static const UConverterStaticData _ASCIIStaticData={
|
||||
sizeof(UConverterStaticData),
|
||||
"US-ASCII",
|
||||
367, UCNV_IBM, UCNV_US_ASCII, 1, 1,
|
||||
{ 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
const UConverterSharedData _ASCIIData={
|
||||
sizeof(UConverterSharedData), ~((uint32_t) 0),
|
||||
NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl,
|
||||
0
|
||||
};
|
||||
|
|
|
@ -77,6 +77,7 @@ typedef enum {
|
|||
UCNV_LMBCS_LAST = UCNV_LMBCS_19,
|
||||
UCNV_HZ,
|
||||
UCNV_SCSU,
|
||||
UCNV_US_ASCII,
|
||||
|
||||
/* Number of converter types for which we have conversion routines. */
|
||||
UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES
|
||||
|
|
|
@ -75,6 +75,7 @@ UTF32_PlatformEndian ISO-10646-UCS-4 { IANA } csUCS4 utf-32 { MIME } ucs-4
|
|||
UTF32_OppositeEndian
|
||||
SCSU { IANA }
|
||||
LATIN_1 iso-8859-1 { MIME } ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 cp367 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983 #!!!!! There's a whole lot of names for this
|
||||
US-ASCII { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6
|
||||
ISO_2022 ISO-2022 { MIME } 2022 cp2022
|
||||
ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA MIME } csISO2022JP
|
||||
ISO_2022,locale=ja,version=1 ISO-2022-JP-1
|
||||
|
@ -98,7 +99,7 @@ LMBCS-19
|
|||
|
||||
# Table-based
|
||||
|
||||
ibm-367 us-ascii { MIME } ascii ascii-7 US-ASCII ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6
|
||||
ibm-367
|
||||
|
||||
# Special mapping for S/390 new line characters
|
||||
ebcdic-xml-us
|
||||
|
|
Loading…
Add table
Reference in a new issue