mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-5691 stop illegal sequences before the first byte that could start a new character; in MBCS, DBCS, ISO-2022, HZ; also make ISO-2022-KR strictly check for validity
X-SVN-Rev: 24733
This commit is contained in:
parent
82807260fe
commit
acd74a93d1
6 changed files with 334 additions and 67 deletions
|
@ -1990,6 +1990,7 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
|||
mySourceChar = args->converter->toUBytes[0];
|
||||
args->converter->toULength = 0;
|
||||
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
||||
targetUniChar = missingCharMarker;
|
||||
goto getTrailByte;
|
||||
}
|
||||
|
||||
|
@ -2119,21 +2120,44 @@ escape:
|
|||
default:
|
||||
/* G0 DBCS */
|
||||
if(mySource < mySourceLimit) {
|
||||
int leadIsOk, trailIsOk;
|
||||
char trailByte;
|
||||
getTrailByte:
|
||||
trailByte = *mySource++;
|
||||
tmpSourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
|
||||
if(cs == JISX208) {
|
||||
_2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
|
||||
} else {
|
||||
if (cs == KSC5601) {
|
||||
tmpSourceChar = _2022ToGR94DBCS(tmpSourceChar);
|
||||
trailByte = *mySource;
|
||||
/*
|
||||
* Ticket 5691: consistent illegal sequences:
|
||||
* - We include at least the first byte in the illegal sequence.
|
||||
* - If any of the non-initial bytes could be the start of a character,
|
||||
* we stop the illegal sequence before the first one of those.
|
||||
*
|
||||
* In ISO-2022 DBCS, if both bytes are valid or both bytes are outside
|
||||
* the 21..7e range, then we treat them as a pair.
|
||||
* Otherwise (valid lead byte + illegal trail byte, or vice versa)
|
||||
* we report only the first byte as the illegal sequence.
|
||||
*/
|
||||
leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
|
||||
trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
|
||||
if (leadIsOk == trailIsOk) {
|
||||
++mySource;
|
||||
tmpSourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
|
||||
if (leadIsOk) {
|
||||
if(cs == JISX208) {
|
||||
_2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
|
||||
mySourceChar = tmpSourceChar;
|
||||
} else {
|
||||
/* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
|
||||
mySourceChar = tmpSourceChar;
|
||||
if (cs == KSC5601) {
|
||||
tmpSourceChar = _2022ToGR94DBCS(tmpSourceChar);
|
||||
}
|
||||
tempBuf[0] = (char)(tmpSourceChar >> 8);
|
||||
tempBuf[1] = (char)(tmpSourceChar);
|
||||
}
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
|
||||
} else {
|
||||
mySourceChar = tmpSourceChar;
|
||||
}
|
||||
tempBuf[0] = (char)(tmpSourceChar >> 8);
|
||||
tempBuf[1] = (char)(tmpSourceChar);
|
||||
}
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
|
||||
mySourceChar = tmpSourceChar;
|
||||
} else {
|
||||
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
||||
args->converter->toULength = 1;
|
||||
|
@ -2275,7 +2299,12 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
|
|||
}
|
||||
/* only DBCS or SBCS characters are expected*/
|
||||
/* DB characters with high bit set to 1 are expected */
|
||||
if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
|
||||
if( length > 2 || length==0 ||
|
||||
(length == 1 && targetByteUnit > 0x7f) ||
|
||||
(length == 2 &&
|
||||
((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
|
||||
(uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
|
||||
) {
|
||||
targetByteUnit=missingCharMarker;
|
||||
}
|
||||
if (targetByteUnit != missingCharMarker){
|
||||
|
@ -2604,17 +2633,36 @@ escape:
|
|||
myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
|
||||
if(myData->toU2022State.g == 1) {
|
||||
if(mySource < mySourceLimit) {
|
||||
int leadIsOk, trailIsOk;
|
||||
char trailByte;
|
||||
getTrailByte:
|
||||
trailByte = *mySource++;
|
||||
tempBuf[0] = (char)(mySourceChar + 0x80);
|
||||
tempBuf[1] = (char)(trailByte + 0x80);
|
||||
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
|
||||
if((mySourceChar & 0x8080) == 0) {
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
|
||||
targetUniChar = missingCharMarker;
|
||||
trailByte = *mySource;
|
||||
/*
|
||||
* Ticket 5691: consistent illegal sequences:
|
||||
* - We include at least the first byte in the illegal sequence.
|
||||
* - If any of the non-initial bytes could be the start of a character,
|
||||
* we stop the illegal sequence before the first one of those.
|
||||
*
|
||||
* In ISO-2022 DBCS, if both bytes are valid or both bytes are outside
|
||||
* the 21..7e range, then we treat them as a pair.
|
||||
* Otherwise (valid lead byte + illegal trail byte, or vice versa)
|
||||
* we report only the first byte as the illegal sequence.
|
||||
*/
|
||||
leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
|
||||
trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
|
||||
if (leadIsOk == trailIsOk) {
|
||||
++mySource;
|
||||
if (leadIsOk) {
|
||||
tempBuf[0] = (char)(mySourceChar + 0x80);
|
||||
tempBuf[1] = (char)(trailByte + 0x80);
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
|
||||
} else {
|
||||
leadIsOk = TRUE; /* TODO: remove */
|
||||
}
|
||||
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
|
||||
} else {
|
||||
/* illegal bytes > 0x7f */
|
||||
targetUniChar = missingCharMarker;
|
||||
trailIsOk = TRUE; /* TODO: remove */
|
||||
}
|
||||
} else {
|
||||
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
||||
|
@ -2622,8 +2670,10 @@ getTrailByte:
|
|||
break;
|
||||
}
|
||||
}
|
||||
else{
|
||||
else if(mySourceChar <= 0x7f) {
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
|
||||
} else {
|
||||
targetUniChar = 0xffff;
|
||||
}
|
||||
if(targetUniChar < 0xfffe){
|
||||
if(args->offsets) {
|
||||
|
@ -3120,6 +3170,7 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
|||
/* continue with a partial double-byte character */
|
||||
mySourceChar = args->converter->toUBytes[0];
|
||||
args->converter->toULength = 0;
|
||||
targetUniChar = missingCharMarker;
|
||||
goto getTrailByte;
|
||||
}
|
||||
|
||||
|
@ -3199,29 +3250,48 @@ escape:
|
|||
UConverterSharedData *cnv;
|
||||
StateEnum tempState;
|
||||
int32_t tempBufLen;
|
||||
int leadIsOk, trailIsOk;
|
||||
char trailByte;
|
||||
getTrailByte:
|
||||
trailByte = *mySource++;
|
||||
tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
||||
if(tempState >= CNS_11643_0) {
|
||||
cnv = myData->myConverterArray[CNS_11643];
|
||||
tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
|
||||
tempBuf[1] = (char) (mySourceChar);
|
||||
tempBuf[2] = trailByte;
|
||||
tempBufLen = 3;
|
||||
trailByte = *mySource;
|
||||
/*
|
||||
* Ticket 5691: consistent illegal sequences:
|
||||
* - We include at least the first byte in the illegal sequence.
|
||||
* - If any of the non-initial bytes could be the start of a character,
|
||||
* we stop the illegal sequence before the first one of those.
|
||||
*
|
||||
* In ISO-2022 DBCS, if both bytes are valid or both bytes are outside
|
||||
* the 21..7e range, then we treat them as a pair.
|
||||
* Otherwise (valid lead byte + illegal trail byte, or vice versa)
|
||||
* we report only the first byte as the illegal sequence.
|
||||
*/
|
||||
leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
|
||||
trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
|
||||
if (leadIsOk == trailIsOk) {
|
||||
++mySource;
|
||||
if (leadIsOk) {
|
||||
tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
||||
if(tempState >= CNS_11643_0) {
|
||||
cnv = myData->myConverterArray[CNS_11643];
|
||||
tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
|
||||
tempBuf[1] = (char) (mySourceChar);
|
||||
tempBuf[2] = trailByte;
|
||||
tempBufLen = 3;
|
||||
|
||||
}else{
|
||||
cnv = myData->myConverterArray[tempState];
|
||||
tempBuf[0] = (char) (mySourceChar);
|
||||
tempBuf[1] = trailByte;
|
||||
tempBufLen = 2;
|
||||
}else{
|
||||
cnv = myData->myConverterArray[tempState];
|
||||
tempBuf[0] = (char) (mySourceChar);
|
||||
tempBuf[1] = trailByte;
|
||||
tempBufLen = 2;
|
||||
}
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
|
||||
}
|
||||
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
|
||||
}
|
||||
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
|
||||
if(pToU2022State->g>=2) {
|
||||
/* return from a single-shift state to the previous one */
|
||||
pToU2022State->g=pToU2022State->prevG;
|
||||
}
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
|
||||
} else {
|
||||
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
||||
args->converter->toULength = 1;
|
||||
|
|
|
@ -217,19 +217,35 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
|||
}
|
||||
else{
|
||||
/* trail byte */
|
||||
int leadIsOk, trailIsOk;
|
||||
uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
|
||||
if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
|
||||
(uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
|
||||
) {
|
||||
tempBuf[0] = (char) (leadByte+0x80) ;
|
||||
tempBuf[1] = (char) (mySourceChar+0x80);
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
|
||||
tempBuf, 2, args->converter->useFallback);
|
||||
targetUniChar = 0xffff;
|
||||
/*
|
||||
* Ticket 5691: consistent illegal sequences:
|
||||
* - We include at least the first byte in the illegal sequence.
|
||||
* - If any of the non-initial bytes could be the start of a character,
|
||||
* we stop the illegal sequence before the first one of those.
|
||||
*
|
||||
* In HZ DBCS, if both bytes are valid or both bytes are outside
|
||||
* the 21..7d/7e range, then we treat them as a pair.
|
||||
* Otherwise (valid lead byte + illegal trail byte, or vice versa)
|
||||
* we report only the first byte as the illegal sequence.
|
||||
*/
|
||||
leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
|
||||
trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
|
||||
if (leadIsOk == trailIsOk) {
|
||||
if (leadIsOk) {
|
||||
tempBuf[0] = (char) (leadByte+0x80) ;
|
||||
tempBuf[1] = (char) (mySourceChar+0x80);
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
|
||||
tempBuf, 2, args->converter->useFallback);
|
||||
}
|
||||
/* add another bit so that the code below writes 2 bytes in case of error */
|
||||
mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
|
||||
} else {
|
||||
targetUniChar = 0xffff;
|
||||
--mySource;
|
||||
mySourceChar = (int32_t)leadByte;
|
||||
}
|
||||
/* add another bit so that the code below writes 2 bytes in case of error */
|
||||
mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
|
||||
args->converter->toUnicodeStatus =0x00;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2007, International Business Machines
|
||||
* Copyright (C) 2000-2008, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -2151,6 +2151,65 @@ unrolled:
|
|||
pArgs->offsets=offsets;
|
||||
}
|
||||
|
||||
static UBool
|
||||
hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
|
||||
const int32_t *row=stateTable[state];
|
||||
int32_t b, entry;
|
||||
/* First test for final entries in this state for some commonly valid byte values. */
|
||||
entry=row[0xa1];
|
||||
if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
|
||||
MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
|
||||
) {
|
||||
return TRUE;
|
||||
}
|
||||
entry=row[0x41];
|
||||
if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
|
||||
MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
|
||||
) {
|
||||
return TRUE;
|
||||
}
|
||||
/* Then test for final entries in this state. */
|
||||
for(b=0; b<=0xff; ++b) {
|
||||
entry=row[b];
|
||||
if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
|
||||
MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
|
||||
) {
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
/* Then recurse for transition entries. */
|
||||
for(b=0; b<=0xff; ++b) {
|
||||
entry=row[b];
|
||||
if( MBCS_ENTRY_IS_TRANSITION(entry) &&
|
||||
hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
|
||||
) {
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Is byte b a single/lead byte in this state?
|
||||
* Recurse for transition states, because here we don't want to say that
|
||||
* b is a lead byte if all byte sequences that start with b are illegal.
|
||||
*/
|
||||
static UBool
|
||||
isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
|
||||
const int32_t *row=stateTable[state];
|
||||
int32_t entry=row[b];
|
||||
if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
|
||||
return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
|
||||
} else {
|
||||
uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
||||
if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
|
||||
return FALSE; /* SI/SO are illegal for DBCS-only conversion */
|
||||
} else {
|
||||
return action!=MBCS_STATE_ILLEGAL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
|
@ -2506,6 +2565,34 @@ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
|||
sourceIndex=nextSourceIndex;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* callback(illegal) */
|
||||
if(byteIndex>1) {
|
||||
/*
|
||||
* Ticket 5691: consistent illegal sequences:
|
||||
* - We include at least the first byte in the illegal sequence.
|
||||
* - If any of the non-initial bytes could be the start of a character,
|
||||
* we stop the illegal sequence before the first one of those.
|
||||
*/
|
||||
UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
|
||||
int8_t i;
|
||||
for(i=1;
|
||||
i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
|
||||
++i) {}
|
||||
if(i<byteIndex) {
|
||||
/* Back out some bytes. */
|
||||
int8_t backOutDistance=byteIndex-i;
|
||||
int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
|
||||
byteIndex=i; /* length of reported illegal byte sequence */
|
||||
if(backOutDistance<=bytesFromThisBuffer) {
|
||||
source-=backOutDistance;
|
||||
} else {
|
||||
/* Back out bytes from the previous buffer: Need to replay them. */
|
||||
cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
|
||||
/* preToULength is negative! */
|
||||
uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
} else /* unassigned sequences indicated with byteIndex>0 */ {
|
||||
/* try an extension mapping */
|
||||
|
@ -2516,6 +2603,7 @@ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
|||
&offsets, sourceIndex,
|
||||
pArgs->flush,
|
||||
pErrorCode);
|
||||
/* TODO: nextSourceIndex+=diff instead of nextSourceIndex+diff ?? */
|
||||
sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
|
@ -2807,15 +2895,37 @@ ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
|||
|
||||
if(c<0) {
|
||||
if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
|
||||
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
||||
}
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
/* incomplete character byte sequence */
|
||||
uint8_t *bytes=cnv->toUBytes;
|
||||
cnv->toULength=(int8_t)(source-lastSource);
|
||||
do {
|
||||
*bytes++=*lastSource++;
|
||||
} while(lastSource<source);
|
||||
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
/* callback(illegal) */
|
||||
/*
|
||||
* Ticket 5691: consistent illegal sequences:
|
||||
* - We include at least the first byte in the illegal sequence.
|
||||
* - If any of the non-initial bytes could be the start of a character,
|
||||
* we stop the illegal sequence before the first one of those.
|
||||
*/
|
||||
UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
|
||||
uint8_t *bytes=cnv->toUBytes;
|
||||
*bytes++=*lastSource++; /* first byte */
|
||||
if(lastSource==source) {
|
||||
cnv->toULength=1;
|
||||
} else /* lastSource<source: multi-byte character */ {
|
||||
int8_t i;
|
||||
for(i=1;
|
||||
lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
|
||||
++i
|
||||
) {
|
||||
*bytes++=*lastSource++;
|
||||
}
|
||||
cnv->toULength=i;
|
||||
source=lastSource;
|
||||
}
|
||||
} else {
|
||||
/* no output because of empty input or only state changes */
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2007, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2008, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/*
|
||||
|
@ -2530,13 +2530,13 @@ static void TestLegalAndOthers(int32_t inputsize, int32_t outputsize)
|
|||
|
||||
|
||||
static const uint8_t text943[] = {
|
||||
0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
|
||||
static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
|
||||
static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
|
||||
0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
|
||||
static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 };
|
||||
static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 };
|
||||
static const UChar toUnicode943stop[]= { 0x304b};
|
||||
|
||||
static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7};
|
||||
static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7};
|
||||
static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 };
|
||||
static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };
|
||||
static const int32_t fromIBM943Offsstop[] = { 0};
|
||||
|
||||
gInBufferSize = inputsize;
|
||||
|
@ -2570,9 +2570,9 @@ static void TestSingleByte(int32_t inputsize, int32_t outputsize)
|
|||
{
|
||||
static const uint8_t sampleText[] = {
|
||||
0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,
|
||||
0xff, /*0x82, 0xa9,*/ 0x32, 0x33};
|
||||
static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033};
|
||||
static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8};
|
||||
0xff, 0x32, 0x33};
|
||||
static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 };
|
||||
static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };
|
||||
/*checking illegal value for ibm-943 with substitute*/
|
||||
gInBufferSize = inputsize;
|
||||
gOutBufferSize = outputsize;
|
||||
|
|
|
@ -2615,7 +2615,7 @@ TestMBCS() {
|
|||
TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
|
||||
/*Test for the condition where there is an invalid character*/
|
||||
{
|
||||
static const uint8_t source2[]={0xa1, 0x01};
|
||||
static const uint8_t source2[]={0xa1, 0x80};
|
||||
TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
|
||||
}
|
||||
/*Test for the condition where we have a truncated char*/
|
||||
|
@ -3908,11 +3908,11 @@ static void
|
|||
TestISO_2022_KR() {
|
||||
/* test input */
|
||||
static const uint16_t in[]={
|
||||
0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
|
||||
,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
|
||||
0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
|
||||
,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
|
||||
,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
|
||||
,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
|
||||
,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
|
||||
,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
|
||||
,0x53E3,0x53E4,0x000A,0x000D};
|
||||
const UChar* uSource;
|
||||
const UChar* uSourceLimit;
|
||||
|
|
79
icu4c/source/test/testdata/conversion.txt
vendored
79
icu4c/source/test/testdata/conversion.txt
vendored
|
@ -48,6 +48,77 @@ conversion:table(nofallback) {
|
|||
toUnicode {
|
||||
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
|
||||
Cases {
|
||||
// Test ticket 5691: consistent illegal sequences
|
||||
// Unfortunately, we cannot use the Shift-JIS examples from the ticket
|
||||
// comments because our Shift-JIS table is Windows-compatible and
|
||||
// therefore has no illegal single bytes. Same for GBK.
|
||||
// Instead, we use the stricter GB 18030 also for 2-byte examples.
|
||||
// The byte sequences are generally slightly different from the ticket
|
||||
// comment, simply using assigned characters rather than just
|
||||
// theoretically valid sequences.
|
||||
{
|
||||
"gb18030",
|
||||
:bin{ 618140813c81ff7a },
|
||||
"a\u4e02\\x81<\\x81\\xFFz",
|
||||
:intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"EUC-JP",
|
||||
:bin{ 618fb0a98fb03c8f3cb0a97a },
|
||||
"a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
|
||||
:intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"gb18030",
|
||||
:bin{ 618130fc318130fc8181303c3e813cfc817a },
|
||||
"a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
|
||||
:intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"UTF-8",
|
||||
:bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
|
||||
"a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
|
||||
:intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"ISO-2022-JP-2",
|
||||
:bin{ 1b24424141af4142affe41431b2842 },
|
||||
"\u758f\\xAF\u758e\\xAF\\xFE\u790e",
|
||||
:intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"ibm-25546",
|
||||
:bin{ 411b242943420e4141af4142affe41430f5a },
|
||||
"AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
|
||||
:intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"ISO-2022-KR",
|
||||
:bin{ 411b242943420e4141af4142affe41430f5a },
|
||||
"AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
|
||||
:intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"ISO-2022-CN",
|
||||
:bin{ 411b242941420e4141af4142affe41430f5a },
|
||||
"AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
|
||||
:intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"HZ",
|
||||
:bin{ 417e7b4141af4142affe41437e7d5a },
|
||||
"A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
|
||||
:intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
// Test bug 6071 (2:1 Unicode:charset SBCS mapping).
|
||||
{
|
||||
"*test1bmp",
|
||||
|
@ -60,8 +131,8 @@ conversion:table(nofallback) {
|
|||
{
|
||||
"HZ",
|
||||
:bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
|
||||
"\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
|
||||
:intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
|
||||
"\u3000\ufffd\ufffd\u3013\ufffd\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
|
||||
:intvector{ 2,4,5,6,8,9,10,12,14,18,19,21,24 },
|
||||
:int{1}, :int{1}, "", "?", :bin{""}
|
||||
}
|
||||
// improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
|
||||
|
@ -69,8 +140,8 @@ conversion:table(nofallback) {
|
|||
{
|
||||
"ISO-2022-JP",
|
||||
:bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
|
||||
"}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
|
||||
:intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
|
||||
"}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\ufffd\ufffd\u25b2\ufffd\ufffd\u6f3e",
|
||||
:intvector{ 3,4,5,9,11,12,13,14,16,17,19,20,21,22,23,25,26,27 },
|
||||
:int{1}, :int{1}, "", "?", :bin{""}
|
||||
}
|
||||
// improve coverage of ISO-2022-JP converter by simulating erroneous input
|
||||
|
|
Loading…
Add table
Reference in a new issue