ICU-5691 stop illegal ISO-2022 escape/shift sequences before the first byte that could start a new character

X-SVN-Rev: 24769
This commit is contained in:
Markus Scherer 2008-10-10 06:00:27 +00:00
parent 2b9e6b107e
commit b6b3273af0
2 changed files with 75 additions and 7 deletions

View file

@ -754,6 +754,7 @@ changeState_2022(UConverter* _this,
UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
uint32_t key = myData2022->key;
int32_t offset = 0;
int8_t initialToULength = _this->toULength;
char c;
value = VALID_NON_TERMINAL_2022;
@ -806,7 +807,6 @@ DONE:
return;
} else if (value == INVALID_2022 ) {
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
return;
} else /* value == VALID_TERMINAL_2022 */ {
switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
@ -938,6 +938,35 @@ DONE:
}
if(U_SUCCESS(*err)) {
_this->toULength = 0;
} else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
if(_this->toULength>1) {
/*
* Ticket 5691: consistent illegal sequences:
* - We include at least the first byte (ESC) in the illegal sequence.
* - If any of the non-initial bytes could be the start of a character,
* we stop the illegal sequence before the first one of those.
* In escape sequences, all following bytes are "printable", that is,
* unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
* they are valid single/lead bytes.
* For simplicity, we always report only the initial ESC byte as the
* illegal sequence and back out all other bytes we looked at.
*/
/* Back out some bytes. */
int8_t backOutDistance=_this->toULength-1;
int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
if(backOutDistance<=bytesFromThisBuffer) {
/* same as initialToULength<=1 */
*source-=backOutDistance;
} else {
/* Back out bytes from the previous buffer: Need to replay them. */
_this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
/* same as -(initialToULength-1) */
/* preToULength is negative! */
uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
*source-=bytesFromThisBuffer;
}
_this->toULength=1;
}
} else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
_this->toUCallbackReason = UCNV_UNASSIGNED;
}
@ -2657,12 +2686,8 @@ getTrailByte:
tempBuf[0] = (char)(mySourceChar + 0x80);
tempBuf[1] = (char)(trailByte + 0x80);
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
} else {
leadIsOk = TRUE; /* TODO: remove */
}
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
} else {
trailIsOk = TRUE; /* TODO: remove */
}
} else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;

View file

@ -49,6 +49,8 @@ conversion:table(nofallback) {
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
Cases {
// Test ticket 5691: consistent illegal sequences
// The following test cases are for illegal character byte sequences.
//
// Unfortunately, we cannot use the Shift-JIS examples from the ticket
// comments because our Shift-JIS table is Windows-compatible and
// therefore has no illegal single bytes. Same for GBK.
@ -85,7 +87,7 @@ conversion:table(nofallback) {
:int{1}, :int{0}, "", "&C", :bin{""}
}
{
"ISO-2022-JP-2",
"ISO-2022-JP",
:bin{ 1b24424141af4142affe41431b2842 },
"\u758f\\xAF\u758e\\xAF\\xFE\u790e",
:intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
@ -119,6 +121,47 @@ conversion:table(nofallback) {
:intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
:int{1}, :int{0}, "", "&C", :bin{""}
}
// Test ticket 5691: consistent illegal sequences
// The following test cases are for illegal escape/designator/shift sequences.
//
// ISO-2022-JP and -CN with illegal escape sequences.
{
"ISO-2022-JP",
:bin{ 611b24201b244241411b283f1b28427a },
"a\\x1B$ \u758f\\x1B\u2538z",
:intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
:int{1}, :int{0}, "", "&C", :bin{""}
}
{
"ISO-2022-CN",
:bin{ 611b2429201b2429410e41410f7a },
"a\\x1B$) \u4eaez",
:intvector{ 0,1,1,1,1,2,3,4,10,13 },
:int{1}, :int{0}, "", "&C", :bin{""}
}
// ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
// The first ESC N comes before its designator sequence; the last sequence is ESC+space.
{
"ISO-2022-JP-2",
:bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
"N\\x1BNNN\xceN\\x1B N",
:intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
:int{1}, :int{0}, "", "&C", :bin{""}
}
{
"ISO-2022-CN-EXT",
:bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
"N\\x1BNNN\u8f0eN\\x1B N",
:intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
:int{1}, :int{0}, "", "&C", :bin{""}
}
{
"ISO-2022-CN-EXT",
:bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
"O\\x1BOOO\u492bO\\x1B O",
:intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
:int{1}, :int{0}, "", "&C", :bin{""}
}
// Test bug 6071 (2:1 Unicode:charset SBCS mapping).
{
"*test1bmp",
@ -436,7 +479,7 @@ conversion:table(nofallback) {
{
"ISO-2022-CN-EXT",
:bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
:int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
:int{1}, :int{1}, "illesc", ".", :bin{ 1b }
}
// G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
{