mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-5691 stop illegal ISO-2022 escape/shift sequences before the first byte that could start a new character
X-SVN-Rev: 24769
This commit is contained in:
parent
2b9e6b107e
commit
b6b3273af0
2 changed files with 75 additions and 7 deletions
|
@ -754,6 +754,7 @@ changeState_2022(UConverter* _this,
|
|||
UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
|
||||
uint32_t key = myData2022->key;
|
||||
int32_t offset = 0;
|
||||
int8_t initialToULength = _this->toULength;
|
||||
char c;
|
||||
|
||||
value = VALID_NON_TERMINAL_2022;
|
||||
|
@ -806,7 +807,6 @@ DONE:
|
|||
return;
|
||||
} else if (value == INVALID_2022 ) {
|
||||
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
||||
return;
|
||||
} else /* value == VALID_TERMINAL_2022 */ {
|
||||
switch(var){
|
||||
#ifdef U_ENABLE_GENERIC_ISO_2022
|
||||
|
@ -938,6 +938,35 @@ DONE:
|
|||
}
|
||||
if(U_SUCCESS(*err)) {
|
||||
_this->toULength = 0;
|
||||
} else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
|
||||
if(_this->toULength>1) {
|
||||
/*
|
||||
* Ticket 5691: consistent illegal sequences:
|
||||
* - We include at least the first byte (ESC) in the illegal sequence.
|
||||
* - If any of the non-initial bytes could be the start of a character,
|
||||
* we stop the illegal sequence before the first one of those.
|
||||
* In escape sequences, all following bytes are "printable", that is,
|
||||
* unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
|
||||
* they are valid single/lead bytes.
|
||||
* For simplicity, we always only report the initial ESC byte as the
|
||||
* illegal sequence and back out all other bytes we looked at.
|
||||
*/
|
||||
/* Back out some bytes. */
|
||||
int8_t backOutDistance=_this->toULength-1;
|
||||
int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
|
||||
if(backOutDistance<=bytesFromThisBuffer) {
|
||||
/* same as initialToULength<=1 */
|
||||
*source-=backOutDistance;
|
||||
} else {
|
||||
/* Back out bytes from the previous buffer: Need to replay them. */
|
||||
_this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
|
||||
/* same as -(initialToULength-1) */
|
||||
/* preToULength is negative! */
|
||||
uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
|
||||
*source-=bytesFromThisBuffer;
|
||||
}
|
||||
_this->toULength=1;
|
||||
}
|
||||
} else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
|
||||
_this->toUCallbackReason = UCNV_UNASSIGNED;
|
||||
}
|
||||
|
@ -2657,12 +2686,8 @@ getTrailByte:
|
|||
tempBuf[0] = (char)(mySourceChar + 0x80);
|
||||
tempBuf[1] = (char)(trailByte + 0x80);
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
|
||||
} else {
|
||||
leadIsOk = TRUE; /* TODO: remove */
|
||||
}
|
||||
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
|
||||
} else {
|
||||
trailIsOk = TRUE; /* TODO: remove */
|
||||
}
|
||||
} else {
|
||||
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
||||
|
|
47
icu4c/source/test/testdata/conversion.txt
vendored
|
@ -49,6 +49,8 @@ conversion:table(nofallback) {
|
|||
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
|
||||
Cases {
|
||||
// Test ticket 5691: consistent illegal sequences
|
||||
// The following test cases are for illegal character byte sequences.
|
||||
//
|
||||
// Unfortunately, we cannot use the Shift-JIS examples from the ticket
|
||||
// comments because our Shift-JIS table is Windows-compatible and
|
||||
// therefore has no illegal single bytes. Same for GBK.
|
||||
|
@ -85,7 +87,7 @@ conversion:table(nofallback) {
|
|||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"ISO-2022-JP-2",
|
||||
"ISO-2022-JP",
|
||||
:bin{ 1b24424141af4142affe41431b2842 },
|
||||
"\u758f\\xAF\u758e\\xAF\\xFE\u790e",
|
||||
:intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
|
||||
|
@ -119,6 +121,47 @@ conversion:table(nofallback) {
|
|||
:intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
// Test ticket 5691: consistent illegal sequences
|
||||
// The following test cases are for illegal escape/designator/shift sequences.
|
||||
//
|
||||
// ISO-2022-JP and -CN with illegal escape sequences.
|
||||
{
|
||||
"ISO-2022-JP",
|
||||
:bin{ 611b24201b244241411b283f1b28427a },
|
||||
"a\\x1B$ \u758f\\x1B\u2538z",
|
||||
:intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"ISO-2022-CN",
|
||||
:bin{ 611b2429201b2429410e41410f7a },
|
||||
"a\\x1B$) \u4eaez",
|
||||
:intvector{ 0,1,1,1,1,2,3,4,10,13 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
// ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
|
||||
// The first ESC N comes before its designator sequence, the last sequence is ESC+space.
|
||||
{
|
||||
"ISO-2022-JP-2",
|
||||
:bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
|
||||
"N\\x1BNNN\xceN\\x1B N",
|
||||
:intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"ISO-2022-CN-EXT",
|
||||
:bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
|
||||
"N\\x1BNNN\u8f0eN\\x1B N",
|
||||
:intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
{
|
||||
"ISO-2022-CN-EXT",
|
||||
:bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
|
||||
"O\\x1BOOO\u492bO\\x1B O",
|
||||
:intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
|
||||
:int{1}, :int{0}, "", "&C", :bin{""}
|
||||
}
|
||||
// Test bug 6071 (2:1 Unicode:charset SBCS mapping).
|
||||
{
|
||||
"*test1bmp",
|
||||
|
@ -436,7 +479,7 @@ conversion:table(nofallback) {
|
|||
{
|
||||
"ISO-2022-CN-EXT",
|
||||
:bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
|
||||
:int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
|
||||
:int{1}, :int{1}, "illesc", ".", :bin{ 1b }
|
||||
}
|
||||
// G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue