diff --git a/icu4c/source/common/ucnvhz.c b/icu4c/source/common/ucnvhz.c index 8760afc93e0..2523383c982 100644 --- a/icu4c/source/common/ucnvhz.c +++ b/icu4c/source/common/ucnvhz.c @@ -198,10 +198,30 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, /* if the first byte is equal to TILDE and the trail byte * is not a valid byte then it is an error condition */ - mySourceChar = 0x7e00 | mySourceChar; - targetUniChar = 0xffff; + /* + * Ticket 5691: consistent illegal sequences: + * - We include at least the first byte in the illegal sequence. + * - If any of the non-initial bytes could be the start of a character, + * we stop the illegal sequence before the first one of those. + */ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ - break; + *err = U_ILLEGAL_ESCAPE_SEQUENCE; + args->converter->toUBytes[0] = UCNV_TILDE; + if( myData->isStateDBCS ? + (0x21 <= mySourceChar && mySourceChar <= 0x7e) : + mySourceChar <= 0x7f + ) { + /* The current byte could be the start of a character: Back it out. */ + args->converter->toULength = 1; + --mySource; + } else { + /* Include the current byte in the illegal sequence. */ + args->converter->toUBytes[1] = mySourceChar; + args->converter->toULength = 2; + } + args->target = myTarget; + args->source = mySource; + return; } } else if(myData->isStateDBCS) { if(args->converter->toUnicodeStatus == 0x00){ diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index 01d1ba4bf5b..17c2fb6d915 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -162,6 +162,16 @@ conversion:table(nofallback) { :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, :int{1}, :int{0}, "", "&C", :bin{""} } + // HZ with illegal tilde sequences. + { + "HZ", + :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a }, + "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z", + :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS + 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS + 25 }, // SBCS + :int{1}, :int{0}, "", "&C", :bin{""} + } // Test bug 6071 (2:1 Unicode:charset SBCS mapping). { "*test1bmp", @@ -173,9 +183,9 @@ conversion:table(nofallback) { // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e { "HZ", - :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, - "\u3000\ufffd\ufffd\u3013\ufffd\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", - :intvector{ 2,4,5,6,8,9,10,12,14,18,19,21,24 }, + :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b }, + "\u3000\ufffd\ufffd\u3013\ufffd\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+", + :intvector{ 2,4,5,6,8,9,10,12,14,15,19,20,22,25 }, :int{1}, :int{1}, "", "?", :bin{""} } // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and