ICU-5691 stop illegal HZ tilde sequences before the first byte that could start a new character

X-SVN-Rev: 24773
2025-04-06 05:55:35 +00:00 · 2008-10-10 16:30:28 +00:00 · 2008-10-10 16:30:28 +00:00 · c801506d5d
commit c801506d5d
parent e10cfeebfe
2 changed files with 36 additions and 6 deletions
--- a/icu4c/source/common/ucnvhz.c
+++ b/icu4c/source/common/ucnvhz.c
@ -198,10 +198,30 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                     /* if the first byte is equal to TILDE and the trail byte
                     * is not a valid byte then it is an error condition
                     */
-                    mySourceChar = 0x7e00 | mySourceChar;
-                    targetUniChar = 0xffff;
+                    /*
+                     * Ticket 5691: consistent illegal sequences:
+                     * - We include at least the first byte in the illegal sequence.
+                     * - If any of the non-initial bytes could be the start of a character,
+                     *   we stop the illegal sequence before the first one of those.
+                     */
                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
-                    break;
+                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+                    args->converter->toUBytes[0] = UCNV_TILDE;
+                    if( myData->isStateDBCS ?
+                            (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
+                            mySourceChar <= 0x7f
+                    ) {
+                        /* The current byte could be the start of a character: Back it out. */
+                        args->converter->toULength = 1;
+                        --mySource;
+                    } else {
+                        /* Include the current byte in the illegal sequence. */
+                        args->converter->toUBytes[1] = mySourceChar;
+                        args->converter->toULength = 2;
+                    }
+                    args->target = myTarget;
+                    args->source = mySource;
+                    return;
                }
            } else if(myData->isStateDBCS) {
                if(args->converter->toUnicodeStatus == 0x00){
--- a/icu4c/source/test/testdata/conversion.txt
+++ b/icu4c/source/test/testdata/conversion.txt
@ -162,6 +162,16 @@ conversion:table(nofallback) {
          :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
          :int{1}, :int{0}, "", "&C", :bin{""}
        }
+        // HZ with illegal tilde sequences.
+        {
+          "HZ",
+          :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a },
+          "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z",
+          :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9,                          // SBCS
+                      12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS
+                      25 },                                                                 // SBCS
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
        // Test bug 6071 (2:1 Unicode:charset SBCS mapping).
        {
          "*test1bmp",
@ -173,9 +183,9 @@ conversion:table(nofallback) {
        // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
        {
          "HZ",
-          :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
-          "\u3000\ufffd\ufffd\u3013\ufffd\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
-          :intvector{ 2,4,5,6,8,9,10,12,14,18,19,21,24 },
+          :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b },
+          "\u3000\ufffd\ufffd\u3013\ufffd\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+",
+          :intvector{ 2,4,5,6,8,9,10,12,14,15,19,20,22,25 },
          :int{1}, :int{1}, "", "?", :bin{""}
        }
        // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and