ICU-10532 Improve UTF-16 charset detection.

X-SVN-Rev: 34646
2025-04-17 18:56:53 +00:00 · 2013-11-07 19:55:48 +00:00 · 2013-11-07 19:55:48 +00:00 · cc2d6e41f8
commit cc2d6e41f8
parent 2f0c821f16
3 changed files with 98 additions and 48 deletions
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_Unicode.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_Unicode.java
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
- * Copyright (C) 1996-2012, International Business Machines Corporation and    *
+ * Copyright (C) 1996-2013, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
@ -24,6 +24,29 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
     */
    abstract CharsetMatch match(CharsetDetector det);
    
+    static int codeUnit16FromBytes(byte hi, byte lo) {
+        return ((hi & 0xff) << 8) | (lo & 0xff);
+    }
+    
+    // UTF-16 confidence calculation. Very simple-minded, but better than nothing.
+    //   Code units 0x20-0xff and line feed (0x0a) bump the confidence up. These have a zero high
+    //     byte and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
+    //   NULs are a contra-indication: they will appear commonly if the actual encoding is UTF-32.
+    //   NULs should be rare in actual text.
+    static int adjustConfidence(int codeUnit, int confidence) {
+        if (codeUnit == 0) {
+            confidence -= 10;
+        } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
+            confidence += 10;
+        }
+        if (confidence < 0) {
+            confidence = 0;
+        } else if (confidence > 100) {
+            confidence = 100;
+        }
+        return confidence;
+    }
+    
    static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
    {
        String getName()
@ -34,13 +57,26 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
        CharsetMatch match(CharsetDetector det)
        {
            byte[] input = det.fRawInput;
+            int confidence = 10;
            
-            if (input.length>=2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
-                int confidence = 100;
+            int bytesToCheck = Math.min(input.length, 30);
+            for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
+                int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
+                if (charIndex == 0 && codeUnit == 0xFEFF) {
+                    confidence = 100;
+                    break;
+                }
+                confidence = adjustConfidence(codeUnit, confidence);
+                if (confidence == 0 || confidence == 100) {
+                    break;
+                }
+            }
+            if (bytesToCheck < 4 && confidence < 100) {
+                confidence = 0;
+            }
+            if (confidence > 0) {
                return new CharsetMatch(det, this, confidence);
            }
-            
-            // TODO: Do some statistics to check for unsigned UTF-16BE
            return null;
        }
    }
@ -55,19 +91,26 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
        CharsetMatch match(CharsetDetector det)
        {
            byte[] input = det.fRawInput;
+            int confidence = 10;
            
-            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE))
-            {
-               // An LE BOM is present.
-               if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) {
-                   // It is probably UTF-32 LE, not UTF-16
-                   return null;
-               }
-               int confidence = 100;
-               return new CharsetMatch(det, this, confidence);
-            }        
-            
-            // TODO: Do some statistics to check for unsigned UTF-16LE
+            int bytesToCheck = Math.min(input.length, 30);
+            for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
+                int codeUnit = codeUnit16FromBytes(input[charIndex+1], input[charIndex]);
+                if (charIndex == 0 && codeUnit == 0xFEFF) {
+                    confidence = 100;
+                    break;
+                }
+                confidence = adjustConfidence(codeUnit, confidence);
+                if (confidence == 0 || confidence == 100) {
+                    break;
+                }
+            }
+            if (bytesToCheck < 4 && confidence < 100) {
+                confidence = 0;
+            }
+            if (confidence > 0) {
+                return new CharsetMatch(det, this, confidence);
+            }
            return null;
        }
    }
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
@ -4,7 +4,7 @@
 <!-- See individual test cases for their specific copyright. -->

 <charset-detection-tests>
-    <test-case id="IUC10-ar" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-6/ar windows-1256/ar">
+    <test-case id="IUC10-ar" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-6/ar windows-1256/ar">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    أوروبا, برمجيات الحاسوب + انترنيت :
@ -20,7 +20,7 @@

    </test-case>

-    <test-case id="IUC10-da-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1252/da">
+    <test-case id="IUC10-da-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1252/da">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Software + Internet:
@ -36,7 +36,7 @@

    </test-case>

-    <test-case id="IUC10-da" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/da">
+    <test-case id="IUC10-da" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/da">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Software + Internet:
@ -52,7 +52,7 @@

    </test-case>

-    <test-case id="IUC10-de" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/de">
+    <test-case id="IUC10-de" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/de">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Software + das Internet:
@ -69,7 +69,7 @@
    </test-case>

    <!-- No UTF-8 in this test because there are no non-ASCII characters. -->
-    <test-case id="IUC10-en" encodings="UTF-32BE UTF-32LE ISO-8859-1/en">
+    <test-case id="IUC10-en" encodings="UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/en">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europe, Software + the Internet:
@ -85,7 +85,7 @@

    </test-case>

-    <test-case id="IUC10-es" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/es">
+    <test-case id="IUC10-es" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/es">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Software + el Internet:
@ -101,7 +101,7 @@

    </test-case>

-    <test-case id="IUC10-fr" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/fr">
+    <test-case id="IUC10-fr" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/fr">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    L'Europe, le logiciel et l'Internet :
@ -118,7 +118,7 @@

    </test-case>

-    <test-case id="IUC10-he" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-8-I/he">
+    <test-case id="IUC10-he" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-8-I/he">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    אירופה, תוכנה והאינטרנט:
@ -133,7 +133,7 @@

    </test-case>

-    <test-case id="IUC10-he-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1255/he">
+    <test-case id="IUC10-he-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1255/he">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    אירופה, תוכנה והאינטרנט:
@ -148,7 +148,7 @@

    </test-case>

-    <test-case id="IUC10-hu" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/hu">
+    <test-case id="IUC10-hu" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/hu">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Európa, a Szoftver s az Internet -
@ -165,7 +165,7 @@

    </test-case>

-    <test-case id="IUC10-hu-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1250/hu">
+    <test-case id="IUC10-hu-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1250/hu">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Európa, a Szoftver s az Internet -
@ -182,7 +182,7 @@

    </test-case>

-    <test-case id="IUC10-it" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/it">
+    <test-case id="IUC10-it" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/it">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, software e Internet:
@ -199,7 +199,7 @@
    </test-case>

    <!-- No EUC-JP in this test because it detects as GB18030 -->
-    <test-case id="IUC10-jp" encodings="UTF-8 UTF-32BE UTF-32LE Shift_JIS/ja ISO-2022-JP">
+    <test-case id="IUC10-jp" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE Shift_JIS/ja ISO-2022-JP">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    ヨーロッパ、ソフトウェア、そしてインターネット:
@ -214,7 +214,7 @@

    </test-case>

-    <test-case id="IUC10-ko" encodings="UTF-8 UTF-32BE UTF-32LE EUC-KR/ko ISO-2022-KR">
+    <test-case id="IUC10-ko" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE EUC-KR/ko ISO-2022-KR">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    유럽, 소프트웨어 그리고 인터넷:
@ -230,7 +230,7 @@
    </test-case>

    <!-- No UTF-8 in this test because there are no non-ASCII characters. -->
-    <test-case id="IUC10-nl" encodings="UTF-32BE UTF-32LE ISO-8859-1/nl">
+    <test-case id="IUC10-nl" encodings="UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/nl">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Software + het Internet:
@ -247,7 +247,7 @@
    </test-case>

    <!-- No language for ISO-8859-1 in this test because no-NO is recognized as Danish... -->
-    <test-case id="IUC10-no-NO" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/da">
+    <test-case id="IUC10-no-NO" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/da">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Programvare og Internet:
@ -262,7 +262,7 @@

    </test-case>

-    <test-case id="IUC10-no-NO-NY" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/no">
+    <test-case id="IUC10-no-NO-NY" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/no">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, programvare og Internett:
@ -278,7 +278,7 @@

    </test-case>

-    <test-case id="IUC10-pt-BR" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/pt">
+    <test-case id="IUC10-pt-BR" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/pt">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Software e a Internet:
@ -294,7 +294,7 @@

    </test-case>

-    <test-case id="IUC10-pt-PT" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/pt">
+    <test-case id="IUC10-pt-PT" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/pt">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Software e a Internet:
@ -311,7 +311,7 @@

    </test-case>

-    <test-case id="IUC10-ro" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/ro">
+    <test-case id="IUC10-ro" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/ro">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Software şi Internet:
@ -328,7 +328,7 @@

    </test-case>

-    <test-case id="IUC10-ru" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-5/ru windows-1251/ru KOI8-R/ru">
+    <test-case id="IUC10-ru" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-5/ru windows-1251/ru KOI8-R/ru">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Европа, Программное обеспечение + Интернет:
@ -345,7 +345,7 @@

    </test-case>

-    <test-case id="IUC10-sv" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/sv">
+    <test-case id="IUC10-sv" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/sv">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, programvara och Internet:
@ -361,7 +361,7 @@

    </test-case>

-    <test-case id="IUC10-yi" encodings="UTF-8 UTF-32BE UTF-32LE">
+    <test-case id="IUC10-yi" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    אײראָפּע: פּראָגראַמװאַרג און די װעלטנעץ:
@ -377,7 +377,7 @@

    </test-case>

-    <test-case id="IUC10-zh-Hant" encodings="UTF-8 UTF-32BE UTF-32LE Big5/zh">
+    <test-case id="IUC10-zh-Hant" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE Big5/zh">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    歐洲，軟體及網際網路：
@ -393,7 +393,7 @@
    </test-case>

    <!-- No ISO-2022-CN in this test because Java doesn't support it in both directions :-( -->
-    <test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-32BE UTF-32LE ISO-2022-CN GB18030/zh">
+    <test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-2022-CN GB18030/zh">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    欧洲，软件＋互联网
@ -409,7 +409,7 @@
 Conference Program
    </test-case>

-    <test-case id="WIU-cz" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/cs">
+    <test-case id="WIU-cz" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/cs">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Co je Unicode?
@ -432,7 +432,7 @@ Conference Program

    </test-case>

-    <test-case id="WIU-el" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-7/el">
+    <test-case id="WIU-el" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-7/el">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Τι είναι το Unicode;
@ -458,7 +458,7 @@ Conference Program

    </test-case>

-    <test-case id="WIU-el-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1253/el">
+    <test-case id="WIU-el-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1253/el">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Τι είναι το “Unicode”;
@ -484,7 +484,7 @@ Conference Program

    </test-case>

-    <test-case id="WIU-pl" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/pl">
+    <test-case id="WIU-pl" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/pl">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Czym jest Unikod ?
@ -505,7 +505,7 @@ Conference Program

    </test-case>

-    <test-case id="WIU-tr" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-9/tr">
+    <test-case id="WIU-tr" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-9/tr">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Evrensel Kod Nedir?
@ -527,7 +527,7 @@ Conference Program

    </test-case>

-    <test-case id="WIU-tr-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1254/tr">
+    <test-case id="WIU-tr-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1254/tr">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    “Evrensel Kod” Nedir?
@ -548,4 +548,9 @@ Conference Program
    şifrelemeyi desteklemek zorundadırlar; veriler, farklı şifreleme ve altyapılardan geçerken bozulma riski taşırlar.

    </test-case>
+    
+    
+    <test-case id="bug-10532-utf-16" encodings="UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE">
+    foo 東京・銀座の歌舞伎座。４月に新調された４枚の緞帳のうち３枚は、京都の川島織物セルコンが織った朝日新聞デジタル会員（有料・無料）にご登録いただくと、様々な特典・サービスが受けられます。
+    </test-case>
 </charset-detection-tests>
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@ -444,10 +444,12 @@ public class TestCharsetDetector extends TestFmwk
                        
                        // Ignore any other converters that can't
                        // convert from Unicode.
+                        logln("Unsupported encoding" + from);
                        return;
                    } catch (UnsupportedEncodingException uee) {
                        // Ignore any encodings that this runtime
                        // doesn't support.
+                        logln("Unsupported encoding" + from);
                        return;
                    }