ICU-10532 Improve UTF-16 charset detection.

X-SVN-Rev: 34646
This commit is contained in:
Andy Heninger 2013-11-07 19:55:48 +00:00
parent 2f0c821f16
commit cc2d6e41f8
3 changed files with 98 additions and 48 deletions

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2012, International Business Machines Corporation and *
* Copyright (C) 1996-2013, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
@ -24,6 +24,29 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
*/
abstract CharsetMatch match(CharsetDetector det);
/**
 * Combines two bytes into a single unsigned 16-bit code unit.
 *
 * @param hi byte that supplies bits 15..8 of the result
 * @param lo byte that supplies bits 7..0 of the result
 * @return the combined code unit, in the range 0..0xFFFF
 */
static int codeUnit16FromBytes(byte hi, byte lo) {
    // Mask each byte first so that sign extension cannot leak into the result.
    int upper = hi & 0xff;
    int lower = lo & 0xff;
    return (upper << 8) | lower;
}
/**
 * Updates a running UTF-16 confidence score for one code unit. Deliberately simple.
 *
 * A NUL code unit lowers the score by 10: NULs should be rare in real text but
 * show up frequently when the data is actually UTF-32.
 * A code unit in 0x20..0xFF, or a line feed (0x0A), raises the score by 10:
 * such values have a zero high byte and are very likely UTF-16, although they
 * could also be part of a UTF-32 code.
 * Every other code unit leaves the score unchanged.
 *
 * @param codeUnit   the 16-bit code unit just read
 * @param confidence the score accumulated so far
 * @return the adjusted score, limited to the range 0..100
 */
static int adjustConfidence(int codeUnit, int confidence) {
    int delta = 0;
    if (codeUnit == 0) {
        delta = -10;
    } else if (codeUnit == 0x0a || (0x20 <= codeUnit && codeUnit <= 0xff)) {
        delta = 10;
    }
    // Clamp the adjusted score to the 0..100 range.
    return Math.max(0, Math.min(100, confidence + delta));
}
static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
{
String getName()
@ -34,13 +57,26 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
// Scores the detector's raw input as UTF-16BE. Returns a CharsetMatch when the
// final confidence is above zero, otherwise null.
// NOTE(review): this listing appears to interleave lines from two revisions of the
// method (there are two declarations of `confidence`) - confirm against the real file.
CharsetMatch match(CharsetDetector det)
{
byte[] input = det.fRawInput;
// Baseline score; the scan below moves it up or down.
int confidence = 10;
if (input.length>=2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
int confidence = 100;
// Inspect no more than the first 30 bytes of the input.
int bytesToCheck = Math.min(input.length, 30);
// Walk whole byte pairs, reading each pair high byte first (big-endian).
for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
// U+FEFF as the very first code unit is treated as decisive.
if (charIndex == 0 && codeUnit == 0xFEFF) {
confidence = 100;
break;
}
confidence = adjustConfidence(codeUnit, confidence);
// Stop as soon as the score reaches either bound.
if (confidence == 0 || confidence == 100) {
break;
}
}
// With fewer than 4 bytes (under two code units), anything short of full confidence is rejected.
if (bytesToCheck < 4 && confidence < 100) {
confidence = 0;
}
if (confidence > 0) {
return new CharsetMatch(det, this, confidence);
}
// TODO: Do some statistics to check for unsigned UTF-16BE
return null;
}
}
@ -55,19 +91,26 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
// Scores the detector's raw input as UTF-16LE. Returns a CharsetMatch when the
// final confidence is above zero, otherwise null.
// NOTE(review): this listing appears to interleave lines from two revisions of the
// method (there are two declarations of `confidence`) - confirm against the real file.
CharsetMatch match(CharsetDetector det)
{
byte[] input = det.fRawInput;
// Baseline score; the scan below moves it up or down.
int confidence = 10;
if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE))
{
// An LE BOM is present.
if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) {
// It is probably UTF-32 LE, not UTF-16
return null;
}
int confidence = 100;
return new CharsetMatch(det, this, confidence);
}
// TODO: Do some statistics to check for unsigned UTF-16LE
// Inspect no more than the first 30 bytes of the input.
int bytesToCheck = Math.min(input.length, 30);
// Walk whole byte pairs; the second byte of each pair is the high byte (little-endian).
for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
int codeUnit = codeUnit16FromBytes(input[charIndex+1], input[charIndex]);
// U+FEFF as the very first code unit is treated as decisive.
if (charIndex == 0 && codeUnit == 0xFEFF) {
confidence = 100;
break;
}
confidence = adjustConfidence(codeUnit, confidence);
// Stop as soon as the score reaches either bound.
if (confidence == 0 || confidence == 100) {
break;
}
}
// With fewer than 4 bytes (under two code units), anything short of full confidence is rejected.
if (bytesToCheck < 4 && confidence < 100) {
confidence = 0;
}
if (confidence > 0) {
return new CharsetMatch(det, this, confidence);
}
return null;
}
}

View file

@ -4,7 +4,7 @@
<!-- See individual test cases for their specific copyright. -->
<charset-detection-tests>
<test-case id="IUC10-ar" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-6/ar windows-1256/ar">
<test-case id="IUC10-ar" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-6/ar windows-1256/ar">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
أوروبا, برمجيات الحاسوب + انترنيت :
@ -20,7 +20,7 @@
</test-case>
<test-case id="IUC10-da-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1252/da">
<test-case id="IUC10-da-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1252/da">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, Software + Internet:
@ -36,7 +36,7 @@
</test-case>
<test-case id="IUC10-da" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/da">
<test-case id="IUC10-da" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/da">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, Software + Internet:
@ -52,7 +52,7 @@
</test-case>
<test-case id="IUC10-de" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/de">
<test-case id="IUC10-de" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/de">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, Software + das Internet:
@ -69,7 +69,7 @@
</test-case>
<!-- No UTF-8 in this test because there are no non-ASCII characters. -->
<test-case id="IUC10-en" encodings="UTF-32BE UTF-32LE ISO-8859-1/en">
<test-case id="IUC10-en" encodings="UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/en">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europe, Software + the Internet:
@ -85,7 +85,7 @@
</test-case>
<test-case id="IUC10-es" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/es">
<test-case id="IUC10-es" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/es">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, Software + el Internet:
@ -101,7 +101,7 @@
</test-case>
<test-case id="IUC10-fr" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/fr">
<test-case id="IUC10-fr" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/fr">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
L'Europe, le logiciel et l'Internet :
@ -118,7 +118,7 @@
</test-case>
<test-case id="IUC10-he" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-8-I/he">
<test-case id="IUC10-he" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-8-I/he">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
אירופה, תוכנה והאינטרנט:
@ -133,7 +133,7 @@
</test-case>
<test-case id="IUC10-he-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1255/he">
<test-case id="IUC10-he-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1255/he">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
אירופה, תוכנה והאינטרנט:
@ -148,7 +148,7 @@
</test-case>
<test-case id="IUC10-hu" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/hu">
<test-case id="IUC10-hu" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/hu">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Európa, a Szoftver s az Internet -
@ -165,7 +165,7 @@
</test-case>
<test-case id="IUC10-hu-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1250/hu">
<test-case id="IUC10-hu-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1250/hu">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Európa, a Szoftver s az Internet -
@ -182,7 +182,7 @@
</test-case>
<test-case id="IUC10-it" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/it">
<test-case id="IUC10-it" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/it">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, software e Internet:
@ -199,7 +199,7 @@
</test-case>
<!-- No EUC-JP in this test because it detects as GB18030 -->
<test-case id="IUC10-jp" encodings="UTF-8 UTF-32BE UTF-32LE Shift_JIS/ja ISO-2022-JP">
<test-case id="IUC10-jp" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE Shift_JIS/ja ISO-2022-JP">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
ヨーロッパ、ソフトウェア、そしてインターネット:
@ -214,7 +214,7 @@
</test-case>
<test-case id="IUC10-ko" encodings="UTF-8 UTF-32BE UTF-32LE EUC-KR/ko ISO-2022-KR">
<test-case id="IUC10-ko" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE EUC-KR/ko ISO-2022-KR">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
유럽, 소프트웨어 그리고 인터넷:
@ -230,7 +230,7 @@
</test-case>
<!-- No UTF-8 in this test because there are no non-ASCII characters. -->
<test-case id="IUC10-nl" encodings="UTF-32BE UTF-32LE ISO-8859-1/nl">
<test-case id="IUC10-nl" encodings="UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/nl">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, Software + het Internet:
@ -247,7 +247,7 @@
</test-case>
<!-- No language for ISO-8859-1 in this test because no-NO is recognized as Danish... -->
<test-case id="IUC10-no-NO" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/da">
<test-case id="IUC10-no-NO" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/da">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, Programvare og Internet:
@ -262,7 +262,7 @@
</test-case>
<test-case id="IUC10-no-NO-NY" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/no">
<test-case id="IUC10-no-NO-NY" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/no">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, programvare og Internett:
@ -278,7 +278,7 @@
</test-case>
<test-case id="IUC10-pt-BR" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/pt">
<test-case id="IUC10-pt-BR" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/pt">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, Software e a Internet:
@ -294,7 +294,7 @@
</test-case>
<test-case id="IUC10-pt-PT" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/pt">
<test-case id="IUC10-pt-PT" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/pt">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, Software e a Internet:
@ -311,7 +311,7 @@
</test-case>
<test-case id="IUC10-ro" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/ro">
<test-case id="IUC10-ro" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/ro">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, Software şi Internet:
@ -328,7 +328,7 @@
</test-case>
<test-case id="IUC10-ru" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-5/ru windows-1251/ru KOI8-R/ru">
<test-case id="IUC10-ru" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-5/ru windows-1251/ru KOI8-R/ru">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Европа, Программное обеспечение + Интернет:
@ -345,7 +345,7 @@
</test-case>
<test-case id="IUC10-sv" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/sv">
<test-case id="IUC10-sv" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/sv">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Europa, programvara och Internet:
@ -361,7 +361,7 @@
</test-case>
<test-case id="IUC10-yi" encodings="UTF-8 UTF-32BE UTF-32LE">
<test-case id="IUC10-yi" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
אײראָפּע: פּראָגראַמװאַרג און די װעלטנעץ:
@ -377,7 +377,7 @@
</test-case>
<test-case id="IUC10-zh-Hant" encodings="UTF-8 UTF-32BE UTF-32LE Big5/zh">
<test-case id="IUC10-zh-Hant" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE Big5/zh">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
歐洲,軟體及網際網路:
@ -393,7 +393,7 @@
</test-case>
<!-- No ISO-2022-CN in this test because Java doesn't support it in both directions :-( -->
<test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-32BE UTF-32LE ISO-2022-CN GB18030/zh">
<test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-2022-CN GB18030/zh">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
欧洲,软件+互联网
@ -409,7 +409,7 @@
Conference Program
</test-case>
<test-case id="WIU-cz" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/cs">
<test-case id="WIU-cz" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/cs">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Co je Unicode?
@ -432,7 +432,7 @@ Conference Program
</test-case>
<test-case id="WIU-el" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-7/el">
<test-case id="WIU-el" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-7/el">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Τι είναι το Unicode;
@ -458,7 +458,7 @@ Conference Program
</test-case>
<test-case id="WIU-el-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1253/el">
<test-case id="WIU-el-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1253/el">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Τι είναι το “Unicode”;
@ -484,7 +484,7 @@ Conference Program
</test-case>
<test-case id="WIU-pl" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/pl">
<test-case id="WIU-pl" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/pl">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Czym jest Unikod ?
@ -505,7 +505,7 @@ Conference Program
</test-case>
<test-case id="WIU-tr" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-9/tr">
<test-case id="WIU-tr" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-9/tr">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
Evrensel Kod Nedir?
@ -527,7 +527,7 @@ Conference Program
</test-case>
<test-case id="WIU-tr-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1254/tr">
<test-case id="WIU-tr-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1254/tr">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
“Evrensel Kod” Nedir?
@ -548,4 +548,9 @@ Conference Program
şifrelemeyi desteklemek zorundadırlar; veriler, farklı şifreleme ve altyapılardan geçerken bozulma riski taşırlar.
</test-case>
<test-case id="bug-10532-utf-16" encodings="UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE">
foo 東京・銀座の歌舞伎座。4月に新調された4枚の緞帳のうち3枚は、京都の川島織物セルコンが織った朝日新聞デジタル会員(有料・無料)にご登録いただくと、様々な特典・サービスが受けられます。
</test-case>
</charset-detection-tests>

View file

@ -444,10 +444,12 @@ public class TestCharsetDetector extends TestFmwk
// Ignore any other converters that can't
// convert from Unicode.
logln("Unsupported encoding" + from);
return;
} catch (UnsupportedEncodingException uee) {
// Ignore any encodings that this runtime
// doesn't support.
logln("Unsupported encoding" + from);
return;
}