ICU-4060 Shift-JIS: add statistics, fix a bug in nextChar().

X-SVN-Rev: 17879
This commit is contained in:
Eric Mader 2005-06-14 00:41:51 +00:00
parent 03f94dc608
commit e712baf4af
2 changed files with 17 additions and 5 deletions

View file

@ -198,9 +198,8 @@
</test-case>
<!-- No UTF-8 in this test because it detects as Shift_JIS -->
<!-- No EUC-JP in this test because it detects as EUC-CN -->
<test-case id="IUC10-jp" encodings="Shift_JIS ISO-2022-JP">
<test-case id="IUC10-jp" encodings="UTF-8 Shift_JIS ISO-2022-JP">
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
ヨーロッパ、ソフトウェア、そしてインターネット:
@ -378,7 +377,6 @@
</test-case>
<!-- No UTF-8 in this test because it detects as Shift_JIS -->
<!-- No ISO-2022-CN in this test because Java doesn't support it in both directions :-( -->
<!-- test-case id="IUC10-zh-Hant" encodings="UTF-8 ISO-2022-CN" -->
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

View file

@ -176,6 +176,20 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
*
*/
static class CharsetRecog_sjis extends CharsetRecog_mbcs {
// Table of two-byte character codes handed to match(det, commonChars) by
// this recognizer. Presumably these are the Shift-JIS codes of the most
// frequently occurring characters -- confirm against the analysis tool output.
static int [] commonChars =
// TODO: This set of data comes from the character frequency-
// of-occurrence analysis tool. The data needs to be moved
// into a resource and loaded from there.
{0x82cc, 0x82c5, 0x82a2, 0x815b, 0x8141, 0x82b5, 0x82c9, 0x82b7, 0x8142, 0x82c4,
0x82c6, 0x82cd, 0x82dc, 0x82f0, 0x82e9, 0x82c8, 0x82aa, 0x8393, 0x82bd, 0x8358,
0x82ea, 0x8343, 0x82a4, 0x82a9, 0x8367, 0x82b1, 0x8145, 0x82e0, 0x838b, 0x834e,
0x82e8, 0x82e7, 0x8140, 0x8362, 0x8389, 0x82c1, 0x838a, 0x82ab, 0x8376, 0x82b3,
0x82f1, 0x82a0, 0x8368, 0x93fa, 0x8175, 0x8176, 0x835e, 0x82e6, 0x8357, 0x82ad,
0x8381, 0x82a6, 0x82b9, 0x82bb, 0x82be, 0x8341, 0x8374, 0x82af, 0x9056, 0x82a8,
0x82c2, 0x8354, 0x8e67, 0x8375, 0x82c7, 0x95f1, 0x8356, 0x967b, 0x92e8, 0x8345,
0x82ce, 0x8385, 0x9770, 0x82df, 0x82dd, 0x836f, 0x8342, 0x8ca7, 0x8352, 0x837d,
0x838d, 0x8346, 0x834f, 0x8380, 0x82ed, 0x8d73, 0x8349, 0x8365, 0x8fee, 0x95b6,
0x8169, 0x816a, 0x836a, 0x8dec, 0x82bf, 0x834c, 0x8366, 0x82e2, 0x838c, 0x945c};
boolean nextChar(iteratedChar it, CharsetDetector det) {
it.index = it.nextIndex;
@ -194,7 +208,7 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
if (secondByte < 0) {
return false;
}
it.charValue = firstByte << 8 + secondByte;
it.charValue = (firstByte << 8) | secondByte;
if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
// Illegal second byte value.
it.error = true;
@ -203,7 +217,7 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
}
int match(CharsetDetector det) {
return match(det, null);
return match(det, commonChars);
}
String getName() {