From 291516499b76cc82f4076dbf0ec1991b5053fc02 Mon Sep 17 00:00:00 2001 From: Eric Mader Date: Sat, 28 May 2005 00:30:14 +0000 Subject: [PATCH] ICU-4060 Clean up charset names to match what Java expects, don't emit confidence over 100. X-SVN-Rev: 17728 --- icu4j/src/com/ibm/icu/text/CharsetDetector.java | 2 +- icu4j/src/com/ibm/icu/text/CharsetRecog_2022.java | 6 +++--- icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java | 10 ++++++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/icu4j/src/com/ibm/icu/text/CharsetDetector.java b/icu4j/src/com/ibm/icu/text/CharsetDetector.java index 1b1690bd946..e2e140f7734 100644 --- a/icu4j/src/com/ibm/icu/text/CharsetDetector.java +++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java @@ -443,8 +443,8 @@ public class CharsetDetector { recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru()); recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar()); recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el()); + recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he()); recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he_visual()); recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251()); recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256()); recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R()); diff --git a/icu4j/src/com/ibm/icu/text/CharsetRecog_2022.java b/icu4j/src/com/ibm/icu/text/CharsetRecog_2022.java index 8c8d787fa04..9b19536a32c 100644 --- a/icu4j/src/com/ibm/icu/text/CharsetRecog_2022.java +++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_2022.java @@ -105,7 +105,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer { }; String getName() { - return "ISO2022-JP"; + return "ISO-2022-JP"; } int match(CharsetDetector det) { @@ -119,7 +119,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer { }; String getName() { - return "ISO2022-KR"; + return "ISO-2022-KR"; } 
int match(CharsetDetector det) { @@ -144,7 +144,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer { }; String getName() { - return "ISO2022-CN"; + return "ISO-2022-CN"; } diff --git a/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java b/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java index b280211c535..52c7e31dfe9 100644 --- a/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java +++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java @@ -144,6 +144,12 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer { // return 0; // } + // TODO - This is a bit of a hack to take care of a case + // where we were getting a confidence of 135... + if (rawPercent > 0.33) { + return 98; + } + return (int) (rawPercent * 300.0); } } @@ -757,7 +763,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer { } } - static class CharsetRecog_8859_8_he extends CharsetRecog_8859_8 + static class CharsetRecog_8859_8_I_he extends CharsetRecog_8859_8 { private static int[] ngrams = { 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0, @@ -777,7 +783,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer { } } - static class CharsetRecog_8859_8_he_visual extends CharsetRecog_8859_8 + static class CharsetRecog_8859_8_he extends CharsetRecog_8859_8 { private static int[] ngrams = { 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,