// Patch summary (commentary only; it sits before the first "diff --git" header,
// where patch tools skip leading text).
//
// CharsetDetector.java:
//   Adds two entries to the `recognizers` list: CharsetRecog_8859_1_da and
//   CharsetRecog_8859_1_nl, placed among the existing CharsetRecog_8859_1_*
//   entries.
//
// CharsetRecog_sbcs.java:
//   Adds two static nested classes extending CharsetRecog_8859_1:
//     - CharsetRecog_8859_1_da: getLanguage() returns "da"
//     - CharsetRecog_8859_1_nl: getLanguage() returns "nl"
//   Each class carries a 64-entry int[] `ngrams` table, and its
//   match(CharsetDetector) delegates to match(det, ngrams, byteMap).
//   `byteMap` and the three-argument match() are not defined in this patch.
//   Each table value appears to pack three byte values (e.g. 0x206465 reads
//   as ' ', 'd', 'e'), presumably a trigram; confirm against the base class.
//
//   The CharsetRecog_8859_1_de hunk removes and re-adds its ngrams rows with
//   identical values, so that change is presumably whitespace-only. The exact
//   original whitespace could not be recovered; indentation below is
//   reconstructed and should be checked against the target files.
diff --git a/icu4j/src/com/ibm/icu/text/CharsetDetector.java b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
index e9fd04576dc..8a55ad07708 100644
--- a/icu4j/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
@@ -354,11 +354,13 @@ public class CharsetDetector {
         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
 
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
diff --git a/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java b/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java
index 972d4800c2f..b826598fb89 100644
--- a/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java
@@ -211,14 +211,34 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         }
     }
 
+    static class CharsetRecog_8859_1_da extends CharsetRecog_8859_1
+    {
+        private static int[] ngrams = {
+            0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
+            0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
+            0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
+            0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
+        };
+
+        public String getLanguage()
+        {
+            return "da";
+        }
+
+        public int match(CharsetDetector det)
+        {
+            return match(det, ngrams, byteMap);
+        }
+    }
+
     static class CharsetRecog_8859_1_de extends CharsetRecog_8859_1
     {
         private static int[] ngrams = {
-            0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
-            0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
-            0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
-            0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
-        };
+            0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
+            0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
+            0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
+            0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
+        };
 
         public String getLanguage()
         {
@@ -311,6 +331,26 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         }
     }
 
+    static class CharsetRecog_8859_1_nl extends CharsetRecog_8859_1
+    {
+        private static int[] ngrams = {
+            0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
+            0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
+            0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
+            0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
+        };
+
+        public String getLanguage()
+        {
+            return "nl";
+        }
+
+        public int match(CharsetDetector det)
+        {
+            return match(det, ngrams, byteMap);
+        }
+    }
+
     static class CharsetRecog_8859_1_pt extends CharsetRecog_8859_1
     {
         private static int[] ngrams = {