ICU-5410 Improve testing of CharsetRecognizer::getLanguage

X-SVN-Rev: 20492
2025-04-08 06:53:45 +00:00 · 2006-10-05 18:51:11 +00:00 · 2006-10-05 18:51:11 +00:00 · 348c2eb1ff
commit 348c2eb1ff
parent f304022ded
2 changed files with 13 additions and 11 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
+++ b/icu4j/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
@@ -199,7 +199,7 @@
    </test-case>

    <!-- No EUC-JP in this test because it detects as GB18030 -->
-    <test-case id="IUC10-jp" encodings="UTF-8 UTF-32BE UTF-32LE Shift_JIS ISO-2022-JP">
+    <test-case id="IUC10-jp" encodings="UTF-8 UTF-32BE UTF-32LE Shift_JIS/ja ISO-2022-JP">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    ヨーロッパ、ソフトウェア、そしてインターネット:
@@ -214,7 +214,7 @@

    </test-case>

-    <test-case id="IUC10-ko" encodings="UTF-8 UTF-32BE UTF-32LE EUC-KR ISO-2022-KR">
+    <test-case id="IUC10-ko" encodings="UTF-8 UTF-32BE UTF-32LE EUC-KR/ko ISO-2022-KR">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    유럽, 소프트웨어 그리고 인터넷:
@@ -246,8 +246,8 @@

    </test-case>

-    <!-- No language for ISO-8859-1 in this test because no-BO is recogonized as Danish... -->
-    <test-case id="IUC10-no-BO" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1">
+    <!-- ISO-8859-1 is expected to report Danish (/da) in this test because no-NO is recognized as Danish... -->
+    <test-case id="IUC10-no-NO" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/da">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, Programvare og Internet:
@@ -262,7 +262,7 @@

    </test-case>

-    <test-case id="IUC10-no-NY" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/no">
+    <test-case id="IUC10-no-NO-NY" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/no">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    Europa, programvare og Internett:
@@ -395,7 +395,7 @@
    <!-- /test-case -->

    <!-- No ISO-2022-CN in this test because Java doesn't support it in both directions :-( -->
-    <test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-32BE UTF-32LE ISO-2022-CN GB18030">
+    <test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-32BE UTF-32LE ISO-2022-CN GB18030/zh">
    <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->

    欧洲，软件＋互联网
--- a/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@@ -27,9 +27,6 @@ import org.w3c.dom.*;

 /**
 * @author andy
- *
- * TODO To change the template for this generated type comment go to
- * Window - Preferences - Java - Code Style - Code Templates
 */
 public class TestCharsetDetector extends TestFmwk
 {
@@ -101,7 +98,11 @@ public class TestCharsetDetector extends TestFmwk
            return;
        }
        
-        if (! (language == null || m.getLanguage().equals(language))) {
+        String charsetMatchLanguage = m.getLanguage();
+        if ((language != null && !charsetMatchLanguage.equals(language))
+            || (language == null && charsetMatchLanguage != null)
+            || (language != null && charsetMatchLanguage == null))
+        {
            errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
        }
        
@@ -179,7 +180,8 @@ public class TestCharsetDetector extends TestFmwk
            det.setText(new ByteArrayInputStream(bytes));
            checkMatch(det, testString, enc, lang, id);
         } catch (Exception e) {
-            errln(id + ": " + e.toString());
+            errln(id + ": " + e.toString() + "enc=" + enc);
+            e.printStackTrace();
        }
    }