ICU-6889 Add test for CharsetDetector.detectAll() producing the same encoding multiple times.

X-SVN-Rev: 31905
2025-04-17 02:37:25 +00:00 · 2012-06-05 17:40:59 +00:00 · 2012-06-05 17:40:59 +00:00 · 709b0884cc
commit 709b0884cc
parent cfb458d917
1 changed files with 25 additions and 1 deletions
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@ -11,6 +11,7 @@ import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.io.Reader;
 import java.io.UnsupportedEncodingException;
+import java.util.HashSet;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
@ -1104,7 +1105,30 @@ public class TestCharsetDetector extends TestFmwk

        name1 = match1.getName();
        assertEquals("Wrong charset name after running a second charset detector", "windows-1252", name1);
-
+    }
+    
+    public void TestBug6889() {
+        // Regression test for ICU-6889: CharsetDetector.detectAll() must not report the same encoding name more than once.
+        String text =
+            "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
+        byte[] textBytes;  // assigned inside the try block below
+        try {
+            textBytes = text.getBytes("ISO-8859-1");  // detector input: the sample encoded as ISO-8859-1
+        }
+        catch (Exception e) {
+            fail("Unexpected exception " + e.toString());
+            return;  // textBytes was never assigned on this path, so the rest of the test cannot run
+        }
+        
+        CharsetDetector det = new CharsetDetector();
+        det.setText(textBytes);
+        CharsetMatch matches[] = det.detectAll();
+        
+        HashSet<String> detectedEncodings = new HashSet<String>();  // charset names seen so far
+        for (CharsetMatch m: matches) {
+            assertTrue("Charset " + m.getName() + " encountered before",
+                        detectedEncodings.add(m.getName()));  // Set.add() returns false when the name is already present
+        }   
    }