ICU-124 charset detector, work in progress.

X-SVN-Rev: 17554
2025-04-13 08:53:20 +00:00 · 2005-05-05 05:47:56 +00:00 · 2005-05-05 05:47:56 +00:00 · 2852ded666
commit 2852ded666
parent f2d4d6fa9e
3 changed files with 68 additions and 35 deletions
--- a/icu4j/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
@ -105,6 +105,8 @@ public class CharsetDetector {
                                          //   in which case we can't touch it.
        fRawLength = fInputStream.read(fRawInput);
        fInputStream.reset();
+        
+        MungeInput();                     // Strip html markup, collect byte stats.
        return this;
    }

@ -292,14 +294,14 @@ public class CharsetDetector {
     *  The following items are accessed by individual CharsetRecongizers during
     *     the recognition process
     */
-    byte[]      fInputBytes =     // The text to be checked.  Markup will have been
-                   new byte[4000];//   removed if appropriate.
+    byte[]      fInputBytes =       // The text to be checked.  Markup will have been
+                   new byte[4000];  //   removed if appropriate.
    
-    int         fInputLen;        // Length of the byte data in fInputText.
+    int         fInputLen;          // Length of the byte data in fInputBytes.
    
-    short       fByteStats[];     // byte frequency statistics for the input text.
-                                  //   Value is percent, not absolute.
-                                  //   Value is rounded up, so zero really means zero occurences.
+    short       fByteStats[] =      // byte frequency statistics for the input text.
+                   new short[256];  //   Value is percent, not absolute.
+                                    //   Value is rounded up, so zero really means zero occurrences.
    
    String      fDeclaredEncoding;
    
--- a/icu4j/src/com/ibm/icu/text/CharsetMatch.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetMatch.java
@ -86,8 +86,13 @@ public class CharsetMatch implements Comparable {
        return fConfidence;
    }
    
+    
+    static public final int ENCODING_SCHEME    = 1;
+    static public final int BOM                = 2;
+    static public final int DECLARED_ENCODING  = 4;
+    static public final int LANG_STATISTICS    = 8;
    /**
-     * Return an indication of what it was about input data that 
+     * Return indications of what it was about input data
     * that caused this charset to be considered as a possible match.
     * <p>
     * TODO: create a list of enum-like constants for the possible types of matches.
--- a/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
@ -83,7 +83,19 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
        if (confidence > 100) {
            confidence = 100;
        }
-        
+        if (commonChars != null && doubleByteCharCount > 0) {
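+            // Adjust the confidence by the percentage of double-byte chars counted
+            //   in commonCharCount: below 10% the match is rejected, and from 10% up
+            //   to 20% the confidence is scaled linearly from zero.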
+            int commonCharPercentage = commonCharCount*100 / doubleByteCharCount;
+            if (commonCharPercentage < 10) {
+                confidence = 0;
+            } else if (commonCharPercentage < 20) {
+                confidence = (confidence * (commonCharPercentage-10)) / 10;
+            } else {
+                // Percent of frequently occurring chars is >= 20.
+                //   Let the initial confidence, based solely on the encoding scheme match, stand.
+            }
+        }
+         
        return confidence;
    }
     
@ -94,6 +106,10 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
     //    The returned characters are not converted to Unicode, but remain as the raw
     //    bytes (concatenated into an int) from the codepage data.
     //
+     //  For Asian charsets, use the raw input rather than the input that has been
+     //   stripped of markup.  Detection only considers multi-byte chars, effectively
+     //   stripping markup anyway, and double byte chars do occur in markup too.
+     //
     static class iteratedChar {
         int             charValue = 0;             // 1-4 bytes from the raw input data
         int             index     = 0;
@ -110,11 +126,11 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
         }
         
         int nextByte(CharsetDetector det) {
-             if (nextIndex >= det.fInputLen) {
+             if (nextIndex >= det.fRawLength) {
                 done = true;
                 return -1;
             }
-             int byteValue = (int)det.fInputBytes[nextIndex++] & 0x00ff;
+             int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
             return byteValue;
         }       
     }
@ -259,9 +275,16 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                 // TODO:  This set of data comes from the character frequency-
                 //        of-occurence analysis tool.  The data needs to be moved
                 //        into a resource and loaded from there.
-                    {0xa4ce, 0xa4c7, 0xa4a4, 0xa4b9, 0xa4b7, 0xa4cb, 0xa1a2, 0xa4c6, 0xa4c8, 
-                     0xa4de, 0xa4cf, 0xa1bc, 0xa1a3, 0xa4eb, 0xa4f2, 0xa4ca, 0xa4ac, 0xa4bf, 0xa4ec, 
-                     0xa4a6, 0xa4b3, 0xa4ab, 0xa4e2, 0xa5f3, 0xa5c8, 0xa5b9, 0xa5af, 0xa5a4, 0xa4ea};
+                    {0xa4ce, 0xa4c7, 0xa4a4, 0xa1bc, 0xa1a2, 0xa4b7, 0xa4cb, 0xa4b9, 0xa1a3, 0xa4c6, 
+                     0xa4c8, 0xa4cf, 0xa4de, 0xa4f2, 0xa4eb, 0xa4ca, 0xa4ac, 0xa5f3, 0xa4bf, 0xa5b9, 
+                     0xa4ec, 0xa5a4, 0xa4a6, 0xa4ab, 0xa5c8, 0xa4b3, 0xa1a6, 0xa4e2, 0xa5eb, 0xa5af, 
+                     0xa4ea, 0xa4e9, 0xa1a1, 0xa5c3, 0xa5e9, 0xa4c3, 0xa5ea, 0xa4ad, 0xa5d7, 0xa4b5, 
+                     0xa4f3, 0xa4a2, 0xa5c9, 0xc6fc, 0xa1d6, 0xa1d7, 0xa5bf, 0xa4e8, 0xa5b8, 0xa4af, 
+                     0xa5e1, 0xa4a8, 0xa4bb, 0xa4bd, 0xa4c0, 0xa5a2, 0xa5d5, 0xa4b1, 0xbfb7, 0xa4aa, 
+                     0xa4c4, 0xa5b5, 0xbbc8, 0xa5d6, 0xa4c9, 0xcaf3, 0xa5b7, 0xcbdc, 0xc4ea, 0xa5a6, 
+                     0xa4d0, 0xa5e5, 0xcdd1, 0xa4e1, 0xa4df, 0xa5d0, 0xa5a3, 0xb8a9, 0xa5b3, 0xa5de, 
+                     0xa5ed, 0xa5a7, 0xa5b0, 0xa5e0, 0xa4ef, 0xb9d4, 0xa5aa, 0xa5c6, 0xbef0, 0xcab8, 
+                     0xa1ca, 0xa1cb, 0xa5cb, 0xbaee, 0xa4c1, 0xa5ad, 0xa5c7, 0xa4e4, 0xa5ec, 0xc7bd};
             
             String getName() {
                 return "EUC_JP";
@ -283,13 +306,16 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                 // TODO:  This set of data comes from the character frequency-
                 //        of-occurence analysis tool.  The data needs to be moved
                 //        into a resource and loaded from there.
-                    {0xc0cc, 0xbbe7, 0xc0c7, 0xb1e2, 0xb4eb, 0xbdba, 0xc1f6, 0xbab8, 0xc1a4, 
-                     0xbdc3, 0xc7d1, 0xb4d9, 0xbfa1, 0xb4c2, 0xb0a1, 0xc0da, 0xc7cf, 0xbcad, 0xb8ae, 
-                     0xc0bb, 0xb0ed, 0xb7ce, 0xc1a6, 0xc0ce, 0xc8b8, 0xbff8, 0xb1b9, 0xbace, 0xb5b5, 
-                     0xc0fc, 0xbec6, 0xbfa9, 0xc0cf, 0xb0f8, 0xb5bf, 0xb1b8, 0xbfac, 0xc0fb, 0xbaf1, 
-                     0xb1b3, 0xc0a7, 0xc7d8, 0xc7d0, 0xb0fa, 0xc8ad, 0xbcd2, 0xbcf6, 0xbbf3, 0xc0ba, 
-                     0xc0b0, 0xbeee, 0xc1d6, 0xb9ae, 0xc0e5, 0xbfeb, 0xb8a6, 0xbcba, 0xc6ae, 0xc0db, 
-                     0xb0e8, 0xc0d6};
+                    {0xc0cc, 0xb4d9, 0xb4c2, 0xc0c7, 0xbfa1, 0xc7cf, 0xb0a1, 0xb0ed, 0xc7d1, 0xc1f6, 
+                     0xc0bb, 0xb7ce, 0xb1e2, 0xbcad, 0xc0ba, 0xbbe7, 0xc1a4, 0xc0da, 0xb5b5, 0xb8a6, 
+                     0xbeee, 0xb4cf, 0xbcf6, 0xbdc3, 0xb1d7, 0xb4eb, 0xb8ae, 0xc0ce, 0xb3aa, 0xbec6, 
+                     0xc0d6, 0xbab8, 0xb5e9, 0xb6f3, 0xc7d8, 0xb0cd, 0xc0cf, 0xbdba, 0xc0b8, 0xb1b9, 
+                     0xc1a6, 0xb9fd, 0xbbf3, 0xb0d4, 0xb8e9, 0xb8b8, 0xb0fa, 0xc0fb, 0xbace, 0xc1d6, 
+                     0xbfa9, 0xc0fc, 0xbfeb, 0xb9ae, 0xc6ae, 0xbbfd, 0xbcba, 0xc0a7, 0xbff8, 0xb5c7, 
+                     0xbfe4, 0xbfec, 0xbdc5, 0xc7d2, 0xc7e5, 0xb0fc, 0xb1b8, 0xbaf1, 0xbedf, 0xc5cd, 
+                     0xb8b6, 0xbdc0, 0xb7af, 0xb5bf, 0xb3bb, 0xc8ad, 0xc0bd, 0xb0b3, 0xc4a1, 0xb7c2, 
+                     0xb9ab, 0xc0af, 0xbef8, 0xb5a5, 0xbcd2, 0xb9ce, 0xc1df, 0xbfc0, 0xc1f8, 0xb0e6, 
+                     0xb1c7, 0xbad0, 0xbefa, 0xc0e5, 0xbec8, 0xc1b6, 0xb8bb, 0xb0f8, 0xb9cc, 0xb0c5};
             
             String getName() {
                 return "EUC_KR";
@ -310,21 +336,21 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                 // TODO:  This set of data comes from the character frequency-
                 //        of-occurence analysis tool.  The data needs to be moved
                 //        into a resource and loaded from there.
-                    {0xb5c4, 0xd6d0, 0xa1a4, 0xa1a1, 0xa3ac, 0xcce5, 0xcec4, 0xd1a7, 0xcdf8, 
-                     0xb9fa, 0xcbce, 0xc8cb, 0xd3c3, 0xa1a3, 0xd2bb, 0xa3ba, 0xb4f3, 0xbbe1, 0xd0c2, 
-                     0xa1a2, 0xd4da, 0xb1a8, 0xb0a9, 0xb7a2, 0xc9cf, 0xd3d0, 0xc9fa, 0xc2db, 0xb1b1, 
-                     0xbcfe, 0xc8d5, 0xcab1, 0xbfc9, 0xc7f8, 0xbdcc, 0xbea9, 0xb2bb, 0xb7d6, 0xd2d4, 
-                     0xc4ea, 0xd2b3, 0xcfc2, 0xbacd, 0xd7d6, 0xbde1, 0xd0c5, 0xd3fd, 0xc3f1, 0xb8df, 
-                     0xd1d0, 0xbfbc, 0xcac7, 0xbcd2, 0xb3c9, 0xd7d4, 0xceaa, 0xc8eb, 0xd0c4, 0xbfc6, 
-                     0xd7a8, 0xbfaa, 0xcfa2, 0xbbaf, 0xb8e6, 0xcfdf, 0xd7ca, 0xb6af, 0xb7a8, 0xcaf8, 
-                     0xd2bd, 0xd0d0, 0xa1b0, 0xcad0, 0xa1b1, 0xb1be, 0xb7bd, 0xb2e9, 0xcad4, 0xced2, 
-                     0xb6e0, 0xb1ed, 0xd5be, 0xc4da, 0xd7f7, 0xd2aa, 0xb8f6, 0xbbaa, 0xc9e7, 0xbead, 
-                     0xd5df, 0xc3e6, 0xbbfa, 0xbebf, 0xd2a9, 0xb5bd, 0xb3f6, 0xc0ed, 0xb5e3, 0xcab9, 
-                     0xbcd3, 0xc6da, 0xb0b8, 0xd7d3, 0xbac3, 0xb9d8, 0xcec5, 0xc3fb, 0xd5b9, 0xb2bf, 
-                     0xb9ab, 0xc1cb, 0xd6ce, 0xb9a4, 0xccec, 0xb9e3, 0xb5d8, 0xd4c2, 0xc7eb, 0xbcbc, 
-                     0xb0e6, 0xb5c0, 0xc4dc, 0xd4ba, 0xd3eb, 0xb6a8, 0xb5e7, 0xcef1, 0xcce2, 0xcff2, 
-                     0xbaf3, 0xd3d1, 0xc1ac, 0xcae4, 0xcfb5, 0xcae9, 0xd7a2, 0xbdab, 0xd6f7, 0xc8ab, 
-                     0xc2eb, 0xbdf0, 0xb6d4, 0xccd8, 0xcee5, 0xceca, 0xc0b4, 0xd2b5, 0xcabe};
+             {0xa3ac, 0xb5c4, 0xa1a1, 0xa1a4, 0xa1a3, 0xcac7, 0xd2bb, 0xb4f3, 0xd4da, 0xd6d0, 
+                     0xcafd, 0xd3d0, 0xa1f3, 0xb2bb, 0xa3ba, 0xbbfa, 0xc8cb, 0xa1a2, 0xd3c3, 0xd1a7, 
+                     0xc8d5, 0xbedd, 0xb8f6, 0xd0c2, 0xcdf8, 0xd2aa, 0xb9fa, 0xc1cb, 0xc9cf, 0xa1b0, 
+                     0xa1b1, 0xced2, 0xbcfe, 0xcec4, 0xd2d4, 0xc4dc, 0xc0b4, 0xd4c2, 0xcab1, 0xd0d0, 
+                     0xbdcc, 0xbfc9, 0xb6d4, 0xbcdb, 0xb1be, 0xb3f6, 0xb8b4, 0xc9fa, 0xb1b8, 0xbcbc, 
+                     0xcfc2, 0xbacd, 0xbecd, 0xb3c9, 0xd5e2, 0xb8df, 0xb7d6, 0xc5cc, 0xbfc6, 0xbbe1, 
+                     0xceaa, 0xc8e7, 0xcfb5, 0xa1f1, 0xc4ea, 0xb1a8, 0xb6af, 0xc0ed, 0xd3fd, 0xb7a2, 
+                     0xc8ab, 0xb7bd, 0xcee5, 0xc2db, 0xbba7, 0xd0d4, 0xb9c9, 0xc3c7, 0xb9fd, 0xcad0, 
+                     0xb5e3, 0xbbd6, 0xcfd6, 0xcab5, 0xd2b2, 0xbfb4, 0xb6e0, 0xccec, 0xc7f8, 0xd0c5, 
+                     0xcad6, 0xb9d8, 0xb5bd, 0xb7dd, 0xc6f7, 0xcaf5, 0xa3a1, 0xb7a8, 0xb9ab, 0xd2b5, 
+                     0xcbf9, 0xcdbc, 0xc6e4, 0xd3da, 0xd0a1, 0xd1a1, 0xd3ce, 0xbfaa, 0xb4e6, 0xc4bf, 
+                     0xd7f7, 0xb5e7, 0xcdb3, 0xc7e9, 0xd7ee, 0xc6c0, 0xcfdf, 0xb5d8, 0xb5c0, 0xbead, 
+                     0xb4c5, 0xc6b7, 0xc4da, 0xd0c4, 0xb9a4, 0xd4aa, 0xc2bc, 0xc3c0, 0xbaf3, 0xcabd, 
+                     0xbcd2, 0xcef1, 0xbdab, 0xa3ad, 0xa3bf, 0xb3a4, 0xb9fb, 0xd6ae, 0xc1bf, 0xbbd8, 
+                     0xb8f1, 0xb6f8, 0xb6a8, 0xcde2, 0xbac3, 0xb3cc, 0xccd8, 0xd7d4, 0xcbb5};
             
             String getName() {
                 return "EUC_CN";