ICU-124 charset detector, work in progress.

X-SVN-Rev: 17526
2025-04-13 08:53:20 +00:00 · 2005-04-29 23:30:56 +00:00 · 2005-04-29 23:30:56 +00:00 · ae956feaf8
commit ae956feaf8
parent 031e4f244e
3 changed files with 167 additions and 28 deletions
--- a/icu4j/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
@ -331,7 +331,7 @@ public class CharsetDetector {
    private static ArrayList createRecognizers() {
        ArrayList recognizers = new ArrayList();
        recognizers.add(new CharsetRecog_UTF8());
-        recognizers.add(new CharsetRecog_mbcs("Shift_JIS", new CharsetDetectEnc_sjis()));
+        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
--- a/icu4j/src/com/ibm/icu/text/CharsetRecog_2022.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_2022.java
@ -110,6 +110,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
        
        int   match(CharsetDetector det) {
            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        }
    }

    static class CharsetRecog_2022KR extends CharsetRecog_2022 {
@ -153,4 +154,4 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
    }
    
    }
-}
+
--- a/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
@ -6,42 +6,29 @@
 */
 package com.ibm.icu.text;

+
 /**
 * CharsetRecognizer implemenation for Asian  - double or multi-byte - charsets.
 *                   Match is determined mostly by the input data adhering to the
- *                   encoding scheme for the charset, although the hooks are here
- *                   to also check language based character occurence frequencies if that
- *                   proves to be necessary.
+ *                   encoding scheme for the charset, and, optionally,
+ *                   frequency-of-occurrence of characters.
 * <p/>
 *                   Instances of this class are singletons, one per encoding
 *                   being recognized.  They are created in the main
 *                   CharsetDetector class and kept in the global list of available
 *                   encodings to be checked.  The specific encoding being recognized
- *                   is determined by the CharsetDetectEncoding provided when an
- *                   instance of this class is created.
+ *                   is determined by the subclass.
 *                   
 */
-class CharsetRecog_mbcs extends CharsetRecognizer {
+abstract class CharsetRecog_mbcs extends CharsetRecognizer {

-    private CharsetDetectEncoding fEnc;
-    private String                fCharsetName;
-    
-    /**
-     * Constructor.  
-     * @param enc
-     */
-    CharsetRecog_mbcs(String charsetName, CharsetDetectEncoding enc) {
-        fEnc = enc;
-        fCharsetName = charsetName;
-    }
    
+     
    /**
     * Get the IANA name of this charset.
     * @return the charset name.
     */
-    String      getName() {
-        return fCharsetName;
-    }
+    abstract String      getName() ;
    
    
    /**
@ -56,21 +43,20 @@ class CharsetRecog_mbcs extends CharsetRecognizer {
     *             <br/>
     *             bits 8-15: The match reason, an enum-like value.
     */
-     int         match(CharsetDetector det) {
+    int         match(iteratedChar iter, CharsetDetector det) {
        int   singleByteCharCount = 0;
        int   doubleByteCharCount = 0;
        int   badCharCount        = 0;
        int   totalCharCount      = 0;
        
-        CharsetDetectEncoding.iteratedChar   ichar = new CharsetDetectEncoding.iteratedChar();
        
-        for (ichar.reset(); fEnc.nextChar(ichar, det);) {
+        for (iter.reset(); nextChar(iter, det);) {
            totalCharCount++;
-            if (ichar.error) {
+            if (iter.error) {
                badCharCount++; 
            } else {
                
-                if (ichar.charValue <= 0xff) {
+                if (iter.charValue <= 0xff) {
                    singleByteCharCount++;
                } else {
                    doubleByteCharCount++;
@ -85,8 +71,160 @@ class CharsetRecog_mbcs extends CharsetRecognizer {
        if (confidence > 100) {
            confidence = 100;
        }
-         
+        
        return confidence;
    }
+     
+     // "Character" iterator class & interface
+     //    Recognizers for specific mbcs encodings make their "characters" available
+     //    by subclassing this iterator class.   The returned characters are not converted
+     //    to Unicode - they are still values that are specific to the encoding - but
+     //    multi-byte sequences are combined to form single int values.
+     //
+     abstract static class iteratedChar {
+         int             charValue = 0;             // The char value is a value from the encoding.
+                                                    //   It's meaning is not well defined, other than
+                                                    //   different encodings
+         int             index     = 0;
+         int             nextIndex = 0;
+         boolean         error     = false;
+         boolean         done      = false;
+         
+         void reset() {
+             charValue = 0;
+             index     = -1;
+             nextIndex = 0;
+             error     = false;
+             done      = false;
+         }
+         
+         int nextByte(CharsetDetector det) {
+             if (nextIndex >= det.fInputLen) {
+                 done = true;
+                 return -1;
+             }
+             int byteValue = (int)det.fInputBytes[nextIndex++] & 0x00ff;
+             return byteValue;
+         }       
+     }
+     // Advance 'it' to the next character of det's input; false when there is none.
+     abstract boolean nextChar(iteratedChar it, CharsetDetector det);
+     

+
+     
+     
+     /**
+      *   Shift-JIS charset recognizer.  Supplies the Shift-JIS byte sequence rules
+      *   through nextChar(); match() is only a placeholder so far.
+      */
+     static class CharsetRecog_sjis extends CharsetRecog_mbcs {
+         // Read the next Shift-JIS character into 'it'; false once the input runs out.
+         boolean nextChar(iteratedChar it, CharsetDetector det) {
+             it.index = it.nextIndex;
+             it.error = false;
+             int firstByte = it.charValue = it.nextByte(det);
+             if (firstByte < 0) {
+                 return false;
+             }
+             // These lead byte values are complete single byte characters.
+             if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
+                 return true;
+             }
+             int secondByte = it.nextByte(det);
+             if (secondByte < 0)  {
+                 return false;
+             }
+             // Pack both bytes into one value; parenthesized as '+' and '|' bind differently to '<<'.
+             it.charValue = (firstByte << 8) | secondByte;
+             if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
+                 // Illegal second byte value.
+                 it.error = true;
+             }
+             return true;
+         }
+         
+         // Placeholder: always returns 0.
+         int match(CharsetDetector det) {
+             return 0;
+         }
+         
+         // The charset name reported by this recognizer.
+         String getName() {
+             return "SHIFT_JIS";
+         }
+     }
+     
+     
+     /**
+      *   EUC charset recognizer.  Abstract base that supplies the nextChar()
+      *   implementation for EUC based encodings.
+      */
+     abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
+         
+         /*
+          *  Get the next character value for EUC based encodings.
+          *  Character "value" is simply the raw bytes that make up the character
+          *     packed into an int.
+          *  Returns false once the end of the input has been reached.
+          */
+         boolean nextChar(iteratedChar it, CharsetDetector det) {
+             it.index = it.nextIndex;
+             it.error = false;
+             int firstByte  = 0;
+             int secondByte = 0;
+             int thirdByte  = 0;
+             int fourthByte = 0;    // never read in this method
+             // Each 'break buildChar' below skips ahead to the common return statement.
+             buildChar: {
+                 firstByte = it.charValue = it.nextByte(det);                 
+                 if (firstByte < 0) {
+                     // Ran off the end of the input data
+                     it.done = true;
+                     break buildChar;
+                 }
+                 if (firstByte <= 0x8d) {
+                     // single byte char
+                     break buildChar;
+                 }
+                 // Every remaining lead byte value takes at least one more byte.
+                 secondByte = it.nextByte(det);
+                 it.charValue = (it.charValue << 8) | secondByte;
+                 // (If the input ended here, nextByte() returned -1 and set it.done.)
+                 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
+                     // Two byte Char
+                     if (secondByte < 0xa1) {
+                         it.error = true;
+                     }
+                     break buildChar;
+                 }
+                 if (firstByte == 0x8e) {
+                     // Code Set 2.
+                     //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
+                     //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
+                     // We don't know which we've got.
+                     // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
+                     //   bytes will look like a well formed 2 byte char.  
+                     if (secondByte < 0xa1) {
+                         it.error = true;
+                     }
+                     break buildChar;                     
+                 }
+                 // NOTE(review): lead bytes 0x90-0xa0 and 0xff match no branch and are never flagged - intended?
+                 if (firstByte == 0x8f) {
+                     // Code set 3.
+                     // Three byte total char size, two bytes of actual char value.
+                     thirdByte    = it.nextByte(det);
+                     it.charValue = (it.charValue << 8) | thirdByte;
+                     if (thirdByte < 0xa1) {
+                         it.error = true;
+                     }
+                 }
+              }
+             
+             return (it.done == false);
+         }
+     }
+     
+     
 }