ICU-124 charset detector, work in progress.

X-SVN-Rev: 17526
This commit is contained in:
Andy Heninger 2005-04-29 23:30:56 +00:00
parent 031e4f244e
commit ae956feaf8
3 changed files with 167 additions and 28 deletions

View file

@ -331,7 +331,7 @@ public class CharsetDetector {
private static ArrayList createRecognizers() {
ArrayList recognizers = new ArrayList();
recognizers.add(new CharsetRecog_UTF8());
recognizers.add(new CharsetRecog_mbcs("Shift_JIS", new CharsetDetectEnc_sjis()));
recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());

View file

@ -110,6 +110,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
/**
 * Run this recognizer against the detector's input.
 * Delegates to the three-argument match() overload, handing it the
 * detector's input bytes, the input length, and this recognizer's
 * escapeSequences.
 *
 * @param det the detector holding the input data to examine.
 * @return whatever the three-argument match() overload returns.
 */
int match(CharsetDetector det) {
return match(det.fInputBytes, det.fInputLen, escapeSequences);
}
}
static class CharsetRecog_2022KR extends CharsetRecog_2022 {
@ -153,4 +154,4 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
}
}
}

View file

@ -6,42 +6,29 @@
*/
package com.ibm.icu.text;
/**
* CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
* Match is determined mostly by the input data adhering to the
* encoding scheme for the charset, although the hooks are here
* to also check language based character occurrence frequencies if that
* proves to be necessary.
* encoding scheme for the charset, and, optionally,
* frequency-of-occurrence of characters.
* <p/>
* Instances of this class are singletons, one per encoding
* being recognized. They are created in the main
* CharsetDetector class and kept in the global list of available
* encodings to be checked. The specific encoding being recognized
* is determined by the CharsetDetectEncoding provided when an
* instance of this class is created.
* is determined by subclass.
*
*/
class CharsetRecog_mbcs extends CharsetRecognizer {
abstract class CharsetRecog_mbcs extends CharsetRecognizer {
private CharsetDetectEncoding fEnc;
private String fCharsetName;
/**
* Constructor.
* @param enc
*/
CharsetRecog_mbcs(String charsetName, CharsetDetectEncoding enc) {
fEnc = enc;
fCharsetName = charsetName;
}
/**
* Get the IANA name of this charset.
* @return the charset name.
*/
String getName() {
return fCharsetName;
}
abstract String getName() ;
/**
@ -56,21 +43,20 @@ class CharsetRecog_mbcs extends CharsetRecognizer {
* <br/>
* bits 8-15: The match reason, an enum-like value.
*/
int match(CharsetDetector det) {
int match(iteratedChar iter, CharsetDetector det) {
int singleByteCharCount = 0;
int doubleByteCharCount = 0;
int badCharCount = 0;
int totalCharCount = 0;
CharsetDetectEncoding.iteratedChar ichar = new CharsetDetectEncoding.iteratedChar();
for (ichar.reset(); fEnc.nextChar(ichar, det);) {
for (iter.reset(); nextChar(iter, det);) {
totalCharCount++;
if (ichar.error) {
if (iter.error) {
badCharCount++;
} else {
if (ichar.charValue <= 0xff) {
if (iter.charValue <= 0xff) {
singleByteCharCount++;
} else {
doubleByteCharCount++;
@ -85,8 +71,160 @@ class CharsetRecog_mbcs extends CharsetRecognizer {
if (confidence > 100) {
confidence = 100;
}
return confidence;
}
// "Character" iterator class & interface
// Recognizers for specific mbcs encodings make their "characters" available
// by subclassing this iterator class. The returned characters are not converted
// to Unicode - they are still values that are specific to the encoding - but
// multi-byte sequences are combined to form single int values.
//
abstract static class iteratedChar {
    // Value of the current character, taken straight from the encoding.
    // It is not converted to Unicode; the bytes of a multi-byte sequence
    // are combined into this single int, so its meaning is specific to
    // the encoding being examined.
    int charValue = 0;

    // Set to -1 by reset(); recognizers copy nextIndex into it before
    // they start reading a character.
    int index = 0;

    // Position in the detector's input of the next byte nextByte() returns.
    int nextIndex = 0;

    // Set by recognizers when the bytes just read are not a legal character.
    boolean error = false;

    // Set once nextByte() has been asked for a byte past the end of input.
    boolean done = false;

    /**
     * Put the iterator back into its initial state, positioned before
     * the first byte of the input.
     */
    void reset() {
        done = false;
        error = false;
        nextIndex = 0;
        index = -1;
        charValue = 0;
    }

    /**
     * Fetch the next input byte as an unsigned value and advance past it.
     *
     * @param det the detector holding the input bytes and their length.
     * @return the byte value in the range 0..255, or -1 (with done set
     *         to true) when the input is exhausted.
     */
    int nextByte(CharsetDetector det) {
        if (nextIndex < det.fInputLen) {
            // Mask so that bytes >= 0x80 come back as positive values.
            return (int)det.fInputBytes[nextIndex++] & 0x00ff;
        }
        done = true;
        return -1;
    }
}
/**
 * Get the next character (however many bytes it occupies) from the input.
 * Each encoding-specific recognizer supplies its own implementation; the
 * character's value and error state are reported through the iterator.
 *
 * @param it  the iterator that tracks the position in the input and
 *            receives the character value and error flag.
 * @param det the detector holding the input data.
 * @return true if a character was read, false once the input is exhausted.
 */
abstract boolean nextChar(iteratedChar it, CharsetDetector det);
/**
* Shift-JIS charset recognizer.
*
*/
static class CharsetRecog_sjis extends CharsetRecog_mbcs {

    /**
     * Get the next Shift-JIS character from the input.
     * Bytes 0x00-0x7f and 0xa1-0xdf are treated as single-byte characters;
     * any other first byte starts a two-byte character whose value is the
     * two raw bytes packed into an int (first byte in the high position).
     *
     * @param it  the iterator that tracks the input position and receives
     *            the character value and error flag.
     * @param det the detector holding the input data.
     * @return true if a character was read (it.error tells whether it was
     *         well formed), false if the input ran out, including when it
     *         ran out in the middle of a two-byte character.
     */
    boolean nextChar(iteratedChar it, CharsetDetector det) {
        it.index = it.nextIndex;
        it.error = false;

        int firstByte;
        firstByte = it.charValue = it.nextByte(det);
        if (firstByte < 0) {
            // Ran off the end of the input data.
            return false;
        }

        if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) {
            // Single-byte character.
            return true;
        }

        int secondByte = it.nextByte(det);
        if (secondByte < 0) {
            // Input ended after the first byte of a two-byte character.
            return false;
        }

        // Parenthesized on purpose: '+' binds tighter than '<<', so the
        // unparenthesized "firstByte << 8 + secondByte" would shift by
        // (8 + secondByte) instead of combining the two bytes.
        it.charValue = (firstByte << 8) | secondByte;

        if (! ((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
            // Illegal second byte value.
            it.error = true;
        }
        return true;
    }

    /**
     * Test the detector's input for Shift-JIS.
     * Currently a placeholder that always reports 0.
     *
     * @param det the detector holding the input data.
     * @return 0 in all cases.
     */
    int match(CharsetDetector det) {
        return 0;
    }

    /**
     * Get the name of the charset handled by this recognizer.
     *
     * @return the string "SHIFT_JIS".
     */
    String getName() {
        return "SHIFT_JIS";
    }
}
/**
* EUC charset recognizer.
*
*/
abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {

    /*
     * (non-Javadoc)
     * Fetch the next character for the EUC family of encodings.
     * The character "value" is just the raw bytes of the character
     * packed, first byte highest, into an int.
     * Returns false once the input has been exhausted.
     */
    boolean nextChar(iteratedChar it, CharsetDetector det) {
        it.index = it.nextIndex;
        it.error = false;

        final int lead = it.nextByte(det);
        it.charValue = lead;
        if (lead < 0) {
            // Ran off the end of the input data.
            it.done = true;
            return false;
        }
        if (lead <= 0x8d) {
            // Single byte char.
            return !it.done;
        }

        // Every remaining lead byte value consumes a second byte.
        final int second = it.nextByte(det);
        it.charValue = (it.charValue << 8) | second;

        if (lead >= 0xA1 && lead <= 0xfe) {
            // Two byte char.
            if (second < 0xa1) {
                it.error = true;
            }
        } else if (lead == 0x8e) {
            // Code Set 2.
            //   EUC-JP: the char is 2 bytes in total, 1 byte of char value.
            //   EUC-TW: the char is 4 bytes in total, 3 bytes of char value.
            // There is no way to tell which one this is, so handle it as
            // EUC-JP. If the data really was EUC-TW, the two bytes that
            // follow will look like a well formed 2 byte char.
            if (second < 0xa1) {
                it.error = true;
            }
        } else if (lead == 0x8f) {
            // Code Set 3.
            // Three bytes in total, two of them carrying the char value.
            final int third = it.nextByte(det);
            it.charValue = (it.charValue << 8) | third;
            if (third < 0xa1) {
                it.error = true;
            }
        }
        return !it.done;
    }
}
}