mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-124 charset detector, work in progress.
X-SVN-Rev: 17526
This commit is contained in:
parent
031e4f244e
commit
ae956feaf8
3 changed files with 167 additions and 28 deletions
|
@ -331,7 +331,7 @@ public class CharsetDetector {
|
|||
private static ArrayList createRecognizers() {
|
||||
ArrayList recognizers = new ArrayList();
|
||||
recognizers.add(new CharsetRecog_UTF8());
|
||||
recognizers.add(new CharsetRecog_mbcs("Shift_JIS", new CharsetDetectEnc_sjis()));
|
||||
recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
|
||||
recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
|
||||
recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
|
||||
recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
|
||||
|
|
|
@ -110,6 +110,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
|
|||
|
||||
int match(CharsetDetector det) {
|
||||
return match(det.fInputBytes, det.fInputLen, escapeSequences);
|
||||
}
|
||||
}
|
||||
|
||||
static class CharsetRecog_2022KR extends CharsetRecog_2022 {
|
||||
|
@ -153,4 +154,4 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
|
|||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -6,42 +6,29 @@
|
|||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
|
||||
/**
|
||||
* CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
|
||||
* Match is determined mostly by the input data adhering to the
|
||||
* encoding scheme for the charset, although the hooks are here
|
||||
* to also check language based character occurrence frequencies if that
|
||||
* proves to be necessary.
|
||||
* encoding scheme for the charset, and, optionally,
|
||||
* frequency-of-occurrence of characters.
|
||||
* <p/>
|
||||
* Instances of this class are singletons, one per encoding
|
||||
* being recognized. They are created in the main
|
||||
* CharsetDetector class and kept in the global list of available
|
||||
* encodings to be checked. The specific encoding being recognized
|
||||
* is determined by the CharsetDetectEncoding provided when an
|
||||
* instance of this class is created.
|
||||
* is determined by subclass.
|
||||
*
|
||||
*/
|
||||
class CharsetRecog_mbcs extends CharsetRecognizer {
|
||||
abstract class CharsetRecog_mbcs extends CharsetRecognizer {
|
||||
|
||||
private CharsetDetectEncoding fEnc;
|
||||
private String fCharsetName;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
* @param enc
|
||||
*/
|
||||
CharsetRecog_mbcs(String charsetName, CharsetDetectEncoding enc) {
|
||||
fEnc = enc;
|
||||
fCharsetName = charsetName;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the IANA name of this charset.
|
||||
* @return the charset name.
|
||||
*/
|
||||
String getName() {
|
||||
return fCharsetName;
|
||||
}
|
||||
abstract String getName() ;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -56,21 +43,20 @@ class CharsetRecog_mbcs extends CharsetRecognizer {
|
|||
* <br/>
|
||||
* bits 8-15: The match reason, an enum-like value.
|
||||
*/
|
||||
int match(CharsetDetector det) {
|
||||
int match(iteratedChar iter, CharsetDetector det) {
|
||||
int singleByteCharCount = 0;
|
||||
int doubleByteCharCount = 0;
|
||||
int badCharCount = 0;
|
||||
int totalCharCount = 0;
|
||||
|
||||
CharsetDetectEncoding.iteratedChar ichar = new CharsetDetectEncoding.iteratedChar();
|
||||
|
||||
for (ichar.reset(); fEnc.nextChar(ichar, det);) {
|
||||
for (iter.reset(); nextChar(iter, det);) {
|
||||
totalCharCount++;
|
||||
if (ichar.error) {
|
||||
if (iter.error) {
|
||||
badCharCount++;
|
||||
} else {
|
||||
|
||||
if (ichar.charValue <= 0xff) {
|
||||
if (iter.charValue <= 0xff) {
|
||||
singleByteCharCount++;
|
||||
} else {
|
||||
doubleByteCharCount++;
|
||||
|
@ -85,8 +71,160 @@ class CharsetRecog_mbcs extends CharsetRecognizer {
|
|||
if (confidence > 100) {
|
||||
confidence = 100;
|
||||
}
|
||||
|
||||
|
||||
return confidence;
|
||||
}
|
||||
|
||||
// "Character" iterator class & interface
|
||||
// Recognizers for specific mbcs encodings make their "characters" available
|
||||
// by subclassing this iterator class. The returned characters are not converted
|
||||
// to Unicode - they are still values that are specific to the encoding - but
|
||||
// multi-byte sequences are combined to form single int values.
|
||||
//
|
||||
abstract static class iteratedChar {
|
||||
int charValue = 0; // The char value is a value from the encoding.
|
||||
// It's meaning is not well defined, other than
|
||||
// different encodings
|
||||
int index = 0;
|
||||
int nextIndex = 0;
|
||||
boolean error = false;
|
||||
boolean done = false;
|
||||
|
||||
void reset() {
|
||||
charValue = 0;
|
||||
index = -1;
|
||||
nextIndex = 0;
|
||||
error = false;
|
||||
done = false;
|
||||
}
|
||||
|
||||
int nextByte(CharsetDetector det) {
|
||||
if (nextIndex >= det.fInputLen) {
|
||||
done = true;
|
||||
return -1;
|
||||
}
|
||||
int byteValue = (int)det.fInputBytes[nextIndex++] & 0x00ff;
|
||||
return byteValue;
|
||||
}
|
||||
}
|
||||
|
||||
abstract boolean nextChar(iteratedChar it, CharsetDetector det);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Shift-JIS charset recognizer.
|
||||
*
|
||||
*/
|
||||
static class CharsetRecog_sjis extends CharsetRecog_mbcs {
|
||||
|
||||
boolean nextChar(iteratedChar it, CharsetDetector det) {
|
||||
it.index = it.nextIndex;
|
||||
it.error = false;
|
||||
int firstByte;
|
||||
firstByte = it.charValue = it.nextByte(det);
|
||||
if (firstByte < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int secondByte = it.nextByte(det);
|
||||
if (secondByte < 0) {
|
||||
return false;
|
||||
}
|
||||
it.charValue = firstByte << 8 + secondByte;
|
||||
if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
|
||||
// Illegal second byte value.
|
||||
it.error = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int match(CharsetDetector det) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
String getName() {
|
||||
return "SHIFT_JIS";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* EUC charset recognizer.
|
||||
*
|
||||
*/
|
||||
abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* Get the next character value for EUC based encodings.
|
||||
* Character "value" is simply the raw bytes that make up the character
|
||||
* packed into an int.
|
||||
*/
|
||||
boolean nextChar(iteratedChar it, CharsetDetector det) {
|
||||
it.index = it.nextIndex;
|
||||
it.error = false;
|
||||
int firstByte = 0;
|
||||
int secondByte = 0;
|
||||
int thirdByte = 0;
|
||||
int fourthByte = 0;
|
||||
|
||||
buildChar: {
|
||||
firstByte = it.charValue = it.nextByte(det);
|
||||
if (firstByte < 0) {
|
||||
// Ran off the end of the input data
|
||||
it.done = true;
|
||||
break buildChar;
|
||||
}
|
||||
if (firstByte <= 0x8d) {
|
||||
// single byte char
|
||||
break buildChar;
|
||||
}
|
||||
|
||||
secondByte = it.nextByte(det);
|
||||
it.charValue = (it.charValue << 8) | secondByte;
|
||||
|
||||
if (firstByte >= 0xA1 && firstByte <= 0xfe) {
|
||||
// Two byte Char
|
||||
if (secondByte < 0xa1) {
|
||||
it.error = true;
|
||||
}
|
||||
break buildChar;
|
||||
}
|
||||
if (firstByte == 0x8e) {
|
||||
// Code Set 2.
|
||||
// In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
|
||||
// In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
|
||||
// We don't know which we've got.
|
||||
// Treat it like EUC-JP. If the data really was EUC-TW, the following two
|
||||
// bytes will look like a well formed 2 byte char.
|
||||
if (secondByte < 0xa1) {
|
||||
it.error = true;
|
||||
}
|
||||
break buildChar;
|
||||
}
|
||||
|
||||
if (firstByte == 0x8f) {
|
||||
// Code set 3.
|
||||
// Three byte total char size, two bytes of actual char value.
|
||||
thirdByte = it.nextByte(det);
|
||||
it.charValue = (it.charValue << 8) | thirdByte;
|
||||
if (thirdByte < 0xa1) {
|
||||
it.error = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (it.done == false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue