mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-124 charset detector, work in progress.
X-SVN-Rev: 17554
This commit is contained in:
parent
f2d4d6fa9e
commit
2852ded666
3 changed files with 68 additions and 35 deletions
|
@ -105,6 +105,8 @@ public class CharsetDetector {
|
|||
// in which case we can't touch it.
|
||||
fRawLength = fInputStream.read(fRawInput);
|
||||
fInputStream.reset();
|
||||
|
||||
MungeInput(); // Strip html markup, collect byte stats.
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -292,14 +294,14 @@ public class CharsetDetector {
|
|||
* The following items are accessed by individual CharsetRecognizers during
|
||||
* the recognition process
|
||||
*/
|
||||
byte[] fInputBytes = // The text to be checked. Markup will have been
|
||||
new byte[4000];// removed if appropriate.
|
||||
byte[] fInputBytes = // The text to be checked. Markup will have been
|
||||
new byte[4000]; // removed if appropriate.
|
||||
|
||||
int fInputLen; // Length of the byte data in fInputBytes.
|
||||
int fInputLen; // Length of the byte data in fInputBytes.
|
||||
|
||||
short fByteStats[]; // byte frequency statistics for the input text.
|
||||
// Value is percent, not absolute.
|
||||
// Value is rounded up, so zero really means zero occurrences.
|
||||
short fByteStats[] = // byte frequency statistics for the input text.
|
||||
new short[256]; // Value is percent, not absolute.
|
||||
// Value is rounded up, so zero really means zero occurrences.
|
||||
|
||||
String fDeclaredEncoding;
|
||||
|
||||
|
|
|
@ -86,8 +86,13 @@ public class CharsetMatch implements Comparable {
|
|||
return fConfidence;
|
||||
}
|
||||
|
||||
|
||||
static public final int ENCODING_SCHEME = 1;
|
||||
static public final int BOM = 2;
|
||||
static public final int DECLARED_ENCODING = 4;
|
||||
static public final int LANG_STATISTICS = 8;
|
||||
/**
|
||||
* Return an indication of what it was about input data that
|
||||
* Return indications of what it was about input data that
|
||||
* caused this charset to be considered as a possible match.
|
||||
* <p>
|
||||
* TODO: create a list of enum-like constants for the possible types of matches.
|
||||
|
|
|
@ -83,7 +83,19 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
|
|||
if (confidence > 100) {
|
||||
confidence = 100;
|
||||
}
|
||||
|
||||
if (commonChars != null && doubleByteCharCount > 0) {
|
||||
//
|
||||
int commonCharPercentage = commonCharCount*100 / doubleByteCharCount;
|
||||
if (commonCharPercentage < 10) {
|
||||
confidence = 0;
|
||||
} else if (commonCharPercentage < 20) {
|
||||
confidence = (confidence * (commonCharPercentage-10)) / 10;
|
||||
} else {
|
||||
// Percent of frequently occurring chars is >= 20
|
||||
// Let the initial confidence, based solely on the encoding scheme match, stand.
|
||||
}
|
||||
}
|
||||
|
||||
return confidence;
|
||||
}
|
||||
|
||||
|
@ -94,6 +106,10 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
|
|||
// The returned characters are not converted to Unicode, but remain as the raw
|
||||
// bytes (concatenated into an int) from the codepage data.
|
||||
//
|
||||
// For Asian charsets, use the raw input rather than the input that has been
|
||||
// stripped of markup. Detection only considers multi-byte chars, effectively
|
||||
// stripping markup anyway, and double byte chars do occur in markup too.
|
||||
//
|
||||
static class iteratedChar {
|
||||
int charValue = 0; // 1-4 bytes from the raw input data
|
||||
int index = 0;
|
||||
|
@ -110,11 +126,11 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
|
|||
}
|
||||
|
||||
int nextByte(CharsetDetector det) {
|
||||
if (nextIndex >= det.fInputLen) {
|
||||
if (nextIndex >= det.fRawLength) {
|
||||
done = true;
|
||||
return -1;
|
||||
}
|
||||
int byteValue = (int)det.fInputBytes[nextIndex++] & 0x00ff;
|
||||
int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
|
||||
return byteValue;
|
||||
}
|
||||
}
|
||||
|
@ -259,9 +275,16 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
|
|||
// TODO: This set of data comes from the character frequency-
|
||||
// of-occurrence analysis tool. The data needs to be moved
|
||||
// into a resource and loaded from there.
|
||||
{0xa4ce, 0xa4c7, 0xa4a4, 0xa4b9, 0xa4b7, 0xa4cb, 0xa1a2, 0xa4c6, 0xa4c8,
|
||||
0xa4de, 0xa4cf, 0xa1bc, 0xa1a3, 0xa4eb, 0xa4f2, 0xa4ca, 0xa4ac, 0xa4bf, 0xa4ec,
|
||||
0xa4a6, 0xa4b3, 0xa4ab, 0xa4e2, 0xa5f3, 0xa5c8, 0xa5b9, 0xa5af, 0xa5a4, 0xa4ea};
|
||||
{0xa4ce, 0xa4c7, 0xa4a4, 0xa1bc, 0xa1a2, 0xa4b7, 0xa4cb, 0xa4b9, 0xa1a3, 0xa4c6,
|
||||
0xa4c8, 0xa4cf, 0xa4de, 0xa4f2, 0xa4eb, 0xa4ca, 0xa4ac, 0xa5f3, 0xa4bf, 0xa5b9,
|
||||
0xa4ec, 0xa5a4, 0xa4a6, 0xa4ab, 0xa5c8, 0xa4b3, 0xa1a6, 0xa4e2, 0xa5eb, 0xa5af,
|
||||
0xa4ea, 0xa4e9, 0xa1a1, 0xa5c3, 0xa5e9, 0xa4c3, 0xa5ea, 0xa4ad, 0xa5d7, 0xa4b5,
|
||||
0xa4f3, 0xa4a2, 0xa5c9, 0xc6fc, 0xa1d6, 0xa1d7, 0xa5bf, 0xa4e8, 0xa5b8, 0xa4af,
|
||||
0xa5e1, 0xa4a8, 0xa4bb, 0xa4bd, 0xa4c0, 0xa5a2, 0xa5d5, 0xa4b1, 0xbfb7, 0xa4aa,
|
||||
0xa4c4, 0xa5b5, 0xbbc8, 0xa5d6, 0xa4c9, 0xcaf3, 0xa5b7, 0xcbdc, 0xc4ea, 0xa5a6,
|
||||
0xa4d0, 0xa5e5, 0xcdd1, 0xa4e1, 0xa4df, 0xa5d0, 0xa5a3, 0xb8a9, 0xa5b3, 0xa5de,
|
||||
0xa5ed, 0xa5a7, 0xa5b0, 0xa5e0, 0xa4ef, 0xb9d4, 0xa5aa, 0xa5c6, 0xbef0, 0xcab8,
|
||||
0xa1ca, 0xa1cb, 0xa5cb, 0xbaee, 0xa4c1, 0xa5ad, 0xa5c7, 0xa4e4, 0xa5ec, 0xc7bd};
|
||||
|
||||
String getName() {
|
||||
return "EUC_JP";
|
||||
|
@ -283,13 +306,16 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
|
|||
// TODO: This set of data comes from the character frequency-
|
||||
// of-occurrence analysis tool. The data needs to be moved
|
||||
// into a resource and loaded from there.
|
||||
{0xc0cc, 0xbbe7, 0xc0c7, 0xb1e2, 0xb4eb, 0xbdba, 0xc1f6, 0xbab8, 0xc1a4,
|
||||
0xbdc3, 0xc7d1, 0xb4d9, 0xbfa1, 0xb4c2, 0xb0a1, 0xc0da, 0xc7cf, 0xbcad, 0xb8ae,
|
||||
0xc0bb, 0xb0ed, 0xb7ce, 0xc1a6, 0xc0ce, 0xc8b8, 0xbff8, 0xb1b9, 0xbace, 0xb5b5,
|
||||
0xc0fc, 0xbec6, 0xbfa9, 0xc0cf, 0xb0f8, 0xb5bf, 0xb1b8, 0xbfac, 0xc0fb, 0xbaf1,
|
||||
0xb1b3, 0xc0a7, 0xc7d8, 0xc7d0, 0xb0fa, 0xc8ad, 0xbcd2, 0xbcf6, 0xbbf3, 0xc0ba,
|
||||
0xc0b0, 0xbeee, 0xc1d6, 0xb9ae, 0xc0e5, 0xbfeb, 0xb8a6, 0xbcba, 0xc6ae, 0xc0db,
|
||||
0xb0e8, 0xc0d6};
|
||||
{0xc0cc, 0xb4d9, 0xb4c2, 0xc0c7, 0xbfa1, 0xc7cf, 0xb0a1, 0xb0ed, 0xc7d1, 0xc1f6,
|
||||
0xc0bb, 0xb7ce, 0xb1e2, 0xbcad, 0xc0ba, 0xbbe7, 0xc1a4, 0xc0da, 0xb5b5, 0xb8a6,
|
||||
0xbeee, 0xb4cf, 0xbcf6, 0xbdc3, 0xb1d7, 0xb4eb, 0xb8ae, 0xc0ce, 0xb3aa, 0xbec6,
|
||||
0xc0d6, 0xbab8, 0xb5e9, 0xb6f3, 0xc7d8, 0xb0cd, 0xc0cf, 0xbdba, 0xc0b8, 0xb1b9,
|
||||
0xc1a6, 0xb9fd, 0xbbf3, 0xb0d4, 0xb8e9, 0xb8b8, 0xb0fa, 0xc0fb, 0xbace, 0xc1d6,
|
||||
0xbfa9, 0xc0fc, 0xbfeb, 0xb9ae, 0xc6ae, 0xbbfd, 0xbcba, 0xc0a7, 0xbff8, 0xb5c7,
|
||||
0xbfe4, 0xbfec, 0xbdc5, 0xc7d2, 0xc7e5, 0xb0fc, 0xb1b8, 0xbaf1, 0xbedf, 0xc5cd,
|
||||
0xb8b6, 0xbdc0, 0xb7af, 0xb5bf, 0xb3bb, 0xc8ad, 0xc0bd, 0xb0b3, 0xc4a1, 0xb7c2,
|
||||
0xb9ab, 0xc0af, 0xbef8, 0xb5a5, 0xbcd2, 0xb9ce, 0xc1df, 0xbfc0, 0xc1f8, 0xb0e6,
|
||||
0xb1c7, 0xbad0, 0xbefa, 0xc0e5, 0xbec8, 0xc1b6, 0xb8bb, 0xb0f8, 0xb9cc, 0xb0c5};
|
||||
|
||||
String getName() {
|
||||
return "EUC_KR";
|
||||
|
@ -310,21 +336,21 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
|
|||
// TODO: This set of data comes from the character frequency-
|
||||
// of-occurrence analysis tool. The data needs to be moved
|
||||
// into a resource and loaded from there.
|
||||
{0xb5c4, 0xd6d0, 0xa1a4, 0xa1a1, 0xa3ac, 0xcce5, 0xcec4, 0xd1a7, 0xcdf8,
|
||||
0xb9fa, 0xcbce, 0xc8cb, 0xd3c3, 0xa1a3, 0xd2bb, 0xa3ba, 0xb4f3, 0xbbe1, 0xd0c2,
|
||||
0xa1a2, 0xd4da, 0xb1a8, 0xb0a9, 0xb7a2, 0xc9cf, 0xd3d0, 0xc9fa, 0xc2db, 0xb1b1,
|
||||
0xbcfe, 0xc8d5, 0xcab1, 0xbfc9, 0xc7f8, 0xbdcc, 0xbea9, 0xb2bb, 0xb7d6, 0xd2d4,
|
||||
0xc4ea, 0xd2b3, 0xcfc2, 0xbacd, 0xd7d6, 0xbde1, 0xd0c5, 0xd3fd, 0xc3f1, 0xb8df,
|
||||
0xd1d0, 0xbfbc, 0xcac7, 0xbcd2, 0xb3c9, 0xd7d4, 0xceaa, 0xc8eb, 0xd0c4, 0xbfc6,
|
||||
0xd7a8, 0xbfaa, 0xcfa2, 0xbbaf, 0xb8e6, 0xcfdf, 0xd7ca, 0xb6af, 0xb7a8, 0xcaf8,
|
||||
0xd2bd, 0xd0d0, 0xa1b0, 0xcad0, 0xa1b1, 0xb1be, 0xb7bd, 0xb2e9, 0xcad4, 0xced2,
|
||||
0xb6e0, 0xb1ed, 0xd5be, 0xc4da, 0xd7f7, 0xd2aa, 0xb8f6, 0xbbaa, 0xc9e7, 0xbead,
|
||||
0xd5df, 0xc3e6, 0xbbfa, 0xbebf, 0xd2a9, 0xb5bd, 0xb3f6, 0xc0ed, 0xb5e3, 0xcab9,
|
||||
0xbcd3, 0xc6da, 0xb0b8, 0xd7d3, 0xbac3, 0xb9d8, 0xcec5, 0xc3fb, 0xd5b9, 0xb2bf,
|
||||
0xb9ab, 0xc1cb, 0xd6ce, 0xb9a4, 0xccec, 0xb9e3, 0xb5d8, 0xd4c2, 0xc7eb, 0xbcbc,
|
||||
0xb0e6, 0xb5c0, 0xc4dc, 0xd4ba, 0xd3eb, 0xb6a8, 0xb5e7, 0xcef1, 0xcce2, 0xcff2,
|
||||
0xbaf3, 0xd3d1, 0xc1ac, 0xcae4, 0xcfb5, 0xcae9, 0xd7a2, 0xbdab, 0xd6f7, 0xc8ab,
|
||||
0xc2eb, 0xbdf0, 0xb6d4, 0xccd8, 0xcee5, 0xceca, 0xc0b4, 0xd2b5, 0xcabe};
|
||||
{0xa3ac, 0xb5c4, 0xa1a1, 0xa1a4, 0xa1a3, 0xcac7, 0xd2bb, 0xb4f3, 0xd4da, 0xd6d0,
|
||||
0xcafd, 0xd3d0, 0xa1f3, 0xb2bb, 0xa3ba, 0xbbfa, 0xc8cb, 0xa1a2, 0xd3c3, 0xd1a7,
|
||||
0xc8d5, 0xbedd, 0xb8f6, 0xd0c2, 0xcdf8, 0xd2aa, 0xb9fa, 0xc1cb, 0xc9cf, 0xa1b0,
|
||||
0xa1b1, 0xced2, 0xbcfe, 0xcec4, 0xd2d4, 0xc4dc, 0xc0b4, 0xd4c2, 0xcab1, 0xd0d0,
|
||||
0xbdcc, 0xbfc9, 0xb6d4, 0xbcdb, 0xb1be, 0xb3f6, 0xb8b4, 0xc9fa, 0xb1b8, 0xbcbc,
|
||||
0xcfc2, 0xbacd, 0xbecd, 0xb3c9, 0xd5e2, 0xb8df, 0xb7d6, 0xc5cc, 0xbfc6, 0xbbe1,
|
||||
0xceaa, 0xc8e7, 0xcfb5, 0xa1f1, 0xc4ea, 0xb1a8, 0xb6af, 0xc0ed, 0xd3fd, 0xb7a2,
|
||||
0xc8ab, 0xb7bd, 0xcee5, 0xc2db, 0xbba7, 0xd0d4, 0xb9c9, 0xc3c7, 0xb9fd, 0xcad0,
|
||||
0xb5e3, 0xbbd6, 0xcfd6, 0xcab5, 0xd2b2, 0xbfb4, 0xb6e0, 0xccec, 0xc7f8, 0xd0c5,
|
||||
0xcad6, 0xb9d8, 0xb5bd, 0xb7dd, 0xc6f7, 0xcaf5, 0xa3a1, 0xb7a8, 0xb9ab, 0xd2b5,
|
||||
0xcbf9, 0xcdbc, 0xc6e4, 0xd3da, 0xd0a1, 0xd1a1, 0xd3ce, 0xbfaa, 0xb4e6, 0xc4bf,
|
||||
0xd7f7, 0xb5e7, 0xcdb3, 0xc7e9, 0xd7ee, 0xc6c0, 0xcfdf, 0xb5d8, 0xb5c0, 0xbead,
|
||||
0xb4c5, 0xc6b7, 0xc4da, 0xd0c4, 0xb9a4, 0xd4aa, 0xc2bc, 0xc3c0, 0xbaf3, 0xcabd,
|
||||
0xbcd2, 0xcef1, 0xbdab, 0xa3ad, 0xa3bf, 0xb3a4, 0xb9fb, 0xd6ae, 0xc1bf, 0xbbd8,
|
||||
0xb8f1, 0xb6f8, 0xb6a8, 0xcde2, 0xbac3, 0xb3cc, 0xccd8, 0xd7d4, 0xcbb5};
|
||||
|
||||
String getName() {
|
||||
return "EUC_CN";
|
||||
|
|
Loading…
Add table
Reference in a new issue