ICU-4060 Add cleaned-up statistic tool.
X-SVN-Rev: 17820
This commit is contained in:
parent 65a2884e40
commit 7007286fe5

5 changed files with 1100 additions and 0 deletions
188  icu4j/src/com/ibm/icu/dev/tool/charsetdet/sbcs/Checker.java  Normal file

@@ -0,0 +1,188 @@
/*
 ***********************************************************************
 * Copyright (C) 2005, International Business Machines Corporation and *
 * others. All Rights Reserved.                                        *
 ***********************************************************************
 *
 */

package com.ibm.icu.dev.tool.charsetdet.sbcs;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

/**
 * @author emader
 *
 * TODO To change the template for this generated type comment go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
public class Checker implements NGramParser.NGramParserClient
{
    private NGramList ngrams;
    private int totalNGrams;
    private int totalHits;

    private String language;
    private String encoding;

    private int[] histogram;

    private static final int BUFFER_SIZE = 1024;

    private char[] buffer;
    private int bufIndex;
    private int bufMax;

    private NGramParser parser;

    /**
     * TODO This should take cumulative percent and the name...
     */
    public Checker(NGramList list, InputFile dataFile)
    {
        ngrams = list;
        ngrams.setMapper(dataFile);

        language = languageName(dataFile.getFilename());
        encoding = dataFile.getEncoding();

        buffer = new char[BUFFER_SIZE];
        parser = new NGramParser(this);
        resetCounts();

        histogram = new int[100];
        resetHistogram();
    }

    public void handleNGram(String key)
    {
        NGramList.NGram ngram = ngrams.get(key);

        totalNGrams += 1;

        if (ngram != null) {
            totalHits += 1;
            //ngram.incrementRefCount();
        }
    }

    private void resetCounts()
    {
        bufIndex = 0;
        totalNGrams = totalHits = 0;
    }

    private void resetHistogram()
    {
        for(int i = 0; i < 100; i += 1) {
            histogram[i] = 0;
        }

    }

    private static void exceptionError(Exception e)
    {
        System.err.println("ioError: " + e.toString());
    }

    private static String languageName(String filename)
    {
        return filename.substring(0, filename.indexOf('.'));
    }

    private boolean nextBuffer(InputFile inputFile)
    {
        try {
            bufMax = inputFile.read(buffer);
        } catch (Exception e) {
            bufMax = -1;
            exceptionError(e);

            return false;
        }

        bufIndex = 0;

        return bufMax >= 0;
    }

    private void parseBuffer()
    {
        resetCounts();
        parser.reset();
        parser.parse();
    }

    public char nextChar()
    {
        if (bufIndex >= bufMax) {
            return 0;
        }

        return buffer[bufIndex++];
    }

    public String getLanguage()
    {
        return language;
    }

    public void setMapper(InputFile file)
    {
        ngrams.setMapper(file);
    }

    public int checkBuffer(char[] theBuffer, int charCount)
    {
        buffer = theBuffer;
        bufMax = charCount;

        parseBuffer();

        return totalHits;
    }

    public void check(InputFile dataFile)
    {
        int minHist = 101, maxHist = -1;

        dataFile.open();

        String dataFilename = dataFile.getFilename();
        String fileEncoding = dataFile.getEncoding();

        System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:");

        setMapper(dataFile);
        resetHistogram();

        while (nextBuffer(dataFile)) {
            parseBuffer();

            double percentHits = (double) totalHits / totalNGrams * 100.0;
            int ph = (int) percentHits;

            if (ph < minHist) {
                minHist = ph;
            }

            if (ph > maxHist) {
                maxHist = ph;
            }

            histogram[ph] += 1;
        }

        for(int ph = minHist; ph <= maxHist; ph += 1) {
            System.out.println(ph + "\t" + histogram[ph]);
        }

        System.out.println();

        dataFile.close();

        return;
    }
}
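
The score that check() accumulates above is simply the share of parsed trigrams that were found in the reference list, truncated to a whole-percent histogram bucket. A standalone sketch of that arithmetic (the class name and the sample counts are illustrative, not part of the commit):

public class ScoreDemo {
    public static void main(String[] args) {
        int totalHits = 103;    // trigrams that were found in the reference NGramList
        int totalNGrams = 412;  // trigrams parsed from one buffer
        double percentHits = (double) totalHits / totalNGrams * 100.0;  // 25.0
        int ph = (int) percentHits;                                     // bucket 25
        System.out.println(ph + "%");  // Checker.check() would do histogram[ph] += 1
    }
}
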
175  icu4j/src/com/ibm/icu/dev/tool/charsetdet/sbcs/InputFile.java  Normal file

@@ -0,0 +1,175 @@
/*
 ***********************************************************************
 * Copyright (C) 2005, International Business Machines Corporation and *
 * others. All Rights Reserved.                                        *
 ***********************************************************************
 *
 */

package com.ibm.icu.dev.tool.charsetdet.sbcs;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;

/**
 * @author emader
 *
 * TODO To change the template for this generated type comment go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
public class InputFile implements NGramList.NGramKeyMapper
{

    private File file;
    private FileInputStream fileStream;
    private InputStreamReader inputStream;

    private Charset charset;
    private CharsetDecoder decoder;
    private CharsetEncoder encoder;

    private boolean visualOrder;

    private static void exceptionError(Exception e)
    {
        System.err.println("ioError: " + e.toString());
    }

    /**
     *
     */
    public InputFile(String filename, String encoding, boolean visual)
    {
        file = new File(filename);
        setEncoding(encoding);
        visualOrder = visual;
    }

    public boolean open()
    {
        try {
            fileStream = new FileInputStream(file);
            inputStream = new InputStreamReader(fileStream, "UTF8");
        } catch (Exception e) {
            exceptionError(e);
            return false;
        }

        return true;
    }

    public void close()
    {
        try {
            inputStream.close();
            fileStream.close();
        } catch (Exception e) {
            // don't really care if this fails...
        }
    }

    public String getFilename()
    {
        return file.getName();
    }

    public String getParent()
    {
        return file.getParent();
    }

    public String getPath()
    {
        return file.getPath();
    }

    public int read(char[] buffer)
    {
        int charsRead = -1;

        try {
            charsRead = inputStream.read(buffer, 0, buffer.length);
        } catch (Exception e) {
            exceptionError(e);
        }

        return charsRead;
    }

    public void setEncoding(String encoding)
    {
        charset = Charset.forName(encoding);
        decoder = charset.newDecoder();
        encoder = charset.newEncoder();

        encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        encoder.onMalformedInput(CodingErrorAction.REPLACE);

        decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        decoder.onMalformedInput(CodingErrorAction.REPLACE);
    }

    public String getEncoding()
    {
        return charset.displayName();
    }

    public boolean getVisualOrder()
    {
        return visualOrder;
    }

    public Object mapKey(String key)
    {
        byte[] bytes = encode(key.toCharArray());
        int length = key.length();
        int value = 0;

        for(int b = 0; b < length; b += 1) {
            value <<= 8;
            value += (bytes[b] & 0xFF);
        }

        return new Integer(value);
    }

    public byte[] encode(char[] chars)
    {
        int length = chars.length;
        CharBuffer cb = CharBuffer.wrap(chars);
        ByteBuffer bb;

        try {
            bb = encoder.encode(cb);
        } catch (CharacterCodingException e) {
            // don't expect to get any exceptions in normal usage...
            return null;
        }

        return bb.array();
    }

    public char[] decode(byte[] bytes)
    {
        int length = bytes.length;
        ByteBuffer bb = ByteBuffer.wrap(bytes);
        CharBuffer cb;

        try {
            cb = decoder.decode(bb);
        } catch (CharacterCodingException e) {
            // don't expect to get any exceptions in normal usage...
            return null;
        }

        return cb.array();
    }
}
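
mapKey() above packs the single-byte encoding of a trigram into one int, one byte per character, so lookups can use a plain Integer key. A self-contained sketch of the same packing (the class name and the sample trigram are illustrative only):

import java.nio.charset.Charset;

public class ScratchMapKey {
    public static void main(String[] args) {
        byte[] bytes = "abc".getBytes(Charset.forName("ISO-8859-1"));
        int value = 0;
        for (int b = 0; b < bytes.length; b += 1) {
            value <<= 8;
            value += (bytes[b] & 0xFF);
        }
        // 'a' = 0x61, 'b' = 0x62, 'c' = 0x63, so the packed key is 0x616263
        System.out.println(Integer.toHexString(value));
    }
}
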
144  icu4j/src/com/ibm/icu/dev/tool/charsetdet/sbcs/NGramList.java  Normal file

@@ -0,0 +1,144 @@
/*
 ***********************************************************************
 * Copyright (C) 2005, International Business Machines Corporation and *
 * others. All Rights Reserved.                                        *
 ***********************************************************************
 *
 */

package com.ibm.icu.dev.tool.charsetdet.sbcs;

import java.util.Collection;
import java.util.TreeMap;

import com.ibm.icu.text.UnicodeSet;

/**
 * @author emader
 *
 * TODO To change the template for this generated type comment go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
public class NGramList
{
    public interface NGramKeyMapper
    {
        Object mapKey(String key);
    }

    public static final class NGram implements Comparable
    {
        private String value;
        private int refCount;

        public NGram(String theValue, int theRefCount)
        {
            value = theValue;
            refCount = theRefCount;
        }

        public NGram(String theValue)
        {
            this(theValue, 1);
        }

        public NGram(NGram other)
        {
            this(other.getValue(), other.getRefCount());
        }

        public final String getValue()
        {
            return value;
        }

        public final int getRefCount()
        {
            return refCount;
        }

        public final void incrementRefCount()
        {
            refCount += 1;
        }

        // Note: This makes higher refCounts come *before* lower refCounts...
        public int compareTo(Object o)
        {
            NGram ng = (NGram) o;

            return ng.getRefCount() - refCount;
        }
    }

    protected TreeMap ngrams;
    protected int totalNGrams;
    protected int uniqueNGrams;

    protected final int N_GRAM_SIZE = 3;

    private NGramKeyMapper keyMapper;

    /**
     *
     */
    public NGramList(NGramKeyMapper theMapper)
    {
        keyMapper = theMapper;

        ngrams = new TreeMap();
        totalNGrams = uniqueNGrams = 0;
    }

    public void setMapper(NGramKeyMapper nGramKeyMapper)
    {
        keyMapper = nGramKeyMapper;
    }

    public NGram get(Object mappedKey)
    {
        return (NGram) ngrams.get(mappedKey);
    }

    public NGram get(String key)
    {
        Object mappedKey = keyMapper.mapKey(key);

        return get(mappedKey);
    }

    public void put(String key)
    {
        Object mappedKey = keyMapper.mapKey(key);
        NGram ngram = get(mappedKey);

        totalNGrams += 1;

        if (ngram == null) {
            uniqueNGrams += 1;
            ngrams.put(mappedKey, new NGram(key));
        } else {
            ngram.incrementRefCount();
        }
    }

    public Collection values()
    {
        return ngrams.values();
    }

    public Collection keys()
    {
        return ngrams.keySet();
    }

    public int getTotalNGrams()
    {
        return totalNGrams;
    }

    public int getUniqueNGrams()
    {
        return uniqueNGrams;
    }
}
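
A hypothetical usage sketch of this list, assuming it is compiled alongside NGramList in the same package and uses an identity key mapper like the one StatisticsTool supplies: put() counts repeats through incrementRefCount(), and sorting the values uses the descending-refCount order defined by NGram.compareTo(). The class name and sample trigrams are illustrative only.

import java.util.ArrayList;
import java.util.Collections;

public class NGramListDemo {
    public static void main(String[] args) {
        // identity key mapper, the same mapping StatisticsTool.mapKey() uses
        NGramList list = new NGramList(new NGramList.NGramKeyMapper() {
            public Object mapKey(String key) {
                return key;
            }
        });

        list.put("el ");
        list.put("el ");
        list.put("la ");
        System.out.println(list.getUniqueNGrams() + "/" + list.getTotalNGrams()); // 2/3

        ArrayList sorted = new ArrayList(list.values());
        Collections.sort(sorted);  // "el " (refCount 2) sorts ahead of "la " (refCount 1)
        NGramList.NGram first = (NGramList.NGram) sorted.get(0);
        System.out.println(first.getValue() + " x" + first.getRefCount());
    }
}
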
161  icu4j/src/com/ibm/icu/dev/tool/charsetdet/sbcs/NGramParser.java  Normal file

@@ -0,0 +1,161 @@
/*
 ***********************************************************************
 * Copyright (C) 2005, International Business Machines Corporation and *
 * others. All Rights Reserved.                                        *
 ***********************************************************************
 *
 */

package com.ibm.icu.dev.tool.charsetdet.sbcs;

import com.ibm.icu.text.UnicodeSet;

/**
 * @author emader
 *
 * TODO To change the template for this generated type comment go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
public class NGramParser
{

    public interface NGramParserClient
    {
        char nextChar();
        void handleNGram(String key);
    }

    private static final int A_NULL = 0;
    private static final int A_ADDC = 1;
    private static final int A_ADDS = 2;

    /*
     * Character classes
     */
    public static final int C_IGNORE = 0;
    public static final int C_LETTER = 1;
    public static final int C_PUNCT = 2;

    private static final int S_START = 0;
    private static final int S_LETTER = 1;
    private static final int S_PUNCT = 2;

    static final class StateEntry
    {
        private int newState;
        private int action;

        StateEntry(int theState, int theAction)
        {
            newState = theState;
            action = theAction;
        }

        public int getNewState()
        {
            return newState;
        }

        public int getAction()
        {
            return action;
        }
    }

    private StateEntry[][] stateTable = {
        {new StateEntry(S_START, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)},
        {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)},
        {new StateEntry(S_PUNCT, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_NULL)}
    };

    protected final int N_GRAM_SIZE = 3;

    private char[] letters = new char[N_GRAM_SIZE];
    private int letterCount;

    private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");

    private NGramParserClient client;

    /**
     *
     */
    public NGramParser(NGramParserClient theClient)
    {
        client = theClient;
        letterCount = 0;
    }

    public void setClient(NGramParserClient theClient)
    {
        client = theClient;
    }

    // TODO Is this good enough, or are there other C_IGNORE characters?
    // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
    public static int getCharClass(char ch)
    {
        if (ch == '\'' || ch == '\uFEFF') {
            return C_IGNORE;
        }

        if (letterSet.contains(ch)) {
            return C_LETTER;
        }

        return C_PUNCT;
    }

    public void reset()
    {
        letterCount = 0;
    }

    public void addLetter(char letter)
    {
        // somewhat clever stuff goes here...
        letters[letterCount++] = letter;

        if (letterCount >= N_GRAM_SIZE) {
            String key = new String(letters);

            client.handleNGram(key);

            letterCount = N_GRAM_SIZE - 1;
            for (int i = 0; i < letterCount; i += 1) {
                letters[i] = letters[i + 1];
            }
        }
    }

    public void parse()
    {
        char ch;
        int state = 0;

        // this is where the clever stuff goes...
        while ((ch = client.nextChar()) != 0) {
            int charClass = getCharClass(ch);
            StateEntry entry = stateTable[state][charClass];

            state = entry.getNewState();

            switch (entry.getAction())
            {
            case A_ADDC:
                addLetter(Character.toLowerCase(ch));
                break;

            case A_ADDS:
                addLetter(' ');
                break;

            case A_NULL:
            default:
                break;
            }
        }

        addLetter(' ');
    }
}
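
The parser only sees the outside world through NGramParserClient, so a throwaway client is enough to watch the trigrams it emits. A sketch assuming it sits in the same package as NGramParser; the class name and sample text are illustrative:

public class PrintingClient implements NGramParser.NGramParserClient {
    private String text = "Hola, mundo";
    private int index = 0;

    public char nextChar() {
        // returning 0 at end of input is how parse() detects EOF
        return index < text.length() ? text.charAt(index++) : 0;
    }

    public void handleNGram(String key) {
        System.out.println("[" + key + "]");  // e.g. [hol], [ola], [la ], [a m], ...
    }

    public static void main(String[] args) {
        new NGramParser(new PrintingClient()).parse();
    }
}
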
432  icu4j/src/com/ibm/icu/dev/tool/charsetdet/sbcs/StatisticsTool.java  Normal file

@@ -0,0 +1,432 @@
/*
 ***********************************************************************
 * Copyright (C) 2005, International Business Machines Corporation and *
 * others. All Rights Reserved.                                        *
 ***********************************************************************
 *
 */

package com.ibm.icu.dev.tool.charsetdet.sbcs;


import java.io.*;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import com.ibm.icu.impl.Utility;

/**
 * @author emader
 *
 * TODO To change the template for this generated type comment go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
public class StatisticsTool implements NGramParser.NGramParserClient, NGramList.NGramKeyMapper
{
    /* TODO Make this usage string more sane. */
    private static final String usageString =
        "\nUsage: StatisticsTool [OPTIONS] [FILES]\n\n" +
        "This program will read in a Unicode text file of text in a particular language\n" +
        "and compute the statistics needed to detect that language and character set.\n" +
        "Options:\n" +
        "-e specify the target encoding\n" +
        "-h or -? print this usage text.\n" +
        "-v also generate statistics for visual order.\n" +
        "-l only generate statistics for logical order (cancel -v).\n" +
        "-c run the checker.\n" +
        "-t run the encoding test.\n" +
        "example: com.ibm.icu.dev.tool.charset.StatisticsTool -e 8859-1 Spanish.txt";

    private static final int BUFFER_SIZE = 1024;

    private char[] buffer;
    private int bufIndex;
    private int bufMax;

    private InputFile inputFile;

    private NGramList ngrams;

    private static byte[] allBytes = {
        (byte) 0x00, (byte) 0x01, (byte) 0x02, (byte) 0x03, (byte) 0x04, (byte) 0x05, (byte) 0x06, (byte) 0x07,
        (byte) 0x08, (byte) 0x09, (byte) 0x0A, (byte) 0x0B, (byte) 0x0C, (byte) 0x0D, (byte) 0x0E, (byte) 0x0F,
        (byte) 0x10, (byte) 0x11, (byte) 0x12, (byte) 0x13, (byte) 0x14, (byte) 0x15, (byte) 0x16, (byte) 0x17,
        (byte) 0x18, (byte) 0x19, (byte) 0x1A, (byte) 0x1B, (byte) 0x1C, (byte) 0x1D, (byte) 0x1E, (byte) 0x1F,
        (byte) 0x20, (byte) 0x21, (byte) 0x22, (byte) 0x23, (byte) 0x24, (byte) 0x25, (byte) 0x26, (byte) 0x27,
        (byte) 0x28, (byte) 0x29, (byte) 0x2A, (byte) 0x2B, (byte) 0x2C, (byte) 0x2D, (byte) 0x2E, (byte) 0x2F,
        (byte) 0x30, (byte) 0x31, (byte) 0x32, (byte) 0x33, (byte) 0x34, (byte) 0x35, (byte) 0x36, (byte) 0x37,
        (byte) 0x38, (byte) 0x39, (byte) 0x3A, (byte) 0x3B, (byte) 0x3C, (byte) 0x3D, (byte) 0x3E, (byte) 0x3F,
        (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47,
        (byte) 0x48, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F,
        (byte) 0x50, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57,
        (byte) 0x58, (byte) 0x59, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F,
        (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
        (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
        (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
        (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte) 0x7D, (byte) 0x7E, (byte) 0x7F,
        (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87,
        (byte) 0x88, (byte) 0x89, (byte) 0x8A, (byte) 0x8B, (byte) 0x8C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F,
        (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97,
        (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9B, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
        (byte) 0xA0, (byte) 0xA1, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7,
        (byte) 0xA8, (byte) 0xA9, (byte) 0xAA, (byte) 0xAB, (byte) 0xAC, (byte) 0xAD, (byte) 0xAE, (byte) 0xAF,
        (byte) 0xB0, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7,
        (byte) 0xB8, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0xBD, (byte) 0xBE, (byte) 0xBF,
        (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
        (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
        (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
        (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
        (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
        (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
        (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
        (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF
    };

    /**
     *
     */
    public StatisticsTool()
    {
        buffer = new char[BUFFER_SIZE];

        buffer[0] = ' ';
        bufIndex = 0;
        bufMax = 1;
    }

    private static void usage()
    {
        System.out.println(usageString);
    }

    private static void exceptionError(Exception e)
    {
        System.err.println("ioError: " + e.toString());
    }

    private int nextBuffer(InputFile inputFile)
    {
        bufIndex = 0;

        return inputFile.read(buffer);
    }

    public char nextChar()
    {
        if (bufIndex >= bufMax) {
            bufMax = nextBuffer(inputFile);
        }

        if (bufMax < 0) {
            return 0;
        }

        return buffer[bufIndex++];
    }

    public void handleNGram(String key)
    {
        ngrams.put(key);
    }

    public Object mapKey(String key)
    {
        return key;
    }

    private NGramList dumpNGrams()
    {
        String filename = inputFile.getPath();
        int extension = filename.lastIndexOf(".");
        String outputFileName = filename.substring(0, extension) + ".raw" + filename.substring(extension);
        PrintStream output;
        double cumulative = 0;

        try {
            output = new PrintStream(
                new FileOutputStream(outputFileName), true, "UTF8");
        } catch (IOException e) {
            System.out.println("? Could not open " + outputFileName + " for writing.");
            return null;
        }

        System.out.println(inputFile.getFilename() + ": " + ngrams.getUniqueNGrams() + "/" + ngrams.getTotalNGrams());

        ArrayList array = new ArrayList(ngrams.values());

        Collections.sort(array);

        NGramList stats = new NGramList(inputFile);
        int count = 0;
        int totalNGrams = ngrams.getTotalNGrams();

        for (Iterator it = array.iterator(); it.hasNext(); count += 1) {
            NGramList.NGram ngram = (NGramList.NGram) it.next();
            String value = ngram.getValue();
            int refCount = ngram.getRefCount();
            double ratio = (double) refCount / totalNGrams * 100.0;

            cumulative += ratio;

            // TODO check should be count < max && cumulative < maxPercent
            if (count < 64) {
                stats.put(value);
            }

            output.println(value + "\t" + refCount + "\t" + ratio + "%\t" + cumulative + "%");
        }

        output.close();

        return stats;
    }

    private void writeStatistics(ArrayList keyList, boolean visual)
    {
        String filename = inputFile.getPath();
        int extension = filename.lastIndexOf(".");
        String outputFileName = filename.substring(0, extension) + "-" + inputFile.getEncoding() +
            (visual? "-visual.dat" : ".dat");
        PrintStream output;

        try {
            output = new PrintStream(
                new FileOutputStream(outputFileName), true, "ASCII");
        } catch (IOException e) {
            System.out.println("? Could not open " + outputFileName + " for writing.");
            return;
        }

        int i = 0;

        output.print(" private static int[] ngrams = {");

        for (Iterator it = keyList.iterator(); it.hasNext(); i += 1) {
            Integer ngram = (Integer) it.next();

            if (i % 16 == 0) {
                output.print("\n ");
            }

            output.print("0x" + Utility.hex(ngram.intValue(), 6) + ", ");
        }

        output.println("\n };\n");

        /*
         * Generate the byte map
         */
        char[] unicodes = inputFile.decode(allBytes);

        for (int b = 0; b < 256; b += 1) {
            char unicode = unicodes[b];
            int charClass = NGramParser.getCharClass(unicode);

            switch (charClass) {
            case NGramParser.C_LETTER:
                unicodes[b] = Character.toLowerCase(unicode);
                break;

            case NGramParser.C_PUNCT:
                unicodes[b] = ' ';
                break;

            case NGramParser.C_IGNORE:
            default:
                unicodes[b] = '\0';
            }
        }

        byte[] byteMap = inputFile.encode(unicodes);

        output.print(" private static byte[] byteMap = {");

        for (int b = 0; b < 256; b += 1) {
            if (b % 8 == 0) {
                output.print("\n ");
            }

            output.print("(byte) 0x" + Utility.hex(byteMap[b] & 0xFF, 2) + ", ");
        }

        output.println("\n };");
    }

    public NGramList collectStatistics(InputFile file)
    {
        if (!file.open()) {
            return null;
        }

        inputFile = file;

        NGramParser parser = new NGramParser(this);

        ngrams = new NGramList(this);
        parser.parse();

        file.close();

        NGramList stats = dumpNGrams();
        ArrayList statKeys = new ArrayList(stats.keys());

        Collections.sort(statKeys);
        writeStatistics(statKeys, false);

        if (inputFile.getVisualOrder()) {
            ArrayList reversed = new ArrayList(statKeys.size());

            for (Iterator it = statKeys.iterator(); it.hasNext();) {
                Integer key = (Integer) it.next();
                int k = key.intValue();
                int r = 0;

                while (k != 0) {
                    r = (r << 8) | (k & 0xFF);
                    k >>= 8;
                }

                reversed.add(new Integer(r));
            }

            Collections.sort(reversed);
            writeStatistics(reversed, true);
        }

        return stats;
    }

    public static void main(String[] args)
    {
        List list = Arrays.asList(args);
        InputFile[] input_files = new InputFile[args.length];
        int file_count = 0;
        String encoding = null;
        boolean run_checker = false;
        boolean encoding_test = false;
        boolean visual_order = false;

        for (Iterator it = list.iterator(); it.hasNext(); /*anything?*/) {
            String arg = (String) it.next();

            if (arg.equals("-v")) {
                visual_order = true;
            } else if (arg.equals("-l")) {
                visual_order = false;
            } else if (arg.equals("-c")) {
                run_checker = true;
            } else if (arg.equals("-t")) {
                encoding_test = true;
            } else if (arg.equals("-e")) {
                if (it.hasNext()) {
                    encoding = (String) it.next();
                } else {
                    System.err.println("Error: missing encoding.");
                }
            } else if (arg.startsWith("-")) {
                if (! (arg.equals("-h") || arg.equals("-?"))) {
                    System.err.println("Error: unknown option " + arg);
                }

                usage();
            } else {
                input_files[file_count++] = new InputFile(arg, encoding, visual_order);
            }
        }

        if(file_count == 0){
            System.err.println("Error: there are no files to process.");
            usage();
        }

        StatisticsTool tool = new StatisticsTool();
        Checker[] checkers = new Checker[file_count];

        for(int i = 0; i < file_count; i += 1) {
            InputFile file = input_files[i];

            checkers[i] = new Checker(tool.collectStatistics(file), file);
        }

        System.out.println();

        /**
         * Checkers
         */
        if (run_checker) {
            for(int c = 0; c < file_count; c += 1) {
                Checker checker = checkers[c];

                for(int f = 0; f < file_count; f += 1) {
                    checker.check(input_files[f]);
                }
            }

        }

        /*
         * Detection test
         */
        if (encoding_test) {
            char[] buffer = new char[128];

            System.out.println("Detection test");

            for (int f = 0; f < file_count; f += 1) {
                InputFile file = input_files[f];
                int[] histogram = new int[file_count];
                int charCount, misses = 0;

                System.out.println(file.getFilename() + "(" + file.getEncoding() + "):");
                file.open();

                for (int c = 0; c < file_count; c += 1) {
                    checkers[c].setMapper(file);
                }

                // for each buffer
                //     for each checker
                //         call checkBuffer, save score
                //     find highest score, update histogram for that checker
                // show checker histogram

                while ((charCount = file.read(buffer)) > 0) {
                    int[] scores = new int[file_count];
                    int bestFit = -1, maxScore = 0;

                    for (int c = 0; c < file_count; c += 1) {
                        scores[c] = checkers[c].checkBuffer(buffer, charCount);
                    }

                    for (int c = 0; c < file_count; c += 1) {
                        int score = scores[c];

                        if (score > maxScore) {
                            maxScore = score;
                            bestFit = c;
                        }
                    }

                    if (bestFit >= 0) {
                        histogram[bestFit] += 1;
                    } else {
                        misses += 1;
                    }
                }

                for (int c = 0; c < file_count; c += 1) {
                    System.out.println(" " + checkers[c].getLanguage() + ": " + histogram[c]);
                }

                if (misses > 0) {
                    System.out.println(" NONE: " + misses);
                }

                System.out.println();
            }
        }
    }
}
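
Going by the usage string (which names com.ibm.icu.dev.tool.charset.StatisticsTool, although the class actually lives in the charsetdet.sbcs package), a run takes -e with a charset name followed by one or more training files. A hypothetical programmatic invocation; the option mix and the file names are illustrative only:

public class RunStatisticsTool {
    public static void main(String[] args) {
        // collect statistics for two training files, then run the checker (-c)
        StatisticsTool.main(new String[] {
            "-e", "ISO-8859-1", "-c", "Spanish.txt", "French.txt"
        });
    }
}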