ICU-4060 Add cleaned-up statistic tool.

X-SVN-Rev: 17820
This commit is contained in:
Eric Mader 2005-06-07 23:18:53 +00:00
parent 65a2884e40
commit 7007286fe5
5 changed files with 1100 additions and 0 deletions

View file

@ -0,0 +1,188 @@
/*
***********************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
***********************************************************************
*
*/
package com.ibm.icu.dev.tool.charsetdet.sbcs;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
/**
 * Scores text against a list of n-grams and reports how often the text's
 * n-grams appear in that list.
 *
 * <p>A <code>Checker</code> acts as the client of an {@link NGramParser}: the
 * parser pulls characters through {@link #nextChar()} and reports every n-gram
 * it builds through {@link #handleNGram(String)}, where it is looked up in the
 * {@link NGramList} supplied at construction time.</p>
 *
 * @author emader
 */
public class Checker implements NGramParser.NGramParserClient
{
    /** Number of chars read from a data file per scoring pass. */
    private static final int BUFFER_SIZE = 1024;

    /** One bucket per whole percentage, 0 through 100 inclusive. */
    private static final int HISTOGRAM_SIZE = 101;

    private NGramList ngrams;
    private int totalNGrams;
    private int totalHits;
    private String language;
    private String encoding;
    private int[] histogram;
    private char[] buffer;
    private int bufIndex;
    private int bufMax;
    private NGramParser parser;

    /**
     * TODO This should take cumulative percent and the name...
     *
     * @param list the n-grams to look for; its key mapper is replaced by <code>dataFile</code>
     * @param dataFile the file the list was built from; supplies the language name
     *                 (file name up to the first '.') and the encoding name
     * @throws IllegalArgumentException if either argument is <code>null</code>
     */
    public Checker(NGramList list, InputFile dataFile)
    {
        if (list == null) {
            throw new IllegalArgumentException("list must not be null");
        }

        if (dataFile == null) {
            throw new IllegalArgumentException("dataFile must not be null");
        }

        ngrams = list;
        ngrams.setMapper(dataFile);

        language = languageName(dataFile.getFilename());
        encoding = dataFile.getEncoding();

        buffer = new char[BUFFER_SIZE];
        parser = new NGramParser(this);
        histogram = new int[HISTOGRAM_SIZE];

        resetCounts();
        resetHistogram();
    }

    /**
     * Parser callback: counts the n-gram, and counts a hit if it is in the list.
     *
     * @param key the n-gram text produced by the parser
     */
    public void handleNGram(String key)
    {
        totalNGrams += 1;

        if (ngrams.get(key) != null) {
            totalHits += 1;
        }
    }

    private void resetCounts()
    {
        bufIndex = 0;
        totalNGrams = totalHits = 0;
    }

    private void resetHistogram()
    {
        for (int i = 0; i < histogram.length; i += 1) {
            histogram[i] = 0;
        }
    }

    private static void exceptionError(Exception e)
    {
        System.err.println("ioError: " + e.toString());
    }

    /**
     * Returns the part of <code>filename</code> before its first '.', or the
     * whole name when it contains no '.'.
     */
    private static String languageName(String filename)
    {
        int dot = filename.indexOf('.');

        return dot < 0 ? filename : filename.substring(0, dot);
    }

    /**
     * Refills the internal buffer from <code>inputFile</code>.
     *
     * @return <code>false</code> once the read reports end of input or fails
     */
    private boolean nextBuffer(InputFile inputFile)
    {
        try {
            bufMax = inputFile.read(buffer);
        } catch (Exception e) {
            bufMax = -1;
            exceptionError(e);
            return false;
        }

        bufIndex = 0;

        return bufMax >= 0;
    }

    /** Runs the parser over the current buffer contents with fresh counters. */
    private void parseBuffer()
    {
        resetCounts();
        parser.reset();
        parser.parse();
    }

    /**
     * Parser callback: returns the next buffered char, or 0 when the buffer is exhausted.
     */
    public char nextChar()
    {
        if (bufIndex >= bufMax) {
            return 0;
        }

        return buffer[bufIndex++];
    }

    public String getLanguage()
    {
        return language;
    }

    /**
     * Replaces the key mapper of the underlying n-gram list.
     */
    public void setMapper(InputFile file)
    {
        ngrams.setMapper(file);
    }

    /**
     * Scores the first <code>charCount</code> chars of <code>theBuffer</code>.
     *
     * @return the number of n-grams in the text that are present in the list
     */
    public int checkBuffer(char[] theBuffer, int charCount)
    {
        // Borrow the caller's array only for this pass so that a later check()
        // never reads file data into memory owned by the caller.
        char[] ownBuffer = buffer;

        buffer = theBuffer;
        bufMax = charCount;

        try {
            parseBuffer();
        } finally {
            buffer = ownBuffer;
        }

        return totalHits;
    }

    /**
     * Reads <code>dataFile</code> one buffer at a time, computes the percentage
     * of n-grams in each buffer that are in the list, and prints a histogram of
     * those percentages to <code>System.out</code>.
     *
     * <p>Nothing is printed if the file cannot be opened; <code>open()</code>
     * reports that failure itself.</p>
     */
    public void check(InputFile dataFile)
    {
        if (!dataFile.open()) {
            return;
        }

        try {
            int minHist = HISTOGRAM_SIZE, maxHist = -1;
            String dataFilename = dataFile.getFilename();
            String fileEncoding = dataFile.getEncoding();

            System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:");

            setMapper(dataFile);
            resetHistogram();

            while (nextBuffer(dataFile)) {
                parseBuffer();

                // A buffer that yields no n-grams at all is counted as 0%.
                int ph = 0;

                if (totalNGrams != 0) {
                    ph = (int) ((double) totalHits / totalNGrams * 100.0);
                }

                if (ph < minHist) {
                    minHist = ph;
                }

                if (ph > maxHist) {
                    maxHist = ph;
                }

                // ph can be exactly 100, hence the 101-entry histogram.
                histogram[ph] += 1;
            }

            for (int ph = minHist; ph <= maxHist; ph += 1) {
                System.out.println(ph + "\t" + histogram[ph]);
            }

            System.out.println();
        } finally {
            dataFile.close();
        }
    }
}

View file

@ -0,0 +1,175 @@
/*
***********************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
***********************************************************************
*
*/
package com.ibm.icu.dev.tool.charsetdet.sbcs;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
/**
 * A text file that is read as UTF-8, paired with a target charset used to
 * convert between chars and bytes.
 *
 * <p>As an {@link NGramList.NGramKeyMapper} it maps an n-gram string to an
 * <code>Integer</code> built from the n-gram's bytes in the target charset.</p>
 *
 * @author emader
 */
public class InputFile implements NGramList.NGramKeyMapper
{
    private File file;
    private FileInputStream fileStream;
    private InputStreamReader inputStream;
    private Charset charset;
    private CharsetDecoder decoder;
    private CharsetEncoder encoder;
    private boolean visualOrder;

    private static void exceptionError(Exception e)
    {
        System.err.println("ioError: " + e.toString());
    }

    /**
     * @param filename path of the file to read
     * @param encoding name of the target charset, resolved with <code>Charset.forName</code>
     * @param visual   flag returned unchanged by {@link #getVisualOrder()}
     * @throws IllegalArgumentException if <code>encoding</code> is null, illegal or unsupported
     */
    public InputFile(String filename, String encoding, boolean visual)
    {
        file = new File(filename);
        setEncoding(encoding);
        visualOrder = visual;
    }

    /**
     * Opens the file for reading as UTF-8, closing any stream left over from an
     * earlier <code>open()</code>.
     *
     * @return <code>true</code> on success; on failure the error is printed to
     *         <code>System.err</code> and <code>false</code> is returned
     */
    public boolean open()
    {
        close();

        FileInputStream stream = null;

        try {
            stream = new FileInputStream(file);
            inputStream = new InputStreamReader(stream, "UTF8");
            fileStream = stream;
        } catch (Exception e) {
            exceptionError(e);

            // Don't leak the byte stream if only the reader could not be created.
            if (stream != null) {
                try {
                    stream.close();
                } catch (Exception ignored) {
                    // best effort; the open has already failed
                }
            }

            return false;
        }

        return true;
    }

    /**
     * Closes the file if it is open. Failures while closing are ignored.
     */
    public void close()
    {
        InputStreamReader reader = inputStream;
        FileInputStream stream = fileStream;

        inputStream = null;
        fileStream = null;

        try {
            if (reader != null) {
                reader.close();
            }
        } catch (Exception ignored) {
            // don't really care if this fails...
        }

        try {
            if (stream != null) {
                stream.close();
            }
        } catch (Exception ignored) {
            // don't really care if this fails...
        }
    }

    public String getFilename()
    {
        return file.getName();
    }

    public String getParent()
    {
        return file.getParent();
    }

    public String getPath()
    {
        return file.getPath();
    }

    /**
     * Reads up to <code>buffer.length</code> chars into <code>buffer</code>.
     *
     * @return the number of chars read, or -1 at end of input, when the file is
     *         not open, or when the read fails (the failure is printed to
     *         <code>System.err</code>)
     */
    public int read(char[] buffer)
    {
        if (inputStream == null) {
            System.err.println("ioError: " + file.getPath() + " is not open");
            return -1;
        }

        int charsRead = -1;

        try {
            charsRead = inputStream.read(buffer, 0, buffer.length);
        } catch (Exception e) {
            exceptionError(e);
        }

        return charsRead;
    }

    /**
     * Selects the target charset and creates its encoder and decoder, both set
     * to replace malformed input and unmappable characters rather than fail.
     *
     * @throws IllegalArgumentException if <code>encoding</code> is null, illegal or unsupported
     */
    public void setEncoding(String encoding)
    {
        charset = Charset.forName(encoding);

        decoder = charset.newDecoder();
        encoder = charset.newEncoder();

        encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        encoder.onMalformedInput(CodingErrorAction.REPLACE);

        decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        decoder.onMalformedInput(CodingErrorAction.REPLACE);
    }

    public String getEncoding()
    {
        return charset.displayName();
    }

    public boolean getVisualOrder()
    {
        return visualOrder;
    }

    /**
     * Encodes <code>key</code> in the target charset and packs the leading bytes,
     * one per char of the key, big-endian into an <code>Integer</code>.
     *
     * @throws IllegalStateException if the key cannot be encoded
     */
    public Object mapKey(String key)
    {
        byte[] bytes = encode(key.toCharArray());

        if (bytes == null) {
            throw new IllegalStateException("cannot encode n-gram \"" + key + "\" in " + getEncoding());
        }

        // Never read past the bytes the encoder actually produced.
        int length = Math.min(key.length(), bytes.length);
        int value = 0;

        for (int b = 0; b < length; b += 1) {
            value = (value << 8) | (bytes[b] & 0xFF);
        }

        return new Integer(value);
    }

    /**
     * Encodes <code>chars</code> in the target charset.
     *
     * @return exactly the encoded bytes, or <code>null</code> if encoding fails
     */
    public byte[] encode(char[] chars)
    {
        ByteBuffer bb;

        try {
            bb = encoder.encode(CharBuffer.wrap(chars));
        } catch (CharacterCodingException e) {
            // don't expect to get any exceptions in normal usage...
            return null;
        }

        // The buffer's backing array may be larger than its content, so copy
        // out only the bytes that were written.
        byte[] result = new byte[bb.remaining()];

        bb.get(result);

        return result;
    }

    /**
     * Decodes <code>bytes</code> from the target charset.
     *
     * @return exactly the decoded chars, or <code>null</code> if decoding fails
     */
    public char[] decode(byte[] bytes)
    {
        CharBuffer cb;

        try {
            cb = decoder.decode(ByteBuffer.wrap(bytes));
        } catch (CharacterCodingException e) {
            // don't expect to get any exceptions in normal usage...
            return null;
        }

        // As in encode(): the backing array may have spare capacity.
        char[] result = new char[cb.remaining()];

        cb.get(result);

        return result;
    }
}

View file

@ -0,0 +1,144 @@
/*
***********************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
***********************************************************************
*
*/
package com.ibm.icu.dev.tool.charsetdet.sbcs;
import java.util.Collection;
import java.util.TreeMap;
import com.ibm.icu.text.UnicodeSet;
/**
 * A counted collection of n-grams, stored in a sorted map under keys produced
 * by a pluggable {@link NGramKeyMapper}.
 *
 * @author emader
 */
public class NGramList
{
    /**
     * Converts an n-gram string into the object used as its map key.
     * All keys stored in one list must be mutually comparable, because they
     * are kept in a <code>TreeMap</code>.
     */
    public interface NGramKeyMapper
    {
        Object mapKey(String key);
    }

    /**
     * An n-gram string together with the number of times it has been seen.
     */
    public static final class NGram implements Comparable
    {
        private String value;
        private int refCount;

        public NGram(String theValue, int theRefCount)
        {
            value = theValue;
            refCount = theRefCount;
        }

        /** Creates an n-gram that has been seen once. */
        public NGram(String theValue)
        {
            this(theValue, 1);
        }

        /** Copies the value and reference count of <code>other</code>. */
        public NGram(NGram other)
        {
            this(other.getValue(), other.getRefCount());
        }

        public final String getValue()
        {
            return value;
        }

        public final int getRefCount()
        {
            return refCount;
        }

        public final void incrementRefCount()
        {
            refCount += 1;
        }

        /**
         * Orders n-grams by descending reference count.
         *
         * Note: This makes higher refCounts come *before* lower refCounts...
         *
         * @throws ClassCastException if <code>o</code> is not an <code>NGram</code>
         */
        public int compareTo(Object o)
        {
            int otherCount = ((NGram) o).getRefCount();

            // Compare explicitly instead of subtracting: a difference of two
            // ints can overflow and report the wrong sign.
            if (otherCount > refCount) {
                return 1;
            }

            if (otherCount < refCount) {
                return -1;
            }

            return 0;
        }
    }

    protected TreeMap ngrams;
    protected int totalNGrams;
    protected int uniqueNGrams;
    protected final int N_GRAM_SIZE = 3;

    private NGramKeyMapper keyMapper;

    /**
     * Creates an empty list whose keys are produced by <code>theMapper</code>.
     */
    public NGramList(NGramKeyMapper theMapper)
    {
        keyMapper = theMapper;
        ngrams = new TreeMap();
        totalNGrams = uniqueNGrams = 0;
    }

    /**
     * Replaces the mapper used by {@link #get(String)} and {@link #put(String)}.
     * Entries already stored keep the keys they were stored under.
     */
    public void setMapper(NGramKeyMapper nGramKeyMapper)
    {
        keyMapper = nGramKeyMapper;
    }

    /**
     * Looks up an entry by an already-mapped key.
     *
     * @return the entry, or <code>null</code> if there is none
     */
    public NGram get(Object mappedKey)
    {
        return (NGram) ngrams.get(mappedKey);
    }

    /**
     * Maps <code>key</code> with the current mapper and looks up the result.
     *
     * @return the entry, or <code>null</code> if there is none
     */
    public NGram get(String key)
    {
        Object mappedKey = keyMapper.mapKey(key);

        return get(mappedKey);
    }

    /**
     * Records one occurrence of <code>key</code>: adds a new entry with a count
     * of one, or increments the count of the existing entry.
     */
    public void put(String key)
    {
        Object mappedKey = keyMapper.mapKey(key);
        NGram ngram = get(mappedKey);

        totalNGrams += 1;

        if (ngram == null) {
            uniqueNGrams += 1;
            ngrams.put(mappedKey, new NGram(key));
        } else {
            ngram.incrementRefCount();
        }
    }

    /** Returns the map's own collection of {@link NGram} entries, in key order. */
    public Collection values()
    {
        return ngrams.values();
    }

    /** Returns the map's own set of mapped keys, in ascending order. */
    public Collection keys()
    {
        return ngrams.keySet();
    }

    /** Returns the number of calls made to {@link #put(String)}. */
    public int getTotalNGrams()
    {
        return totalNGrams;
    }

    /** Returns the number of distinct keys added through {@link #put(String)}. */
    public int getUniqueNGrams()
    {
        return uniqueNGrams;
    }
}

View file

@ -0,0 +1,161 @@
/*
***********************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
***********************************************************************
*
*/
package com.ibm.icu.dev.tool.charsetdet.sbcs;
import com.ibm.icu.text.UnicodeSet;
/**
 * Splits a stream of characters into overlapping n-grams of lower-cased
 * letters and spaces, driven by a small state machine.
 *
 * <p>Characters are pulled from, and finished n-grams pushed to, an
 * {@link NGramParserClient}.</p>
 *
 * @author emader
 */
public class NGramParser
{
    /**
     * Source of characters and sink for n-grams.
     */
    public interface NGramParserClient
    {
        /** Returns the next character, or 0 when there are no more. */
        char nextChar();

        /** Receives each completed n-gram. */
        void handleNGram(String key);
    }

    /*
     * Actions
     */
    private static final int A_NULL = 0;   // do nothing
    private static final int A_ADDC = 1;   // add the (lower-cased) character
    private static final int A_ADDS = 2;   // add a space

    /*
     * Character classes
     */
    public static final int C_IGNORE = 0;
    public static final int C_LETTER = 1;
    public static final int C_PUNCT  = 2;

    /*
     * States
     */
    private static final int S_START  = 0;
    private static final int S_LETTER = 1;
    private static final int S_PUNCT  = 2;

    /**
     * One cell of the transition table: the state to move to and the action to perform.
     */
    static final class StateEntry
    {
        private int newState;
        private int action;

        StateEntry(int theState, int theAction)
        {
            newState = theState;
            action = theAction;
        }

        public int getNewState()
        {
            return newState;
        }

        public int getAction()
        {
            return action;
        }
    }

    // Indexed as stateTable[current state][character class].
    private StateEntry[][] stateTable = {
        // S_START
        {
            new StateEntry(S_START,  A_NULL),
            new StateEntry(S_LETTER, A_ADDC),
            new StateEntry(S_PUNCT,  A_ADDS)
        },
        // S_LETTER
        {
            new StateEntry(S_LETTER, A_NULL),
            new StateEntry(S_LETTER, A_ADDC),
            new StateEntry(S_PUNCT,  A_ADDS)
        },
        // S_PUNCT
        {
            new StateEntry(S_PUNCT,  A_NULL),
            new StateEntry(S_LETTER, A_ADDC),
            new StateEntry(S_PUNCT,  A_NULL)
        }
    };

    protected final int N_GRAM_SIZE = 3;

    private char[] letters = new char[N_GRAM_SIZE];
    private int letterCount;

    private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");

    private NGramParserClient client;

    /**
     * Creates a parser with an empty n-gram window that talks to <code>theClient</code>.
     */
    public NGramParser(NGramParserClient theClient)
    {
        client = theClient;
        letterCount = 0;
    }

    public void setClient(NGramParserClient theClient)
    {
        client = theClient;
    }

    // TODO Is this good enough, or are there other C_IGNORE characters?
    // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
    /**
     * Classifies a character: apostrophe and U+FEFF are ignored, members of the
     * letter set are letters, and everything else is punctuation.
     */
    public static int getCharClass(char ch)
    {
        boolean ignorable = (ch == '\'' || ch == '\uFEFF');

        if (!ignorable) {
            return letterSet.contains(ch) ? C_LETTER : C_PUNCT;
        }

        return C_IGNORE;
    }

    /** Empties the n-gram window. */
    public void reset()
    {
        letterCount = 0;
    }

    /**
     * Appends a character to the window. Once the window is full the n-gram is
     * handed to the client and the window slides forward by one character.
     */
    public void addLetter(char letter)
    {
        // somewhat clever stuff goes here...
        letters[letterCount++] = letter;

        if (letterCount < N_GRAM_SIZE) {
            return;
        }

        client.handleNGram(new String(letters));

        // Drop the oldest character, keeping the newest N_GRAM_SIZE - 1.
        int kept = N_GRAM_SIZE - 1;

        letterCount = kept;
        System.arraycopy(letters, 1, letters, 0, kept);
    }

    /**
     * Consumes characters from the client until it returns 0, feeding the
     * window as directed by the state table, then adds one final space.
     */
    public void parse()
    {
        int currentState = S_START;

        // this is where the clever stuff goes...
        for (char next = client.nextChar(); next != 0; next = client.nextChar()) {
            StateEntry transition = stateTable[currentState][getCharClass(next)];
            int action = transition.getAction();

            currentState = transition.getNewState();

            if (action == A_ADDC) {
                addLetter(Character.toLowerCase(next));
            } else if (action == A_ADDS) {
                addLetter(' ');
            }
            // A_NULL and anything else: nothing to add
        }

        addLetter(' ');
    }
}

View file

@ -0,0 +1,432 @@
/*
***********************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
***********************************************************************
*
*/
package com.ibm.icu.dev.tool.charsetdet.sbcs;
import java.io.*;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import com.ibm.icu.impl.Utility;
/**
 * Command-line tool that reads Unicode text files, counts their n-grams, and
 * writes the tables needed to detect each file's language and character set.
 *
 * <p>For every input file it writes a ".raw" listing of all n-grams with their
 * frequencies and a ".dat" file holding the most frequent n-grams and a byte
 * map for the target encoding. It can also run a {@link Checker} over the
 * files (<code>-c</code>) and a detection test (<code>-t</code>).</p>
 *
 * @author emader
 */
public class StatisticsTool implements NGramParser.NGramParserClient, NGramList.NGramKeyMapper
{
    /* TODO Make this usage string more sane. */
    private static final String usageString =
        "\nUsage: StatisticsTool [OPTIONS] [FILES]\n\n" +
        "This program will read in a Unicode text file of text in a particular language\n" +
        "and compute the statistics needed to detect that language and character set.\n" +
        "Options:\n" +
        "-e specify the target encoding (must come before the files it applies to)\n" +
        "-h or -? print this usage text.\n" +
        "-v also generate statistics for visual order.\n" +
        "-l only generate statistics for logical order (cancel -v).\n" +
        "-c run the checker.\n" +
        "-t run the encoding test.\n" +
        "example: com.ibm.icu.dev.tool.charsetdet.sbcs.StatisticsTool -e 8859-1 Spanish.txt";

    private static final int BUFFER_SIZE = 1024;

    /** Number of most-frequent n-grams kept in the generated statistics. */
    private static final int MAX_STAT_NGRAMS = 64;

    /** Number of chars scored at a time by the detection test. */
    private static final int DETECTION_BUFFER_SIZE = 128;

    private char[] buffer;
    private int bufIndex;
    private int bufMax;

    private InputFile inputFile;
    private NGramList ngrams;

    /** Every byte value from 0x00 through 0xFF, in order. */
    private static final byte[] allBytes = makeAllBytes();

    private static byte[] makeAllBytes()
    {
        byte[] bytes = new byte[256];

        for (int b = 0; b < bytes.length; b += 1) {
            bytes[b] = (byte) b;
        }

        return bytes;
    }

    /**
     * Creates a tool whose read buffer is primed with a single space.
     */
    public StatisticsTool()
    {
        buffer = new char[BUFFER_SIZE];
        primeBuffer();
    }

    /**
     * Makes the next character handed to the parser a space, ahead of any file data.
     */
    private void primeBuffer()
    {
        buffer[0] = ' ';
        bufIndex = 0;
        bufMax = 1;
    }

    private static void usage()
    {
        System.out.println(usageString);
    }

    private static void exceptionError(Exception e)
    {
        System.err.println("ioError: " + e.toString());
    }

    private int nextBuffer(InputFile inputFile)
    {
        bufIndex = 0;

        return inputFile.read(buffer);
    }

    /**
     * Parser callback: returns the next char of the current input file,
     * refilling the buffer as needed, or 0 at end of input.
     */
    public char nextChar()
    {
        while (bufIndex >= bufMax) {
            if (inputFile == null) {
                return 0;
            }

            bufMax = nextBuffer(inputFile);

            if (bufMax < 0) {
                return 0;
            }
        }

        return buffer[bufIndex++];
    }

    /**
     * Parser callback: counts the n-gram in the list being collected.
     */
    public void handleNGram(String key)
    {
        ngrams.put(key);
    }

    /**
     * Key mapper used while collecting: the n-gram string is its own key.
     */
    public Object mapKey(String key)
    {
        return key;
    }

    /**
     * Returns the index of the '.' that starts the extension of the last path
     * component, or <code>path.length()</code> if that component has none.
     */
    private static int extensionIndex(String path)
    {
        int dot = path.lastIndexOf('.');
        int separator = Math.max(path.lastIndexOf('/'), path.lastIndexOf(File.separatorChar));

        return dot > separator ? dot : path.length();
    }

    /**
     * Writes every collected n-gram, most frequent first, to a ".raw" file next
     * to the input file, and builds the list of the most frequent ones keyed by
     * the input file's encoding.
     *
     * @return the list of most frequent n-grams, or <code>null</code> if the
     *         output file could not be opened
     */
    private NGramList dumpNGrams()
    {
        String filename = inputFile.getPath();
        int extension = extensionIndex(filename);
        String outputFileName = filename.substring(0, extension) + ".raw" + filename.substring(extension);
        PrintStream output;

        try {
            output = new PrintStream(
                new FileOutputStream(outputFileName), true, "UTF8");
        } catch (IOException e) {
            System.out.println("? Could not open " + outputFileName + " for writing.");
            return null;
        }

        try {
            System.out.println(inputFile.getFilename() + ": " + ngrams.getUniqueNGrams() + "/" + ngrams.getTotalNGrams());

            ArrayList array = new ArrayList(ngrams.values());

            // NGram's natural order puts the highest reference counts first.
            Collections.sort(array);

            NGramList stats = new NGramList(inputFile);
            int totalNGrams = ngrams.getTotalNGrams();
            double cumulative = 0;
            int count = 0;

            for (Iterator it = array.iterator(); it.hasNext(); count += 1) {
                NGramList.NGram ngram = (NGramList.NGram) it.next();
                String value = ngram.getValue();
                int refCount = ngram.getRefCount();
                double ratio = (double) refCount / totalNGrams * 100.0;

                cumulative += ratio;

                // TODO check should be count < max && cumulative < maxPercent
                if (count < MAX_STAT_NGRAMS) {
                    stats.put(value);
                }

                output.println(value + "\t" + refCount + "\t" + ratio + "%\t" + cumulative + "%");
            }

            return stats;
        } finally {
            output.close();
        }
    }

    /**
     * Writes the n-gram keys and the byte map for the current input file's
     * encoding, as Java array initializers, to a ".dat" file next to the input file.
     *
     * @param keyList <code>Integer</code> n-gram keys, in the order to be written
     * @param visual  <code>true</code> to write the "-visual.dat" variant
     */
    private void writeStatistics(ArrayList keyList, boolean visual)
    {
        String filename = inputFile.getPath();
        int extension = extensionIndex(filename);
        String outputFileName = filename.substring(0, extension) + "-" + inputFile.getEncoding() +
            (visual ? "-visual.dat" : ".dat");
        PrintStream output;

        try {
            output = new PrintStream(
                new FileOutputStream(outputFileName), true, "ASCII");
        } catch (IOException e) {
            System.out.println("? Could not open " + outputFileName + " for writing.");
            return;
        }

        try {
            int i = 0;

            output.print(" private static int[] ngrams = {");

            for (Iterator it = keyList.iterator(); it.hasNext(); i += 1) {
                Integer ngram = (Integer) it.next();

                if (i % 16 == 0) {
                    output.print("\n ");
                }

                output.print("0x" + Utility.hex(ngram.intValue(), 6) + ", ");
            }

            output.println("\n };\n");

            /*
             * Generate the byte map
             */
            char[] unicodes = inputFile.decode(allBytes);

            if (unicodes == null || unicodes.length < allBytes.length) {
                System.out.println("? Could not build the byte map for " + inputFile.getEncoding() + ".");
                return;
            }

            // Reduce every byte's character the same way the parser does:
            // letters are lower-cased, punctuation becomes a space, and
            // ignored characters become NUL.
            for (int b = 0; b < allBytes.length; b += 1) {
                char unicode = unicodes[b];
                int charClass = NGramParser.getCharClass(unicode);

                switch (charClass) {
                case NGramParser.C_LETTER:
                    unicodes[b] = Character.toLowerCase(unicode);
                    break;

                case NGramParser.C_PUNCT:
                    unicodes[b] = ' ';
                    break;

                case NGramParser.C_IGNORE:
                default:
                    unicodes[b] = '\0';
                }
            }

            byte[] byteMap = inputFile.encode(unicodes);

            if (byteMap == null || byteMap.length < allBytes.length) {
                System.out.println("? Could not build the byte map for " + inputFile.getEncoding() + ".");
                return;
            }

            output.print(" private static byte[] byteMap = {");

            for (int b = 0; b < allBytes.length; b += 1) {
                if (b % 8 == 0) {
                    output.print("\n ");
                }

                output.print("(byte) 0x" + Utility.hex(byteMap[b] & 0xFF, 2) + ", ");
            }

            output.println("\n };");
        } finally {
            output.close();
        }
    }

    /**
     * Parses <code>file</code>, writes its ".raw" and ".dat" output files (plus
     * the "-visual.dat" file when the file asks for visual order), and returns
     * the list of its most frequent n-grams.
     *
     * @return the statistics, or <code>null</code> if the file could not be
     *         opened or the ".raw" file could not be written
     */
    public NGramList collectStatistics(InputFile file)
    {
        if (file == null || !file.open()) {
            return null;
        }

        inputFile = file;

        // Start every file, not just the first, with the leading space.
        primeBuffer();

        NGramParser parser = new NGramParser(this);

        ngrams = new NGramList(this);

        try {
            parser.parse();
        } finally {
            file.close();
        }

        NGramList stats = dumpNGrams();

        if (stats == null) {
            return null;
        }

        ArrayList statKeys = new ArrayList(stats.keys());

        Collections.sort(statKeys);
        writeStatistics(statKeys, false);

        if (inputFile.getVisualOrder()) {
            ArrayList reversed = new ArrayList(statKeys.size());

            for (Iterator it = statKeys.iterator(); it.hasNext();) {
                Integer key = (Integer) it.next();
                int k = key.intValue();
                int r = 0;

                // Reverse the order of the key's bytes. The unsigned shift
                // guarantees termination even if the top bit is set.
                while (k != 0) {
                    r = (r << 8) | (k & 0xFF);
                    k >>>= 8;
                }

                reversed.add(new Integer(r));
            }

            Collections.sort(reversed);
            writeStatistics(reversed, true);
        }

        return stats;
    }

    /**
     * Runs every checker against every file and prints the resulting histograms.
     */
    private static void runCheckers(Checker[] checkers, InputFile[] files, int count)
    {
        for (int c = 0; c < count; c += 1) {
            Checker checker = checkers[c];

            for (int f = 0; f < count; f += 1) {
                checker.check(files[f]);
            }
        }
    }

    /**
     * For each file, scores successive buffers of text with every checker and
     * prints how many buffers each checker scored highest on.
     */
    private static void runDetectionTest(Checker[] checkers, InputFile[] files, int count)
    {
        char[] buffer = new char[DETECTION_BUFFER_SIZE];

        System.out.println("Detection test");

        for (int f = 0; f < count; f += 1) {
            InputFile file = files[f];
            int[] histogram = new int[count];
            int charCount, misses = 0;

            System.out.println(file.getFilename() + "(" + file.getEncoding() + "):");

            if (!file.open()) {
                // open() has already reported the failure.
                continue;
            }

            try {
                for (int c = 0; c < count; c += 1) {
                    checkers[c].setMapper(file);
                }

                // for each buffer
                //     for each checker
                //         call checkBuffer, save score
                //     find highest score, update histogram for that checker
                // show checker histogram
                while ((charCount = file.read(buffer)) > 0) {
                    int bestFit = -1, maxScore = 0;

                    for (int c = 0; c < count; c += 1) {
                        int score = checkers[c].checkBuffer(buffer, charCount);

                        // Strictly greater: ties go to the earliest checker,
                        // and a score of zero never wins.
                        if (score > maxScore) {
                            maxScore = score;
                            bestFit = c;
                        }
                    }

                    if (bestFit >= 0) {
                        histogram[bestFit] += 1;
                    } else {
                        misses += 1;
                    }
                }
            } finally {
                file.close();
            }

            for (int c = 0; c < count; c += 1) {
                System.out.println(" " + checkers[c].getLanguage() + ": " + histogram[c]);
            }

            if (misses > 0) {
                System.out.println(" NONE: " + misses);
            }

            System.out.println();
        }
    }

    /**
     * Command-line entry point; see the usage text for the options.
     * Options apply to the files that follow them on the command line.
     */
    public static void main(String[] args)
    {
        List list = Arrays.asList(args);
        InputFile[] requested = new InputFile[args.length];
        int requestedCount = 0;
        String encoding = null;
        boolean runChecker = false;
        boolean encodingTest = false;
        boolean visualOrder = false;

        for (Iterator it = list.iterator(); it.hasNext(); /*anything?*/) {
            String arg = (String) it.next();

            if (arg.equals("-v")) {
                visualOrder = true;
            } else if (arg.equals("-l")) {
                visualOrder = false;
            } else if (arg.equals("-c")) {
                runChecker = true;
            } else if (arg.equals("-t")) {
                encodingTest = true;
            } else if (arg.equals("-e")) {
                if (!it.hasNext()) {
                    System.err.println("Error: missing encoding.");
                    usage();
                    return;
                }

                encoding = (String) it.next();
            } else if (arg.startsWith("-")) {
                if (! (arg.equals("-h") || arg.equals("-?"))) {
                    System.err.println("Error: unknown option " + arg);
                }

                usage();
                return;
            } else {
                if (encoding == null) {
                    System.err.println("Error: no encoding given for " + arg + "; use -e before the file name.");
                    usage();
                    return;
                }

                try {
                    requested[requestedCount] = new InputFile(arg, encoding, visualOrder);
                } catch (IllegalArgumentException e) {
                    // Charset.forName rejects illegal and unsupported names this way.
                    System.err.println("Error: unsupported encoding " + encoding);
                    return;
                }

                requestedCount += 1;
            }
        }

        if (requestedCount == 0) {
            System.err.println("Error: there are no files to process.");
            usage();
            return;
        }

        StatisticsTool tool = new StatisticsTool();
        InputFile[] files = new InputFile[requestedCount];
        Checker[] checkers = new Checker[requestedCount];
        int fileCount = 0;

        for (int i = 0; i < requestedCount; i += 1) {
            InputFile file = requested[i];
            NGramList stats = tool.collectStatistics(file);

            if (stats == null) {
                System.err.println("Error: could not collect statistics for " + file.getPath() + "; skipping it.");
                continue;
            }

            files[fileCount] = file;
            checkers[fileCount] = new Checker(stats, file);
            fileCount += 1;
        }

        System.out.println();

        /*
         * Checkers
         */
        if (runChecker) {
            runCheckers(checkers, files, fileCount);
        }

        /*
         * Detection test
         */
        if (encodingTest) {
            runDetectionTest(checkers, files, fileCount);
        }
    }
}