ICU-12436 Remove obsolete copy of Thai dictionary and dictionary tools from ICU4J

X-SVN-Rev: 38619
This commit is contained in:
Andy Heninger 2016-04-15 22:50:43 +00:00
parent 96f349b049
commit 2c572efdb3
4 changed files with 0 additions and 946 deletions

1
.gitattributes vendored
View file

@ -145,7 +145,6 @@ icu4j/main/shared/data/icudata.jar -text
icu4j/main/shared/data/icutzdata.jar -text
icu4j/main/shared/data/testdata.jar -text
icu4j/main/tests/core/src/com/ibm/icu/dev/data/rbbi/english.dict -text
icu4j/main/tests/core/src/com/ibm/icu/dev/data/thai6.ucs -text
icu4j/main/tests/core/src/com/ibm/icu/dev/test/serializable/data/ICU_3.6/com.ibm.icu.impl.OlsonTimeZone.dat -text
icu4j/main/tests/core/src/com/ibm/icu/dev/test/serializable/data/ICU_3.6/com.ibm.icu.impl.TimeZoneAdapter.dat -text
icu4j/main/tests/core/src/com/ibm/icu/dev/test/serializable/data/ICU_3.6/com.ibm.icu.math.BigDecimal.dat -text

View file

@ -1,880 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 1996-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.tool.rbbi;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.Vector;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.CompactByteArray;
public class BuildDictionaryFile {
/**
 * Command-line entry point.
 *
 * Arguments: {@code <wordListFile> [encoding] [outputFile] [listingFile]}.
 * The word list is always read and compiled; the binary dictionary is written
 * only when {@code outputFile} is given, and the word listing only when
 * {@code listingFile} is given.
 *
 * @param args command-line arguments as described above
 * @throws FileNotFoundException if one of the named files cannot be opened
 * @throws UnsupportedEncodingException if the requested encoding is unknown
 * @throws IOException if reading or writing fails
 */
public static void main(String args[])
        throws FileNotFoundException, UnsupportedEncodingException, IOException {
    // The input file name is mandatory; report usage instead of failing with
    // an ArrayIndexOutOfBoundsException.
    if (args.length < 1) {
        System.err.println("Usage: BuildDictionaryFile <wordListFile> [encoding] [outputFile] [listingFile]");
        return;
    }
    String filename = args[0];
    String encoding = (args.length >= 2) ? args[1] : "";
    String outputFile = (args.length >= 3) ? args[2] : "";
    String listingFile = (args.length >= 4) ? args[3] : "";

    BuildDictionaryFile dictionary = new BuildDictionaryFile();
    dictionary.build(filename, encoding);

    if (outputFile.length() != 0) {
        DataOutputStream out = new DataOutputStream(new FileOutputStream(outputFile));
        try {
            dictionary.writeDictionaryFile(out);
        } finally {
            // writeDictionaryFile closes the stream on success; closing again
            // is harmless and guarantees release if it threw part-way through.
            out.close();
        }
    }

    if (listingFile.length() != 0) {
        PrintWriter listing = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(listingFile), "UnicodeLittle"));
        try {
            dictionary.printWordList("", 0, listing);
        } finally {
            listing.close();
        }
    }
}
/**
 * Creates a builder with no dictionary loaded; call
 * {@link #build(String, String)} to populate it.
 */
public BuildDictionaryFile() {
    // Nothing to initialize here.
}
/**
 * Builds the dictionary tables from a word-list file.
 *
 * The file is read twice: the first pass collects the characters that occur
 * (see buildColumnMap), the second pass builds the state table (see
 * buildStateTable). Each pass uses its own reader, which is closed when the
 * pass finishes or fails.
 *
 * @param filename path of the word-list file
 * @param encoding name of the file's character encoding, or the empty string
 *                 to use the platform default
 * @throws FileNotFoundException if the file cannot be opened
 * @throws UnsupportedEncodingException if {@code encoding} is not supported
 * @throws IOException if reading fails
 */
public void build(String filename, String encoding)
        throws FileNotFoundException, UnsupportedEncodingException, IOException {
    InputStreamReader in = openReader(filename, encoding);
    try {
        buildColumnMap(in);
    } finally {
        in.close();
    }

    in = openReader(filename, encoding);
    try {
        buildStateTable(in);
    } finally {
        in.close();
    }
    //printTable();
}

/**
 * Opens a reader on the given file, using the platform default encoding when
 * {@code encoding} is empty. The underlying stream is closed again if the
 * encoding turns out to be unsupported.
 */
private static InputStreamReader openReader(String filename, String encoding)
        throws FileNotFoundException, UnsupportedEncodingException, IOException {
    FileInputStream file = new FileInputStream(filename);
    try {
        if (encoding.length() == 0) {
            return new InputStreamReader(file);
        }
        return new InputStreamReader(file, encoding);
    } catch (UnsupportedEncodingException e) {
        file.close();
        throw e;
    }
}
/**
 * First pass over the word list: assigns a column number to every character
 * greater than the space character that occurs in the input.
 *
 * The upper- and lower-case forms of a character share one column. Column 0
 * is reserved (its reverse mapping is the space character). On return,
 * {@code columnMap}, {@code reverseColumnMap}, {@code numCols} and
 * {@code numColGroups} are set.
 *
 * @param in reader positioned at the start of the word list; read to
 *           end-of-stream but not closed
 * @throws IOException if reading fails
 * @throws IllegalArgumentException if the input needs more columns than a
 *         signed byte can number
 */
public void buildColumnMap(InputStreamReader in) throws IOException {
    System.out.println("Building column map...");
    UnicodeSet charsInFile = new UnicodeSet();
    int c = in.read();
    int totalChars = 0;
    while (c >= 0) {
        ++totalChars;
        if (totalChars % 5000 == 0) {
            System.out.println("Read " + totalChars + " characters...");
        }
        if (c > ' ') {
            charsInFile.add((char)c);
        }
        c = in.read();
    }
    // Test.debugPrintln(charsInFile.toString());
    StringBuffer tempReverseMap = new StringBuffer();
    tempReverseMap.append(' ');
    columnMap = new CompactByteArray();
    int n = charsInFile.getRangeCount();
    // Next column number to hand out. Kept as an int so that running out of
    // byte-sized column numbers is detected instead of silently wrapping.
    int p = 1;
    for (int i = 0; i < n; ++i) {
        int start = charsInFile.getRangeStart(i);
        int end = charsInFile.getRangeEnd(i);
        // Iterate with an int: a char loop variable would wrap around and
        // never terminate for a range ending at U+FFFF.
        for (int cp = start; cp <= end; cp++) {
            char ch = (char) cp;
            char upper = Character.toUpperCase(ch);
            char lower = Character.toLowerCase(ch);
            if (columnMap.elementAt(lower) == 0) {
                if (p > Byte.MAX_VALUE) {
                    throw new IllegalArgumentException(
                            "Too many distinct characters in input: more than "
                            + Byte.MAX_VALUE + " columns required");
                }
                columnMap.setElementAt(upper, upper, (byte) p);
                columnMap.setElementAt(lower, lower, (byte) p);
                ++p;
                tempReverseMap.append(ch);
            }
        }
    }
    //System.out.println("Compacting...");
    columnMap.compact();
    //System.out.println(tempReverseMap.toString());
    // tempReverseMap holds exactly p characters: the space plus one per column.
    reverseColumnMap = new char[p];
    tempReverseMap.getChars(0, p, reverseColumnMap, 0);
    System.out.println("total columns = " + p);
    numCols = p;
    numColGroups = (numCols >> 5) + 1;
    /*
    short[] index = columnMap.getIndexArray();
    System.out.println("Index:");
    for (int i = 0; i < index.length; i++) {
    if (i % 16 == 0) {
    System.out.println();
    System.out.print(" " + Integer.toHexString(i * 128) + ":");
    }
    System.out.print("\t" + Integer.toHexString(index[i]));
    }
    System.out.println();
    byte[] data = columnMap.getStringArray();
    System.out.print("Values:");
    for (int i = 0; i < data.length; i++) {
    if (i % 16 == 0) {
    System.out.println();
    System.out.print(" " + Integer.toHexString(i) + ":");
    }
    if (data[i] == 0)
    System.out.print("\t.");
    else
    System.out.print("\t" + Integer.toString(data[i]));
    }
    System.out.println();
    */
}
/**
 * Second pass over the word list: builds the state-transition table.
 *
 * Each temporary row is an {@code int[numCols + 1]}. Cells 0..numCols-1 hold
 * the next row number for the corresponding column (0 = no transition); cell 0
 * is set to -1 when a word ends in that row; the extra last cell counts the
 * populated cells of the row. A character whose column is 0 terminates the
 * current word. After reading, the table is compressed and flattened into
 * {@code table}.
 *
 * @param in reader positioned at the start of the word list; read to
 *           end-of-stream but not closed
 * @throws IOException if reading fails
 * @throws IllegalArgumentException if the compressed table has too many rows
 */
public void buildStateTable(InputStreamReader in) throws IOException {
    Vector tempTable = new Vector();
    tempTable.addElement(new int[numCols + 1]);
    int state = 0;
    int c = in.read();
    int[] row = null;
    int charsInWord = 0;
    while (c >= 0) {
        charsInWord++;
        short column = columnMap.elementAt((char)c);
        row = (int[])(tempTable.elementAt(state));
        if (column != 0) {
            if (row[column] == 0) {
                // First time this character follows this prefix: add a row.
                row[column] = tempTable.size();
                ++row[numCols];
                state = (tempTable.size());
                tempTable.addElement(new int[numCols + 1]);
            }
            else {
                state = row[column];
            }
        }
        else if (state != 0) {
            // Separator after at least one word character: end of a word.
            if (row[0] != -1) {
                row[0] = -1;
                ++row[numCols];
                uniqueWords++;
                totalUniqueWordChars += charsInWord;
            }
            totalWords++;
            if (totalWords % 5000 == 0) {
                System.out.println("Read " + totalWords + " words, " + tempTable.size() + " rows...");
            }
            charsInWord = 0;
            state = 0;
        }
        c = in.read();
    }
    // The input may end without a trailing separator; finish the last word.
    if (state != 0) {
        row = (int[])(tempTable.elementAt(state));
        if (row[0] != -1) {
            row[0] = -1;
            // Keep the populated-cell count in step with the in-loop
            // end-of-word handling above.
            ++row[numCols];
            uniqueWords++;
            totalUniqueWordChars += charsInWord;
        }
        totalWords++;
    }
    compress(tempTable);
    // Flatten the surviving rows, dropping the per-row count cell.
    table = new short[numCols * tempTable.size()];
    for (int i = 0; i < tempTable.size(); i++) {
        row = (int[])tempTable.elementAt(i);
        for (int j = 0; j < numCols; j++) {
            table[i * numCols + j] = (short)row[j];
        }
    }
}
/**
 * Shrinks the temporary state table in place: first removes duplicate rows,
 * then stacks compatible rows on top of each other, printing the table
 * dimensions before and after each step.
 *
 * @param tempTable the temporary table of {@code int[]} rows; modified in place
 * @throws IllegalArgumentException if more than 32767 rows remain after stacking
 */
private void compress(Vector tempTable) {
    reportTableSize("Before compression:", tempTable);

    deleteDuplicateRows(tempTable);
    reportTableSize("After removing duplicate rows:", tempTable);

    stackRows(tempTable);
    if (tempTable.size() > 32767) {
        throw new IllegalArgumentException("Too many rows in table!");
    }
    reportTableSize("After doubling up on rows:", tempTable);
}

/** Prints a heading followed by the current row, column and cell counts. */
private void reportTableSize(String heading, Vector tempTable) {
    int rows = tempTable.size();
    System.out.println(heading);
    System.out.println(" Number of rows = " + rows);
    System.out.println(" Number of columns = " + numCols);
    System.out.println(" Number of cells = " + rows * numCols);
}
/*
experimental...
private void deleteDuplicateRows(Vector tempTable) {
int[] rowNumMap = new int[tempTable.size()];
for (int i = 0; i < rowNumMap.length; i++)
rowNumMap[i] = i;
int nextClass = numCols;
int currentClass;
int lastClass;
boolean split;
int[] row1, row2, tempRow;
int tempCat;
do {
System.out.println("Making a pass (" + nextClass + " classes)...");
currentClass = 0;
lastClass = nextClass;
while (currentClass < nextClass) {
System.out.println(" currentClass = " + currentClass);
split = false;
row1 = row2 = null;
for (int i = 0; i < tempTable.size(); i++) {
tempRow = (int[])tempTable.elementAt(i);
if (tempRow[numCols] == currentClass) {
if (row1 == null) {
row1 = (int[])tempTable.elementAt(i);
}
else {
row2 = (int[])tempTable.elementAt(i);
for (int j = 0; j < numCols; j++) {
if ((row1[j] == 0) != (row2[j] == 0) ||
(row1[j] == -1) != (row2[j] == -1)) {
row2[numCols] = nextClass;
split = true;
break;
}
else if (row1[j] != 0 && row2[j] != 0 && row1[j] != -1
&& row2[j] != -1) {
tempRow = (int[])tempTable.elementAt(row1[j]);
tempCat = tempRow[numCols];
tempRow = (int[])tempTable.elementAt(row2[j]);
if (tempCat != tempRow[numCols]) {
row2[numCols] = nextClass;
split = true;
break;
}
}
}
}
}
}
if (split)
++nextClass;
++currentClass;
//System.out.println();
}
} while (lastClass != nextClass);
int[] representatives = new int[nextClass];
for (int i = 1; i < tempTable.size(); i++) {
tempRow = (int[])tempTable.elementAt(i);
if (representatives[tempRow[numCols]] == 0)
representatives[tempRow[numCols]] = i;
else
rowNumMap[i] = representatives[tempRow[numCols]];
}
System.out.println("Renumbering...");
// renumber all remaining rows
for (int i = 0; i < rowNumMap.length; i++)
if (rowNumMap[i] != i)
tempTable.setElementAt(null, i);
int newRowNum = 0;
for (int i = 0; i < rowNumMap.length; i++)
if (tempTable.elementAt(i) != null)
rowNumMap[i] = newRowNum++;
for (int i = 0; i < rowNumMap.length; i++)
if (tempTable.elementAt(i) == null)
rowNumMap[i] = rowNumMap[rowNumMap[i]];
for (int i = tempTable.size() - 1; i >= 0; i--) {
tempRow = (int[])tempTable.elementAt(i);
if (tempRow == null)
tempTable.removeElementAt(i);
else {
for (int j = 0; j < numCols; j++)
if (tempRow[j] != -1)
tempRow[j] = rowNumMap[j];
}
}
//for (int i = 1; i < rowNumMap.length; i++) rowNumMap[i] = i; int newRowNum = rowNumMap.length;
}
*/
/**
 * Removes rows of the temporary table that are duplicates of another row and
 * renumbers the transitions in the remaining rows accordingly.
 *
 * Rows are compared with compareRows, which looks at cells through the
 * current row mapping, so merging rows in one pass can make further rows
 * compare equal in the next; passes repeat until one deletes nothing.
 *
 * @param tempTable the temporary table of int[] rows; modified in place
 */
private void deleteDuplicateRows(Vector tempTable) {
// work is a shallow copy: it gets sorted and thinned out while tempTable
// keeps the original row order until the final clean-up below.
Vector work = (Vector)(tempTable.clone());
boolean didDeleteRow = true;
// tempMapping[k] is the original row number of the row currently at work[k];
// mapping[r] is the original row number that original row r stands for.
Vector tempMapping = new Vector(work.size());
int[] mapping = new int[work.size()];
for (int i = 0; i < mapping.length; i++) {
mapping[i] = i;
tempMapping.addElement(new Integer(i));
}
// tbd[r] is true once original row r has been marked "to be deleted".
boolean[] tbd = new boolean[work.size()];
while (didDeleteRow) {
System.out.println(" " + work.size() + " rows...");
int deletedRows = 0;
didDeleteRow = false;
// Sort everything except position 0 so that equal rows become adjacent.
sortTable(work, tempMapping, mapping, 1, work.size());
for (int i = 0; i < work.size() - 1; ) {
System.out.print("Deleting, inspecting row " + i + ", deleted " + deletedRows + " rows...\r");
int rowToDelete = ((Integer)(tempMapping.elementAt(i + 1))).intValue();
int rowToMapTo = ((Integer)(tempMapping.elementAt(i))).intValue();
if (compareRows((int[])work.elementAt(i), (int[])work.elementAt(i + 1),
mapping) == 0) {
tbd[rowToDelete] = true;
tempTable.setElementAt(null, rowToDelete);
// Follow the mapping chain until it reaches a row not marked for deletion.
while (tbd[mapping[rowToMapTo]])
mapping[rowToMapTo] = mapping[mapping[rowToMapTo]];
mapping[rowToDelete] = mapping[rowToMapTo];
didDeleteRow = true;
deletedRows++;
// Drop the duplicate; i is not advanced so the next row is compared
// against the same surviving row.
work.removeElementAt(i + 1);
tempMapping.removeElementAt(i + 1);
}
else
i++;
}
// Shorten mapping chains by one step where a deleted row maps to another deleted row.
for (int i = 0; i < mapping.length; i++) {
if (tbd[i] && tbd[mapping[i]])
mapping[i] = mapping[mapping[i]];
}
}
// Renumber: each kept row's entry is reduced by the number of deleted rows
// before it (presumably kept rows still map to themselves here — TODO confirm).
int decrementBy = 0;
for (int i = 0; i < mapping.length; i++) {
if (tbd[i])
decrementBy++;
else
mapping[i] -= decrementBy;
}
// Deleted rows take over the (already renumbered) entry of the row they map to.
for (int i = 0; i < mapping.length; i++) {
if (tbd[i])
mapping[i] = mapping[mapping[i]];
}
// Walk backwards so removals do not disturb the indices still to be visited;
// rewrite cells 0..numCols-1 of kept rows through the mapping, leaving -1 as is.
for (int i = tempTable.size() - 1; i >= 0; i--) {
if (tbd[i])
tempTable.removeElementAt(i);
else {
int[] row = (int[])tempTable.elementAt(i);
for (int j = 0; j < numCols; j++)
row[j] = (row[j] == -1) ? -1 : mapping[row[j]];
}
}
}
/**
 * Sorts positions start (inclusive) to end (exclusive) of tbl into
 * compareRows order, applying every move to tempMapping as well so the two
 * vectors stay parallel.
 *
 * Ranges where start + 10 >= end are handled with an insertion sort; larger
 * ranges are partitioned around a boundary row and the two parts are sorted
 * recursively.
 *
 * @param tbl vector of int[] rows to sort
 * @param tempMapping vector of Integer, parallel to tbl
 * @param mapping row mapping passed through to compareRows
 * @param start first position to sort
 * @param end position just past the last one to sort
 */
private void sortTable(Vector tbl, Vector tempMapping, int[] mapping, int start, int end) {
System.out.print("Sorting (" + start + ", " + end + ")...\r");
// Zero or one element: nothing to do.
if (start + 1 >= end)
return;
else if (start + 10 >= end) {
// Small range: insertion sort.
for (int i = start + 1; i < end; i++) {
int[] row = (int[])tbl.elementAt(i);
Integer tempMap = (Integer)tempMapping.elementAt(i);
int j;
// Shift larger rows one position up until the slot for row is found.
for (j = i - 1; j >= start; j--) {
if (compareRows((int[])tbl.elementAt(j), row, mapping) > 0) {
tbl.setElementAt((int[])tbl.elementAt(j), j + 1);
tempMapping.setElementAt((Integer)tempMapping.elementAt(j), j + 1);
}
else {
tbl.setElementAt(row, j + 1);
tempMapping.setElementAt(tempMap, j + 1);
break;
}
}
// The loop ran off the front: row belongs at the start of the range.
if (j < start) {
tbl.setElementAt(row, start);
tempMapping.setElementAt(tempMap, start);
}
}
}
else {
// Large range: partition around the row at boundaryPos, initially the middle.
int boundaryPos = (start + end) / 2;
int i;
// allTheSame stays true as long as the downward scan has seen no row
// greater than the boundary; firstDifferent records the position of the
// most recently seen such row.
boolean allTheSame = true;
int firstDifferent = 0;
do {
int[] boundary = (int[])tbl.elementAt(boundaryPos);
i = start;
int j = end - 1;
int[] row = null;
byte compResult;
while (i < j) {
// Advance i past rows that compare less than the boundary.
row = (int[])tbl.elementAt(i);
while (i <= j && compareRows(row, boundary, mapping) < 0) {
i++;
row = (int[])tbl.elementAt(i);
}
// Move j down past rows that compare greater than or equal to the boundary.
row = (int[])tbl.elementAt(j);
compResult = compareRows(row, boundary, mapping);
while (i <= j && (compResult >= 0)) {
if (compResult != 0) {
allTheSame = false;
firstDifferent = j;
}
j--;
row = (int[])tbl.elementAt(j);
compResult = compareRows(row, boundary, mapping);
}
// Swap the out-of-place pair in both vectors.
if (i <= j) {
row = (int[])tbl.elementAt(j);
tbl.setElementAt(tbl.elementAt(i), j);
tbl.setElementAt(row, i);
Object temp = tempMapping.elementAt(j);
tempMapping.setElementAt(tempMapping.elementAt(i), j);
tempMapping.setElementAt(temp, i);
}
}
// i never moved past start, so the partition produced no lower part. If no
// greater row was seen the range is left as is; otherwise retry with the
// recorded greater row as the boundary.
if (i <= start) {
if (allTheSame)
return;
else
boundaryPos = firstDifferent;
}
} while (i <= start);
sortTable(tbl, tempMapping, mapping, start, i);
sortTable(tbl, tempMapping, mapping, i, end);
}
}
/**
 * Orders two table rows by their first numCols cells, column by column.
 * Each cell is translated through {@code mapping} before comparison, except
 * that -1 is compared as -1.
 *
 * @param row1 first row
 * @param row2 second row
 * @param mapping translation applied to every cell value other than -1
 * @return -1 if row1 sorts first, 1 if row2 sorts first, 0 if all compared
 *         cells are equal
 */
private byte compareRows(int[] row1, int[] row2, int[] mapping) {
    int col = 0;
    while (col < numCols) {
        int left = row1[col];
        if (left != -1) {
            left = mapping[left];
        }
        int right = row2[col];
        if (right != -1) {
            right = mapping[right];
        }
        if (left != right) {
            return (byte) (left < right ? -1 : 1);
        }
        col++;
    }
    return 0;
}
/**
 * Creates the initial (identity) row index and the per-row records of which
 * cells are populated.
 *
 * Allocates rowIndexFlagsIndex and rowIndexShifts (one entry per row) and
 * fills rowIndexFlags. For a row whose count cell is 1 and whose cell 0 is
 * empty, rowIndexFlagsIndex stores the negated column of its first non-zero
 * cell. For every other row it stores the position in rowIndexFlags of a run
 * of numColGroups ints whose bits mark that row's populated columns.
 *
 * @param tempTable the temporary table of int[] rows
 * @return an array mapping every row number to itself
 */
private int[] buildRowIndex(Vector tempTable) {
int[] tempRowIndex = new int[tempTable.size()];
rowIndexFlagsIndex = new short[tempTable.size()];
Vector tempRowIndexFlags = new Vector();
rowIndexShifts = new byte[tempTable.size()];
// build the row index. Each entry in the row index starts out referring
// to the original row (so it doesn't actually do any mapping), and we set
// up the index flags to show which cells in the row are populated
for (int i = 0; i < tempTable.size(); i++) {
tempRowIndex[i] = i;
int[] row = (int[])tempTable.elementAt(i);
// Exactly one populated cell and it is not cell 0: record the negated
// column number instead of a flag word.
if (row[numCols] == 1 && row[0] == 0) {
int j = 0;
while (row[j] == 0)
++j;
rowIndexFlagsIndex[i] = (short)(-j);
}
else {
// Build one bit per column, 32 columns per int.
int[] flags = new int[numColGroups];
int nextFlag = 1;
int colGroup = 0;
for (int j = 0; j < numCols; j++) {
if (row[j] != 0)
flags[colGroup] |= nextFlag;
nextFlag <<= 1;
// The bit was shifted out of the int: move on to the next group.
if (nextFlag == 0) {
++colGroup;
nextFlag = 1;
}
}
// Look for an existing run in tempRowIndexFlags equal to this row's flags
// so that it can be shared; colGroup counts how many ints have matched.
colGroup = 0;
int j = 0;
while (j < tempRowIndexFlags.size()) {
if (((Integer)tempRowIndexFlags.elementAt(j)).intValue() ==
flags[colGroup]) {
++colGroup;
++j;
if (colGroup >= numColGroups)
break;
}
else if (colGroup != 0)
colGroup = 0;
else
++j;
}
// j - colGroup is where the (possibly partial) match starts; any ints that
// did not match are appended to the end of the list.
rowIndexFlagsIndex[i] = (short)(j - colGroup);
while (colGroup < numColGroups) {
tempRowIndexFlags.addElement(new Integer(flags[colGroup]));
++colGroup;
}
}
}
// Copy the collected flag words into the final int array.
rowIndexFlags = new int[tempRowIndexFlags.size()];
for (int i = 0; i < rowIndexFlags.length; i++)
rowIndexFlags[i] = ((Integer)tempRowIndexFlags.elementAt(i)).intValue();
System.out.println("Number of column groups = " + numColGroups);
System.out.println("Size of rowIndexFlags = " + rowIndexFlags.length);
return tempRowIndex;
}
/**
 * Packs several conceptual rows into one physical row wherever their
 * populated cells can be made not to collide by shifting columns.
 *
 * For each row not yet merged, later rows are tried in order; a later row is
 * copied into the destination at the first column shift at which none of its
 * populated cells lands on an already filled cell. The shift is recorded in
 * rowIndexShifts, merged rows are removed from tempTable, and rowIndex is set
 * to map every original row number to its physical row.
 *
 * @param tempTable the temporary table of int[] rows; modified in place
 */
private void stackRows(Vector tempTable) {
/*
System.out.print("Row:\t");
for (int i = 0; i < numCols; i++)
System.out.print(reverseColumnMap[i] + "\t");
System.out.println();
for (int i = 0; i < tempTable.size(); i++) {
System.out.print(Integer.toString(i) + ":\t");
int[] row = (int[])tempTable.elementAt(i);
for (int j = 0; j < row.length; j++)
if (row[j] != 0) System.out.print(Integer.toString(row[j]) + "\t");
else System.out.print(".\t");
System.out.println();
}
*/
int[] tempRowIndex = buildRowIndex(tempTable);
// tbd[r] is true once row r has been merged into another row.
boolean[] tbd = new boolean[tempTable.size()];
// now we actually go through and stack rows together
for (int i = 0; i < tempTable.size(); i++) {
if (tbd[i])
continue;
System.out.print("Stacking, inspecting row " + i + "...\r");
//System.out.println("Stacking, inspecting row " + i + "...");
int[] destRow = (int[])tempTable.elementAt(i);
boolean[] tempFlags = new boolean[numCols];
boolean[] filledCells = new boolean[numCols];
for (int j = 0; j < numCols; j++)
filledCells[j] = destRow[j] != 0;
// Try later rows until the destination's populated-cell count reaches numCols.
for (int j = i + 1; destRow[numCols] < numCols && j < tempTable.size(); j++) {
if (tbd[j])
continue;
int[] srcRow = (int[])tempTable.elementAt(j);
// Skip rows whose populated cells cannot all fit in the remaining space.
if (srcRow[numCols] + destRow[numCols] > numCols)
continue;
// Work out how far the source row can be shifted left and right while
// keeping all of its populated cells inside the row; -999 means "not set yet".
int maxLeftShift = -999;
int maxRightShift = 0;
for (int k = 0; k < numCols; k++) {
tempFlags[k] = srcRow[k] != 0;
if (tempFlags[k]) {
if (maxLeftShift == -999)
maxLeftShift = -k;
maxRightShift = (numCols - 1) - k;
}
}
// Find the first shift at which no populated source cell hits a filled cell.
int shift;
for (shift = maxLeftShift; shift <= maxRightShift; shift++) {
int k;
for (k = 0; k < numCols; k++) {
if (tempFlags[k] && filledCells[k + shift])
break;
}
if (k >= numCols)
break;
}
// A usable shift was found: copy the source cells in and record the merge.
if (shift <= maxRightShift) {
//System.out.println("Packing row " + j + " into row " + i + " with shift = " + shift);
for (int k = 0; k < numCols; k++) {
if (tempFlags[k]) {
filledCells[k + shift] = true;
destRow[k + shift] = srcRow[k];
++destRow[numCols];
}
}
tbd[j] = true;
tempRowIndex[j] = i;
rowIndexShifts[j] = (byte)shift;
}
}
}
// finally, we squeeze out all the deleted rows
int decrementBy = 0;
for (int i = 0; i < tempRowIndex.length; i++) {
if (!tbd[i])
tempRowIndex[i] -= decrementBy;
else
++decrementBy;
}
// Walk backwards so removals do not disturb the indices still to be visited;
// a merged row takes the renumbered index of the row it was merged into.
rowIndex = new short[tempRowIndex.length];
for (int i = tempRowIndex.length - 1; i >= 0; i--) {
if (tbd[i]) {
rowIndex[i] = (short)(tempRowIndex[tempRowIndex[i]]);
tempTable.removeElementAt(i);
}
else
rowIndex[i] = (short)tempRowIndex[i];
}
}
// private void printTable() {
// short cell;
// int populatedCells = 0;
///*
// System.out.println("Conceptual table:");
// System.out.println(" Row: a b c d e f g h i j k l m n"
// + " o p q r s t u v w x y z ' #");
//
// boolean[] rowPrintFlags = new boolean[rowIndex.length];
// printConceptualTable("", 0, rowPrintFlags);
//*/
//
// System.out.println();
// System.out.println("Conceptual table:");
// System.out.print(" Row:");
// for (int i = 0; i < reverseColumnMap.length; i++) {
// System.out.print(" " + reverseColumnMap[i]);
// }
// for (int i = 0; i < rowIndex.length; i++) {
// System.out.println();
// printNumber(i, 4);
// System.out.print(":");
// for (int j = 0; j < numCols; j++)
// printNumber(at(i, j), 4);
// }
// System.out.println('\n');
//
// System.out.println();
// System.out.println("Internally stored table:");
// System.out.print(" Row:");
// for (int i = 0; i < reverseColumnMap.length; i++) {
// System.out.print(" " + reverseColumnMap[i]);
// }
// for (int i = 0; i < table.length; i++) {
// if (i % numCols == 0) {
// System.out.println();
// printNumber(i / numCols, 4);
// System.out.print(":");
// }
// cell = table[i];
// if (cell != 0)
// populatedCells++;
// printNumber(cell, 4);
// }
// System.out.println('\n');
//
//System.out.println("Row index:");
//for (int i = 0; i < rowIndex.length; i++) {
// System.out.print(" " + i + " -> " + rowIndex[i]);
// if (rowIndexFlagsIndex[i] < 0)
// System.out.print(", flags = " + Integer.toBinaryString((1 << (-rowIndexFlagsIndex[i]))) + " (" + rowIndexFlagsIndex[i]);
// else
// System.out.print(", flags = " + Integer.toBinaryString(rowIndexFlags[rowIndexFlagsIndex[i]]) + " (" + rowIndexFlagsIndex[i]);
// System.out.println("), shift = " + rowIndexShifts[i]);
//}
///*
// int theoreticalMinRows = populatedCells / numCols;
// if (populatedCells % numCols != 0)
// theoreticalMinRows++;
// int oneCellRows = 0;
// for (int i = 0; i < rowIndexFlags.length; i++) {
// double temp = Math.log(rowIndexFlags[i]) / Math.log(2);
// if (temp == (int)temp)
// oneCellRows++;
// }
//
// System.out.println('\n');
// System.out.println("Total words in input = " + totalWords);
// System.out.println("Total unique words = " + uniqueWords + ", comprising " +
// totalUniqueWordChars + " characters\n");
// System.out.println("Number of populated cells = " + populatedCells);
// System.out.println("Total number of cells = " + (table.length));
// System.out.println("Residency = " + ((float)populatedCells / table.length * 100) + '%');
// System.out.println("Ratio of populated cells to unique-word characters = " +
// ((float)populatedCells / totalUniqueWordChars * 100) + '%');
// System.out.println("Ratio of total cells to unique-word characters = " +
// ((float)table.length / totalUniqueWordChars * 100) + '%');
// System.out.println("Number of rows = " + (table.length / numCols));
// System.out.println("Theoretical minimum number of rows = " + theoreticalMinRows);
// System.out.println("Ratio of number of rows to theoretical minimum = " +
// ((float)(table.length / numCols) / theoreticalMinRows * 100) + '%');
// System.out.println("Number of conceptual rows = " + rowIndex.length);
// System.out.println("Conceptual rows with only one populated cell = " + oneCellRows);
// System.out.println("Ratio of one-cell rows to total conceptual rows = " + (((float)oneCellRows)
// / rowIndex.length * 100) + '%');
// System.out.println("Average number of populated cells in multi-cell rows = " +
// ((float)(populatedCells - oneCellRows) / (rowIndex.length - oneCellRows)));
//
// int storageUsed = table.length * 2 + rowIndex.length * 2
// + rowIndexFlags.length * 4 + rowIndexShifts.length;
// System.out.println("Total number of bytes in table (including indexes) = " +
// storageUsed);
// System.out.println("Bytes of overhead per unique-word character = " + ((double)(storageUsed
// - (totalUniqueWordChars * 2)) / totalUniqueWordChars));
//*/
// }
// private void printConceptualTable(String initialString, int state, boolean[] flags) {
// if (initialString.length() == 0)
// System.out.println("root:");
// else
// System.out.println(initialString + ':');
//
// if (!flags[state]) {
// flags[state] = true;
// printNumber(state, 4);
// System.out.print(":");
// for (int i = 0; i < numCols; i++)
// printNumber(at(state, i), 4);
// System.out.println();
// }
//
// int nextState;
// for (int i = 0; i < numCols; i++) {
// nextState = at(state, i);
// if (nextState > 0 && !flags[nextState]) {
// printNumber(nextState, 4);
// System.out.print(":");
// for (int j = 0; j < numCols; j++)
// printNumber(at(nextState, j), 4);
// System.out.println();
// }
// }
// for (int i = 0; i < numCols; i++) {
// nextState = at(state, i);
// if (nextState > 0 && !flags[nextState]) {
// char nextChar;
// if (nextState == 27)
// nextChar = ' ';
// else if (nextState == 26)
// nextChar = '\'';
// else
// nextChar = (char)(i + 'a');
// flags[nextState] = true;
// printConceptualTable(initialString + nextChar, nextState, flags);
// }
// }
// }
/**
 * Recursively walks the compiled table and prints every path that reaches the
 * end-of-word state (-1), both to standard output and, when given, to
 * {@code out}.
 *
 * @param partialWord the characters collected on the path so far
 * @param state the table row to continue from, or -1 when a word is complete
 * @param out optional listing destination; may be null
 * @throws IOException declared for callers; not thrown by the visible code
 */
private void printWordList(String partialWord, int state, PrintWriter out)
        throws IOException {
    if (state != -1) {
        // Follow every populated transition out of this row.
        int col = 0;
        while (col < numCols) {
            short next = at(state, col);
            if (next != 0) {
                printWordList(partialWord + reverseColumnMap[col], next, out);
            }
            col++;
        }
        return;
    }

    System.out.println(partialWord);
    if (out != null) {
        out.println(partialWord);
    }
}
/**
 * Writes the compiled dictionary in binary form and closes the stream.
 *
 * Layout, in order: an int version number (0); then, each preceded by its
 * int length, the column map index array (shorts) and value array (bytes);
 * numCols and numColGroups as ints; then, each preceded by its int length,
 * rowIndex (shorts), rowIndexFlagsIndex (shorts), rowIndexFlags (ints),
 * rowIndexShifts (bytes) and table (shorts).
 *
 * @param out destination stream; always closed before this method returns,
 *            including when a write fails
 * @throws IOException if writing or closing fails
 */
private void writeDictionaryFile(DataOutputStream out) throws IOException {
    try {
        out.writeInt(0); // version number

        char[] columnMapIndexes = columnMap.getIndexArray();
        out.writeInt(columnMapIndexes.length);
        for (int i = 0; i < columnMapIndexes.length; i++) {
            out.writeShort((short)columnMapIndexes[i]);
        }

        byte[] columnMapValues = columnMap.getValueArray();
        out.writeInt(columnMapValues.length);
        for (int i = 0; i < columnMapValues.length; i++) {
            out.writeByte(columnMapValues[i]);
        }

        out.writeInt(numCols);
        out.writeInt(numColGroups);

        out.writeInt(rowIndex.length);
        for (int i = 0; i < rowIndex.length; i++) {
            out.writeShort(rowIndex[i]);
        }

        out.writeInt(rowIndexFlagsIndex.length);
        for (int i = 0; i < rowIndexFlagsIndex.length; i++) {
            out.writeShort(rowIndexFlagsIndex[i]);
        }

        out.writeInt(rowIndexFlags.length);
        for (int i = 0; i < rowIndexFlags.length; i++) {
            out.writeInt(rowIndexFlags[i]);
        }

        out.writeInt(rowIndexShifts.length);
        for (int i = 0; i < rowIndexShifts.length; i++) {
            out.writeByte(rowIndexShifts[i]);
        }

        out.writeInt(table.length);
        for (int i = 0; i < table.length; i++) {
            out.writeShort(table[i]);
        }
    } finally {
        // Close on every path so a failed write does not leak the stream.
        out.close();
    }
}
// private void printNumber(int x, int width) {
// String s = String.valueOf(x);
// if (width > s.length())
// System.out.print(spaces.substring(0, width - s.length()));
// if (x != 0)
// System.out.print(s);
// else
// System.out.print('.');
// }
/**
 * Looks up the table cell for a conceptual row and a character by first
 * translating the character to its column through {@code columnMap}.
 *
 * @param row conceptual row number
 * @param ch character to look up
 * @return the cell value, or 0 if the cell is not populated
 */
public final short at(int row, char ch) {
    return at(row, (int) columnMap.elementAt(ch));
}
/**
 * Looks up the table cell for a conceptual row and column.
 *
 * @param row conceptual row number
 * @param col column number
 * @return the stored cell value, read from the physical row at the shifted
 *         column, or 0 if the cell is not populated
 */
public final short at(int row, int col) {
    if (!cellIsPopulated(row, col)) {
        return 0;
    }
    int physicalRow = rowIndex[row];
    int shiftedCol = col + rowIndexShifts[row];
    return internalAt(physicalRow, shiftedCol);
}
/**
 * Tells whether the given cell of a conceptual row holds a value.
 *
 * A negative {@code rowIndexFlagsIndex} entry is the negated number of the
 * row's only populated column; otherwise the entry is the offset in
 * {@code rowIndexFlags} of the row's flag words, one bit per column.
 *
 * @param row conceptual row number
 * @param col column number
 * @return true if the cell is populated
 */
private final boolean cellIsPopulated(int row, int col) {
    int flagsStart = rowIndexFlagsIndex[row];
    if (flagsStart < 0) {
        return col == -flagsStart;
    }
    int word = rowIndexFlags[flagsStart + (col >> 5)];
    return ((word >>> (col & 0x1f)) & 1) != 0;
}
/**
 * Reads a cell of the flattened physical table, which stores numCols cells
 * per row.
 *
 * @param row physical row number
 * @param col physical column number
 * @return the stored cell value
 */
private final short internalAt(int row, int col) {
    int offset = row * numCols + col;
    return table[offset];
}
// Maps a character to its column number; built by buildColumnMap.
private CompactByteArray columnMap = null;
// Maps a column number back to a character; entry 0 is the space character.
private char[] reverseColumnMap = null;
// Number of columns, including column 0.
private int numCols;
// Number of 32-bit flag words per row: (numCols >> 5) + 1.
private int numColGroups;
// Flattened physical state table, numCols cells per row.
private short[] table = null;
// Maps a conceptual row number to its physical row in table; set by stackRows.
private short[] rowIndex = null;
// Bit flags marking populated columns, indexed through rowIndexFlagsIndex.
private int[] rowIndexFlags = null;
// Per conceptual row: offset into rowIndexFlags, or the negated column number
// when the row was recorded as having a single populated cell.
private short[] rowIndexFlagsIndex = null;
// Per conceptual row: column shift applied when the row was stacked.
private byte[] rowIndexShifts = null;
// Statistics gathered by buildStateTable.
private int totalWords = 0;
private int uniqueWords = 0;
private int totalUniqueWordChars = 0;
//private static final String spaces = " ";
}

View file

@ -1,65 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type"
content="text/html; charset=ISO-8859-1">
<title>README For RBBI Tools</title>
<!-- Copyright (C) 2003-2004, International Business Machines Corporation and
others. All Rights Reserved.
-->
</head>
<body>
<h3>What Are These Tools?</h3>
This directory contains two tools, WriteTablesToFiles, which converts
the Java BreakIterators into .brk files for ICU4C, and
BuildDictionaryFile, which builds the binary Thai word break
dictionary from a Unicode text file containing a list of Thai words.
The rest of this document describes how to use these tools.<br>
<h3>How To Build The ICU4C BreakIterator Files</h3>
The RuleBasedBreakIterator code was originally developed for ICU4J, and
then ported to ICU4C. For various reasons, the code which compiled the
state tables from the rule text was hard to port. Instead the
WriteTablesToFiles tool was written to read in the Java data and write
the .brk files which ICU4C reads. Later the RBBI code was re-written
for ICU4C, including the ability to compile the state tables from rules
stored in text files. This means that the WriteTablesToFiles tool is
now obsolete.<br>
<br>
<h3>How To Build The Thai Word Break Dictionary</h3>
The Thai word break code was developed originally for ICU4J, and then
ported to ICU4C - the dictionary builder tool was never ported, so you
have to use the Java tool to build the dictionary file for ICU4C. On
the other hand, all of the rest of the ICU locale data was developed
originally for
ICU4C, and a tool was written to convert the ICU4C locale data to Java
resource bundles for use by ICU4J. Consequently, the process of
building the Thai
word break dictionary for ICU4C and
ICU4J is a bit convoluted. Here are the steps:<br>
<div style="margin-left: 40px;">
<ol>
<li>Download and build both ICU4C and ICU4J on a <span
style="font-weight: bold;">Big Endian</span> machine.<br>
</li>
<li>Run the following command line to build the Thai dictionary file:<br>
java -classpath $icu4j_root/classes
com.ibm.icu.dev.tool.rbbi.BuildDictionaryFile
$icu4j_root/src/com/ibm/icu/dev/data/thai6.ucs Unicode
$icu_root/source/data/brkitr/thai_dict.brk</li>
<li>Rebuild the ICU4C resources.</li>
<li>Rebuild the ICU4J ICULocaleData.jar file. (See <a
href="../../../../../../../readme.html">the ICU4J readme file</a> for
instructions)</li>
<li>Move ICULocaleData.jar from $icu_root/source/data/locales/java to
$icu4j_root/src/com/ibm/icu/impl/data</li>
<li>Build ICU4J's _resources target to unjar the new files.<br>
</li>
</ol>
</div>
In the above, $icu_root is the root of your ICU4C source tree, for
example
"~/dev/icu" and $icu4j_root is the root of your ICU4J source tree, for
example "~/dev/icu4j".<br>
<br>
</body>
</html>