no message

X-SVN-Rev: 15942
This commit is contained in:
Mark Davis 2004-06-26 00:26:16 +00:00
parent 28015f3710
commit ae721a34d1
5 changed files with 416 additions and 30 deletions

View file

@ -5,22 +5,29 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.15 $
* $Date: 2004/06/26 00:26:16 $
* $Revision: 1.16 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.ReplaceableString;
import com.ibm.icu.text.UnicodeMatcher;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import java.util.*;
@ -250,7 +257,242 @@ public final class GenerateHanTransliterator implements UCD_Types {
static final boolean DO_SIMPLE = true;
static final boolean SKIP_OVERRIDES = true;
public static void main(int typeIn) {
static PrintWriter out2;
/**
 * Writes "Raw_Transliterator_Han_Latin.txt" into {@code UCD_Types.GEN_DIR}: one
 * transliterator rule per distinct pinyin reading, of the form
 * {@code <char or set>><pinyin>;}.
 *
 * For every code point that appears in the kMandarin or kHanyuPinlu sets built below,
 * the kHanyuPinlu reading (text before the first '(') is preferred; otherwise the
 * lower-cased kMandarin reading (text before the first ' ') is used. Code points are
 * grouped by the resulting reading, and the groups are ordered by a "zh" collator.
 * Readings that the collator considers equal share one group, keyed by whichever
 * reading was seen first.
 *
 * @throws IOException if the output file cannot be opened
 */
public static void fixedMandarin() throws IOException {
    UnicodeMap kMandarin = Default.ucd().getHanValue("kMandarin");
    UnicodeMap kHanyuPinlu = Default.ucd().getHanValue("kHanyuPinlu");

    // NOTE(review): getSet(null).complement() is presumably "all code points that
    // have a value in the map" -- confirm against UnicodeMap.getSet.
    UnicodeSet gotMandarin = kMandarin.getSet(null).complement();
    UnicodeSet gotHanyu = kHanyuPinlu.getSet(null).complement();
    UnicodeSet gotAtLeastOne = new UnicodeSet(gotMandarin).addAll(gotHanyu);

    // pinyin reading -> UnicodeSet of characters with that reading
    Map outmap = new TreeMap(Collator.getInstance(new ULocale("zh")));
    for (UnicodeSetIterator it = new UnicodeSetIterator(gotAtLeastOne); it.next(); ) {
        String hanyu = (String) kHanyuPinlu.getValue(it.codepoint);
        String mandarin = (String) kMandarin.getValue(it.codepoint);
        String hPinyin = hanyu == null ? null
                : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu, '('));
        String mPinyin = mandarin == null ? null
                : digitPinyin_accentPinyin.transliterate(getUpTo(mandarin.toLowerCase(), ' '));
        String uPinyin = hPinyin != null ? hPinyin : mPinyin;

        UnicodeSet s = (UnicodeSet) outmap.get(uPinyin);
        if (s == null) {
            s = new UnicodeSet();
            outmap.put(uPinyin, s);
        }
        s.add(it.codepoint);
    }

    String filename = "Raw_Transliterator_Han_Latin.txt";
    PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, filename);
    try {
        for (Iterator it = outmap.entrySet().iterator(); it.hasNext();) {
            Map.Entry entry = (Map.Entry) it.next();
            String pinyin = (String) entry.getKey();
            UnicodeSet uset = (UnicodeSet) entry.getValue();
            if (uset.size() == 1) {
                // a single character is written literally rather than as a set pattern
                UnicodeSetIterator usi = new UnicodeSetIterator(uset);
                usi.next();
                out.println(UTF16.valueOf(usi.codepoint) + ">" + pinyin + ";");
            } else {
                out.println(uset.toPattern(false) + ">" + pinyin + ";");
            }
        }
    } finally {
        // release the file even if writing a rule fails
        out.close();
    }
}
/**
 * Orders {@code Pair} objects by their {@code first} members using one comparator,
 * falling back to a second comparator on the {@code second} members when the
 * first members compare as equal.
 */
public static class PairComparator implements Comparator {
    Comparator first;
    Comparator second;

    PairComparator(Comparator first, Comparator second) {
        this.first = first;
        this.second = second;
    }

    /**
     * @param o1 a Pair
     * @param o2 a Pair
     * @return the primary comparison if non-zero, otherwise the secondary comparison
     */
    public int compare(Object o1, Object o2) {
        Pair left = (Pair) o1;
        Pair right = (Pair) o2;
        int primary = first.compare(left.first, right.first);
        return primary != 0
                ? primary
                : second.compare(left.second, right.second);
    }
}
/**
 * Diagnostic dump comparing Mandarin readings from several sources.
 *
 * Reads two tab-separated "gcl" dictionary files from a hard-coded local directory,
 * the Unihan kMandarin / kHanyuPinlu values, and the ICU "han-latin" transliterator.
 * For each code point with a kMandarin or kHanyuPinlu entry it:
 * <ul>
 * <li>records a preferred reading in {@code reformed}, in the priority order
 *     kHanyuPinlu, gcl, kMandarin, ICU;</li>
 * <li>logs a row to "Mandarin_First.txt" when a gcl reading exists and differs from
 *     the Unihan-derived reading.</li>
 * </ul>
 * Finally it writes "ICU_Pinyin_Sort.txt" via {@code printSortedChars}, flagging places
 * where the zh/PINYIN collation order disagrees with a pinyin-letter ordering.
 *
 * The static {@code log} writer is reassigned here and is closed before returning,
 * including when an exception is thrown.
 *
 * @throws Exception on I/O failure or if the collation rules fail to parse
 */
public static void quickMandarin() throws Exception {
    UnicodeMap gcl = new UnicodeMap();
    addField("C:\\DATA\\dict\\", "gcl_icu.txt", 2, 3, gcl);
    addField("C:\\DATA\\dict\\", "gcl_other.txt", 2, 5, gcl);

    Transliterator icuPinyin = Transliterator.getInstance("han-latin");

    UnicodeMap kMandarin = Default.ucd().getHanValue("kMandarin");
    UnicodeMap kHanyuPinlu = Default.ucd().getHanValue("kHanyuPinlu");
    UnicodeSet gotMandarin = kMandarin.getSet(null).complement();
    UnicodeSet gotHanyu = kHanyuPinlu.getSet(null).complement();
    UnicodeSet gotAtLeastOne = new UnicodeSet(gotMandarin).addAll(gotHanyu);
    int counter = 0;
    int hCount = 0;
    log = Utility.openPrintWriter("Mandarin_First.txt", Utility.UTF8_WINDOWS);
    try {
        log.println("N\tCode\tChar\tUnihan\tICU\tGCL\tkHanyuPinlu / kMandarin");
        UnicodeMap reformed = new UnicodeMap();

        for (UnicodeSetIterator it = new UnicodeSetIterator(gotAtLeastOne); it.next(); ) {
            String code = UTF16.valueOf(it.codepoint);
            String hanyu = (String) kHanyuPinlu.getValue(it.codepoint);
            String mandarin = (String) kMandarin.getValue(it.codepoint);
            String hPinyin = hanyu == null ? null
                    : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu, '('));
            String mPinyin = mandarin == null ? null
                    : digitPinyin_accentPinyin.transliterate(getUpTo(mandarin.toLowerCase(), ' '));
            String uPinyin = hPinyin != null ? hPinyin : mPinyin;

            // an unchanged result means the ICU transliterator has no reading for it
            String iPinyin = icuPinyin.transliterate(code).trim();
            if (iPinyin.equals(code)) iPinyin = null;

            String gPinyin = (String) gcl.getValue(it.codepoint);

            // preferred reading: kHanyuPinlu, then gcl, then kMandarin, then ICU
            if (hPinyin != null) reformed.put(it.codepoint, hPinyin);
            else if (gPinyin != null) reformed.put(it.codepoint, gPinyin);
            else if (mPinyin != null) reformed.put(it.codepoint, mPinyin);
            else if (iPinyin != null) reformed.put(it.codepoint, iPinyin);

            if (gPinyin != null && !gPinyin.equals(uPinyin)) {
                // the ICU column is left blank when it agrees with gcl
                log.println((++counter) + "\t" + Utility.hex(it.codepoint) + "\t" + code
                        + "\t" + (uPinyin == null ? "" : uPinyin)
                        + "\t" + (iPinyin == null ? "" : iPinyin.equals(gPinyin) ? "" : iPinyin)
                        + "\t" + (gPinyin == null ? "" : gPinyin)
                        + "\t" + (hanyu == null ? "" : hanyu + " / ")
                        + (mandarin == null ? "" : mandarin)
                        );
                if (hanyu != null) hCount++;
                continue;
            }

            // The Unihan-vs-ICU comparison below is switched off by this constant
            // condition; it is kept so it can be re-enabled.
            if (true) continue;
            if (isEqualOrNull(uPinyin, iPinyin)) continue;
            log.println((++counter) + "\t" + Utility.hex(it.codepoint) + "\t" + code
                    + "\t" + (uPinyin == null ? "" : uPinyin)
                    + "\t" + (iPinyin == null ? "" : iPinyin)
                    + "\t" + (gPinyin == null ? "" : gPinyin)
                    + "\t" + (hanyu == null ? "" : hanyu + " / ")
                    + (mandarin == null ? "" : mandarin)
                    );
        }
        log.println("kHanyuPinlu count: " + hCount);

        Collator col = Collator.getInstance(new Locale("zh","","PINYIN"));
        UnicodeSet tailored = col.getTailoredSet().addAll(gotAtLeastOne);

        // Orders each vowel's tone-marked forms (macron, acute, caron, grave, plain)
        // ahead of the plain letter. The "e" row ends with "e <<< E"; it previously
        // read "e <<< A", which re-tailored "A" after "e".
        Collator pinyinCollator = new RuleBasedCollator(
                "&[before 1] a < \u0101 <<< \u0100 << \u00E1 <<< \u00C1 << \u01CE <<< \u01CD << \u00E0 <<< \u00C0 << a <<< A" +
                "&[before 1] e < \u0113 <<< \u0112 << \u00E9 <<< \u00C9 << \u011B <<< \u011A << \u00E8 <<< \u00C8 << e <<< E" +
                "&[before 1] i < \u012B <<< \u012A << \u00ED <<< \u00CD << \u01D0 <<< \u01CF << \u00EC <<< \u00CC << i <<< I" +
                "&[before 1] o < \u014D <<< \u014C << \u00F3 <<< \u00D3 << \u01D2 <<< \u01D1 << \u00F2 <<< \u00D2 << o <<< O" +
                "&[before 1] u < \u016B <<< \u016A << \u00FA <<< \u00DA << \u01D4 <<< \u01D3 << \u00F9 <<< \u00D9 << u <<< U" +
                " << \u01D6 <<< \u01D5 << \u01D8 <<< \u01D7 << \u01DA <<< \u01D9 << \u01DC <<< \u01DB << \u00FC");

        printSortedChars("ICU_Pinyin_Sort.txt", col, tailored, reformed, kHanyuPinlu, kMandarin, pinyinCollator);
        /*
        MultiComparator mcol = new MultiComparator(new Comparator[] {
                new UnicodeMapComparator(reformed, pinyinCollator), col});
        printSortedChars("ICU_Pinyin_Sort2.txt", mcol, tailored);
        */
    } finally {
        // always release the log file, even when a step above throws
        log.close();
    }
}
/**
 * Compares two strings by the values a UnicodeMap assigns to their first code
 * points, delegating to a supplied comparator when both values are present.
 * A missing (null) value sorts before any present value; two missing values are equal.
 */
static class UnicodeMapComparator implements Comparator {
    UnicodeMap map;
    Comparator comp;

    UnicodeMapComparator(UnicodeMap map, Comparator comp) {
        this.map = map;
        this.comp = comp;
    }

    /**
     * @param o1 a non-empty String
     * @param o2 a non-empty String
     * @return 0 if neither first code point is mapped, -1 / 1 if only the second / first
     *         is mapped, otherwise the delegate comparator's result on the mapped values
     */
    public int compare(Object o1, Object o2) {
        int leftCp = UTF16.charAt((String) o1, 0);
        int rightCp = UTF16.charAt((String) o2, 0);
        Object leftValue = map.getValue(leftCp);
        Object rightValue = map.getValue(rightCp);

        if (leftValue != null && rightValue != null) {
            return comp.compare(leftValue, rightValue);
        }
        if (leftValue == null && rightValue == null) {
            return 0;
        }
        return leftValue == null ? -1 : 1;
    }
}
/**
 * Chains several comparators: the first one that reports a difference decides
 * the result.
 */
static class MultiComparator implements Comparator {
    private Comparator[] comparators;

    public MultiComparator (Comparator[] comparators) {
        this.comparators = comparators;
    }

    /**
     * Lexicographic compare over the comparator list.
     * @return zero if every comparator reports equality. Otherwise +/- (i+1),
     * where i is the index of the first comparator finding a difference and the
     * sign is the sign of that comparator's result.
     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
     */
    public int compare(Object arg0, Object arg1) {
        int level = 0;
        while (level < comparators.length) {
            int diff = comparators[level].compare(arg0, arg1);
            ++level; // level is now the 1-based index of the comparator just used
            if (diff != 0) {
                return diff > 0 ? level : -level;
            }
        }
        return 0;
    }
}
/**
 * Writes the characters of {@code tailored}, sorted by {@code col}, one per line as
 * {@code hex <TAB> char <TAB> info}, where info is the character's value in {@code map}
 * (empty if none, or if {@code map} is null).
 *
 * Whenever {@code p2} says the previous line's value sorts after the current one, the
 * info column is extended with {@code previous > current} followed by the bracketed
 * {@code hanyu} and {@code mand} values of the previous and current characters, so
 * that out-of-order neighbours are easy to spot.
 *
 * @param file     output file name, opened via Utility.openPrintWriter
 * @param col      ordering used for the output lines
 * @param tailored characters to list
 * @param map      per-character value shown in the info column; may be null
 * @param hanyu    source of the first bracketed value in an out-of-order note
 * @param mand     source of the second bracketed value in an out-of-order note
 * @param p2       ordering used to detect out-of-order neighbouring values
 * @throws IOException if the output file cannot be opened
 */
private static void printSortedChars(String file, Comparator col, UnicodeSet tailored,
        UnicodeMap map, UnicodeMap hanyu, UnicodeMap mand, Comparator p2)
        throws IOException {
    Set set = new TreeSet(col);
    PrintWriter pw = Utility.openPrintWriter(file, Utility.UTF8_WINDOWS);
    try {
        for (UnicodeSetIterator it = new UnicodeSetIterator(tailored); it.next(); ) {
            set.add(UTF16.valueOf(it.codepoint));
        }

        String lastm = "";
        String lasts = "";
        for (Iterator it2 = set.iterator(); it2.hasNext(); ) {
            String s = (String) it2.next();
            int cp = UTF16.charAt(s, 0);
            String m = map == null ? null : (String) map.getValue(cp);
            if (m == null) m = "";
            String info = m;

            if (p2.compare(lastm, m) > 0) {
                info = info + "\t" + lastm + " > " + m + "\t";
                Object temp;
                // On the first line there is no previous character; skip its lookups
                // instead of indexing into the empty string.
                if (lasts.length() != 0) {
                    int lastCp = UTF16.charAt(lasts, 0);
                    temp = hanyu.getValue(lastCp);
                    if (temp != null) info += "[" + temp + "]";
                    temp = mand.getValue(lastCp);
                    if (temp != null) info += "[" + temp + "]";
                }
                info += " > ";
                temp = hanyu.getValue(cp);
                if (temp != null) info += "[" + temp + "]";
                temp = mand.getValue(cp);
                if (temp != null) info += "[" + temp + "]";
            }

            pw.println(Utility.hex(s) + "\t" + s + "\t" + info);
            lastm = m;
            lasts = s;
        }
    } finally {
        // release the file even if sorting or a map lookup throws
        pw.close();
    }
}
/**
 * Loads one column of a tab-separated UTF-8 file into a UnicodeMap.
 *
 * Each line is trimmed; a leading U+FEFF (BOM) is removed; empty lines and lines
 * starting with '#' are skipped. The remaining lines are split on tab, the field at
 * {@code hexCodeFieldNumber} is parsed as a hexadecimal code point, and the field at
 * {@code valueNumber} is stored as that code point's value.
 *
 * @param dir                directory passed to BagFormatter.openUTF8Reader
 * @param file               file name within {@code dir}
 * @param hexCodeFieldNumber zero-based index of the field holding the hex code point
 * @param valueNumber        zero-based index of the field holding the value
 * @param result             map that receives the (code point, value) entries
 * @throws IOException if the file cannot be opened or read
 */
static void addField(String dir, String file, int hexCodeFieldNumber, int valueNumber, UnicodeMap result) throws IOException {
    BufferedReader br = BagFormatter.openUTF8Reader(dir, file);
    try {
        while (true) {
            String line = br.readLine();
            if (line == null) break;
            line = line.trim();
            if (line.startsWith("\uFEFF")) line = line.substring(1);
            if (line.length() == 0 || line.startsWith("#")) continue;
            String[] pieces = Utility.split(line,'\t');
            result.put(Integer.parseInt(pieces[hexCodeFieldNumber], 16), pieces[valueNumber]);
        }
    } finally {
        // a malformed line (bad hex, too few fields) must not leak the reader
        br.close();
    }
}
/**
 * Lenient equality: a null on either side is treated as "no disagreement".
 *
 * @return true if either argument is null, or if both are non-null and equal
 */
static boolean isEqualOrNull(String a, String b) {
    return a == null || b == null || a.equals(b);
}
/**
 * Returns the part of {@code s} that precedes the first occurrence of {@code ch}.
 *
 * @param s  the string to cut; must not be null
 * @param ch the delimiter character
 * @return the prefix before the first {@code ch}, or {@code s} itself if
 *         {@code ch} does not occur
 */
public static String getUpTo(String s, char ch) {
    int cut = s.indexOf(ch);
    return cut >= 0 ? s.substring(0, cut) : s;
}
public static void main(int typeIn) throws IOException {
if (typeIn == CHINESE) {
fixedMandarin();
return;
}
type = typeIn;
try {
@ -298,7 +540,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
log.println();
log.println("@Unihan Data");
log.println();
out2 = BagFormatter.openUTF8Writer(GEN_DIR, "unihan_kmandarinDump.txt");
readUnihanData(key);
out2.close();
if (false) {
readCDICT();
@ -1796,6 +2042,8 @@ Bad pinyin data: \u4E7F ? LE
static Map cdict = new TreeMap();
static Map simplifiedToTraditional = new HashMap();
static Map traditionalToSimplified = new HashMap();
static UnicodeMap kHanyuPinlu = new UnicodeMap();
static void readUnihanData(String key) throws java.io.IOException {
@ -1833,7 +2081,16 @@ Bad pinyin data: \u4E7F ? LE
traditionalToSimplified.put(UTF16.valueOf(code), propertyValue);
}
if (property.equals(key) || key.equals("kJapaneseOn") && property.equals("kJapaneseKun")) {
if (key.equals("kMandarin") && property.equals("kHanyuPinlu")) {
// U+64D4 kHanyuPinlu dan1(297), dan4(61), dan5(36)
String[] piece = Utility.split(propertyValue,'(');
String pinyin = digitToPinyin(piece[0], line);
log.println(scode + "\t" + pinyin + "\t" + line);
kHanyuPinlu.put(Integer.parseInt(scode,16), pinyin);
}
if (property.equals(key)
|| key.equals("kJapaneseOn") && property.equals("kJapaneseKun")
) {
storeDef(out, code, propertyValue, line);
}
}
@ -1885,6 +2142,7 @@ Bad pinyin data: \u4E7F ? LE
definition = definition.substring(0, end3);
definition = digitToPinyin(definition, line);
out2.println(Utility.hex(cp) + '\t' + UTF16.valueOf(cp) + "\t" + definition.toLowerCase());
}
if (type == DEFINITION) {
definition = removeMatched(definition,'(', ')', line);

View file

@ -13,6 +13,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
@ -968,35 +969,68 @@ public class MakeUnicodeFiles {
}
}
public static void showMatches() throws IOException {
/**
 * Writes property-to-property comparisons to "propertyDifference.txt" in
 * UCD_Types.GEN_DIR.
 *
 * First compares three fixed property pairs for version "4.0.1", then every ordered
 * pair (prop1, prop2) of distinct binary or enumerated properties reported by the
 * property factory, skipping pairs whose second property is in the skip list.
 * Any exception raised while comparing is printed to standard error and not
 * rethrown; the output file is closed in all cases.
 *
 * @throws IOException if the output file cannot be opened
 */
public static void showAllDiff() throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "propertyDifference.txt");
try {
// fixed comparisons by property name
showDifferences(out, "4.0.1", "LB", "GC");
showDifferences(out, "4.0.1", "East Asian Width", "LB");
showDifferences(out, "4.0.1", "East Asian Width", "GC");
UnicodeProperty.Factory fac = ToolUnicodePropertySource.make("4.0.1");
// bit mask selecting the property types to include; CATALOG is deliberately left out
List props = fac.getAvailableNames(
(1<<UnicodeProperty.BINARY)
| (1<<UnicodeProperty.ENUMERATED)
//| (1<<UnicodeProperty.CATALOG)
);
// properties never used as the second (right-hand) side of a comparison
Set skipList = new HashSet();
skipList.add("Age");
skipList.add("Joining_Group");
skipList.add("Canonical_Combining_Class");
for (Iterator it = props.iterator(); it.hasNext();) {
String prop1 = (String) it.next();
for (Iterator it2 = props.iterator(); it2.hasNext();) {
String prop2 = (String) it2.next();
if (prop1.equals(prop2)) continue;
// note: the skip list is checked against prop2 only, not prop1
if (skipList.contains(prop2)) continue;
// progress message on the console
System.out.println(prop1 + " vs. " + prop2);
showDifferences(out, fac.getProperty(prop1), fac.getProperty(prop2), false);
// flush after each pair so partial results survive an abort
out.flush();
}
}
} catch (Exception e) {
// failures are reported on stderr only; the method still returns normally
e.printStackTrace();
} finally {
out.close();
}
}
static NumberFormat nf = NumberFormat.getIntegerInstance(Locale.ENGLISH);
/**
 * Convenience overload: looks up two properties by name in the given Unicode
 * version and compares them with overlap reporting enabled.
 *
 * @param out     destination for the comparison report
 * @param version Unicode version string passed to ToolUnicodePropertySource.make
 * @param prop1   name of the first property
 * @param prop2   name of the second property
 * @throws IOException propagated from the property-based overload
 */
static void showDifferences(PrintWriter out, String version, String prop1, String prop2) throws IOException {
    showDifferences(
            out,
            ToolUnicodePropertySource.make(version).getProperty(prop1),
            ToolUnicodePropertySource.make(version).getProperty(prop2),
            true);
}
static void showDifferences(PrintWriter out, UnicodeProperty p1, UnicodeProperty p2, boolean doOverlaps) throws IOException {
BagFormatter bf = new BagFormatter();
out.println("Comparing " + p1.getName() + " and " + p2.getName());
//out.println("Comparing " + p1.getName() + " and " + p2.getName());
System.out.println("Comparing " + p1.getName() + " and " + p2.getName());
String pn1 = '$' + p1.getName();
String pn2 = '$' + p2.getName();
UnicodeSet intersection = new UnicodeSet();
UnicodeSet disjoint = new UnicodeSet();
String skip1 = p1.getValue(0xEFFFD);
String skip2 = p2.getValue(0xEFFFD);
main:
for (Iterator it1 = p1.getAvailableValues().iterator(); it1.hasNext();) {
String v1 = (String)it1.next();
if (v1.equals(skip1)) continue;
UnicodeSet s1 = p1.getSet(v1);
v1 += " (" + p1.getFirstValueAlias(v1) + ")";
if (s1.size() == 0) continue;
String pv1 = pn1 + (v1.equals("True") ? "" : ":" + v1);
//v1 += " (" + p1.getFirstValueAlias(v1) + ")";
System.out.println(v1);
out.println();
out.println(v1 + " [" + nf.format(s1.size()) + "]");
//out.println();
//out.println(v1 + " [" + nf.format(s1.size()) + "]");
// create some containers so that the output is organized reasonably
String contains = "";
@ -1005,22 +1039,25 @@ public class MakeUnicodeFiles {
Set overlapsSet = new TreeSet();
for (Iterator it2 = p2.getAvailableValues().iterator(); it2.hasNext();) {
String v2 = (String)it2.next();
if (v2.equals(skip2)) continue;
UnicodeSet s2 = p2.getSet(v2);
if (s2.size() == 0) continue;
// v2 += "(" + p2.getFirstValueAlias(v2) + ")";
v2 = p2.getFirstValueAlias(v2);
//v2 = p2.getFirstValueAlias(v2);
String pv2 = pn2 + (v2.equals("True") ? "" : ":" + v2);
if (s1.containsNone(s2)) continue;
if (s1.equals(s2)) {
out.println("\t= " + v2);
out.println(pv1 + "\t= " + pv2);
continue main; // since they are partitions, we can stop here
} else if (s2.containsAll(s1)) {
out.println("\t\u2282 " + v2 + " [" + nf.format(s2.size()) + "]");
// out.println(pv1 + "\t\u2282 " + pv2);
continue main; // partition, stop
} else if (s1.containsAll(s2)) {
if (contains.length() != 0) contains += " \u222a ";
contains += v2 + " [" + nf.format(s2.size()) + "]";
if (contains.length() != 0) contains += " ";
contains += pv2;
containsSet.addAll(s2);
if (containsSet.size() == s1.size()) break;
} else { // doesn't contain, isn't contained
} else if (doOverlaps) { // doesn't contain, isn't contained
if (overlaps.length() != 0) overlaps += "\r\n\t";
intersection.clear().addAll(s2).retainAll(s1);
disjoint.clear().addAll(s1).removeAll(s2);
@ -1030,7 +1067,8 @@ public class MakeUnicodeFiles {
}
}
if (contains.length() != 0) {
out.println((containsSet.size() == s1.size() ? "\t= " : "\t\u2283 ") + contains);
out.println(pv1 + (containsSet.size() == s1.size() ? "\t= "
: "\t\u2283 ") + "[" + contains + "]");
}
if (overlaps.length() != 0) out.println("\t" + overlaps);
if (false && overlapsSet.size() != 0) {
@ -1152,7 +1190,7 @@ public class MakeUnicodeFiles {
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
static void testInvariants() throws IOException {
public static void testInvariants() throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
out.write('\uFEFF'); // BOM
BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2004/03/11 19:03:16 $
* $Revision: 1.33 $
* $Date: 2004/06/26 00:26:16 $
* $Revision: 1.34 $
*
*******************************************************************************
*/
@ -480,6 +480,41 @@ public final class UCD implements UCD_Types {
byte numericType;
}
/**
 * Reads the "Unihan" data file for this UCD's version and collects the values of
 * one property.
 *
 * Lines shorter than 6 characters or starting with '#' are skipped. Every other line
 * is expected to hold at least two tabs: code, property name, value. Lines whose
 * property name matches {@code propertyName} (ignoring case) contribute one entry.
 * The code is parsed as hexadecimal after dropping the first two characters of the
 * line (presumably the "U+" prefix -- confirm against the Unihan file format).
 *
 * @param propertyName Unihan property to extract, e.g. "kMandarin"
 * @return map from code point to the trimmed property value
 * @throws ChainException wrapping any failure while opening, reading or parsing
 */
public UnicodeMap getHanValue(String propertyName) {
    UnicodeMap result = new UnicodeMap();
    try {
        BufferedReader in = Utility.openUnicodeFile("Unihan", version, true, Utility.UTF8);
        try {
            int lineCounter = 0;
            while (true) {
                Utility.dot(++lineCounter); // progress indicator
                String line = in.readLine();
                if (line == null) break;
                if (line.length() < 6) continue;
                if (line.charAt(0) == '#') continue;
                line = line.trim();

                int tabPos = line.indexOf('\t');
                int tabPos2 = line.indexOf('\t', tabPos+1);

                String property = line.substring(tabPos+1, tabPos2).trim();
                if (!property.equalsIgnoreCase(propertyName)) continue;

                String scode = line.substring(2, tabPos).trim();
                int code = Integer.parseInt(scode, 16);
                String propertyValue = line.substring(tabPos2+1).trim();
                result.put(code, propertyValue);
            }
        } finally {
            // close the file on parse failures too, not only on the success path
            in.close();
        }
    } catch (Exception e) {
        throw new ChainException("Han File Processing Exception", null, e);
    } finally {
        Utility.fixDot();
    }
    return result;
}
void populateHanExceptions() {
hanExceptions = new IntMap();
BufferedReader in = null;

View file

@ -88,3 +88,5 @@ $Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]
# Testing
$script:greek = $×script:greek
$gc:lm = $script:inherited

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2004/04/10 16:49:19 $
* $Revision: 1.42 $
* $Date: 2004/06/26 00:26:16 $
* $Revision: 1.43 $
*
*******************************************************************************
*/
@ -17,6 +17,7 @@ import java.util.*;
import java.text.*;
import java.io.*;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.Replaceable;
@ -462,18 +463,22 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
* Splits a string containing divider into pieces, storing in output
* and returns the number of pieces.
*/
public static int split(String s, char divider, String[] output) {
public static int split(String s, char divider, String[] output, boolean trim) {
try {
int last = 0;
int current = 0;
int i;
for (i = 0; i < s.length(); ++i) {
if (s.charAt(i) == divider) {
output[current++] = s.substring(last,i);
String temp = s.substring(last,i);
if (trim) temp = temp.trim();
output[current++] = temp;
last = i+1;
}
}
output[current++] = s.substring(last,i);
String temp = s.substring(last,i);
if (trim) temp = temp.trim();
output[current++] = temp;
int result = current;
while (current < output.length) {
output[current++] = "";
@ -484,9 +489,16 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
}
}
public static String[] split(String s, char divider) {
/**
 * Splits {@code s} at every occurrence of {@code divider} without trimming
 * the pieces.
 *
 * @return the pieces, in order
 */
public static String[] split(String s, char divider) {
    final boolean trimPieces = false;
    return split(s, divider, trimPieces);
}
/**
 * Splits {@code s} at every occurrence of {@code divider} into {@code output}
 * without trimming the pieces.
 *
 * @return the value returned by the four-argument overload (the number of pieces)
 */
public static int split(String s, char divider, String[] output) {
    final boolean trimPieces = false;
    return split(s, divider, output, trimPieces);
}
public static String[] split(String s, char divider, boolean trim) {
String[] result = new String[100]; // HACK
int count = split(s, divider, result);
int count = split(s, divider, result, trim);
return extract(result, 0, count);
}
@ -1209,4 +1221,45 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
return (isSeparateLineIDN(start, ucd) || isSeparateLineIDN(end, ucd));
}
/**
 * Builds a rule-based Transliterator from a UTF-8 rules file.
 *
 * The file is read line by line with comment handling disabled, a leading U+FEFF
 * (BOM) is removed from any line that starts with one, each line is optionally run
 * through {@code pretrans}, and the lines are joined with CR LF. The transliterator
 * ID is the file name with everything from its last '.' onward removed.
 *
 * @param fileName  path of the rules file; also the basis of the transliterator ID
 * @param direction direction constant passed to Transliterator.createFromRules
 * @param pretrans  transliterator applied to every line before it is added to the
 *                  rules, or null to use the lines as read
 * @return the transliterator created from the assembled rules
 * @throws IOException if the file cannot be opened or read
 */
public static Transliterator createFromFile(String fileName, int direction, Transliterator pretrans) throws IOException {
    StringBuffer buffer = new StringBuffer();
    FileLineIterator fli = new FileLineIterator();
    fli.open(fileName, Utility.UTF8);
    try {
        fli.commentChar = FileLineIterator.NOTCHAR; // disable comments
        while (true) {
            String line = fli.read();
            if (line == null) break;
            if (line.startsWith("\uFEFF")) line = line.substring(1);
            if (pretrans != null) line = pretrans.transliterate(line);
            buffer.append(line);
            buffer.append("\r\n"); // separate with whitespace
        }
    } finally {
        // release the file even if reading or pre-transliteration fails
        fli.close();
    }

    // Transform file name into id
    String id = fileName;
    int pos = id.lastIndexOf('.');
    if (pos >= 0) id = id.substring(0, pos);
    return Transliterator.createFromRules(id, buffer.toString(), direction);
}
}