mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
no message
X-SVN-Rev: 15942
This commit is contained in:
parent
28015f3710
commit
ae721a34d1
5 changed files with 416 additions and 30 deletions
|
@ -5,22 +5,29 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
|
||||
* $Date: 2004/04/17 18:21:39 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2004/06/26 00:26:16 $
|
||||
* $Revision: 1.16 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.RuleBasedCollator;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.Replaceable;
|
||||
import com.ibm.icu.text.ReplaceableString;
|
||||
import com.ibm.icu.text.UnicodeMatcher;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
|
||||
import java.util.*;
|
||||
|
@ -250,7 +257,242 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
static final boolean DO_SIMPLE = true;
|
||||
static final boolean SKIP_OVERRIDES = true;
|
||||
|
||||
public static void main(int typeIn) {
|
||||
static PrintWriter out2;
|
||||
|
||||
public static void fixedMandarin() throws IOException {
|
||||
UnicodeMap kMandarin = Default.ucd().getHanValue("kMandarin");
|
||||
UnicodeMap kHanyuPinlu = Default.ucd().getHanValue("kHanyuPinlu");
|
||||
UnicodeSet gotMandarin = kMandarin.getSet(null).complement();
|
||||
UnicodeSet gotHanyu = kHanyuPinlu.getSet(null).complement();
|
||||
UnicodeSet gotAtLeastOne = new UnicodeSet(gotMandarin).addAll(gotHanyu);
|
||||
Map outmap = new TreeMap(Collator.getInstance(new ULocale("zh")));
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(gotAtLeastOne); it.next(); ) {
|
||||
//String code = UTF16.valueOf(it.codepoint);
|
||||
String hanyu = (String) kHanyuPinlu.getValue(it.codepoint);
|
||||
String mandarin = (String) kMandarin.getValue(it.codepoint);
|
||||
String hPinyin = hanyu == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu,'('));
|
||||
String mPinyin = mandarin == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(mandarin.toLowerCase(),' '));
|
||||
String uPinyin = hPinyin != null ? hPinyin : mPinyin;
|
||||
UnicodeSet s = (UnicodeSet) outmap.get(uPinyin);
|
||||
if (s == null) {
|
||||
s = new UnicodeSet();
|
||||
outmap.put(uPinyin, s);
|
||||
}
|
||||
s.add(it.codepoint);
|
||||
}
|
||||
String filename = "Raw_Transliterator_Han_Latin.txt";
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, filename);
|
||||
for (Iterator it = outmap.keySet().iterator(); it.hasNext();) {
|
||||
String pinyin = (String) it.next();
|
||||
UnicodeSet uset = (UnicodeSet) outmap.get(pinyin);
|
||||
if (uset.size() == 1) {
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator(uset);
|
||||
usi.next();
|
||||
out.println(UTF16.valueOf(usi.codepoint) + ">" + pinyin + ";");
|
||||
} else {
|
||||
out.println(uset.toPattern(false) + ">" + pinyin + ";");
|
||||
}
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
|
||||
public static class PairComparator implements Comparator {
|
||||
Comparator first;
|
||||
Comparator second;
|
||||
PairComparator(Comparator first, Comparator second) {
|
||||
this.first = first;
|
||||
this.second = second;
|
||||
}
|
||||
public int compare(Object o1, Object o2) {
|
||||
Pair p1 = (Pair)o1;
|
||||
Pair p2 = (Pair)o2;
|
||||
int result = first.compare(p1.first, p2.first);
|
||||
if (result != 0) return result;
|
||||
return second.compare(p1.second, p2.second);
|
||||
}
|
||||
}
|
||||
|
||||
public static void quickMandarin() throws Exception {
|
||||
UnicodeMap gcl = new UnicodeMap();
|
||||
addField("C:\\DATA\\dict\\", "gcl_icu.txt", 2, 3, gcl);
|
||||
addField("C:\\DATA\\dict\\", "gcl_other.txt", 2, 5, gcl);
|
||||
Transliterator icuPinyin = Transliterator.getInstance("han-latin");
|
||||
UnicodeMap kMandarin = Default.ucd().getHanValue("kMandarin");
|
||||
UnicodeMap kHanyuPinlu = Default.ucd().getHanValue("kHanyuPinlu");
|
||||
UnicodeSet gotMandarin = kMandarin.getSet(null).complement();
|
||||
UnicodeSet gotHanyu = kHanyuPinlu.getSet(null).complement();
|
||||
UnicodeSet gotAtLeastOne = new UnicodeSet(gotMandarin).addAll(gotHanyu);
|
||||
int counter = 0;
|
||||
int hCount = 0;
|
||||
log = Utility.openPrintWriter("Mandarin_First.txt", Utility.UTF8_WINDOWS);
|
||||
log.println("N\tCode\tChar\tUnihan\tICU\tGCL\tkHanyuPinlu / kMandarin");
|
||||
UnicodeMap reformed = new UnicodeMap();
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(gotAtLeastOne); it.next(); ) {
|
||||
String code = UTF16.valueOf(it.codepoint);
|
||||
String hanyu = (String) kHanyuPinlu.getValue(it.codepoint);
|
||||
String mandarin = (String) kMandarin.getValue(it.codepoint);
|
||||
String hPinyin = hanyu == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu,'('));
|
||||
String mPinyin = mandarin == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(mandarin.toLowerCase(),' '));
|
||||
String uPinyin = hPinyin != null ? hPinyin : mPinyin;
|
||||
|
||||
String iPinyin = icuPinyin.transliterate(code).trim();
|
||||
if (iPinyin.equals(code)) iPinyin = null;
|
||||
String gPinyin = (String) gcl.getValue(it.codepoint);
|
||||
|
||||
if (hPinyin != null) reformed.put(it.codepoint, hPinyin);
|
||||
else if (gPinyin != null) reformed.put(it.codepoint, gPinyin);
|
||||
else if (mPinyin != null) reformed.put(it.codepoint, mPinyin);
|
||||
else if (iPinyin != null) reformed.put(it.codepoint, iPinyin);
|
||||
|
||||
if (gPinyin != null && !gPinyin.equals(uPinyin)) {
|
||||
log.println((++counter) + "\t" + Utility.hex(it.codepoint) + "\t" + code
|
||||
+ "\t" + (uPinyin == null ? "" : uPinyin)
|
||||
+ "\t" + (iPinyin == null ? "" : iPinyin.equals(gPinyin) ? "" : iPinyin)
|
||||
+ "\t" + (gPinyin == null ? "" : gPinyin)
|
||||
+ "\t" + (hanyu == null ? "" : hanyu + " / ")
|
||||
+ (mandarin == null ? "" : mandarin)
|
||||
);
|
||||
if (hanyu != null) hCount++;
|
||||
continue;
|
||||
}
|
||||
if (true) continue;
|
||||
if (isEqualOrNull(uPinyin, iPinyin)) continue;
|
||||
log.println((++counter) + "\t" + Utility.hex(it.codepoint) + "\t" + code
|
||||
+ "\t" + (uPinyin == null ? "" : uPinyin)
|
||||
+ "\t" + (iPinyin == null ? "" : iPinyin)
|
||||
+ "\t" + (gPinyin == null ? "" : gPinyin)
|
||||
+ "\t" + (hanyu == null ? "" : hanyu + " / ")
|
||||
+ (mandarin == null ? "" : mandarin)
|
||||
);
|
||||
}
|
||||
log.println("kHanyuPinlu count: " + hCount);
|
||||
|
||||
Collator col = Collator.getInstance(new Locale("zh","","PINYIN"));
|
||||
UnicodeSet tailored = col.getTailoredSet().addAll(gotAtLeastOne);
|
||||
Collator pinyinCollator = new RuleBasedCollator(
|
||||
"&[before 1] a < \u0101 <<< \u0100 << \u00E1 <<< \u00C1 << \u01CE <<< \u01CD << \u00E0 <<< \u00C0 << a <<< A" +
|
||||
"&[before 1] e < \u0113 <<< \u0112 << \u00E9 <<< \u00C9 << \u011B <<< \u011A << \u00E8 <<< \u00C8 << e <<< A" +
|
||||
"&[before 1] i < \u012B <<< \u012A << \u00ED <<< \u00CD << \u01D0 <<< \u01CF << \u00EC <<< \u00CC << i <<< I" +
|
||||
"&[before 1] o < \u014D <<< \u014C << \u00F3 <<< \u00D3 << \u01D2 <<< \u01D1 << \u00F2 <<< \u00D2 << o <<< O" +
|
||||
"&[before 1] u < \u016B <<< \u016A << \u00FA <<< \u00DA << \u01D4 <<< \u01D3 << \u00F9 <<< \u00D9 << u <<< U" +
|
||||
" << \u01D6 <<< \u01D5 << \u01D8 <<< \u01D7 << \u01DA <<< \u01D9 << \u01DC <<< \u01DB << \u00FC");
|
||||
printSortedChars("ICU_Pinyin_Sort.txt", col, tailored, reformed, kHanyuPinlu, kMandarin, pinyinCollator);
|
||||
/*
|
||||
MultiComparator mcol = new MultiComparator(new Comparator[] {
|
||||
new UnicodeMapComparator(reformed, pinyinCollator), col});
|
||||
printSortedChars("ICU_Pinyin_Sort2.txt", mcol, tailored);
|
||||
*/
|
||||
log.close();
|
||||
}
|
||||
|
||||
static class UnicodeMapComparator implements Comparator {
|
||||
UnicodeMap map;
|
||||
Comparator comp;
|
||||
UnicodeMapComparator(UnicodeMap map, Comparator comp) {
|
||||
this.map = map;
|
||||
this.comp = comp;
|
||||
}
|
||||
public int compare(Object o1, Object o2) {
|
||||
int c1 = UTF16.charAt((String) o1,0);
|
||||
int c2 = UTF16.charAt((String) o2,0);
|
||||
Object v1 = map.getValue(c1);
|
||||
Object v2 = map.getValue(c2);
|
||||
if (v1 == null) {
|
||||
if (v2 == null) return 0;
|
||||
return -1;
|
||||
} else if (v2 == null) return 1;
|
||||
return comp.compare(v1, v2);
|
||||
}
|
||||
}
|
||||
|
||||
static class MultiComparator implements Comparator {
|
||||
private Comparator[] comparators;
|
||||
|
||||
public MultiComparator (Comparator[] comparators) {
|
||||
this.comparators = comparators;
|
||||
}
|
||||
|
||||
/* Lexigraphic compare. Returns the first difference
|
||||
* @return zero if equal. Otherwise +/- (i+1)
|
||||
* where i is the index of the first comparator finding a difference
|
||||
* @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
|
||||
*/
|
||||
public int compare(Object arg0, Object arg1) {
|
||||
for (int i = 0; i < comparators.length; ++i) {
|
||||
int result = comparators[i].compare(arg0, arg1);
|
||||
if (result == 0) continue;
|
||||
if (result > 0) return i+1;
|
||||
return -(i+1);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private static void printSortedChars(String file, Comparator col, UnicodeSet tailored,
|
||||
UnicodeMap map, UnicodeMap hanyu, UnicodeMap mand, Comparator p2)
|
||||
throws IOException {
|
||||
Set set = new TreeSet(col);
|
||||
PrintWriter pw = Utility.openPrintWriter(file, Utility.UTF8_WINDOWS);
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(tailored); it.next(); ) {
|
||||
set.add(UTF16.valueOf(it.codepoint));
|
||||
}
|
||||
String lastm = "";
|
||||
String lasts = "";
|
||||
for (Iterator it2 = set.iterator(); it2.hasNext(); ) {
|
||||
String s = (String)it2.next();
|
||||
String m = map == null ? null : (String) map.getValue(UTF16.charAt(s,0));
|
||||
if (m == null) m = "";
|
||||
String info = m;
|
||||
if (p2.compare(lastm,m) > 0) {
|
||||
info = info + "\t" + lastm + " > " + m + "\t";
|
||||
Object temp;
|
||||
temp = hanyu.getValue(UTF16.charAt(lasts,0));
|
||||
if (temp != null) info += "[" + temp + "]";
|
||||
temp = mand.getValue(UTF16.charAt(lasts,0));
|
||||
if (temp != null) info += "[" + temp + "]";
|
||||
info += " > ";
|
||||
temp = hanyu.getValue(UTF16.charAt(s,0));
|
||||
if (temp != null) info += "[" + temp + "]";
|
||||
temp = mand.getValue(UTF16.charAt(s,0));
|
||||
if (temp != null) info += "[" + temp + "]";
|
||||
}
|
||||
pw.println(Utility.hex(s) + "\t" + s + "\t" + info);
|
||||
lastm = m;
|
||||
lasts = s;
|
||||
}
|
||||
pw.close();
|
||||
}
|
||||
|
||||
static void addField(String dir, String file, int hexCodeFieldNumber, int valueNumber, UnicodeMap result) throws IOException {
|
||||
BufferedReader br = BagFormatter.openUTF8Reader(dir, file);
|
||||
while (true) {
|
||||
String line = br.readLine();
|
||||
if (line == null) break;
|
||||
line = line.trim();
|
||||
if (line.length() == 0) continue;
|
||||
if (line.startsWith("\uFEFF")) line = line.substring(1);
|
||||
if (line.startsWith("#") || line.length() == 0) continue;
|
||||
String[] pieces = Utility.split(line,'\t');
|
||||
result.put(Integer.parseInt(pieces[hexCodeFieldNumber], 16), pieces[valueNumber]);
|
||||
}
|
||||
br.close();
|
||||
}
|
||||
|
||||
static boolean isEqualOrNull(String a, String b) {
|
||||
if (a == null || b == null) return true;
|
||||
return a.equals(b);
|
||||
}
|
||||
public static String getUpTo(String s, char ch) {
|
||||
int pos = s.indexOf(ch);
|
||||
if (pos < 0) return s;
|
||||
return s.substring(0,pos);
|
||||
}
|
||||
|
||||
public static void main(int typeIn) throws IOException {
|
||||
if (typeIn == CHINESE) {
|
||||
fixedMandarin();
|
||||
return;
|
||||
}
|
||||
type = typeIn;
|
||||
|
||||
try {
|
||||
|
@ -298,7 +540,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
log.println();
|
||||
log.println("@Unihan Data");
|
||||
log.println();
|
||||
out2 = BagFormatter.openUTF8Writer(GEN_DIR, "unihan_kmandarinDump.txt");
|
||||
|
||||
readUnihanData(key);
|
||||
|
||||
out2.close();
|
||||
|
||||
if (false) {
|
||||
readCDICT();
|
||||
|
@ -1796,6 +2042,8 @@ Bad pinyin data: \u4E7F ? LE
|
|||
static Map cdict = new TreeMap();
|
||||
static Map simplifiedToTraditional = new HashMap();
|
||||
static Map traditionalToSimplified = new HashMap();
|
||||
|
||||
static UnicodeMap kHanyuPinlu = new UnicodeMap();
|
||||
|
||||
static void readUnihanData(String key) throws java.io.IOException {
|
||||
|
||||
|
@ -1833,7 +2081,16 @@ Bad pinyin data: \u4E7F ? LE
|
|||
traditionalToSimplified.put(UTF16.valueOf(code), propertyValue);
|
||||
}
|
||||
|
||||
if (property.equals(key) || key.equals("kJapaneseOn") && property.equals("kJapaneseKun")) {
|
||||
if (key.equals("kMandarin") && property.equals("kHanyuPinlu")) {
|
||||
// U+64D4 kHanyuPinlu dan1(297), dan4(61), dan5(36)
|
||||
String[] piece = Utility.split(propertyValue,'(');
|
||||
String pinyin = digitToPinyin(piece[0], line);
|
||||
log.println(scode + "\t" + pinyin + "\t" + line);
|
||||
kHanyuPinlu.put(Integer.parseInt(scode,16), pinyin);
|
||||
}
|
||||
if (property.equals(key)
|
||||
|| key.equals("kJapaneseOn") && property.equals("kJapaneseKun")
|
||||
) {
|
||||
storeDef(out, code, propertyValue, line);
|
||||
}
|
||||
}
|
||||
|
@ -1885,6 +2142,7 @@ Bad pinyin data: \u4E7F ? LE
|
|||
definition = definition.substring(0, end3);
|
||||
|
||||
definition = digitToPinyin(definition, line);
|
||||
out2.println(Utility.hex(cp) + '\t' + UTF16.valueOf(cp) + "\t" + definition.toLowerCase());
|
||||
}
|
||||
if (type == DEFINITION) {
|
||||
definition = removeMatched(definition,'(', ')', line);
|
||||
|
|
|
@ -13,6 +13,7 @@ import java.util.Arrays;
|
|||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
@ -968,35 +969,68 @@ public class MakeUnicodeFiles {
|
|||
}
|
||||
}
|
||||
|
||||
public static void showMatches() throws IOException {
|
||||
public static void showAllDiff() throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "propertyDifference.txt");
|
||||
try {
|
||||
showDifferences(out, "4.0.1", "LB", "GC");
|
||||
showDifferences(out, "4.0.1", "East Asian Width", "LB");
|
||||
showDifferences(out, "4.0.1", "East Asian Width", "GC");
|
||||
UnicodeProperty.Factory fac = ToolUnicodePropertySource.make("4.0.1");
|
||||
List props = fac.getAvailableNames(
|
||||
(1<<UnicodeProperty.BINARY)
|
||||
| (1<<UnicodeProperty.ENUMERATED)
|
||||
//| (1<<UnicodeProperty.CATALOG)
|
||||
);
|
||||
Set skipList = new HashSet();
|
||||
skipList.add("Age");
|
||||
skipList.add("Joining_Group");
|
||||
skipList.add("Canonical_Combining_Class");
|
||||
|
||||
for (Iterator it = props.iterator(); it.hasNext();) {
|
||||
String prop1 = (String) it.next();
|
||||
for (Iterator it2 = props.iterator(); it2.hasNext();) {
|
||||
String prop2 = (String) it2.next();
|
||||
if (prop1.equals(prop2)) continue;
|
||||
if (skipList.contains(prop2)) continue;
|
||||
System.out.println(prop1 + " vs. " + prop2);
|
||||
showDifferences(out, fac.getProperty(prop1), fac.getProperty(prop2), false);
|
||||
out.flush();
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
||||
static NumberFormat nf = NumberFormat.getIntegerInstance(Locale.ENGLISH);
|
||||
|
||||
|
||||
|
||||
static void showDifferences(PrintWriter out, String version, String prop1, String prop2) throws IOException {
|
||||
UnicodeProperty p1 = ToolUnicodePropertySource.make(version).getProperty(prop1);
|
||||
UnicodeProperty p2 = ToolUnicodePropertySource.make(version).getProperty(prop2);
|
||||
showDifferences(out, p1, p2, true);
|
||||
}
|
||||
|
||||
static void showDifferences(PrintWriter out, UnicodeProperty p1, UnicodeProperty p2, boolean doOverlaps) throws IOException {
|
||||
BagFormatter bf = new BagFormatter();
|
||||
out.println("Comparing " + p1.getName() + " and " + p2.getName());
|
||||
//out.println("Comparing " + p1.getName() + " and " + p2.getName());
|
||||
System.out.println("Comparing " + p1.getName() + " and " + p2.getName());
|
||||
String pn1 = '$' + p1.getName();
|
||||
String pn2 = '$' + p2.getName();
|
||||
UnicodeSet intersection = new UnicodeSet();
|
||||
UnicodeSet disjoint = new UnicodeSet();
|
||||
String skip1 = p1.getValue(0xEFFFD);
|
||||
String skip2 = p2.getValue(0xEFFFD);
|
||||
main:
|
||||
for (Iterator it1 = p1.getAvailableValues().iterator(); it1.hasNext();) {
|
||||
String v1 = (String)it1.next();
|
||||
if (v1.equals(skip1)) continue;
|
||||
UnicodeSet s1 = p1.getSet(v1);
|
||||
v1 += " (" + p1.getFirstValueAlias(v1) + ")";
|
||||
if (s1.size() == 0) continue;
|
||||
String pv1 = pn1 + (v1.equals("True") ? "" : ":" + v1);
|
||||
//v1 += " (" + p1.getFirstValueAlias(v1) + ")";
|
||||
System.out.println(v1);
|
||||
out.println();
|
||||
out.println(v1 + " [" + nf.format(s1.size()) + "]");
|
||||
//out.println();
|
||||
//out.println(v1 + " [" + nf.format(s1.size()) + "]");
|
||||
|
||||
// create some containers so that the output is organized reasonably
|
||||
String contains = "";
|
||||
|
@ -1005,22 +1039,25 @@ public class MakeUnicodeFiles {
|
|||
Set overlapsSet = new TreeSet();
|
||||
for (Iterator it2 = p2.getAvailableValues().iterator(); it2.hasNext();) {
|
||||
String v2 = (String)it2.next();
|
||||
if (v2.equals(skip2)) continue;
|
||||
UnicodeSet s2 = p2.getSet(v2);
|
||||
if (s2.size() == 0) continue;
|
||||
// v2 += "(" + p2.getFirstValueAlias(v2) + ")";
|
||||
v2 = p2.getFirstValueAlias(v2);
|
||||
//v2 = p2.getFirstValueAlias(v2);
|
||||
String pv2 = pn2 + (v2.equals("True") ? "" : ":" + v2);
|
||||
if (s1.containsNone(s2)) continue;
|
||||
if (s1.equals(s2)) {
|
||||
out.println("\t= " + v2);
|
||||
out.println(pv1 + "\t= " + pv2);
|
||||
continue main; // since they are partitions, we can stop here
|
||||
} else if (s2.containsAll(s1)) {
|
||||
out.println("\t\u2282 " + v2 + " [" + nf.format(s2.size()) + "]");
|
||||
// out.println(pv1 + "\t\u2282 " + pv2);
|
||||
continue main; // partition, stop
|
||||
} else if (s1.containsAll(s2)) {
|
||||
if (contains.length() != 0) contains += " \u222a ";
|
||||
contains += v2 + " [" + nf.format(s2.size()) + "]";
|
||||
if (contains.length() != 0) contains += " ";
|
||||
contains += pv2;
|
||||
containsSet.addAll(s2);
|
||||
if (containsSet.size() == s1.size()) break;
|
||||
} else { // doesn't contain, isn't contained
|
||||
} else if (doOverlaps) { // doesn't contain, isn't contained
|
||||
if (overlaps.length() != 0) overlaps += "\r\n\t";
|
||||
intersection.clear().addAll(s2).retainAll(s1);
|
||||
disjoint.clear().addAll(s1).removeAll(s2);
|
||||
|
@ -1030,7 +1067,8 @@ public class MakeUnicodeFiles {
|
|||
}
|
||||
}
|
||||
if (contains.length() != 0) {
|
||||
out.println((containsSet.size() == s1.size() ? "\t= " : "\t\u2283 ") + contains);
|
||||
out.println(pv1 + (containsSet.size() == s1.size() ? "\t= "
|
||||
: "\t\u2283 ") + "[" + contains + "]");
|
||||
}
|
||||
if (overlaps.length() != 0) out.println("\t" + overlaps);
|
||||
if (false && overlapsSet.size() != 0) {
|
||||
|
@ -1152,7 +1190,7 @@ public class MakeUnicodeFiles {
|
|||
|
||||
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
|
||||
|
||||
static void testInvariants() throws IOException {
|
||||
public static void testInvariants() throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
|
||||
out.write('\uFEFF'); // BOM
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2004/03/11 19:03:16 $
|
||||
* $Revision: 1.33 $
|
||||
* $Date: 2004/06/26 00:26:16 $
|
||||
* $Revision: 1.34 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -480,6 +480,41 @@ public final class UCD implements UCD_Types {
|
|||
byte numericType;
|
||||
}
|
||||
|
||||
public UnicodeMap getHanValue(String propertyName) {
|
||||
UnicodeMap result = new UnicodeMap();
|
||||
try {
|
||||
BufferedReader in = Utility.openUnicodeFile("Unihan", version, true, Utility.UTF8);
|
||||
int lineCounter = 0;
|
||||
while (true) {
|
||||
Utility.dot(++lineCounter);
|
||||
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
if (line.length() < 6) continue;
|
||||
if (line.charAt(0) == '#') continue;
|
||||
line = line.trim();
|
||||
|
||||
int tabPos = line.indexOf('\t');
|
||||
int tabPos2 = line.indexOf('\t', tabPos+1);
|
||||
|
||||
String property = line.substring(tabPos+1, tabPos2).trim();
|
||||
if (!property.equalsIgnoreCase(propertyName)) continue;
|
||||
|
||||
String scode = line.substring(2, tabPos).trim();
|
||||
int code = Integer.parseInt(scode, 16);
|
||||
String propertyValue = line.substring(tabPos2+1).trim();
|
||||
result.put(code, propertyValue);
|
||||
}
|
||||
in.close();
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Han File Processing Exception", null, e);
|
||||
} finally {
|
||||
Utility.fixDot();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
void populateHanExceptions() {
|
||||
hanExceptions = new IntMap();
|
||||
BufferedReader in = null;
|
||||
|
|
|
@ -88,3 +88,5 @@ $Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]
|
|||
|
||||
# Testing
|
||||
$script:greek = $×script:greek
|
||||
$gc:lm = $script:inherited
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2004/04/10 16:49:19 $
|
||||
* $Revision: 1.42 $
|
||||
* $Date: 2004/06/26 00:26:16 $
|
||||
* $Revision: 1.43 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -17,6 +17,7 @@ import java.util.*;
|
|||
import java.text.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.Replaceable;
|
||||
|
@ -462,18 +463,22 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
|||
* Splits a string containing divider into pieces, storing in output
|
||||
* and returns the number of pieces.
|
||||
*/
|
||||
public static int split(String s, char divider, String[] output) {
|
||||
public static int split(String s, char divider, String[] output, boolean trim) {
|
||||
try {
|
||||
int last = 0;
|
||||
int current = 0;
|
||||
int i;
|
||||
for (i = 0; i < s.length(); ++i) {
|
||||
if (s.charAt(i) == divider) {
|
||||
output[current++] = s.substring(last,i);
|
||||
String temp = s.substring(last,i);
|
||||
if (trim) temp = temp.trim();
|
||||
output[current++] = temp;
|
||||
last = i+1;
|
||||
}
|
||||
}
|
||||
output[current++] = s.substring(last,i);
|
||||
String temp = s.substring(last,i);
|
||||
if (trim) temp = temp.trim();
|
||||
output[current++] = temp;
|
||||
int result = current;
|
||||
while (current < output.length) {
|
||||
output[current++] = "";
|
||||
|
@ -484,9 +489,16 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
|||
}
|
||||
}
|
||||
|
||||
public static String[] split(String s, char divider) {
|
||||
public static String[] split(String s, char divider) {
|
||||
return split(s,divider,false);
|
||||
}
|
||||
public static int split(String s, char divider, String[] output) {
|
||||
return split(s,divider,output,false);
|
||||
}
|
||||
|
||||
public static String[] split(String s, char divider, boolean trim) {
|
||||
String[] result = new String[100]; // HACK
|
||||
int count = split(s, divider, result);
|
||||
int count = split(s, divider, result, trim);
|
||||
return extract(result, 0, count);
|
||||
}
|
||||
|
||||
|
@ -1209,4 +1221,45 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
|||
return (isSeparateLineIDN(start, ucd) || isSeparateLineIDN(end, ucd));
|
||||
}
|
||||
|
||||
public static Transliterator createFromFile(String fileName, int direction, Transliterator pretrans) throws IOException {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
FileLineIterator fli = new FileLineIterator();
|
||||
fli.open(fileName, Utility.UTF8);
|
||||
fli.commentChar = FileLineIterator.NOTCHAR; // disable comments
|
||||
while (true) {
|
||||
String line = fli.read();
|
||||
if (line == null) break;
|
||||
if (line.startsWith("\uFEFF")) line = line.substring(1);
|
||||
if (pretrans != null) line = pretrans.transliterate(line);
|
||||
buffer.append(line);
|
||||
buffer.append("\r\n"); // separate with whitespace
|
||||
}
|
||||
fli.close();
|
||||
|
||||
/*
|
||||
|
||||
// read and concatenate all the lines
|
||||
FileInputStream fis = new FileInputStream(fileName);
|
||||
InputStreamReader isr = new InputStreamReader(fis, "UTF8");
|
||||
BufferedReader br = new BufferedReader(isr, 32*1024);
|
||||
while (true) {
|
||||
String line = br.readLine();
|
||||
if (line == null) break;
|
||||
if (line.length() > 0 && line.charAt(0) == '\uFEFF') line = line.substring(1); // strip BOM
|
||||
if (pretrans != null) line = pretrans.transliterate(line);
|
||||
buffer.append(line);
|
||||
buffer.append("\r\n"); // separate with whitespace
|
||||
}
|
||||
br.close();
|
||||
//System.out.println(buffer.toString());
|
||||
*/
|
||||
|
||||
// Transform file name into id
|
||||
String id = fileName;
|
||||
int pos = id.lastIndexOf('.');
|
||||
if (pos >= 0) id = id.substring(0, pos);
|
||||
//System.out.println(buffer);
|
||||
return Transliterator.createFromRules(id, buffer.toString(), direction);
|
||||
}
|
||||
|
||||
}
|
Loading…
Add table
Reference in a new issue