diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index 2a6cdf0f167..77d6ee5a1cb 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2002/06/22 01:21:09 $ -* $Revision: 1.20 $ +* $Date: 2002/07/14 22:04:49 $ +* $Revision: 1.21 $ * ******************************************************************************* */ @@ -508,6 +508,7 @@ public class GenerateData implements UCD_Types { int type = i & 0xFF00; if (type == AGE) continue; if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue; + if (i == (BINARY_PROPERTIES | Non_break)) continue; UnicodeProperty up = UnifiedBinaryProperty.make(i, Default.ucd); if (up == null) continue; diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java index ce7d356ec98..1034f45cb89 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $ -* $Date: 2002/06/13 21:14:05 $ -* $Revision: 1.5 $ +* $Date: 2002/07/14 22:04:49 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -22,6 +22,211 @@ import java.util.*; public final class GenerateHanTransliterator implements UCD_Types { + static class HanInfo { + int count = 0; + int minLen = Integer.MAX_VALUE; + int maxLen = Integer.MIN_VALUE; + int sampleLen = 0; + Set samples = new TreeSet(); + Map map = new TreeMap(); + } + + public static void readUnihan() throws java.io.IOException { + + log = Utility.openPrintWriter("Unihan_log.html", false, false); + log.println("
"); + + BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true); + + Map properties = new TreeMap(); + + Integer integerCode = new Integer(0); + int lineCounter = 0; + + while (true) { + Utility.dot(++lineCounter); + + String line = in.readLine(); + if (line == null) break; + if (line.length() < 6) continue; + if (line.charAt(0) == '#') continue; + line = line.trim(); + + int tabPos = line.indexOf('\t'); + String scode = line.substring(2, tabPos).trim(); + + int code = Integer.parseInt(scode, 16); + if (code != integerCode.intValue()) { + integerCode = new Integer(code); + } + + int tabPos2 = line.indexOf('\t', tabPos+1); + String property = line.substring(tabPos+1, tabPos2).trim(); + + String propertyValue = line.substring(tabPos2+1).trim(); + if (propertyValue.indexOf("U+") >= 0) propertyValue = fixHex.transliterate(propertyValue); + + HanInfo values = (HanInfo) properties.get(property); + if (values == null) { + values = new HanInfo(); + properties.put(property, values); + Utility.fixDot(); + System.out.println("Property: " + property); + } + ++values.count; + if (values.minLen > propertyValue.length()) values.minLen = propertyValue.length(); + if (values.maxLen < propertyValue.length()) values.maxLen = propertyValue.length(); + if (values.sampleLen < 150) { + String temp = scode + ":" + propertyValue; + values.sampleLen += temp.length() + 2; + values.samples.add(temp); + } + if (property.endsWith("Variant") + || property.endsWith("Numeric") + || property.startsWith("kRS") + || property.equals("kTotalStrokes")) { + values.map.put(integerCode, propertyValue); + } + } + + Set props = properties.keySet(); + /* + log.println("Properties"); + log.print(" "); + Utility.print(log, props, "\r\n "); + log.println(); + log.println(); + + log.println("Sample Values"); + */ + Iterator it = props.iterator(); + log.println("Checking Redundants for " + list[j] + "
"); + redundants.clear(); + Map otherInfo = ((HanInfo) properties.get(list[j])).map; + it = otherInfo.keySet().iterator(); + while (it.hasNext()) { + Integer key = (Integer) it.next(); + Object ovalue = otherInfo.get(key); + Object uvalue = kRSUnicodeMap.get(key); + if (ovalue.equals(uvalue)) { + redundants.add(key); + } else if (++unequalCount < 5) { + log.println(""); + } + + log.println("" + Integer.toString(key.intValue(),16) + + ": " + ovalue + ", " + uvalue + "
"); + } + } + log.println("Total Unique: " + (otherInfo.size() - redundants.size()) + + "(out of" + otherInfo.size() + ")
Checking Redundants for kTotalStrokes
"); + + // pass through first to get a count for the radicals + Map kTotalStrokesMap = ((HanInfo) properties.get("kTotalStrokes")).map; + int[] radCount = new int[512]; + it = kRSUnicodeMap.keySet().iterator(); + while(it.hasNext()) { + Integer key = (Integer) it.next(); + String uvalue = (String) kRSUnicodeMap.get(key); + if (uvalue.endsWith(".0")) { + String tvalue = (String) kTotalStrokesMap.get(key); + if (tvalue == null) continue; + int rs = getRadicalStroke(uvalue); + radCount[rs>>8] = Integer.parseInt(tvalue); + } + } + + // now compare the computed value against the real value + it = kTotalStrokesMap.keySet().iterator(); + unequalCount = 0; + redundants.clear(); + while(it.hasNext()) { + Integer key = (Integer) it.next(); + String uvalue = (String) kRSUnicodeMap.get(key); + int rs = getRadicalStroke(uvalue); + String tvalue = (String) kTotalStrokesMap.get(key); + int t = Integer.parseInt(tvalue); + int projected = radCount[rs>>8] + (rs & 0xFF); + if (t == projected) { + redundants.add(key); + } else if (++unequalCount < 5) { + log.println(""); + + log.println(""); + in.close(); + log.close(); + } + + static int getRadicalStroke(String s) { + int dotPos = s.indexOf('.'); + int strokes = Integer.parseInt(s.substring(dotPos+1)); + int radical = 0; + if (s.charAt(dotPos - 1) == '\'') { + radical = 256; + --dotPos; + } + radical += Integer.parseInt(s.substring(0,dotPos)); + return (radical << 8) + strokes; + } + + static Transliterator fixHex = Transliterator.getInstance("hex-any/unicode"); + + /* + static String convertUPlus(String other) { + int pos1 = other.indexOf("U+"); + if (pos1 < 0) return other; + return fixHex( + pos1 += 2; + + StringBuffer result = new StringBuffer(); + while (pos1 < other.length()) { + int end = getHexEnd(s, pos1); + result.append(UTF16.valueOf(Integer.parseInt(other.substring(pos1, end), 16))); + pos1 = other.indexOf("U+", pos1); + if (pos2 < 0) pos2 = other.length(); + pos1 = pos2; + } + return result.toString(); + } + + static int getHexEnd(String s, int start) { + int i= start; + for (; i < s.length; ++i) { + char c = s.charAt(i); + if ('0' <= c && c <= '9') continue; + if ('A' <= c && c <= 'F') continue; + if ('a' <= c && c <= 'f') continue; + break; + } + return i; + } + */ + static final boolean TESTING = false; static int type; @@ -59,6 +264,7 @@ public final class GenerateHanTransliterator implements UCD_Types { default: throw new IllegalArgumentException("Unexpected option: must be 0..2"); } + if (type == DEFINITION) readCDICTDefinitions(); readUnihanData(key, filter); if (false) { @@ -86,6 +292,7 @@ public final class GenerateHanTransliterator implements UCD_Types { Set gotAlready = new HashSet(); Iterator it = rankList.iterator(); Set lenSet = new TreeSet(); + Set backSet = new TreeSet(); int rank = 0; while (it.hasNext()) { Comparable keyChar = (Comparable) it.next(); @@ -93,6 +300,10 @@ public final class GenerateHanTransliterator implements UCD_Types { if (def == null) continue; // skipping // sort longer definitions first! lenSet.add(new Pair( + new Pair(new Integer(-keyChar.toString().length()), + new Pair(new Integer(-def.toString().length()), new Integer(rank++))), + new Pair(keyChar, def))); + backSet.add(new Pair( new Pair(new Integer(-def.toString().length()), new Integer(rank++)), new Pair(keyChar, def))); gotAlready.add(keyChar); @@ -102,26 +313,53 @@ public final class GenerateHanTransliterator implements UCD_Types { it = unihanMap.keySet().iterator(); while (it.hasNext()) { Comparable keyChar = (Comparable) it.next(); + if (gotAlready.contains(keyChar)) continue; + Comparable def = (Comparable) unihanMap.get(keyChar); - if (!gotAlready.contains(keyChar)) { - lenSet.add(new Pair( - new Pair(new Integer(-def.toString().length()), new Integer(rank++)), - new Pair(keyChar, def))); - } + lenSet.add(new Pair( + new Pair(new Integer(-keyChar.toString().length()), + new Pair(new Integer(-def.toString().length()), new Integer(rank++))), + new Pair(keyChar, def))); + backSet.add(new Pair( + new Pair(new Integer(-def.toString().length()), new Integer(rank++)), + new Pair(keyChar, def))); } - + + // First, find the ones that we want a definition for, based on the ranking + // We might have a situation where the definitions are masked. + // In that case, write forwards and backwards separately + + Set doReverse = new HashSet(); Set gotIt = new HashSet(); + + it = backSet.iterator(); + while (it.hasNext()) { + Pair p = (Pair) it.next(); + p = (Pair) p.second; + + String keyChar = (String) p.first; + String def = (String) p.second; + if (!gotIt.contains(def)) { + if (unihanNonSingular) { + out.println(quoteNonLetters.transliterate(keyChar) + " < " + quoteNonLetters.transliterate(def) + ";"); + } else { + doReverse.add(keyChar); + } + } + gotIt.add(def); + } + + it = lenSet.iterator(); while (it.hasNext()) { Pair p = (Pair) it.next(); p = (Pair) p.second; - Comparable keyChar = p.first; - Comparable def = p.second; - String rel = gotIt.contains(def) ? " > " : " <> "; - out.println(keyChar + rel + def + ";"); + String keyChar = (String) p.first; + String def = (String) p.second; + String rel = doReverse.contains(keyChar) ? " <> " : " > "; + out.println(quoteNonLetters.transliterate(keyChar) + rel + quoteNonLetters.transliterate(def) + ";"); //if (TESTING) System.out.println("# " + code + " > " + definition); - gotIt.add(def); } out.println("\u3002 <> '.';"); @@ -129,7 +367,7 @@ public final class GenerateHanTransliterator implements UCD_Types { out.println(":: katakana-latin;"); out.println(":: hiragana-latin;"); } - out.println(":: fullwidth-halfwidth;"); + out.println(":: fullwidth-halfwidth ();"); @@ -270,21 +508,21 @@ public final class GenerateHanTransliterator implements UCD_Types { String ch2 = Default.nfkd.normalize(ch); pinyin = (String) unihanMap.get(ch2); if (pinyin != null) { - unihanMap.put(ch, pinyin); + addCheck(ch, pinyin, "n/a"); kPinyin.add(i); } else { String trial = (String) simplifiedToTraditional.get(ch2); if (trial != null) { pinyin = (String) unihanMap.get(trial); if (pinyin != null) { - unihanMap.put(ch, pinyin); + addCheck(ch, pinyin, "n/a"); tPinyin.add(i); } else { trial = (String) traditionalToSimplified.get(ch2); if (trial != null) { pinyin = (String) unihanMap.get(trial); if (pinyin != null) { - unihanMap.put(ch, pinyin); + addCheck(ch, pinyin, "n/a"); sPinyin.add(i); } } @@ -342,6 +580,59 @@ public final class GenerateHanTransliterator implements UCD_Types { static Map rankMap = new TreeMap(); // maps from single char strings to overall rank static List rankList = new ArrayList(10000); + // form: ???? [ai4 wu1 ji2 wu1] /love me/love my dog/ + + static void readCDICTDefinitions() throws IOException { + System.out.println("Reading cdict.txt"); + BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", true); + int counter = 0; + String[] pieces = new String[50]; + String line = ""; + try { + while (true) { + line = Utility.readDataLine(br); + if (line == null) break; + if (line.length() == 0) continue; + Utility.dot(counter++); + + + int pinyinStart = line.indexOf('['); + String word = line.substring(0,pinyinStart).trim(); + int pinyinEnd = line.indexOf(']', pinyinStart+1); + int defStart = line.indexOf('/', pinyinEnd+1); + int defEnd = line.indexOf('/', defStart+1); + String definition = fixDefinition(line.substring(defStart+1, defEnd), line); + // word might have / in it, so do each part separately + int wordSlash = word.indexOf('/'); + if (wordSlash < 0) { + addCheck(word, definition, line); + } else { + addCheck(word.substring(0, wordSlash), definition, line); + addCheck(word.substring(wordSlash+1), definition, line); + } + } + br.close(); + } catch (Exception e) { + throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e); + } + } + + static UnicodeSet pua = new UnicodeSet("[:private use:]"); + static UnicodeSet numbers = new UnicodeSet("[0-9]"); + + static void addCheck(String word, String definition, String line) { + if (pua.containsSome(word) ) { + Utility.fixDot(); + System.out.println("PUA on: " + line); + } else if (numbers.containsAll(definition) ) { + Utility.fixDot(); + System.out.println("Only numbers on: " + line); + } else { + unihanMap.put(word, definition); + } + if (UTF16.countCodePoint(word) > 1) unihanNonSingular = true; + } + static void readCDICT() throws IOException { System.out.println("Reading cdict.txt"); BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", true); @@ -543,14 +834,17 @@ public final class GenerateHanTransliterator implements UCD_Types { if (type == DEFINITION) { definition = removeMatched(definition,'(', ')', line); definition = removeMatched(definition,'[', ']', line); - definition = definition.trim(); - definition = Utility.replace(definition, " ", " "); - definition = "'[" + quoteNonLetters.transliterate(definition) + "]'"; + definition = fixDefinition(definition, line); } - definition.trim(); + definition = definition.trim(); definition = Default.ucd.getCase(definition, FULL, LOWER); String cp = UTF16.valueOf(Integer.parseInt(code, 16)); - unihanMap.put(cp, definition); + if (definition.length() == 0) { + Utility.fixDot(); + System.out.println("Zero value for " + Default.ucd.getCode(cp) + " on: " + hex.transliterate(line)); + } else { + addCheck(cp, definition, line); + } /* String key = (String) unihanMap.get(definition); if (key == null) { @@ -561,6 +855,15 @@ public final class GenerateHanTransliterator implements UCD_Types { */ } + static String fixDefinition(String definition, String line) { + definition = definition.trim(); + definition = Utility.replace(definition, " ", " "); + definition = Utility.replace(definition, " ", "-"); + definition = Default.ucd.getCase(definition, FULL, LOWER); + return definition; + } + + // WARNING not supplemenatary-safe! static int findInString(String source, String chars) { @@ -587,12 +890,13 @@ public final class GenerateHanTransliterator implements UCD_Types { } static Map unihanMap = new HashMap(); + static boolean unihanNonSingular = false; static StringBuffer handlePinyinTemp = new StringBuffer(); static Transliterator hex = Transliterator.getInstance("[^\\u0020-\\u007F] hex"); static Transliterator quoteNonLetters = Transliterator.createFromRules("any-quotenonletters", - "([[\\u0021-\\u007E]-[:L:]-[\\']]) > \\u005C $1; \\' > \\'\\';", Transliterator.FORWARD); + "([[\\u0021-\\u007E]-[:L:]-[\\']-[0-9]]) > \\u005C $1; \\' > \\'\\';", Transliterator.FORWARD); diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index 01747d03315..5f02556b9dd 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2002/06/22 01:21:09 $ -* $Revision: 1.17 $ +* $Date: 2002/07/14 22:04:49 $ +* $Revision: 1.18 $ * ******************************************************************************* */ @@ -69,6 +69,8 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("definitionTransliterator")) GenerateHanTransliterator.main(0); else if (arg.equalsIgnoreCase("romajiTransliterator")) GenerateHanTransliterator.main(1); else if (arg.equalsIgnoreCase("pinYinTransliterator")) GenerateHanTransliterator.main(2); + else if (arg.equalsIgnoreCase("hanproperties")) GenerateHanTransliterator.readUnihan(); + else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry(); else if (arg.equalsIgnoreCase("checkBIDI")) VerifyUCD.checkBIDI(); diff --git a/tools/unicodetools/com/ibm/text/utility/TestUtility.java b/tools/unicodetools/com/ibm/text/utility/TestUtility.java index 8e38732b7f6..38624624c5e 100644 --- a/tools/unicodetools/com/ibm/text/utility/TestUtility.java +++ b/tools/unicodetools/com/ibm/text/utility/TestUtility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/TestUtility.java,v $ -* $Date: 2002/06/22 21:01:25 $ -* $Revision: 1.1 $ +* $Date: 2002/07/14 22:04:49 $ +* $Revision: 1.2 $ * ******************************************************************************* */ @@ -17,9 +17,9 @@ import java.util.*; import java.text.*; import java.io.*; -public class Main { +public class TestUtility { - static public class MyEnum implements EnumBase { + static public class MyEnum extends EnumBase { public static MyEnum ZEROED = (MyEnum) makeNext(myEnum.getClass()), SHIFTED = (MyEnum) makeNext()," + Integer.toString(key.intValue(),16) + + ": " + t + ", " + projected + "
"); + } + } + log.println("Total Unique: " + (kTotalStrokesMap.size() - redundants.size()) + + "(out of" + kTotalStrokesMap.size() + ")