no message

X-SVN-Rev: 15942
2025-04-05 21:45:37 +00:00 · 2004-06-26 00:26:16 +00:00 · 2004-06-26 00:26:16 +00:00 · ae721a34d1
commit ae721a34d1
parent 28015f3710
5 changed files with 416 additions and 30 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
@ -5,22 +5,29 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
-* $Date: 2004/04/17 18:21:39 $
-* $Revision: 1.15 $
+* $Date: 2004/06/26 00:26:16 $
+* $Revision: 1.16 $
 *
 *******************************************************************************
 */

 package com.ibm.text.UCD;
 import java.io.*;
+
 import com.ibm.text.utility.*;

+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.UnicodeMap;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
 import com.ibm.icu.text.Transliterator;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.Replaceable;
 import com.ibm.icu.text.ReplaceableString;
 import com.ibm.icu.text.UnicodeMatcher;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.ULocale;


 import java.util.*;
@ -250,7 +257,242 @@ public final class GenerateHanTransliterator implements UCD_Types {
    static final boolean DO_SIMPLE = true;
    static final boolean SKIP_OVERRIDES = true;
    
-    public static void main(int typeIn) {
+    static PrintWriter out2;
+    
+    /**
+     * Writes "Raw_Transliterator_Han_Latin.txt" into UCD_Types.GEN_DIR: one rule
+     * per distinct pinyin reading, either <code>X&gt;pinyin;</code> when a single
+     * character has that reading or <code>[set]&gt;pinyin;</code> when several do.
+     * <p>
+     * For every code point that has a kHanyuPinlu or a kMandarin value, the
+     * kHanyuPinlu value (text before the first '(') is preferred; otherwise the
+     * lowercased kMandarin value (text before the first space) is used. Either
+     * one is run through digitPinyin_accentPinyin before it is used as the key.
+     *
+     * @throws IOException presumably from opening the output file
+     */
+    public static void fixedMandarin() throws IOException {
+        UnicodeMap kMandarin = Default.ucd().getHanValue("kMandarin");
+        UnicodeMap kHanyuPinlu = Default.ucd().getHanValue("kHanyuPinlu");
+        // presumably getSet(null) is "code points without a value", so the
+        // complement is the set of code points that do have one
+        UnicodeSet gotMandarin = kMandarin.getSet(null).complement();
+        UnicodeSet gotHanyu = kHanyuPinlu.getSet(null).complement();
+        UnicodeSet gotAtLeastOne = new UnicodeSet(gotMandarin).addAll(gotHanyu);
+        // pinyin -> UnicodeSet of characters, keys ordered by the "zh" collator
+        Map outmap = new TreeMap(Collator.getInstance(new ULocale("zh")));
+        for (UnicodeSetIterator it = new UnicodeSetIterator(gotAtLeastOne); it.next(); ) {
+            String hanyu = (String) kHanyuPinlu.getValue(it.codepoint);
+            String mandarin = (String) kMandarin.getValue(it.codepoint);
+            String hPinyin = hanyu == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu,'('));
+            String mPinyin = mandarin == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(mandarin.toLowerCase(),' '));
+            String uPinyin = hPinyin != null ? hPinyin : mPinyin;
+            UnicodeSet s = (UnicodeSet) outmap.get(uPinyin);
+            if (s == null) {
+                s = new UnicodeSet();
+                outmap.put(uPinyin, s);
+            }
+            s.add(it.codepoint);
+        }
+        String filename = "Raw_Transliterator_Han_Latin.txt";
+        PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, filename);
+        try {
+            for (Iterator it = outmap.entrySet().iterator(); it.hasNext();) {
+                Map.Entry entry = (Map.Entry) it.next();
+                String pinyin = (String) entry.getKey();
+                UnicodeSet uset = (UnicodeSet) entry.getValue();
+                if (uset.size() == 1) {
+                    // a single character is written literally rather than as a set pattern
+                    UnicodeSetIterator usi = new UnicodeSetIterator(uset);
+                    usi.next();
+                    out.println(UTF16.valueOf(usi.codepoint) + ">" + pinyin + ";");
+                } else {
+                    out.println(uset.toPattern(false) + ">" + pinyin + ";");
+                }
+            }
+        } finally {
+            // release the file even if formatting one of the sets fails
+            out.close();
+        }
+    }
+    
+    /**
+     * Orders Pair objects by their first member using one comparator and,
+     * when those are equal, by their second member using another.
+     */
+    public static class PairComparator implements Comparator {
+        Comparator first;
+        Comparator second;
+        PairComparator(Comparator first, Comparator second) {
+            this.first = first;
+            this.second = second;
+        }
+        /**
+         * Both arguments must be Pair instances.
+         * @return the result of the first comparator if it is non-zero,
+         * otherwise the result of the second comparator
+         */
+        public int compare(Object o1, Object o2) {
+            Pair left = (Pair) o1;
+            Pair right = (Pair) o2;
+            int primary = first.compare(left.first, right.first);
+            return primary != 0
+                ? primary
+                : second.compare(left.second, right.second);
+        }
+    }
+    
+    /**
+     * Diagnostic run comparing Mandarin readings from several sources.
+     * <p>
+     * Loads readings from gcl_icu.txt and gcl_other.txt, then for each code point
+     * that has a kHanyuPinlu or kMandarin value:
+     * <ul>
+     * <li>stores one reading in a "reformed" map, preferring kHanyuPinlu, then the
+     * gcl value, then kMandarin, then the current ICU han-latin result;</li>
+     * <li>logs a row to "Mandarin_First.txt" whenever a gcl reading exists and
+     * differs from the Unihan-derived one.</li>
+     * </ul>
+     * Finally writes "ICU_Pinyin_Sort.txt" via printSortedChars, using the
+     * zh/PINYIN collator for the character order and a tone-aware pinyin collator
+     * to flag readings that are out of order.
+     *
+     * @throws Exception from reading the input files, opening the outputs, or
+     * building the rule-based collator
+     */
+    public static void quickMandarin() throws Exception {
+        UnicodeMap gcl = new UnicodeMap();
+        addField("C:\\DATA\\dict\\", "gcl_icu.txt", 2, 3, gcl);
+        addField("C:\\DATA\\dict\\", "gcl_other.txt", 2, 5, gcl);        
+        Transliterator icuPinyin = Transliterator.getInstance("han-latin");
+        UnicodeMap kMandarin = Default.ucd().getHanValue("kMandarin");
+        UnicodeMap kHanyuPinlu = Default.ucd().getHanValue("kHanyuPinlu");
+        UnicodeSet gotMandarin = kMandarin.getSet(null).complement();
+        UnicodeSet gotHanyu = kHanyuPinlu.getSet(null).complement();
+        UnicodeSet gotAtLeastOne = new UnicodeSet(gotMandarin).addAll(gotHanyu);
+        int counter = 0;
+        int hCount = 0;
+        log = Utility.openPrintWriter("Mandarin_First.txt", Utility.UTF8_WINDOWS);
+        log.println("N\tCode\tChar\tUnihan\tICU\tGCL\tkHanyuPinlu / kMandarin");
+        UnicodeMap reformed = new UnicodeMap();
+        for (UnicodeSetIterator it = new UnicodeSetIterator(gotAtLeastOne); it.next(); ) {
+            String code = UTF16.valueOf(it.codepoint);
+            String hanyu = (String) kHanyuPinlu.getValue(it.codepoint);
+            String mandarin = (String) kMandarin.getValue(it.codepoint);
+            String hPinyin = hanyu == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu,'('));
+            String mPinyin = mandarin == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(mandarin.toLowerCase(),' '));
+            String uPinyin = hPinyin != null ? hPinyin : mPinyin;
+
+            // an unchanged result means the ICU transliterator has no mapping
+            String iPinyin = icuPinyin.transliterate(code).trim();
+            if (iPinyin.equals(code)) iPinyin = null;
+            String gPinyin = (String) gcl.getValue(it.codepoint);
+            
+            // source priority: kHanyuPinlu, gcl, kMandarin, ICU
+            if (hPinyin != null) reformed.put(it.codepoint, hPinyin);
+            else if (gPinyin != null) reformed.put(it.codepoint, gPinyin);
+            else if (mPinyin != null) reformed.put(it.codepoint, mPinyin);
+            else if (iPinyin != null) reformed.put(it.codepoint, iPinyin);
+            
+            if (gPinyin != null && !gPinyin.equals(uPinyin)) {
+                log.println((++counter) + "\t" + Utility.hex(it.codepoint) + "\t" + code
+                    + "\t" + (uPinyin == null ? "" : uPinyin)
+                    + "\t" + (iPinyin == null ? "" : iPinyin.equals(gPinyin) ? "" : iPinyin)
+                    + "\t" + (gPinyin == null ? "" : gPinyin)
+                    + "\t" + (hanyu == null ? "" : hanyu + " / ")
+                    + (mandarin == null ? "" : mandarin)
+                     );
+                if (hanyu != null) hCount++;
+                continue;
+            }
+            // debugging switch: the Unihan-vs-ICU report below is currently disabled
+            if (true) continue;
+            if (isEqualOrNull(uPinyin, iPinyin)) continue;
+            log.println((++counter) + "\t" + Utility.hex(it.codepoint) + "\t" + code
+                + "\t" + (uPinyin == null ? "" : uPinyin)
+                + "\t" + (iPinyin == null ? "" : iPinyin)
+                + "\t" + (gPinyin == null ? "" : gPinyin)
+                + "\t" + (hanyu == null ? "" : hanyu + " / ")
+                + (mandarin == null ? "" : mandarin)
+                 );
+        }
+        log.println("kHanyuPinlu count: " + hCount);
+        
+        Collator col = Collator.getInstance(new Locale("zh","","PINYIN"));
+        UnicodeSet tailored = col.getTailoredSet().addAll(gotAtLeastOne);
+        // Tone order per vowel: macron, acute, caron, grave, plain; each vowel's
+        // rule must end with that vowel's own capital letter.
+        Collator pinyinCollator = new RuleBasedCollator(
+            "&[before 1] a < \u0101 <<< \u0100 << \u00E1 <<< \u00C1 << \u01CE <<< \u01CD << \u00E0 <<< \u00C0 << a <<< A" +
+            "&[before 1] e < \u0113 <<< \u0112 << \u00E9 <<< \u00C9 << \u011B <<< \u011A << \u00E8 <<< \u00C8 << e <<< E" +
+            "&[before 1] i < \u012B <<< \u012A << \u00ED <<< \u00CD << \u01D0 <<< \u01CF << \u00EC <<< \u00CC << i <<< I" +
+            "&[before 1] o < \u014D <<< \u014C << \u00F3 <<< \u00D3 << \u01D2 <<< \u01D1 << \u00F2 <<< \u00D2 << o <<< O" +
+            "&[before 1] u < \u016B <<< \u016A << \u00FA <<< \u00DA << \u01D4 <<< \u01D3 << \u00F9 <<< \u00D9 << u <<< U" +
+            " << \u01D6 <<< \u01D5 << \u01D8 <<< \u01D7 << \u01DA <<< \u01D9 << \u01DC <<< \u01DB << \u00FC");
+        printSortedChars("ICU_Pinyin_Sort.txt", col, tailored, reformed, kHanyuPinlu, kMandarin, pinyinCollator);
+        /*
+        MultiComparator mcol = new MultiComparator(new Comparator[] {
+                new UnicodeMapComparator(reformed, pinyinCollator), col});
+        printSortedChars("ICU_Pinyin_Sort2.txt", mcol, tailored);
+        */
+        log.close();
+    }
+    
+    /**
+     * Compares two strings by the values a UnicodeMap assigns to their first
+     * code points, delegating to another comparator for the values themselves.
+     * A string whose first code point has no value sorts before one that has.
+     */
+    static class UnicodeMapComparator implements Comparator {
+        UnicodeMap map;
+        Comparator comp;
+        UnicodeMapComparator(UnicodeMap map, Comparator comp) {
+            this.map = map;
+            this.comp = comp;
+        }
+        public int compare(Object o1, Object o2) {
+            int firstCp = UTF16.charAt((String) o1,0);
+            int secondCp = UTF16.charAt((String) o2,0);
+            Object firstValue = map.getValue(firstCp);
+            Object secondValue = map.getValue(secondCp);
+            // missing values come first; two missing values are equal
+            if (firstValue == null) {
+                return secondValue == null ? 0 : -1;
+            }
+            if (secondValue == null) {
+                return 1;
+            }
+            return comp.compare(firstValue, secondValue);
+        }
+    }
+    
+    /**
+     * Chains several comparators: each is consulted in array order until one
+     * of them reports a difference.
+     */
+    static class MultiComparator implements Comparator {
+        private Comparator[] comparators;
+    
+        public MultiComparator (Comparator[] comparators) {
+            this.comparators = comparators;
+        }
+    
+        /* Lexicographic compare. Reports the first difference found.
+         * @return zero if every comparator says equal. Otherwise +(i+1) or -(i+1),
+         * where i is the index of the first comparator that finds a difference
+         * and the sign is the sign of that comparator's result.
+         * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+         */
+        public int compare(Object arg0, Object arg1) {
+            int index = 0;
+            while (index < comparators.length) {
+                int outcome = comparators[index].compare(arg0, arg1);
+                ++index;
+                if (outcome > 0) return index;
+                if (outcome < 0) return -index;
+            }
+            return 0;
+        }
+    }
+
+    /**
+     * Writes every code point of <code>tailored</code> to <code>file</code>, one
+     * per line, in the order given by <code>col</code>. Each line holds the hex
+     * code, the character, and the value <code>map</code> has for it (empty if
+     * there is none or if <code>map</code> is null).
+     * <p>
+     * When a line's value sorts before the previous line's value according to
+     * <code>p2</code>, the line is extended with "previous &gt; current" and the
+     * bracketed <code>hanyu</code> and <code>mand</code> values of both characters.
+     *
+     * @param file name handed to Utility.openPrintWriter
+     * @param col comparator that orders the characters (as strings)
+     * @param tailored code points to print
+     * @param map code point to reading; may be null
+     * @param hanyu first source shown in the out-of-order annotation
+     * @param mand second source shown in the out-of-order annotation
+     * @param p2 comparator used to detect out-of-order readings
+     * @throws IOException presumably from opening the output file
+     */
+    private static void printSortedChars(String file, Comparator col, UnicodeSet tailored,
+         UnicodeMap map, UnicodeMap hanyu, UnicodeMap mand, Comparator p2)
+        throws IOException {
+        Set set = new TreeSet(col);
+        PrintWriter pw = Utility.openPrintWriter(file, Utility.UTF8_WINDOWS);
+        try {
+            for (UnicodeSetIterator it = new UnicodeSetIterator(tailored); it.next(); ) {
+                set.add(UTF16.valueOf(it.codepoint));
+            }
+            String lastm = "";
+            String lasts = "";
+            for (Iterator it2 = set.iterator(); it2.hasNext(); ) {
+                String s = (String)it2.next();
+                String m = map == null ? null : (String) map.getValue(UTF16.charAt(s,0));
+                if (m == null) m = "";
+                String info = m;
+                // lasts is empty only before the first character; there is no
+                // predecessor to report then, and charAt on "" would fail
+                if (lasts.length() != 0 && p2.compare(lastm,m) > 0) {
+                    info = info + "\t" + lastm + " > " + m + "\t";
+                    Object temp;
+                    temp = hanyu.getValue(UTF16.charAt(lasts,0));
+                    if (temp != null) info += "[" + temp + "]";
+                    temp = mand.getValue(UTF16.charAt(lasts,0));
+                    if (temp != null) info += "[" + temp + "]";
+                    info += " > ";
+                    temp = hanyu.getValue(UTF16.charAt(s,0));
+                    if (temp != null) info += "[" + temp + "]";
+                    temp = mand.getValue(UTF16.charAt(s,0));
+                    if (temp != null) info += "[" + temp + "]";
+                }
+                pw.println(Utility.hex(s) + "\t" + s + "\t" + info);
+                lastm = m;
+                lasts = s;
+            }
+        } finally {
+            // close the file even when a comparator or map lookup throws
+            pw.close();
+        }
+    }
+    
+    /**
+     * Loads code point to value mappings from a tab-separated UTF-8 file into
+     * <code>result</code>. Lines are trimmed; empty lines and lines starting with
+     * '#' are skipped, and a U+FEFF at the start of a line is dropped.
+     *
+     * @param dir directory handed to BagFormatter.openUTF8Reader
+     * @param file file name handed to BagFormatter.openUTF8Reader
+     * @param hexCodeFieldNumber zero-based index of the field holding the
+     * hexadecimal code point
+     * @param valueNumber zero-based index of the field holding the value
+     * @param result map that receives one entry per data line
+     * @throws IOException if the file cannot be opened or read
+     */
+    static void addField(String dir, String file, int hexCodeFieldNumber, int valueNumber, UnicodeMap result) throws IOException {
+        BufferedReader br = BagFormatter.openUTF8Reader(dir, file);
+        try {
+            while (true) {
+                String line = br.readLine();
+                if (line == null) break;
+                line = line.trim();
+                // trim() does not remove U+FEFF, so strip a leading one explicitly
+                if (line.startsWith("\uFEFF")) line = line.substring(1);
+                if (line.length() == 0 || line.startsWith("#")) continue;
+                String[] pieces = Utility.split(line,'\t');
+                result.put(Integer.parseInt(pieces[hexCodeFieldNumber], 16), pieces[valueNumber]);
+            }
+        } finally {
+            // release the reader even when a line fails to parse
+            br.close();
+        }
+    }
+    
+    /**
+     * @return true if either argument is null or if the two strings are equal
+     */
+    static boolean isEqualOrNull(String a, String b) {
+        return a == null || b == null || a.equals(b);
+    }
+    /**
+     * @return the part of s before the first occurrence of ch, or s itself
+     * when ch does not occur
+     */
+    public static String getUpTo(String s, char ch) {
+        int cut = s.indexOf(ch);
+        return cut < 0 ? s : s.substring(0, cut);
+    }
+    
+    public static void main(int typeIn) throws IOException {
+        if (typeIn == CHINESE) {
+            fixedMandarin();
+            return;
+        }
    	type = typeIn;
    	
        try {
@ -298,7 +540,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
            log.println();
            log.println("@Unihan Data");
            log.println();
+            out2 = BagFormatter.openUTF8Writer(GEN_DIR, "unihan_kmandarinDump.txt");
+            
            readUnihanData(key);
+            
+            out2.close();

            if (false) {
                readCDICT();
@ -1796,6 +2042,8 @@ Bad pinyin data: \u4E7F	?	LE
    static Map cdict = new TreeMap();
    static Map simplifiedToTraditional = new HashMap();
    static Map traditionalToSimplified = new HashMap();
+    
+    static UnicodeMap kHanyuPinlu = new UnicodeMap();
  
    static void readUnihanData(String key) throws java.io.IOException {

@ -1833,7 +2081,16 @@ Bad pinyin data: \u4E7F	?	LE
                traditionalToSimplified.put(UTF16.valueOf(code), propertyValue);
            }
            
-            if (property.equals(key) || key.equals("kJapaneseOn") && property.equals("kJapaneseKun")) {
+            if (key.equals("kMandarin") && property.equals("kHanyuPinlu")) {
+                // U+64D4   kHanyuPinlu dan1(297), dan4(61), dan5(36)
+                String[] piece = Utility.split(propertyValue,'(');
+                String pinyin = digitToPinyin(piece[0], line);
+                log.println(scode + "\t" + pinyin + "\t" + line);
+                kHanyuPinlu.put(Integer.parseInt(scode,16), pinyin);
+            }
+            if (property.equals(key) 
+                || key.equals("kJapaneseOn") && property.equals("kJapaneseKun")
+                ) {
                storeDef(out, code, propertyValue, line);
            }            
        }
@ -1885,6 +2142,7 @@ Bad pinyin data: \u4E7F	?	LE
            definition = definition.substring(0, end3);
            
            definition = digitToPinyin(definition, line);
+            out2.println(Utility.hex(cp) + '\t' + UTF16.valueOf(cp) + "\t" + definition.toLowerCase());
        }
        if (type == DEFINITION) {
            definition = removeMatched(definition,'(', ')', line);
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java
@ -13,6 +13,7 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
@ -968,35 +969,68 @@ public class MakeUnicodeFiles {
        }
    }
    
-    public static void showMatches() throws IOException {
+    public static void showAllDiff() throws IOException {
        PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "propertyDifference.txt");
        try {
-            showDifferences(out, "4.0.1", "LB", "GC");
-            showDifferences(out, "4.0.1", "East Asian Width", "LB");
-            showDifferences(out, "4.0.1", "East Asian Width", "GC");
+            UnicodeProperty.Factory fac = ToolUnicodePropertySource.make("4.0.1");
+            List props = fac.getAvailableNames(
+                  (1<<UnicodeProperty.BINARY) 
+                | (1<<UnicodeProperty.ENUMERATED) 
+                //| (1<<UnicodeProperty.CATALOG)
+                );
+            Set skipList = new HashSet();
+            skipList.add("Age");
+            skipList.add("Joining_Group");
+            skipList.add("Canonical_Combining_Class");
+            
+            for (Iterator it = props.iterator(); it.hasNext();) {
+                String prop1 = (String) it.next();
+                for (Iterator it2 = props.iterator(); it2.hasNext();) {
+                    String prop2 = (String) it2.next();
+                    if (prop1.equals(prop2)) continue;
+                    if (skipList.contains(prop2)) continue;
+                    System.out.println(prop1 + " vs. " + prop2);
+                    showDifferences(out, fac.getProperty(prop1), fac.getProperty(prop2), false);
+                    out.flush();
+                }             
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
        } finally {
            out.close();
        }
    }

    static NumberFormat nf = NumberFormat.getIntegerInstance(Locale.ENGLISH);
-    
+
+
    static void showDifferences(PrintWriter out, String version, String prop1, String prop2) throws IOException {
        UnicodeProperty p1 = ToolUnicodePropertySource.make(version).getProperty(prop1);
        UnicodeProperty p2 = ToolUnicodePropertySource.make(version).getProperty(prop2);
+        showDifferences(out, p1, p2, true);
+    }
+        
+    static void showDifferences(PrintWriter out, UnicodeProperty p1, UnicodeProperty p2, boolean doOverlaps) throws IOException {
        BagFormatter bf = new BagFormatter();
-        out.println("Comparing " + p1.getName() + " and " + p2.getName());
+        //out.println("Comparing " + p1.getName() + " and " + p2.getName());
        System.out.println("Comparing " + p1.getName() + " and " + p2.getName());
+        String pn1 = '$' + p1.getName();
+        String pn2 = '$' + p2.getName();
        UnicodeSet intersection = new UnicodeSet();
        UnicodeSet disjoint = new UnicodeSet();
+        String skip1 = p1.getValue(0xEFFFD);
+        String skip2 = p2.getValue(0xEFFFD);
            main:
            for (Iterator it1 = p1.getAvailableValues().iterator(); it1.hasNext();) {
                String v1 = (String)it1.next();
+                if (v1.equals(skip1)) continue;
                UnicodeSet s1 = p1.getSet(v1);
-                v1 += " (" + p1.getFirstValueAlias(v1) + ")";
+                if (s1.size() == 0) continue;
+                String pv1 = pn1 + (v1.equals("True") ? "" : ":" + v1);
+                //v1 += " (" + p1.getFirstValueAlias(v1) + ")";
                System.out.println(v1);
-                out.println();
-                out.println(v1 + " [" + nf.format(s1.size()) + "]");
+                //out.println();
+                //out.println(v1 + " [" + nf.format(s1.size()) + "]");

                // create some containers so that the output is organized reasonably
                String contains = "";
@ -1005,22 +1039,25 @@ public class MakeUnicodeFiles {
                Set overlapsSet = new TreeSet();
                for (Iterator it2 = p2.getAvailableValues().iterator(); it2.hasNext();) {
                    String v2 = (String)it2.next();
+                    if (v2.equals(skip2)) continue;
                    UnicodeSet s2 = p2.getSet(v2);
+                    if (s2.size() == 0) continue;
                    // v2 += "(" + p2.getFirstValueAlias(v2) + ")";
-                    v2 = p2.getFirstValueAlias(v2);
+                    //v2 = p2.getFirstValueAlias(v2);
+                    String pv2 = pn2 + (v2.equals("True") ? "" : ":" + v2);
                    if (s1.containsNone(s2)) continue;
                    if (s1.equals(s2)) {
-                        out.println("\t= " + v2);
+                        out.println(pv1 + "\t= " + pv2);
                        continue main; // since they are partitions, we can stop here
                    } else if (s2.containsAll(s1)) {
-                        out.println("\t\u2282 " + v2 + " [" + nf.format(s2.size()) + "]");
+                        // out.println(pv1 + "\t\u2282 " + pv2);
                        continue main; // partition, stop
                    } else if (s1.containsAll(s2)) {
-                        if (contains.length() != 0) contains += " \u222a ";
-                        contains += v2 + " [" + nf.format(s2.size()) + "]";
+                        if (contains.length() != 0) contains += " ";
+                        contains += pv2;
                        containsSet.addAll(s2);
                        if (containsSet.size() == s1.size()) break;
-                    } else { // doesn't contain, isn't contained
+                    } else if (doOverlaps) { // doesn't contain, isn't contained
                        if (overlaps.length() != 0) overlaps += "\r\n\t";
                        intersection.clear().addAll(s2).retainAll(s1);
                        disjoint.clear().addAll(s1).removeAll(s2);
@ -1030,7 +1067,8 @@ public class MakeUnicodeFiles {
                    } 
                }
                if (contains.length() != 0) {
-                    out.println((containsSet.size() == s1.size() ? "\t= " : "\t\u2283 ") + contains);
+                    out.println(pv1 + (containsSet.size() == s1.size() ? "\t= " 
+                        : "\t\u2283 ") + "[" + contains + "]");
                } 
                if (overlaps.length() != 0) out.println("\t" + overlaps);
                if (false && overlapsSet.size() != 0) {
@ -1152,7 +1190,7 @@ public class MakeUnicodeFiles {
    
    static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
    
-    static void testInvariants() throws IOException {
+    public static void testInvariants() throws IOException {
        PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
        out.write('\uFEFF'); // BOM
        BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
--- a/tools/unicodetools/com/ibm/text/UCD/UCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
-* $Date: 2004/03/11 19:03:16 $
-* $Revision: 1.33 $
+* $Date: 2004/06/26 00:26:16 $
+* $Revision: 1.34 $
 *
 *******************************************************************************
 */
@ -480,6 +480,41 @@ public final class UCD implements UCD_Types {
        byte numericType;
    }
    
+    /**
+     * Scans the Unihan file for this UCD's version and collects the values of
+     * one property.
+     * <p>
+     * Lines shorter than 6 characters or starting with '#' are skipped. The
+     * remaining lines are split at the first two tabs into code, property and
+     * value; the first two characters of the code field are dropped (presumably
+     * the "U+" prefix) and the rest is parsed as hexadecimal.
+     *
+     * @param propertyName Unihan property to extract, compared ignoring case
+     * @return map from code point to the trimmed property value
+     * @throws ChainException wrapping any exception raised while reading or
+     * parsing the file
+     */
+    public UnicodeMap getHanValue(String propertyName) {
+        UnicodeMap result = new UnicodeMap();
+        BufferedReader in = null;
+        try {
+            in = Utility.openUnicodeFile("Unihan", version, true, Utility.UTF8); 
+            int lineCounter = 0;
+            while (true) {
+                Utility.dot(++lineCounter);
+                
+                String line = in.readLine();
+                if (line == null) break;
+                if (line.length() < 6) continue;
+                if (line.charAt(0) == '#') continue;
+                line = line.trim();
+                
+                int tabPos = line.indexOf('\t');
+                int tabPos2 = line.indexOf('\t', tabPos+1);
+                
+                String property = line.substring(tabPos+1, tabPos2).trim();
+                if (!property.equalsIgnoreCase(propertyName)) continue;
+                
+                String scode = line.substring(2, tabPos).trim();
+                int code = Integer.parseInt(scode, 16);
+                String propertyValue = line.substring(tabPos2+1).trim();
+                result.put(code, propertyValue);
+            }
+            // a failure to close on the normal path is still reported
+            in.close();
+            in = null;
+        } catch (Exception e) {
+            throw new ChainException("Han File Processing Exception", null, e);
+        } finally {
+            if (in != null) {
+                // still open only because something above threw; that exception
+                // is the one worth reporting, so a close failure is ignored
+                try {
+                    in.close();
+                } catch (Exception ignored) {
+                }
+            }
+            Utility.fixDot();
+        }
+        return result;
+    }
+
+    
    void populateHanExceptions() {
        hanExceptions = new IntMap();
        BufferedReader in = null;
--- a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt
@ -88,3 +88,5 @@ $Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]

 # Testing
 $script:greek = $×script:greek
+$gc:lm = $script:inherited
+
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2004/04/10 16:49:19 $
-* $Revision: 1.42 $
+* $Date: 2004/06/26 00:26:16 $
+* $Revision: 1.43 $
 *
 *******************************************************************************
 */
@ -17,6 +17,7 @@ import java.util.*;
 import java.text.*;
 import java.io.*;

+import com.ibm.icu.text.Transliterator;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.Replaceable;
@ -462,18 +463,22 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
     * Splits a string containing divider into pieces, storing in output
     * and returns the number of pieces.
     */
-	public static int split(String s, char divider, String[] output) {
+	public static int split(String s, char divider, String[] output, boolean trim) {
 	    try {
            int last = 0;
            int current = 0;
            int i;
            for (i = 0; i < s.length(); ++i) {
                if (s.charAt(i) == divider) {
-                    output[current++] = s.substring(last,i);
+                    String temp = s.substring(last,i);
+                    if (trim) temp = temp.trim();
+                    output[current++] = temp;
                    last = i+1;
                }
            }
-            output[current++] = s.substring(last,i);
+            String temp = s.substring(last,i);
+            if (trim) temp = temp.trim();
+            output[current++] = temp;
            int result = current;
            while (current < output.length) {
                output[current++] = "";
@ -484,9 +489,16 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
        }
 	}

-	public static String[] split(String s, char divider) {
+    public static String[] split(String s, char divider) {
+        return split(s,divider,false);
+    }
+    public static int split(String s, char divider, String[] output) {
+        return split(s,divider,output,false);
+    }
+    
+	public static String[] split(String s, char divider, boolean trim) {
 	    String[] result = new String[100]; // HACK
-	    int count = split(s, divider, result);
+	    int count = split(s, divider, result, trim);
 	    return extract(result, 0, count);
 	}

@ -1209,4 +1221,45 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
        return (isSeparateLineIDN(start, ucd) || isSeparateLineIDN(end, ucd));
    }

+    /**
+     * Builds a Transliterator from a rules file read as UTF-8.
+     * <p>
+     * Comment handling in the line reader is switched off, a U+FEFF at the start
+     * of a line is removed, each line is optionally passed through
+     * <code>pretrans</code>, and the lines are joined with CR LF. The
+     * transliterator ID is <code>fileName</code> with everything from its last
+     * '.' onwards removed.
+     *
+     * @param fileName rules file to read; also the source of the ID
+     * @param direction passed unchanged to Transliterator.createFromRules
+     * @param pretrans applied to every line before it is added to the rules;
+     * may be null
+     * @return the transliterator created from the collected rules
+     * @throws IOException if the file cannot be opened or read
+     */
+    public static Transliterator createFromFile(String fileName, int direction, Transliterator pretrans) throws IOException {
+        StringBuffer buffer = new StringBuffer();
+        FileLineIterator fli = new FileLineIterator();
+        fli.open(fileName, Utility.UTF8);
+        try {
+            fli.commentChar = FileLineIterator.NOTCHAR; // disable comments
+            while (true) {
+                String line = fli.read();
+                if (line == null) break;
+                if (line.startsWith("\uFEFF")) line = line.substring(1);
+                if (pretrans != null) line = pretrans.transliterate(line);
+                buffer.append(line);
+                buffer.append("\r\n"); // separate with whitespace
+            }
+        } finally {
+            // release the file even when reading or pre-transliteration fails
+            fli.close();
+        }
+        
+        // Transform file name into id
+        String id = fileName;
+        int pos = id.lastIndexOf('.');
+        if (pos >= 0) id = id.substring(0, pos);
+        return Transliterator.createFromRules(id, buffer.toString(), direction);
+    }
+
 }