Updated for collation bugs; added isFCD.

X-SVN-Rev: 8886
Mark Davis 2002-06-13 21:14:05 +00:00
parent 775e63220e
commit 25561ba9b8
13 changed files with 1061 additions and 120 deletions

com/ibm/text/UCA/WriteCollationData.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2002/06/04 23:56:29 $
* $Revision: 1.17 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
@ -16,6 +16,7 @@ package com.ibm.text.UCA;
import java.util.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.CanonicalIterator;
import java.io.*;
//import java.text.*;
@ -135,7 +136,7 @@ public class WriteCollationData implements UCD_Types {
static public void writeCaseFolding() throws IOException {
System.err.println("Writing Javascript data");
BufferedReader in = Utility.openUnicodeFile("CaseFolding", UNICODE_VERSION, true);
BufferedReader in = Utility.openUnicodeFile("CaseFolding", UNICODE_VERSION, true, false);
// new BufferedReader(new FileReader(DIR31 + "CaseFolding-3.d3.alpha.txt"), 64*1024);
// log = new PrintWriter(new FileOutputStream("CaseFolding_data.js"));
log = Utility.openPrintWriter("CaseFolding_data.js", false, false);
@ -1487,6 +1488,11 @@ F900..FAFF; CJK Compatibility Ideographs
if (UCA.isImplicitLeadPrimary(primary)) {
if (relation == PRIMARY_DIFF) {
int resetCp = UCA.ImplicitToCodePoint(primary, UCA.getPrimary(ces[1]));
int[] ces2 = new int[50];
int len2 = collator.getCEs(UTF16.valueOf(resetCp), true, ces2);
relation = getStrengthDifference(ces, len, ces2, len2);
reset = quoteOperand(UTF16.valueOf(resetCp));
resetComment = ucd.getCodeAndName(resetCp);
// lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
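For context on ImplicitToCodePoint: UCA implicit primaries for CJK and unassigned code points are derived arithmetically from the code point, so a pair of primary weights can be mapped back to the code point used as the reset. A hedged sketch of the UTS #10 style computation (lead bases 0xFB40/0xFB80/0xFBC0 assumed; illustrative, not UCA.java's actual code):

    // Illustrative only -- assumes the UTS #10 implicit-weight scheme.
    static int[] codePointToImplicit(int cp, int base) {    // base: 0xFB40, 0xFB80, or 0xFBC0
        int lead  = base + (cp >> 15);                      // first primary word
        int trail = (cp & 0x7FFF) | 0x8000;                 // second primary word
        return new int[] { lead, trail };
    }

    static int implicitToCodePoint(int lead, int trail) {
        int base = (lead >= 0xFBC0) ? 0xFBC0 : (lead >= 0xFB80) ? 0xFB80 : 0xFB40;
        return ((lead - base) << 15) | (trail & 0x7FFF);
    }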
@ -1542,10 +1548,10 @@ F900..FAFF; CJK Compatibility Ideographs
if (xmlReset == 2) {
log.print("<reset>" + Utility.quoteXML(reset) + "</reset>");
}
log.print(" <" + XML_RELATION_NAMES[relation] + ">");
if (expansion.length() > 0) {
log.print("<x>" + Utility.quoteXML(expansion) + "</x>");
}
log.print(" <" + XML_RELATION_NAMES[relation] + ">");
log.print(Utility.quoteXML(chr));
log.print("</" + XML_RELATION_NAMES[relation] + ">");
} else {
@ -1631,7 +1637,7 @@ F900..FAFF; CJK Compatibility Ideographs
// static final String[] RELATION_NAMES = {" <", " <<", " <<<", " ="};
static final String[] RELATION_NAMES = {" <\t", " <<\t", " <<<\t", " =\t"};
static final String[] XML_RELATION_NAMES = {"o1", "o2", "o3", "o4"};
static final String[] XML_RELATION_NAMES = {"g1", "g2", "g3", "eq"};
static class ArrayWrapper {
int[] array;
@ -2080,16 +2086,80 @@ F900..FAFF; CJK Compatibility Ideographs
System.out.println("Sorting");
Map ordered = new TreeMap();
UCA.UCAContents ucac = collator.getContents(UCA.FIXED_CE, NFD);
Set contentsForCanonicalIteration = new TreeSet();
UCA.UCAContents ucac = collator.getContents(UCA.FIXED_CE, null); // NFD
int ccounter = 0;
while (true) {
Utility.dot(ccounter++);
String s = ucac.next();
if (s == null) break;
contentsForCanonicalIteration.add(s);
ordered.put(collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + s, s);
}
// Add canonically equivalent characters!!
System.out.println("Start Adding canonical Equivalents2");
int canCount = 0;
System.out.println("Add missing decomposibles");
for (int i = 0; i < 0x10FFFF; ++i) {
if (!ucd.isAllocated(i)) continue;
if (NFD.isNormalized(i)) continue;
if (collator.getCEType(i) >= UCA.FIXED_CE) continue;
String s = UTF16.valueOf(i);
if (contentsForCanonicalIteration.contains(s)) continue;
contentsForCanonicalIteration.add(s);
ordered.put(collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + s, s);
System.out.println(" + " + ucd.getCodeAndName(s));
canCount++;
}
Set additionalSet = new HashSet();
System.out.println("Loading canonical iterator");
CanonicalIterator canIt = new CanonicalIterator(".");
Iterator it2 = contentsForCanonicalIteration.iterator();
System.out.println("Adding any FCD equivalents that have different sort keys");
while (it2.hasNext()) {
String key = (String)it2.next();
if (key == null) {
System.out.println("Null Key");
continue;
}
canIt.setSource(key);
boolean first = true;
while (true) {
String s = canIt.next();
if (s == null) break;
if (s.equals(key)) continue;
if (contentsForCanonicalIteration.contains(s)) continue;
if (additionalSet.contains(s)) continue;
if (s.equals("\u01EC")) {
System.out.println("01ec");
}
// Skip anything that is not FCD.
if (!NFD.isFCD(s)) continue;
// We ONLY add if the sort key would be different
// Than what we would get if we didn't decompose!!
String sortKey = collator.getSortKey(s, UCA.NON_IGNORABLE);
String nonDecompSortKey = collator.getSortKey(s, UCA.NON_IGNORABLE, false);
if (sortKey.equals(nonDecompSortKey)) continue;
if (first) {
System.out.println(" " + ucd.getCodeAndName(key));
first = false;
}
System.out.println(" => " + ucd.getCodeAndName(s));
System.out.println(" old: " + collator.toString(nonDecompSortKey));
System.out.println(" new: " + collator.toString(sortKey));
canCount++;
additionalSet.add(s);
ordered.put(sortKey + '\u0000' + s, s);
}
}
System.out.println("Done Adding canonical Equivalents -- added " + canCount);
/*
for (int ch = 0; ch < 0x10FFFF; ++ch) {

com/ibm/text/UCD/BuildNames.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
* $Date: 2002/06/04 01:59:02 $
* $Revision: 1.5 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -14,6 +14,8 @@
package com.ibm.text.UCD;
import java.io.IOException;
import com.ibm.icu.text.UTF16;
//import com.ibm.text.unicode.UInfo;
import java.util.*;
import java.io.*;
@ -35,6 +37,7 @@ public class BuildNames implements UCD_Types {
static Map words = new TreeMap(new LengthFirstComparator());
static Map doubleWords = new TreeMap(new LengthFirstComparator());
static Map tripleWords = new TreeMap(new LengthFirstComparator());
static Map quadWords = new TreeMap(new LengthFirstComparator());
static Set lines = new TreeSet(new LengthFirstComparator());
static int[] letters = new int[128];
@ -44,6 +47,8 @@ public class BuildNames implements UCD_Types {
}
static String lastWord = "";
static String preLastWord = "";
static String prePreLastWord = "";
static void addWord(String word, Map words) {
Count count = (Count) words.get(word);
@ -59,15 +64,21 @@ public class BuildNames implements UCD_Types {
// doubles
if (position != 0) {
if (position > 0) {
addWord(lastWord + "/" + word, doubleWords);
}
lastWord = word;
if (position > 1) {
addWord(lastWord + "/" + word, doubleWords);
addWord(preLastWord + "/" + lastWord + "/" + word, tripleWords);
}
lastLastWord = word;
if (position > 2) {
addWord(prePreLastWord + "/" + preLastWord + "/" + lastWord + "/" + word, quadWords);
}
prePreLastWord = preLastWord;
preLastWord = lastWord;
lastWord = word;
for (int i = 0; i < word.length(); ++i) {
letters[word.charAt(i)]++;
@ -129,35 +140,76 @@ public class BuildNames implements UCD_Types {
static void collectWords() throws IOException {
String fname = "ShortNames.txt";
System.out.println("Writing " + fname);
PrintWriter log = Utility.openPrintWriter(fname, false, true);
System.out.println("Gathering data");
//Counter counter = new Counter();
String[] parts = new String[100];
//int total = 0;
int used = 0;
int sum = 0;
for (int i = 0; i < 0x10FFFF; ++i) {
if (Default.ucd.hasComputableName(i)) continue;
String name = Default.ucd.getName(i);
if (name == null) continue;
name = transform(name);
sum += name.length();
used++;
// replace numbers & letters
int len = Utility.split(name, ' ', parts);
for (int j = 0; j < len; ++j) {
stash(parts[j], j);
int longSum = 0;
for (int cp = 0; cp < 0x10FFFF; ++cp) {
if (!Default.ucd.isAllocated(cp)) continue;
if (Default.ucd.hasComputableName(cp)) continue;
Utility.dot(cp);
String name;
if (Default.ucd.isRepresented(cp)) {
name = Default.ucd.getName(cp, SHORT);
log.println(Utility.hex(cp) + " " + name);
String backName = Utility.replace(name, UCD_Names.NAME_ABBREVIATIONS, false);
if (!name.equals(backName)) {
System.out.println("Failed to recreate: " + name + ", " + backName);
}
}
// check the string, and its decomposition. This is just to get a good count.
String str = UTF16.valueOf(cp);
if (false && !Default.nfkd.isNormalized(cp)) {
str += Default.nfkd.normalize(cp);
}
int cp2;
for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp2)) {
cp2 = UTF16.charAt(str, i);
name = Default.ucd.getName(cp2, SHORT);
if (name == null) continue;
//name = transform(name);
lines.add(name);
sum += name.length();
longSum += Default.ucd.getName(cp2).length();
used++;
// replace numbers & letters
int len = Utility.split(name, ' ', parts);
for (int j = 0; j < len; ++j) {
stash(parts[j], j);
}
lines.add(name);
}
}
System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%");
System.out.println("Strings: " + sum + ", " + (lastLink*4));
log.close();
Utility.fixDot();
//System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%");
//System.out.println("Strings: " + sum + ", " + (lastLink*4));
System.out.println("Short Names sum: " + sum + ", average: " + (sum + 0.0)/used);
System.out.println("Long Names sum: " + longSum + ", average: " + (longSum + 0.0)/used);
System.out.println("Savings: " + (1 - (sum+0.0)/longSum));
printWords(words);
printWords(doubleWords);
printWords(tripleWords);
printWords(quadWords);
if (true) return;
System.out.println();
System.out.println("Compacting Words");

com/ibm/text/UCD/ConvertUCD.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
* $Date: 2002/04/24 02:38:53 $
* $Revision: 1.7 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -331,7 +331,7 @@ public final class ConvertUCD implements UCD_Types {
static void readBlocks() throws Exception {
System.out.println("Reading 'Blocks'");
BufferedReader input = Utility.openUnicodeFile(blocksname, version, true);
BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, false);
String line = "";
try {
String[] parts = new String[20];
@ -376,7 +376,7 @@ public final class ConvertUCD implements UCD_Types {
}
String tempVersion = version;
if (version.equals(UCD.latestVersion)) tempVersion = "";
BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true);
BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true, false);
if (input == null) {
System.out.println("COULDN'T OPEN: " + labels[0]);
return;

com/ibm/text/UCD/GenerateHanTransliterator.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
* $Date: 2002/05/29 02:01:00 $
* $Revision: 1.4 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -14,57 +14,416 @@
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UTF16;
import java.util.*;
public final class GenerateHanTransliterator {
public final class GenerateHanTransliterator implements UCD_Types {
static final boolean TESTING = false;
static int type;
static final int CHINESE = 2, JAPANESE = 1, DEFINITION = 0;
public static void main(int typeIn) {
type = typeIn;
Default.setUCD();
try {
System.out.println("Starting");
generate();
log = Utility.openPrintWriter("Transliterate_log.txt", false, false);
err = Utility.openPrintWriter("Transliterate_err.txt", false, false);
log.print('\uFEFF');
String key; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
String filter; // "kJis0";
String filename;
switch (type) {
case DEFINITION:
key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
filter = null; // "kJis0";
filename = "Transliterator_Han_Latin_Definition.txt";
break;
case JAPANESE:
key = "kJapaneseOn";
filter = null; // "kJis0";
filename = "Transliterator_ja_Latin.txt";
break;
case CHINESE:
key = "kMandarin";
filename = "Transliterator_Han_Latin.txt";
filter = null;
break;
default: throw new IllegalArgumentException("Unexpected option: must be 0..2");
}
readUnihanData(key, filter);
if (false) {
readCDICT();
compareUnihanWithCEDICT();
}
readFrequencyData();
out = Utility.openPrintWriter(filename, false, false);
out.println("# Convert CJK characters");
out.println("# Note: adds space between them and letters.");
out.println("{ ([:Han:]) } [:L:] > | $1 ' ';");
out.println("[\\.\\,\\?\\!\uFF0E\uFF0C\uFF1F\uFF01\u3001\u3002[:Pe:][:Pf:]] { } [:L:] > ' ';");
out.println("[:L:] { } [[:Han:][:Ps:][:Pi:]]> ' ';");
if (type == JAPANESE) {
out.println("$kata = [[\uFF9E\uFF9F\uFF70\u30FC][:katakana:]];");
out.println("$kata { } [[:L:]-$kata]> ' ';");
out.println("[[:L:]-$kata] { } $kata > ' ';");
out.println("[:hiragana:] { } [[:L:]-[:hiragana:]] > ' ';");
out.println("[[:L:]-[:hiragana:]] { } [:hiragana:]> ' ';");
}
Set gotAlready = new HashSet();
Iterator it = rankList.iterator();
Set lenSet = new TreeSet();
int rank = 0;
while (it.hasNext()) {
Comparable keyChar = (Comparable) it.next();
Comparable def = (Comparable) unihanMap.get(keyChar);
if (def == null) continue; // skipping
// sort longer definitions first!
lenSet.add(new Pair(
new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
new Pair(keyChar, def)));
gotAlready.add(keyChar);
}
// add the ones that are not ranked!
it = unihanMap.keySet().iterator();
while (it.hasNext()) {
Comparable keyChar = (Comparable) it.next();
Comparable def = (Comparable) unihanMap.get(keyChar);
if (!gotAlready.contains(keyChar)) {
lenSet.add(new Pair(
new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
new Pair(keyChar, def)));
}
}
Set gotIt = new HashSet();
it = lenSet.iterator();
while (it.hasNext()) {
Pair p = (Pair) it.next();
p = (Pair) p.second;
Comparable keyChar = p.first;
Comparable def = p.second;
String rel = gotIt.contains(def) ? " > " : " <> ";
out.println(keyChar + rel + def + ";");
//if (TESTING) System.out.println("# " + code + " > " + definition);
gotIt.add(def);
}
out.println("\u3002 <> '.';");
if (type == JAPANESE) {
out.println(":: katakana-latin;");
out.println(":: hiragana-latin;");
}
out.println(":: fullwidth-halfwidth;");
System.out.println("Total: " + totalCount);
System.out.println("Defined Count: " + count);
} catch (Exception e) {
System.out.println("Exception: " + e);
} finally {
if (log != null) log.close();
if (out != null) out.close();
if (err != null) err.close();
}
}
static PrintWriter log;
static PrintWriter out;
static PrintWriter err;
static int count;
static int totalCount;
static int oldLine;
static void generate() throws java.io.IOException {
String name = "$Han$English";
String key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
String filter = "kJis0";
String filename = "Han_English";
switch (type) {
default: break;
case 1: name = "$Han$OnRomaji";
key = "kJapaneseOn";
filter = "kJis0";
filename = "Han_Romaji";
break;
case 2: name = "$Han$Pinyin";
key = "kMandarin";
filename = "Han_Pinyin";
filter = null;
break;
static void readFrequencyData() throws java.io.IOException {
String line = "";
try {
// chinese_frequency.txt
// 1 ? 1588561 1588561 3.5008%
// japanese_frequency.txt
// 1 ? 17176
Set combinedRank = new TreeSet();
System.out.println("Reading chinese_frequency.txt");
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\chinese_frequency.txt", true);
int counter = 0;
while (true) {
line = Utility.readDataLine(br);
if (line == null) break;
if (line.length() == 0) continue;
Utility.dot(counter++);
int tabPos = line.indexOf('\t');
int rank = Integer.parseInt(line.substring(0,tabPos));
int cp = line.charAt(tabPos+1);
//if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
combinedRank.add(new Pair(new Integer(rank), UTF16.valueOf(cp)));
}
br.close();
System.out.println("Reading japanese_frequency.txt");
br = Utility.openReadFile( BASE_DIR + "dict\\japanese_frequency.txt", true);
Map japaneseMap = new HashMap();
while (true) {
line = Utility.readDataLine(br);
if (line == null) break;
if (line.length() == 0) continue;
Utility.dot(counter++);
int tabPos = line.indexOf(' ');
int tabPos2 = line.indexOf(' ', tabPos+1);
int freq = Integer.parseInt(line.substring(tabPos2+1));
for (int i = tabPos+1; i < tabPos2; ++i) {
int cp = line.charAt(i);
int script = Default.ucd.getScript(cp);
if (script != HAN_SCRIPT) {
if (script != HIRAGANA_SCRIPT && script != KATAKANA_SCRIPT) {
System.out.println("Huh: " + Default.ucd.getCodeAndName(cp));
}
continue;
}
// if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
Utility.addCount(japaneseMap, UTF16.valueOf(cp), -freq);
}
}
br.close();
// get rank order japanese
Iterator it = japaneseMap.keySet().iterator();
int countJapanese = 0;
while (it.hasNext()) {
Comparable key = (Comparable) it.next();
Comparable val = (Comparable) japaneseMap.get(key);
combinedRank.add(new Pair(new Integer(++countJapanese), key));
}
int overallRank = 0;
it = combinedRank.iterator();
while(it.hasNext()) {
Pair p = (Pair) it.next();
log.println(p.first + ", " + p.second);
Object rank = rankMap.get(p.second);
if (rank == null) {
rankMap.put(p.second, new Integer(++overallRank));
rankList.add(p.second);
}
}
log.println("@character to rank");
// get full order
it = rankList.iterator();
while (it.hasNext()) {
Comparable key = (Comparable) it.next();
Comparable val = (Comparable) rankMap.get(key);
log.println(key + ", " + val);
}
} catch (Exception e) {
throw new ChainException("Line \"{0}\"", new String[] {line}, e);
}
}
static void compareUnihanWithCEDICT() {
System.out.println("@Comparing CEDICT to Unihan");
log.println("@Comparing CEDICT to Unihan");
Iterator it = unihanMap.keySet().iterator();
List inCEDICT = new ArrayList();
List inUnihan = new ArrayList();
List inBoth = new ArrayList();
UnicodeSet noPinyin = new UnicodeSet();
UnicodeSet kPinyin = new UnicodeSet();
UnicodeSet tPinyin = new UnicodeSet();
UnicodeSet sPinyin = new UnicodeSet();
for (int i = 0; i < 0x10FFFF; ++i) {
if (!Default.ucd.isAllocated(i)) continue;
if (Default.ucd.getScript(i) != HAN_SCRIPT) continue;
Utility.dot(i);
String ch = UTF16.valueOf(i);
String pinyin = (String) unihanMap.get(ch);
if (pinyin == null) {
String ch2 = Default.nfkd.normalize(ch);
pinyin = (String) unihanMap.get(ch2);
if (pinyin != null) {
unihanMap.put(ch, pinyin);
kPinyin.add(i);
} else {
String trial = (String) simplifiedToTraditional.get(ch2);
if (trial != null) {
pinyin = (String) unihanMap.get(trial);
if (pinyin != null) {
unihanMap.put(ch, pinyin);
tPinyin.add(i);
} else {
trial = (String) traditionalToSimplified.get(ch2);
if (trial != null) {
pinyin = (String) unihanMap.get(trial);
if (pinyin != null) {
unihanMap.put(ch, pinyin);
sPinyin.add(i);
}
}
}
}
}
}
Map pinyinSet = (Map) cdict.get(ch);
if (pinyin == null) {
if (pinyinSet != null) inCEDICT.add(ch + " => " + pinyinSet);
noPinyin.add(i);
} else if (pinyinSet == null) {
inUnihan.add(ch + " => " + pinyin);
} else {
Object temp = pinyinSet.get(pinyin);
if (temp == null) {
inBoth.add(ch + " => " + pinyin + "; " + pinyinSet);
}
}
}
out = Utility.openPrintWriter("Transliterate_" + filename + ".txt", false, false);
err = Utility.openPrintWriter("Transliterate_" + filename + "_log.txt", false, false);
log.println("@In CEDICT but not Unihan: ");
printCollection(log, inCEDICT);
BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true);
log.println("@In Unihan but not CEDICT: ");
printCollection(log, inUnihan);
log.println("@In Unihan and CEDICT, but different: ");
printCollection(log, inBoth);
log.println("@Missing from Unihan: ");
log.println(noPinyin.toPattern(true));
log.println("@Has mapping if we NFKD it: ");
log.println(kPinyin.toPattern(true));
log.println("@Has mapping if we NFKC & simp-trad it: ");
log.println(tPinyin.toPattern(true));
log.println("@Has mapping if we NFKC & trad-simp it: ");
log.println(sPinyin.toPattern(true));
log.println("@Done comparison");
}
static void printCollection(PrintWriter p, Collection c) {
Iterator it = c.iterator();
int count = 0;
while (it.hasNext()) {
p.println((++count) + "\t" + it.next());
}
}
static Map rankMap = new TreeMap(); // maps from single char strings to overall rank
static List rankList = new ArrayList(10000);
static void readCDICT() throws IOException {
System.out.println("Reading cdict.txt");
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", true);
int counter = 0;
String[] pieces = new String[50];
String line = "";
try {
while (true) {
line = Utility.readDataLine(br);
if (line == null) break;
if (line.length() == 0) continue;
Utility.dot(counter++);
int tabPos = line.indexOf('[');
String word = line.substring(0,tabPos).trim();
word = Utility.replace(word, "\uFE4D", "");
word = Utility.replace(word, ".", "");
word = Utility.replace(word, "/", "");
word = Utility.replace(word, "(", "");
word = Utility.replace(word, ")", "");
int tab2Pos = line.indexOf(']', tabPos+1);
String pinyins = line.substring(tabPos+1, tab2Pos);
int len = Utility.split(pinyins, ' ', pieces);
if (word.length() != len) {
log.println("Len mismatch: " + line);
continue;
}
for (int i = 0; i < len; ++i) {
String chr = word.substring(i, i+1);
String piece = convertPinyin.transliterate(pieces[i]);
Map oldMap = (Map) cdict.get(chr);
if (oldMap == null) {
oldMap = new TreeMap();
cdict.put(chr, oldMap);
}
/*&& !oldMap.equals(piece)) {
log.println("Variant for '" + chr + "', new: '" + piece + "', old: '" + oldMap + "'");
}
*/
Utility.addCount(oldMap, piece, 1);
}
}
br.close();
Iterator it = cdict.keySet().iterator();
Set tempSet = new TreeSet();
while (it.hasNext()) {
Object key = it.next();
Map val = (Map) cdict.get(key);
log.print(key + ": ");
Iterator it2 = val.keySet().iterator();
tempSet.clear();
while (it2.hasNext()) {
Comparable key2 = (Comparable) it2.next();
Comparable count = (Comparable) val.get(key2);
Pair p = new Pair(count, key2);
tempSet.add(p); // reverse the order
}
it2 = tempSet.iterator();
int counter2 = 0;
while (it2.hasNext()) {
if (counter2++ != 0) log.print("/");
log.print(it2.next());
}
log.println();
}
} catch (Exception e) {
throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
}
}
static Map cdict = new TreeMap();
static Map simplifiedToTraditional = new HashMap();
static Map traditionalToSimplified = new HashMap();
static void readUnihanData(String key, String filter) throws java.io.IOException {
BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true);
int totalCount = 0;
int count = 0;
String oldCode = "";
String oldLine = "";
@ -81,9 +440,34 @@ public final class GenerateHanTransliterator {
if (line == null) break;
if (line.length() < 6) continue;
if (line.charAt(0) == '#') continue;
int tabPos = line.indexOf(' ');
line = line.trim();
int tabPos = line.indexOf('\t');
String code = line.substring(2, tabPos);
// gather traditional mapping
if (line.indexOf("kTraditionalVariant") >= 0) {
int tabPos2 = line.indexOf('\t', tabPos+1);
int tabPos3 = line.indexOf(' ', tabPos2+1);
if (tabPos3 < 0) tabPos3 = line.length();
String code2 = line.substring(tabPos2+3, tabPos3);
simplifiedToTraditional.put(UTF16.valueOf(Integer.parseInt(code, 16)),
UTF16.valueOf(Integer.parseInt(code2, 16)));
}
if (line.indexOf("kSimplifiedVariant") >= 0) {
int tabPos2 = line.indexOf('\t', tabPos+1);
int tabPos3 = line.indexOf(' ', tabPos2+1);
if (tabPos3 < 0) tabPos3 = line.length();
String code2 = line.substring(tabPos2+3, tabPos3);
traditionalToSimplified.put(UTF16.valueOf(Integer.parseInt(code, 16)),
UTF16.valueOf(Integer.parseInt(code2, 16)));
}
/* if (code.compareTo("9FA0") >= 0) {
System.out.println("? " + line);
}*/
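Unihan.txt records have the shape U+xxxx<tab>key<tab>value, which is what the substring arithmetic above relies on: skip the leading "U+", find the two tabs, and for the variant keys skip another "U+" inside the value. A hedged illustration on a sample record (the specific pair is only for illustration):

    String line = "U+4E07\tkTraditionalVariant\tU+842C";
    int tabPos  = line.indexOf('\t');
    String code = line.substring(2, tabPos);               // "4E07" -- drops the "U+"
    int tabPos2 = line.indexOf('\t', tabPos + 1);
    int tabPos3 = line.indexOf(' ', tabPos2 + 1);
    if (tabPos3 < 0) tabPos3 = line.length();
    String code2 = line.substring(tabPos2 + 3, tabPos3);   // "842C" -- value, also minus "U+"
    simplifiedToTraditional.put(UTF16.valueOf(Integer.parseInt(code, 16)),
                                UTF16.valueOf(Integer.parseInt(code2, 16)));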
@ -93,12 +477,15 @@ public final class GenerateHanTransliterator {
if (foundKey && foundFilter) {
count++;
/*if (true) { //*/
if (count == 1 || (count % 100) == 0) {
if (TESTING && (count == 1 || (count % 100) == 0)) {
System.out.println(count + ": " + oldLine);
}
printDef(out, oldCode, oldLine, oldStart);
storeDef(out, oldCode, oldLine, oldStart);
}
if (TESTING) if (count > 1000) {
System.out.println("ABORTING at 1000 for testing");
break;
}
if (TESTING) if (count > 1000) break;
oldCode = code;
foundKey = false;
foundFilter = (filter == null);
@ -113,16 +500,12 @@ public final class GenerateHanTransliterator {
oldStart += key.length();
}
}
if (foundKey && foundFilter) printDef(out, oldCode, oldLine, oldStart);
if (foundKey && foundFilter) storeDef(out, oldCode, oldLine, oldStart);
System.out.println("Total: " + totalCount);
System.out.println("Defined Count: " + count);
in.close();
out.close();
err.close();
}
static void printDef(PrintWriter out, String code, String line, int start) {
static void storeDef(PrintWriter out, String code, String line, int start) {
if (code.length() == 0) return;
// skip spaces & numbers at start
@ -139,39 +522,179 @@ public final class GenerateHanTransliterator {
if (end2 < 0) end2 = line.length();
if (end > end2) end = end2;
if (type != 0) {
if (type != DEFINITION) {
end2 = line.indexOf(" ", start);
if (end2 < 0) end2 = line.length();
if (end > end2) end = end2;
}
String definition = line.substring(start,end);
if (type == 2) definition = handlePinyin(definition, line);
definition.trim();
definition = definition.toLowerCase();
String cp = UTF16.valueOf(Integer.parseInt(code, 16));
String key = (String) definitionMap.get(definition);
if (key == null) {
definitionMap.put(definition, cp);
if (type == CHINESE) {
// since data are messed up, terminate after first digit
int end3 = findInString(definition, "12345")+1;
if (end3 == 0) {
log.println("Bad pinyin data: " + line);
end3 = definition.length();
}
definition = definition.substring(0, end3);
definition = convertPinyin.transliterate(definition);
}
out.println(cp + (key == null ? " <> " : " > ") + "'[" + definition + "]';");
if (type == DEFINITION) {
definition = removeMatched(definition,'(', ')', line);
definition = removeMatched(definition,'[', ']', line);
definition = definition.trim();
definition = Utility.replace(definition, " ", " ");
definition = "'[" + quoteNonLetters.transliterate(definition) + "]'";
}
definition = definition.trim();
definition = Default.ucd.getCase(definition, FULL, LOWER);
String cp = UTF16.valueOf(Integer.parseInt(code, 16));
unihanMap.put(cp, definition);
/*
String key = (String) unihanMap.get(definition);
if (key == null) {
unihanMap.put(definition, cp);
}
out.println(cp + (key == null ? " <> " : " > ") + Default.ucd.getCase(definition, FULL, TITLE) + ";");
if (TESTING) System.out.println("# " + code + " > " + definition);
*/
}
static Map definitionMap = new HashMap();
// WARNING not supplementary-safe!
static int findInString(String source, String chars) {
for (int i = 0; i < source.length(); ++i) {
if (chars.indexOf(source.charAt(i)) >= 0) return i;
}
return -1;
}
// WARNING not supplementary-safe!
static String removeMatched(String source, char start, char end, String originalLine) {
while (true) {
int pos = source.indexOf(start);
if (pos < 0) break;
int epos = source.indexOf(end, pos+1);
if (epos < 0) {
epos = source.length()-1;
log.println("Mismatches with " + start + ", " + end + ": " + originalLine);
}
source = source.substring(0,pos) + source.substring(epos+1);
}
return source;
}
static Map unihanMap = new HashMap();
static StringBuffer handlePinyinTemp = new StringBuffer();
static String handlePinyin(String source, String debugLine) {
static Transliterator hex = Transliterator.getInstance("[^\\u0020-\\u007F] hex");
static Transliterator quoteNonLetters = Transliterator.createFromRules("any-quotenonletters",
"([[\\u0021-\\u007E]-[:L:]-[\\']]) > \\u005C $1; \\' > \\'\\';", Transliterator.FORWARD);
// ADD Factory since otherwise getInverse blows out
static class DummyFactory implements Transliterator.Factory {
static DummyFactory singleton = new DummyFactory();
static HashMap m = new HashMap();
// Since Transliterators are immutable, we don't have to clone on set & get
static void add(String ID, Transliterator t) {
m.put(ID, t);
System.out.println("Registering: " + ID + ", " + t.toRules(true));
Transliterator.registerFactory(ID, singleton);
}
public Transliterator getInstance(String ID) {
return (Transliterator) m.get(ID);
}
}
static Transliterator convertPinyin;
static {
String dt = "1 > ;\n"
+ "2 <> \u0301;\n"
+ "3 <> \u0306;\n"
+ "4 <> \u0300;\n"
+ "5 <> \u0304;";
String dp = "# syllable is ...vowel+ consonant* number\n"
+ "# 'a', 'e' are the preferred bases\n"
+ "# otherwise 'o'\n"
+ "# otherwise last vowel\n"
+ "::NFC;\n"
+ "$vowel = [aAeEiIoOuUüÜ];\n"
+ "$consonant = [[a-z A-Z] - [$vowel]];\n"
+ "$digit = [1-5];\n"
+ "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
+ "([oO]) ([$vowel-[aeAE]]* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
+ "($vowel) ($consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
+ "::NFC;\n";
Transliterator at = Transliterator.createFromRules("digit-tone", dt, Transliterator.FORWARD);
System.out.println(at.transliterate("a1a2a3a4a5"));
DummyFactory.add(at.getID(), at);
convertPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
System.out.println(convertPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
}
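A hedged note on what the two rule sets just built actually do: the trailing digit is pulled off the syllable, converted to a combining mark by digit-tone, and attached to the preferred vowel ('a' or 'e', else 'o', else the last vowel) before NFC recomposes the pair:

    // Expected behaviour, given the rules above (illustrative):
    System.out.println(convertPinyin.transliterate("an2"));    // "a" + U+0301 + "n"   -> "án"
    System.out.println(convertPinyin.transliterate("oan2"));   // mark prefers the 'a' -> "oán"
    System.out.println(convertPinyin.transliterate("niu3"));   // no a/e/o: last vowel -> "niŭ"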
/*
static String convertTones(String source, String debugLine) {
try {
result = new StringBuffer();
main:
for (int i = 0; i < source.length(); ++i) {
ch = source.charAt(i);
switch (ch) {
case ':':
if (i > 0) {
char last = result.charAt(result.length()-1);
if (last == 'u') {
result.setCharAt(result.length()-1, 'ü');
continue main;
} else if (last == 'U') {
result.setCharAt(result.length()-1, 'Ü');
continue main;
}
}
break;
case '1': break; // skip character
case '2': case '3': case '4': case '5':
applyToPrecedingBase(result, ch-'0');
break;
default:
result.append(ch);
break;
}
}
}
source = source.trim();
char ch = source.charAt(source.length()-1);
int num = (int)(ch-'1');
if (num < 0 || num > 5) throw new Exception("none");
handlePinyinTemp.setLength(0);
boolean gotIt = false;
boolean messageIfNoGotIt = true;
for (int i = source.length()-2; i >= 0; --i) {
ch = source.charAt(i);
if (ch == ':') {
ch = 'Ü';
--i;
}
if ('0' <= ch && ch <= '9') break;
if (ch != 'Ü' && (ch < 'A' || ch > 'Z')) {
Utility.fixDot();
System.out.println("Warning: non-ASCII in " + hex.transliterate(source) + " (" + hex.transliterate(debugLine) + ")");
break;
}
if (!gotIt) switch (ch) {
case 'A': ch = "\u0102À\u0100".charAt(num); gotIt = true; break;
case 'E': ch = "\u0114È\u0112".charAt(num); gotIt = true; break;
@ -191,8 +714,31 @@ public final class GenerateHanTransliterator {
}
source = handlePinyinTemp.toString().toLowerCase();
} catch (Exception e) {
err.println("Bad line: " + debugLine);
log.println("Bad line: " + debugLine);
}
return source;
}
/*
A and e trump all other vowels and always take the tone mark.
There are no Mandarin syllables that contain both a and e.
In the combination ou, o takes the mark.
In all other cases, the final vowel takes the mark.
*/
/*
static String applyToPrecedingBase(StringBuffer result, int tone) {
for (int i = result.length()-1; i >= 0; --i) {
char ch = result.charAt(i);
switch (ch) {
case 'a': case 'e': case 'A': case 'E':
result.setCharAt(i, mapTone(ch, tone));
return;
case 'o': case 'O': bestSoFar = i; break;
case 'i': case 'I': case 'u': case 'U': case '
if (tone == 1) return String.valueOf(ch);
return Default.nfc.normalize(ch + mapTone[tone]);
}
static final char[] MAP_TONE = {"\u0301", "\u0306", "\u0300", "\u0304"};
*/
}

com/ibm/text/UCD/Main.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2002/06/04 01:59:02 $
* $Revision: 1.14 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.15 $
*
*******************************************************************************
*/
@ -65,13 +65,14 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability();
else if (arg.equalsIgnoreCase("hanTransliterator")) GenerateHanTransliterator.main(0);
else if (arg.equalsIgnoreCase("definitionTransliterator")) GenerateHanTransliterator.main(0);
else if (arg.equalsIgnoreCase("romajiTransliterator")) GenerateHanTransliterator.main(1);
else if (arg.equalsIgnoreCase("pinYinTransliterator")) GenerateHanTransliterator.main(2);
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
else if (arg.equalsIgnoreCase("checkBIDI")) VerifyUCD.checkBIDI();
else if (arg.equalsIgnoreCase("Buildnames")) BuildNames.main(null);
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();

com/ibm/text/UCD/Normalizer.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.9 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -100,7 +100,7 @@ public final class Normalizer implements UCD_Types {
// then compose if the form requires.
if (source.length() != 0) {
internalDecompose(source, target);
internalDecompose(source, target, true, compatibility);
if (composition) {
internalCompose(target);
}
@ -108,6 +108,23 @@ public final class Normalizer implements UCD_Types {
return target;
}
/**
* Tests whether the text is in "FCD" form: canonically reordering its
* decomposition would not change it.
* @param source the text to check
* @return true if the text is FCD
*/
public boolean isFCD(String source) {
if (source.length() == 0) return true;
StringBuffer noReorder = new StringBuffer();
StringBuffer reorder = new StringBuffer();
internalDecompose(source, noReorder, false, false);
internalDecompose(source, reorder, true, false);
return reorder.toString().equals(noReorder.toString());
}
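In other words, the text is FCD ("fast C or D") when canonically reordering its full decomposition changes nothing, so normalization-sensitive processing sees the combining marks in the same order either way. A small illustration using the two marks that TestNormalization (also in this commit) feeds to this method (U+0328 OGONEK has ccc 202, U+0304 MACRON has ccc 230):

    // Illustrative expectations:
    isFCD("a\u0328\u0304");   // true  -- ccc 202 then 230 is already canonical order
    isFCD("a\u0304\u0328");   // false -- reordering the decomposition would swap the marks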
/**
* Normalizes text according to the chosen form
* @param source the original text, unnormalized
@ -280,13 +297,13 @@ public final class Normalizer implements UCD_Types {
* @param source the original text, unnormalized
* @param target the resulting normalized text
*/
private void internalDecompose(String source, StringBuffer target) {
private void internalDecompose(String source, StringBuffer target, boolean reorder, boolean compat) {
StringBuffer buffer = new StringBuffer();
int ch32;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
buffer.setLength(0);
ch32 = UTF16.charAt(source, i);
data.getRecursiveDecomposition(ch32, buffer, compatibility);
data.getRecursiveDecomposition(ch32, buffer, compat);
// add all of the characters in the decomposition.
// (may be just the original character, if there was
@ -297,7 +314,7 @@ public final class Normalizer implements UCD_Types {
ch = UTF16.charAt(buffer, j);
int chClass = data.getCanonicalClass(ch);
int k = target.length(); // insertion point
if (chClass != 0) {
if (chClass != 0 && reorder) {
// bubble-sort combining marks as necessary
@ -466,27 +483,27 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
return isFirst.get(cp);
}
boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
boolean normalizationDiffers(int cp, boolean composition, boolean compat) {
byte dt = ucd.getDecompositionType(cp);
if (!composition) {
if (compatibility) return dt >= CANONICAL;
if (compat) return dt >= CANONICAL;
else return dt == CANONICAL;
} else {
// almost the same, except that we add back in the characters
// that RECOMPOSE
if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
if (compat) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
else return dt == CANONICAL && !canonicalRecompose.get(cp);
}
}
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compat) {
byte dt = ucd.getDecompositionType(cp);
// we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
if (dt == CANONICAL || dt > CANONICAL && compatibility) {
// we know we decompose all CANONICAL, plus > CANONICAL if compat is TRUE.
if (dt == CANONICAL || dt > CANONICAL && compat) {
String s = ucd.getDecompositionMapping(cp);
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(s, i);
getRecursiveDecomposition(cp, buffer, compatibility);
getRecursiveDecomposition(cp, buffer, compat);
}
} else {
UTF16.append(buffer, cp);

com/ibm/text/UCD/TestNormalization.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNormalization.java,v $
* $Date: 2002/04/23 01:59:14 $
* $Revision: 1.4 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -34,6 +34,13 @@ public final class TestNormalization {
public static void main(String[] args) throws java.io.IOException {
System.out.println("Creating Normalizers");
Default.setUCD();
String[] testSet = {"a\u0304\u0328", "a\u0328\u0304"};
for (int i = 0; i < testSet.length; ++i) {
String s = testSet[i];
boolean test = Default.nfc.isFCD(s);
System.out.println(test + ": " + Default.ucd.getCodeAndName(s));
}
String x = UTF32.valueOf32(0x10000);

com/ibm/text/UCD/UCD.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2002/05/29 02:01:00 $
* $Revision: 1.12 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
@ -124,20 +124,35 @@ public final class UCD implements UCD_Types {
* Get the character name.
*/
public String getName(int codePoint) {
return getName(codePoint, NORMAL);
}
/**
* Get the character name.
*/
public String getName(String s) {
return getName(s, NORMAL);
}
/**
* Get the character name.
*/
public String getName(int codePoint, byte style) {
if (style == SHORT) return get(codePoint, true).shortName;
return get(codePoint, true).name;
}
/**
* Get the character names for the code points in a string, separated by ", "
*/
public String getName(String s) {
public String getName(String s, byte style) {
if (s.length() == 1) return getName(s.charAt(0), style);
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
if (i > 0) result.append(", ");
result.append(getName(cp));
result.append(getName(cp, style));
}
return result.toString();
}
@ -977,6 +992,9 @@ to guarantee identifier closure.
result = UData.UNASSIGNED;
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
}
if (result.shortName != null && result.shortName.length() == 0) {
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
}
return result;
case 0x3400: // CJK Ideograph Extension A
case 0x4E00: // CJK Ideograph
@ -1006,13 +1024,17 @@ to guarantee identifier closure.
result = getRaw(rangeStart);
if (result == null) {
result = UData.UNASSIGNED;
if (fixStrings) result.name = "<reserved-" + Utility.hex(codePoint, 4) + ">";
if (fixStrings) {
result.name = "<reserved-" + Utility.hex(codePoint, 4) + ">";
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
}
return result;
}
result.codePoint = codePoint;
if (fixStrings) {
result.name = constructedName;
result.shortName = Utility.replace(constructedName, UCD_Names.NAME_ABBREVIATIONS);
result.decompositionMapping = result.bidiMirror
= result.simpleLowercase = result.simpleUppercase = result.simpleTitlecase = result.simpleCaseFolding
= result.fullLowercase = result.fullUppercase = result.fullTitlecase = result.fullCaseFolding
@ -1024,7 +1046,7 @@ to guarantee identifier closure.
}
return result;
}
// Hangul constants
public static final int

com/ibm/text/UCD/UCD_Names.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.13 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -782,6 +782,139 @@ final class UCD_Names implements UCD_Types {
};
static final String[] NF_NAME = {"NFD", "NFC", "NFKD", "NFKC"};
static final String[][] NAME_ABBREVIATIONS = {
{"CJK UNIFIED IDEOGRAPH-", "CJK-"},
{"CJK COMPATIBILITY IDEOGRAPH-", "CJKC-"},
{"IDEOGRAPHIC TELEGRAPH SYMBOL FOR", "ITSF."},
{"BRAILLE PATTERN DOTS-", "BPD-"},
{"CANADIAN SYLLABICS WEST-", "CSW."},
/*{"LATIN SMALL LETTER", "LSL."},
{"LATIN CAPITAL LETTER", "LCL."},
{"GREEK SMALL LETTER", "GSL."},
{"GREEK CAPITAL LETTER", "GCL."},
{"CYRILLIC SMALL LETTER", "GSL."},
{"CYRILLIC CAPITAL LETTER", "GCL."},
{"BYZANTINE MUSICAL SYMBOL", "BMS."},
{"YI SYLLABLE", "YS."},
{"ETHIOPIC SYLLABLE", "ES."},
{"HANGUL SYLLABLE", "HS."},
{"CANADIAN SYLLABICS", "CS."},
{"ARABIC LETTER", "ALt."},
{"ARABIC LIGATURE", "AL."},
*/
{"MATHEMATICAL SANS-SERIF", "MSS."},
{"MATHEMATICAL SERIF", "MS."},
{"BOLD ITALIC", "BI."},
{"ISOLATED FORM", "IF."},
{"FINAL FORM", "FF."},
{"INITIAL FORM", "IF."},
{"VOWEL SIGN", "VS."},
{"KANGXI RADICAL", "KR."},
{"MUSICAL SYMBOL", "MS."},
{"SMALL LETTER", "SL."},
{"CAPITAL LETTER", "CL."},
{"LIGATURE", "Lg."},
{"SYLLABICS", "Ss."},
{"MATHEMATICAL", "M."},
{"LETTER", "L."},
{"SYLLABLE", "S."},
{"SYMBOL", "Sy."},
{"WITH", "W."},
{"CAPITAL", "C."},
{"SMALL", "C."},
{"COMBINING", "Cm."},
{"HANGUL", "H."},
};
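To make the table's effect concrete, a hedged worked example: Utility.replace applies the pairs top-down, so the longer phrases ("SMALL LETTER", "CAPITAL LETTER") are consumed before the single-word entries are reached, and the new Utility.replace(source, replacements, true) overload runs the pairs in the opposite direction to expand the abbreviations again:

    String name      = "LATIN SMALL LETTER A WITH MACRON";                  // U+0101
    String shortName = Utility.replace(name, UCD_Names.NAME_ABBREVIATIONS);
    // "SMALL LETTER" -> "SL.", then "WITH" -> "W."  ==>  "LATIN SL. A W. MACRON"
    String back      = Utility.replace(shortName, UCD_Names.NAME_ABBREVIATIONS, true);
    // reverse pass  ==>  "LATIN SMALL LETTER A WITH MACRON" again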
/*
LETTER: 23598
MATHEMATICAL: 11976
SYLLABLE: 11872
CAPITAL: 8918
WITH: 8008
COMPATIBILITY: 7800
SMALL: 7740
IDEOGRAPH: 6165
SYLLABICS: 5670
ARABIC: 5646
CANADIAN: 5040
LATIN: 4840
SYMBOL: 4626
LIGATURE: 4048
MUSICAL: 3255
FORM: 3044
ETHIOPIC: 2760
RADICAL: 2695
HANGUL: 2670
ITALIC: 2526
YI: 2468
BOLD: 2256
BYZANTINE: 2214
COMPATIBILITY/IDEOGRAPH: 13800
YI/SYLLABLE: 12815
CANADIAN/SYLLABICS: 11340
CAPITAL/LETTER: 10948
SMALL/LETTER: 10692
CJK/COMPATIBILITY: 10200
ARABIC/LIGATURE: 7110
IDEOGRAPH/-: 6600
MUSICAL/SYMBOL: 6510
MATHEMATICAL/SANS: 5848
LATIN/SMALL: 5786
MATHEMATICAL/BOLD: 5678
ETHIOPIC/SYLLABLE: 5389
LATIN/CAPITAL: 5330
ARABIC/LETTER: 4992
BYZANTINE/MUSICAL: 4182
BRAILLE/PATTERN: 3825
ISOLATED/FORM: 3068
PATTERN/DOTS: 3060
KANGXI/RADICAL: 2996
SYLLABICS/CARRIER: 2975
-/SERIF: 2576
ITALIC/CAPITAL: 2520
BOLD/ITALIC: 2420
KATAKANA/LETTER: 2415
FINAL/FORM: 2400
SERIF/BOLD: 2300
SANS/-: 2208
ITALIC/SMALL: 2184
MONGOLIAN/LETTER: 2080
MATHEMATICAL/ITALIC: 2071
INITIAL/FORM: 2064
CYRILLIC/CAPITAL: 2032
CJK/COMPATIBILITY/IDEOGRAPH: 16200
COMPATIBILITY/IDEOGRAPH/-: 15000
LATIN/SMALL/LETTER: 9306
LATIN/CAPITAL/LETTER: 8160
MATHEMATICAL/SANS/-: 6536
BYZANTINE/MUSICAL/SYMBOL: 5904
BRAILLE/PATTERN/DOTS: 5100
CANADIAN/SYLLABICS/CARRIER: 4550
SANS/-/SERIF: 4416
PATTERN/DOTS/-: 3570
GREEK/SMALL/LETTER: 2934
CYRILLIC/CAPITAL/LETTER: 2852
-/SERIF/BOLD: 2760
MATHEMATICAL/BOLD/ITALIC: 2640
CYRILLIC/SMALL/LETTER: 2604
GREEK/CAPITAL/LETTER: 2580
CJK/COMPATIBILITY/IDEOGRAPH/-: 17400
MATHEMATICAL/SANS/-/SERIF: 8600
BRAILLE/PATTERN/DOTS/-: 5610
SANS/-/SERIF/BOLD: 3910
CANADIAN/SYLLABICS/WEST/-: 2200
IDEOGRAPHIC/TELEGRAPH/SYMBOL/FOR: 2176
-/SERIF/BOLD/ITALIC: 2090
*/
/*
static {

com/ibm/text/UCD/UData.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.3 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -18,6 +18,7 @@ import com.ibm.text.utility.*;
class UData implements UCD_Types {
String name;
String shortName = ""; // cache
String decompositionMapping;
String simpleUppercase;
String simpleLowercase;

com/ibm/text/UCD/VerifyUCD.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2002/05/31 01:41:03 $
* $Revision: 1.14 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.15 $
*
*******************************************************************************
*/
@ -843,6 +843,7 @@ can help you narrow these down.
}
static final String names[] = {"LOWER", "TITLE", "UPPER", "(UNC)", "MIXED"};
static final String names2[] = {"LOWER", "TITLE", "UPPER", "FOLD"};
static final String lowerNames[] = {"", "Other_Lower"};
static final String upperNames[] = {"", "Other_Upper"};
@ -852,13 +853,50 @@ can help you narrow these down.
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
if (!Default.ucd.isAssigned(cp) || Default.ucd.isPUA(cp)) continue;
boolean failed = false;
String fullTest = Default.ucd.getCase(Default.ucd.getCase(cp, FULL, UPPER), FULL, LOWER);
String simpleTest = Default.ucd.getCase(Default.ucd.getCase(cp, SIMPLE, UPPER), SIMPLE, LOWER);
String full = Default.ucd.getCase(cp, FULL, FOLD);
String simple = Default.ucd.getCase(cp, SIMPLE, FOLD);
boolean failed = false;
String realTest = "\u0360" + UTF16.valueOf(cp) + "\u0334";
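// U+0360 (ccc 234) and U+0334 (ccc 1) bracket cp so that NFD reordering is actually exercised.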
int ccc = Default.ucd.getCombiningClass(cp);
for (byte style = FOLD; style < CASE_LIMIT; ++style) {
String fold_NFD = Default.nfd.normalize(Default.ucd.getCase(realTest, FULL, style));
String NFD_fold = Default.ucd.getCase(Default.nfd.normalize(realTest), FULL, style);
if (!fold_NFD.equals(NFD_fold)) {
Utility.fixDot();
System.out.println("Case check fails at " + Default.ucd.getCodeAndName(cp));
System.out.println("\t" + names2[style] + ", then NFD: " + Default.ucd.getCodeAndName(fold_NFD));
System.out.println("\tNFD, then " + names2[style] + ": " + Default.ucd.getCodeAndName(NFD_fold));
failed = true;
}
}
/*
int ccc = Default.ucd.getCombiningClass(cp);
int cp2;
for (int i = 0; i < full.length(); i += UTF16.getCharCount(cp2)) {
cp2 = UTF16.charAt(full, i);
int ccc2 = Default.ucd.getCombiningClass(cp2);
if (ccc2 != ccc) {
System.out.println("Case fold CCC fails at " + Default.ucd.getCodeAndName(cp));
System.out.println("\tFull case folding:" + ccc2 + ", " + Default.ucd.getCodeAndName(full));
System.out.println("\tccc:" + ccc);
System.out.println("\tccc:" + ccc2 + ", " + Default.ucd.getCodeAndName(cp2));
failed = true;
}
}
*/
if (!full.equals(fullTest)) {
Utility.fixDot();
System.out.println("Case fold fails at " + Default.ucd.getCodeAndName(cp));

com/ibm/text/utility/Pair.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Pair.java,v $
* $Date: 2001/09/19 23:33:52 $
* $Revision: 1.3 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -49,4 +49,9 @@ public final class Pair implements java.lang.Comparable, Cloneable {
return null;
}
}
public String toString() {
return '(' + (first == null ? "null" : first.toString())
+ ',' + (second == null ? "null" : second.toString()) + ')';
}
}

com/ibm/text/utility/Utility.java

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2002/06/02 05:07:08 $
* $Revision: 1.17 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
@ -588,6 +588,39 @@ public final class Utility { // COMMON UTILITIES
public static void appendFile(String filename, boolean utf8, PrintWriter output) throws IOException {
appendFile(filename, utf8, output, null);
}
public static BufferedReader openReadFile(String filename, boolean UTF8) throws FileNotFoundException, UnsupportedEncodingException {
FileInputStream fis = new FileInputStream(filename);
InputStreamReader isr = UTF8 ? new InputStreamReader(fis, "UTF8") : new InputStreamReader(fis);
BufferedReader br = new BufferedReader(isr, 32*1024);
return br;
}
public static void addCount(Map m, Object key, int count) {
Integer oldCount = (Integer) m.get(key);
if (oldCount == null) {
m.put(key, new Integer(count));
return;
}
m.put(key, new Integer(oldCount.intValue() + count));
}
public static String readDataLine(BufferedReader br) throws IOException {
String originalLine = "";
String line = "";
try {
line = originalLine = br.readLine();
if (line == null) return null;
if (line.length() > 0 && line.charAt(0) == 0xFEFF) line = line.substring(1);
int commentPos = line.indexOf('#');
if (commentPos >= 0) line = line.substring(0, commentPos);
line = line.trim();
} catch (Exception e) {
throw new ChainException("Line \"{0}\", \"{1}\"", new String[] {originalLine, line}, e);
}
return line;
}
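These helpers (openReadFile, addCount, readDataLine) are what the new frequency and dictionary readers in GenerateHanTransliterator lean on; a minimal hedged sketch of how they combine (the file name and tab-separated layout are placeholders):

    BufferedReader br = Utility.openReadFile("some_counts.txt", true);   // hypothetical UTF-8 file
    Map counts = new HashMap();
    while (true) {
        String line = Utility.readDataLine(br);      // BOM and '#' comments already stripped
        if (line == null) break;
        if (line.length() == 0) continue;
        int tab = line.indexOf('\t');
        Utility.addCount(counts, line.substring(0, tab), 1);   // accumulate an int per key
    }
    br.close();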
public static void appendFile(String filename, boolean utf8, PrintWriter output, String[] replacementList) throws IOException {
FileInputStream fis = new FileInputStream(filename);
@ -691,10 +724,10 @@ public final class Utility { // COMMON UTILITIES
copyTextFile(filename, utf8, newName, null);
}
public static BufferedReader openUnicodeFile(String filename, String version, boolean show) throws IOException {
public static BufferedReader openUnicodeFile(String filename, String version, boolean show, boolean UTF8) throws IOException {
String name = getMostRecentUnicodeDataFile(filename, version, true, show);
if (name == null) return null;
return new BufferedReader(new FileReader(name),32*1024);
return openReadFile(name, UTF8); // new BufferedReader(new FileReader(name),32*1024);
}
public static String getMostRecentUnicodeDataFile(String filename, String version,
@ -758,6 +791,7 @@ public final class Utility { // COMMON UTILITIES
* Replaces all occurrences of piece with replacement, and returns new String
*/
public static String replace(String source, String piece, String replacement) {
if (source == null || source.length() < piece.length()) return source;
int pos = 0;
while (true) {
pos = source.indexOf(piece, pos);
@ -767,6 +801,21 @@ public final class Utility { // COMMON UTILITIES
}
}
public static String replace(String source, String[][] replacements) {
for (int i = 0; i < replacements.length; ++i) {
source = replace(source, replacements[i][0], replacements[i][1]);
}
return source;
}
public static String replace(String source, String[][] replacements, boolean reverse) {
if (!reverse) return replace(source, replacements);
for (int i = 0; i < replacements.length; ++i) {
source = replace(source, replacements[i][1], replacements[i][0]);
}
return source;
}
public static String getStack() {
Exception e = new Exception();
StringWriter sw = new StringWriter();