Changes for generating linebreak test

X-SVN-Rev: 9433
2025-04-08 06:53:45 +00:00 · 2002-07-30 09:57:18 +00:00 · 2002-07-30 09:57:18 +00:00 · 73cd203e91
commit 73cd203e91
parent a5e7872567
18 changed files with 750 additions and 136 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/BuildNames.java
+++ b/tools/unicodetools/com/ibm/text/UCD/BuildNames.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
-* $Date: 2002/06/13 21:14:05 $
-* $Revision: 1.6 $
+* $Date: 2002/07/30 09:56:41 $
+* $Revision: 1.7 $
 *
 *******************************************************************************
 */
@ -142,7 +142,7 @@ public class BuildNames implements UCD_Types {

        String fname = "ShortNames.txt";
        System.out.println("Writing " + fname);
-        PrintWriter log = Utility.openPrintWriter(fname, false, true);
+        PrintWriter log = Utility.openPrintWriter(fname, Utility.LATIN1_WINDOWS);
        
        System.out.println("Gathering data");
        //Counter counter = new Counter();
--- a/tools/unicodetools/com/ibm/text/UCD/Default.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Default.java
@ -1,6 +1,10 @@
 package com.ibm.text.UCD;
 import com.ibm.text.utility.*;
 import java.util.Date;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.TimeZone;
+

 public final class Default implements UCD_Types {
    
@ -25,5 +29,14 @@ public final class Default implements UCD_Types {
        nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdVersion);
        System.out.println("Loaded UCD" + ucd.getVersion() + " " + (new Date(ucd.getDate())));
    }
+    
+    static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd','HH:mm:ss' GMT'"); // shared formatter; NOTE(review): SimpleDateFormat is not thread-safe -- confirm callers are single-threaded
+    static {
+        myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); // format in GMT so the literal " GMT" suffix in the pattern is accurate
+    }
+    
+    public static String getDate() { // returns the current time formatted with myDateFormat (date, comma, time, " GMT")
+        return myDateFormat.format(new Date()); // new Date() is "now"; each call yields a fresh stamp
+    }

 }
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
-* $Date: 2002/05/31 01:41:04 $
-* $Revision: 1.10 $
+* $Date: 2002/07/30 09:56:41 $
+* $Revision: 1.11 $
 *
 *******************************************************************************
 */
@ -41,7 +41,7 @@ public class GenerateCaseFolding implements UCD_Types {
        PICK_SHORT = NF_CLOSURE = normalized;
        
        Default.setUCD();
-        log = Utility.openPrintWriter("CaseFoldingLog" + GenerateData.getFileSuffix(true));
+        log = Utility.openPrintWriter("CaseFoldingLog" + GenerateData.getFileSuffix(true), Utility.LATIN1_UNIX);
        System.out.println("Writing Log: " + "CaseFoldingLog" + GenerateData.getFileSuffix(true));
        
        System.out.println("Making Full Data");
@ -57,7 +57,7 @@ public class GenerateCaseFolding implements UCD_Types {
        if (normalized) filename += "-Normalized";
        String directory = "DerivedData/";
        String newFile = directory + filename + GenerateData.getFileSuffix(true);
-        PrintWriter out = Utility.openPrintWriter(newFile);
+        PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true));
        
        out.println("# CaseFolding" + GenerateData.getFileSuffix(false));
@ -444,7 +444,8 @@ public class GenerateCaseFolding implements UCD_Types {
        String suffix2 = "";
        if (normalize) suffix2 = "-Normalized";
        
-        PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions" + suffix2 + GenerateData.getFileSuffix(true));
+        PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions"
+            + suffix2 + GenerateData.getFileSuffix(true), Utility.LATIN1_UNIX);
        
        for (int ch = 0; ch <= 0x10FFFF; ++ch) {
            Utility.dot(ch);
@ -555,7 +556,7 @@ public class GenerateCaseFolding implements UCD_Types {
        
        System.out.println("Writing");
        String newFile = "DerivedData/SpecialCasing" + suffix2 + GenerateData.getFileSuffix(true);
-        PrintWriter out = Utility.openPrintWriter(newFile);
+        PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true));
        out.println("# SpecialCasing" + GenerateData.getFileSuffix(false));
        out.println(GenerateData.generateDateLine());
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
-* $Date: 2002/07/14 22:04:49 $
-* $Revision: 1.21 $
+* $Date: 2002/07/30 09:56:41 $
+* $Revision: 1.22 $
 *
 *******************************************************************************
 */
@ -15,8 +15,6 @@ package com.ibm.text.UCD;

 import java.util.*;
 import java.io.*;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;

 import com.ibm.text.utility.*;
 import com.ibm.icu.text.UTF16;
@ -28,6 +26,49 @@ public class GenerateData implements UCD_Types {
    static final boolean DEBUG = false;
    
    static final String HORIZONTAL_LINE = "# ================================================";
+    
+    static final void genSplit () { // diagnostic: prints three sets of combining marks (split, reordrant, subjoined) to System.out
+        Default.setUCD();
+        UnicodeSet split = new UnicodeSet(); // marks whose NFD form is more than one code point
+        UnicodeSet reordrant = new UnicodeSet( // hard-coded list; not derived from the UCD in this method
+            "[\u093F\u09BF\u09c7\u09c8\u0abf\u0abf\u0b47\u0bc6\u0bc7\u0bc8" // NOTE(review): U+0ABF is listed twice; one entry may have been meant as another code point (e.g. U+0A3F) -- confirm
+            + "\u0d46\u0d47\u0d48\u0dd9\u0dda\u0ddb\u1031\u17be\u17c1\u17c2\u17c3]");
+        UnicodeSet subjoined = new UnicodeSet(); // marks whose character name contains "SUBJOINED"
+        for (int i = 0; i <= 0x10FFFF; ++i) { // scan the whole code point range
+            if (!Default.ucd.isAssigned(i)) continue;
+            Utility.dot(i);
+            int cat = Default.ucd.getCategory(i);
+            if (cat != Mc && cat != Mn && cat != Me) continue; // only categories Mc, Mn, Me are considered
+            if (Default.ucd.getName(i).indexOf("SUBJOINED") >= 0) {
+                System.out.print('*'); // progress marker for each subjoined character found
+                subjoined.add(i);
+                continue; // subjoined characters are never added to split
+            }
+            String decomp = Default.nfd.normalize(i);
+            //int count = countTypes(decomp, Mc);
+            if (UTF16.countCodePoint(decomp) > 1) split.add(i); // decomposes into several code points
+        }
+        Utility.fixDot();
+        System.out.println("Split: " + split.size());
+        Utility.showSetNames("", split, false, Default.ucd);
+        
+        System.out.println("Reordrant: " + reordrant.size());
+        Utility.showSetNames("", reordrant, false, Default.ucd);
+        
+        System.out.println("Subjoined: " + subjoined.size());
+        Utility.showSetNames("", subjoined, false, Default.ucd);
+    }
+    
+    /** Counts the code points in s whose category, as reported by Default.ucd.getCategory, equals filter. */
+    static int countTypes(String s, int filter) {
+        int count = 0;
+        for (int i = 0, cp; i < s.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(s, i); // code point starting at UTF-16 index i (may span two chars)
+            // Fix: look up the category of the decoded code point, not of the string index i.
+            if (Default.ucd.getCategory(cp) == filter) count++;
+        }
+        return count;
+    }

    //static UnifiedBinaryProperty ubp
        
@ -55,12 +96,6 @@ public class GenerateData implements UCD_Types {
    }


-    static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd','HH:mm:ss' GMT'");
-
-    static {
-        myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
-    }
-
    //Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names

    public static String fixFile(String s) {
@ -108,7 +143,7 @@ public class GenerateData implements UCD_Types {
        Default.setUCD();
        String newFile = directory + fileName + getFileSuffix(true);
        System.out.println("New File: " + newFile);
-        PrintWriter output = Utility.openPrintWriter(newFile);
+        PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
        System.out.println("Most recent: " + mostRecent);
        
@ -156,7 +191,7 @@ public class GenerateData implements UCD_Types {
    public static void generateCompExclusions() throws IOException {
        Default.setUCD();
        String newFile = "DerivedData/CompositionExclusions" + getFileSuffix(true);
-        PrintWriter output = Utility.openPrintWriter(newFile);
+        PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        String mostRecent = generateBat("DerivedData/", "CompositionExclusions", getFileSuffix(true));
        
        output.println("# CompositionExclusions" + getFileSuffix(false));
@ -217,7 +252,7 @@ public class GenerateData implements UCD_Types {
    }
    
    static String generateDateLine() {
-        return "# Date: " + myDateFormat.format(new Date()) + " [MD]";
+        return "# Date: " + Default.getDate() + " [MD]";
    }

    static class CompLister extends PropertyLister {
@ -332,7 +367,7 @@ public class GenerateData implements UCD_Types {

        Utility.fixDot();
        System.out.println("Set Size: " + map.size());
-        PrintWriter output = Utility.openPrintWriter("Partition" + getFileSuffix(true));
+        PrintWriter output = Utility.openPrintWriter("Partition" + getFileSuffix(true), Utility.LATIN1_UNIX);
        
        Iterator it = map.keySet().iterator();
        while (it.hasNext()) {
@ -351,7 +386,7 @@ public class GenerateData implements UCD_Types {
    public static void listDifferences() throws IOException {

        Default.setUCD();
-        PrintWriter output = Utility.openPrintWriter("PropertyDifferences" + getFileSuffix(true));
+        PrintWriter output = Utility.openPrintWriter("PropertyDifferences" + getFileSuffix(true), Utility.LATIN1_UNIX);
        output.println("# Listing of relationships among properties, suitable for analysis by spreadsheet");
        output.println("# Generated for " + Default.ucd.getVersion());
        output.println(generateDateLine());
@ -610,7 +645,7 @@ public class GenerateData implements UCD_Types {
        
        String filename = "PropertyAliases";
        String newFile = "DerivedData/" + filename + getFileSuffix(true);
-        PrintWriter log = Utility.openPrintWriter(newFile);
+        PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        String mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
        
        log.println("# " + filename + getFileSuffix(false));
@ -626,7 +661,7 @@ public class GenerateData implements UCD_Types {
        
        filename = "PropertyValueAliases";
        newFile = "DerivedData/" + filename + getFileSuffix(true);
-        log = Utility.openPrintWriter(newFile);
+        log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
        
        log.println("# " + filename + getFileSuffix(false));
@ -642,7 +677,7 @@ public class GenerateData implements UCD_Types {
        
        filename = "PropertyAliasSummary";
        newFile = "OtherData/" + filename + getFileSuffix(true);
-        log = Utility.openPrintWriter(newFile);
+        log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        mostRecent = generateBat("OtherData/", filename, getFileSuffix(true));
        log.println();
        log.println(HORIZONTAL_LINE);
@ -793,7 +828,7 @@ public class GenerateData implements UCD_Types {
    }
    
    public static void generateBatAux(String batName, String oldName, String newName) throws IOException {
-        PrintWriter output = Utility.openPrintWriter(batName + ".bat");
+        PrintWriter output = Utility.openPrintWriter(batName + ".bat", Utility.LATIN1_UNIX);
        newName = Utility.getOutputName(newName);
        System.out.println("Writing BAT to compare " + oldName + " and " + newName);
        
@ -812,7 +847,7 @@ public class GenerateData implements UCD_Types {

        Default.setUCD();
        String newFile = directory + file + getFileSuffix(true);
-        PrintWriter output = Utility.openPrintWriter(newFile);
+        PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        String mostRecent = generateBat(directory, file, getFileSuffix(true));
        
        doHeader(file + getFileSuffix(false), output, headerChoice);
@ -881,7 +916,7 @@ public class GenerateData implements UCD_Types {
    static public void writeNormalizerTestSuite(String directory, String fileName) throws IOException {
        Default.setUCD();
        String newFile = directory + fileName + getFileSuffix(true);
-        PrintWriter log = Utility.openPrintWriter(newFile, true, false);
+        PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
        String mostRecent = generateBat(directory, fileName, getFileSuffix(true));

        String[] example = new String[256];
@ -1082,7 +1117,7 @@ public class GenerateData implements UCD_Types {
    
        Default.setUCD();
        String newFile = directory + filename + getFileSuffix(true);
-        PrintWriter log = Utility.openPrintWriter(newFile);
+        PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        String mostRecent = generateBat(directory, filename, getFileSuffix(true));
        DiffPropertyLister dpl;
        UnicodeSet cummulative = new UnicodeSet();
@ -1164,7 +1199,7 @@ public class GenerateData implements UCD_Types {
    static final void generateAge(String directory, String filename) throws IOException {
        Default.setUCD();
        String newFile = directory + filename + getFileSuffix(true);
-        PrintWriter log = Utility.openPrintWriter(newFile);
+        PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
        String mostRecent = generateBat(directory, filename, getFileSuffix(true));
        try {
            log.println("# " + filename + getFileSuffix(false));
@ -1259,7 +1294,7 @@ public class GenerateData implements UCD_Types {
    
    public static void listCombiningAccents() throws IOException {
        Default.setUCD();
-        PrintWriter log = Utility.openPrintWriter("ListAccents" + getFileSuffix(true));
+        PrintWriter log = Utility.openPrintWriter("ListAccents" + getFileSuffix(true), Utility.LATIN1_UNIX);
        Set set = new TreeSet();
        Set set2 = new TreeSet();
        
@ -1296,7 +1331,7 @@ public class GenerateData implements UCD_Types {
    
    public static void listGreekVowels() throws IOException {
        Default.setUCD();
-        PrintWriter log = Utility.openPrintWriter("ListGreekVowels" + getFileSuffix(true));
+        PrintWriter log = Utility.openPrintWriter("ListGreekVowels" + getFileSuffix(true), Utility.LATIN1_UNIX);
        Set set = new TreeSet();
        Set set2 = new TreeSet();
        
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
-* $Date: 2002/07/21 08:43:39 $
-* $Revision: 1.7 $
+* $Date: 2002/07/30 09:56:41 $
+* $Revision: 1.8 $
 *
 *******************************************************************************
 */
@ -42,7 +42,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
    
    public static void readUnihan() throws java.io.IOException {

-        log = Utility.openPrintWriter("Unihan_log.html", false, false);
+        log = Utility.openPrintWriter("Unihan_log.html", Utility.UTF8_WINDOWS);
        log.println("<body>");

        BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true); 
@ -241,6 +241,8 @@ public final class GenerateHanTransliterator implements UCD_Types {
    
    static final int CHINESE = 2, JAPANESE = 1, DEFINITION = 0;
    
+    static final boolean DO_SIMPLE = true;
+    
    public static void main(int typeIn) {
    	type = typeIn;
    	Default.setUCD();
@ -269,13 +271,20 @@ public final class GenerateHanTransliterator implements UCD_Types {
                default: throw new IllegalArgumentException("Unexpected option: must be 0..2");
            }
                
-            log = Utility.openPrintWriter("Transliterate_log.txt", false, false);
-            err = Utility.openPrintWriter("Transliterate_err.txt", false, false);
+            err = Utility.openPrintWriter("Transliterate_err.txt", Utility.UTF8_WINDOWS);
+            log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
            log.print('\uFEFF');
            
-            readUnihanData(key);
+            log.println();
+            log.println("@*DICT Data");
+            log.println();
            readCDICTDefinitions(type);
            
+            log.println();
+            log.println("@Unihan Data");
+            log.println();
+            readUnihanData(key);
+
            if (false) {
                readCDICT();
                compareUnihanWithCEDICT();
@ -283,7 +292,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
            
            readFrequencyData(type);
            
-            out = Utility.openPrintWriter(filename, false, false);
+            out = Utility.openPrintWriter(filename, Utility.UTF8_WINDOWS);
            out.println("# Start RAW data for converting CJK characters");
            /*
            out.println("# Note: adds space between them and letters.");
@ -366,21 +375,24 @@ public final class GenerateHanTransliterator implements UCD_Types {
            Set doReverse = new HashSet();
            Set gotIt = new HashSet();
            
-            it = backSet.iterator();
-            while (it.hasNext()) {
-                Pair p = (Pair) it.next();
-                p = (Pair) p.second;
-                
-                String keyChar = (String) p.first; 
-                String def = (String) p.second;
-                if (!gotIt.contains(def)) {
-                    if (unihanNonSingular) {
-                        out.println(quoteNonLetters.transliterate(keyChar) + " < " + quoteNonLetters.transliterate(def) + ";");
-                    } else {
-                        doReverse.add(keyChar);
+            if (!DO_SIMPLE) {
+                it = backSet.iterator();
+                while (it.hasNext()) {
+                    Pair p = (Pair) it.next();
+                    p = (Pair) p.second;
+                    
+                    String keyChar = (String) p.first; 
+                    String def = (String) p.second;
+                    if (!gotIt.contains(def)) {
+                        if (unihanNonSingular) {
+                            out.println(quoteNonLetters.transliterate(keyChar)
+                                + " < " + quoteNonLetters.transliterate(def) + ";");
+                        } else {
+                            doReverse.add(keyChar);
+                        }
                    }
+                    gotIt.add(def);
                }
-                gotIt.add(def);
            }
            
           
@ -391,10 +403,10 @@ public final class GenerateHanTransliterator implements UCD_Types {
                
                String keyChar = (String) p.first; 
                String def = (String) p.second;
-                String rel = doReverse.contains(keyChar) ? " <> " : " > ";
+                String rel = !DO_SIMPLE && doReverse.contains(keyChar) ? "<>" : ">";
                
                out.println(quoteNonLetters.transliterate(keyChar) + rel
-                    + quoteNonLetters.transliterate(def) + ";");
+                    + quoteNonLetters.transliterate(def) + "|\\ ;");
                    //if (TESTING) System.out.println("# " + code + " > " + definition);
            }
            
@ -413,6 +425,24 @@ public final class GenerateHanTransliterator implements UCD_Types {
            System.out.println("Total: " + totalCount);
            System.out.println("Defined Count: " + count);
            
+            log.println();
+            log.println("@Duplicates");
+            log.println();
+            it = duplicates.keySet().iterator();
+            while (it.hasNext()) {
+                String word = (String) it.next();
+                log.print(hex.transliterate(word) + "\t" + word + "\t");
+                Collection dups = (Collection) duplicates.get(word);
+                Iterator it2 = dups.iterator();
+                boolean gotFirst = false;
+                while (it2.hasNext()) {
+                    if (!gotFirst) gotFirst = true;
+                    else log.print(", ");
+                    log.print(it2.next());
+                }
+                log.println();
+            }
+            
        } catch (Exception e) {
            System.out.println("Exception: " + e);
        } finally {
@ -506,6 +536,10 @@ public final class GenerateHanTransliterator implements UCD_Types {
            int overallRank = 0;
            it = combinedRank.iterator();
            
+            log.println();
+            log.println("@Frequency data: Rank of Character");
+            log.println();
+            
            while(it.hasNext()) {
                Pair p = (Pair) it.next();
                log.println(p.first + ", " + p.second);
@ -516,7 +550,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
                }
            }

-            log.println("@character to rank");
+            log.println();
+            log.println("@Frequency data: Character to Rank");
+            log.println();
            
            // get full order
            it = rankList.iterator();
@ -871,8 +907,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
    }
    
    static void addCheck2(String word, String definition, String line) {
-        definition = Default.nfc.normalize(definition) + " ";
+        definition = Default.nfc.normalize(definition);
        word = Default.nfc.normalize(word);
+        if (DO_SIMPLE && UTF16.countCodePoint(word) > 1) return;
        
        if (pua.containsSome(word) ) {
            Utility.fixDot();
@ -881,7 +918,13 @@ public final class GenerateHanTransliterator implements UCD_Types {
            Utility.fixDot();
            System.out.println("Only numbers on: " + line);
        } else {
-            unihanMap.put(word, definition);
+            Object alreadyThere = unihanMap.get(word);
+            if (alreadyThere == null) {
+                unihanMap.put(word, definition);
+            } else if (!definition.equals(alreadyThere)) {
+                Utility.addToList(duplicates, word, alreadyThere, true);
+                Utility.addToList(duplicates, word, definition, true);
+            }
        }
        if (UTF16.countCodePoint(word) > 1) unihanNonSingular = true;
    }
@ -1025,19 +1068,28 @@ public final class GenerateHanTransliterator implements UCD_Types {
        if (end > end2) end = end2;
  
        // IF CHINESE or JAPANESE, stop at first space!!!
+        rawDefinition = rawDefinition.substring(start,end);
        
-        if (type != DEFINITION) {
-            end2 = rawDefinition.indexOf(" ", start);
-            if (end2 < 0) end2 = rawDefinition.length();
-            if (end > end2) end = end2;
+        if (type == DEFINITION) {
+            storeDef2(out, cp, rawDefinition, line);
+        } else {
+            if (rawDefinition.indexOf(' ') < 0) storeDef2(out, cp, rawDefinition, line);
+            else {
+                String [] pieces = Utility.split(rawDefinition, ' ');
+                for (int i = 0; i < pieces.length; ++i) {
+                    storeDef2(out, cp, pieces[i], line);
+                }
+            }
        }
-        
-        String definition = rawDefinition.substring(start,end);
+    }
+    
+    static void storeDef2(PrintWriter out, int cp, String definition, String line) {
        if (type == CHINESE) {
            // since data are messed up, terminate after first digit
            int end3 = findInString(definition, "12345")+1;
            if (end3 == 0) {
-                log.println("Bad pinyin data: " + rawDefinition);
+                log.println("Bad pinyin data: " + hex.transliterate(UTF16.valueOf(cp))
+                    + "\t" + UTF16.valueOf(cp) + "\t" + definition);
                end3 = definition.length();
            }
            definition = definition.substring(0, end3);
@ -1045,9 +1097,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
            definition = convertPinyin.transliterate(definition);
        }
        if (type == DEFINITION) {
-            definition = removeMatched(definition,'(', ')', rawDefinition);
-            definition = removeMatched(definition,'[', ']', rawDefinition);
-            definition = fixDefinition(definition, rawDefinition);
+            definition = removeMatched(definition,'(', ')', line);
+            definition = removeMatched(definition,'[', ']', line);
+            definition = fixDefinition(definition, line);
        }
        definition = definition.trim();
        definition = Default.ucd.getCase(definition, FULL, LOWER);
@ -1056,7 +1108,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
            Utility.fixDot();
            System.out.println("Zero value for " + Default.ucd.getCode(cp) + " on: " + hex.transliterate(line));
        } else {
-            addCheck(UTF16.valueOf(cp), definition, rawDefinition);
+            addCheck(UTF16.valueOf(cp), definition, line);
        }
        /*
        String key = (String) unihanMap.get(definition);
@ -1103,6 +1155,8 @@ public final class GenerateHanTransliterator implements UCD_Types {
    }
        
    static Map unihanMap = new HashMap();
+    static Map duplicates = new TreeMap();
+    
    static boolean unihanNonSingular = false;
    
    static StringBuffer handlePinyinTemp = new StringBuffer();
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
@ -0,0 +1,479 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
+* $Date: 2002/07/30 09:57:18 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+
+import java.util.*;
+import java.io.*;
+
+import com.ibm.text.utility.*;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+public class GenerateLineBreakTest implements UCD_Types {
+
+    static String[] samples = new String[LB_LIMIT + 3]; // one sample string per line-break class, indexed by class value; the 3 extra slots are the Jamo samples set in findSamples()
+    // Order in which the classes are emitted; each entry is an index into samples. generateTable() uses the first TABLE_LIMIT entries, main() walks all of them.
+    static byte[] TROrder = {
+        LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
+        LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
+        // missing from Pair Table
+        LB_SP, LB_BK, LB_CR, LB_LF, 
+        // resolved types below
+        LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
+        // 3 JAMO CLASSES: samples[LB_LIMIT] .. samples[LB_LIMIT+2], filled in by findSamples()
+        29, 30, 31
+    };
+    static final int TABLE_LIMIT = 25; // number of leading TROrder entries shown as rows and columns of the HTML table
+     
+    
+    /**
+     * Generates the line-break test files: LineBreakTest.html (two pair tables, one for the
+     * current rules and one for the recommended rules), then LineBreakTest_SHORT.txt (no
+     * comments) and LineBreakTest.txt (with comments). Each text file lists every ordered pair
+     * of sample strings three times: joined directly, joined by a space, and joined by the
+     * combining marks U+0301 U+0308.
+     *
+     * @param args ignored
+     * @throws IOException as declared; presumably raised when an output file cannot be opened
+     */
+    public static void main(String[] args) throws IOException {
+        Default.setUCD();
+        findSamples();
+
+        PrintWriter out = Utility.openPrintWriter("LineBreakTest.html", Utility.UTF8_WINDOWS);
+        try {
+            out.println("<html><body><h1>Current (fixed only for consistency):</h1>");
+            generateTable(out, false);
+            out.println("<h1>Recommended:</h1>");
+            generateTable(out, true);
+            out.println("</body></html>");
+        } finally {
+            // release the file even if table generation throws
+            out.close();
+        }
+
+        // do main test: k == 0 writes the short file, k == 1 the commented one
+        for (int k = 0; k < 2; ++k) {
+            boolean comments = k != 0;
+            out = Utility.openPrintWriter(comments ? "LineBreakTest.txt" : "LineBreakTest_SHORT.txt", Utility.UTF8_WINDOWS);
+            try {
+                // counts (before, after) pairs; drives the progress dots and the trailer line
+                int counter = 0;
+
+                out.println("# Default Linebreak conformance test");
+                out.println("# " + Default.getDate() + ", MED");
+                out.println("#");
+
+                for (int ii = 0; ii < samples.length; ++ii) {
+                    String before = samples[TROrder[ii]];
+                    for (int jj = 0; jj < samples.length; ++jj) {
+                        Utility.dot(counter++);
+                        String after = samples[TROrder[jj]];
+                        // do line straight, then with a space, then with combining marks
+                        printLine(out, before, "", after, comments);
+                        printLine(out, before, " ", after, comments);
+                        printLine(out, before, "\u0301\u0308", after, comments);
+                    }
+                }
+                // NOTE(review): three lines are written per pair, so this trailer reports the
+                // number of pairs rather than the number of data lines - confirm which is intended.
+                out.println("# Lines: " + counter);
+            } finally {
+                out.close();
+            }
+        }
+    }
+    
+    /**
+     * Writes an HTML table of break behaviour for the first TABLE_LIMIT classes of TROrder.
+     * Rows are the class before the break position, columns the class after it. Each cell
+     * holds the symbol from getTableEntry and carries the deciding rule label as its title.
+     *
+     * @param out writer receiving the HTML
+     * @param recommended if true, use the recommended rules and highlight (yellow background)
+     *        every cell whose symbol differs from the one produced by the current rules
+     */
+    public static void generateTable(PrintWriter out, boolean recommended) {
+        out.print("<table border='1' cellspacing='0'><tr><th></th>");
+        for (int col = 0; col < TABLE_LIMIT; ++col) {
+            out.print("<th>" + getLBID(samples[TROrder[col]]) + "</th>");
+        }
+        out.print("</tr>");
+
+        String[] cellRule = new String[1];
+        String[] unusedRule = new String[1]; // receives the rule of the comparison call; never read
+        for (int row = 0; row < TABLE_LIMIT; ++row) {
+            String rowSample = samples[TROrder[row]];
+            StringBuffer html = new StringBuffer("<tr><th>");
+            html.append(getLBID(rowSample)).append("</th>");
+            for (int col = 0; col < TABLE_LIMIT; ++col) {
+                String colSample = samples[TROrder[col]];
+                String entry = getTableEntry(rowSample, colSample, recommended, cellRule);
+                boolean changed = recommended
+                    && !entry.equals(getTableEntry(rowSample, colSample, false, unusedRule));
+                html.append("<th title='").append(cellRule[0]).append("'");
+                if (changed) {
+                    html.append(" bgcolor='#FFFF00'");
+                }
+                html.append(">").append(entry).append("</th>");
+            }
+            html.append("</tr>");
+            out.println(html.toString());
+        }
+        out.println("</table>");
+    }
+    
+    /**
+     * Classifies the pair (before, after) for the pair table by probing three positions with
+     * isBreak: between the two strings directly, and before and after a single space placed
+     * between them.
+     *
+     * @param before sample string for the class on the left
+     * @param after sample string for the class on the right
+     * @param recommended passed through to isBreak
+     * @param ruleOut element 0 receives the rule label(s) for the entry; the static field
+     *        rule is left holding the same value
+     * @return "_" if a direct break is allowed; "%" if there is no direct break but a break
+     *         at one of the two positions around the space; "^" if none of the three
+     *         positions allows a break
+     */
+    public static String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
+        String spaced = before + " " + after;
+        int split = before.length();
+
+        // isBreak reports its deciding rule through the static field, so capture it after each call
+        boolean breakAfterSpace = isBreak(spaced, split + 1, recommended);
+        String afterSpaceRule = rule;
+
+        boolean breakBeforeSpace = isBreak(spaced, split, recommended);
+        String beforeSpaceRule = rule;
+
+        boolean directBreak = isBreak(before + after, split, recommended);
+        String directRule = rule;
+
+        String entry;
+        if (directBreak) {
+            entry = "_"; // rule still holds directRule from the last isBreak call
+        } else if (breakAfterSpace || breakBeforeSpace) {
+            entry = "%";
+            rule = directRule;
+        } else {
+            entry = "^";
+            // list each distinct rule once: after-space rule, direct rule, before-space rule
+            rule = afterSpaceRule.equals(directRule) ? directRule : afterSpaceRule + "/" + directRule;
+            boolean alreadyListed = beforeSpaceRule.equals(directRule) || beforeSpaceRule.equals(afterSpaceRule);
+            if (!alreadyListed) {
+                rule += "/" + beforeSpaceRule;
+            }
+        }
+        ruleOut[0] = rule;
+        return entry;
+    }
+    
+    
+    /**
+     * Writes one test line of the form "hex(before + filler); b|n; hex(after)", where b means
+     * isBreak (current rules) allows a break between filler and after, and n means it does not.
+     *
+     * @param out writer receiving the line
+     * @param before text preceding the filler
+     * @param filler text inserted between before and after (may be empty)
+     * @param after text following the candidate break position
+     * @param comments if true, append the line-break class IDs and the character names of
+     *        both sides as "#" comments
+     */
+    public static void printLine(PrintWriter out, String before, String filler, String after, boolean comments) {
+        int offset = before.length() + filler.length();
+        String left = before + filler;
+        String tlb = isBreak(left + after, offset, false) ? "b" : "n";
+
+        String comment;
+        if (comments) {
+            String verdict = " " + tlb + " ";
+            comment = " # " + getLBID(left) + verdict + getLBID(after)
+                + " # " + Default.ucd.getName(left) + verdict + Default.ucd.getName(after);
+        } else {
+            comment = "";
+        }
+
+        out.println(Utility.hex(left) + "; " + tlb + "; " + Utility.hex(after) + comment);
+    }
+
+    /**
+     * Fills samples with, for each line-break class, the lowest allocated code point (starting
+     * at U+0001) that has that class and is not a leading, vowel or trailing Jamo. The three
+     * slots from LB_LIMIT on are then set to fixed Jamo samples: U+1100, U+1162 and U+11A8.
+     */
+    public static void findSamples() {
+        int cp = 0;
+        while (++cp <= 0x10FFFF) {
+            if (!Default.ucd.isAllocated(cp)) continue;
+            boolean conjoiningJamo = Default.ucd.isLeadingJamo(cp)
+                || Default.ucd.isVowelJamo(cp)
+                || Default.ucd.isTrailingJamo(cp);
+            if (conjoiningJamo) continue;
+            int lbClass = Default.ucd.getLineBreak(cp);
+            if (samples[lbClass] == null) {
+                samples[lbClass] = UTF16.valueOf(cp);
+            }
+        }
+        // fill the last with special cases
+        int jamoSlot = LB_LIMIT;
+        samples[jamoSlot] = "\u1100";
+        samples[jamoSlot + 1] = "\u1162";
+        samples[jamoSlot + 2] = "\u11A8";
+    }
+       
+
+    /**
+     * Returns the line-break class IDs of the code points in s, separated by single spaces.
+     *
+     * @param s string to describe; a one-unit string is looked up directly by its char
+     * @return the space-separated IDs, or the empty string if s is empty
+     */
+    public static String getLBID(String s) {
+        final int length = s.length();
+        if (length == 1) return Default.ucd.getLineBreakID(s.charAt(0));
+
+        StringBuffer ids = new StringBuffer();
+        int pos = 0;
+        while (pos < length) {
+            int codePoint = UTF32.char32At(s, pos);
+            if (pos > 0) ids.append(" ");
+            ids.append(Default.ucd.getLineBreakID(codePoint));
+            pos += UTF32.count16(codePoint);
+        }
+        return ids.toString();
+    }
+       
+    static String rule; // label of the rule last tested by isBreak; overwritten by every isBreak call and rewritten by getTableEntry
+
+    /**
+     * Scans backwards from the character preceding the one just before offset and returns the
+     * first code point whose resolved line-break class differs from notLBType. The character
+     * immediately before offset is always skipped; callers have already examined it.
+     *
+     * @param source text to scan
+     * @param offset UTF-16 index of the break position being examined
+     * @param notLBType line-break class to skip over
+     * @return the code point found, or 0 if the start of the text is reached first (so a
+     *         result of U+0000 cannot be told apart from "not found")
+     */
+    public static int findLastNon(String source, int offset, byte notLBType) {
+        if (offset < 1) return 0;
+        // Step over the whole character before offset. It may occupy two UTF-16 units, in
+        // which case starting at offset-2 would re-read that same character and then jump
+        // over the code unit in front of it.
+        int i = offset - 1 - UTF16.getCharCount(UTF16.charAt(source, offset - 1));
+        while (i >= 0) {
+            // i addresses the last unit of a character; UTF16.charAt yields the full code point
+            int cp = UTF16.charAt(source, i);
+            if (getResolvedLB(cp) != notLBType) return cp;
+            i -= UTF16.getCharCount(cp);
+        }
+        return 0;
+    }
+
+    /**
+     * Returns the line-break class of cp after the resolution step of rule LB 1.
+     * SA and XX are resolved to AL. AI is deliberately left as AI, and CB and SG are returned
+     * unchanged (their resolutions are disabled; surrogates are not expected to occur).
+     *
+     * @param cp code point to classify
+     * @return the resolved line-break class
+     */
+    public static byte getResolvedLB (int cp) {
+        // LB 1  Assign a line break category to each character of the input.
+        // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
+        byte raw = Default.ucd.getLineBreak(cp);
+        if (raw == LB_SA || raw == LB_XX) {
+            return LB_AL;
+        }
+        return raw;
+    }
+
+    /**
+     * Finds out whether a line break is allowed in source immediately before the UTF-16 code
+     * unit at offset, by testing the numbered line-break rules in order; the first rule that
+     * decides the outcome wins.
+     * <p>WARNING: as a side effect, sets the static field "rule" to the label of the rule that
+     * was being tested when the method returned.
+     *
+     * @param source text to examine
+     * @param offset UTF-16 index of the candidate break position
+     * @param recommended if true, use the alternative rule ordering: the GL rule is tested
+     *        before "break after spaces" (label 11b instead of 13), breaks around CB are added
+     *        (label 14a), and "break after HY / before BB" is tested after rule 18 (label 18b
+     *        instead of 15b)
+     * @return true if a break is allowed at offset, false otherwise
+     */
+    public static boolean isBreak(String source, int offset, boolean recommended) {
+
+        // LB 1  Assign a line break category to each character of the input.
+        // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
+        // this is taken care of in the getResolvedLB function
+
+        // LB 2a  Never break at the start of text
+
+        rule = "2a";
+        if (offset <= 0) return false;
+
+        // LB 2b  Always break at the end of text
+
+        rule = "2b";
+        if (offset >= source.length()) return true;
+
+
+        // UTF-16: never break in the middle of a code point
+        // (no label of its own: rule still reads "2b" when this returns)
+        if (UTF16.isLeadSurrogate(source.charAt(offset-1))
+            && UTF16.isTrailSurrogate(source.charAt(offset))) return false;
+
+
+        // now get the character before and after, and their types
+
+
+        int cpBefore = UTF16.charAt(source, offset-1);
+        int cpAfter = UTF16.charAt(source, offset);
+
+        byte before = getResolvedLB(cpBefore);
+        byte after = getResolvedLB(cpAfter);
+
+
+        rule = "3a";
+        // Always break after hard line breaks (but never between CR and LF).
+        // CR ^ LF
+        if (before == LB_CR && after == LB_LF) return false;
+        if (before == LB_BK || before == LB_LF || before == LB_CR) return true;
+
+        //LB 3b  Don’t break before hard line breaks.
+        rule = "3b";
+        if (after == LB_BK || after == LB_LF || after == LB_CR) return false;
+
+        // LB 4  Don’t break before spaces or zero-width space.
+        // × SP
+        // × ZW
+
+        rule = "4";
+        if (after == LB_SP || after == LB_ZW) return false;
+
+        // LB 5 Break after zero-width space.
+        // ZW ÷
+        rule = "5";
+        if (before == LB_ZW) return true;
+
+        // LB 6  Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos).
+        rule = "6";
+        if (after == LB_CM) return false;
+        if (Default.ucd.isLeadingJamo(cpBefore)) {
+            if (Default.ucd.isLeadingJamo(cpAfter) || Default.ucd.isVowelJamo(cpAfter)) return false;
+        } else if (Default.ucd.isVowelJamo(cpBefore)) {
+            if (Default.ucd.isVowelJamo(cpAfter) || Default.ucd.isTrailingJamo(cpAfter)) return false;
+        } else if (Default.ucd.isTrailingJamo(cpBefore)) {
+            if (Default.ucd.isTrailingJamo(cpAfter)) return false;
+        }
+
+        // A combining mark takes the class of its base: look back past the run of CMs.
+        // If the text starts with CMs (no base), treat them as ID.
+        boolean setBase = false;
+        if (before == LB_CM) {
+            setBase = true;
+            int cp = findLastNon(source, offset, LB_CM);
+            if (cp == 0) {
+                before = LB_ID;
+            } else {
+                before = getResolvedLB(cp);
+            }
+        }
+
+        // LB 7  In all of the following rules, if a space is the base character for a combining mark,
+        // the space is changed to type ID. In other words, break before SP CM* in the same cases as
+        // one would break before an ID.
+        rule = "7";
+        if (setBase && before == LB_SP) before = LB_ID;
+
+        // LB 8  Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’,  even after spaces.
+        // × CL, × EX, × IS, × SY
+        rule = "8";
+        if (after == LB_CL || after == LB_EX || after == LB_SY || after == LB_IS) return false;
+
+
+        // find the last non-space character; we will need it
+        // NOTE(review): findLastNon is asked to skip CM, not SP. With a single space before
+        // offset this yields the character in front of the space (looking through any CMs),
+        // but with two or more spaces it returns a space, so the "SP*" rules 9-11a below do
+        // not see the character before the run of spaces. Left unchanged here: skipping SP
+        // instead would stop "OP CM SP" from resolving to OP - confirm the intended behaviour.
+        byte lastNonSpace = before;
+        if (lastNonSpace == LB_SP) {
+            int cp = findLastNon(source, offset, LB_CM);
+            if (cp != 0) {
+                lastNonSpace = getResolvedLB(cp);
+            }
+        }
+
+        // LB 9  Don’t break after ‘[’, even after spaces.
+        // OP SP* ×
+        rule = "9";
+        if (lastNonSpace == LB_OP) return false;
+
+        // LB 10  Don’t break within ‘”[’, even with intervening spaces.
+        // QU SP* × OP
+        rule = "10";
+        if (lastNonSpace == LB_QU && after == LB_OP) return false;
+
+        // LB 11  Don’t break within ‘]h’, even with intervening spaces.
+        // CL SP* × NS
+        rule = "11";
+        if (lastNonSpace == LB_CL && after == LB_NS) return false;
+
+        // LB 11a  Don’t break within ‘——’, even with intervening spaces.
+        // B2 × B2
+        rule = "11a";
+        if (lastNonSpace == LB_B2 && after == LB_B2) return false;
+
+
+        if (recommended) {
+            // LB 11b (recommended placement of LB 13, ahead of "break after spaces")
+            // Don’t break before or after NBSP or WORD JOINER
+            // × GL
+            // GL ×
+
+            rule = "11b";
+            if (after == LB_GL || before == LB_GL) return false;
+        }
+
+        // [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
+
+        rule = "12";
+        // LB 12  Break after spaces
+        // SP ÷
+
+        if (before == LB_SP) return true;
+
+        if (!recommended) {
+            // LB 13  Don’t break before or after NBSP or WORD JOINER
+            // × GL
+            // GL ×
+
+            rule = "13";
+            if (after == LB_GL || before == LB_GL) return false;
+        }
+
+        rule = "14";
+        // LB 14  Don’t break before or after ‘”’
+        // × QU
+        // QU ×
+        if (before == LB_QU || after == LB_QU) return false;
+
+        if (recommended) {
+            // LB 14a  Break before and after CB
+            // CB ÷
+            // ÷ CB
+            // Give this rule its own label; previously such breaks were reported as "14".
+            rule = "14a";
+            if (before == LB_CB || after == LB_CB) return true;
+        }
+
+        // LB 15  Don’t break before hyphen-minus, other hyphens, fixed-width spaces,
+        // small kana and other non- starters,  or after acute accents:
+        // × BA
+        // × HY
+        // × NS
+        // BB ×
+
+        rule = "15";
+        if (after == LB_NS) return false;
+        if (after == LB_HY) return false;
+        if (after == LB_BA) return false;
+        if (before == LB_BB) return false;
+
+        if (!recommended) {
+            // LB 15b  Break after hyphen-minus, and before acute accents:
+            // HY ÷
+            // ÷ BB
+
+            rule = "15b";
+            if (before == LB_HY) return true;
+            if (after == LB_BB) return true;
+        }
+
+        // LB 16  Don’t break between two ellipses, or between letters or numbers and ellipsis:
+        // AL × IN
+        // ID × IN
+        // IN × IN
+        // NU × IN
+        // Examples: ’9...’, ‘a...’, ‘H...’
+        rule = "16";
+        if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
+        if (before == LB_IN && after == LB_IN) return false;
+
+        // Don't break alphanumerics.
+        // LB 17  Don’t break within ‘a9’, ‘3a’, or ‘H%’
+        // ID × PO
+        // AL × NU
+        // NU × AL
+        // Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ?  PO ?
+        // Examples:   $(12.35)    2,1234    (12)¢    12.54¢
+        // This is approximated with the following rules. (Some cases already handled above,
+        // like ‘9,’, ‘[9’.)
+        rule = "17";
+        if (before == LB_ID && after == LB_PO) return false;
+        if (before == LB_AL && after == LB_NU) return false;
+        if (before == LB_NU && after == LB_AL) return false;
+
+        // LB 18  Don’t break between the following pairs of classes.
+        // CL × PO
+        // HY × NU
+        // IS × NU
+        // NU × NU
+        // NU × PO
+        // PR × AL
+        // PR × HY
+        // PR × ID
+        // PR × NU
+        // PR × OP
+        // SY × NU
+        // Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’,  ‘9%’ ‘]%’
+
+        rule = "18";
+        if (before == LB_CL && after == LB_PO) return false;
+        if (before == LB_HY && after == LB_NU) return false;
+        if (before == LB_IS && after == LB_NU) return false;
+        if (before == LB_NU && after == LB_NU) return false;
+        if (before == LB_NU && after == LB_PO) return false;
+
+        if (before == LB_PR && after == LB_AL) return false;
+        if (before == LB_PR && after == LB_HY) return false;
+        if (before == LB_PR && after == LB_ID) return false;
+        if (before == LB_PR && after == LB_NU) return false;
+        if (before == LB_PR && after == LB_OP) return false;
+
+        if (before == LB_SY && after == LB_NU) return false;
+
+        if (recommended) {
+            // LB 18b (recommended placement of LB 15b, after the no-break pairs of LB 18)
+            // Break after hyphen-minus, and before acute accents:
+            // HY ÷
+            // ÷ BB
+
+            rule = "18b";
+            if (before == LB_HY) return true;
+            if (after == LB_BB) return true;
+        }
+
+        // LB 19  Don’t break between alphabetics (“at”)
+        // AL × AL
+
+        rule = "19";
+        if (before == LB_AL && after == LB_AL) return false;
+
+        // LB 20  Break everywhere else
+        // ALL ÷
+        // ÷ ALL
+
+        rule = "20";
+        return true;
+    }
+}
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
-* $Date: 2002/04/23 22:50:15 $
-* $Revision: 1.1 $
+* $Date: 2002/07/30 09:56:41 $
+* $Revision: 1.2 $
 *
 *******************************************************************************
 */
@ -81,7 +81,7 @@ public class GenerateThaiBreaks {
        System.out.println("initials size: " + initials.size());
        System.out.println("finals size: " + finals.size());
        
-        out = Utility.openPrintWriter("ThaiData.txt", false, false);
+        out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
        out.write('\uFEFF');
        out.println("Only Initials");
        Utility.print(out, initials, ", ", new MyBreaker());
--- a/tools/unicodetools/com/ibm/text/UCD/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Main.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
-* $Date: 2002/07/14 22:04:49 $
-* $Revision: 1.18 $
+* $Date: 2002/07/30 09:56:41 $
+* $Revision: 1.19 $
 *
 *******************************************************************************
 */
@ -59,7 +59,7 @@ public final class Main implements UCD_Types {
            } else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{Default.ucdVersion});
            else if (arg.equalsIgnoreCase("version")) Default.setUCD(args[++i]);
            else if (arg.equalsIgnoreCase("statistics")) VerifyUCD.statistics();
-            else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null);
+            else if (arg.equalsIgnoreCase("NFSkippable")) NFSkippable.main(null);
            else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
            else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
            else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
@ -77,7 +77,12 @@ public final class Main implements UCD_Types {
            else if (arg.equalsIgnoreCase("Buildnames")) BuildNames.main(null);
            else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
            
+            
+            else if (arg.equalsIgnoreCase("linebreaktest")) GenerateLineBreakTest.main(null);

+            else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
+            else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();
+            
            else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
            else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
            else if (arg.equalsIgnoreCase("checkCase3")) VerifyUCD.checkCase3();
--- a/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java
+++ b/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java
@ -12,14 +12,15 @@ public final class NFSkippable extends UnicodeProperty {
    
    private Normalizer nf;
    private Normalizer nfd;
+    private UCD ucd;
    private boolean composes;
    private int[] realTrailers = new int[100];
    private int realTrailerCount = 0;
    
-    public NFSkippable(byte normalizerMode, String unicodeVersion) {
+    public NFSkippable(byte normalizerMode, UCD inputUCD) {
        isStandard = false;
-        ucd = UCD.make(unicodeVersion);
-        nf = new Normalizer(normalizerMode, unicodeVersion);
+        this.ucd = inputUCD;
+        nf = new Normalizer(normalizerMode, ucd.getVersion());
        name = nf.getName() + "_Skippable";
        shortName = nf.getName() + "_Skip";
        header = "# Derived Property: " + name
@ -28,7 +29,7 @@ public final class NFSkippable extends UnicodeProperty {
            + "\r\n#   WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
            + "\r\n#            The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";

-        nfd = new Normalizer(Normalizer.NFD, unicodeVersion);
+        nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
        composes = normalizerMode == Normalizer.NFC || normalizerMode == Normalizer.NFKC;
        
        // preprocess to find possible trailers
@ -36,7 +37,7 @@ public final class NFSkippable extends UnicodeProperty {
        if (composes) for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) {
            if (nf.isTrailing(cp2)) {
                //System.out.println("Trailing: " + ucd.getCodeAndName(cp2));
-                if (ucd.isTrailingJamo(cp2)) {
+                if (ucd.isNonLeadJamo(cp2)) {
                    //System.out.println("Jamo: " + ucd.getCodeAndName(cp2));
                    continue;
                }
@ -190,18 +191,21 @@ public final class NFSkippable extends UnicodeProperty {
    static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller
    
    public static void main (String[] args) throws java.io.IOException {
+        Default.setUCD();
        
-        String version = ""; // Unicode version, "" = latest released
-        
-        PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt");
+        PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt", Utility.UTF8_WINDOWS);
+        out.println("NFSafeSets");
+        out.println("Version: " + Default.ucd.getVersion());
+        out.println("Date: " + Default.getDate());
+        out.println();
        
        for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) {
-            UnicodeProperty up = DerivedProperty.make(mode, UCD.make(version));
+            UnicodeProperty up = DerivedProperty.make(mode, Default.ucd);
            generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up);
        }
        
        for (byte mode = NFD; mode <= NFKC; ++mode) {
-            NFSkippable skipper = new NFSkippable(mode,version);
+            NFSkippable skipper = new NFSkippable(mode, Default.ucd);
            generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
        }
        
@ -219,9 +223,9 @@ public final class NFSkippable extends UnicodeProperty {
            
        String rSet = result.toPattern(true);          
        rSet = replace(rSet, "\\U", "\\\\U");
+        rSet = replace(rSet, "\\u", "\\\\u");
        out.println(label + " = new UnicodeSet(");
        writeStringInPieces(out, rSet, ", false);");
-        out.println();
            
        rSet = result.toPattern(false);
        out.println("/*Unicode: ");
--- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
-* $Date: 2002/06/24 15:25:10 $
-* $Revision: 1.12 $
+* $Date: 2002/07/30 09:56:41 $
+* $Revision: 1.13 $
 *
 *******************************************************************************
 */
@ -271,7 +271,7 @@ public final class Normalizer implements UCD_Types {
        }
        for (int i = UCD.LBase; i < UCD.TLimit; ++i) {
            if (leading != null && UCD.isLeadingJamo(i)) leading.set(i); // set all initial Jamo (that form syllables)
-            if (trailing != null && UCD.isTrailingJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
+            if (trailing != null && UCD.isNonLeadJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
        }
        if (leading != null) {
            for (int i = UCD.SBase; i < UCD.SLimit; ++i) {
@ -407,7 +407,7 @@ public final class Normalizer implements UCD_Types {
            for (int i = 0; i < 0x10FFFF; ++i) {
                if (!ucd.isAssigned(i)) continue;
                if (ucd.isPUA(i)) continue;
-                if (ucd.isTrailingJamo(i)) isSecond.set(i);
+                if (ucd.isNonLeadJamo(i)) isSecond.set(i);
                if (ucd.isLeadingJamoComposition(i)) isFirst.set(i);
                byte dt = ucd.getDecompositionType(i);
                if (dt != CANONICAL) continue;
--- a/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java
+++ b/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java
@ -246,7 +246,7 @@ public class NormalizerSample implements UCD_Types {
            for (int i = 0; i < 0x10FFFF; ++i) {
                if (!ucd.isAssigned(i)) continue;
                if (ucd.isPUA(i)) continue;
-                if (ucd.isTrailingJamo(i)) isSecond.set(i);
+                if (ucd.isNonLeadJamo(i)) isSecond.set(i);
                byte dt = ucd.getDecompositionType(i);
                if (dt != CANONICAL) continue;
                if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2001/12/13 23:35:57 $
-* $Revision: 1.8 $
+* $Date: 2002/07/30 09:56:40 $
+* $Revision: 1.9 $
 *
 *******************************************************************************
 */
@ -126,12 +126,6 @@ public class TestData implements UCD_Types {
    }


-    static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
-
-    static {
-        myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
-    }
-
    //Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names

    public static String fixFile(String s) {
--- a/tools/unicodetools/com/ibm/text/UCD/UCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
-* $Date: 2002/06/22 01:21:09 $
-* $Revision: 1.15 $
+* $Date: 2002/07/30 09:56:40 $
+* $Revision: 1.16 $
 *
 *******************************************************************************
 */
@ -1170,14 +1170,22 @@ to guarantee identifier closure.
                && ((char1 - SBase) % TCount) == 0);
    }

+    static boolean isVowelJamo(int cp) {
+        return (VBase <= cp && cp < VLimit);
+    }
+
    static boolean isTrailingJamo(int cp) {
-        return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
+        return (TBase <= cp && cp < TLimit);
    }

    static boolean isLeadingJamo(int cp) {
        return (LBase <= cp && cp < LLimit);
    }

+    static boolean isNonLeadJamo(int cp) {
+        return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
+    }
+
    private void fillFromFile(String version) {
    	try {
    		fillFromFile2(version);
--- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
-* $Date: 2002/06/15 02:47:13 $
-* $Revision: 1.13 $
+* $Date: 2002/07/30 09:56:40 $
+* $Revision: 1.14 $
 *
 *******************************************************************************
 */
@ -229,10 +229,12 @@ public interface UCD_Types {

    // line break
    public static final byte
-        LBXX = 0, LBOP = 1, LBCL = 2, LBQU = 3, LBGL = 4, LBNS = 5, LBEX = 6, LBSY = 7,
-        LBIS = 8, LBPR = 9, LBPO = 10, LBNU = 11, LBAL = 12, LBID = 13, LBIN = 14, LBHY = 15,
-        LBCM = 16, LBBB = 17, LBBA = 18, LBSP = 19, LBBK = 20, LBCR = 21, LBLF = 22, LBCB = 23,
-        LBSA = 24, LBAI = 25, LBB2 = 26, LBSG = 27, LBZW = 28, LIMIT_LINE_BREAK = 29;
+        LB_XX = 0, LB_OP = 1, LB_CL = 2, LB_QU = 3, LB_GL = 4, LB_NS = 5, LB_EX = 6, LB_SY = 7,
+        LB_IS = 8, LB_PR = 9, LB_PO = 10, LB_NU = 11, LB_AL = 12, LB_ID = 13, LB_IN = 14, LB_HY = 15,
+        LB_CM = 16, LB_BB = 17, LB_BA = 18, LB_SP = 19, LB_BK = 20, LB_CR = 21, LB_LF = 22, LB_CB = 23,
+        LB_SA = 24, LB_AI = 25, LB_B2 = 26, LB_SG = 27, LB_ZW = 28,
+        LIMIT_LINE_BREAK = 29,
+        LB_LIMIT = LIMIT_LINE_BREAK;

    // east asian width
    public static final byte
--- a/tools/unicodetools/com/ibm/text/UCD/UData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
-* $Date: 2002/06/15 02:47:12 $
-* $Revision: 1.5 $
+* $Date: 2002/07/30 09:56:40 $
+* $Revision: 1.6 $
 *
 *******************************************************************************
 */
@ -42,7 +42,7 @@ class UData implements UCD_Types {
    byte numericType = NUMERIC_NONE;

    byte eastAsianWidth = EAN;
-    byte lineBreak = LBXX;
+    byte lineBreak = LB_XX;
    byte joiningType = JT_U;
    byte joiningGroup = NO_SHAPING;
    byte script = COMMON_SCRIPT;
@ -196,7 +196,7 @@ class UData implements UCD_Types {
        if (full || !Double.isNaN(numericValue)) result.append(" nv='").append(numericValue).append('\'');

        if (full || eastAsianWidth != EAN) result.append(" ea='").append(UCD_Names.EA[eastAsianWidth]).append('\'');
-        if (full || lineBreak != LBAL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
+        if (full || lineBreak != LB_AL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
        if (full || joiningType != JT_U) result.append(" jt='").append(UCD_Names.JOINING_TYPE[joiningType]).append('\'');
        if (full || joiningGroup != NO_SHAPING) result.append(" jg='").append(UCD_Names.JOINING_GROUP[joiningGroup]).append('\'');
        if (full || age != 0) result.append(" ag='").append(UCD_Names.AGE[age]).append('\'');
--- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
-* $Date: 2002/06/22 01:21:09 $
-* $Revision: 1.17 $
+* $Date: 2002/07/30 09:56:40 $
+* $Revision: 1.18 $
 *
 *******************************************************************************
 */
@ -551,7 +551,7 @@ can help you narrow these down.
    static void generateXML() throws IOException {
        Default.setUCD();
        String filename = "UCD.xml";
-        PrintWriter log = Utility.openPrintWriter(filename);
+        PrintWriter log = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);

         //log.println('\uFEFF');
        log.println("<ucd>");
@ -580,14 +580,14 @@ can help you narrow these down.
        
        String ttest = Default.ucd.getCase(test, FULL, TITLE);
        
-        PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt");
+        PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt", Utility.LATIN1_UNIX);
        titleTest.println(test);
        titleTest.println(ttest);
        titleTest.close();
        
        System.out.println(Default.ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
        String fileName = "CaseDifferences.txt";
-        PrintWriter log = Utility.openPrintWriter(fileName);
+        PrintWriter log = Utility.openPrintWriter(fileName, Utility.LATIN1_UNIX);

        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            Utility.dot(cp);
@ -648,7 +648,7 @@ can help you narrow these down.
        
        
        String fileName = "CaseNormalizationDifferences.txt";
-        PrintWriter log = Utility.openPrintWriter(fileName);
+        PrintWriter log = Utility.openPrintWriter(fileName, Utility.LATIN1_UNIX);

        log.println("Differences between case(normalize(cp)) and normalize(case(cp))");
        log.println("u, l, t - upper, lower, title");
@ -1069,7 +1069,7 @@ can help you narrow these down.
        System.out.println("Writing IDNCheck.txt");
        
        
-        PrintWriter log = Utility.openPrintWriter("IDNCheck.txt");
+        PrintWriter log = Utility.openPrintWriter("IDNCheck.txt", Utility.LATIN1_UNIX);
        log.println("IDN Check");
        log.println("Total Errors: " + errorCount);
       
@ -1124,7 +1124,7 @@ can help you narrow these down.
    public static void genIDN() throws IOException {
        PrintWriter out = new PrintWriter(System.out);
        Default.setUCD();
-        PrintWriter log = Utility.openPrintWriter("IDN-tables.txt");
+        PrintWriter log = Utility.openPrintWriter("IDN-tables.txt", Utility.LATIN1_UNIX);
        
        /*UnicodeSet y = UnifiedBinaryProperty.make(CATEGORY + FORMAT).getSet();
        UnicodeSet x = new UnicodeSet(0xE0001,0xE007F).retainAll(y);
@ -1906,7 +1906,7 @@ E0020-E007F; [TAGGING CHARACTERS]
            }
        }
        
-        PrintWriter log = Utility.openPrintWriter("CheckScriptsLog.txt");
+        PrintWriter log = Utility.openPrintWriter("CheckScriptsLog.txt", Utility.LATIN1_UNIX);
        
        Iterator it = m.keySet().iterator();
        while (it.hasNext()) {
--- a/tools/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java
+++ b/tools/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java,v $
-* $Date: 2001/10/25 20:33:46 $
-* $Revision: 1.3 $
+* $Date: 2002/07/30 09:56:40 $
+* $Revision: 1.4 $
 *
 *******************************************************************************
 */
@ -21,7 +21,7 @@ import com.ibm.text.utility.*;
 public class WriteJavaScriptInfo implements UCD_Types {
    
    static public void assigned() throws IOException {
-        PrintWriter log = Utility.openPrintWriter("assigned.js");
+        PrintWriter log = Utility.openPrintWriter("assigned.js", Utility.LATIN1_UNIX);
        UCD ucd = UCD.make();
        boolean wasIn = false;
        int lastWritten = -100;
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2002/07/21 08:43:39 $
-* $Revision: 1.22 $
+* $Date: 2002/07/30 09:56:41 $
+* $Revision: 1.23 $
 *
 *******************************************************************************
 */
@ -531,14 +531,22 @@ public final class Utility {    // COMMON UTILITIES
        "1.1.0",
    };

-    public static PrintWriter openPrintWriter(String filename) throws IOException {
-        return openPrintWriter(filename, true, true);
+    /*public static PrintWriter openPrintWriter(String filename) throws IOException {
+        return openPrintWriter(filename, LATIN1_UNIX);
    }
+    */
+    
+    static final byte WINDOWS_MASK = 1, UTF8_MASK = 2; // bit flags; openPrintWriter(filename, options) tests each bit separately
+    public static final byte // option values for the options argument of openPrintWriter
+        LATIN1_UNIX = 0, // no bit set: both former booleans (removeCR, latin1) are passed as true
+        LATIN1_WINDOWS = WINDOWS_MASK, // removeCR passed as false, latin1 as true
+        UTF8_UNIX = UTF8_MASK, // removeCR passed as true, latin1 as false
+        UTF8_WINDOWS = UTF8_MASK | WINDOWS_MASK; // removeCR and latin1 both passed as false
    
     // Normally use UTF8_WINDOWS (formerly false, false).
     // But for UCD files use LATIN1_UNIX (formerly true, true)
     // Or if they are UTF8, use UTF8_UNIX (formerly true, false)
-    public static PrintWriter openPrintWriter(String filename, boolean removeCR, boolean latin1) throws IOException {
+    public static PrintWriter openPrintWriter(String filename, byte options) throws IOException {
        File file = new File(getOutputName(filename));
        System.out.println("Creating File: " + file);
        File parent = new File(file.getParent());
@ -548,7 +556,7 @@ public final class Utility {    // COMMON UTILITIES
                    new UTF8StreamWriter(
                        new FileOutputStream(file),
                        32*1024,
-                        removeCR, latin1));
+                        (options & WINDOWS_MASK) == 0, (options & UTF8_MASK) == 0));
    }
    
    public static String getOutputName(String filename) {
@ -606,7 +614,7 @@ public final class Utility {    // COMMON UTILITIES
    }
    
    public static void addToSet(Map m, Object key, Object value) {
-        Set set = (Set) m.get(key);
+        Collection set = (Collection) m.get(key);
        if (set == null) {
            set = new TreeSet();
            m.put(key, set);
@ -614,6 +622,15 @@ public final class Utility {    // COMMON UTILITIES
        set.add(value);
    }
        
+    public static void addToList(Map m, Object key, Object value, boolean unique) { // add value to the collection stored in m under key
+        Collection set = (Collection) m.get(key); // cast is to Collection, so an existing entry of any Collection type is accepted
+        if (set == null) { // no entry for this key yet: create an ArrayList and register it in m
+            set = new ArrayList();
+            m.put(key, set);
+        }
+        if (!unique || !set.contains(value)) set.add(value); // when unique is true, a value already present is not added again
+    }
+        
    public static String readDataLine(BufferedReader br) throws IOException {
        String originalLine = "";
        String line = "";
@ -724,7 +741,7 @@ public final class Utility {    // COMMON UTILITIES
    }
    
    public static void copyTextFile(String filename, boolean utf8, String newName, String[] replacementList) throws IOException {
-        PrintWriter out = Utility.openPrintWriter(newName, false, false);
+        PrintWriter out = Utility.openPrintWriter(newName, UTF8_WINDOWS);
        appendFile(filename, utf8, out, replacementList);
        out.close();
    }
@ -834,10 +851,12 @@ public final class Utility {    // COMMON UTILITIES
        return "Showing Stack with fake " + sw.getBuffer().toString();
    }
    
+    static PrintWriter showSetNamesPw;
+    
     public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) {
-        PrintWriter temp = new PrintWriter(System.out);
-        showSetNames(temp, prefix, set, separateLines, false, ucd);
-        temp.close();
+        if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out); // create the shared System.out writer on first use, then reuse it
+        showSetNames(showSetNamesPw, prefix, set, separateLines, false, ucd); // delegate to the PrintWriter overload, passing false for its IDN argument
+        showSetNamesPw.flush(); // flush instead of close so the shared writer stays usable for later calls
     }
    
    public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) {