ICU-5222 fixes for UnicodeTools (unconnected with rest of ICU4J)

X-SVN-Rev: 20400
2025-04-10 07:39:16 +00:00 · 2006-09-24 23:32:45 +00:00 · 2006-09-24 23:32:45 +00:00 · 690f5c528c
commit 690f5c528c
parent fa66eb7a07
9 changed files with 765 additions and 199 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
-* $Date: 2006/04/05 22:12:44 $
-* $Revision: 1.14 $
+* $Date: 2006/09/24 23:32:44 $
+* $Revision: 1.15 $
 *
 *******************************************************************************
 */
@ -16,14 +16,17 @@ package com.ibm.text.UCD;
 import java.util.*;
 import java.io.*;

+import org.unicode.cldr.util.Segmenter;
+
 import com.ibm.text.utility.*;
+import com.ibm.icu.dev.test.util.UnicodeMap;
 import com.ibm.icu.dev.test.util.UnicodeProperty;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

 abstract public class GenerateBreakTest implements UCD_Types {

-    static boolean DEBUG = false;
+    static boolean DEBUG = true;
    static final boolean SHOW_TYPE = false;
    UCD ucd;
    Normalizer nfd;
@ -122,7 +125,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
    }

    // quick & dirty routine
-    String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) {
+    static String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) {
        String result = insertion;
        for (int i = 0; i < source.length(); ++i) {
            result += source.charAt(i);
@ -291,6 +294,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
    private String[] ruleList = new String[100];
    private int ruleListCount = 0;
    protected boolean collectingRules = false;
+	protected boolean needsFullBreakSample = true;
    
    public void setRule(String rule) {
        if (collectingRules) {
@ -330,6 +334,12 @@ abstract public class GenerateBreakTest implements UCD_Types {
        out.println("<h2>" + fileName + " Break Chart</h2>");
        out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "</p>");
        out.println("<p><b>Date:</b> " + Default.getDate() + "</p>");
+        out.println("<p>This page illustrates the application of the boundary specifications. " +
+        		"The first chart shows where breaks would appear between different sample characters or strings. " +
+        		"The sample characters are chosen mechanically to represent the different properties used by the specification. " +
+        		"Where properties used in the rules have 'overlaps', the samples are given 'composed' names. " +
+        		"For example, SentenceBreak uses GCLF_Sep: Sep is the SentenceBreak property, but it overlaps with the GraphemeClusterBreak property LF." +
+        		"</p>");
        generateTable(out);
        

@ -485,8 +495,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
            result.append(ucd.getCodeAndName(cp));
            result.append(", gc=" + ucd.getCategoryID_fromIndex(ucd.getCategory(cp),SHORT));
            result.append(", sc=" + ucd.getScriptID_fromIndex(ucd.getScript(cp),SHORT));
-            result.append(", lb=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp))
-                + "=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp), LONG));
+            //result.append(", lb=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp))
+            //    + "=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp), LONG));
        }
        return result.toString();
    }
@ -560,19 +570,41 @@ abstract public class GenerateBreakTest implements UCD_Types {
        }
        
        // gather the data for the rules
+        if (needsFullBreakSample ) {
        collectingRules = true;
        isBreak(fullBreakSample(), 1);
        collectingRules = false;
+        }
        
        out.println("<h3>Rules</h3>");
-        out.println("<ul>");
+        out.println("<p>Due to the way they have been mechanically processed for generation, " +
+        		"the following rules do not match the UAX rules precisely. " +
+        		"In particular:</p>"+
+        		"<ol>" +
+        		"<li>The rules are cast into a more regex-style.</li>"+
+        		"<li>The rules \"sot ÷\", \"÷ eot\", and \"÷ Any\" are added mechanically, and have artificial numbers.</li>"+
+        		"<li>The rules are given decimal numbers, so rules such as 11a are given a number using tenths, such as 11.1.</li>"+
+        		"<li>Where a rule has multiple parts (lines), each one is numbered using hundredths, such as 21.01) × BA, 21.02) × HY,...</li>"+
+        		"<li>Any 'treat as' or 'ignore' rules are handled as discussed in Unicode Standard Annex #29, and thus " +
+        		"reflected in a transformation of the rules not visible here.</li>" +
+        		"</ol>" +
+        		"<p>For the original rules, see the UAX.</p>"
+
+        		);
+        out.println("<ul style='list-style-type: none'>");
            for (int ii = 0; ii < ruleListCount; ++ii) {
-                out.println("<li>" + ruleList[ii] + "</li>");
+                out.println("<li>" + ruleList[ii].replaceAll("[$]","") + "</li>");
            }
        out.println("</ul>");
        
        if (extraSingleSamples.length > 0) {
            out.println("<h3>Sample Strings</h3>");
+            out.println("<p>" +
+            		"The following samples illustrate the application of the rules. " +
+            		"The blue lines indicate possible break points. " +
+            		"If your browser supports titles, then positioning the mouse over each character will show its name, " +
+            		"while positioning between characters shows the rule number of the rule responsible for the break-status." +
+            		"</p>");
            out.println("<ol>");
                for (int ii = 0; ii < extraSingleSamples.length; ++ii) {
                    out.println("<li><font size='5'>");
@ -631,6 +663,7 @@ abstract public class GenerateBreakTest implements UCD_Types {

        if (comments && !html) string.append(comment);
        out.println(string);
+        if (DEBUG) System.out.println("*" + string);
    }

    public void findSamples() {
@ -642,7 +675,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
        BitSet bitset = new BitSet();
        Map list = new TreeMap();

-        for (int i = 1; i <= 0x10FFFF; ++i) {
+        for (int i = 1; i <= 0xFFFF; ++i) {
            if (!ucd.isAllocated(i)) continue;
            if (0xD800 <= i && i <= 0xDFFF) continue;
            if (DEBUG && i == 0x1100) {
@ -657,6 +690,9 @@ abstract public class GenerateBreakTest implements UCD_Types {
            }

            int combined = (mapType(lb) << 7) + mapType(lb2);
+            if (combined < 0) {
+            	throw new IllegalArgumentException("should never happen");
+            }
            if (!bitset.get(combined)) {
                bitset.set(combined);
                list.put(new Integer(combined), UTF16.valueOf(i));
@ -777,10 +813,142 @@ abstract public class GenerateBreakTest implements UCD_Types {


    //==============================================
+    
+    static class XGenerateBreakTest extends GenerateBreakTest { // GenerateBreakTest whose break decisions are delegated to a Segmenter
+    	Segmenter seg; // built from the Segmenter.Builder passed to the constructor; consulted by isBreak()
+    	String sample; // returned unchanged by fullBreakSample()
+    	{
+    		needsFullBreakSample = false; // the rule list is filled from segBuilder.getRules() in the constructor instead
+    	}
+    	
+    	public XGenerateBreakTest(UCD ucd, Segmenter.Builder segBuilder, String sample, String filename, String[] extraSamples, String[] extraSingleSamples) {
+    		super(ucd);
+    		this.seg = segBuilder.make();
+    		this.sample = sample;
+    		List rules = segBuilder.getRules(); // elements are cast to String in the loop below
+    		collectingRules = true; // turned on only while the builder's rules are passed to setRule
+    		for (Iterator it = rules.iterator(); it.hasNext();) {
+    			String rule = (String)it.next();
+    			setRule(rule);
+    		}
+    		collectingRules = false;
+    		map.add("Other", new UnicodeSet(0,0x10FFFF)); // NOTE(review): presumably a catch-all label that the per-label adds below refine -- confirm against map.add
+    		UnicodeMap segSamples = seg.getSamples();
+    		Collection x = segSamples.getAvailableValues(); // values are used as String labels below
+    		for (Iterator it = x.iterator(); it.hasNext();) {
+    			String label = (String)it.next();
+    			map.add(label, segSamples.getSet(label), true, false); // NOTE(review): meaning of the two boolean flags is not visible here -- see map.add
+    		}
+            this.fileName = filename;
+            sampleMap = map;
+            this.extraSamples = extraSamples;
+            this.extraSingleSamples = extraSingleSamples;
+    	}

-    static class GenerateGraphemeBreakTest extends GenerateBreakTest {
+		public boolean isBreak(String source, int offset) { // asks the Segmenter whether source breaks at offset
+			boolean result = seg.breaksAt(source, offset);
+			setRule(String.valueOf(seg.getBreakRule())); // pass the rule reported by the Segmenter for this decision to setRule
+			return result;
+		}

-        GenerateGraphemeBreakTest(UCD ucd) {
+		public String fullBreakSample() { // returns the sample string supplied at construction
+			return sample;
+		}
+
+        // returns the label that map assigns to cp
+        public String getTypeID(int cp) {
+            return map.getLabel(cp);
+        }
+
+        // returns the index that map assigns to cp, narrowed to a byte
+        public byte getType(int cp) {
+            return (byte) map.getIndex(cp);
+        }    
+    }
+    
+    static class GenerateGraphemeBreakTest extends XGenerateBreakTest { // Segmenter-driven test for the "GraphemeClusterBreak" rules
+		public GenerateGraphemeBreakTest(UCD ucd) {
+	        super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"GraphemeClusterBreak"), "aa", "Grapheme", // sample = "aa", filename = "Grapheme"
+	        		new String[]{}, new String[]{}); // no extraSamples and no extraSingleSamples
+		}	
+    }
+
+    static class GenerateLineBreakTest extends XGenerateBreakTest { // Segmenter-driven test for the "LineBreak" rules
+		public GenerateLineBreakTest(UCD ucd) {
+	        super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"LineBreak"), "aa", "Line", // sample = "aa", filename = "Line"
+	        	new String[]{}, new String[] { // empty extraSamples; the second array is extraSingleSamples
+	        	"can't", "can\u2019t", "ab\u00ADby",
+	             "-3",
+	             "e.g.",
+	             "\u4e00.\u4e00.",
+	              "a  b",
+	              "a  \u200bb",
+	              "a \u0308b",
+	              "1\u0308b(a)-(b)",
+	              });
+		}	
+    }
+    
+    static class GenerateSentenceBreakTest extends XGenerateBreakTest { // Segmenter-driven test for the "SentenceBreak" rules
+		public GenerateSentenceBreakTest(UCD ucd) {
+	        super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"SentenceBreak"), "aa", "Sentence", // sample = "aa", filename = "Sentence"
+	        		new String[]{}, // no extraSamples
+	        		getExtraSamples()); // extraSingleSamples
+		}	
+		static String[] getExtraSamples() { // returns the base samples followed by the same samples run through insertEverywhere with U+2060
+            GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(Default.ucd()); // passed to insertEverywhere as its breaker argument
+	        String[] extraSingleSamples = new String[] {
+	                "(\"Go.\") (He did.)", 
+	                "(\u201CGo?\u201D) (He did.)", 
+	                "U.S.A\u0300. is", 
+	                "U.S.A\u0300? He", 
+	                "U.S.A\u0300.", 
+	                "3.4", 
+	                "c.d",
+	                "etc.)\u2019 \u2018(the",
+	                "etc.)\u2019 \u2018(The",
+	                "the resp. leaders are",
+	                "\u5B57.\u5B57",
+	                "etc.\u5B83",
+	                "etc.\u3002",
+	                "\u5B57\u3002\u5B83",
+	            };
+	            String[] temp = new String [extraSingleSamples.length * 2]; // first half: originals; second half: U+2060 variants
+	            System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length);
+	            for (int i = 0; i < extraSingleSamples.length; ++i) {
+	                temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", grapheme); // U+2060 is WORD JOINER
+	            }
+	            extraSingleSamples = temp;
+	            return extraSingleSamples;
+		}
+    }
+
+    static class GenerateWordBreakTest extends XGenerateBreakTest { // Segmenter-driven test for the "WordBreak" rules
+		public GenerateWordBreakTest(UCD ucd) {
+	        super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"WordBreak"), "aa", "Word", // sample = "aa", filename = "Word"
+	    	        new String[] { // extraSamples
+                /*"\uFF70", "\uFF65", "\u30FD", */ "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,",  "1.\u2060"
+            	},
+
+
+	        	getExtraSamples()); // extraSingleSamples
+		}	
+		static String[] getExtraSamples() { // returns the base samples followed by the same samples run through insertEverywhere with U+2060
+            GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(Default.ucd()); // passed to insertEverywhere as its breaker argument
+	                String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" };
+	                String[] extraSingleSamples = new String [temp.length * 2]; // first half: originals; second half: U+2060 variants
+	                System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length);
+	                for (int i = 0; i < temp.length; ++i) {
+	                    extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme); // U+2060 is WORD JOINER
+	                }
+
+	            return extraSingleSamples;
+		}
+    }
+
+    static class OLDGenerateGraphemeBreakTest extends GenerateBreakTest {
+
+    	OLDGenerateGraphemeBreakTest(UCD ucd) {
            super(ucd);
            fileName = "Grapheme";
            sampleMap = map;
@ -866,13 +1034,13 @@ abstract public class GenerateBreakTest implements UCD_Types {

    //==============================================

-    static class GenerateWordBreakTest extends GenerateBreakTest {
+    static class XGenerateWordBreakTest extends GenerateBreakTest {
        
        GenerateGraphemeBreakTest grapheme;
        MyBreakIterator breaker;
        Context context = new Context();

-        GenerateWordBreakTest(UCD ucd) {
+        XGenerateWordBreakTest(UCD ucd) {
            super(ucd);
            grapheme = new GenerateGraphemeBreakTest(ucd);
            breaker = new MyBreakIterator(grapheme);
@ -1017,13 +1185,13 @@ abstract public class GenerateBreakTest implements UCD_Types {

    // ========================================

-    static class GenerateLineBreakTest extends GenerateBreakTest {
+    static class XGenerateLineBreakTest extends GenerateBreakTest {

        GenerateGraphemeBreakTest grapheme;
        MyBreakIterator breaker;
        Context context = new Context();

-        GenerateLineBreakTest(UCD ucd) {
+        XGenerateLineBreakTest(UCD ucd) {
            super(ucd);
            grapheme = new GenerateGraphemeBreakTest(ucd);
            breaker = new MyBreakIterator(grapheme);
@ -1505,12 +1673,12 @@ abstract public class GenerateBreakTest implements UCD_Types {

    //==============================================

-    static class GenerateSentenceBreakTest extends GenerateBreakTest {
+    static class XGenerateSentenceBreakTest extends GenerateBreakTest {
        
        GenerateGraphemeBreakTest grapheme;
        MyBreakIterator breaker;
        
-        GenerateSentenceBreakTest(UCD ucd) {
+        XGenerateSentenceBreakTest(UCD ucd) {
            super(ucd);
            grapheme = new GenerateGraphemeBreakTest(ucd);
            breaker = new MyBreakIterator(grapheme);
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
-* $Date: 2006/06/09 21:21:20 $
-* $Revision: 1.11 $
+* $Date: 2006/09/24 23:32:44 $
+* $Revision: 1.12 $
 *
 *******************************************************************************
 */
@ -19,6 +19,7 @@ import java.io.IOException;
 import java.io.PrintWriter;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Iterator;
@ -37,16 +38,21 @@ import com.ibm.icu.dev.test.util.UnicodeProperty;
 import com.ibm.icu.dev.test.util.XEquivalenceClass;
 import com.ibm.icu.impl.CollectionUtilities;
 import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.Collator;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.ULocale;
 import com.ibm.text.utility.Utility;


 public class GenerateConfusables {
+    public static String version = "2.0";
 	public static boolean EXCLUDE_CONFUSABLE_COMPAT = true;

 	public static void main(String[] args) throws IOException {
+        quickTest();
+        
 		Set arg2 = new HashSet(Arrays.asList(args));
 		try {
 			if (arg2.contains("-b")) generateIDN();
@ -59,6 +65,19 @@ public class GenerateConfusables {
 			System.out.println("Done");
 		}
 	}
+
+    private static void quickTest() { // NOTE(review): every value computed here is discarded; looks like debugger scaffolding -- confirm before keeping the call in main()
+        int script = getSingleScript("\u0430\u0061"); // U+0430 followed by U+0061; result is overwritten on the next line
+        script = getSingleScript("\u0061\u0430"); //0323 ;  093C
+        String a = "\u0323";
+        String b = "\u093C";
+        int isLess = betterTargetIsLess.compare(a, b); // ("\u0045", "\u13AC"); result is never read
+        MyEquivalenceClass test = new MyEquivalenceClass();
+        test.add(a, b, "none"); // registers a and b as equivalent with the reason string "none"
+        Set x = test.getEquivalences(a);
+        String result = (String) CollectionUtilities.getBest(x, betterTargetIsLess, -1); // result is never read
+    }
+    
 	/**
 	 * 
 	 */
@ -82,32 +101,34 @@ public class GenerateConfusables {
 			_Non_IICore.removeAll(um.getSet("2.1"));
 			
 			// add Chinese?
-			UnicodeSet cjk_nic = new UnicodeSet();
-			String line = null;
-			try {
-				BufferedReader br = BagFormatter.openUTF8Reader(indir, "cjk_nic.txt");
-				while (true) {
-					line = Utility.readDataLine(br);
-					if (line == null) break;
-					if (line.length() == 0) continue;
-					String[] pieces = Utility.split(line, ';');
-					// part 0 is range
-					String range = pieces[0].trim();
-					int rangeDivider = range.indexOf("..");
-					int start, end;
-					if (rangeDivider < 0) {
-						start = end = Integer.parseInt(range, 16);
-					} else {
-						start = Integer.parseInt(range.substring(0, rangeDivider), 16);
-						end = Integer.parseInt(range.substring(rangeDivider+2), 16);
-					}
-					cjk_nic.add(start, end);
-				}
-				br.close();
-			} catch (Exception e) {
-				throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
-			}
-			_Non_IICore.removeAll(cjk_nic);
+            if (true) {
+    			UnicodeSet cjk_nic = new UnicodeSet();
+    			String line = null;
+    			try {
+    				BufferedReader br = BagFormatter.openUTF8Reader(indir, "cjk_nic.txt");
+    				while (true) {
+    					line = Utility.readDataLine(br);
+    					if (line == null) break;
+    					if (line.length() == 0) continue;
+    					String[] pieces = Utility.split(line, ';');
+    					// part 0 is range
+    					String range = pieces[0].trim();
+    					int rangeDivider = range.indexOf("..");
+    					int start, end;
+    					if (rangeDivider < 0) {
+    						start = end = Integer.parseInt(range, 16);
+    					} else {
+    						start = Integer.parseInt(range.substring(0, rangeDivider), 16);
+    						end = Integer.parseInt(range.substring(rangeDivider+2), 16);
+    					}
+    					cjk_nic.add(start, end);
+    				}
+    				br.close();
+    			} catch (Exception e) {
+    				throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
+    			}
+    			_Non_IICore.removeAll(cjk_nic);
+            }
 		}
 		return _Non_IICore;
 //		for (Iterator it = um.getAvailableValues().iterator(); it.hasNext();) {
@ -118,7 +139,7 @@ public class GenerateConfusables {
 	}
 	
 	static PrintWriter log;
-	static final String ARROW = "\u2192";
+	static final String ARROW = "\u2192"; // \u2194
 	static UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(""); // ICUPropertyFactory.make();
 	static UnicodeSet UNASSIGNED = ups.getSet("gc=Cn")
 		.addAll(ups.getSet("gc=Co"))
@ -131,12 +152,14 @@ public class GenerateConfusables {
 	static UnicodeSet _skipNFKD;
 	
 	static Map gatheredNFKD = new TreeMap();
-	static UnicodeMap nfcMap = new UnicodeMap();
+    static UnicodeMap nfcMap;
+    static UnicodeMap nfkcMap;
 	
-	static String indir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\source\\";
-	static String outdir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\";
+	static String indir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\source\\";
+	static String outdir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\";
 	
 	static Comparator codepointComparator = new UTF16.StringComparator(true,false,0);
+    static Comparator UCAComparator = new CollectionUtilities.MultiComparator(new Comparator[] {Collator.getInstance(ULocale.ROOT), codepointComparator});

 	static UnicodeSet setsToAbbreviate = new UnicodeSet("[" +
 			"\\u3400-\\u4DB5" +
@ -208,23 +231,35 @@ public class GenerateConfusables {

 		private UnicodeMap additions = new UnicodeMap(), remap = new UnicodeMap(), removals = new UnicodeMap(),
 		reviews, removals2, lowerIsBetter;
+        
+        private UnicodeSet isCaseFolded;
 		
 		private IdentifierInfo() throws IOException {
-			propNFKCSet = ups.getSet("NFKC_QuickCheck=N")
-					.complement();
+            isCaseFolded = new UnicodeSet();
+            for (int cp = 0; cp <= 0x10FFFF; ++cp) {
+                Utility.dot(cp);
+                int cat = Default.ucd().getCategory(cp);
+                if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
+                String source = UTF16.valueOf(cp);
+                String cf = Default.ucd().getCase(source, UCD.FULL, UCD.FOLD);
+                if (cf.equals(source)) isCaseFolded.add(cp);
+            }
+            
+			propNFKCSet = ups.getSet("NFKC_QuickCheck=N").complement();
 			UnicodeSet propXIDContinueSet = ups.getSet("XID_Continue=TRUE");

+            //removals.putAll(propNFKCSet.complement(), PROHIBITED + "compat variant");
 			loadFileData();
-			xidPlus = new UnicodeSet(propXIDContinueSet).addAll(
-					additions.getSet(null).complement()).retainAll(propNFKCSet);
+			xidPlus = new UnicodeSet(propXIDContinueSet).addAll(additions.keySet()).retainAll(propNFKCSet);

 			getIdentifierSet();
 			notInXID = new UnicodeSet(IDNOutputSet).removeAll(xidPlus);
 			removals.putAll(notInXID, PROHIBITED + NOT_IN_XID);
-			removalSet = removals.getSet(null).complement();
+            //UnicodeSet notNfkcXid = new UnicodeSet(xidPlus).removeAll(removals.keySet()).removeAll(propNFKCSet);
+            //removals.putAll(notNfkcXid, PROHIBITED + "compat variant");
+			removalSet = removals.keySet();

-			remainingOutputSet = new UnicodeSet(IDNOutputSet)
-					.removeAll(removalSet);
+			remainingOutputSet = new UnicodeSet(IDNOutputSet).removeAll(removalSet);

 			UnicodeSet remainingInputSet1 = new UnicodeSet(IDNInputSet)
 					.removeAll(removalSet).removeAll(remainingOutputSet);
@ -234,9 +269,9 @@ public class GenerateConfusables {
 			// the output set
 			for (UnicodeSetIterator usi = new UnicodeSetIterator(
 					remainingInputSet1); usi.next();) {
-				String nss = Default.nfkc().normalize(usi.getString());
+				String nss = getModifiedNKFC(usi.getString());
 				String cf = Default.ucd().getCase(nss, UCD.FULL, UCD.FOLD);
-				String cf2 = Default.nfkc().normalize(cf);
+				String cf2 = getModifiedNKFC(cf);
 				if (remainingOutputSet.containsAll(cf2))
 					remainingInputSet.add(usi.codepoint);
 				else
@ -247,7 +282,7 @@ public class GenerateConfusables {
 			for (UnicodeSetIterator usi = new UnicodeSetIterator(
 					remainingInputSet); usi.next();) {
 				String ss = usi.getString();
-				String nss = Default.nfkc().normalize(ss);
+				String nss = getModifiedNKFC(ss);
 				String cf = Default.ucd().getCase(ss, UCD.FULL, UCD.FOLD);
 				if (usi.codepoint == 0x2126 || usi.codepoint == 0x212B) {
 					System.out.println("check");
@ -395,7 +430,7 @@ public class GenerateConfusables {
 				throw (RuntimeException) new RuntimeException(
 						"Failure on line " + line).initCause(e);
 			}
-			removals.putAll(getNonIICore(), "~IICore");
+			removals.putAll(getNonIICore(), PROHIBITED + "~IICore");
 			br.close();
 		}
 		
@ -417,13 +452,14 @@ public class GenerateConfusables {
 			bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
 			bf.setMergeRanges(true);

-			PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
+            PrintWriter out = openAndWriteHeader("review.txt", "Review List for IDN");
+//			PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
 			//reviews.putAll(UNASSIGNED, "");
-			out.print("\uFEFF");
-			out.println("# Review List for IDN");
-			out.println("# $Revision: 1.11 $");
-			out.println("# $Date: 2006/06/09 21:21:20 $");
-			out.println("");
+//			out.print("\uFEFF");
+//			out.println("# Review List for IDN");
+//			out.println("# $Revision: 1.12 $");
+//			out.println("# $Date: 2006/09/24 23:32:44 $");
+//			out.println("");

 			UnicodeSet fullSet = reviews.getSet("").complement();

@ -474,19 +510,15 @@ public class GenerateConfusables {
 			
 			UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]");
 			
-			PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
+            PrintWriter out = openAndWriteHeader("idnchars.txt", "Recommended Identifier Profiles for IDN");

-			out.println("# Recommended Identifier Profiles for IDN");
-			out.println("# $Revision: 1.11 $");
-			out.println("# $Date: 2006/06/09 21:21:20 $");
-
-			out.println("");
-			out.println("# Output Characters");
+			out.println("# Allowed as output characters");
 			out.println("");
 			bf.setValueSource("output");
 			bf.showSetNames(out, remainingOutputSet);
 			showExtras(bf, remainingOutputSet, letters);

+            /*
 			out.println("");

 			out.println("");
@ -502,10 +534,10 @@ public class GenerateConfusables {
 			bf.setValueSource("input-lenient");
 			bf.showSetNames(out, inputSet_lenient);
 			showExtras(bf, inputSet_lenient, letters);
-
+			*/
+            
 			out.println("");
-			out
-					.println("# Not allowed at start of identifier");
+			out.println("# Not allowed at start of identifier");
 			out.println("");
 			bf.setValueSource("nonstarting");
 			bf.showSetNames(out, nonstarting);
@ -517,6 +549,7 @@ public class GenerateConfusables {
 			out.close();
 		}

+
 		/**
 		 * 
 		 */
@ -543,13 +576,14 @@ public class GenerateConfusables {
 			bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
 			bf.setMergeRanges(true);

-			PrintWriter out = BagFormatter.openUTF8Writer(outdir,
-					"xidmodifications.txt");
+            PrintWriter out = openAndWriteHeader("xidmodifications.txt", "Security Profile for General Identifiers");
+			/* PrintWriter out = BagFormatter.openUTF8Writer(outdir, "xidmodifications.txt");

 			out.println("# Security Profile for General Identifiers");
-			out.println("# $Revision: 1.11 $");
-			out.println("# $Date: 2006/06/09 21:21:20 $");
-			out.println("");
+			out.println("# $Revision: 1.12 $");
+			out.println("# $Date: 2006/09/24 23:32:44 $");
+            */
+

 			out.println("# Characters restricted");
 			out.println("");
@ -567,11 +601,26 @@ public class GenerateConfusables {
 			out.println("# Characters added");
 			out.println("");
 			bf.setValueSource("addition");
-			bf.showSetNames(out, additions.getSet(null).complement());
+			bf.showSetNames(out, additions.keySet());

 			//showRemapped(out, "Characters remapped on input", remap);

 			out.close();
+            
+           out = openAndWriteHeader("xidAllowed.txt", "Security Profile for General Identifiers");
+           UnicodeSet allowed = new UnicodeSet(xidPlus).removeAll(removals.keySet());
+            UnicodeSet cfAllowed = new UnicodeSet().addAll(allowed).retainAll(isCaseFolded).retainAll(propNFKCSet);
+            allowed.removeAll(cfAllowed);
+            bf.setValueSource("case_folded");
+            out.println("# XID characters allowed (no uppercase)");
+            out.println("");
+            bf.showSetNames(out, cfAllowed);
+            bf.setValueSource("not_case_folded");
+            out.println("");
+            out.println("# XID characters allowed (uppercase)");
+            out.println("");
+            bf.showSetNames(out, allowed);
+            out.close();
 			
 			UnicodeMap someRemovals = new UnicodeMap();
 			UnicodeMap.Composer myComposer = new UnicodeMap.Composer() {
@ -604,8 +653,8 @@ public class GenerateConfusables {
 			//someRemovals = removals;
 			out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
 			out.println("# Characters restricted in domain names");
-			out.println("# $Revision: 1.11 $");
-			out.println("# $Date: 2006/06/09 21:21:20 $");
+			out.println("# $Revision: 1.12 $");
+			out.println("# $Date: 2006/09/24 23:32:44 $");
 			out.println("#");
 			out.println("# This file contains a draft list of characters for use in");
 			out.println("#     UTR #36: Unicode Security Considerations");
@ -646,7 +695,7 @@ public class GenerateConfusables {
 				bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
 				}).set(someRemovals).setMain("Removals", "GCB",
 						UnicodeProperty.ENUMERATED, "1.0"));
-				bf.showSetNames(out, someRemovals.getSet(null).complement());
+				bf.showSetNames(out, someRemovals.keySet());
 			}
 			out.close();
 		}
@ -654,6 +703,7 @@ public class GenerateConfusables {

 	static final String PROHIBITED = "restricted ; ";
 	static final String NOT_IN_XID = "not in XID+";
+    public static final boolean suppress_NFKC = true;
 	/**
 * 
 */
@ -674,7 +724,7 @@ public class GenerateConfusables {
 			out.println("");
 			for (UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next();) {
 				String source = usi.getString();
-				String target = Default.nfkc().normalize(source);
+				String target = getModifiedNKFC(source);
 				writeSourceTargetLine(out, source, null, target, value);
 			}
 			//bf.showSetNames(out, s);
@ -712,7 +762,7 @@ public class GenerateConfusables {
 		out.println("# " + title);
 		out.println("");
 		int count = 0;
-		for (UnicodeSetIterator usi = new UnicodeSetIterator(remap.getSet(null).complement()); usi.next();) {
+		for (UnicodeSetIterator usi = new UnicodeSetIterator(remap.keySet()); usi.next();) {
 			writeSourceTargetLine(out, usi.getString(), "remap-to", (String)remap.getValue(usi.codepoint), null);
 			count++;
 		}
@ -747,6 +797,8 @@ public class GenerateConfusables {
 	}
 	
 	private static UnicodeSet getSkipNFKD() {
+        nfcMap = new UnicodeMap();
+        nfkcMap = new UnicodeMap();
 		if (_skipNFKD == null) {
 			_skipNFKD = new UnicodeSet();
 			UnicodeSet idSet = getIdentifierSet();
@ -755,6 +807,8 @@ public class GenerateConfusables {
 				int cat = Default.ucd().getCategory(cp);
 				if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
 				int decompType = Default.ucd().getDecompositionType(cp);
+                String nfc = Default.nfc().normalize(cp);
+                if (decompType == UCD.CANONICAL) nfcMap.put(cp, nfc);
 				if (decompType == UCD.COMPAT_CIRCLE
 						|| decompType == UCD.COMPAT_SUPER
 						|| decompType == UCD.COMPAT_SUB
@ -765,42 +819,58 @@ public class GenerateConfusables {
 					_skipNFKD.add(cp);
 					continue;
 				}
+                String source = UTF16.valueOf(cp);
 				String mapped = Default.nfkd().normalize(cp);
-				if (mapped.equals(UTF16.valueOf(cp))) continue;
+                String kmapped = getModifiedNKFC(source);
+                if (!kmapped.equals(source) && !kmapped.equals(nfc)) {
+                    if (kmapped.startsWith(" ") || kmapped.startsWith("\u0640")) {
+                        System.out.println("?? " + Default.ucd().getCodeAndName(cp));
+                        System.out.println("\t" + Default.ucd().getCodeAndName(kmapped));
+                        kmapped = getModifiedNKFC(source); // for debugging
+                    }
+                    nfkcMap.put(cp,kmapped);
+                }
+				if (mapped.equals(source)) continue;
 				if (idSet.contains(cp) && !idSet.contains(mapped)) _skipNFKD.add(cp);
 				else if (!whiteSpace.contains(cp) && whiteSpace.containsSome(mapped)) _skipNFKD.add(cp);
-				if (decompType == UCD.CANONICAL) nfcMap.put(cp, Default.nfd().normalize(cp));
 			}
 		}
-		nfcMap.setMissing("");
+        nfcMap.setMissing("");
+        nfcMap.freeze();
+        nfkcMap.setMissing("");
+        nfkcMap.freeze();
 		return _skipNFKD;
 	}
 	
 	private static boolean isMixedScript(String source) {
-		return getSingleScript(source) != UScript.INVALID_CODE;
+		return getSingleScript(source) == UScript.INVALID_CODE;
 	}

-	/*
-	 * Returns UScript.INVALID_CODE if mixed script, otherwise the script
-	 */
-	public static int getSingleScript(String source) {
-		int lastScript = UScript.INVALID_CODE;
-		int cp;
-		for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
-			cp = UTF16.charAt(source, i);
-			int script = UScript.getScript(cp);
-			if (script == UScript.COMMON || script == UScript.INHERITED) {
-				if (XIDContinueSet.contains(cp)) {
-					if (lastScript == UScript.INVALID_CODE) lastScript = script;
-					continue; // skip if not identifier
-				}
-				script = UScript.COMMON;
-			}
-			if (lastScript == UScript.INVALID_CODE) lastScript = script;
-			else if (script != lastScript) return UScript.INVALID_CODE;
+/**
+ * Returns the script of the input text. Script values of COMMON and INHERITED are ignored.
+ * @param source Input text.
+ * @return Script value found in the text.
+ * If more than one script value is found, then UScript.INVALID_CODE is returned.
+ * If no script value is found (other than COMMON or INHERITED), then UScript.COMMON is returned.
+ */
+public static int getSingleScript(String source) {
+    if (source.length() == 0) return UScript.COMMON;
+	int lastScript = UScript.COMMON; // temporary value
+	int cp;
+	for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
+		cp = UTF16.charAt(source, i);
+		int script = UScript.getScript(cp);
+		if (script == UScript.COMMON || script == UScript.INHERITED) {
+            continue;
 		}
-		return lastScript;
+		if (lastScript == UScript.COMMON) {
+            lastScript = script;
+        } else if (script != lastScript) {
+            return UScript.INVALID_CODE;
+        }
 	}
+	return lastScript;
+}

 	/**
 	 * 
@ -856,8 +926,9 @@ public class GenerateConfusables {
 				+ " ;\t" + Utility.hex(target)
 				+ (tag == null ? "" : " ;\t" + tag)
 				//+ " ;\t" + (preferredID.contains(source) ? "ID" : "")
-				+ "\t# "
-				+ "( " + source + " " + ARROW + " " + target + ") " 
+				+ "\t#"
+                + (isXid(source) ? "" : "*")
+				+ " ( " + source + " " + ARROW + " " + target + " ) " 
 				+ Default.ucd().getName(source) + " " + ARROW + " "
 				+ Default.ucd().getName(target)
 				);
@ -992,18 +1063,45 @@ public class GenerateConfusables {
 			for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) {
 				cp = UTF16.charAt(item, i);
 				String cps = UTF16.valueOf(cp);
-				String mapped = getParadigm(cps);
+				String mapped = getParadigm(cps, false, false);
 				if (mapped.indexOf(cps) >= 0) result.append(cps);
 				else {
 					result.append(mapped);
-					reasons.append("[" + getReasons(cps, mapped) + "]");
+                    List x = getReasons(cps, mapped);
+					reasons.append(getBestForm(x));
 				}
 			}
 			return result.toString();
 		}
 		
-		public String getParadigm(Object item) {
-			return (String) CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1);
+		private Object getBestForm(Collection x) {
+            if (x.size() != 1)  return "[" +  x + "]";
+            Object item = x.iterator().next();
+            if (!(item instanceof Collection))  return x.toString();
+            return getBestForm((Collection)item);
+        }
+        
+        public String getParadigm(String item, boolean onlyLowercase, boolean onlySameScript) {
+            Set filteredSet;
+            if (onlyLowercase == false && onlySameScript == false) {
+                filteredSet = getEquivalences(item);
+            } else {
+                filteredSet = new HashSet();
+                for (Iterator it = getEquivalences(item).iterator(); it.hasNext();) {
+                    String other = (String) it.next();
+                    String combined = item + other;
+                    if (onlyLowercase) {
+                        boolean isLowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD));
+                        if (!isLowercase) continue;
+                    }
+                    if (onlySameScript) {
+                        boolean isMixed = isMixedScript(combined);
+                        if (isMixed) continue;                      
+                    }
+                    filteredSet.add(other);
+                }
+            }
+			return (String) CollectionUtilities.getBest(filteredSet, betterTargetIsLess, -1);
 		}
 		
 		public Set getOrderedExplicitItems() {
@ -1057,12 +1155,21 @@ public class GenerateConfusables {
 			type += ":" + lineCount;
 			
 			String combined = source + target;
+            if (combined.indexOf("\u0430") >= 0) {
+                System.out.println(Default.ucd().getCodeAndName(combined));
+            }
 			boolean isLowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD));
 			boolean isMixed = isMixedScript(combined);
 			dataMixedAnycase.add(source, target, type);
-			if (isLowercase) dataMixedLowercase.add(source, target, type);
-			if (!isMixed) dataSingleAnycase.add(source, target, type);
-			if (!isMixed && isLowercase) dataSingleLowercase.add(source, target, type);
+			if (isLowercase) {
+                dataMixedLowercase.add(source, target, type);
+            }
+			if (!isMixed) {
+                dataSingleAnycase.add(source, target, type);
+            }
+			if (!isMixed && isLowercase) {
+                dataSingleLowercase.add(source, target, type);
+            }
 			return this;
 		}
 		
@ -1124,7 +1231,13 @@ public class GenerateConfusables {
 						String source = Utility.fromHex(pieces[0].trim(),true);
 						String target = Utility.fromHex(pieces[1].trim(),true);
 						//if (pieces.length > 2) type = pieces[2].trim();
-						add(source, target, type, count, line);
+                        String nfkdSource = Default.nfkd().normalize(source);
+                        String nfkdTarget = Default.nfkd().normalize(target);
+                        if (suppress_NFKC && nfkdSource.equals(nfkdTarget)) {
+                           System.out.println("Suppressing nfkc for: " + Default.ucd().getCodeAndName(source));
+                        } else {
+                            add(source, target, type, count, line);
+                        }
 					}
 				}
 				in.close();
@ -1137,39 +1250,49 @@ public class GenerateConfusables {
 		}
 		
 		public void writeSource(String directory, String filename) throws IOException {
-			PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
-			out.println("# Source File for IDN Confusables");
-			out.println("# $Revision: 1.11 $");
-			out.println("# $Date: 2006/06/09 21:21:20 $");
-			out.println("");
+            PrintWriter out = openAndWriteHeader(filename, "Source File for IDN Confusables");
+//			PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
+//			out.println("# Source File for IDN Confusables");
+//			out.println("# $Revision: 1.12 $");
+//			out.println("# $Date: 2006/09/24 23:32:44 $");
+//			out.println("");
 			dataMixedAnycase.writeSource(out);
 			out.close();
 		}
 		
 		public void writeSourceOrder(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException {
-			PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
-			out.print('\uFEFF');
-			out.println("# Recommended confusable mapping for IDN");
-			out.println("# $Revision: 1.11 $");
-			out.println("# $Date: 2006/06/09 21:21:20 $");
-			out.println("");
+            PrintWriter out = openAndWriteHeader(filename, "Recommended confusable mapping for IDN");
+//            PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
+//			out.println("# Recommended confusable mapping for IDN");
+//			out.println("# $Revision: 1.12 $");
+//			out.println("# $Date: 2006/09/24 23:32:44 $");
+//			out.println("");

 			if (appendFile) {
 				String[] replacements = {"%date%", Default.getDate()};
 				Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt", 
 						Utility.UTF8_WINDOWS, out, replacements);
 			}
-			writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs);
-			writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs);
-			writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs);
-			writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs);
+            if (true) {
+                writeSourceOrder(out, dataMixedAnycase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, true, true);
+                writeSourceOrder(out, dataMixedAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, true);
+                writeSourceOrder(out, dataMixedAnycase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, true, false);
+                writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false);                
+            } else {
+    			writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, false, false);
+    			writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, false);
+    			writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, false, false);
+    			writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false);
+            }
 			out.close();
 		}
 		/**
 		 * @param skipNFKEquivs TODO
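+		 * @param onlyLowercase if true, only targets for which source + target is unchanged by full case folding are considered
+		 * @param onlySingleScript if true, only targets for which source + target is not mixed-script are considered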
 		 * 
 		 */
-		private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs) {
+		private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs, boolean onlyLowercase, boolean onlySingleScript) {
 			// first get all the sets. Then get the best paradigm from each. Then sort.
 //			Set setOfSets = data.getEquivalenceSets();
 //			Map orderedResults = new TreeMap(betterTargetIsLess);
@ -1186,16 +1309,30 @@ public class GenerateConfusables {
 			out.println();
 			int count = 0;
 			UnicodeSet preferredID = getIdentifierSet();
+            ArrayComparator ac = new ArrayComparator(new Comparator[] {UCAComparator, UCAComparator});
+            Set orderedPairs = new TreeSet(ac);
 			for (Iterator it = items.iterator(); it.hasNext();) {
 				String source = (String) it.next();
-				if (UTF16.hasMoreCodePointsThan(source,1)) continue;
-				String target = data.getParadigm(source);
+                if (UTF16.hasMoreCodePointsThan(source,1)) continue;
+				String target = data.getParadigm(source, onlyLowercase, onlySingleScript);
+                if (target == null) continue;
 				if (source.equals(target)) continue;
 				if (skipNFKEquivs) {
 					if (!Default.nfkd().normalize(source).equals(source)) continue;
 				}
+                orderedPairs.add(new String[] {target, source});
+            }
+            String lastTarget = null;
+            for (Iterator it = orderedPairs.iterator(); it.hasNext();) {
+                String[] pair = (String[]) it.next();
+                String source = pair[1];
+                String target = pair[0];
 				String reason = fixReason(data.getReasons(source, target));
+                if (lastTarget != null && !lastTarget.equals(target)) {
+                    out.println();
+                }
 				writeSourceTargetLine(out, source, tag, target, reason);
+                lastTarget = target;
 				count++;
 			}
 			out.println();
@ -1326,7 +1463,7 @@ public class GenerateConfusables {
 		 */
 		public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) {
 			int count = 0;
-			for (UnicodeSetIterator it = new UnicodeSetIterator(decompMap.getSet(null).complement()); it.next(); ) {
+			for (UnicodeSetIterator it = new UnicodeSetIterator(decompMap.keySet()); it.next(); ) {
 				add(it.getString(), (String)decompMap.getValue(it.codepoint), type, ++count, errorLine);
 			}
 		}
@ -1355,13 +1492,14 @@ public class GenerateConfusables {
 		 * 
 		 */
 		public void writeSummary(String outdir, String filename, boolean outputOnly, UnicodeSet script) throws IOException {
-			PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
-			UnicodeSet representable = new UnicodeSet();
-			out.print('\uFEFF');
-			out.println("# Summary: Recommended confusable mapping for IDN");
-			out.println("# $Revision: 1.11 $");
-			out.println("# $Date: 2006/06/09 21:21:20 $");
-			out.println("");
+            PrintWriter out = openAndWriteHeader(filename, "Summary: Recommended confusable mapping for IDN");
+//			PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
+//			out.print('\uFEFF');
+//			out.println("# Summary: Recommended confusable mapping for IDN");
+//			out.println("# $Revision: 1.12 $");
+//			out.println("# $Date: 2006/09/24 23:32:44 $");
+//			out.println("");
+            UnicodeSet representable = new UnicodeSet();
 			MyEquivalenceClass data = dataMixedAnycase;
 			Set items = data.getOrderedExplicitItems();
 //			for (Iterator it = items.iterator(); it.hasNext();) {
@ -1481,11 +1619,12 @@ public class GenerateConfusables {
 				wsAny.addEquivalents(equivalents);
 				wsLower.addEquivalents(equivalents);
 			}
-			PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
-			out.print('\uFEFF');
-			out.println("# Summary: Whole-Script Confusables");
-			out.println("# $Revision: 1.11 $");
-			out.println("# $Date: 2006/06/09 21:21:20 $");
+            PrintWriter out = openAndWriteHeader(filename, "Summary: Whole-Script Confusables");
+//			PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
+//			out.print('\uFEFF');
+//			out.println("# Summary: Whole-Script Confusables");
+//			out.println("# $Revision: 1.12 $");
+//			out.println("# $Date: 2006/09/24 23:32:44 $");
 			out.println("# This data is used for determining whether a strings is a");
 			out.println("# whole-script or mixed-script confusable.");
 			out.println("# The mappings here ignore common and inherited script characters,");
@ -1716,7 +1855,6 @@ public class GenerateConfusables {
 	}
 	
 	private static void generateConfusables(String indir, String outdir) throws IOException {
-		betterTargetIsLess.compare("\u0020", "\u2004");
 		File dir = new File(indir);
 		String[] names = dir.list();
 		DataSet total = new DataSet();
@ -1731,12 +1869,26 @@ public class GenerateConfusables {
 			total.addAll(ds);
 			total.close("t*" + names[i]);
 		}
+        // add normalized data
+//        for (int i = 0; i <= 0x10FFFF; ++i) {
+//            if (Default.nfkc().isNormalized(i)) continue;
+//            String result = getModifiedNKFC(UTF16.valueOf(i));
+//            ds.foo();
+//        }
+        getSkipNFKD();
 		DataSet ds = new DataSet();
 		ds.addUnicodeMap(nfcMap, "nfc", "nfc");
 		ds.close("*");
+        total.addAll(ds);
+        total.close("*");
+
+        ds = new DataSet();
+        ds.addUnicodeMap(nfkcMap, "nfkc", "nfkc");
+        ds.close("*");
 		//ds.write(outdir, "new-decomp.txt", false, false);
 		total.addAll(ds);
 		total.close("*");
+        
 		total.writeSummary(outdir, "confusablesSummary.txt", false, null);
 		total.writeSummary(outdir, "confusablesSummaryIdentifier.txt", true, null);
 		//total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true, 
@ -1893,6 +2045,12 @@ public class GenerateConfusables {
 		MARK_ASCII = new Integer(10);

 	static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess();
+    
+    static UnicodeSet XID = new UnicodeSet("[:xidcontinue:]");
+    
+    static boolean isXid(String x) {
+        return  XID.containsAll(x);
+    }
 	
 	static class _BetterTargetIsLess implements Comparator {
 		IdentifierInfo info = IdentifierInfo.getIdentifierInfo();
@ -1900,9 +2058,20 @@ public class GenerateConfusables {
 		public int compare(Object o1, Object o2) {
 			String a = (String)o1;
 			String b = (String)o2;
+            // longer is better (less)
 			int ca = UTF16.countCodePoint(a);
 			int cb = UTF16.countCodePoint(b);
-			if (ca != cb) return ca > cb ? -1 : 1;
+			if (ca != cb)  {
+                return ca > cb ? -1 : 1;
+            }
+            
+            // an identifier (all characters in XID) is better (less)
+            boolean ba = isXid(a);
+            boolean bb = isXid(b);
+            if (ba != bb) {
+                return ba ? -1 : 1;
+            }
+            
 			int aok = getValue(a);
 			int bok = getValue(b);
 			if (aok != bok) return aok < bok ? -1 : 1;
@ -1947,4 +2116,28 @@ public class GenerateConfusables {
 		return type.substring(dash+1,period);
 	}

+    static Normalizer modNFKC ;
+
+     private static String getModifiedNKFC(String cf) {
+         if (modNFKC == null) {
+             modNFKC =  new Normalizer(Normalizer.NFKC, Default.ucdVersion());
+             modNFKC.setSpacingSubstitute();
+         }
+         return modNFKC.normalize(cf);
+     }
+     
+     private static PrintWriter openAndWriteHeader(String filename, String title) throws IOException {
+         PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
+         out.print('\uFEFF');
+         out.println("# " + title);
+         out.println("# File: " + filename);
+         out.println("# Version: " + version);
+         out.println("# Generated: " + Default.getDate());
+         out.println("# Checkin: $Revision: 1.12 $");
+         out.println("#");
+         out.println("# For documentation and usage, see http://www.unicode.org/reports/tr39/");
+         out.println("#");
+         return out;
+     }
+
 }
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
-* $Date: 2004/02/07 01:01:14 $
-* $Revision: 1.4 $
+* $Date: 2006/09/24 23:32:44 $
+* $Revision: 1.5 $
 *
 *******************************************************************************
 */
@ -14,9 +14,10 @@
 package com.ibm.text.UCD;
 import java.io.*;
 import com.ibm.icu.text.UTF16;
-import com.ibm.text.utility.*;
+//import com.ibm.text.utility;
 import com.ibm.icu.text.UnicodeSet;
-import java.util.*;
+import com.ibm.text.utility.Utility;
+//import java.util.*;

 public class GenerateThaiBreaks {
  public static void main(String [] args) throws IOException {
--- a/tools/unicodetools/com/ibm/text/UCD/InvariantTest.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/InvariantTest.txt
@ -1,3 +1,20 @@
+Show [[:block=tamil:] & [:age=3.2:] - [:age=3.1:]]
+Show [[:block=tamil:] & [:age=4.0:] - [:age=3.2:]]
+Show [[:block=tamil:] & [:age=4.1:] - [:age=4.0:]]
+Show [[:block=tamil:] & [:age=5.0:] - [:age=4.1:]]
+
+Stop
+
+Show [[:NFKCQuickCheck=No:] & [$gc:Lm]]
+
+Stop
+
+[$Name:  $gc:Sk]
+[$Name:  $gc:Lm]
+
+Show [[$whitespace] - [$gc:zs]]
+Show [[$gc:zs] - [$whitespace]]
+
 Let $letter = [$gc:Lu $gc:Ll $gc:Lt $gc:Lo $gc:Lm];
 Let $number = [$gc:Nd $gc:Nl $gc:No]
 Let $mark = [$gc:mn $gc:me $gc:mc]
@ -62,7 +79,7 @@ Let $guessClose = [$gc:pf $gc:pe $gc:pi]
 $guessClose = $__closing_punc

 Let $guessTerm = [$sb:aterm $sb:sterm]
-$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? … ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]
+$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? <EFBFBD> ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]

 Let $__issymotherr = [\u00A6\u00A7\u06FD\u06FE\u0F01-\u0F03\u0F13-\u0F17\u0F1A-\u0F1F\u0FBE-\u0FC5\u0FC7-\u0FCC\u2100\u2101\u2104-\u2106\u2108\u2109\u2117\u2118\u211E-\u2121\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u2613\u2619-\u266E\u2670\u2671\u2701-\u2704\u2706-\u2709\u270C-\u2727\u2729-\u274B\u274F-\u2752\u2758-\u275E\u2761-\u2794\u2798-\u27AF\u27B1-\u27BE\u2800-\u28FF\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3012\u3013\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u3200-\u321C\u322A-\u3243\u3260-\u327B\u328A-\u32B0\u32C0-\u32CB\u32D0-\u32FE\u3300-\u3376\u337B-\u33DD\u33E0-\u33FE\uA490-\uA4A1\uA4A4-\uA4B3\uA4B5-\uA4C0\uA4C2-\uA4C4\uFFED\uFFEE\uFFFC\uFFFD]
 Let $__issymothers = [\u00B6\u0482\u06E9\u09FA\u0B70\u0F34\u0F36\u0F38\u0FCF\u2114\u2123\u2125\u2127\u2129\u212E\u2132\u213A\u21D3\u220E\u2617\u274D\u2756\u3004\u3020\u327F\uA4C6\uFFE4\uFFE8]
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
@ -1,5 +1,5 @@
-Generate: .*
-DeltaVersion: 16
+Generate: .*BreakTest.*
+DeltaVersion: 17
 CopyrightYear: 2006

 File: auxiliary/GraphemeBreakProperty
--- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
-* $Date: 2005/11/01 00:10:54 $
-* $Revision: 1.17 $
+* $Date: 2006/09/24 23:32:44 $
+* $Revision: 1.18 $
 *
 *******************************************************************************
 */
@ -14,9 +14,13 @@
 package com.ibm.text.UCD;

 import java.util.*;
+
+import com.ibm.icu.dev.test.util.UnicodeMap;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;

 import com.ibm.text.utility.*;
+import com.sun.java_cup.internal.internal_error;


 /**
@ -302,6 +306,7 @@ public final class Normalizer implements UCD_Types {
    private byte form;
    private boolean composition;
    private boolean compatibility;
+    private UnicodeMap substituteMapping;

    /**
    * Decomposes text, either canonical or compatibility,
@ -319,7 +324,12 @@ public final class Normalizer implements UCD_Types {
        for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
            buffer.setLength(0);
            ch32 = UTF16.charAt(source, i);
-            data.getRecursiveDecomposition(ch32, buffer, compat);
+            String sub = substituteMapping == null ? null : (String) substituteMapping.getValue(ch32);
+            if (sub != null) {
+                buffer.append(sub);
+            } else {
+                data.getRecursiveDecomposition(ch32, buffer, compat);
+            }

            // add all of the characters in the decomposition.
            // (may be just the original character, if there was
@ -561,6 +571,81 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
        return result;
    }

+    public UnicodeMap getSubstituteMapping() {
+        return substituteMapping;
+    }
+
+    public Normalizer setSubstituteMapping(UnicodeMap substituteMapping) {
+        this.substituteMapping = substituteMapping;
+        return this;
+    }
+    
+    static UnicodeMap spacingMap;;
+    public void setSpacingSubstitute() {
+        if (spacingMap == null) {
+            makeSpacingMap();
+        }
+        setSubstituteMapping(spacingMap);
+    }
+
+    private void makeSpacingMap() {
+        spacingMap = new UnicodeMap();
+       StringBuffer b = new StringBuffer();
+       main:
+       for (int i = 0; i <= 0x10FFFF; ++i) {
+           boolean compat = data.ucd.getDecompositionType(i) >= data.ucd.CANONICAL; 
+           if (!compat) continue;
+           b.setLength(0);
+           data.getRecursiveDecomposition(i, b, true);
+           if (b.length() == 1) continue;
+           char firstChar = b.charAt(0);
+           if (firstChar != 0x20 && firstChar != '\u0640') continue;
+           // if rest are just Mn or Me marks, then add to substitute mapping
+           int cp;
+           for (int j = 1; j < b.length(); j += UTF16.getCharCount(cp)) {
+               cp = UTF16.charAt(b,j);
+               int cat = data.ucd.getCategory(cp);
+               if (cat != data.ucd.Mn && cat != data.ucd.Me) continue main;
+           }
+           spacingMap.put(i, UTF16.valueOf(i));
+        }
+        String[][] specials = {
+                {"[\\u0384\\u1FFD]", "\u00B4"},
+                {"[\\uFFE3]", "\u00AF"},
+                {"[\\uFE49-\\uFE4C]", "\u203E"},
+                {"[\\u1FED]", "\u00A8\u0300"},
+                {"[\\u1FEE\\u0385]", "\u00A8\u0301"},
+                {"[\\u1FC1]", "\u00A8\u0342"},
+                {"[\\u1FBD]", "\u1FBF"},
+                {"[\\u1FCD]", "\u1FBF\u0300"},
+                {"[\\u1FCE]", "\u1FBF\u0301"},
+                {"[\\u1FCF]", "\u1FBF\u0342"},
+                {"[\\u1FDD]", "\u1FFE\u0300"},
+                {"[\\u1FDE]", "\u1FFE\u0301"},
+                {"[\\u1FDF]", "\u1FFE\u0342"},
+                {"[\\uFC5E]", "\uFE72\u0651"},
+                {"[\\uFC5F]", "\uFE74\u0651"},
+                {"[\\uFC60]", "\uFE76\u0651"},
+                {"[\\uFC61]", "\uFE78\u0651"},
+                {"[\\uFC62]", "\uFE7A\u0651"},
+                {"[\\uFC63]", "\uFE7C\u0670"},
+                {"[\\uFCF2]", "\uFE77\u0651"},
+                {"[\\uFCF3]", "\uFE79\u0651"},
+                {"[\\uFCF4]", "\uFE7B\u0651"},
+            };
+            int count = 0;
+            UnicodeSet mappedChars = spacingMap.keySet();
+            for (int i = 0; i < specials.length; ++i) {
+                UnicodeSet source = new UnicodeSet(specials[i][0]);
+                if (!mappedChars.containsAll(source)) {
+                    throw new InternalError("Remapping character that doesn't need it!" + source);
+                }
+                spacingMap.putAll(source, specials[i][1]);
+                count += source.size();
+            }
+            spacingMap.freeze();
+    }
+
    /**
    * Just accessible for testing.
    */
--- a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
-* $Date: 2006/06/09 21:21:20 $
-* $Revision: 1.12 $
+* $Date: 2006/09/24 23:32:45 $
+* $Revision: 1.13 $
 *
 *******************************************************************************
 */
@ -24,6 +24,7 @@ import java.io.Writer;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Comparator;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashSet;
 import java.util.Map;
@ -38,6 +39,7 @@ import com.ibm.icu.dev.demo.translit.CaseIterator;
 import com.ibm.icu.dev.test.util.BagFormatter;
 import com.ibm.icu.dev.test.util.Tabber;
 import com.ibm.icu.dev.test.util.UnicodeMap;
+import com.ibm.icu.dev.test.util.UnicodeProperty.UnicodeMapProperty;
 import com.ibm.icu.impl.PrettyPrinter;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
@ -57,7 +59,16 @@ import com.ibm.icu.util.ULocale;
 public class QuickTest implements UCD_Types {
 	public static void main(String[] args) throws IOException {
 		try {
+            
+            getHangulDecomps();
+
+            if (true) return;
+
 			
+      showLeadingTrailingNonStarters();
+      //checkBufferStatus(true);
+      
+      
 			checkNormalization("NFC", Default.nfc());
 			//checkNormalization("NFKC", Default.nfkc());
 			
@ -66,7 +77,6 @@ public class QuickTest implements UCD_Types {
 			checkCaseChanges();
 			if (true) return;
 			
-			checkBufferStatus();
 			
 			
 			checkCase();
@ -102,7 +112,43 @@ public class QuickTest implements UCD_Types {
 		}
 	}
 	
-	static void checkNormalization(String title, Normalizer nfx) {
+	private static void getHangulDecomps() {
+        //Normalizer nfkd500 = new Normalizer(Normalizer.NFKD, "5.0.0");
+        Normalizer nfkd218 = new Normalizer(Normalizer.NFKD, "2.1.8");
+        UnicodeMap diff = new UnicodeMap();
+        Map compose = new HashMap();
+        Map decompose = new HashMap();
+        // UnicodeSet applicable = // new UnicodeSet("[:HangulSyllable=NA:]");
+        UnicodeSet applicable = new UnicodeSet("[[\u1100-\u11FF \uAC00-\uD7FF]&[:assigned:]]");
+        for (UnicodeSetIterator it = new UnicodeSetIterator(applicable); it.next(); ) {
+            String source = it.getString();
+            String v218 = nfkd218.normalize(source);
+            //String v500 = nfkd500.normalize(source);
+            if (v218.equals(source)) continue;
+            decompose.put(source, v218);
+            compose.put(v218, source);
+        }
+        // now try recomposing
+
+        for (Iterator it = decompose.keySet().iterator(); it.hasNext();) {
+            String source = (String) it.next();
+            String decomposition = (String) decompose.get(source);
+            if (decomposition.length() > 2) {
+                String trial = decomposition.substring(0, decomposition.length() - 1);
+                String composition = (String) compose.get(trial);
+                if (composition != null) {
+                    decomposition = composition + decomposition.substring(decomposition.length() - 1);
+                }
+            }
+            if (decomposition.length() != 2) System.out.println("Failed decomp: " + Default.ucd().getCodeAndName(source));
+            diff.put(source.charAt(0), com.ibm.text.utility.Utility.hex(decomposition, " "));
+        }
+        UnicodeMapProperty p = new UnicodeMapProperty().set(diff);
+        BagFormatter bf = new BagFormatter().setValueSource(p);
+        System.out.println(bf.showSetNames(diff.keySet()));
+    }
+
+    static void checkNormalization(String title, Normalizer nfx) {
 		UnicodeSet trailing = new UnicodeSet();
 		UnicodeSet leading = new UnicodeSet();
 		UnicodeSet starter = new UnicodeSet();
@ -947,20 +993,22 @@ public class QuickTest implements UCD_Types {
 	static Counter bufferTypes = new Counter();
 	
 	static class BufferData {
+    byte starterIsZero;
 		int initials;
 		int medials;
 		int finals;
 		int sample;
 		public boolean equals(Object other) {
 			BufferData that = (BufferData)other;
-			return initials == that.initials && medials == that.medials && finals == that.finals;
+			return starterIsZero == that.starterIsZero && initials == that.initials && medials == that.medials && finals == that.finals;
 		}
 		public int hashCode() {
-			return (initials*37 + medials)*37 + finals;
+			return ((starterIsZero * 37 + initials)*37 + medials)*37 + finals;
 		}
 		public BufferData set(int codepoint) {
 			String s = Default.nfkd().normalize(codepoint);
 			int cp;
+      starterIsZero = (byte)(UCharacter.getCombiningClass(codepoint) == 0 ? 0 : 1);
 			boolean isInitial = true;
 			for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
 				cp = UTF16.charAt(s, i);
@ -977,14 +1025,30 @@ public class QuickTest implements UCD_Types {
 					finals = 0;
 				}
 			}
+      if (medials != 0) medials = 1;
 			sample = codepoint;
+      if (starterIsZero == 0 && medials == 0) {
+        System.out.println("WARNING: BAD CHARACTER");
+        cp = sample;
+        int ccc = UCharacter.getCombiningClass(cp);
+        System.out.println("U+" +  Utility.hex(cp) + "\t" + UCharacter.getName(cp) + " (ccc=" + ccc + ")");
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+          cp = UTF16.charAt(s, i);
+          ccc = UCharacter.getCombiningClass(cp);
+          System.out.println("\tU+" +  Utility.hex(cp) + "\t" + UCharacter.getName(cp) + " (ccc=" + ccc + ")");
+        }
+      }
 			return this;
 		}
+    public static String getHeader() {
+      return "Starter?" + "\t" + "initials" + "\t" + "Contains Starter?" + "\t" + "finals" + "\t"  + "sample hex" + "\t" + "sample name";
+    }
 		public String toString() {
+      String result = (starterIsZero == 0 ? "Y" : "") + "\t" + initials + "\t" + (medials != 0 ? "Y" : "") + "\t" + finals + "\t";
 			if (sample == 0) {
-				return initials + "\t" + medials + "\t" + finals + "\t" + "-" + "\t" + "all others";
+				return  result + "-" + "\t" + "all others";
 			}
-			return initials + "\t" + medials + "\t" + finals + "\t" + Utility.hex(sample) + "\t" + UCharacter.getName(sample);
+			return result  + Utility.hex(sample) + "\t" + UCharacter.getName(sample);
 		}
 	}
 	static class BufferDataComparator implements Comparator {
@ -992,14 +1056,15 @@ public class QuickTest implements UCD_Types {
 			BufferData a0 = (BufferData)arg0;
 			BufferData a1 = (BufferData)arg1;
 			int result;
-			if (0 != (result = a0.initials - a1.initials)) return result;
+      if (0 != (result = a0.starterIsZero - a1.starterIsZero)) return result;
+      if (0 != (result = a0.initials - a1.initials)) return result;
 			if (0 != (result = a0.finals - a1.finals)) return result;
 			if (0 != (result = a0.medials - a1.medials)) return result;
 			return 0;
 		}
 	}
-	private static void checkBufferStatus() {
-		BufferData non = new BufferData().set(0);
+	private static void showLeadingTrailingNonStarters() {
+    BufferData non = new BufferData().set(0);
 		Tabber tabber = new Tabber.HTMLTabber();
 		for (int i = 0; i <= 0x10ffff; ++i) {
 			int type = Default.ucd().getCategory(i);
@ -1013,6 +1078,7 @@ public class QuickTest implements UCD_Types {
 		TreeSet sorted = new TreeSet(new BufferDataComparator());
 		NumberFormat nf = NumberFormat.getInstance();
 		sorted.addAll(m.keySet());
+    System.out.println(tabber.process("total\t" + BufferData.getHeader()));
 		for (Iterator it = sorted.iterator(); it.hasNext();) {
 			Object key = it.next();
 			Object value = bufferTypes.getCount(key);
--- a/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java
@ -10,6 +10,8 @@ import java.util.List;
 import java.util.Locale;

 import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.Tabber;
+import com.ibm.icu.dev.test.util.TransliteratorUtilities;
 import com.ibm.icu.dev.tool.UOption;
 import com.ibm.icu.text.SymbolTable;
 import com.ibm.icu.text.UTF16;
@ -21,13 +23,15 @@ public class TestUnicodeInvariants {
    private static final int
    HELP1 = 0,
    FILE = 1,
-    RANGE = 2
+    RANGE = 2,
+    TABLE = 3
    ;

    private static final UOption[] options = {
        UOption.HELP_H(),
        UOption.create("file", 'f', UOption.REQUIRES_ARG),
-        UOption.create("range", 'r', UOption.NO_ARG),
+        UOption.create("norange", 'n', UOption.NO_ARG),
+        UOption.create("table", 't', UOption.NO_ARG),
    };
    
    public static void main(String[] args) throws IOException {
@ -35,7 +39,10 @@ public class TestUnicodeInvariants {

    	String file = "UnicodeInvariants.txt";
    	if (options[FILE].doesOccur) file = options[FILE].value;
-    	boolean doRange = options[RANGE].doesOccur;
+    	boolean doRange = !options[RANGE].doesOccur;
+        System.out.println("File:\t" + file);
+        System.out.println("Ranges?\t" + doRange);
+        System.out.println("HTML?\t" + options[TABLE].doesOccur);
    	
        testInvariants(file, doRange);
    }
@ -92,11 +99,19 @@ public class TestUnicodeInvariants {
       PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
       out.write('\uFEFF'); // BOM
       BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", outputFile);
-       BagFormatter bf = new BagFormatter();
-       bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
-       BagFormatter bf2 = new BagFormatter();
-       bf2.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
-       bf2.setMergeRanges(doRange);
+       
+       BagFormatter errorLister = new BagFormatter();
+       errorLister.setMergeRanges(doRange);
+       errorLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
+       errorLister.setShowLiteral(TransliteratorUtilities.toXML);
+       if (options[TABLE].doesOccur) errorLister.setTabber(new Tabber.HTMLTabber());
+       
+       BagFormatter showLister = new BagFormatter();
+       showLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
+       showLister.setMergeRanges(doRange);
+       showLister.setShowLiteral(TransliteratorUtilities.toXML);
+       if (options[TABLE].doesOccur) showLister.setTabber(new Tabber.HTMLTabber());
+              
       ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
           ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"),
           ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
@ -112,6 +127,7 @@ public class TestUnicodeInvariants {
           int pos = line.indexOf('#');
           if (pos >= 0) line = line.substring(0,pos).trim();
           if (line.length() == 0) continue;
+           if (line.equalsIgnoreCase("Stop")) break;

           // fix all the variables
           String oldLine = line;
@ -133,12 +149,12 @@ public class TestUnicodeInvariants {
           		String part = line.substring(4).trim();
           		if (part.startsWith("Each")) {
           			part = part.substring(4).trim();
-           			bf2.setMergeRanges(false);
+           			showLister.setMergeRanges(false);
           		}
           		pp.setIndex(0);
           		UnicodeSet leftSet = new UnicodeSet(part, pp, st);
-           		bf2.showSetNames(out, leftSet);
-           		bf2.setMergeRanges(doRange);
+           		showLister.showSetNames(out, leftSet);
+           		showLister.setMergeRanges(doRange);
 				continue;
           }
           
@ -210,7 +226,7 @@ public class TestUnicodeInvariants {
           out.println();
           out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH));
           out.println("**** START Error Info ****");
-           bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
+           errorLister.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
           out.println("**** END Error Info ****");
           out.println();
           testFailureCount++;      
--- a/tools/unicodetools/readme.html
+++ b/tools/unicodetools/readme.html
@ -331,11 +331,10 @@ AC00..D7A3     # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
 </ol>
 <h3>5. UCA</h3>
 <ol>
-	<li>
-	You will use com.ibm.text.UCA.Main as your main class, creating along 
+	<li>You will use com.ibm.text.UCA.Main as your main class, creating along 
 	the same lines as above.</li>
 	<li>To test whether the UCA files are valid, use the
-	<span style="font-weight: 400">options (<i>note: you should also build the ICU 
+	<span style="font-weight: 400">options (<i>note: you must also build the ICU 
 	files below, since they test other aspects</i>).</span><pre>writeCollationValidityLog</pre>
 	<p>It will create a file:</p>
 	<pre><a href="file:///C:/DATA/GEN/collation/5.0.0/CheckCollationValidity.html">C:\DATA\GEN\collation\5.0.0\CheckCollationValidity.html</a></pre>
@ -354,24 +353,45 @@ AC00..D7A3     # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
 		</ol></li>
 	</ol></li>
 	<li>
-	<h4><span style="font-weight: 400">To build all the charts, use the options:
-	</span> </h4>
+	<h4><span style="font-weight: 400">To build all the charts (including those 
+	for the UCA), use the options: </span></h4>
 	<pre>normalizationChart caseChart scriptChart indexChart</pre>
+	<p>They will be built into:</p>
+	<pre><a href="file:///C:/DATA/GEN/charts">C:\DATA\GEN\charts</a></pre>
+	<p><b>Once the UCA is released, copy those files to the corresponding 
+	locations on the Unicode site:</b></p><ul>
+		<li>
+		<pre><a href="http://www.unicode.org/charts/normalization/">http://www.unicode.org/charts/normalization/</a></pre>
+		</li>
+		<li>
+		<pre><a href="http://www.unicode.org/charts/collation/">http://www.unicode.org/charts/collation/</a> </pre>
+		</li>
+		<li>
+		<pre><a href="http://www.unicode.org/charts/case/">http://www.unicode.org/charts/case/</a> </pre>
+		</li>
+		<li>
+		<pre><a href="http://www.unicode.org/charts/collation/">http://www.unicode.org/charts/collation/</a> </pre>
+		</li>
+	</ul>
 	</li>
 	<li>
 	<h4><span style="font-weight: 400">To build all the UCA files used by ICU, use the 
 	option:</span></h4>
 	<pre>ICU</pre>
+	<p>They will be built into:</p>
+	<pre><a href="file:///C:/DATA/GEN/collation/5.0.0">C:\DATA\GEN\collation\5.0.0</a></pre>
 	</li>
 	<li>You should then build a set of the ICU files for the previous version, 
-	if you don't have them. The key file is UCA_Rules_NoCE.txt. It contains the 
-	rules expressed in ICU format, which allows for comparison across versions 
-	of UCA.<ol>
-	<li>Do a Diff, and verify that all the differences are either new 
-	characters, or were authorized to be changed by the UTC.</li>
-</ol>
-
-	</li>
+	if you don't have them. Use the options:<pre>version 4.1.0 ICU</pre>
+	<p>(Replace 4.1.0 with whatever the previous version actually was.)</p></li>
+	<li>Now, you will want to compare versions. The key file is 
+	UCA_Rules_NoCE.txt. It contains the rules expressed in ICU format, which 
+	allows for comparison across versions of UCA without spurious variations of 
+	the numbers getting in the way.<ol>
+		<li>Do a diff between the previous and current versions of these files, and 
+		verify that every difference is either due to new characters or is a 
+		change that was authorized by the UTC.</li>
+	</ol></li>
 </ol>

 </body>