From ac3cc9119ba6328bac86fb16732b94a6f0e71cb4 Mon Sep 17 00:00:00 2001
From: Mark Davis <mark@macchiato.com>
Date: Wed, 6 Apr 2005 08:48:17 +0000
Subject: [PATCH] ICU-0 updates for uca 4.1.0

X-SVN-Rev: 17468
---
 .../ibm/icu/dev/test/util/BagFormatter.java   | 81 ++++++++++++----
 .../icu/dev/test/util/UnicodeProperty.java    | 93 ++++++++++++------
 .../com/ibm/text/UCA/GenOverlap.java          | 14 +--
 .../com/ibm/text/UCA/Implicit.java            | 10 +-
 tools/unicodetools/com/ibm/text/UCA/Main.java | 71 +++++++++++---
 tools/unicodetools/com/ibm/text/UCA/UCA.java  | 96 +++++++++++++++----
 .../com/ibm/text/UCA/UCA_Types.java           | 12 +--
 .../com/ibm/text/UCA/WriteCharts.java         | 31 +++---
 .../com/ibm/text/UCA/WriteCollationData.java  | 83 +++++++++++-----
 .../com/ibm/text/UCD/TestData.java            | 80 ++++++++++++----
 .../com/ibm/text/UCD/idn-charsHeader.html     | 19 +++-
 11 files changed, 440 insertions(+), 150 deletions(-)

diff --git a/icu4j/src/com/ibm/icu/dev/test/util/BagFormatter.java b/icu4j/src/com/ibm/icu/dev/test/util/BagFormatter.java
index 1c7dd1b80c0..206d2f5fc89 100644
--- a/icu4j/src/com/ibm/icu/dev/test/util/BagFormatter.java
+++ b/icu4j/src/com/ibm/icu/dev/test/util/BagFormatter.java
@@ -46,7 +46,10 @@ public class BagFormatter {
         "'>' > '&gt;' ;";
 
     private static final String HTML_RULES = BASE_RULES + CONTENT_RULES + 
-	    "'\"' > '&quot;' ; ";
+    "'\"' > '&quot;' ; ";
+
+    private static final String HTML_RULES_CONTROLS = HTML_RULES + 
+    "([[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]]) > &hex/unicode($1) ; ";
 
     private static final String XML_RULES = HTML_RULES +
 	    "'' > '&apos;' ; ";
@@ -94,6 +97,8 @@ the double-quote character (") as "&quot;".
 
     public static final Transliterator toHTML = Transliterator.createFromRules(
             "any-html", HTML_RULES, Transliterator.FORWARD);
+    public static final Transliterator toHTMLControl = Transliterator.createFromRules(
+            "any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD);
     public static final Transliterator fromHTML = Transliterator.createFromRules(
             "html-any", HTML_RULES, Transliterator.REVERSE);
 
@@ -151,6 +156,14 @@ the double-quote character (") as "&quot;".
         return result.getBuffer().toString();
     }
 
+    /**
+     * Compares two UnicodeSets and prints every section of the comparison, by calling
+     * the flags overload with all flag bits set (-1).
+     */
+    public void showSetDifferences(PrintWriter pw, String name1, UnicodeSet set1,
+            String name2, UnicodeSet set2) {
+        showSetDifferences(pw, name1, set1, name2, set2, -1);
+    }
     /**
      * Compare two UnicodeSets, and show the differences
      * @param name1 name of first set to be compared
@@ -164,24 +177,37 @@ the double-quote character (") as "&quot;".
         String name1,
         UnicodeSet set1,
         String name2,
-        UnicodeSet set2) {
+        UnicodeSet set2,
+		int flags) 
+    {
         if (pw == null) pw = CONSOLE;
         String[] names = { name1, name2 };
 
-        UnicodeSet temp = new UnicodeSet(set1).removeAll(set2);
-        pw.println();
-        pw.println(inOut.format(names));
-        showSetNames(pw, temp);
+        UnicodeSet temp;
+        
+        if ((flags&1) != 0) {
+        	temp = new UnicodeSet(set1).removeAll(set2);
+	        pw.print(lineSeparator);
+	        pw.print(inOut.format(names));
+	        pw.print(lineSeparator);
+	        showSetNames(pw, temp);
+        }
 
-        temp = new UnicodeSet(set2).removeAll(set1);
-        pw.println();
-        pw.println(outIn.format(names));
-        showSetNames(pw, temp);
+        if ((flags&2) != 0) {
+        	temp = new UnicodeSet(set2).removeAll(set1);
+	        pw.print(lineSeparator);
+	        pw.print(outIn.format(names));
+	        pw.print(lineSeparator);
+	        showSetNames(pw, temp);
+	    }
 
-        temp = new UnicodeSet(set2).retainAll(set1);
-        pw.println();
-        pw.println(inIn.format(names));
-        showSetNames(pw, temp);
+        if ((flags&4) != 0) {
+	        temp = new UnicodeSet(set2).retainAll(set1);
+	        pw.print(lineSeparator);
+	        pw.print(inIn.format(names));
+	        pw.print(lineSeparator);
+	        showSetNames(pw, temp);
+        }
         pw.flush();
     }
 
@@ -397,12 +423,14 @@ the double-quote character (") as "&quot;".
 
     // refactored
     public String getName(int codePoint, boolean withCodePoint) {
-        return getNameSource().getValue(codePoint, !withCodePoint);
+    	String result = getNameSource().getValue(codePoint, !withCodePoint);
+        return fixName == null ? result : fixName.transliterate(result);
     }
 
     public String getName(String s, boolean withCodePoint) {
-        return getNameSource().getValue(s, separator, !withCodePoint);
-    }
+       	String result = getNameSource().getValue(s, separator, !withCodePoint);
+        return fixName == null ? result : fixName.transliterate(result);
+     }
 
     public String hex(String s) {
         return hex(s,separator);
@@ -445,6 +473,7 @@ the double-quote character (") as "&quot;".
 
     private boolean mergeRanges = true;
     private Transliterator showLiteral = null;
+    private Transliterator fixName = null;
     private boolean showSetAlso = false;
 
     private RangeFinder rf = new RangeFinder();
@@ -580,10 +609,16 @@ the double-quote character (") as "&quot;".
                 doAt((Visitor.CodePointRange) o);
             } else {
                 String thing = o.toString();
+                String value = getValueSource() == UnicodeLabel.NULL ? "" : getValueSource().getValue(thing, ",", true);
+                if (value.length() != 0) value = "\t; " + value;
+                String label = getLabelSource(true).getValue(thing, ",", true);
+                if (label.length() != 0) label = " " + label;
                 output.print(
                     myTabber.process(
                         hex(thing)
+							+ value
                             + commentSeparator
+							+ label
                             + insertLiteral(thing)
                             + "\t"
                             + getName(thing))
@@ -1095,4 +1130,16 @@ the double-quote character (") as "&quot;".
         return this;
     }
 
+	/**
+	 * @return the transliterator that the getName(..., withCodePoint) methods apply to their result, or null if none is set
+	 */
+	public Transliterator getFixName() {
+		return fixName;
+	}
+	/**
+	 * @param fixName transliterator to run over the result of the getName(..., withCodePoint) methods; null leaves names unchanged
+	 */
+	public void setFixName(Transliterator fixName) {
+		this.fixName = fixName;
+	}
 }
diff --git a/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java b/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java
index c6deb9d47e8..a78f9fb4de0 100644
--- a/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java
+++ b/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java
@@ -121,7 +121,7 @@ public abstract class UnicodeProperty extends UnicodeLabel {
     public List getValueAliases(String valueAlias, List result) {
         if (result == null) result = new ArrayList(1);
         result = _getValueAliases(valueAlias, result);
-        if (!result.contains(valueAlias) && type < NUMERIC) {
+        if (!result.contains(valueAlias) ) { // FIX && type < NUMERIC 
         	result = _getValueAliases(valueAlias, result); // for debugging
             throw new IllegalArgumentException(
                 "Internal error: " + getName() + " doesn't contain " + valueAlias
@@ -609,6 +609,7 @@ public abstract class UnicodeProperty extends UnicodeLabel {
         }
 
         private class PropertySymbolTable implements SymbolTable  {
+        	static final boolean DEBUG = false;
             private String prefix;
             RegexMatcher regexMatcher = new RegexMatcher();
 
@@ -698,7 +699,7 @@ public abstract class UnicodeProperty extends UnicodeLabel {
                 int i;
                 for (i = start; i < limit; i += UTF16.getCharCount(cp)) {
                     cp = UTF16.charAt(text, i);
-                    if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {
+                    if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp) && cp != '.') {
                         break;
                     }
                 }
@@ -876,7 +877,9 @@ public abstract class UnicodeProperty extends UnicodeLabel {
     
     public static abstract class BaseProperty extends UnicodeProperty {
         protected List propertyAliases = new ArrayList(1);
-        String version;
+        protected Map toValueAliases;
+        protected String version;
+        
         public BaseProperty setMain(String alias, String shortAlias, int propertyType,
                 String version) {
                   setName(alias);
@@ -893,12 +896,56 @@ public abstract class UnicodeProperty extends UnicodeLabel {
             addAllUnique(propertyAliases, result);
             return result;
         }
+        /** For each row, registers row[0] as a value and every later element as an alias of it; returns this. */
+        public BaseProperty addValueAliases(String[][] valueAndAlternates) {
+            if (toValueAliases == null) _fixValueAliases();
+            for (int i = 0; i < valueAndAlternates.length; ++i) {
+                String[] row = valueAndAlternates[i]; // rows may carry different numbers of aliases
+                for (int j = 1; j < row.length; ++j) addValueAlias(row[0], row[j]);
+            }
+            return this;
+        }
+		public void addValueAlias(String value, String valueAlias) { // records valueAlias as another name for value
+			if (toValueAliases == null) _fixValueAliases(); // public entry point: the map may not be built yet
+			_ensureValueInAliases(value); // creates value's alias list (containing value) when it has none
+			addUnique(valueAlias, (List) toValueAliases.get(value));
+		}
+		protected List _getValueAliases(String valueAlias, List result) { // adds the aliases recorded for valueAlias to result
+			if (toValueAliases == null) _fixValueAliases(); // the alias map is built on first use
+            List a = (List) toValueAliases.get(valueAlias);
+            if (a != null) addAllUnique(a, result); // a value with no entry contributes nothing
+            return result;
+        }
+
+		/** Creates the alias map if needed, then gives every available value an alias list containing itself. */
+		protected void _fixValueAliases() {
+			if (toValueAliases == null) toValueAliases = new HashMap(1);
+			Iterator it = getAvailableValues().iterator();
+			while (it.hasNext()) {
+				_ensureValueInAliases(it.next());
+			}
+		}
+		protected void _ensureValueInAliases(Object value) { // guarantees value has an alias list that includes value itself
+			List result = (List) toValueAliases.get(value);
+			if (result == null) toValueAliases.put(value, result = new ArrayList(1)); // first sighting: start an empty list
+			addUnique(value, result);
+		}
+        /** Exchanges the first two aliases of every value that has at least two; returns this. */
+        public BaseProperty swapFirst2ValueAliases() {
+            if (toValueAliases == null) _fixValueAliases(); // map is built lazily; avoid a null dereference
+            for (Iterator it = toValueAliases.values().iterator(); it.hasNext();) {
+                List list = (List) it.next();
+                if (list.size() < 2) continue;
+                list.set(0, list.set(1, list.get(0))); // List.set returns the element it replaced
+            }
+            return this;
+        }
+
 
     }
     
     public static abstract class SimpleProperty extends BaseProperty {
         List values;
-        Map toValueAliases = new HashMap(1);
 
         public SimpleProperty addName(String alias) {
             propertyAliases.add(alias);
@@ -918,62 +965,52 @@ public abstract class UnicodeProperty extends UnicodeLabel {
             }
             return this;
         }
-
+        
         public SimpleProperty setValues(List valueAliases) {
             this.values = new ArrayList(valueAliases);
             for (Iterator it = this.values.iterator(); it.hasNext(); ) {
-                _addToValues(it.next(), null);
+                _addToValues((String)it.next(), null);
             }
             return this;
         }
 
-        public List _getValueAliases(String valueAlias, List result) {
-            if (toValueAliases == null) _fillValues();
-            List a = (List) toValueAliases.get(valueAlias);
-            if (a != null) addAllUnique(a, result);
-            return result;
-        }
-
         public List _getAvailableValues(List result) {
             if (values == null) _fillValues();
             result.addAll(values);
             return result;
         }
 
-        private void _fillValues() {
+
+        protected void _fillValues() {
             List newvalues = (List) getUnicodeMap().getAvailableValues(new ArrayList());
             for (Iterator it = newvalues.iterator(); it.hasNext();) {
-                _addToValues(it.next(), null);
+                _addToValues((String)it.next(), null);
             }
         }
-
-        private void _addToValues(Object item, Object alias) {
+        
+        private void _addToValues(String item, String alias) {
             if (values == null) values = new ArrayList(1);
+            if (toValueAliases == null) _fixValueAliases();
             addUnique(item, values);
-            List aliases = (List) toValueAliases.get(item);
-            if (aliases == null) {
-                aliases = new ArrayList(1);
-                toValueAliases.put(item, aliases);
-            }
-            addUnique(alias, aliases);
-            addUnique(item, aliases);
+            _ensureValueInAliases(item);
+            addValueAlias(item, alias);
         }
-        public String _getVersion() {
+/*        public String _getVersion() {
             return version;
         }
-    }
+*/    }
     
     public static class UnicodeMapProperty extends BaseProperty {
         protected UnicodeMap unicodeMap;
         protected String _getValue(int codepoint) {
             return (String) unicodeMap.getValue(codepoint);
         }
-		protected List _getValueAliases(String valueAlias, List result) {
+/*		protected List _getValueAliases(String valueAlias, List result) {
 			if (!unicodeMap.getAvailableValues().contains(valueAlias)) return result;
 			result.add(valueAlias);
 			return result; // no other aliases
 		}
-		protected List _getAvailableValues(List result) {
+*/		protected List _getAvailableValues(List result) {
 			return (List) unicodeMap.getAvailableValues(result);
 		}
     }
diff --git a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
index b8594df26bf..d9898edf514 100644
--- a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
+++ b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $ 
-* $Date: 2004/02/07 01:01:12 $ 
-* $Revision: 1.12 $
+* $Date: 2005/04/06 08:48:16 $ 
+* $Revision: 1.13 $
 *
 *******************************************************************************
 */
@@ -164,8 +164,8 @@ public class GenOverlap implements UCD_Types, UCA_Types {
     static boolean PROGRESS = false;
       
     static void fullCheck() throws IOException {
-        PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "Overlap.html", Utility.UTF8_WINDOWS);
-        PrintWriter simpleList = Utility.openPrintWriter(UCA_GEN_DIR, "Overlap.txt", Utility.UTF8_WINDOWS);
+        PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.html", Utility.UTF8_WINDOWS);
+        PrintWriter simpleList = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.txt", Utility.UTF8_WINDOWS);
         
         Iterator it = completes.keySet().iterator();
         int counter = 0;
@@ -448,7 +448,7 @@ public class GenOverlap implements UCD_Types, UCA_Types {
         newKeys.removeAll(joint);
         oldKeys.removeAll(joint);
         
-        PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS);
+        PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS);
         Iterator it = list.iterator();
         int last = -1;
         while (it.hasNext()) {
@@ -631,7 +631,7 @@ public class GenOverlap implements UCD_Types, UCA_Types {
         
         System.out.println("Data Gathered");
 
-        PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "checkstringsearchhash.html", Utility.UTF8_WINDOWS);
+        PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "checkstringsearchhash.html", Utility.UTF8_WINDOWS);
         Utility.writeHtmlHeader(log, "Check Hash");
         log.println("<h1>Collisions</h1>");
         log.println("<p>Shows collisions among primary values when hashed to table size = " + tableLength + ".");
@@ -694,7 +694,7 @@ public class GenOverlap implements UCD_Types, UCA_Types {
     }
     
     public static void listCyrillic(UCA collatorIn) throws IOException {
-        PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "ListCyrillic.txt", Utility.UTF8_WINDOWS);
+        PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "ListCyrillic.txt", Utility.UTF8_WINDOWS);
         Set set = new TreeSet(collatorIn);
         Set set2 = new TreeSet(collatorIn);
         ucd = UCD.make();
diff --git a/tools/unicodetools/com/ibm/text/UCA/Implicit.java b/tools/unicodetools/com/ibm/text/UCA/Implicit.java
index 23db9b7b730..9850719a22c 100644
--- a/tools/unicodetools/com/ibm/text/UCA/Implicit.java
+++ b/tools/unicodetools/com/ibm/text/UCA/Implicit.java
@@ -168,7 +168,7 @@ public class Implicit implements UCD_Types {
      */
     public Implicit(int minPrimary, int maxPrimary) {
         // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
-        this(minPrimary, maxPrimary, 0x03, 0xFE, 1, 1);
+        this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
     }
     
     /**
@@ -181,6 +181,14 @@ public class Implicit implements UCD_Types {
      * @param primaries3count number of 3-byte primarys we can use (normally 1)
      */
     public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
+    	if (DEBUG) {
+    		System.out.println("minPrimary: " + Utility.hex(minPrimary));
+	    	System.out.println("maxPrimary: " + Utility.hex(maxPrimary));
+	    	System.out.println("minTrail: " + Utility.hex(minTrail));
+	    	System.out.println("maxTrail: " + Utility.hex(maxTrail));
+	    	System.out.println("gap3: " + Utility.hex(gap3));
+	    	System.out.println("primaries3count: " + primaries3count);
+    	}
         // some simple parameter checks
         if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) throw new IllegalArgumentException("bad lead bytes");
         if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) throw new IllegalArgumentException("bad trail bytes");
diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java
index 779c0e8d40c..f914671af27 100644
--- a/tools/unicodetools/com/ibm/text/UCA/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCA/Main.java
@@ -5,19 +5,24 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ 
-* $Date: 2004/01/15 01:08:30 $ 
-* $Revision: 1.18 $
+* $Date: 2005/04/06 08:48:16 $ 
+* $Revision: 1.19 $
 *
 *******************************************************************************
 */
 
 package com.ibm.text.UCA;
+import java.io.File;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.CanonicalIterator;
+import com.ibm.icu.text.UTF16;
 import com.ibm.text.UCD.*;
 import com.ibm.text.utility.*;
 
 
 public class Main {
-	static final String UCDVersion = "4.0.0";
+	//static final String UCDVersion = "4.0.0";
 	static final String[] ICU_FILES = {"writeCollationValidityLog", "writeFractionalUCA",
 		"WriteRules", "WriteRulesXML", "writeconformance", "writeconformanceshifted", 
 		"short", 
@@ -28,18 +33,10 @@ public class Main {
     };
 	
 	public static void main(String args[]) throws Exception {
-		
+		//checkCanonicalIterator();
 		// NOTE: so far, we don't need to build the UCA with anything but the latest versions.
 		// A few changes would need to be made to the code to do older versions.
         try {
-            System.out.println("Building UCA");
-            Default.setUCD(UCDVersion);
-            WriteCollationData.collator = new UCA(null, UCDVersion);
-            System.out.println("Built version " + WriteCollationData.collator.getDataVersion()
-            	+ "/ucd: " + WriteCollationData.collator.getUCDVersion());
-            
-            System.out.println("Building UCD data");
-            WriteCollationData.ucd = UCD.make(WriteCollationData.collator.getUCDVersion());
             
             if (args.length == 0) args = new String[] {"?"}; // force the help comment
             boolean shortPrint = false;
@@ -54,7 +51,22 @@ public class Main {
                     args = Utility.append(ICU_FILES, Utility.subarray(args, i+1));
                     i = -1;
                     continue;     
-                } 
+                }
+                if (arg.equalsIgnoreCase("version")) {
+                	Default.setUCD(args[++i]); // get next arg
+                	continue;
+                }
+                if (WriteCollationData.collator == null) {
+                    System.out.println("Building UCA");
+                    String file = Utility.searchDirectory(new File(UCD_Types.BASE_DIR + "UCA\\" + Default.ucdVersion() + "\\"), "allkeys", true, ".txt");
+                    WriteCollationData.collator = new UCA(file, Default.ucdVersion());
+                    System.out.println("Built version " + WriteCollationData.collator.getDataVersion()
+                    	+ "/ucd: " + WriteCollationData.collator.getUCDVersion());
+                    
+                    System.out.println("Building UCD data");
+                    WriteCollationData.ucd = UCD.make(WriteCollationData.collator.getUCDVersion());
+
+                }
                 if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(WriteCollationData.collator);
                 else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(WriteCollationData.collator);
                 //else if (arg.equalsIgnoreCase("writeNonspacingDifference")) WriteCollationData.writeNonspacingDifference();
@@ -125,4 +137,37 @@ public class Main {
             */
         }
     }
+
+	/**
+	 * Debugging aid (the call in main is commented out): prints the implicit primary for CJK_BASE and the CanonicalIterator output for each test string, then throws.
+	 */
+	private static void checkCanonicalIterator() {
+		
+		int firstImplicit = WriteCollationData.getImplicitPrimary(UCD_Types.CJK_BASE);
+		System.out.println("UCD_Types.CJK_BASE: " + Utility.hex(UCD_Types.CJK_BASE));
+		System.out.println("first implicit: " + Utility.hex((long)(firstImplicit & 0xFFFFFFFFL)));
+		
+		CanonicalIterator it = new CanonicalIterator("");
+		String[] tests = new String[] {"\uF900"};
+		for (int j = 0; j < tests.length; ++j) {
+			System.out.println(tests[j]);
+			it.setSource(tests[j]);
+			String ss;
+			for (int i = 0; (ss = it.next()) != null; ++i) {
+				System.out.println(i + "\t" + Utility.hex(ss));
+			}
+		}
+		if (true) throw new IllegalArgumentException(); // unconditional: the code point sweep below is never reached
+		for (int i = 0; i < 0x10FFFF; ++i) {
+			int cat = UCharacter.getType(i);
+			if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) continue;
+			String s = UTF16.valueOf(i);
+			try {
+				it.setSource(s);
+			} catch (RuntimeException e) {
+				System.out.println("Failure with U+" + Utility.hex(i));
+				e.printStackTrace();
+			}
+		}
+	}
 }
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java
index c0c3bedc8da..1516c7e7be4 100644
--- a/tools/unicodetools/com/ibm/text/UCA/UCA.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ 
-* $Date: 2004/02/06 18:32:03 $ 
-* $Revision: 1.23 $
+* $Date: 2005/04/06 08:48:16 $ 
+* $Revision: 1.24 $
 *
 *******************************************************************************
 */
@@ -14,6 +14,8 @@
 package com.ibm.text.UCA;
 
 import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.io.BufferedReader;
 import java.io.Reader;
 import java.io.PrintWriter;
@@ -108,13 +110,16 @@ final public class UCA implements Comparator, UCA_Types {
 // Main Methods
 // =============================================================
 
+    private String fileVersion = "??";
+    
     /**
      * Initializes the collation from a stream of rules in the normal formal.
      * If the source is null, uses the normal Unicode data files, which
      * need to be in BASE_DIR.
      */
-    public UCA(BufferedReader source, String unicodeVersion) throws java.io.IOException {
-        fullData = source == null;
+    public UCA(String sourceFile, String unicodeVersion) throws java.io.IOException {
+        fullData = sourceFile == null;
+        fileVersion = sourceFile;
         
         // load the normalizer
         if (toD == null) {
@@ -127,15 +132,19 @@ final public class UCA implements Comparator, UCA_Types {
         ucaData = new UCA_Data(toD, ucd);
         
         // either get the full sources, or just a demo set
-        if (fullData) {
+/*        if (fullData) {      
             for (int i = 0; i < KEYS.length; ++i) {
                 BufferedReader in = new BufferedReader(
                     new FileReader(KEYS[i]), BUFFER_SIZE);
                 addCollationElements(in);
                 in.close();
             }
-        } else {
-            addCollationElements(source);
+        } else */
+        {
+        	BufferedReader in = new BufferedReader(
+                    new FileReader(sourceFile), BUFFER_SIZE);
+            addCollationElements(in);
+            in.close();
         }
         cleanup();
     }
@@ -830,16 +839,17 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
     /**
      * List of files to use for constructing the CE data, used by build()
      */
-    private static final String[] KEYS = {
+
+/*    private static final String[] KEYS = {
         //"D:\\UnicodeData\\testkeys.txt",
-        BASE_DIR + "Collation\\allkeys" + VERSION + ".txt",
-        /*
+        BASE_DIR + "UCA\\allkeys" + VERSION + ".txt",
+        
         BASE_DIR + "UnicodeData\\Collation\\basekeys" + VERSION + ".txt",
         BASE_DIR + "UnicodeData\\Collation\\compkeys" + VERSION + ".txt",
         BASE_DIR + "UnicodeData\\Collation\\ctrckeys" + VERSION + ".txt",
-        */
+        
     };
- 
+*/ 
     /**
      * File buffer size, used to make reads faster.
      */
@@ -1089,6 +1099,13 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
     
     static boolean haveUnspecified = false;
     static UnicodeSet unspecified = new UnicodeSet();
+    UnicodeSet variantSecondaries = new UnicodeSet(0x0153,0x0154);
+    UnicodeSet digitSecondaries = new UnicodeSet(0x155,0x017F);
+    UnicodeSet homelessSecondaries;
+    
+    // static UnicodeSet homelessSecondaries = new UnicodeSet(0x0176, 0x0198);
+    //  0x0153..0x017F
+
         
     public class UCAContents {
         int current = -1;
@@ -1130,9 +1147,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
         
         /**
          * use FIXED_CE as the limit
+         * @param newValue the new value of the doSamples flag
          */
-        public void enableSamples() {
-            doSamples = true;
+        public void setDoEnableSamples(boolean newValue) {
+            doSamples = newValue;
         }
         
         /**
@@ -1179,7 +1197,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
             if (!haveUnspecified) {
                 if (DEBUG) System.out.println("Specified = " + unspecified.toPattern(true));
                 UnicodeSet temp = new UnicodeSet();
-                for (int i = 0; i < 0x10ffff; ++i) {
+                for (int i = 0; i <= 0x10ffff; ++i) {
                     if (!ucd.isAllocated(i)) continue;
                     if (!unspecified.contains(i)) {
                         temp.add(i);
@@ -1265,6 +1283,12 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
             return true;
         }
         
+		/**
+		 * @return the current value of the doSamples flag, as last set through setDoEnableSamples
+		 */
+		public boolean isDoSamples() {
+			return doSamples;
+		}
     }
     
     static final int[][] SAMPLE_RANGES = {
@@ -1312,6 +1336,14 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
         while (true) try {
             inputLine = in.readLine();
             if (inputLine == null) break;       // means file is done
+            
+            // HACK
+            if (inputLine.startsWith("# Variant secondaries:")) {
+            	variantSecondaries = extractSet(inputLine);
+            } else if (inputLine.startsWith("# Digit secondaries:")) {
+            	digitSecondaries = extractSet(inputLine);
+            }
+
             String line = cleanLine(inputLine); // remove comments, extra whitespace
             if (line.length() == 0) continue;   // skip empty lines
             
@@ -1407,7 +1439,18 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
         }
     }
     
-    /*
+    /**
+	 * Parses the inclusive hex range of a "# Variant secondaries:    0177..017B (5)" or
+	 * "# Digit secondaries:      017C..0198 (29)" style line; throws IllegalArgumentException on no match.
+	 */
+	private UnicodeSet extractSet(String inputLine) {
+		Matcher m = Pattern.compile(".*:\\s*([0-9A-Fa-f]+)\\.\\.([0-9A-Fa-f]+).*").matcher(inputLine);
+		if (!m.matches()) throw new IllegalArgumentException("Failed to recognize special Ken lines: " + inputLine);
+		int first = Integer.parseInt(m.group(1), 16), last = Integer.parseInt(m.group(2), 16);
+		return new UnicodeSet(first, last);
+	}
+
+	/*
     private void concat(int[] ces1, int[] ces2) {
         
     }
@@ -1737,4 +1780,25 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
             uniqueTable.put(ceObj, new Character(value));
         }
     }
+/**
+ * @return the sourceFile path that was passed to the constructor
+ */
+public String getFileVersion() {
+	return fileVersion;
+}
+/**
+ * @return the output directory for this data version: BASE_UCA_GEN_DIR, then getDataVersion(), then a trailing backslash
+ */
+public String getUCA_GEN_DIR() {
+	return BASE_UCA_GEN_DIR + getDataVersion() + "\\";
+}
+
+
+	/**
+	 * @return the union of variantSecondaries and digitSecondaries, built on the first call and reused afterwards
+	 */
+	public UnicodeSet getHomelessSecondaries() {
+		if (homelessSecondaries == null) homelessSecondaries = new UnicodeSet(variantSecondaries).addAll(digitSecondaries);
+		return homelessSecondaries;
+	}
 }
diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA_Types.java b/tools/unicodetools/com/ibm/text/UCA/UCA_Types.java
index cfc07810d98..bf700a7ea94 100644
--- a/tools/unicodetools/com/ibm/text/UCA/UCA_Types.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA_Types.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Types.java,v $ 
-* $Date: 2004/01/13 18:32:11 $ 
-* $Revision: 1.6 $
+* $Date: 2005/04/06 08:48:17 $ 
+* $Revision: 1.7 $
 *
 *******************************************************************************
 */
@@ -20,11 +20,11 @@ public interface UCA_Types {
      * Version of the UCA tables to use
      */
     //private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7"; 
-    public static final String UCA_BASE = "4.0.0";  // "3.1.1"; //      ; // ""; // "-2.1.9d7"; 
-    public static final String VERSION = "-" + UCA_BASE; //  + "d6" ""; // "-2.1.9d7"; 
+    //public static final String UCA_BASE = "4.1.0";  // "3.1.1"; //      ; // ""; // "-2.1.9d7"; 
+    //public static final String VERSION = "-" + UCA_BASE; //  + "d6" ""; // "-2.1.9d7"; 
     public static final String ALLFILES = "allkeys"; // null if not there
 
-    public static final String UCA_GEN_DIR = UCD_Types.GEN_DIR + "collation_" + UCA_BASE + "\\";
+    public static final String BASE_UCA_GEN_DIR = UCD_Types.GEN_DIR + "collation" + "\\";
     public static final char LEVEL_SEPARATOR = '\u0000'; 
     /**
      * Expanding characters are marked with a exception bit combination
@@ -94,5 +94,5 @@ public interface UCA_Types {
         CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7,
         FIXED_CE = 3;
         // SURROGATE_CE = 6, 
-   
+
 }
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
index b73744279a4..5f3642241f6 100644
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
-* $Date: 2004/02/12 08:23:19 $
-* $Revision: 1.20 $
+* $Date: 2005/04/06 08:48:17 $
+* $Revision: 1.21 $
 *
 *******************************************************************************
 */
@@ -29,6 +29,7 @@ import java.text.SimpleDateFormat;
 
 public class WriteCharts implements UCD_Types {
 
+	static String WORKING_DIR = ".\\com\\ibm\\text\\UCA\\";
     static boolean HACK_KANA = false;
 
     static public void special() {
@@ -50,7 +51,7 @@ public class WriteCharts implements UCD_Types {
         //Normalizer nfc = new Normalizer(Normalizer.NFC);
 
         UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps
-        cc.enableSamples();
+        cc.setDoEnableSamples(true);
 
         Set set = new TreeSet();
 
@@ -84,12 +85,12 @@ public class WriteCharts implements UCD_Types {
         String[] replacement = new String[] {"%%%", "Collation Charts"};
         String folder = "charts\\uca\\";
 
-        Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
-        Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
-        Utility.copyTextFile("help.html", Utility.UTF8, folder + "help.html");
+        Utility.copyTextFile(WORKING_DIR + "index.html", Utility.UTF8, folder + "index.html", replacement);
+        Utility.copyTextFile(WORKING_DIR + "charts.css", Utility.LATIN1, folder + "charts.css");
+        Utility.copyTextFile(WORKING_DIR + "help.html", Utility.UTF8, folder + "help.html");
 
         indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
-        Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement);
+        Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
 
         /*
         indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
@@ -100,6 +101,7 @@ public class WriteCharts implements UCD_Types {
         indexFile.println("<p align='center'><a href = 'help.html'>Help</a>");
         */
 
+        int lastCp = -1;
         while (it.hasNext()) {
             Utility.dot(counter);
 
@@ -110,6 +112,7 @@ public class WriteCharts implements UCD_Types {
             int cp = UTF16.charAt(s,0);
 
             byte script = Default.ucd().getScript(cp);
+            if (cp == 0x1DBF) script = UCD.GREEK_SCRIPT; // 4.1.0 hack
 
             // get first non-zero primary
             int currentPrimary = getFirstPrimary(sortKey);
@@ -128,6 +131,7 @@ public class WriteCharts implements UCD_Types {
             if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT;
             else if ((script == INHERITED_SCRIPT || script == COMMON_SCRIPT) && oldScript >= 0) script = oldScript;
 
+            int veryOldScript = oldScript;
             if (script != oldScript
                     // && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT)
                     ) {
@@ -140,7 +144,9 @@ public class WriteCharts implements UCD_Types {
                 ++scriptCount[script+3];
                 if (scriptCount[script+3] > 1) {
                     System.out.println("\t\tFAIL: " + scriptCount[script+3] + ", " +
-                        getChunkName(script, LONG) + ", " + Default.ucd().getCodeAndName(s));
+                        getChunkName(script, LONG) + ", " + Default.ucd().getCodeAndName(s)
+						+ " - last char: " 
+						+ getChunkName(veryOldScript, LONG) + ", " + Default.ucd().getCodeAndName(lastCp));
                 }
                 output = openFile(scriptCount[script+3], folder, script);
             }
@@ -179,6 +185,7 @@ public class WriteCharts implements UCD_Types {
 
             output.println(breaker + outline);
             ++columnCount;
+            lastCp = cp;
         }
 
         closeFile(output);
@@ -265,7 +272,7 @@ public class WriteCharts implements UCD_Types {
         Utility.copyTextFile("norm_help.html", Utility.UTF8, folder + "help.html");
 
         indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
-        Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement);
+        Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
 
         /*
         indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
@@ -373,7 +380,7 @@ public class WriteCharts implements UCD_Types {
         Utility.copyTextFile("case_help.html", Utility.UTF8, folder + "help.html");
 
         indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
-        Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement);
+        Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
 
         /*
         indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
@@ -485,7 +492,7 @@ public class WriteCharts implements UCD_Types {
 			Utility.copyTextFile("script_help.html", Utility.UTF8, folder + "help.html");
 
 			indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
-			Utility.appendFile("script_index_header.html", Utility.UTF8, indexFile, replacement);
+			Utility.appendFile(WORKING_DIR + "script_index_header.html", Utility.UTF8, indexFile, replacement);
 
 			/*
 			indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
@@ -607,7 +614,7 @@ public class WriteCharts implements UCD_Types {
         Utility.copyTextFile("name_help.html", Utility.UTF8, folder + "help.html");
 
         indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
-        Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement);
+        Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
 
         int columnCount = 0;
         char lastInitial = 0;
diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
index 1c25f6469c3..5eb49d77a44 100644
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ 
-* $Date: 2004/02/07 01:01:11 $ 
-* $Revision: 1.39 $
+* $Date: 2005/04/06 08:48:17 $ 
+* $Revision: 1.40 $
 *
 *******************************************************************************
 */
@@ -17,6 +17,9 @@ import java.util.*;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.CanonicalIterator;
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.UnicodeProperty;
+import com.ibm.icu.dev.test.util.UnicodePropertySource;
 import com.ibm.icu.impl.UCharacterProperty;
 
 import java.io.*;
@@ -36,6 +39,8 @@ import com.ibm.text.UCD.Normalizer;
 
 public class WriteCollationData implements UCD_Types, UCA_Types {
 	
+	// may require fixing 
+
 	static final boolean DEBUG = false;
 	static final boolean DEBUG_SHOW_ITERATION = false;
 	
@@ -145,7 +150,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
         BufferedReader in = Utility.openUnicodeFile("CaseFolding", UNICODE_VERSION, true, Utility.LATIN1);
         // new BufferedReader(new FileReader(DIR31 + "CaseFolding-3.d3.alpha.txt"), 64*1024);
         // log = new PrintWriter(new FileOutputStream("CaseFolding_data.js"));
-        log = Utility.openPrintWriter(UCA_GEN_DIR, "CaseFolding_data.js", Utility.UTF8_WINDOWS);
+        log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "CaseFolding_data.js", Utility.UTF8_WINDOWS);
         log.println("var CF = new Object();");
         int count = 0;
         while (true) {
@@ -190,7 +195,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
         //Normalizer normKD = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
         //Normalizer normD = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
         //log = new PrintWriter(new FileOutputStream("Normalization_data.js"));
-        log = Utility.openPrintWriter(UCA_GEN_DIR, "Normalization_data.js", Utility.LATIN1_WINDOWS);
+        log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Normalization_data.js", Utility.LATIN1_WINDOWS);
         
         
         int count = 0;
@@ -319,7 +324,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
         }
         
         String fullFileName = filename + (shortPrint ? "_SHORT" : "") + ".txt";
-        PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS);
+        PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), fullFileName, Utility.UTF8_WINDOWS);
         //if (!shortPrint) log.write('\uFEFF');
         writeVersionAndDate(log, fullFileName);
         
@@ -327,7 +332,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
         int counter = 0;
         
         UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
-        cc.enableSamples();
+        cc.setDoEnableSamples(true);
         UnicodeSet found2 = new UnicodeSet();
         
         while (true) {
@@ -711,7 +716,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
     
     static void testCompatibilityCharacters() throws IOException {
         String fullFileName = "UCA_CompatComparison.txt";
-        log = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS);
+        log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), fullFileName, Utility.UTF8_WINDOWS);
         
         int[] kenCes = new int[50];
         int[] markCes = new int[50];
@@ -1191,7 +1196,13 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
         while (it.hasNext()) {
             if (result.length() != 0) result.append(";<br>");
             Object item = it.next();
-            if (m != null) item = m.get(item);
+            if (m != null) {
+            	Object item2 = m.get(item);
+            	if (item2 != null) item = item2;
+            	else {
+            		System.out.println("Missing Item: " + item);
+            	}
+            }
             if (useName) item = ucd.getCodeAndName(item.toString());
             result.append(item);
         }
@@ -1207,7 +1218,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
                 32*1024));
                 */
         String fullFileName = "UCA_Contractions.txt";
-        PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS);
+        PrintWriter diLog = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), fullFileName, Utility.UTF8_WINDOWS);
                 
         diLog.write('\uFEFF');
 
@@ -1246,7 +1257,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
                     "UTF8"),
                 32*1024));
                 */
-        PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, "DisjointIgnorables.js", Utility.UTF8_WINDOWS);
+        PrintWriter diLog = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "DisjointIgnorables.js", Utility.UTF8_WINDOWS);
                 
         diLog.write('\uFEFF');
 
@@ -1425,7 +1436,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
                     "UTF8"),
                 32*1024));
                 */
-        PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, "DisjointIgnorables2.js", Utility.UTF8_WINDOWS);
+        PrintWriter diLog = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "DisjointIgnorables2.js", Utility.UTF8_WINDOWS);
                 
         diLog.write('\uFEFF');
 
@@ -1637,7 +1648,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
     	if (UCA.isImplicitLeadCE(ces[0])) {
     		expansionStart = 2; // move up if first is double-ce
     	} 
-    	if (len > expansionStart && homelessSecondaries.contains(UCA.getSecondary(ces[expansionStart]))) {
+    	if (len > expansionStart && collator.getHomelessSecondaries().contains(UCA.getSecondary(ces[expansionStart]))) {
             if (log2 != null) log2.println("Homeless: " + CEList.toString(ces, len));
     		++expansionStart; // move up if *second* is homeless ignoreable
     	}
@@ -1674,7 +1685,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
         int[] lenArray = new int[1];
         
         Set alreadyDone = new HashSet();
-        log2 = Utility.openPrintWriter(UCA_GEN_DIR, "UCARules-log.txt", Utility.UTF8_WINDOWS);
+        log2 = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "UCARules-log.txt", Utility.UTF8_WINDOWS);
 
         while (true) {
             String s = cc.next(ces, lenArray);
@@ -1799,7 +1810,7 @@ F900..FAFF; CJK Compatibility Ideographs
         if (noCE) filename += "_NoCE";
         if (option == IN_XML) filename += ".xml"; else filename += ".txt";
         
-        log = Utility.openPrintWriter(UCA_GEN_DIR, filename, Utility.UTF8_WINDOWS);
+        log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), filename, Utility.UTF8_WINDOWS);
         
         String[] commentText = {
         	"UCA Rules",
@@ -2316,8 +2327,6 @@ F900..FAFF; CJK Compatibility Ideographs
     }
     
     
-    static UnicodeSet homelessSecondaries = new UnicodeSet(0x0153,0x017F);
-    
     /*static int[] ignorableList = new int[homelessSecondaries.size()];
     
     static {
@@ -2396,7 +2405,7 @@ F900..FAFF; CJK Compatibility Ideographs
             }
             if (s == null) {
             	do {
-            		if (homelessSecondaries.contains(UCA.getSecondary(ces[i]))) {
+            		if (collator.getHomelessSecondaries().contains(UCA.getSecondary(ces[i]))) {
             			s = "";
             			if (rel[0] > 1) rel[0] = 1; // HACK
             			break;
@@ -2846,11 +2855,11 @@ F900..FAFF; CJK Compatibility Ideographs
         
         Utility.fixDot();
         System.out.println("Writing");
-        PrintWriter shortLog = new PrintWriter(new BufferedWriter(new FileWriter(UCA_GEN_DIR + filename + "_SHORT.txt"), 32*1024));
-        PrintWriter longLog = new PrintWriter(new BufferedWriter(new FileWriter(UCA_GEN_DIR + filename + ".txt"), 32*1024));
+        PrintWriter shortLog = new PrintWriter(new BufferedWriter(new FileWriter(collator.getUCA_GEN_DIR() + filename + "_SHORT.txt"), 32*1024));
+        PrintWriter longLog = new PrintWriter(new BufferedWriter(new FileWriter(collator.getUCA_GEN_DIR() + filename + ".txt"), 32*1024));
         log = new PrintWriter(new DualWriter(shortLog, longLog));
         
-        PrintWriter summary = new PrintWriter(new BufferedWriter(new FileWriter(UCA_GEN_DIR + filename + "_summary.txt"), 32*1024));
+        PrintWriter summary = new PrintWriter(new BufferedWriter(new FileWriter(collator.getUCA_GEN_DIR() + filename + "_summary.txt"), 32*1024));
         //log.println("[Variable Low = " + UCA.toString(collator.getVariableLow()) + "]");
         //log.println("[Variable High = " + UCA.toString(collator.getVariableHigh()) + "]");
         
@@ -3976,7 +3985,7 @@ F900..FAFF; CJK Compatibility Ideographs
     static void writeCollationValidityLog() throws IOException {
     	
         //log = new PrintWriter(new FileOutputStream("CheckCollationValidity.html"));
-        log = Utility.openPrintWriter(UCA_GEN_DIR, "CheckCollationValidity.html", Utility.UTF8_WINDOWS);
+        log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "CheckCollationValidity.html", Utility.UTF8_WINDOWS);
         
         log.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
         log.println("<title>UCA Validity Log</title>");
@@ -4002,15 +4011,18 @@ F900..FAFF; CJK Compatibility Ideographs
         */
 
         UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
-        cc.enableSamples();
+        //cc.setDoEnableSamples(true);
+        UnicodeSet coverage = new UnicodeSet();
         
         while (true) {
             String s = cc.next();
             if (s == null) break;
             addString(s, option);
+            coverage.add(s);
         }
-                
+
         System.out.println("Total: " + sortedD.size());
+
         Iterator it;
         
         //ucd.init();
@@ -4051,7 +4063,10 @@ F900..FAFF; CJK Compatibility Ideographs
         
         log.println("<h1>Collation Validity Checks</h1>");
         log.println("<table><tr><td>Generated: </td><td>" + getNormalDate() + "</td></tr>");
-        log.println("<tr><td>File Version: </td><td>" + collator.getDataVersion() + "/" + collator.getUCDVersion() + "</td></tr></table>");
+        log.println("<tr><td>Unicode  Version: </td><td>" + collator.getUCDVersion());
+        log.println("<tr><td>UCA Data Version (@version in file): </td><td>" + collator.getDataVersion());
+        log.println("<tr><td>UCA File Name: </td><td>" + collator.getFileVersion());
+        log.println("</td></tr></table>");
         
         if (collator.getDataVersion() == UCA.BADVERSION) {
             log.println(SERIOUS_ERROR);
@@ -4076,6 +4091,24 @@ F900..FAFF; CJK Compatibility Ideographs
         addClosure();
         writeDuplicates();
         writeOverlap();
+        
+        log.println("<h2>Coverage</h2>");
+        BagFormatter bf = new BagFormatter();
+        bf.setLineSeparator("<br>\r\n");
+        ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+        bf.setUnicodePropertyFactory(ups);
+        bf.setShowLiteral(bf.toHTML);
+        bf.setFixName(bf.toHTML);
+        UCD ucd = Default.ucd();
+        UnicodeProperty cat = ups.getProperty("gc");
+        UnicodeSet ucd410 = cat.getSet("Cn")
+		.addAll(cat.getSet("Co"))
+		.addAll(cat.getSet("Cs"))
+		.complement()
+		//.addAll(ups.getSet("Noncharactercodepoint=true"))
+		//.addAll(ups.getSet("Default_Ignorable_Code_Point=true"))
+		;
+        bf.showSetDifferences(log, "UCD4.1.0", ucd410, "UCA4.1.0", coverage, 3);
 
         log.println("</body></html>");
         log.close();
@@ -4670,7 +4703,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
 
     static PrintWriter writeHead(int counter, int end, String title, String other, String version, boolean show) throws IOException {
 
-        PrintWriter out = Utility.openPrintWriter(UCA_GEN_DIR, title + pad(counter) + ".html", Utility.UTF8_WINDOWS);
+        PrintWriter out = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), title + pad(counter) + ".html", Utility.UTF8_WINDOWS);
         
         copyFile(out, "HTML-Part1.txt");
         /*
diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java
index 73400fb19af..35a780541bb 100644
--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2005/03/30 17:19:32 $
-* $Revision: 1.20 $
+* $Date: 2005/04/06 08:48:17 $
+* $Revision: 1.21 $
 *
 *******************************************************************************
 */
@@ -21,6 +21,7 @@ import java.text.SimpleDateFormat;
 import com.ibm.icu.dev.test.util.BagFormatter;
 import com.ibm.icu.dev.test.util.ICUPropertyFactory;
 import com.ibm.icu.dev.test.util.UnicodeLabel;
+import com.ibm.icu.dev.test.util.UnicodeMap;
 import com.ibm.icu.dev.test.util.UnicodeProperty;
 import com.ibm.icu.impl.ICUData;
 import com.ibm.icu.impl.ICUResourceBundle;
@@ -153,17 +154,23 @@ public class TestData implements UCD_Types {
 	static class GenStringPrep {
 		UnicodeSet[] coreChars = new UnicodeSet[100];
 		UnicodeSet decomposable = new UnicodeSet();
+		UnicodeMap suspect = new UnicodeMap();
 		
 		ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
 		//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
-		UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher())
-		.retainAll(ups.getSet("gc=Sk"))
-		.addAll(new UnicodeSet("[\u0027 \u002D \u002E \u003A \u00B7 \u058A \u05F3" +
-		" \u05F4 \u200C \u200D \u2010 \u2019 \u2027 \u30A0]"));
+		UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
+		UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher());
+		{
+			wordChars.retainAll(ups.getSet("gc=Sk"));
+			wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
+			" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0]"));
+			//wordChars.removeAll(xid_continue);
+		}
 		
 		UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
+		UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
 		
-		UnicodeSet not_xid_continue = ups.getSet("XID_Continue=true").complement().removeAll(wordChars);
+		UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
 		
 		//UnicodeSet[] decompChars = new UnicodeSet[100];
 		UCD ucd = Default.ucd();
@@ -180,7 +187,8 @@ public class TestData implements UCD_Types {
 				"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
 
 		UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
-		UnicodeSet hasUpper = new UnicodeSet();
+		UnicodeSet hasNoUpper = new UnicodeSet();
+		UnicodeSet hasNoUpperMinus = new UnicodeSet();
 		BagFormatter bf = new BagFormatter();
 		UnicodeSet inIDN = new UnicodeSet();
 
@@ -200,16 +208,16 @@ public class TestData implements UCD_Types {
 				if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
 				int idnaType = getIDNAType(cp);
 				idnaTypeSet[idnaType].add(cp);
+				String str = UTF16.valueOf(cp);
+				if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
 				int script = ucd.getScript(cp);
 				if (coreChars[script] == null)
 					coreChars[script] = new UnicodeSet();
 				coreChars[script].add(cp);
 			}
-			// find characters with no uppercase
-			for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) {
-				String str = UTF16.valueOf(it.codepoint);
-				if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(it.codepoint);
-			}
+			// fix characters with no uppercase
+			hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
+			System.out.println(bf.showSetNames(hasNoUpper));
 			
 			Utility.fixDot();
 			PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
@@ -245,6 +253,23 @@ public class TestData implements UCD_Types {
 			showCodes(htmlOut, textOut, INHERITED_SCRIPT);
 			htmlOut.println("</table></body></html>");
 			htmlOut.close();
+			bf.setMergeRanges(false);
+
+			textOut.println();
+			textOut.println("# *** WORD CHARACTERS ADDED ***");
+			bf.setValueSource("word-chars");
+			bf.showSetNames(textOut, wordChars);
+			
+			textOut.println();
+			textOut.println("# *** FOR REVIEW (collected from above) ***");
+			bf.setLabelSource(UnicodeLabel.NULL);
+			for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
+				textOut.println();
+				String value = (String)it.next();
+				bf.setValueSource(value);
+				bf.showSetNames(textOut, suspect.getSet(value));
+			}
+			textOut.close();
 		}
 		
 		UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
@@ -302,25 +327,38 @@ public class TestData implements UCD_Types {
 			UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
 			UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
 			
+			UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
+			UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
+			
 			UnicodeSet decomp = extract(decomposable, core);
 			UnicodeSet pattern = extract(patternProp, core);
 			UnicodeSet non_id = extract(not_xid_continue, core);
 			
-			UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
-			core.removeAll(otherCore);
-			if (core.size() == 0) {
-				UnicodeSet temp = core;
-				core = otherCore;
-				otherCore = temp;
+			UnicodeSet bicameralNoupper = new UnicodeSet();
+			if (!hasNoUpper.containsAll(core)) {
+				bicameralNoupper = extract(hasNoUpperMinus, core);
+			}
+
+			UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
+			for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
+				String cat = Default.ucd().getCategoryID(it.codepoint);
+				String name = Default.ucd().getName(it.codepoint);
+				if (name.indexOf("MUSICAL SYMBOL") >= 0 
+						|| name.indexOf("DINGBA") >= 0 
+						|| name.indexOf("RADICAL ") >= 0 
+						 						) cat = "XX";
+				suspect.put(it.codepoint, cat);
 			}
 			
 			if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
-			if (otherCore.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", otherCore, scriptCode);
+			if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode);
 			if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
 			if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
 			if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode);
 
-			if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped", remapped, scriptCode);
+			if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode);
+			if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Decomposable", remappedIsNFKCDecomp, scriptCode);
+			if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode);
 			if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
 			if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode);
 		}
diff --git a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
index b989b5d28b1..5ae33e840a4 100644
--- a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
+++ b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
@@ -14,7 +14,9 @@
 .Non-XID       { background-color: #FFCCCC }
 .Decomposable       { background-color: #FFFFCC }
 .Pattern_Syntax       { background-color: #FFCCFF }
-.IDN-Remapped       { background-color: #FF6666 }
+.IDN-Remapped-Case-Atomic       { background-color: #CCFFFF }
+.IDN-Remapped-Case-Decomposable       { background-color: #66FFFF }
+.IDN-Remapped-Compat       { background-color: #FF6666 }
 .IDN-Deleted       { background-color: #66FF66 }
 .IDN-Illegal       { background-color: #6666FF }
 th           { text-align: left }
@@ -25,7 +27,7 @@ th           { text-align: left }
 <body style="margin: 2em">
 
 <h1>IDN Character Categorization</h1>
-<p><i>$Date: 2005/03/30 17:19:32 $, MED</i></p>
+<p><i>$Date: 2005/04/06 08:48:17 $, MED</i></p>
 <p>This page lists all of the valid output IDN characters broken down by category. By &quot;output&quot; IDN 
 characters, we mean ones that can result from nameprep. Characters are grouped first by script, and 
 then by subcategory. Within each subcategory characters are sorted according to the default
@@ -69,8 +71,17 @@ and name (in enabled browsers).</p>
       <td>Characters with NFC decompositions.</td>
     </tr>
     <tr>
-      <td class="IDN-Remapped">IDN-Remapped</td>
-      <td>Characters remapped by IDN.</td>
+      <td class="IDN-Remapped-Case-Atomic">IDN-Remapped-Case-Atomic</td>
+      <td>Characters remapped by IDN due to case folding.</td>
+    </tr>
+    <tr>
+      <td class="IDN-Remapped-Case-Decomposable">IDN-Remapped-Case-Decomposable</td>
+      <td>Characters remapped by IDN due to case folding, that are decomposable.</td>
+    </tr>
+
+    <tr>
+      <td class="IDN-Remapped-Compat">IDN-Remapped-Compat</td>
+      <td>Characters remapped by IDN due to compatibility mapping.</td>
     </tr>
     <tr>
       <td class="IDN-Deleted">IDN-Deleted</td>