finally did some significant code cleanup on collation. not enough, but it's a start

X-SVN-Rev: 8896
2025-04-14 01:11:02 +00:00 · 2002-06-15 02:47:14 +00:00 · 2002-06-15 02:47:14 +00:00 · 3940ed8c00
commit 3940ed8c00
parent 05d2989deb
11 changed files with 522 additions and 627 deletions
--- a/tools/unicodetools/com/ibm/text/UCA/CEList.java
+++ b/tools/unicodetools/com/ibm/text/UCA/CEList.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $ 
-* $Date: 2002/05/31 01:41:03 $ 
-* $Revision: 1.4 $
+* $Date: 2002/06/15 02:47:12 $ 
+* $Revision: 1.5 $
 *
 *******************************************************************************
 */
@ -165,6 +165,15 @@ public final class CEList implements java.lang.Comparable, UCD_Types {
        return result.toString();
    }
    
+    public static String toString(IntStack ces) { // formats every CE held in the stack, separated by single spaces
+        StringBuffer result = new StringBuffer();
+        for (int i = 0; i < ces.length(); ++i) {
+            if (i != 0) result.append(' '); // separator goes between entries only, never before the first
+            result.append(toString(ces.get(i))); // per-CE text comes from toString(int)
+        }
+        return result.toString();
+    }
+    
    public static String toString(int ce) {
        return "[" + Utility.hex(UCA.getPrimary(ce)) + "." 
          + Utility.hex(UCA.getSecondary(ce)) + "."
--- a/tools/unicodetools/com/ibm/text/UCA/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCA/Main.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ 
-* $Date: 2002/06/04 01:59:01 $ 
-* $Revision: 1.5 $
+* $Date: 2002/06/15 02:47:12 $ 
+* $Revision: 1.6 $
 *
 *******************************************************************************
 */
@ -18,7 +18,8 @@ import com.ibm.text.utility.*;

 public class Main {
 	static final String UCDVersion = "";
-	static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA", "writeconformance", "writeconformanceshifted", 
+	static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA",
+	    "writeconformance", "writeconformanceshifted", 
 		"WriteRules", "WriteRulesWithNames", "WriteRulesXML"};
 	
 	public static void main(String args[]) throws Exception {
--- a/tools/unicodetools/com/ibm/text/UCA/UCA.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ 
-* $Date: 2002/06/04 01:58:56 $ 
-* $Revision: 1.13 $
+* $Date: 2002/06/15 02:47:12 $ 
+* $Revision: 1.14 $
 *
 *******************************************************************************
 */
@ -24,6 +24,7 @@ import com.ibm.text.UCD.Normalizer;
 import com.ibm.text.UCD.UCD;
 import com.ibm.text.utility.*;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;

 //import com.ibm.text.CollationData.*;

@ -62,7 +63,7 @@ This is because of shared
 characters between scripts with different directions, like French with Arabic or Greek.
 */

-final public class UCA implements Comparator {
+final public class UCA implements Comparator, UCA_Types {
    public static final String copyright = 
      "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
      
@ -85,19 +86,13 @@ final public class UCA implements Comparator {
    // base directory will change depending on the installation
    public static final String BASE_DIR = "c:\\DATA\\";
    
-    /** Enum for alternate handling */
-    public static final byte SHIFTED = 0, ZEROED = 1, NON_IGNORABLE = 2, SHIFTED_TRIMMED = 3, LAST = 3;
-    
-    /**
-     * Used to terminate a list of CEs
-     */
-    public static final int TERMINATOR = 0xFFFFFFFF;   // CE that marks end of string
-         
    
 // =============================================================
 // Test Settings
 // =============================================================
    static final boolean DEBUG = false;
+    static final boolean DEBUG_SHOW_LINE = false;
+    
    static final boolean SHOW_STATS = true;
    
    static final boolean SHOW_CE = false;
@ -109,6 +104,7 @@ final public class UCA implements Comparator {
    static final boolean RECORDING_CHARS = true;
    
    private UCD ucd;
+    private UCA_Data ucaData;
    
 // =============================================================
 // Main Methods
@ -121,11 +117,7 @@ final public class UCA implements Comparator {
     */
    public UCA(BufferedReader source, String unicodeVersion) throws java.io.IOException {
        fullData = source == null;
-
-        // clear some tables
-        for (int i = 0; i < collationElements.length; ++i) {
-            collationElements[i] = UNSUPPORTED;
-        }
+        
        // load the normalizer
        if (toD == null) {
            toD = new Normalizer(Normalizer.NFD, unicodeVersion);
@ -134,6 +126,8 @@ final public class UCA implements Comparator {
        ucd = UCD.make(unicodeVersion);
        ucdVersion = ucd.getVersion();
        
+        ucaData = new UCA_Data(toD, ucd);
+        
        // either get the full sources, or just a demo set
        if (fullData) {
            for (int i = 0; i < KEYS.length; ++i) {
@ -234,7 +228,7 @@ final public class UCA implements Comparator {
            }
            if (SHOW_CE) {
                if (debugList.length() != 0) debugList.append("/");
-                debugList.append(ceToString(ce));
+                debugList.append(CEList.toString(ce));
            }
            
            // add weights
@ -412,6 +406,35 @@ final public class UCA implements Comparator {
        return target;
    }
    
+    /**
+     * Appends the collation elements (CEs) of a whole string to a stack.
+     * @param sourceString string to get the collation elements for.
+     * @param decomposition true for UCA (the string is first normalized with toD),
+     * false where the text is guaranteed to be
+     * normalization form C with no combining marks of class 0.
+     * @param output stack each non-zero CE is pushed onto, in order. It is not cleared first, and TERMINATOR is not pushed.
+     */
+    public void getCEs(String sourceString, boolean decomposition, IntStack output) {
+        decompositionBuffer.setLength(0);
+        if (decomposition) {
+            toD.normalize(sourceString, decompositionBuffer);
+        } else {
+            decompositionBuffer.append(sourceString);
+        }
+        rearrangeBuffer = EMPTY;            // clear the rearrange buffer (thai)
+        index = 0;
+
+        // collect CEs one at a time until getCE() reports TERMINATOR
+        while (true) {
+            //fixQuaternatiesPosition = quaternaries.length();
+            int ce = getCE();
+            if (ce == 0) continue;          // zero CEs are skipped, not stored
+            if (ce == TERMINATOR) break;    // stop here; the terminator itself is not stored
+            output.push(ce);
+        }
+    }
+    
+    
    /**
     * Returns a list of CEs for a unicode character at a position.
     * @param sourceString string to make a sort key for.
@ -477,14 +500,6 @@ final public class UCA implements Comparator {
        return strength == 1 ? primarySet : strength == 2 ? secondarySet : tertiarySet;
    }
     
-    /**
-     * CE Type
-     */
-    static final byte NORMAL_CE = 0, CONTRACTING_CE = 1, EXPANDING_CE = 2, 
-        CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7,
-        FIXED_CE = 3;
-        // SURROGATE_CE = 6, 
-   
    /**
     * Returns the char associated with a FIXED value
     */
@ -497,28 +512,7 @@ final public class UCA implements Comparator {
     * Return the type of the CE
     */
    public byte getCEType(int ch) {
-        
-        if (ch > 0xFFFF) ch = UTF16.getLeadSurrogate(ch); // first if expands
-        
-        int ce = collationElements[ch];
-        if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return NORMAL_CE;
-        if (ce == UNSUPPORTED) {
-            
-            // Special check for Han, Hangul
-            if (isHangul(ch)) return HANGUL_CE;
-            
-            if (isCJK(ch)) return CJK_CE;
-            if (isCJK_AB(ch)) return CJK_AB_CE;
-                        
-            // special check for unsupported surrogate pair, 20 1/8 bits
-            //if (0xD800 <= ch && ch <= 0xDFFF) {
-            //    return SURROGATE_CE;
-            //}
-            return UNSUPPORTED_CE;
-        }
-            
-        if (ce == CONTRACTING) return CONTRACTING_CE;
-        return EXPANDING_CE;
+        return ucaData.getCEType(ch);
    }

    /**
@ -604,19 +598,11 @@ final public class UCA implements Comparator {
        return result.toString();
    }
    
-    /**
-     * Produces a human-readable string for a collation element
-     */
-    static public String ceToString(int ce) {
-        return "[" + Utility.hex(getPrimary(ce)) + "." 
-          + Utility.hex(getSecondary(ce)) + "."
-          + Utility.hex(getTertiary(ce)) + "]";
-    }
-    
    /**
     * Produces a human-readable string for a collation element.
     * value is terminated by -1!
     */
+     /*
    static public String ceToString(int[] ces, int len) {
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < len; ++i) {
@ -624,11 +610,13 @@ final public class UCA implements Comparator {
        }
        return result.toString();
    }
+    */
    
    /**
     * Produces a human-readable string for a collation element.
     * value is terminated by -1!
     */
+     /*
    static public String ceToString(int[] ces) {
        StringBuffer result = new StringBuffer();
        for (int i = 0; ; ++i) {
@ -637,7 +625,7 @@ final public class UCA implements Comparator {
        }
        return result.toString();
    }
-    
+    */
    
    static boolean isImplicitLeadCE(int ce) {
    	return isImplicitLeadPrimary(getPrimary(ce));
@ -670,10 +658,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
     * and to get the second part use (x & 0xFFFF)
     */
    
-    static void CodepointToImplicit(int cp, int[] output) {
+    void CodepointToImplicit(int cp, int[] output) {
 		int base = UNSUPPORTED_OTHER_BASE;
-        if (isCJK(cp)) base = UNSUPPORTED_CJK_BASE;
-        else if (isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE;
+        if (ucd.isCJK_BASE(cp)) base = UNSUPPORTED_CJK_BASE;
+        else if (ucd.isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE;
        output[0] = base + (cp >>> 15);
        output[1] = (cp & 0x7FFF) | 0x8000;
    }
@ -768,6 +756,9 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
 // Privates
 // =============================================================
    
+    
+    IntStack expandingStack = new IntStack(10);
+    
    /**
     * Array used to reorder surrogates to top of 16-bit range, and others down.
     * Adds 2000 to D800..DFFF, making them F800..FFFF
@ -847,77 +838,13 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
 // Collation Element Memory Data Table Formats
 // =============================================================

-    /**
-     * Used to composed Hangul and Han characters
-     */
-     
-    static final int NEUTRAL_SECONDARY = 0x20;
-    static final int NEUTRAL_TERTIARY = 0x02;
-    
    /**
     * Temporary buffer used in getSortKey for the decomposed string
     */
    private StringBuffer decompositionBuffer = new StringBuffer();
    
-    /**
-     * The collation element data is stored a couple of different structures.
-     * First is collationElements, which generally contains the 32-bit CE corresponding
-     * to the data. It is directly indexed by character code.<br>
-     * For brevity in the implementation, we just use a flat array.
-     * A real implementation would use a multi-stage table, as described in TUS Section 5.
-     * table of simple collation elements, indexed by char.<br>
-     * Exceptional cases: expanding, contracting, unsupported are handled as described below.
-     */
-    private int[] collationElements = new int[65536];
-    
-    /**
-     * A special bit combination in a CE is used to reserve exception cases. This has the effect
-     * of removing a small number of the primary key values out of the 65536 possible.
-     */
-    private static final int EXCEPTION_CE_MASK = 0xF8000000;
-    
-       
-    /**
-     * Any unsupported characters (those not in the UCA data tables) 
-     * are marked with a exception bit combination
-     * so that they can be treated specially.<br>
-     * There are at least 34 values, so that we can use a range for surrogates
-     * However, we do add to the first weight if we have surrogate pairs!
-     */
-    private static final int UNSUPPORTED_CJK_BASE = 0xFF40;
-    private static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80;
-    private static final int UNSUPPORTED_OTHER_BASE = 0xFFC0;
-    
-    private static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
-    private static final int UNSUPPORTED_LIMIT = UNSUPPORTED_OTHER_BASE + 0x40;
-    
-    private static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
-    
    // was 0xFFC20101;
    
-    /**
-     * Contracting characters are marked with a exception bit combination 
-     * in the collationElement table.
-     * This means that they are the first character of a contraction, and need
-     * to be looked up (with following characters) in the contractingTable.<br>
-     * This isn't a MASK since there is exactly one value.
-     */
-    private static final int CONTRACTING = 0xFA310000;
-
-    /**
-     * Expanding characters are marked with a exception bit combination
-     * in the collationElement table.
-     * This means that they map to more than one CE, which is looked up in
-     * the expansionTable by index. See EXCEPTION_INDEX_MASK
-     */
-    private static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start
-    
-    /**
-     * This mask is used to get the index from an EXPANDING exception.
-     * The contracting characters can also make use of this in a future optimization.
-     */
-    static final int EXCEPTION_INDEX_MASK = 0x0000FFFF;
- 
    /**
     * We take advantage of the variables being in a closed range to save a bit per CE.
     * The low and high values are initially set to be at the opposite ends of the range,
@ -931,27 +858,18 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
    private int variableLowCE;  // used for testing against
    private int variableHighCE; // used for testing against
    
-    /**
-     * Although a single character can expand into multiple CEs, we don't want to burden
-     * the normal case with the storage. So, they get a special value in the collationElements
-     * array. This value has a distinct primary weight, followed by an index into a separate
-     * table called expandingTable. All of the CEs in that table, up to a TERMINATOR value
-     * will be used for the expansion. The implementation is as a stack; this just makes it
-     * easy to generate.
-     */
-    private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
-        
-    /**
-     * For now, this is just a simple mapping of strings to collation elements.
-     * The implementation depends on the contracting characters being "completed",
-     * so that it can be efficiently determined when to stop looking.
-     */
-    private Hashtable contractingTable = new Hashtable();
+    /*
    
-    /**
-     *  Special char value that means failed or terminated
-     */
-    private static final char NOT_A_CHAR = '\uFFFF';
+    private void fixSurrogateContraction(char ch) {
+        //if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0]));            
+        if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return;
+        String chs = String.valueOf(ch);
+        Object probe = contractingTable.get(chs);
+        if (probe != null) return;
+        contractingTable.put(chs, new Integer(UNSUPPORTED));
+    }
+    
+    */
    
    /**
     * Marks whether we are using the full data set, or an abbreviated version for
@ -965,11 +883,6 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
 // Made part of the object to avoid reallocating each time.
 // =============================================================

-    /**
-     * Stack for expanding characters
-     */
-    private IntStack expandingStack = new IntStack(100);
-    
    /**
     * Temporary buffers used in getSortKey to store weights
     * these are NOT strings of Unicode characters--they are
@ -990,8 +903,6 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
     * Temporary with requested decomposition
     */
    boolean storedDecomposition;
-    int hangulHackBottom;
-    int hangulHackTop;
    
    /**
     * Used for supporting Thai rearrangement
@ -1015,7 +926,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
     * (normalized) character code.
     */
    private int getCE() {
-        if (!expandingStack.isEmpty()) return expandingStack.pop();
+        if (!expandingStack.isEmpty()) return expandingStack.popFront();
        char ch;
        
        // Fetch next character. Handle rearrangement for Thai, etc.
@ -1037,190 +948,56 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
            }
        }
        
-        int ce = collationElements[ch];
-        
-        // Hangul tailoring hack
-        //if (!storedDecomposition && hangulHackBottom <= ce && ce < hangulHackTop) return fixJamo(ch, ce);   // hard coded fix!!
-
-        // if the CE is not exceptional (unsupported, contracting, expanding) we are done.
-        if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce;
-        
-        if (ce == UNSUPPORTED) {
-            int bigChar = ch;
+        index = ucaData.get(ch, decompositionBuffer, index, expandingStack);
+        int ce = expandingStack.popFront(); // pop first (guaranteed to exist!)
+        if (ce == UNSUPPORTED_FLAG) {
+            return handleUnsupported(ch);
+        }
+        return ce;
+    }
+    
+    private int handleUnsupported(char ch) {
+        int bigChar = ch;
            
-            // Special check for Hangul
-            if (isHangul(bigChar)) {
-                // MUST DECOMPOSE!!
-                hangulBuffer = new StringBuffer();
-                decomposeHangul(bigChar, hangulBuffer);
-                return getCE();
-                // RECURSIVE!!!
-            }
+        // Special check for Hangul
+        if (ucd.isHangulSyllable(bigChar)) {
+            // MUST DECOMPOSE!!
+            hangulBuffer = new StringBuffer();
+            decomposeHangul(bigChar, hangulBuffer);
+            return getCE();
+            // RECURSIVE!!!
+        }
+        
+        // special check and fix for unsupported surrogate pair, 20 1/8 bits
+        if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
+            // ignore unmatched surrogates (e.g. return zero)
+            if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
+            int ch2 = decompositionBuffer.charAt(index);
+            if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0;  // unmatched
+            index++; // skip next char
+            bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
+        }
+
                        
-            if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!!
-                return 0;
-            }
+        if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!!
+            return 0;
+        }
            
-            // special check and fix for unsupported surrogate pair, 20 1/8 bits
-            if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
-                // ignore unmatched surrogates (e.g. return zero)
-                if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
-                int ch2 = decompositionBuffer.charAt(index);
-                if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0;  // unmatched
-                index++; // skip next char
-                bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
-            }
-
-			// find the implicit values; returned in 0 and 1
-			int[] implicit = new int[2];
-			CodepointToImplicit(bigChar, implicit);
+		// find the implicit values; returned in 0 and 1
+		int[] implicit = new int[2];
+		CodepointToImplicit(bigChar, implicit);
 			
-            // Now compose the two keys
-            // first push BBBB, which is #1
+        // Now compose the two keys
+            
+        // push BBBB
                        
-            expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
+        expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
+        
+        // return AAAA
            
-            // now return AAAA, which is #0
-            
-            return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
+        return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
+        

-        }
-        if (ce == CONTRACTING) {
-            // Contracting is probably the most interesting (read "tricky") part
-            // of the algorithm.
-            // First get longest substring that is in the contracting table.
-            // For simplicity, we use a hash table for contracting.
-            // There are much better optimizations, 
-            // but they take a more complicated build algorithm than we want to show here.
-            // NOTE: We are guaranteed that the character itself is in the contracting table because
-            // of the build process.
-            String probe = String.valueOf(ch);
-            Object value = contractingTable.get(probe);
-            if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));
-            
-            // We loop, trying to add successive characters to the longest substring.
-            while (index < decompositionBuffer.length()) {
-                char ch2 = decompositionBuffer.charAt(index);
-                
-                // see whether the current string plus the next char are in
-                // the contracting table.
-                String newProbe = probe + ch2;
-                Object newValue = contractingTable.get(newProbe);
-                if (newValue == null) break;    // stop if not in table.
-                
-                // We succeeded--so update our new values, and set index
-                // and quaternary to indicate that we swallowed another character.
-                probe = newProbe;
-                value = newValue;
-                index++;
-            }
-            
-            // Now, see if we can add any combining marks
-            short lastCan = 0;
-            for (int i = index; i < decompositionBuffer.length(); ++i) {
-                // We only take certain characters. They have to be accents,
-                // and they have to not be blocked.
-                // Unlike above, if we don't find a match (and it was an accent!)
-                // then we don't stop, we continue looping.
-                char ch2 = decompositionBuffer.charAt(i);
-                short can = toD.getCanonicalClass(ch2);
-                if (can == 0) break;            // stop with any zero (non-accent)
-                if (can == lastCan) continue;   // blocked if same class as last
-                lastCan = can;                  // remember for next time
-                
-                // Now see if we can successfully add it onto our string
-                // and find it in the contracting table.
-                String newProbe = probe + ch2;
-                Object newValue = contractingTable.get(newProbe);
-                if (newValue == null) continue;
-
-                // We succeeded--so update our new values, remove the char, and update
-                // quaternary to indicate that we swallowed another character.
-                probe = newProbe;
-                value = newValue;
-                decompositionBuffer.setCharAt(i,'\u0000');  // zero char
-            }
-            
-            // we are all done, and can extract the CE from the last value set.
-            ce = ((Integer)value).intValue();
-            // if the CE is not exceptional (unsupported expanding) we are done.
-            // BTW we will never have a contracting CE at this point.
-            if ((ce & EXCEPTION_CE_MASK) != EXCEPTION_CE_MASK) return ce;
-            // otherwise fall through to expansion
-        }
-        // expanding, so copy list of items onto stack
-        int index = ce & EXCEPTION_INDEX_MASK; // get index
-        // copy onto stack from index until reach TERMINATOR
-        while (true) {
-            ce = expandingTable.get(index++);
-            if (ce == TERMINATOR) break;
-            expandingStack.push(ce);
-        }
-        return expandingStack.pop(); // pop last (guaranteed to exist!)
-    }
-    
-    // Neither Mapped nor Composite CJK: [\u3400-\u4DB5\u4E00-\u9FA5\U00020000-\U0002A6D6]
-    
-    public static boolean isCJK(int cp) {
-        return (CJK_BASE <= cp && cp < CJK_LIMIT 
-        || cp == 0xFA0E	// compat characters that don't decompose.
-        || cp == 0xFA0F
-        || cp == 0xFA11
-        || cp == 0xFA13
-        || cp == 0xFA14
-        || cp == 0xFA1F
-        || cp == 0xFA21
-        || cp == 0xFA23
-        || cp == 0xFA24
-        || cp == 0xFA27
-        || cp == 0xFA28
-        || cp == 0xFA29
-        || cp == 0xFA2E
-        || cp == 0xFA2F
-        );
-    }
-    
-    public static final int 
-    	CJK_BASE = 0x4E00,
-    	CJK_LIMIT = 0x9FFF+1,
-    	CJK_COMPAT_USED_BASE = 0xFA0E,
-    	CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
-    	CJK_A_BASE = 0x3400,
-    	CJK_A_LIMIT = 0x4DBF+1,
-    	CJK_B_BASE = 0x20000,
-    	CJK_B_LIMIT = 0x2A6DF+1;
-    
-    public static final boolean isCJK_AB(int bigChar) {
-        return (CJK_A_BASE <= bigChar && bigChar < CJK_A_LIMIT
-             || CJK_B_BASE <= bigChar && bigChar < CJK_B_LIMIT);
-    }
-/*
-2E80..2EFF; CJK Radicals Supplement
-2F00..2FDF; Kangxi Radicals
-
-3400..4DBF; CJK Unified Ideographs Extension A
-4E00..9FFF; CJK Unified Ideographs
-F900..FAFF; CJK Compatibility Ideographs
-
-20000..2A6DF; CJK Unified Ideographs Extension B
-2F800..2FA1F; CJK Compatibility Ideographs Supplement
-
-Compat:
-# F900..FA0D     [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
-# FA10                 CJK COMPATIBILITY IDEOGRAPH-FA10
-# FA12                 CJK COMPATIBILITY IDEOGRAPH-FA12
-# FA15..FA1E      [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
-# FA20                 CJK COMPATIBILITY IDEOGRAPH-FA20
-# FA22                 CJK COMPATIBILITY IDEOGRAPH-FA22
-# FA25..FA26       [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
-# FA2A..FA2D       [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D
-# FA30..FA6A      [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A
-# 2F800..2FA1D   [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
-
-*/
-    
-    private final boolean isHangul(int bigChar) {
-        return (0xAC00 <= bigChar && bigChar <= 0xD7A3);
    }
    
    /**
@ -1287,12 +1064,12 @@ Compat:
     */
    private int count1 = 0, count2 = 0, count3 = 0, max2 = 0, max3 = 0;
    private int oldKey1 = -1, oldKey2 = -1, oldKey3 = -1;
-    Map multiTable = new TreeMap();
-    BitSet found = new BitSet();
+    UnicodeSet found = new UnicodeSet();
    
-    public Hashtable getContracting() {
+    /*public Hashtable getContracting() {
        return new Hashtable(multiTable);
    }
+    */
    
    public UCAContents getContents(byte ceLimit, Normalizer skipDecomps) {
        return new UCAContents(ceLimit, skipDecomps, ucdVersion);
@ -1317,6 +1094,16 @@ Compat:
            this.ceLimit = ceLimit;
            this.nfd = new Normalizer(Normalizer.NFD, unicodeVersion);
            this.skipDecomps = skipDecomps;
+            
+            // FIX SAMPLES
+            if (SAMPLE_RANGES[0][0] == 0) {
+                for (int i = 0; ; ++i) { // add first unallocated character
+                    if (!ucd.isAssigned(i)) {
+                        SAMPLE_RANGES[0][0] = i;
+                        break;
+                    }
+                }
+            }
        }
        
        /**
@ -1334,7 +1121,9 @@ Compat:
            
            // normal case
            while (current++ < 0x10FFFF) {
-
+                if (current == 0x406) {
+                    System.out.println("DEBUG");
+                }
                //char ch = (char)current;
                byte type = getCEType(current);
                if (type >= ceLimit || type == CONTRACTING_CE) continue;
@ -1349,15 +1138,18 @@ Compat:
            }
            
            // contractions
-            if (enum == null) enum = multiTable.keySet().iterator();
-            if (enum.hasNext()) {
+            if (enum == null) enum = ucaData.getContractions();
+            while (enum.hasNext()) {
                result = (String)enum.next();
+                if (result.length() == 1 && UTF16.isLeadSurrogate(result.charAt(0))) {
+                    //System.out.println("Skipping " + ucd.getCodeAndName(result));
+                    continue; // try again
+                }
                return result;
            }
            
            // extra samples
            if (currentRange < SAMPLE_RANGES.length) {
-            	System.out.println("*");
                try {
                    result = UTF16.valueOf(itemInRange);
                } catch (RuntimeException e) {
@ -1372,10 +1164,11 @@ Compat:
                        endOfRange = SAMPLE_RANGES[currentRange].length > 1
                            ? SAMPLE_RANGES[currentRange][1]
                            : startOfRange;
-                        skip = ((endOfRange - startOfRange) / 513);
+                        //skip = ((endOfRange - startOfRange) / 3);
                    }
-                } else if (itemInRange > startOfRange + 9 && itemInRange < endOfRange - 9 - skip) {
-                    itemInRange += skip;
+                } else if (itemInRange > startOfRange + 5 && itemInRange < endOfRange - 5 /* - skip*/) {
+                    //itemInRange += skip;
+                    itemInRange = endOfRange - 5;
                }
            }
            
@ -1410,14 +1203,16 @@ Compat:
    }
    
    static final int[][] SAMPLE_RANGES = {
-                {0x10000},
-                {0x10FFFF},
-                {0x0220},
+                {0}, // LEAVE EMPTY--Turns into first unassigned character
                {0xFFF0}, 
                {0xD800},
                {0xDFFF},
                {0xFFFE},
                {0xFFFF},
+                {0x10000},
+                {0xC0000},
+                {0xD0000},
+                {0x10FFFF},
                {0x10FFFE},
                {0x10FFFF},
                {0x3400, 0x4DB5},
@ -1426,7 +1221,7 @@ Compat:
                {0xA000, 0xA48C},
                {0xE000, 0xF8FF},
                {0x20000, 0x2A6D6},
-                {0xE0000, 0xE00FF},
+                {0xE0000, 0xE007E},
                {0xF0000, 0xF00FD},
                {0xFFF00, 0xFFFFD},
                {0x100000, 0x1000FD},
@ -1438,7 +1233,7 @@ Compat:
     * Values will override any previous mappings.
     */
    private void addCollationElements(BufferedReader in) throws java.io.IOException {
-        IntStack tempStack = new IntStack(100); // used for reversal
+        IntStack tempStack = new IntStack(100);
        StringBuffer multiChars = new StringBuffer(); // used for contracting chars
        String inputLine = "";
        boolean[] wasImplicitLeadPrimary = new boolean[1];
@ -1448,6 +1243,10 @@ Compat:
            if (inputLine == null) break;       // means file is done
            String line = cleanLine(inputLine); // remove comments, extra whitespace
            if (line.length() == 0) continue;   // skip empty lines
+            
+            if (DEBUG_SHOW_LINE) {
+                System.out.println("Processing: " + inputLine);
+            } 

            position[0] = 0;                    // start at front of line
            if (line.startsWith("@version")) {
@ -1464,29 +1263,21 @@ Compat:
            }
            
            // collect characters
-            char value = getChar(line, position);
-            fixSurrogateContraction(value);
-            char value2 = getChar(line, position);
            multiChars.setLength(0);            // clear buffer
-            if (value2 != NOT_A_CHAR) {
-                fixSurrogateContraction(value2);
-                multiChars.append(value);       // append until we get terminator
+            
+            char value = getChar(line, position);
+            multiChars.append(value);
+            
+            //fixSurrogateContraction(value);
+            char value2 = getChar(line, position);
+            // append until we get terminator
+            while (value2 != NOT_A_CHAR) {
                multiChars.append(value2);
-                while (true) {
-                    value2 = getChar(line, position);
-                    if (value2 == NOT_A_CHAR) break;
-                    fixSurrogateContraction(value2);
-                    multiChars.append(value2);
-                }
+                value2 = getChar(line, position);
            }
+
            if (RECORDING_CHARS) {
-                if (multiChars.length() > 1) {
-                    multiTable.put(multiChars.toString(), "");
-                }
-                found.set(value);
-                for (int i = 1; i < multiChars.length(); ++i) {
-                    found.set(multiChars.charAt(i));
-                }
+                found.addAll(multiChars.toString());
            }
            if (!fullData && RECORDING_DATA) {
                if (value == 0 || value == '\t' || value == '\n' || value == '\r'
@ -1522,141 +1313,69 @@ Compat:
                    }
                }
            }
-            if (ce2 != TERMINATOR) { // have expanding character!
-                // put list into the expanding table
-                // use a temporary stack to get them in reverse order
-                tempStack.push(ce);
-                tempStack.push(ce2);
-                // set collationElement to exception value, plus index
-                ce = EXPANDING_MASK | expandingTable.getTop();
-                while (true) {
-                    ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
-                    if (ce2 == TERMINATOR) break;
-                    tempStack.push(ce2);
-                } 
-                // push onto expanding table, now in reverse order
-                while (!tempStack.isEmpty()) expandingTable.push(tempStack.pop());
-                expandingTable.push(TERMINATOR);
-            }
            
-            //if (value == 0xd801) System.out.print("DEBUG: " + line);
-            	
-            // assign CE(s) to char(s)
-            if (multiChars.length() > 0) {
-                contractingTable.put(multiChars.toString(), new Integer(ce));
-                if (collationElements[value] == UNSUPPORTED) {
-                    collationElements[value] = CONTRACTING; // mark special
-                } else if (collationElements[value] != CONTRACTING) {
-                    // move old value to contracting table!
-                    contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
-                    collationElements[value] = CONTRACTING; // signal we must look up in table
-                }
-            } else if (collationElements[value] == CONTRACTING) {
-                // must add old value to contracting table!
-                contractingTable.put(String.valueOf(value), new Integer(ce));
-            } else {
-                collationElements[value] = ce; // normal
-            }
-        //} catch (Exception e) {
-          //  throw new IllegalArgumentException("Malformed line: " + inputLine + "\n " 
-            //  + e.getClass().getName() + ": " + e.getMessage());
+            tempStack.clear();
+            tempStack.push(ce);
+            
+            // Collect any remaining CEs for this entry; the loop test itself stops at TERMINATOR.
+            while (ce2 != TERMINATOR) {
+                tempStack.push(ce2);
+                ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
+            } 
+            
+            ucaData.add(multiChars, tempStack);
+            
        } catch (RuntimeException e) {
            System.out.println("Error on line: " + inputLine);
            throw e;
        }
    }
    
-    private void fixSurrogateContraction(char ch) {
-        //if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0]));            
-        if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return;
-        String chs = String.valueOf(ch);
-        Object probe = contractingTable.get(chs);
-        if (probe != null) return;
-        contractingTable.put(chs, new Integer(0));
-    }
-    
+    /*
    private void concat(int[] ces1, int[] ces2) {
        
    }
-    
-    private void add(String source, int[] ces, int ceLen) {
-        
-        int ce;
-        if (ceLen < 1) {
-            throw new IllegalArgumentException("CE too short: " + ceLen);
-        } else if (ceLen == 1) {
-            ce = ces[0];
-        } else {
-            ce = EXPANDING_MASK | expandingTable.getTop();
-            for (int i = 0; i < ceLen; ++i) {
-                expandingTable.push(ces[i]);
-            }
-        }
-        
-        // assign CE(s) to char(s)
-        int value = source.charAt(0);
-        //if (value == 0x10000) System.out.print("DEBUG2: " + source);
-            	        
-        if (source.length() > 0) {
-            contractingTable.put(source.toString(), new Integer(ce));
-            if (collationElements[value] == UNSUPPORTED) {
-                collationElements[value] = CONTRACTING; // mark special
-            } else if (collationElements[value] != CONTRACTING) {
-                // move old value to contracting table!
-                contractingTable.put(String.valueOf(value), new Integer(collationElements[value]));
-                collationElements[value] = CONTRACTING; // signal we must look up in table
-            }
-        } else if (collationElements[value] == CONTRACTING) {
-            // must add old value to contracting table!
-            contractingTable.put(source, new Integer(ce));
-        } else {
-            collationElements[source.charAt(0)] = ce; // normal
-        }
-    }
+    */
    
    /**
     * Checks the internal tables corresponding to the UCA data.
     */
    private void cleanup() {
        
-        // at this point, we have to guarantee that the contractingTable is CLOSED
-        // e.g. if a substring of length n is in the table, then the first n-1 characters
-        // are also!!
+        ucaData.checkConsistency();
+
+        Map missingStrings = new HashMap();
+        Map tempMap = new HashMap();
        
-        
-/*
-0FB2 0F71 ; [.124E.0020.0002.0FB2][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER RA + TIBETAN VOWEL SIGN AA
-0FB3 0F71 ; [.1250.0020.0002.0FB3][.125F.0020.0002.0F71] # TIBETAN SUBJOINED LETTER LA + TIBETAN VOWEL SIGN AA
-        int[] temp1 = int[20];
-        int[] temp2 = int[20];
-        int[] temp3 = int[20];
-        getCEs("\u0fb2", true, temp1);
-        getCEs("\u0fb3", true, temp2);
-        getCEs("\u0f71", true, temp3);
-        add("\u0FB2\u0F71", concat(temp1, temp3));
-*/
-        
-        Hashtable missingStrings = new Hashtable();
-        
-        int[] temp1 = new int[20];
-        Enumeration enum = contractingTable.keys();
-        while (enum.hasMoreElements()) {
-            String sequence = (String)enum.nextElement();
+        Iterator enum = ucaData.getContractions();
+        while (enum.hasNext()) {
+            String sequence = (String)enum.next();
            //System.out.println("Contraction: " + Utility.hex(sequence));
            for (int i = sequence.length()-1; i > 0; --i) {
                String shorter = sequence.substring(0,i);
-                Object probe = contractingTable.get(shorter);
-                if (probe == null) {
-                    int len = getCEs(shorter, true, temp1);
-                    if (false) System.out.println("WARNING: CLOSING: " + UCD.make().getCodeAndName(shorter) + " => " + ceToString(temp1, len));
-                    add(shorter, temp1, len);
+                if (!ucaData.contractionTableContains(shorter)) {
+                    IntStack tempStack = new IntStack(1);
+                    getCEs(shorter, true, tempStack);
+                    if (false) System.out.println("WARNING: CLOSING: " + ucd.getCodeAndName(shorter)
+                        + " => " + CEList.toString(tempStack));
+                    tempMap.put(shorter, tempStack);
                    // missingStrings.put(shorter,"");
                    // collationElements[sequence.charAt(0)] = UNSUPPORTED; // nuke all bad values
                }
            }
        }
        
-        enum = missingStrings.keys();
+        // now add them. We couldn't before because we were iterating over it.
+        
+        enum = tempMap.keySet().iterator();
+        while (enum.hasNext()) {
+            String shorter = (String) enum.next();
+            IntStack tempStack = (IntStack) tempMap.get(shorter);
+            ucaData.add(shorter, tempStack);
+        }
+        
+        
+        enum = missingStrings.keySet().iterator();
        if (missingStrings.size() != 0) {
            /**
            while (enum.hasMoreElements()) {
@ -1666,26 +1385,30 @@ Compat:
            }
            */
            String errorMessage = "";
-            while (enum.hasMoreElements()) {
-                String missing = (String)enum.nextElement();
+            while (enum.hasNext()) {
+                String missing = (String)enum.next();
                if (errorMessage.length() != 0) errorMessage += ", ";
                errorMessage += "\"" + missing + "\"";
            }
            throw new IllegalArgumentException("Contracting table not closed! Missing " + errorMessage);
        }
-        
+
        //fixlater;
        variableLowCE = variableLow << 16;
        variableHighCE = (variableHigh << 16) | 0xFFFF; // turn on bottom bits
        
-        hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries
-        hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries
-        if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop));
+        //int hangulHackBottom;
+        //int hangulHackTop;
+        
+        //hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries
+        //hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries
+        //if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop));
        
        // show some statistics
        if (SHOW_STATS) System.out.println("\tcount1: " + count1);
        if (SHOW_STATS) System.out.println("\tcount2: " + max2);
        if (SHOW_STATS) System.out.println("\tcount3: " + max3);
+        if (SHOW_STATS) System.out.println("\tcontractions: " + ucaData.getContractionCount());
        
        if (SHOW_STATS) System.out.println("\tMIN1/MAX1: " + Utility.hex(MIN1) + "/" + Utility.hex(MAX1));
        if (SHOW_STATS) System.out.println("\tMIN2/MAX2: " + Utility.hex(MIN2) + "/" + Utility.hex(MAX2));
@ -1912,7 +1635,7 @@ Compat:
    /**
     * Used for checking data file integrity
     */
-    private Hashtable uniqueTable = new Hashtable();
+    private Map uniqueTable = new HashMap();
    
    /**
     * Used for checking data file integrity
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ 
-* $Date: 2002/06/13 21:14:05 $ 
-* $Revision: 1.18 $
+* $Date: 2002/06/15 02:47:12 $ 
+* $Revision: 1.19 $
 *
 *******************************************************************************
 */
@ -31,9 +31,12 @@ import com.ibm.text.UCD.UCD_Types;
 import com.ibm.text.utility.*;
 import com.ibm.text.UCD.Normalizer;

-public class WriteCollationData implements UCD_Types {
+public class WriteCollationData implements UCD_Types, UCA_Types {
 	
 	static final boolean DEBUG = false;
+	static final boolean DEBUG_SHOW_ITERATION = true;
+	
+	
 	
    public static final String copyright = 
      "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
@ -289,7 +292,21 @@ public class WriteCollationData implements UCD_Types {
    
    
    static void writeConformance(String filename, byte option, boolean shortPrint)  throws IOException {
-        UCD ucd30 = UCD.make("3.0.0");
+        //UCD ucd30 = UCD.make("3.0.0");
+        
+/*
+U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
+ => U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS, U+0304 COMBINING MACRON
+*/
+        String[] testList = {"\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"};
+        for (int jj = 0; jj < testList.length; ++jj) {
+            String t = testList[jj];
+            System.out.println(ucd.getCodeAndName(t));
+            String test = collator.getSortKey(t, UCA.NON_IGNORABLE);
+            System.out.println("Decomp: " + collator.toString(test));
+            test = collator.getSortKey(t, UCA.NON_IGNORABLE, false);
+            System.out.println("No Dec: " + collator.toString(test));
+        }
        
        PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, false);
        if (!shortPrint) log.write('\uFEFF');
@ -297,9 +314,39 @@ public class WriteCollationData implements UCD_Types {
        System.out.println("Sorting");
        int counter = 0;
        
-        for (int i = 0; i <= 0x10FFFF; ++i) {
+        UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
+        cc.enableSamples();
+        UnicodeSet found2 = new UnicodeSet();
+        
+        while (true) {
+            String s = cc.next();
+            if (s == null) break;
+            
+            found2.addAll(s);
+            
+            if (DEBUG_SHOW_ITERATION) {
+                int cp = UTF16.charAt(s, 0);
+                if (cp == 0x220 || !ucd.isAssigned(cp) || ucd.isCJK_BASE(cp)) {
+                    System.out.println(ucd.getCodeAndName(s));
+                }
+            }
            Utility.dot(counter++);
-            if (!ucd.isRepresented(i)) continue;
+            addStringX(s, option);
+            // TODO: add other accents with Cyrillic
+        }
+        
+        UnicodeSet found = collator.found;
+        // Sanity check: the iteration (found2) must cover everything in collator.found; extras in found2 are allowed.
+        if (!found2.containsAll(found)) {
+            System.out.println("In both: " + new UnicodeSet(found).retainAll(found2).toPattern(true));
+            System.out.println("In UCA but not iteration: " + new UnicodeSet(found).removeAll(found2).toPattern(true));
+            System.out.println("In iteration but not UCA: " + new UnicodeSet(found2).removeAll(found).toPattern(true));
+            throw new IllegalArgumentException("Inconsistent data");
+        }
+        
+        /*
+        for (int i = 0; i <= 0x10FFFF; ++i) {
+            if (!ucd.isAssigned(i)) continue;
            addStringX(UTF32.valueOf32(i), option);
        }
        
@ -318,15 +365,6 @@ public class WriteCollationData implements UCD_Types {
            addStringX(s, option);
        }
        
-        for (int i = 0; ; ++i) { // add first unallocated character
-            if (!ucd.isAssigned(i)) {
-                String s = UTF32.valueOf32(i);
-                Utility.fixDot();
-                System.out.println("Adding: " + Utility.hex(s));
-                addStringX(s, option);
-                break;
-            }
-        }
        
        
        for (int i = 0; i < extraConformanceRanges.length; ++i) {
@ -343,6 +381,7 @@ public class WriteCollationData implements UCD_Types {
            addStringX(end-1, option);
            addStringX(end, option);
        }
+        */
        
        Utility.fixDot();
        System.out.println("Total: " + sortedD.size());
@ -364,12 +403,12 @@ public class WriteCollationData implements UCD_Types {
            //String status = key.equals(lastKey) ? "*" : "";
            //lastKey = key;
            //log.println(source);
+            char extra = source.charAt(source.length()-1);
            String clipped = source.substring(0, source.length()-1);
-            String stren = source.substring(source.length()-1);
            if (!shortPrint) {
                log.print(Utility.hex(source));
                log.print(
-                    ";\t#" + ucd.getName(clipped) + "\t" + UCA.toString(key));
+                    ";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key));
            } else {
                log.print(source + "\t" + Utility.hex(clipped));
            }
@ -384,13 +423,15 @@ public class WriteCollationData implements UCD_Types {
    static void addStringX(int x, byte option) {
        addStringX(UTF32.valueOf32(x), option);
    }
+    
+    static final char LOW_ACCENT = '\u0325';
   
    static void addStringX(String s, byte option) {
        addStringY(s + 'a', option);
        addStringY(s + 'A', option);
        addStringY(s + 'á', option);
        addStringY(s + 'b', option);
-        addStringY(s + '\u0325', option);
+        addStringY(s + LOW_ACCENT, option);
        addStringY(s + '!', option);
    }
    
@ -527,7 +568,7 @@ public class WriteCollationData implements UCD_Types {
            
            if (!arraysMatch(kenCes, kenLen, markCes, markLen)) {
                int kenCLen = fixCompatibilityCE(s, true, kenComp, true);
-                String comp = collator.ceToString(kenComp, kenCLen);
+                String comp = CEList.toString(kenComp, kenCLen);
                
                if (arraysMatch(kenCes, kenLen, kenComp, kenCLen)) {
                    forLater.put((char)(COMPRESSED | type) + s, comp);
@ -567,10 +608,10 @@ public class WriteCollationData implements UCD_Types {
            String comp = (String)forLater.get(key);
            
            int kenLen = collator.getCEs(s, decompType, kenCes);
-            String kenStr = collator.ceToString(kenCes, kenLen);
+            String kenStr = CEList.toString(kenCes, kenLen);
            
            int markLen = fixCompatibilityCE(s, true, markCes, false);
-            String markStr = collator.ceToString(markCes, markLen);
+            String markStr = CEList.toString(markCes, markLen);
            
            if ((type & COMPRESSED) != 0) {
                log.println("COMPRESSED #" + (++count) + ": " + ucd.getCodeAndName(s));
@ -589,7 +630,7 @@ public class WriteCollationData implements UCD_Types {
                    log.println("NFD       : " + ucd.getCodeAndName(nfd));
                }
                //kenCLen = collator.getCEs(decomp, true, kenComp);
-                //log.println("decomp ce: " + collator.ceToString(kenComp, kenCLen));                   
+                //log.println("decomp ce: " + CEList.toString(kenComp, kenCLen));                   
            }
            log.println();
        }
@ -785,7 +826,7 @@ public class WriteCollationData implements UCD_Types {
            
            if (s.length() > 1) {
                diLog.println(Utility.hex(s, " ")
-                    + ";\t #" + collator.ceToString(ces, len)
+                    + ";\t #" + CEList.toString(ces, len)
                    + " ( " + s + " )"
                    + " " + ucd.getName(s));
            }
@ -859,7 +900,7 @@ public class WriteCollationData implements UCD_Types {
                ccc = UTF32.char32At(s,kk);
                byte cat = ucd.getCategory(ccc);
                if (cat == Cf || cat == Cc || cat == Zs || cat == Zl || cat == Zp) {
-                    sortedCodes.add(UCA.ceToString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
+                    sortedCodes.add(CEList.toString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
                    break;
                }
            }
@ -882,7 +923,7 @@ public class WriteCollationData implements UCD_Types {
                if (collator.isVariable(ce)) haveMixture |= 1;
                else haveMixture |= 2;
                if (haveMixture == 3) {
-                    mixedCEs.add(UCA.ceToString(ces, len) + "\t" + ucd.getCodeAndName(s));
+                    mixedCEs.add(CEList.toString(ces, len) + "\t" + ucd.getCodeAndName(s));
                }
            }
        }
@ -1030,7 +1071,7 @@ public class WriteCollationData implements UCD_Types {
                ccc = UTF32.char32At(s,kk);
                byte cat = ucd.getCategory(ccc);
                if (cat == Cf || cat == Cc || cat == Zs || cat == Zl || cat == Zp) {
-                    sortedCodes.add(UCA.ceToString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
+                    sortedCodes.add(CEList.toString(ces, lenArray[0]) + "\t" + ucd.getCodeAndName(s));
                    break;
                }
            }
@ -1053,7 +1094,7 @@ public class WriteCollationData implements UCD_Types {
                if (collator.isVariable(ce)) haveMixture |= 1;
                else haveMixture |= 2;
                if (haveMixture == 3) {
-                    mixedCEs.add(UCA.ceToString(ces, len) + "\t" + ucd.getCodeAndName(s));
+                    mixedCEs.add(CEList.toString(ces, len) + "\t" + ucd.getCodeAndName(s));
                }
            }
        }
@ -1130,8 +1171,8 @@ public class WriteCollationData implements UCD_Types {
            + "\t" + head
            //+ "\t" + Utility.hex(oldWeight)
            //+ " => " + Utility.hex(newWeight)
-            + "\t" + collator.ceToString(ces, len)
-            + (doNew ? " => " + collator.ceToString(newCes, newLen) : "")
+            + "\t" + CEList.toString(ces, len)
+            + (doNew ? " => " + CEList.toString(newCes, newLen) : "")
            + "\t( " + src + " )"
            + "\t" + ucd.getName(src)
            );
@ -1198,7 +1239,7 @@ public class WriteCollationData implements UCD_Types {
        
        if (false) {
        int len2 = collator.getCEs("\u2474", true, ces);
-        System.out.println(UCA.ceToString(ces, len2));
+        System.out.println(CEList.toString(ces, len2));

        String a = collator.getSortKey("a");
        String b = collator.getSortKey("A");
@ -1442,9 +1483,9 @@ F900..FAFF; CJK Compatibility Ideographs
           
            
            if (false) System.out.println(
-                collator.ceToString(lastCE) + " " 
-                + collator.ceToString(ce) + " " 
-                + collator.ceToString(nextCE) + " " 
+                CEList.toString(lastCE) + " " 
+                + CEList.toString(ce) + " " 
+                + CEList.toString(nextCE) + " " 
                + ucd.getCodeAndName(chr)
                );
            
@ -1513,7 +1554,7 @@ F900..FAFF; CJK Compatibility Ideographs
            */

            if (chr.equals("\u2F00")) {
-                System.out.println(UCA.ceToString(ces, len));
+                System.out.println(CEList.toString(ces, len));
            }
            
            // There are double-CEs, so we have to know what the length of the first bit is.
@ -1561,7 +1602,7 @@ F900..FAFF; CJK Compatibility Ideographs
                if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion));
                if (option == WITH_NAMES) {
                    log.print("\t# " 
-                        + collator.ceToString(ces, len) + " " 
+                        + CEList.toString(ces, len) + " " 
                        + ucd.getCodeAndName(chr));
                    if (expansion.length() > 0) log.print(" / " + Utility.hex(expansion));
                }
@ -1801,7 +1842,7 @@ F900..FAFF; CJK Compatibility Ideographs
                    
                    // we failed completely. Print error message, and bail
                    
-                    System.out.println("No back map for " + collator.ceToString(ces[i])
+                    System.out.println("No back map for " + CEList.toString(ces[i])
                        + " from " + CEList.toString(ces, len));
                    System.out.println("\t" + ucd.getCodeAndName(chr)
                        + " => " + ucd.getCodeAndName(nfkdNew.normalize(chr))
@ -2126,6 +2167,7 @@ F900..FAFF; CJK Compatibility Ideographs
                continue;
            }
            canIt.setSource(key);
+            
            boolean first = true;
            while (true) {
                String s = canIt.next();
@ -2134,9 +2176,6 @@ F900..FAFF; CJK Compatibility Ideographs
                if (contentsForCanonicalIteration.contains(s)) continue;
                if (additionalSet.contains(s)) continue;
                
-                if (s.equals("\u01EC")) {
-                    System.out.println("01ec");
-                }
                
                // Skip anything that is not FCD.
                if (!NFD.isFCD(s)) continue;
@ -2234,7 +2273,7 @@ F900..FAFF; CJK Compatibility Ideographs
        log.println("#  - Differs from previous version in that MAX value was introduced at 1F.");
        log.println("#    All tertiary values are shifted down by 1, filling the gap at 7!");
        
-        int firstImplicit = getImplicitPrimary(UCA.CJK_BASE) >>> 24;
+        int firstImplicit = getImplicitPrimary(CJK_BASE) >>> 24;
        int lastImplicit = getImplicitPrimary(0x10FFFF) >>> 24;
        log.println("[FIRST_IMPLICIT= " + Utility.hex(firstImplicit) + "]");
        log.println("[LAST_IMPLICIT= " + Utility.hex(lastImplicit) + "]");
@ -2285,13 +2324,15 @@ F900..FAFF; CJK Compatibility Ideographs
                int sec = UCA.getSecondary(ces[q]); 
                int ter = UCA.getTertiary(ces[q]);
                
-                oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
+                oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
                
                // special treatment for unsupported!
                
                if (UCA.isImplicitLeadPrimary(pri)) {
+                    System.out.println("DEBUG: " + CEList.toString(ces, len) 
+                        + ", Current: " + q + ", " + ucd.getCodeAndName(chr));
                    ++q;
-                    oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
+                    oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
                
                    int pri2 = UCA.getPrimary(ces[q]);
                    // get old code point
@ -2301,7 +2342,7 @@ F900..FAFF; CJK Compatibility Ideographs
                    // double check results!
                    
                    int[] testImplicit = new int[2];
-                    UCA.CodepointToImplicit(cp, testImplicit);
+                    collator.CodepointToImplicit(cp, testImplicit);
                    boolean gotError = pri != testImplicit[0] || pri2 != testImplicit[1];
                    if (gotError) {
                    	System.out.println("ERROR");
@ -2360,7 +2401,7 @@ F900..FAFF; CJK Compatibility Ideographs
            }
            if (nonePrinted) {
                log.print("[,,]");
-                oldStr.append(UCA.ceToString(0));
+                oldStr.append(CEList.toString(0));
            }
            longLog.print("    # " + oldStr + " # " + ucd.getName(UTF16.charAt(chr, 0)));
            log.println();
@ -2386,7 +2427,7 @@ F900..FAFF; CJK Compatibility Ideographs
        
        boolean lastOne = false;
        for (int i = 0; i < 0x10FFFF; ++i) {
-            boolean thisOne = UCA.isCJK(i) || UCA.isCJK_AB(i);
+            boolean thisOne = ucd.isCJK_BASE(i) || ucd.isCJK_AB(i);
            if (thisOne != lastOne) {
                summary.println("# Implicit Cusp: CJK=" + lastOne + ": " + Utility.hex(i-1) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i-1)));
                summary.println("# Implicit Cusp: CJK=" + thisOne + ": " + Utility.hex(i) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i)));
@ -2425,7 +2466,7 @@ F900..FAFF; CJK Compatibility Ideographs
            summary.print("# " + Utility.hex(i) + ": (" + Utility.hex(newval) + ") "
                + Utility.hex(sampleEq[i]) + " ");
            for (int q = 0; q < len; ++q) {
-                summary.print(UCA.ceToString(ces[q]));
+                summary.print(CEList.toString(ces[q]));
            }
            summary.println(" " + ucd.getName(sampleEq[i]));
        }
@ -2499,24 +2540,24 @@ F900..FAFF; CJK Compatibility Ideographs
 	*/
 static int swapCJK(int i) {
    	
-	if (i >= UCA.CJK_BASE) {
-		if (i < UCA.CJK_LIMIT)				return i - UCA.CJK_BASE;
+	if (i >= CJK_BASE) {
+		if (i < CJK_LIMIT)				return i - CJK_BASE;
 			
-		if (i < UCA.CJK_COMPAT_USED_BASE)	return i + NON_CJK_OFFSET;
+		if (i < CJK_COMPAT_USED_BASE)	return i + NON_CJK_OFFSET;
    		
-		if (i < UCA.CJK_COMPAT_USED_LIMIT)	return i - UCA.CJK_COMPAT_USED_BASE
-												+ (UCA.CJK_LIMIT - UCA.CJK_BASE);
-		if (i < UCA.CJK_B_BASE)				return i + NON_CJK_OFFSET;
+		if (i < CJK_COMPAT_USED_LIMIT)	return i - CJK_COMPAT_USED_BASE
+												+ (CJK_LIMIT - CJK_BASE);
+		if (i < CJK_B_BASE)				return i + NON_CJK_OFFSET;
    		
-		if (i < UCA.CJK_B_LIMIT)			return i; // non-BMP-CJK
+		if (i < CJK_B_LIMIT)			return i; // non-BMP-CJK
    		
 		return i + NON_CJK_OFFSET;	// non-CJK
 	}
-	if (i < UCA.CJK_A_BASE)					return i + NON_CJK_OFFSET;
+	if (i < CJK_A_BASE)					return i + NON_CJK_OFFSET;
 		
-	if (i < UCA.CJK_A_LIMIT)				return i - UCA.CJK_A_BASE
-												+ (UCA.CJK_LIMIT - UCA.CJK_BASE) 
-												+ (UCA.CJK_COMPAT_USED_LIMIT - UCA.CJK_COMPAT_USED_BASE);
+	if (i < CJK_A_LIMIT)				return i - CJK_A_BASE
+												+ (CJK_LIMIT - CJK_BASE) 
+												+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
    return i + NON_CJK_OFFSET; // non-CJK
 }
    
@ -2642,14 +2683,14 @@ static int swapCJK(int i) {
            oldPrimary = newPrimary;
    	}
    	
-        showImplicit("# First CJK", UCA.CJK_BASE);
-        showImplicit("# Last CJK", UCA.CJK_LIMIT-1);
-        showImplicit("# First CJK-compat", UCA.CJK_COMPAT_USED_BASE);
-        showImplicit("# Last CJK-compat", UCA.CJK_COMPAT_USED_LIMIT-1);
-        showImplicit("# First CJK_A", UCA.CJK_A_BASE);
-        showImplicit("# Last CJK_A", UCA.CJK_A_LIMIT-1);
-        showImplicit("# First CJK_B", UCA.CJK_B_BASE);
-        showImplicit("# Last CJK_B", UCA.CJK_B_LIMIT-1);
+        showImplicit("# First CJK", CJK_BASE);
+        showImplicit("# Last CJK", CJK_LIMIT-1);
+        showImplicit("# First CJK-compat", CJK_COMPAT_USED_BASE);
+        showImplicit("# Last CJK-compat", CJK_COMPAT_USED_LIMIT-1);
+        showImplicit("# First CJK_A", CJK_A_BASE);
+        showImplicit("# Last CJK_A", CJK_A_LIMIT-1);
+        showImplicit("# First CJK_B", CJK_B_BASE);
+        showImplicit("# Last CJK_B", CJK_B_LIMIT-1);
        showImplicit("# First Other Implicit", 0);
        showImplicit("# Last Other Implicit", 0x10FFFF);
        
@ -2667,9 +2708,9 @@ static int swapCJK(int i) {
        		
        		// separate the three groups
        		
-        		if (UCA.isCJK(i) || UCA.CJK_COMPAT_USED_BASE <= i && i < UCA.CJK_COMPAT_USED_LIMIT) {
+        		if (ucd.isCJK_BASE(i) || CJK_COMPAT_USED_BASE <= i && i < CJK_COMPAT_USED_LIMIT) {
        			if (batch != 0) continue;
-        		} else if (UCA.isCJK_AB(i)) {
+        		} else if (ucd.isCJK_AB(i)) {
        			if (batch != 1) continue;
        		} else if (batch != 2) continue;
        		
@ -2993,7 +3034,7 @@ static int swapCJK(int i) {
         
        for (char ch = 0; ch < 0xFFFF; ++ch) {
            byte type = collator.getCEType(ch);
-            if (type < UCA.FIXED_CE) {
+            if (type < FIXED_CE) {
                int len = collator.getCEs(String.valueOf(ch), true, ces);
                int primary = UCA.getPrimary(ces[0]);
                if (primary < variableHigh) continue;
@ -3088,36 +3129,22 @@ static int swapCJK(int i) {
        System.out.println("Sorting");
        
        for (int i = 0; i <= 0xFFFF; ++i) {
-            if (EXCLUDE_UNSUPPORTED && !collator.found.get(i)) continue;
+            if (EXCLUDE_UNSUPPORTED && !collator.found.contains(i)) continue;
            if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use
            //if (0xA000 <= c && c <= 0xA48F) continue; // skip YI
            addString(UTF32.valueOf32(i), option);
        }
        
-        Hashtable multiTable = collator.getContracting();
-        Enumeration enum = multiTable.keys();
-        while (enum.hasMoreElements()) {
-            addString((String)enum.nextElement(), option);
-        }
+
+        UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
+        cc.enableSamples();
        
-        for (int i = 0; i < extraConformanceTests.length; ++i) { // put in sample non-characters
-            addString(extraConformanceTests[i], option);
+        while (true) {
+            String s = cc.next();
+            if (s == null) break;
+            addString(s, option);
        }
-        
-        for (int i = 0; i < extraConformanceRanges.length; ++i) {
-            int start = extraConformanceRanges[i][0];
-            int end = extraConformanceRanges[i][1];
-            int increment = ((end - start + 1) / 303) + 1;
-            //System.out.println("Range: " + start + ", " + end + ", " + increment);
-            addString(start, option);
-            for (int j = start+1; j < end-1; j += increment) {
-                addString(j, option);
-                addString(j+1, option);
-            }
-            addString(end-1, option);
-            addString(end, option);
-        }
-        
+                
        System.out.println("Total: " + sortedD.size());
        Iterator it;
        
--- a/tools/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java
@ -5,12 +5,14 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java,v $ 
-* $Date: 2002/05/31 01:41:03 $ 
-* $Revision: 1.7 $
+* $Date: 2002/06/15 02:47:12 $ 
+* $Revision: 1.8 $
 *
 *******************************************************************************
 */

+WARNING: OLD FILE. DON'T COMPILE.
+
 package com.ibm.text.UCA;

 import java.util.*;
@ -21,6 +23,7 @@ import com.ibm.text.UCD.*;
 import com.ibm.text.utility.*;

 public class WriteHTMLCollation implements UCD_Types {
+
    public static final String copyright = 
      "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
      
@ -74,8 +77,8 @@ public class WriteHTMLCollation implements UCD_Types {
        */
        
        // DO FOLLOWING
-        writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE);
-        writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED);
+        //writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE);
+        //writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED);
       
        // SKIP BELOW
        if (true) return;
@ -178,7 +181,7 @@ public class WriteHTMLCollation implements UCD_Types {
        }
        return result.toString();
    }
-    
+    /*
    static void writeConformance(String filename, byte option)  throws IOException {
        PrintWriter log = Utility.openPrintWriter(filename);

@ -193,6 +196,7 @@ public class WriteHTMLCollation implements UCD_Types {
            addStringX(c, option);
        }
        
+
        Hashtable multiTable = collator.getContracting();
        Enumeration enum = multiTable.keys();
        while (enum.hasMoreElements()) {
@ -248,7 +252,8 @@ public class WriteHTMLCollation implements UCD_Types {
        sortedD.clear();
        System.out.println("Done");
    }
-
+    */
+    
    static void addStringX(int x, byte option) {
        addStringX(String.valueOf((char)x), option);
    }
@ -382,7 +387,7 @@ public class WriteHTMLCollation implements UCD_Types {
            
            if (!arraysMatch(kenCes, kenLen, markCes, markLen)) {
                int kenCLen = fixCompatibilityCE(s, true, kenComp, true);
-                String comp = collator.ceToString(kenComp, kenCLen);
+                String comp = CEList.toString(kenComp, kenCLen);
                
                if (arraysMatch(kenCes, kenLen, kenComp, kenCLen)) {
                    forLater.put((char)(COMPRESSED | type) + s, comp);
@ -422,10 +427,10 @@ public class WriteHTMLCollation implements UCD_Types {
            String comp = (String)forLater.get(key);
            
            int kenLen = collator.getCEs(s, decompType, kenCes);
-            String kenStr = collator.ceToString(kenCes, kenLen);
+            String kenStr = CEList.toString(kenCes, kenLen);
            
            int markLen = fixCompatibilityCE(s, true, markCes, false);
-            String markStr = collator.ceToString(markCes, markLen);
+            String markStr = CEList.toString(markCes, markLen);
            
            if ((type & COMPRESSED) != 0) {
                log.println("COMPRESSED #" + (++count) + ": " + ucd.getCodeAndName(s));
@ -444,7 +449,7 @@ public class WriteHTMLCollation implements UCD_Types {
                    log.println("NFD       : " + ucd.getCodeAndName(nfdstr));
                }
                //kenCLen = collator.getCEs(decomp, true, kenComp);
-                //log.println("decomp ce: " + collator.ceToString(kenComp, kenCLen));                   
+                //log.println("decomp ce: " + CEList.toString(kenComp, kenCLen));                   
            }
            log.println();
        }
@ -569,7 +574,7 @@ public class WriteHTMLCollation implements UCD_Types {
        
        {
        int len2 = collator.getCEs("\u2474", true, ces);
-        System.out.println(UCA.ceToString(ces, len2));
+        System.out.println(CEList.toString(ces, len2));

        String a = collator.getSortKey("a");
        String b = collator.getSortKey("A");
@ -640,7 +645,7 @@ public class WriteHTMLCollation implements UCD_Types {
            else if (collator.getTertiary(ce) != collator.getTertiary(lastCE)) relation = "    <<<";
            lastCE = ce;
            if (chr.equals("\u2474")) {
-                System.out.println(UCA.ceToString(ces, len));
+                System.out.println(CEList.toString(ces, len));
            }
            
            // check expansions
@ -653,7 +658,7 @@ public class WriteHTMLCollation implements UCD_Types {
                    int probe = ces[i];
                    String s = getFromBackMap(backMap, probe);
                    if (s == null) {
-                        System.out.println("No back map for " + collator.ceToString(ces[i])
+                        System.out.println("No back map for " + CEList.toString(ces[i])
                            + ": " + ucd.getCodeAndName(chr));
                        expansion += "[" + Utility.hex(ces[i]) + "]";
                    } else {
@ -943,7 +948,7 @@ public class WriteHTMLCollation implements UCD_Types {
                }
                if (sampleEq[sec] == null) sampleEq[sec] = chr;
                if (sampleEq[ter] == null) sampleEq[ter] = chr;
-                oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
+                oldStr.append(CEList.toString(ces[q]));// + "," + Integer.toString(ces[q],16);
                int np = primaryDelta[UCA.getPrimary(ces[q])];
                hexBytes(np, newPrimary);
                hexBytes(fixSecondary(UCA.getSecondary(ces[q])), newSecondary);
@ -968,7 +973,7 @@ public class WriteHTMLCollation implements UCD_Types {
            }
            if (nonePrinted) {
                log.print("[,,]");
-                oldStr.append(UCA.ceToString(0));
+                oldStr.append(CEList.toString(0));
            }
            log.println("    # " + oldStr + " # " + ucd.getName(chr.charAt(0)));
            lastChr = chr;
@ -1017,7 +1022,7 @@ public class WriteHTMLCollation implements UCD_Types {
            summary.print("# " + Utility.hex(i) + ": (" + Utility.hex(newval) + ") "
                + Utility.hex(sampleEq[i]) + " ");
            for (int q = 0; q < len; ++q) {
-                summary.print(UCA.ceToString(ces[q]));
+                summary.print(CEList.toString(ces[q]));
            }
            summary.println(" " + ucd.getName(sampleEq[i]));
        }
@ -1438,7 +1443,7 @@ public class WriteHTMLCollation implements UCD_Types {
        
        for (int i = 0; i <= 0xFFFF; ++i) {
            char c = (char)i;
-            if (EXCLUDE_UNSUPPORTED && !collator.found.get(c)) continue;
+            if (EXCLUDE_UNSUPPORTED && !collator.found.contains(c)) continue;
            if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use
            //if (0xA000 <= c && c <= 0xA48F) continue; // skip YI
            addString(String.valueOf(c), option);
--- a/tools/unicodetools/com/ibm/text/UCD/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Main.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
-* $Date: 2002/06/13 21:14:05 $
-* $Revision: 1.15 $
+* $Date: 2002/06/15 02:47:14 $
+* $Revision: 1.16 $
 *
 *******************************************************************************
 */
@ -63,6 +63,7 @@ public final class Main implements UCD_Types {
            else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
            else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
            else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
+            else if (arg.equalsIgnoreCase("onetime")) VerifyUCD.oneTime();
            else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability();
            
            else if (arg.equalsIgnoreCase("definitionTransliterator")) GenerateHanTransliterator.main(0);
--- a/tools/unicodetools/com/ibm/text/UCD/UCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
-* $Date: 2002/06/13 21:14:05 $
-* $Revision: 1.13 $
+* $Date: 2002/06/15 02:47:13 $
+* $Revision: 1.14 $
 *
 *******************************************************************************
 */
@ -146,7 +146,7 @@ public final class UCD implements UCD_Types {
     * Get the character names for the code points in a string, separated by ", "
     */
    public String getName(String s, byte style) {
-        if (s.length() == 1) return get(s.charAt(0), true).name;
+        if (s.length() == 1) return getName(s.charAt(0), style);
        StringBuffer result = new StringBuffer();
        int cp;
        for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
@ -182,15 +182,15 @@ public final class UCD implements UCD_Types {
    /**
     * Get the name and number (U+xxxx NAME) for a code point
     */
-    public String getCodeAndName(int codePoint) {
-        return getCode(codePoint) + " " + getName(codePoint);
+    public String getCodeAndName(int codePoint, byte type) {
+        return getCode(codePoint) + " " + getName(codePoint, type); // type is handed to getName unchanged
    }

    /**
     * Get the name and number (U+xxxx NAME) for the code points in a string,
     * separated by ", "
     */
-    public String getCodeAndName(String s) {
+    public String getCodeAndName(String s, byte type) {
        if (s == null || s.length() == 0) return "NULL";
        if (s.length() == 1) return getCodeAndName(s.charAt(0)); // fast path
        StringBuffer result = new StringBuffer();
@ -203,6 +203,20 @@ public final class UCD implements UCD_Types {
        return result.toString();
    }

+    /**
+     * Get the name and number (U+xxxx NAME) for a code point, passing NORMAL as the name type
+     */
+    public String getCodeAndName(int codePoint) {
+        return getCodeAndName(codePoint, NORMAL);
+    }
+
+    /**
+     * Get the name and number (U+xxxx NAME) for each code point in a string, passing NORMAL as the name type
+     */
+    public String getCodeAndName(String s) {
+        return getCodeAndName(s, NORMAL);
+    }
+
    /**
     * Get the general category
     */
@ -990,10 +1004,20 @@ to guarantee identifier closure.
            result = getRaw(codePoint);
            if (result == null) {
                result = UData.UNASSIGNED;
-                if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
+                result.name = null; // clean this up, since we reuse UNASSIGNED
+                result.shortName = null;
+                if (fixStrings) {
+                    result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
+                }
            }
-            if (result.shortName != null && result.shortName.length() == 0) {
-                result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
+            if (fixStrings) {
+                if (result.name == null) {
+                    result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
+                    System.out.println("Warning: fixing name for " + result.name);
+                }
+                if (result.shortName == null) {
+                    result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
+                }
            }
            return result;
          case 0x3400: // CJK Ideograph Extension A
@ -1024,6 +1048,8 @@ to guarantee identifier closure.
        result = getRaw(rangeStart);
        if (result == null) {
            result = UData.UNASSIGNED;
+            result.name = null; // clean this up, since we reuse UNASSIGNED
+            result.shortName = null;
            if (fixStrings) {
                result.name = "<reserved-" + Utility.hex(codePoint, 4) + ">";
                result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
@ -1047,6 +1073,32 @@ to guarantee identifier closure.
        return result;
    }
    
+    // Neither Mapped nor Composite CJK: [\u3400-\u4DB5\u4E00-\u9FA5\U00020000-\U0002A6D6]
+    
+    public static final boolean isCJK_AB(int bigChar) { // true when bigChar lies in the CJK_A or the CJK_B range
+        return (CJK_A_BASE <= bigChar && bigChar < CJK_A_LIMIT // half-open: BASE inclusive, LIMIT exclusive
+             || CJK_B_BASE <= bigChar && bigChar < CJK_B_LIMIT);
+    }
+    
+    public static boolean isCJK_BASE(int cp) { // true for [CJK_BASE, CJK_LIMIT) plus the code points listed below
+        return (CJK_BASE <= cp && cp < CJK_LIMIT 
+        || cp == 0xFA0E	// compat characters that don't decompose; every value listed is in 0xFA0E..0xFA2F
+        || cp == 0xFA0F
+        || cp == 0xFA11
+        || cp == 0xFA13
+        || cp == 0xFA14
+        || cp == 0xFA1F
+        || cp == 0xFA21
+        || cp == 0xFA23
+        || cp == 0xFA24
+        || cp == 0xFA27
+        || cp == 0xFA28
+        || cp == 0xFA29
+        || cp == 0xFA2E
+        || cp == 0xFA2F
+        );
+    }
+    
    // Hangul constants

    public static final int
@ -1108,7 +1160,7 @@ to guarantee identifier closure.
        return 0xFFFF; // no composition
    }
    
-    static boolean isHangulSyllable(int char1) {
+    static public boolean isHangulSyllable(int char1) { // true when char1 is in the half-open range [SBase, SLimit)
        return SBase <= char1 && char1 < SLimit;
    }

--- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
-* $Date: 2002/05/29 02:01:00 $
-* $Revision: 1.12 $
+* $Date: 2002/06/15 02:47:13 $
+* $Revision: 1.13 $
 *
 *******************************************************************************
 */
@ -21,8 +21,17 @@ public interface UCD_Types {
    public static final String UCD_DIR = BASE_DIR + "UCD\\";
    public static final String BIN_DIR = BASE_DIR + "BIN\\";
    public static final String GEN_DIR = BASE_DIR + "GEN\\";
-
-
+    
+    public static final int // each *_LIMIT is exclusive: the last code point of the range plus one
+    	CJK_BASE = 0x4E00,
+    	CJK_LIMIT = 0x9FFF+1,
+    	CJK_COMPAT_USED_BASE = 0xFA0E,
+    	CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
+    	CJK_A_BASE = 0x3400,
+    	CJK_A_LIMIT = 0x4DBF+1,
+    	CJK_B_BASE = 0x20000,
+    	CJK_B_LIMIT = 0x2A6DF+1;
+    
    static final byte BINARY_FORMAT = 6; // bumped if binary format of UCD changes
    
    // Unicode Property Types
--- a/tools/unicodetools/com/ibm/text/UCD/UData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
-* $Date: 2002/06/13 21:14:05 $
-* $Revision: 1.4 $
+* $Date: 2002/06/15 02:47:12 $
+* $Revision: 1.5 $
 *
 *******************************************************************************
 */
@ -18,7 +18,7 @@ import com.ibm.text.utility.*;

 class UData implements UCD_Types {
    String name;
-    String shortName = ""; // cache
+    String shortName; // cache
    String decompositionMapping;
    String simpleUppercase;
    String simpleLowercase;
--- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
-* $Date: 2002/06/13 21:14:05 $
-* $Revision: 1.15 $
+* $Date: 2002/06/15 02:47:12 $
+* $Revision: 1.16 $
 *
 *******************************************************************************
 */
@ -27,6 +27,27 @@ import com.ibm.text.utility.*;
 import java.text.NumberFormat;

 public class VerifyUCD implements UCD_Types {
+    
+    static void oneTime() { // ad-hoc diagnostic: prints code, assigned/allocated status and names for a few sample code points
+        Default.setUCD();
+        int[] testSet = {0x10000, 'a', 0xE0000, '\u0221'}; // 10000
+        for (int i = 0; i < testSet.length; ++i) {
+            int item = testSet[i];
+            System.out.println(Default.ucd.getCode(item));
+            
+            boolean ass = Default.ucd.isAssigned(item);
+            System.out.println(ass ? " assigned" : " unassigned");
+            ass = Default.ucd.isAllocated(item); // the flag variable is reused for the second query
+            System.out.println(ass ? " allocated" : " unallocated");
+            
+            String name = Default.ucd.getName(item, SHORT); // SHORT style first
+            System.out.println(" " + name);
+            name = Default.ucd.getName(item); // then getName with no style argument
+            System.out.println(" " + name);
+            
+            System.out.println();
+       }
+    }
 	
 	static final byte NC = UNUSED_CATEGORY;
    
--- a/tools/unicodetools/com/ibm/text/utility/IntStack.java
+++ b/tools/unicodetools/com/ibm/text/utility/IntStack.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/IntStack.java,v $
-* $Date: 2001/09/19 23:33:52 $
-* $Revision: 1.3 $
+* $Date: 2002/06/15 02:47:14 $
+* $Revision: 1.4 $
 *
 *******************************************************************************
 */
@ -17,30 +17,65 @@ package com.ibm.text.utility;
 // Simple stack mechanism, with push, pop and access
 // =============================================================

-public final class IntStack implements Comparable {
+public final class IntStack implements Comparable, Cloneable {
    private int[] values;
    private int top = 0;
+    private int first = 0;

    public IntStack(int initialSize) {
        values = new int[initialSize];
    }
+    
+    public IntStack append(IntStack other) { // pushes other's live elements, oldest first, and returns this
+        // Bounds are captured up front so appending a stack to itself terminates. TODO speed up by copying arrays
+        for (int i = other.first, end = other.top; i < end; ++i) { // start at first: slots below it were popFront()ed
+            push(other.values[i]); // re-read the field each pass: push may swap in a larger array when other == this
+        }
+        return this;
+    }

-    public void push(int value) {
+    public IntStack append(int value) { // synonym for push(value)
+        return push(value);
+    }
+
+    public int length() { // number of elements pushed and not yet removed by pop() or popFront()
+        return top - first; // live elements occupy slots [first, top)
+    }
+
+    public IntStack push(int value) { // NOTE(review): a zero-length values array never grows here (0*2 == 0) — confirm initialSize > 0
        if (top >= values.length) { // must grow?
            int[] temp = new int[values.length*2];
            System.arraycopy(values,0,temp,0,values.length);
            values = temp;
        }
        values[top++] = value;
+        return this; // same stack, which append(int) returns to its caller as-is
    }

    public int pop() {
-        if (top > 0) return values[--top];
+        if (top > first) { // non-empty: live elements occupy slots [first, top)
+            int result = values[--top];
+            if (top == first && first > 0) { // just emptied, and popFront() had advanced first
+                top = first = 0; // rewind so later pushes reuse the array from slot 0
+            }
+            return result;
+        }
+        throw new IllegalArgumentException("Stack underflow");
+    }
+
+
+    public int popFront() { // removes and returns the element at first, i.e. the earliest-pushed live element
+        if (top > first) {
+            int result = values[first++];
+            if (top == first) { // that was the last live element
+                top = first = 0; // rewind so later pushes reuse the array from slot 0
+            }
+            return result;
+        }
        throw new IllegalArgumentException("Stack underflow");
    }

    public int get(int index) {
-        if (0 <= index && index < top) return values[index];
+        if (first <= index && index < top) return values[index]; // NOTE(review): index is an absolute slot, not an offset from first, so get(0) throws once popFront() has advanced first — confirm callers that loop 0..length()
        throw new IllegalArgumentException("Stack index out of bounds");
    }

@ -49,22 +84,24 @@ public final class IntStack implements Comparable {
    }

    public boolean isEmpty() {
-        return top == 0;
+        return top - first == 0; // no live elements left between first and top
    }
    
    public void clear() {
-        top = 0;
+        top = first = 0; // resets both ends; the values array itself is left untouched
    }
    
    public int compareTo(Object other) {
        IntStack that = (IntStack) other;
-        int min = top;
-        if (min < that.top) min = that.top;
-        for (int i = 0; i < min; ++i) {
-            int result = values[i] - that.values[i];
+        int myLen = top - first;
+        int thatLen = that.top - that.first;
+        int limit = first + ((myLen < thatLen) ? myLen : thatLen); // end of the shared prefix, in this stack's slot numbering
+        int delta = that.first - first; // shifts a slot index of this stack to the matching slot of that
+        for (int i = first; i < limit; ++i) {
+            int result = values[i] - that.values[i + delta]; // NOTE(review): int subtraction can overflow and flip the sign for values far apart — confirm the stored range
            if (result != 0) return result;
        }
-        return top - that.top;
+        return myLen - thatLen; // shared prefix is equal: the shorter stack orders first
    }

    public boolean equals(Object other) {
@ -73,9 +110,19 @@ public final class IntStack implements Comparable {

    public int hashCode() {
        int result = top;
-        for (int i = 0; i < top; ++i) {
+        for (int i = first; i < top; ++i) { // NOTE(review): the seed above is top rather than top - first, so equal live contents at different offsets hash differently — confirm equals() agrees
            result = result * 37 + values[i];
        }
        return result;
    }
+    
+    public Object clone() { // copy with its own backing array; top and first are copied by super.clone()
+        try {
+            IntStack result = (IntStack) (super.clone());
+            result.values = (int[]) result.values.clone(); // without this both stacks would write into one array
+            return result;
+        } catch (CloneNotSupportedException e) { // the class declares Cloneable, so super.clone() is permitted
+            throw new IllegalArgumentException("Will never happen");
+        }
+    }
 }