fixes for collation charts

X-SVN-Rev: 6436
2025-04-06 14:05:32 +00:00 · 2001-10-25 20:35:42 +00:00 · 2001-10-25 20:35:42 +00:00 · ab4045e909
commit ab4045e909
parent 815bc3f18a
4 changed files with 187 additions and 33 deletions
--- a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
+++ b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $ 
-* $Date: 2001/09/19 23:32:21 $ 
-* $Revision: 1.4 $
+* $Date: 2001/10/25 20:35:42 $ 
+* $Revision: 1.5 $
 *
 *******************************************************************************
 */
@ -30,6 +30,33 @@ public class GenOverlap implements UCD_Types {
    static Normalizer nfd;
    static Normalizer nfkd;
    
+    /**
+     * Cross-checks the collator against NFKD decompositions.
+     * <p>
+     * Walks every code point from 0 through 0x10FFFF. For each one that the UCD
+     * reports as represented and whose decomposition type is at or above
+     * {@code UCD.COMPATIBILITY}, two CE lists are built: one from the NFKD form
+     * (with tertiary weights remapped for the original character) and one from
+     * the character itself without decomposition. Whenever the two lists differ,
+     * the code point's code-and-name and both lists are printed to System.out.
+     *
+     * @param collatorIn the collator to validate; also stored in the static
+     *                   {@code collator} field used by the getCEList helpers
+     * @throws Exception propagated from the UCD, normalizer or collator calls
+     */
+    public static void validateUCA(UCA collatorIn) throws Exception {
+        collator = collatorIn;
+        ucd = UCD.make();
+
+        nfd = new Normalizer(Normalizer.NFD);
+        nfkd = new Normalizer(Normalizer.NFKD);
+
+        for (int codePoint = 0x0; codePoint <= 0x10FFFF; ++codePoint) {
+            Utility.dot(codePoint);
+            if (!ucd.isRepresented(codePoint)) {
+                continue;
+            }
+
+            // Only characters with a compatibility-or-higher decomposition type are checked.
+            byte decompType = ucd.getDecompositionType(codePoint);
+            if (decompType < UCD.COMPATIBILITY) {
+                continue;
+            }
+
+            String nfkdForm = nfkd.normalize(codePoint);
+            CEList fromDecomposition = getCEList(codePoint, nfkdForm, true, decompType);
+            CEList fromCharacter = getCEList(UTF16.valueOf(codePoint), false);
+            if (fromCharacter.equals(fromDecomposition)) {
+                continue;
+            }
+
+            // Mismatch: report the character, then its own CEs, then the derived CEs.
+            Utility.fixDot();
+            System.out.println();
+            System.out.println(ucd.getCodeAndName(codePoint));
+            System.out.println(fromCharacter);
+            System.out.println(fromDecomposition);
+        }
+    }
+    
    public static void test(UCA collatorIn) throws Exception {
        collator = collatorIn;
            
@ -68,7 +95,7 @@ public class GenOverlap implements UCD_Types {
            byte decompType = ucd.getDecompositionType(cp);
            if (decompType >= UCD.COMPATIBILITY) {
                String decomp = nfkd.normalize(cp);
-                CEList celist = getCEList(cp, decomp, decompType);
+                CEList celist = getCEList(cp, decomp, true, decompType);
                addString(decomp, celist);
                System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + celist);
            }
@ -182,16 +209,22 @@ public class GenOverlap implements UCD_Types {
    }
  
    static private CEList getCEList(String s) {
-        int len = collator.getCEs(s, true, ces);
+        return getCEList(s, true);
+    }
+    
+    static private CEList getCEList(String s, boolean decomp) {
+        int len = collator.getCEs(s, decomp, ces);
        return new CEList(ces, 0, len);
    }
  
-    static private CEList getCEList(int originalChar, String s, byte type) {
-        int len = collator.getCEs(s, true, ces);
-        for (int i = 0; i < len; ++i) {
-            ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]), 
-                UCA.getSecondary(ces[i]),
-                CEList.remap(originalChar, type, UCA.getTertiary(ces[i])));
+    static private CEList getCEList(int originalChar, String s, boolean decomp, byte type) {
+        int len = collator.getCEs(s, decomp, ces);
+        if (decomp) {
+            for (int i = 0; i < len; ++i) {
+                ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]), 
+                    UCA.getSecondary(ces[i]),
+                    CEList.remap(originalChar, type, UCA.getTertiary(ces[i])));
+            }
        }
        return new CEList(ces, 0, len);
    }
@ -290,7 +323,7 @@ public class GenOverlap implements UCD_Types {
    }
    
    public static void generateRevision (UCA collatorIn) throws Exception {
-        generateRevision(collatorIn, false);
+        //generateRevision(collatorIn, false);
        generateRevision(collatorIn, true);
    }
        
@ -336,7 +369,7 @@ public class GenOverlap implements UCD_Types {
            int cp;
            for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
                cp = UTF16.charAt(str, i);
-                if (0xFF67 <= cp && cp <= 0xFF6F) {
+                if (0xFF3F == cp) {
                    System.out.println("debug");
                }
                boolean mashLast = false;
@ -351,7 +384,7 @@ public class GenOverlap implements UCD_Types {
                            int s = UCA.getSecondary(ces[j]);
                            boolean needsFix = (s != 0x20 && p != 0);
                            if (needsFix) ++len;
-                            int t = (doMax && len > 1 && j == len-1 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
+                            int t = (doMax && j > 0 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j])));
                            if (needsFix) {
                                ces[j++] = UCA.makeKey(p, 0x20, t);             // Set Extra
                                System.arraycopy(ces, j, ces, j+1, len - j);    // Insert HOLE!
@ -413,7 +446,7 @@ public class GenOverlap implements UCD_Types {
        newKeys.removeAll(joint);
        oldKeys.removeAll(joint);
        
-        PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"));
+        PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), false);
        Iterator it = list.iterator();
        int last = -1;
        while (it.hasNext()) {
@ -657,4 +690,51 @@ public class GenOverlap implements UCD_Types {
            + "</td><td align='right'>" + nf.format(sd)
            + "</td></tr>");            
    }
+    
+    public static void listCyrillic(UCA collatorIn) throws IOException {
+        PrintWriter log = Utility.openPrintWriter("ListCyrillic.txt", false);
+        Set set = new TreeSet(collatorIn);
+        Set set2 = new TreeSet(collatorIn);
+        ucd = UCD.make();
+        
+        nfd = new Normalizer(Normalizer.NFD);
+        
+        for (char i = 0; i < 0xFFFF; ++i) {
+            Utility.dot(i);
+            if (!ucd.isRepresented(i)) continue;
+            if (ucd.getScript(i) != CYRILLIC_SCRIPT) continue;
+            
+            String decomp = nfd.normalize(String.valueOf(i));
+            String oldDecomp = decomp;
+            for (int j = 0; j < decomp.length(); ++j) {
+                if (ucd.getCategory(decomp.charAt(j)) == Mn) {
+                    decomp = decomp.substring(0,j) + decomp.substring(j+1);
+                }
+            }
+            if (decomp.length() == 0) continue;
+            
+            set.add(decomp);
+            if (!decomp.equals(oldDecomp)) set2.add(oldDecomp);
+        }
+        
+        Iterator it = set.iterator();
+        while (it.hasNext()) {
+            String s = (String) it.next();
+            String name = ucd.getName(s.charAt(0));
+            Utility.replace(name, "CYRILLIC ", "");
+            log.println("# " + s + " <> XXX ; # " + name);
+        }
+ 
+        it = set2.iterator();
+        while (it.hasNext()) {
+            String s = (String) it.next();
+            String name = ucd.getName(s.charAt(0));
+            Utility.replace(name, "CYRILLIC ", "");
+            log.println("### " + s + " <> XXX ; # " + name);
+        }
+        
+        log.close();
+    }
+    
+    
 }
--- a/tools/unicodetools/com/ibm/text/UCA/UCA.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ 
-* $Date: 2001/09/19 23:32:21 $ 
-* $Revision: 1.4 $
+* $Date: 2001/10/25 20:35:41 $ 
+* $Revision: 1.5 $
 *
 *******************************************************************************
 */
@ -61,9 +61,13 @@ This is because of shared
 characters between scripts with different directions, like French with Arabic or Greek.
 */

-final public class UCA {
+final public class UCA implements Comparator {
    public static final String copyright = 
      "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
+      
+    public int compare(Object a, Object b) {
+        return getSortKey((String) a).compareTo(getSortKey((String) b));
+    }

    /**
     * Version of the UCA tables to use
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $ 
-* $Date: 2001/09/19 23:31:50 $ 
-* $Revision: 1.1 $
+* $Date: 2001/10/25 20:35:41 $ 
+* $Revision: 1.2 $
 *
 *******************************************************************************
 */
@ -30,6 +30,7 @@ public class WriteCharts implements UCD_Types {
        
        ucd = UCD.make();
        Normalizer nfd = new Normalizer(Normalizer.NFD);
+        Normalizer nfc = new Normalizer(Normalizer.NFC);
          
        UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps
          
@ -53,18 +54,23 @@ public class WriteCharts implements UCD_Types {
        
        int lastPrimary = -1;
        
-        String lastSortKey = null;
+        String lastSortKey = "\u0000";
        
        int high = uca.getSortKey("a").charAt(0);
        int variable = UCA.getPrimary(uca.getVariableHigh());
        
        int columnCount = 0;
        
+        Utility.copyTextFile("index.html", true, "CollationCharts\\index.html");
+        Utility.copyTextFile("charts.css", false, "CollationCharts\\charts.css");
+        Utility.copyTextFile("help.html", true, "CollationCharts\\help.html");
+        
        indexFile = Utility.openPrintWriter("CollationCharts\\index_list.html");

        indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
        indexFile.println("<title>UCA Default Collation Table</title>");
        indexFile.println("<base target='main'>");
+        indexFile.println("<style><!-- p { font-size: 90% } --></style>");
        indexFile.println("</head><body><h2 align='center'>UCA Default Collation Table</h2>");
        indexFile.println("<p align='center'><a href = 'help.html'>Help</a>");
        
@ -102,19 +108,31 @@ public class WriteCharts implements UCD_Types {
                oldScript = script;
            }
            
-            int strength = 6;
-            if (lastSortKey != null && sortKey.charAt(0) == lastSortKey.charAt(0)) {
-                strength = uca.strengthDifference(sortKey, lastSortKey);
-                if (strength < 0) strength = -strength;
-            }
+            boolean firstPrimaryEquals = sortKey.charAt(0) == lastSortKey.charAt(0);
+            
+            int strength = uca.strengthDifference(sortKey, lastSortKey);
+            if (strength < 0) strength = -strength;
            lastSortKey = sortKey;
+            
+            // find out if this is an expansion: more than one primary weight
+            
+            int primaryCount = 0;
+            for (int i = 0; i < sortKey.length(); ++i) {
+                char w = sortKey.charAt(i);
+                if (w == 0) break;
+                ++ primaryCount;
+            }
+            
            String breaker = "";
-            if (columnCount > 10 || strength > 5) {
-                if (strength <= 5) breaker = "</tr><tr><td></td>";
-                else breaker = "</tr><tr>";
+            if (columnCount > 10 || !firstPrimaryEquals) {
+                if (!firstPrimaryEquals) breaker = "</tr><tr>";
+                else breaker = "</tr><tr><td></td>"; // indent 1 cell
                columnCount = 0;
            }
-            output.println(breaker + CLASSNAME[strength] + s 
+            
+            String classname = primaryCount > 1 ? XCLASSNAME[strength] : CLASSNAME[strength];
+            
+            output.println(breaker + classname + nfc.normalize(s) 
                + "<br><tt>" + Utility.hex(s) 
                //+ "<br>" + script
                //+ "<br>" + UCA.toString(sortKey) 
@ -133,8 +151,15 @@ public class WriteCharts implements UCD_Types {
        "<td class='q'>", 
        "<td class='t'>", 
        "<td class='s'>", 
-        "<td class='p'>", 
-        "<td class='f'>"};
+        "<td class='p'>"};
+        
+    static final String[] XCLASSNAME = {
+        "<td class='eq'>", 
+        "<td class='eq'>", 
+        "<td class='eq'>", 
+        "<td class='et'>", 
+        "<td class='es'>", 
+        "<td class='ep'>"};
        

    static PrintWriter indexFile;
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ 
-* $Date: 2001/09/19 23:32:21 $ 
-* $Revision: 1.4 $
+* $Date: 2001/10/25 20:35:41 $ 
+* $Revision: 1.5 $
 *
 *******************************************************************************
 */
@ -63,9 +63,13 @@ public class WriteCollationData implements UCD_Types {
            String arg = args[i];
            if      (arg.equalsIgnoreCase("WriteRulesWithNames")) writeRules(WITH_NAMES);
            else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(collator);
+            else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(collator);
+            else if (arg.equalsIgnoreCase("writeNonspacingDifference")) writeNonspacingDifference();
+            
            else if (arg.equalsIgnoreCase("WriteCharts")) WriteCharts.test(collator);
            else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(collator);
            else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(collator);
+            else if (arg.equalsIgnoreCase("listCyrillic")) GenOverlap.listCyrillic(collator);
            
            else if (arg.equalsIgnoreCase("WriteRules")) writeRules(WITHOUT_NAMES);
            else if (arg.equalsIgnoreCase("WriteRulesXML")) writeRules(IN_XML);
@ -748,6 +752,47 @@ public class WriteCollationData implements UCD_Types {
        return len;
    }
    
+    /**
+     * Writes GEN_DIR + "UCA_Nonspacing.txt" (UTF-8, starting with U+FEFF).
+     * <p>
+     * Lists every represented code point for which "general category is Mn or
+     * Me" and "every collation element has a zero primary weight" disagree.
+     * Each entry consists of the category ID, the CE list and the code-and-name,
+     * tab-separated; entries are collected in a TreeSet so they are written in
+     * sorted order, separated by "\r\n".
+     *
+     * @throws IOException if the output file cannot be created
+     */
+    static void writeNonspacingDifference() throws IOException {
+        PrintWriter diLog = new PrintWriter(
+            new BufferedWriter(
+                new OutputStreamWriter(
+                    new FileOutputStream(GEN_DIR + "UCA_Nonspacing.txt"),
+                    "UTF8"),
+                32*1024));
+        try {
+            diLog.write('\uFEFF');
+
+            Set sorted = new TreeSet();
+
+            // Inclusive upper bound, matching the full code point range scanned elsewhere.
+            for (int i = 0; i <= 0x10FFFF; ++i) {
+                Utility.dot(i);
+                if (!ucd.isRepresented(i)) continue;
+                byte cat = ucd.getCategory(i);
+                boolean isNonSpacing = cat == Mn || cat == Me;
+                CEList celist = collator.getCEList(UTF32.valueOf32(i), true);
+
+                // Primary-ignorable here means: no CE in the list has a non-zero primary.
+                boolean isPrimaryIgnorable = true;
+                for (int j = 0; j < celist.length(); ++j) {
+                    if (UCA.getPrimary(celist.at(j)) != 0) {
+                        isPrimaryIgnorable = false;
+                        break;
+                    }
+                }
+
+                if (isNonSpacing != isPrimaryIgnorable) {
+                    sorted.add(ucd.getCategoryID(i)
+                        + "\t" + celist
+                        + "\t" + ucd.getCodeAndName(i));
+                }
+            }
+
+            Utility.print(diLog, sorted, "\r\n");
+        } finally {
+            // Release the file handle even when the scan throws.
+            diLog.close();
+        }
+    }
+    
    static void writeContractions() throws IOException {
        PrintWriter diLog = new PrintWriter(
            new BufferedWriter(