minor additions to change PropertyAlias.txt

X-SVN-Rev: 6812
2025-04-14 17:24:01 +00:00 · 2001-11-13 02:31:55 +00:00 · 2001-11-13 02:31:55 +00:00 · 3405bab3d1
commit 3405bab3d1
parent 9565246f34
6 changed files with 190 additions and 65 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
-* $Date: 2001/10/31 00:02:27 $
-* $Revision: 1.9 $
+* $Date: 2001/11/13 02:31:55 $
+* $Revision: 1.10 $
 *
 *******************************************************************************
 */
@ -424,6 +424,7 @@ public class GenerateData implements UCD_Types {
        Set accumulation = new TreeSet(java.text.Collator.getInstance());
        String spacing;
        
+        /*
        BufferedReader blocks = Utility.openUnicodeFile("Blocks", ucd.getVersion());
        String[] parts = new String[10];
        while (true) {
@ -442,6 +443,7 @@ public class GenerateData implements UCD_Types {
            checkDuplicate(duplicates, accumulation, value, "Block=" + value);
        }
        blocks.close();
+        */
        
        for (int k = 0; k < UCD_Names.NON_ENUMERATED.length; ++k) {
            propAbb = fixGaps(UCD_Names.NON_ENUMERATED[k][0], false);
@ -456,15 +458,19 @@ public class GenerateData implements UCD_Types {
            valueAbb = fixGaps(UCD_Names.SUPER_CATEGORIES[k][0], false);
            value = fixGaps(UCD_Names.SUPER_CATEGORIES[k][1], true);
            spacing = Utility.repeat(" ", 10-valueAbb.length());
-            sorted.add("gc; " + valueAbb + spacing + "; " + value);
+            String baseLine = "gc; " + valueAbb + spacing + "; " + value;
+            spacing = Utility.repeat(" ", 50-baseLine.length());
+            sorted.add(baseLine + spacing + "# " + UCD_Names.SUPER_CATEGORIES[k][2]);
            checkDuplicate(duplicates, accumulation, value, "General_Category=" + value);
            if (!value.equals(valueAbb)) checkDuplicate(duplicates, accumulation, valueAbb, "General_Category=" + value);
        }
        
+        /*
        sorted.add("xx; T         ; True");
        checkDuplicate(duplicates, accumulation, "T", "xx=True");
        sorted.add("xx; F         ; False");
        checkDuplicate(duplicates, accumulation, "F", "xx=False");
+        */
        sorted.add("qc; Y         ; Yes");
        checkDuplicate(duplicates, accumulation, "Y", "qc=Yes");
        sorted.add("qc; N         ; No");
@ -507,6 +513,10 @@ public class GenerateData implements UCD_Types {
                if (value.startsWith("Fixed_")) { continue; }
            }
            
+            if (type == JOINING_GROUP) {
+                valueAbb = "n/a";
+            }
+            
            /*
            String elide = "";
            if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
@ -546,7 +556,18 @@ public class GenerateData implements UCD_Types {
        log.println("# Generated: " + new Date() + ", MD");
        log.println(HORIZONTAL_LINE);
        log.println();
-        Utility.print(log, sorted, "\r\n", new MyBreaker());
+        Utility.print(log, sorted, "\r\n", new MyBreaker(true));
+        log.close();
+        
+        log = Utility.openPrintWriter("PropertyValueAliases-" + ucd.getVersion() + "dX.txt");
+        Utility.appendFile("PropertyValueAliasHeader.txt", false, log);
+        log.println("# Generated: " + new Date() + ", MD");
+        log.println(HORIZONTAL_LINE);
+        log.println();
+        Utility.print(log, sorted, "\r\n", new MyBreaker(false));
+        log.close();
+        
+        log = Utility.openPrintWriter("PropertyAliasSummary-" + ucd.getVersion() + "dX.txt");
        log.println();
        log.println(HORIZONTAL_LINE);
        log.println();
@ -555,20 +576,43 @@ public class GenerateData implements UCD_Types {
        log.println("# Note: no two property names can be the same,");
        log.println("# nor can two property value names for the same property be the same.");
        log.println();
-        Utility.print(log, accumulation, "\r\n", new MyBreaker());
+        Utility.print(log, accumulation, "\r\n", new MyBreaker(false));
        log.println();
        log.close();
    }
    
    static class MyBreaker implements Utility.Breaker {
+        // Decides which sorted lines Utility.print emits and how each one is rendered.
+        //   status == true : keep only lines starting with "AA", "BB" or "ZZ", print them
+        //                    without their first four characters, and put a banner in front
+        //                    of the first line of each such group.
+        //   status == false: keep all other lines and print them unchanged.
+        boolean status;
+        
+        public MyBreaker(boolean status) {
+            this.status = status;
+        }
+        
+        // true means "keep this line"; AA/BB/ZZ lines are kept exactly when status is true.
+        public boolean filter(Object current) {
+            String c = current.toString();
+            if (c.startsWith("AA") || c.startsWith("BB") || c.startsWith("ZZ")) return status;
+            return !status;
+        }
+        
+        // Returns the text to print for current: an optional group separator followed by the
+        // (possibly shortened) line. old is the previously printed line, or null for the first.
        public String get(Object current, Object old) {
-            if (old == null) return "";
+            if (old == null) {
+                // No previous line: compare against two blanks, so a first line that does
+                // not itself start with two blanks is treated as opening a new group.
+                old = "  ";
+            }
            String c = current.toString();
            String o = old.toString();
-            if (c.length() >= 2 && o.length() >= 0 && !c.substring(0,2).equals(o.substring(0,2))) {
-                return "\r\n";
+            String sep = "";
+            // Group key is the first two characters; strings shorter than that are compared
+            // whole instead of throwing StringIndexOutOfBoundsException.
+            String cKey = c.length() > 2 ? c.substring(0,2) : c;
+            String oKey = o.length() > 2 ? o.substring(0,2) : o;
+            if (!cKey.equals(oKey)) {
+                sep = "\r\n";
+                if (status) {
+                    if (c.startsWith("AA")) sep = sep + HORIZONTAL_LINE + sep + "# Non-enumerated Properties" + sep + HORIZONTAL_LINE + sep;
+                    if (c.startsWith("BB")) sep = sep + HORIZONTAL_LINE + sep + "# Enumerated Non-Binary Properties" + sep + HORIZONTAL_LINE + sep;
+                    if (c.startsWith("ZZ")) sep = sep + HORIZONTAL_LINE + sep + "# Binary Properties" + sep + HORIZONTAL_LINE + sep;
+                }
            }
-            return "";
+            if (status) {
+                // Drop the first four characters, tolerating lines shorter than that.
+                c = c.substring(Math.min(4, c.length()));
+            }
+            return sep + c;
        }
    }
    
--- a/tools/unicodetools/com/ibm/text/UCD/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Main.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
-* $Date: 2001/10/25 20:33:46 $
-* $Revision: 1.4 $
+* $Date: 2001/11/13 02:31:55 $
+* $Revision: 1.5 $
 *
 *******************************************************************************
 */
@ -41,7 +41,8 @@ public final class Main {

            else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
            else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
-            else if (arg.equalsIgnoreCase("checkCase2")) VerifyUCD.checkCase2();
+            else if (arg.equalsIgnoreCase("checkCaseLong")) VerifyUCD.checkCase2(true);
+            else if (arg.equalsIgnoreCase("checkCaseShort")) VerifyUCD.checkCase2(false);
            else if (arg.equalsIgnoreCase("checkCanonicalProperties")) VerifyUCD.checkCanonicalProperties();
            else if (arg.equalsIgnoreCase("CheckCaseFold")) VerifyUCD.CheckCaseFold();
            else if (arg.equalsIgnoreCase("idn")) VerifyUCD.VerifyIDN();
--- a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
@ -1,46 +1,30 @@
 # DRAFT
 # PropertyAliases-3.2.0.txt
 #
-# This file contains aliases for properties and property values used in the UCD.
+# This file contains aliases for properties used in the UCD.
 # These names can be used for XML formats of UCD data, for regular-expression
 # property tests, and other programmatic textual descriptions of Unicode data.
-# The names are not normative, except where they correspond to normative values
-# in the UCD.
+# The names are not normative, except where they correspond to normative
+# properties in the UCD. For information on which properties are normative,
+# see UnicodeCharacterDatabase.html.
 #
 # The names may be translated in appropriate environments, and additional
 # aliases may be useful.
 #
 # FORMAT
 #
-# Each line has three fields, separated by semicolons.
+# Each line has two fields, separated by semicolons.
 #
-# First Field: Where the first field is AA, BB, or ZZ, then the line describes a property name:
+# First Field: The first field is an abbreviated name for the property.
 #
-# AA - non-enumerated properties
-# BB - enumerated, non-binary properties
-# ZZ - binary properties and quick-check properties
-#
-# (The values AA, BB, and ZZ are arbitrary -- they were simply chosen to distinguish
-# the different types.)
-#
-# Where the first field is not one of the above, the line describes a
-# property value name. The first field describes the property for which that
-# property value name is used. There are two special properties:
-#
-# xx stands for any binary property
-# qc stands for any quick-check property
-#
-# Second Field: The second field is an abbreviated name.
-# If there is no abbreviated name available, the field is marked with "n/a".
-#
-# Third Field: The third field is a long name.
+# Second Field: The second field is a long name.
 #
 # With loose matching of property names, the case distinctions, whitespace,
 # and '_' are ignored.
 #
 # NOTE: Currently there is at most one abbreviated name and one long name for
-# each property and property value. However, in the future additional aliases
-# may be added. In such a case, the first line for the property or property value
+# each property. However, in the future additional aliases
+# may be added. In such a case, the first line for the property
 # would have the preferred alias for output.
 #
 # NOTE: The property value names are NOT unique across properties, especially
@ -53,7 +37,5 @@
 # cc means Combining_Class property, and
 # cc means the General_Category property value Control (cc)
 #
-# Comments at the end of the file show cases of non-unique names.
-#
 # The combination of property value and property name is, however, unique.
 # For more information, see UTR #24: Regular Expression Guidelines
--- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
-* $Date: 2001/10/31 00:02:27 $
-* $Revision: 1.6 $
+* $Date: 2001/11/13 02:31:55 $
+* $Revision: 1.7 $
 *
 *******************************************************************************
 */
@ -32,7 +32,10 @@ final class UCD_Names implements UCD_Types {
        {"stc", "Simple_Titlecase_Mapping"},
        {"sfc", "Simple_Case_Folding"},
        {"scc", "Special_Case_Condition"},
-        {"blk", "Block"}
+        {"blk", "Block"},
+        {"na1", "Unicode_1_Name"},
+        {"isc", "ISO_Comment"},
+        {"age", "Age"},
    };

    static final String[] UNIFIED_PROPERTIES = {
@ -406,13 +409,14 @@ final class UCD_Names implements UCD_Types {
    };

    static final String[][] SUPER_CATEGORIES = {
-        {"L", "Letter"},
-        {"M", "Mark"},
-        {"N", "Number"},
-        {"Z", "Separator"},
-        {"C", "Other"},
-        {"S", "Symbol"},
-        {"P", "Punctuation"},
+        {"L", "Letter", "Ll | Lm | Lo | Lt | Lu"},
+        {"M", "Mark", "Mc | Me | Mn"},
+        {"N", "Number", "Nd | Nl | No"},
+        {"Z", "Separator", "Zl | Zp | Zs"},
+        {"C", "Other", "Cc | Cf | Cn | Co | Cs"},
+        {"S", "Symbol", "Sc | Sk | Sm | So"},
+        {"P", "Punctuation", "Pc | Pd | Pe | Pf | Pi | Po | Ps"},
+        {"Lc", "Cased Letter", "Ll | Lt | Lu"},
    };


--- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
-* $Date: 2001/10/25 20:33:46 $
-* $Revision: 1.6 $
+* $Date: 2001/11/13 02:31:55 $
+* $Revision: 1.7 $
 *
 *******************************************************************************
 */
@ -141,43 +141,109 @@ public class VerifyUCD implements UCD_Types {
        log.close();
    }

-    public static void checkCase2() throws IOException {
+    public static void checkCase2(boolean longForm) throws IOException {
        Utility.fixDot();
        System.out.println("checkCase");
        ucd = UCD.make(Main.ucdVersion);
        initNormalizers();
-        System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
+        
+        /*String tx1 = "\u0391\u0342\u0345";
+        String ux1 = "\u0391\u0342\u0399";
+        String ctx1 = nfc.normalize(tx1);
+        String ctx2 = nfc.normalize(ux1); // wrong??
+
+        //System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
+        */
+        
+        
        String fileName = "CaseNormalizationDifferences.txt";
        PrintWriter log = Utility.openPrintWriter(fileName);

        log.println("Differences between case(normalize(cp)) and normalize(case(cp))");
        log.println("u, l, t - upper, lower, title");
        log.println("c, d - nfc, nfd");
+        
+        //Utility.DOTMASK = 0x7F;

        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            Utility.dot(cp);
            if (!ucd.isRepresented(cp) || ucd.isPUA(cp)) continue;
-            if (cp == '\u3371') {
+            if (cp == '\u0130') {
               System.out.println("debug");
            }

            String x = UTF32.valueOf32(cp);
+            String dx = nfd.normalize(cp);
+            String cx = nfc.normalize(cp);

            String ux = ucd.getCase(x, FULL, UPPER);
            String lx = ucd.getCase(x, FULL, LOWER);
            String tx = ucd.getCase(x, FULL, TITLE);
-
-            String dux = nfd.normalize(ux);
-            String dlx = nfd.normalize(lx);
-            String dtx = nfd.normalize(tx);
+            
+            if (x.equals(dx) && dx.equals(cx) && cx.equals(ux) && ux.equals(lx) && lx.equals(tx)) continue;

            String cux = nfc.normalize(ux);
            String clx = nfc.normalize(lx);
            String ctx = nfc.normalize(tx);
+            
+            if (x.equals(cx)) {
+                boolean needBreak = false;
+                if (!clx.equals(lx)) needBreak = true;
+                if (!ctx.equals(tx)) needBreak = true;
+                if (!cux.equals(ux)) needBreak = true;
+                
+                if (needBreak) {
+                    log.println("# Was not NFC:");
+                    log.println(
+                        "## " + Utility.hex(x) + "; "
+                        + Utility.hex(lx) + "; "
+                        + Utility.hex(tx) + "; "
+                        + Utility.hex(ux) + "; # "
+                        + ucd.getName(x));
+                    log.println("#   should be:");
+                    log.println(
+                        Utility.hex(x) + "; "
+                        + Utility.hex(clx) + "; "
+                        + Utility.hex(ctx) + "; "
+                        + Utility.hex(cux) + "; # "
+                        + ucd.getName(x));
+                    log.println();
+                }
+            }
+                       
+            String dux = nfd.normalize(ux);
+            String dlx = nfd.normalize(lx);
+            String dtx = nfd.normalize(tx);
+            
+            
+            
+            String startdx = getMarks(dx, false);
+            String enddx = getMarks(dx, true);

-            String dx = nfd.normalize(cp);
-            String cx = nfc.normalize(cp);
+            String startdux = getMarks(dux, false);
+            String enddux = getMarks(dux, true);

+            String startdtx = getMarks(dtx, false);
+            String enddtx = getMarks(dtx, true);
+
+            String startdlx = getMarks(dlx, false);
+            String enddlx = getMarks(dlx, true);
+            
+            // If the new marks don't occur in the old decomposition, we got a problem!
+            
+            if (!startdx.startsWith(startdux) || !startdx.startsWith(startdtx) || !startdx.startsWith(startdlx)
+              || !enddx.endsWith(enddux) || !enddx.endsWith(enddtx) || !enddx.endsWith(enddlx)) {
+                log.println("Combining Class Difference for " + ucd.getCodeAndName(x));
+                log.println("x:  " + ucd.getCodeAndName(dx) + ", " + Utility.hex(startdx) + ", " + Utility.hex(enddx));
+                log.println("ux: " + ucd.getCodeAndName(dux) + ", " + Utility.hex(startdux) + ", " + Utility.hex(enddux));
+                log.println("tx: " + ucd.getCodeAndName(dtx) + ", " + Utility.hex(startdtx) + ", " + Utility.hex(enddtx));
+                log.println("lx: " + ucd.getCodeAndName(dlx) + ", " + Utility.hex(startdlx) + ", " + Utility.hex(enddlx));
+                log.println();
+            }
+            
+
+            if (!longForm) continue;
+                        
            String udx = ucd.getCase(dx, FULL, UPPER);
            String ldx = ucd.getCase(dx, FULL, LOWER);
            String tdx = ucd.getCase(dx, FULL, TITLE);
@ -286,6 +352,28 @@ public class VerifyUCD implements UCD_Types {

        log.close();
    }
+    
+    /**
+     * Extracts the run of code points with non-zero combining class at one edge of a string.
+     *
+     * @param s     text to inspect, walked code point by code point via UTF16
+     * @param doEnd false to take the run at the start of s (everything before the first
+     *              code point whose combining class is 0); true to take the run at the
+     *              end of s (everything after the last such code point)
+     * @return the leading or trailing run; s itself when no code point in s has
+     *         combining class 0, which includes the empty string
+     */
+    public static String getMarks(String s, boolean doEnd) {
+        if (doEnd) {
+            int end = s.length();
+            while (end > 0) {
+                // Reads the code point ending at end-1; per the original note, charAt
+                // backs up to the lead surrogate when end-1 is a trail surrogate.
+                int ch = UTF16.charAt(s, end - 1);
+                if (ucd.getCombiningClass(ch) == 0) {
+                    return s.substring(end);
+                }
+                end -= UTF16.getCharCount(ch);
+            }
+            return s;
+        }
+
+        int start = 0;
+        while (start < s.length()) {
+            int ch = UTF16.charAt(s, start);
+            if (ucd.getCombiningClass(ch) == 0) {
+                return s.substring(0, start);
+            }
+            start += UTF16.getCharCount(ch);
+        }
+        return s;
+    }

    static final String names[] = {"LOWER", "TITLE", "UPPER", "(UNC)", "MIXED"};
    static final String lowerNames[] = {"", "Other_Lower"};
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2001/10/31 00:02:54 $
-* $Revision: 1.7 $
+* $Date: 2001/11/13 02:31:34 $
+* $Revision: 1.8 $
 *
 *******************************************************************************
 */
@ -30,9 +30,11 @@ public final class Utility {    // COMMON UTILITIES
    }

    private static boolean needCRLF = false;
+    
+    public static int DOTMASK = 0x7FF;

    public static void dot(int i) {
-        if ((i % 0x7FF) == 0) {
+        if ((i % DOTMASK) == 0) {
            needCRLF = true;
            System.out.print('.');
        }
@ -458,6 +460,7 @@ public final class Utility {    // COMMON UTILITIES
    
    public interface Breaker {
        public String get(Object current, Object old);
+        public boolean filter(Object current); // true is keep
    }
    
    public static void print(PrintWriter pw, Collection c, String separator, Breaker b) {
@ -466,14 +469,17 @@ public final class Utility {    // COMMON UTILITIES
        Object last = null;
        while (it.hasNext()) {
            Object obj = it.next();
+            if (b != null && !b.filter(obj)) continue;
            if (first) {
                first = false;
+            } else {
+                pw.print(separator);
            }
-            else pw.print(separator);
            if (b != null) {
                pw.print(b.get(obj, last));
+            } else {
+                pw.print(obj);
            }
-            pw.print(obj);
            last = obj;
        }
    }