more updates

X-SVN-Rev: 11332
2025-04-05 21:45:37 +00:00 · 2003-03-15 02:36:49 +00:00 · 2003-03-15 02:36:49 +00:00 · 53394d58a2
commit 53394d58a2
parent c31d7e59cd
11 changed files with 127 additions and 72 deletions
--- a/tools/unicodetools/com/ibm/text/UCA/UCA.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ 
-* $Date: 2002/07/03 02:15:47 $ 
-* $Revision: 1.17 $
+* $Date: 2003/03/15 02:36:49 $ 
+* $Revision: 1.18 $
 *
 *******************************************************************************
 */
@ -1109,7 +1109,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
        int itemInRange = startOfRange;
        int skip = 1;
        boolean doSamples = false;
-        UnicodeSetIterator usi = new UnicodeSetIterator();
+        AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
        
        /**
         * use FIXED_CE as the limit
@ -1120,8 +1120,8 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
            this.nfkd = new Normalizer(Normalizer.NFKD, unicodeVersion);
            this.skipDecomps = skipDecomps;
            currentRange = 0;
-            usi.reset(unspecified);
-            usi.setAbbreviated(true);
+            usi.reset(unspecified, true);
+            //usi.setAbbreviated(true);
            
            // FIX SAMPLES
            if (SAMPLE_RANGES[0][0] == 0) {
@ -1204,8 +1204,8 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
                    }
                }
                unspecified = temp;
-                usi.reset(unspecified);
-                usi.setAbbreviated(true);
+                usi.reset(unspecified, true);
+                //usi.setAbbreviated(true);
                if (DEBUG) System.out.println("Unspecified = " + unspecified.toPattern(true));
                haveUnspecified = true;
             }
--- a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt
@ -11,6 +11,8 @@
 # (where string lengths may grow). Note that where they can be supported, the
 # full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
 #
+# All code points not listed in this file map to themselves.
+#
 # NOTE: case folding does not preserve normalization formats!
 #
 # For information on case folding, see
--- a/tools/unicodetools/com/ibm/text/UCD/Default.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Default.java
@ -16,6 +16,10 @@ public final class Default implements UCD_Types {
    public static Normalizer nfkd;
    public static Normalizer[] nf = new Normalizer[4];
    
+    public static void ensureUCD() {
+    	if (ucd == null) setUCD();
+    }
+    
    public static void setUCD(String version) {
    	ucdVersion = version;
    	setUCD();
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
-* $Date: 2003/03/12 16:01:26 $
-* $Revision: 1.25 $
+* $Date: 2003/03/15 02:36:48 $
+* $Revision: 1.26 $
 *
 *******************************************************************************
 */
@ -141,6 +141,39 @@ public class GenerateData implements UCD_Types {
            + ".html";
    }
    
+    public static void checkDifferences (String targetVersion) throws IOException {
+        System.out.println("Checking Differences");
+        UCD target = UCD.make(targetVersion);
+        
+        PrintWriter log1 = Utility.openPrintWriter("Log1.xml", Utility.LATIN1_UNIX);
+        log1.println("<diff version='" + target.getVersion() + "'>");
+
+        PrintWriter log2 = Utility.openPrintWriter("Log2.xml", Utility.LATIN1_UNIX);
+        log2.println("<diff version='" + Default.ucd.getVersion() + "'>");
+        
+        for (int i = 0; i <= 0x10FFFF; ++i) {
+            if (!target.isAllocated(i)) continue;
+            Utility.dot(i);
+            UData t = target.get(i, true);
+            UData current = Default.ucd.get(i, true);
+            if (i == 0x5E) {
+                System.out.println(target.getDecompositionTypeID(i) 
+                    + ", " + Utility.hex(target.getDecompositionMapping(i)));
+                System.out.println(Default.ucd.getDecompositionTypeID(i) 
+                    + ", " + Utility.hex(Default.ucd.getDecompositionMapping(i)));
+            }
+            if (t.equals(current)) continue;
+            
+            // print both for comparison
+            log1.println(t.toString(target, UData.ABBREVIATED));
+            log2.println(current.toString(Default.ucd, UData.ABBREVIATED));
+        }
+        log1.println("</diff>");
+        log2.println("</diff>");
+        log1.close();
+        log2.close();
+    }
+    
    public static void generateDerived (byte type, boolean checkTypeAndStandard, int headerChoice, String directory, String fileName) throws IOException {

        Default.setUCD();
--- a/tools/unicodetools/com/ibm/text/UCD/Main.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Main.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
-* $Date: 2003/03/12 16:01:26 $
-* $Revision: 1.27 $
+* $Date: 2003/03/15 02:36:48 $
+* $Revision: 1.28 $
 *
 *******************************************************************************
 */
@ -47,11 +47,31 @@ public final class Main implements UCD_Types {
    public static void main (String[] args) throws Exception {

        for (int i = 0; i < args.length; ++i) {
+        
+            long mask = 0;
+
            String arg = args[i];
            if (arg.charAt(0) == '#') return; // skip rest of line

            Utility.fixDot();
            System.out.println("Argument: " + args[i]);
+            
+            // Expand string arguments
+            
+            if (arg.equalsIgnoreCase("All")) {
+                args = Utility.append(ALL_FILES, Utility.subarray(args, i+1));
+                continue;
+            }
+            
+            // make sure the UCD is set up
+            
+            if (arg.equalsIgnoreCase("version")) {
+                Default.setUCD(args[++i]);
+                continue;
+            }
+            Default.ensureUCD();
+            
+            // Now handle other options

            if (arg.equalsIgnoreCase("verify")) {
                VerifyUCD.verify();
@ -60,7 +80,6 @@ public final class Main implements UCD_Types {
                VerifyUCD.checkAgainstUInfo();

            } else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{Default.ucdVersion});
-            else if (arg.equalsIgnoreCase("version")) Default.setUCD(args[++i]);
            else if (arg.equalsIgnoreCase("statistics")) VerifyUCD.statistics();
            else if (arg.equalsIgnoreCase("NFSkippable")) NFSkippable.main(null);
            else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
@ -123,6 +142,7 @@ public final class Main implements UCD_Types {
            else if (arg.equalsIgnoreCase("TestDirectoryIterator")) DirectoryIterator.test();
            else if (arg.equalsIgnoreCase("checkIdentical")) GenerateData.handleIdentical();
            else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.test();
+            else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0");
            
            //else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts();
            
@ -130,36 +150,9 @@ public final class Main implements UCD_Types {
            /*else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
                GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");
                */
-            else extras(new String[] {arg});
-        }
-    }
-    
-    public static void extras (String[] args) throws Exception {
-        //ubp = new UnifiedBinaryProperty(ucd);
-        
-        boolean expanding = false;
-        
-        for (int i = 0; i < args.length; ++i) {
-            String arg = args[i];
-            if (arg.charAt(0) == '#') return; // skip rest of line
-            long mask = 0;
-
-            Utility.fixDot();
-            if (expanding) System.out.println("Argument: " + args[i]);
-
-            if (arg.equalsIgnoreCase("All")) {
-                // Append all args at end
-                /*
-                String[] temp = new String[args.length + ALL_FILES.length];
-                System.arraycopy(args, 0, temp, 0, args.length);
-                System.arraycopy(ALL_FILES, 0, temp, args.length, ALL_FILES.length);
-                */
-                args = Utility.append(args, ALL_FILES);
-                expanding = true;
-
            // EXTRACTED PROPERTIES
            
-            } else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
+            else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
                GenerateData.generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED,
                    "DerivedData/extracted/", "DerivedBidiClass");
                    
--- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
@ -6,6 +6,8 @@
 # characters where they are 1-1, and does not have locale-specific mappings.)
 # For more information, see the discussion of Case Mappings in the Unicode Standard.
 #
+# All code points not listed in this file that do not have simple case mappings
+# in UnicodeData.txt map to themselves.
 # ================================================================================
 # Format
 # ================================================================================
--- a/tools/unicodetools/com/ibm/text/UCD/UCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
-* $Date: 2003/03/12 16:01:26 $
-* $Revision: 1.21 $
+* $Date: 2003/03/15 02:36:48 $
+* $Revision: 1.22 $
 *
 *******************************************************************************
 */
@ -123,7 +123,7 @@ public final class UCD implements UCD_Types {
     * Return XML version of the data associated with the code point.
     */
    public String toString(int codePoint) {
-        return get(codePoint, true).toString(FULL);
+        return get(codePoint, true).toString(this,FULL);
    }

    /**
@ -1389,6 +1389,7 @@ to guarantee identifier closure.
            size = uDataFileCount = dataIn.readInt();

            boolean didJoiningHack = false;
+            System.out.println("Loading UCD " + foundVersion);


            // records
@ -1396,7 +1397,7 @@ to guarantee identifier closure.
                UData uData = new UData();
                uData.readBytes(dataIn);

-                if (uData.codePoint == 0x0221) {
+                if (uData.codePoint == 0x5E) {
                    System.out.println("SPOT-CHECK: " + uData);
                }

--- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
-* $Date: 2003/03/12 16:01:26 $
-* $Revision: 1.17 $
+* $Date: 2003/03/15 02:36:48 $
+* $Revision: 1.18 $
 *
 *******************************************************************************
 */
@ -51,7 +51,9 @@ final class UCD_Names implements UCD_Types {
            + "#\tAll code points not listed here have the type U",
        "Joining Group (listing ArabicShaping.txt, field 2)",
        "BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)",
-        "Script",
+        "Script\r\n"
+            + "#\tThe value for all code points not explicitly listed in this file is COMMON."
+        ,
        "Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)",
        "Hangul Syllable Type\r\n# All codepoints not explicitly listed here have the value NA",
        "Derived"
@ -219,11 +221,11 @@ final class UCD_Names implements UCD_Types {
        "IS", "PR", "PO", "NU", "AL", "ID", "IN", "HY",
        "CM", "BB", "BA", "SP", "BK", "CR", "LF", "CB",
        "SA", "AI", "B2", "SG", "ZW",
-        "JL",
-        "JV",
-        "JT",
        "NL",
        "WJ",
+        //"JL",
+        //"JV",
+        //"JT",

    };

@ -235,11 +237,11 @@ final class UCD_Names implements UCD_Types {
        "CombiningMark", "BreakBefore", "BreakAfter", "Space",
        "MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak",
        "ComplexContext", "Ambiguous", "BreakBoth", "Surrogate", "ZWSpace",
-        "Leading_Jamo",
-        "Vowel_Jamo",
-        "Trailing_Jamo",
        "Next_Line",
        "Word_Joiner"
+        //"Leading_Jamo",
+        //"Vowel_Jamo",
+        //"Trailing_Jamo",
    };

 	public static final String[] SCRIPT = {
--- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
-* $Date: 2003/03/12 16:01:26 $
-* $Revision: 1.18 $
+* $Date: 2003/03/15 02:36:48 $
+* $Revision: 1.19 $
 *
 *******************************************************************************
 */
@ -15,7 +15,7 @@ package com.ibm.text.UCD;

 public interface UCD_Types {
    
-    public static final int dVersion = 15; // change to fix the generated file D version. If less than zero, no "d"
+    public static final int dVersion = 18; // change to fix the generated file D version. If less than zero, no "d"
    
    public static final String BASE_DIR = "C:\\DATA\\";
    public static final String UCD_DIR = BASE_DIR + "UCD\\";
@ -34,7 +34,7 @@ public interface UCD_Types {
    	CJK_B_BASE = 0x20000,
    	CJK_B_LIMIT = 0x2A6DF+1;
    
-    static final byte BINARY_FORMAT = 7; // bumped if binary format of UCD changes
+    static final byte BINARY_FORMAT = 8; // bumped if binary format of UCD changes
    
    // Unicode Property Types
    static final byte 
@ -240,12 +240,12 @@ public interface UCD_Types {
        LB_IS = 8, LB_PR = 9, LB_PO = 10, LB_NU = 11, LB_AL = 12, LB_ID = 13, LB_IN = 14, LB_HY = 15,
        LB_CM = 16, LB_BB = 17, LB_BA = 18, LB_SP = 19, LB_BK = 20, LB_CR = 21, LB_LF = 22, LB_CB = 23,
        LB_SA = 24, LB_AI = 25, LB_B2 = 26, LB_SG = 27, LB_ZW = 28,
-        LB_JL = 29,
-        LB_JV = 30,
-        LB_JT = 31,
-        LB_NL = 32,
-        LB_WJ = 33,
-        LIMIT_LINE_BREAK = 34,
+        LB_NL = 29,
+        LB_WJ = 30,
+        //LB_JL = 29,
+        //LB_JV = 30,
+        //LB_JT = 31,
+        LIMIT_LINE_BREAK = 31,
        LB_LIMIT = LIMIT_LINE_BREAK;

    // east asian width
--- a/tools/unicodetools/com/ibm/text/UCD/UData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
-* $Date: 2003/03/12 16:01:26 $
-* $Revision: 1.7 $
+* $Date: 2003/03/15 02:36:48 $
+* $Revision: 1.8 $
 *
 *******************************************************************************
 */
@ -78,6 +78,9 @@ class UData implements UCD_Types {

    public boolean equals(Object that) {
        UData other = (UData) that;
+        
+        // use equals for objects
+        
        if (!name.equals(other.name)) return false;
        if (!decompositionMapping.equals(other.decompositionMapping)) return false;
        if (!simpleUppercase.equals(other.simpleUppercase)) return false;
@ -90,8 +93,12 @@ class UData implements UCD_Types {
        if (!fullCaseFolding.equals(other.fullCaseFolding)) return false;
        if (!specialCasing.equals(other.specialCasing)) return false;
        if (!bidiMirror.equals(other.bidiMirror)) return false;
+        
+        // == for primitives
+        // Warning: doubles need a special comparison because of NaN: the < / > test below is false whenever either side is NaN, so a NaN value never causes a mismatch
+        
        if (codePoint != other.codePoint) return false;
-        if (numericValue != other.numericValue) return false;
+        if (numericValue < other.numericValue || numericValue > other.numericValue) return false;
        if (binaryProperties != other.binaryProperties) return false;
        if (generalCategory != other.generalCategory) return false;
        if (combiningClass != other.combiningClass) return false;
@ -104,6 +111,7 @@ class UData implements UCD_Types {
        if (joiningGroup != other.joiningGroup) return false;
        if (script != other.script) return false;
        if (age != other.age) return false;
+        
        return true;
    }

@ -178,17 +186,17 @@ class UData implements UCD_Types {
    static final byte ABBREVIATED = 0, FULL = 1;

    public String toString() {
-        return toString(FULL);
+        return toString(Default.ucd, FULL);
    }

-    public String toString(byte style) {
+    public String toString(UCD ucd, byte style) {
        boolean full = style == FULL;
        StringBuffer result = new StringBuffer();
        String s = UTF32.valueOf32(codePoint);

-        result.append("<e c='").append(Utility.quoteXML(codePoint)).append('\'');
+        result.append("<e cp='").append(Utility.quoteXML(codePoint)).append('\'');
        result.append(" hx='").append(Utility.hex(codePoint)).append('\'');
-        if (full || script != COMMON_SCRIPT) result.append(" sn='").append(UCD_Names.SCRIPT[script]).append('\'');
+        if (full || script != COMMON_SCRIPT) result.append(" sn='").append(ucd.getScriptID_fromIndex(script,SHORT)).append('\'');
        result.append(" n='").append(Utility.quoteXML(name)).append("'\r\n");

        int lastPos = result.length();
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2003/03/12 16:01:26 $
-* $Revision: 1.28 $
+* $Date: 2003/03/15 02:36:47 $
+* $Revision: 1.29 $
 *
 *******************************************************************************
 */
@ -36,6 +36,16 @@ public final class Utility implements UCD_Types {    // COMMON UTILITIES
        return temp;
    }

+    public static String[] subarray(String[] array1, int start, int limit) {
+        String[] temp = new String[limit - start];
+        System.arraycopy(array1, start, temp, 0, limit - start);
+        return temp;
+    }
+
+    public static String[] subarray(String[] array1, int start) {
+        return subarray(array1, start, array1.length);
+    }
+
    public static String getName(int i, String[] names) {
        try {
            return names[i];