ICU-0 update for U4.1.0

X-SVN-Rev: 17400
2025-04-08 06:53:45 +00:00 · 2005-03-26 05:40:05 +00:00 · 2005-03-26 05:40:05 +00:00 · 641a6d6d79
commit 641a6d6d79
parent 599dbb508c
12 changed files with 118 additions and 54 deletions
--- a/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt
@ -1,4 +1,6 @@
+#
 # Note:   The casing of block names is not normative.
 #         For example, "Basic Latin" and "BASIC LATIN" are equivalent.
+#
 # Format:
 # Start Code..End Code; Block Name
--- a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt
@ -1,3 +1,4 @@
+#
 # Case Folding Properties
 #
 # This file is a supplement to the UnicodeData file.
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
-* $Date: 2004/02/12 08:23:15 $
-* $Revision: 1.16 $
+* $Date: 2005/03/26 05:40:04 $
+* $Revision: 1.17 $
 *
 *******************************************************************************
 */
@ -574,14 +574,19 @@ public class GenerateCaseFolding implements UCD_Types {
        log.close();
        
        System.out.println("Writing");
-        String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
-        PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String[] batName = {""};
+        //String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
+        //PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
+        
+        UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", "SpecialCasing" + suffix2);
+        PrintWriter out = udf.out;
+        
+ /*       String[] batName = {""};
        String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "SpecialCasing", suffix2 + UnicodeDataFile.getFileSuffix(true), batName);
        out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false));
        out.println(UnicodeDataFile.generateDateLine());
        out.println("#");
        Utility.appendFile("SpecialCasingHeader.txt", Utility.UTF8, out);
+*/        

        Iterator it = sorted.keySet().iterator();
        int lastOrder = -1;
@ -612,8 +617,8 @@ public class GenerateCaseFolding implements UCD_Types {
            }
            out.println(line);
        }
-        Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
-        out.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
+        //Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
+        udf.close();
+        //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
    }
 }
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
-* $Date: 2005/03/10 02:37:19 $
-* $Revision: 1.37 $
+* $Date: 2005/03/26 05:40:04 $
+* $Revision: 1.38 $
 *
 *******************************************************************************
 */
@ -744,16 +744,19 @@ public class GenerateData implements UCD_Types {
    
    static public void writeNormalizerTestSuite(String directory, String fileName) throws IOException {
        
+    	UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, fileName);
+        PrintWriter log = fc.out;
+        
        String newFile = directory + fileName + UnicodeDataFile.getFileSuffix(true);
-        PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
-        String[] batName = {""};
-        String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName);
+        //PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
+        //String[] batName = {""};
+        //String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName);

        String[] example = new String[256];

-        log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false));
-        log.println(UnicodeDataFile.generateDateLine());
-        log.println("#");
+        //log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false));
+        //log.println(UnicodeDataFile.generateDateLine());
+        /*log.println("#");
        log.println("# Normalization Test Suite");
        log.println("# Format:");
        log.println("#");
@ -787,7 +790,7 @@ public class GenerateData implements UCD_Types {

        log.println("#");
        log.println("@Part0 # Specific cases");
-        log.println("#");
+        log.println("#");*/

        for (int j = 0; j < testSuiteCases.length; ++j) {
            writeLine(testSuiteCases[j], log, false);
@ -891,8 +894,8 @@ public class GenerateData implements UCD_Types {
        Utility.fixDot();
        log.println("#");
        log.println("# END OF FILE");
-        log.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
+        fc.close();
+        //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
    }
    
    static void handleIdentical() throws IOException {
@ -942,12 +945,13 @@ public class GenerateData implements UCD_Types {

    // Not recursive/reentrant: the result is built in the shared commaResult buffer, which is reset on every call.
    static final String comma(String s) {
+    	//if (true) return s;
        commaResult.setLength(0);
        int cp;
-        for (int i = 0; i < s.length(); i += UTF32.count16(i)) {
-            cp = UTF32.char32At(s, i);
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(s, i);
            if (Default.ucd().getCategory(cp) == Mn) commaResult.append('\u25CC');
-            UTF32.append32(commaResult, cp);
+            UTF16.append(commaResult, cp);
        }
        return commaResult.toString();
    }
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
@ -1,5 +1,5 @@
-Generate: Derived.*
-DeltaVersion: 12
+Generate:
+DeltaVersion: 13
 CopyrightYear: 2005

 File: auxiliary/GraphemeBreakProperty
@ -58,6 +58,13 @@ Value:	4.1
 File:	extracted/DerivedBidiClass
 Property:	Bidi_Class
 # Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
+# Unlike other properties, unassigned code points in blocks reserved for right-to-left scripts are given either type R or type AL.
+# The unassigned characters that default to R are:
+#   Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF, \uFB1D-\uFB4F, \U00010840-\U00010FFF
+# The unassigned characters that default to AL are:
+#   Arabic, Syriac, Thaana, Arabic_Presentation_Forms_A, Arabic_Presentation_Forms_B, Arabic_Supplement,
+#   and the range \u0750-\u077F, minus the Noncharacter_Code_Points
+# For all other cases:
 Format:	valueStyle=short skipUnassigned=Left_To_Right

 File:	extracted/DerivedBinaryProperties
@ -67,8 +74,6 @@ Property:	Bidi_Mirrored
 File:	extracted/DerivedCombiningClass
 Property:	Canonical_Combining_Class
 # Combining Class (listing UnicodeData.txt, field 3: see UCD.html)
-#	All code points not explicitly listed in this file have the property
-#	value:   0.
 Format: nameStyle=none valueStyle=short skipUnassigned=Not_Reordered

 File:	DerivedCoreProperties
--- a/tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt
@ -0,0 +1,32 @@
+#
+# Normalization Test Suite
+# Format:
+#
+#   Columns (c1, c2,...) are separated by semicolons
+#   Comments are indicated with hash marks
+#
+# CONFORMANCE:
+# 1. The following invariants must be true for all conformant implementations
+#
+#    NFC
+#      c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
+#      c4 ==  NFC(c4) ==  NFC(c5)
+#
+#    NFD
+#      c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
+#      c5 ==  NFD(c4) ==  NFD(c5)
+#
+#    NFKC
+#      c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
+#
+#    NFKD
+#      c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
+#
+# 2. For every code point X assigned in this version of Unicode that is not specifically
+#    listed in Part 1, the following invariants must be true for all conformant
+#    implementations:
+#
+#      X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
+#
+@Part0 # Specific cases
+#
--- a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt
@ -1,3 +1,4 @@
+#
 # This file contains aliases for properties used in the UCD.
 # These names can be used for XML formats of UCD data, for regular-expression
 # property tests, and other programmatic textual descriptions of Unicode data.
--- a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt
@ -1,3 +1,4 @@
+#
 # This file contains aliases for property values used in the UCD.
 # These names can be used for XML formats of UCD data, for regular-expression
 # property tests, and other programmatic textual descriptions of Unicode data.
--- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
@ -1,3 +1,4 @@
+#
 # Special Casing Properties
 #
 # This file is a supplement to the UnicodeData file.
--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2005/03/10 02:37:19 $
-* $Revision: 1.18 $
+* $Date: 2005/03/26 05:40:05 $
+* $Revision: 1.19 $
 *
 *******************************************************************************
 */
@ -151,7 +151,12 @@ public class TestData implements UCD_Types {
 	
 	static class GenStringPrep {
 		UnicodeSet[] coreChars = new UnicodeSet[100];
-		UnicodeSet[] decompChars = new UnicodeSet[100];
+		UnicodeSet decomposable = new UnicodeSet();
+		UnicodeSet pattern = new UnicodeSet();
+		ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+		//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
+		UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
+		//UnicodeSet[] decompChars = new UnicodeSet[100];
 		UCD ucd = Default.ucd();

 		Collator uca = Collator.getInstance(ULocale.ENGLISH);
@ -167,10 +172,13 @@ public class TestData implements UCD_Types {


 		void genStringPrep() throws IOException {
+			//BagFormatter bf = new BagFormatter();
+			//System.out.println(bf.showSetDifferences("ID_Continue", id_continue, "XID_Continue", xid_continue));
 			StringBuffer inbuffer = new StringBuffer();
 			StringBuffer intermediate, outbuffer;
 			for (int cp = 0; cp <= 0x10FFFF; ++cp) {
 				Utility.dot(cp);
+				if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
 				inbuffer.setLength(0);
 				UTF16.append(inbuffer, cp);
 				try {
@ -189,15 +197,9 @@ public class TestData implements UCD_Types {
 				if (!TestData.equals(inbuffer, outbuffer))
 					continue;
 				int script = ucd.getScript(cp);
-				if (!Default.nfd().isNormalized(cp)) {
-					if (decompChars[script] == null)
-						decompChars[script] = new UnicodeSet();
-					decompChars[script].add(cp);
-				} else {
-					if (coreChars[script] == null)
-						coreChars[script] = new UnicodeSet();
-					coreChars[script].add(cp);
-				}
+				if (coreChars[script] == null)
+					coreChars[script] = new UnicodeSet();
+				coreChars[script].add(cp);
 			}
 			// find characters with no uppercase
 			for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) {
@ -212,8 +214,11 @@ public class TestData implements UCD_Types {
 					.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
 			out.println("<title>IDN Characters</title><style>");
 			out.println("<!--");
-			out
-					.println(".script       { font-size: 150%; background-color: #C0C0C0 }");
+			out.println(".script       { font-size: 150%; background-color: #CCCCCC }");
+			out.println(".Atomic       { background-color: #CCCCFF }");
+			out.println(".Atomic-no-uppercase       { background-color: #CCFFCC }");
+			out.println(".Non-ID       { background-color: #FFCCCC }");
+			out.println(".Decomposable       { background-color: #FFFFCC }");
 			out.println("th           { text-align: left }");
 			out.println("-->");
 			out.println("</style></head><body><table>");
@ -240,15 +245,16 @@ public class TestData implements UCD_Types {
 		 * @param scriptCode
 		 */
 		private void showCodes(PrintWriter out, int scriptCode) {
-			if (coreChars[scriptCode] == null
-					&& decompChars[scriptCode] == null)
-				return;
+			if (coreChars[scriptCode] == null) return;
 			System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
-			String script = Default.ucd().getScriptID_fromIndex(
-					(byte) scriptCode);
+			String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
 			out.println();
 			out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
 			UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
+			UnicodeSet decomp = new UnicodeSet(core).retainAll(decomposable);
+			core.removeAll(decomp);
+			UnicodeSet non_id = new UnicodeSet(core).removeAll(xid_continue);
+			core.removeAll(non_id);
 			UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
 			core.removeAll(otherCore);
 			if (core.size() == 0) {
@ -257,9 +263,9 @@ public class TestData implements UCD_Types {
 				otherCore = temp;
 			}
 			printlnSet(out, "Atomic", core, scriptCode);
-			if (otherCore.size() != 0) printlnSet(out, "Atomic [noUpper]", otherCore, scriptCode);							
-			UnicodeSet decomp = decompChars[scriptCode];
-			if (decomp != null && decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
+			if (otherCore.size() != 0) printlnSet(out, "Atomic-no-uppercase", otherCore, scriptCode);
+			if (non_id.size() != 0) printlnSet(out, "Non-ID", non_id, scriptCode);
+			if (decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
 		}

 		/**
@ -277,7 +283,7 @@ public class TestData implements UCD_Types {
 					&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
 			out.println("<tr><th class='" + title + "'>" + title + " ("
 					+ nf.format(size) + ")</th></tr>");
-			out.print("<tr><td" + dir + ">");
+			out.print("<tr><td class='" + title + "'" + dir + ">");
 			UnicodeSetIterator usi = new UnicodeSetIterator();
 			if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
 				usi.reset(unicodeset);
--- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
+++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
@ -264,7 +264,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
        		unicodeMap.putAll(lineBreak.getSet("Infix_Numeric")
        				.remove(0x003A), "MidNum");
        		unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
-        		unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "Numeric");
+        		unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "ExtendNumLet");
        		unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
        		unicodeMap.setMissing("Other");
        	}
@ -479,9 +479,10 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
        public List _getValueAliases(String valueAlias, List result) {
            if (result == null) result = new ArrayList();
            int type = getType() & CORE_MASK;
-            if (type == STRING || type == MISC) return result;
-            else if (type == NUMERIC) return result;
-            else if (type == BINARY) {
+            if (type == STRING || type == MISC || type == NUMERIC) {
+            	UnicodeProperty.addUnique(valueAlias, result);
+            	return result;
+            } else if (type == BINARY) {
                UnicodeProperty.addUnique(valueAlias, result);
                return lookup(valueAlias, UCD_Names.YN_TABLE_LONG, UCD_Names.YN_TABLE, null, result);
            } else if (type == ENUMERATED || type == CATALOG) {
--- a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java
+++ b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java
@ -15,15 +15,17 @@ public class UnicodeDataFile {
    private String newFile;
    private String batName;
    private String mostRecent;
+    private String filename;
    private UnicodeDataFile(){};
    
    public static UnicodeDataFile openAndWriteHeader(String directory, String filename) throws IOException {
        UnicodeDataFile result = new UnicodeDataFile();
        result.newFile = directory + filename + UnicodeDataFile.getFileSuffix(true);
-        result.out = Utility.openPrintWriter(result.newFile, Utility.LATIN1_UNIX);
+        result.out = Utility.openPrintWriter(result.newFile, Utility.UTF8_UNIX);
        String[] batName = {""};
        result.mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
        result.batName = batName[0];
+    	result.filename = filename;
        
        result.out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
        result.out.println(generateDateLine());
@ -50,6 +52,9 @@ public class UnicodeDataFile {
    }
    
    public void close() throws IOException {
+        try {
+            Utility.appendFile(filename + "Footer.txt", Utility.LATIN1, out);
+        } catch (FileNotFoundException e) {}
        out.close();           
        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName);
    }