From 641a6d6d79dc61c93dc6d4dbdf67ce5f03aef2dd Mon Sep 17 00:00:00 2001
From: Mark Davis <mark@macchiato.com>
Date: Sat, 26 Mar 2005 05:40:05 +0000
Subject: [PATCH] ICU-0 update for U4.1.0

X-SVN-Rev: 17400
---
 .../com/ibm/text/UCD/BlocksHeader.txt         |  2 +
 .../com/ibm/text/UCD/CaseFoldingHeader.txt    |  1 +
 .../com/ibm/text/UCD/GenerateCaseFolding.java | 21 +++++---
 .../com/ibm/text/UCD/GenerateData.java        | 32 +++++++-----
 .../com/ibm/text/UCD/MakeUnicodeFiles.txt     | 13 +++--
 .../ibm/text/UCD/NormalizationTestHeader.txt  | 32 ++++++++++++
 .../ibm/text/UCD/PropertyAliasesHeader.txt    |  1 +
 .../text/UCD/PropertyValueAliasesHeader.txt   |  1 +
 .../com/ibm/text/UCD/SpecialCasingHeader.txt  |  1 +
 .../com/ibm/text/UCD/TestData.java            | 52 +++++++++++--------
 .../text/UCD/ToolUnicodePropertySource.java   |  9 ++--
 .../com/ibm/text/utility/UnicodeDataFile.java |  7 ++-
 12 files changed, 118 insertions(+), 54 deletions(-)
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt

diff --git a/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt b/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt
index ad4ee67b6a2..1f1a02b7761 100644
--- a/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt
@@ -1,4 +1,6 @@
+#
 # Note:   The casing of block names is not normative.
 #         For example, "Basic Latin" and "BASIC LATIN" are equivalent.
+#
 # Format:
 # Start Code..End Code; Block Name
diff --git a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt
index ca8da1ac349..ef6ad4e18fa 100644
--- a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt
@@ -1,3 +1,4 @@
+#
 # Case Folding Properties
 #
 # This file is a supplement to the UnicodeData file.
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
index 6e2d6382efe..772ec22a7c0 100644
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
-* $Date: 2004/02/12 08:23:15 $
-* $Revision: 1.16 $
+* $Date: 2005/03/26 05:40:04 $
+* $Revision: 1.17 $
 *
 *******************************************************************************
 */
@@ -574,14 +574,19 @@ public class GenerateCaseFolding implements UCD_Types {
         log.close();
         
         System.out.println("Writing");
-        String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
-        PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
-        String[] batName = {""};
+        //String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
+        //PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
+        
+        UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", "SpecialCasing" + suffix2);
+        PrintWriter out = udf.out;
+        
+ /*       String[] batName = {""};
         String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "SpecialCasing", suffix2 + UnicodeDataFile.getFileSuffix(true), batName);
         out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false));
         out.println(UnicodeDataFile.generateDateLine());
         out.println("#");
         Utility.appendFile("SpecialCasingHeader.txt", Utility.UTF8, out);
+*/        
 
         Iterator it = sorted.keySet().iterator();
         int lastOrder = -1;
@@ -612,8 +617,8 @@ public class GenerateCaseFolding implements UCD_Types {
             }
             out.println(line);
         }
-        Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
-        out.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
+        //Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
+        udf.close();
+        //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
     }
 }
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
index 8d7e1de3f75..4019fec01fa 100644
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
-* $Date: 2005/03/10 02:37:19 $
-* $Revision: 1.37 $
+* $Date: 2005/03/26 05:40:04 $
+* $Revision: 1.38 $
 *
 *******************************************************************************
 */
@@ -744,16 +744,19 @@ public class GenerateData implements UCD_Types {
     
     static public void writeNormalizerTestSuite(String directory, String fileName) throws IOException {
         
+    	UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, fileName);
+        PrintWriter log = fc.out;
+        
         String newFile = directory + fileName + UnicodeDataFile.getFileSuffix(true);
-        PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
-        String[] batName = {""};
-        String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName);
+        //PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
+        //String[] batName = {""};
+        //String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName);
 
         String[] example = new String[256];
 
-        log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false));
-        log.println(UnicodeDataFile.generateDateLine());
-        log.println("#");
+        //log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false));
+        //log.println(UnicodeDataFile.generateDateLine());
+        /*log.println("#");
         log.println("# Normalization Test Suite");
         log.println("# Format:");
         log.println("#");
@@ -787,7 +790,7 @@ public class GenerateData implements UCD_Types {
 
         log.println("#");
         log.println("@Part0 # Specific cases");
-        log.println("#");
+        log.println("#");*/
 
         for (int j = 0; j < testSuiteCases.length; ++j) {
             writeLine(testSuiteCases[j], log, false);
@@ -891,8 +894,8 @@ public class GenerateData implements UCD_Types {
         Utility.fixDot();
         log.println("#");
         log.println("# END OF FILE");
-        log.close();
-        Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
+        fc.close();
+        //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
     }
     
     static void handleIdentical() throws IOException {
@@ -942,12 +945,13 @@ public class GenerateData implements UCD_Types {
 
     // not recursive!!!
     static final String comma(String s) {
+    	// Copies s into commaResult (declared outside this method), putting U+25CC before each category-Mn code point.
         commaResult.setLength(0);
         int cp;
-        for (int i = 0; i < s.length(); i += UTF32.count16(i)) {
-            cp = UTF32.char32At(s, i);
+        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { // cp is assigned in the body before this step runs
+            cp = UTF16.charAt(s, i);
             if (Default.ucd().getCategory(cp) == Mn) commaResult.append('\u25CC');
-            UTF32.append32(commaResult, cp);
+            UTF16.append(commaResult, cp);
         }
         return commaResult.toString();
     }
diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
index 16fc4c750b1..5067563f555 100644
--- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt
@@ -1,5 +1,5 @@
-Generate: Derived.*
-DeltaVersion: 12
+Generate:
+DeltaVersion: 13
 CopyrightYear: 2005
 
 File: auxiliary/GraphemeBreakProperty
@@ -58,6 +58,13 @@ Value:	4.1
 File:	extracted/DerivedBidiClass
 Property:	Bidi_Class
 # Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
+# Unlike other properties, unassigned code points in blocks reserved for right-to-left scripts are given either types R or AL.
+# The unassigned characters that default to R are:
+#   Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF
+# The unassigned characters that default to AL are:
+#   Arabic, Syriac, Thaana, Arabic_Presentation_Forms_A, Arabic_Presentation_Forms_B, Arabic_Supplement,
+#   and the range \u0750-\u077F, minus the Noncharacter_Code_Points
+# For all other cases:
 Format:	valueStyle=short skipUnassigned=Left_To_Right
 
 File:	extracted/DerivedBinaryProperties
@@ -67,8 +74,6 @@ Property:	Bidi_Mirrored
 File:	extracted/DerivedCombiningClass
 Property:	Canonical_Combining_Class
 # Combining Class (listing UnicodeData.txt, field 3: see UCD.html)
-#	All code points not explicitly listed in this file have the property
-#	value:   0.
 Format: nameStyle=none valueStyle=short skipUnassigned=Not_Reordered
 
 File:	DerivedCoreProperties
diff --git a/tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt b/tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt
new file mode 100644
index 00000000000..32aa458b912
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt
@@ -0,0 +1,32 @@
+#
+# Normalization Test Suite
+# Format:
+#
+#   Columns (c1, c2,...) are separated by semicolons
+#   Comments are indicated with hash marks
+#
+# CONFORMANCE:
+# 1. The following invariants must be true for all conformant implementations
+#
+#    NFC
+#      c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
+#      c4 ==  NFC(c4) ==  NFC(c5)
+#
+#    NFD
+#      c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
+#      c5 ==  NFD(c4) ==  NFD(c5)
+#
+#    NFKC
+#      c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
+#
+#    NFKD
+#      c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
+#
+# 2. For every code point X assigned in this version of Unicode that is not specifically
+#    listed in Part 1, the following invariants must be true for all conformant
+#    implementations:
+#
+#      X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
+#
+@Part0 # Specific cases
+#
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt
index f502de853f6..50fef4b0cff 100644
--- a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt
@@ -1,3 +1,4 @@
+#
 # This file contains aliases for properties used in the UCD.
 # These names can be used for XML formats of UCD data, for regular-expression
 # property tests, and other programmatic textual descriptions of Unicode data.
diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt
index 0e9d5bec886..282326d9a2c 100644
--- a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt
@@ -1,3 +1,4 @@
+#
 # This file contains aliases for property values used in the UCD.
 # These names can be used for XML formats of UCD data, for regular-expression
 # property tests, and other programmatic textual descriptions of Unicode data.
diff --git a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
index fcf77089488..0fcfa85a34e 100644
--- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
+++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
@@ -1,3 +1,4 @@
+#
 # Special Casing Properties
 #
 # This file is a supplement to the UnicodeData file.
diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java
index 5932c04c34f..81135b10ddb 100644
--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2005/03/10 02:37:19 $
-* $Revision: 1.18 $
+* $Date: 2005/03/26 05:40:05 $
+* $Revision: 1.19 $
 *
 *******************************************************************************
 */
@@ -151,7 +151,12 @@ public class TestData implements UCD_Types {
 	
 	static class GenStringPrep {
 		UnicodeSet[] coreChars = new UnicodeSet[100];
-		UnicodeSet[] decompChars = new UnicodeSet[100];
+		UnicodeSet decomposable = new UnicodeSet();
+		UnicodeSet pattern = new UnicodeSet();
+		ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+		//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
+		UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
+		//UnicodeSet[] decompChars = new UnicodeSet[100];
 		UCD ucd = Default.ucd();
 
 		Collator uca = Collator.getInstance(ULocale.ENGLISH);
@@ -167,10 +172,13 @@ public class TestData implements UCD_Types {
 
 
 		void genStringPrep() throws IOException {
+			//BagFormatter bf = new BagFormatter();
+			//System.out.println(bf.showSetDifferences("ID_Continue", id_continue, "XID_Continue", xid_continue));
 			StringBuffer inbuffer = new StringBuffer();
 			StringBuffer intermediate, outbuffer;
 			for (int cp = 0; cp <= 0x10FFFF; ++cp) {
 				Utility.dot(cp);
+				if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
 				inbuffer.setLength(0);
 				UTF16.append(inbuffer, cp);
 				try {
@@ -189,15 +197,9 @@ public class TestData implements UCD_Types {
 				if (!TestData.equals(inbuffer, outbuffer))
 					continue;
 				int script = ucd.getScript(cp);
-				if (!Default.nfd().isNormalized(cp)) {
-					if (decompChars[script] == null)
-						decompChars[script] = new UnicodeSet();
-					decompChars[script].add(cp);
-				} else {
-					if (coreChars[script] == null)
-						coreChars[script] = new UnicodeSet();
-					coreChars[script].add(cp);
-				}
+				if (coreChars[script] == null)
+					coreChars[script] = new UnicodeSet();
+				coreChars[script].add(cp);
 			}
 			// find characters with no uppercase
 			for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) {
@@ -212,8 +214,11 @@ public class TestData implements UCD_Types {
 					.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
 			out.println("<title>IDN Characters</title><style>");
 			out.println("<!--");
-			out
-					.println(".script       { font-size: 150%; background-color: #C0C0C0 }");
+			out.println(".script       { font-size: 150%; background-color: #CCCCCC }");
+			out.println(".Atomic       { background-color: #CCCCFF }");
+			out.println(".Atomic-no-uppercase       { background-color: #CCFFCC }");
+			out.println(".Non-ID       { background-color: #FFCCCC }");
+			out.println(".Decomposable       { background-color: #FFFFCC }");
 			out.println("th           { text-align: left }");
 			out.println("-->");
 			out.println("</style></head><body><table>");
@@ -240,15 +245,16 @@ public class TestData implements UCD_Types {
 		 * @param scriptCode
 		 */
 		private void showCodes(PrintWriter out, int scriptCode) {
-			if (coreChars[scriptCode] == null
-					&& decompChars[scriptCode] == null)
-				return;
+			if (coreChars[scriptCode] == null) return;
 			System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
-			String script = Default.ucd().getScriptID_fromIndex(
-					(byte) scriptCode);
+			String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
 			out.println();
 			out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
 			UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
+			UnicodeSet decomp = new UnicodeSet(core).retainAll(decomposable);
+			core.removeAll(decomp);
+			UnicodeSet non_id = new UnicodeSet(core).removeAll(xid_continue);
+			core.removeAll(non_id);
 			UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
 			core.removeAll(otherCore);
 			if (core.size() == 0) {
@@ -257,9 +263,9 @@ public class TestData implements UCD_Types {
 				otherCore = temp;
 			}
 			printlnSet(out, "Atomic", core, scriptCode);
-			if (otherCore.size() != 0) printlnSet(out, "Atomic [noUpper]", otherCore, scriptCode);							
-			UnicodeSet decomp = decompChars[scriptCode];
-			if (decomp != null && decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
+			if (otherCore.size() != 0) printlnSet(out, "Atomic-no-uppercase", otherCore, scriptCode);
+			if (non_id.size() != 0) printlnSet(out, "Non-ID", non_id, scriptCode);
+			if (decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
 		}
 
 		/**
@@ -277,7 +283,7 @@ public class TestData implements UCD_Types {
 					&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
 			out.println("<tr><th class='" + title + "'>" + title + " ("
 					+ nf.format(size) + ")</th></tr>");
-			out.print("<tr><td" + dir + ">");
+			out.print("<tr><td class='" + title + "'" + dir + ">");
 			UnicodeSetIterator usi = new UnicodeSetIterator();
 			if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
 				usi.reset(unicodeset);
diff --git a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
index 1aafeb37581..9d951c0a651 100644
--- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
+++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
@@ -264,7 +264,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
         		unicodeMap.putAll(lineBreak.getSet("Infix_Numeric")
         				.remove(0x003A), "MidNum");
         		unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
-        		unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "Numeric");
+        		unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "ExtendNumLet");
         		unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
         		unicodeMap.setMissing("Other");
         	}
@@ -479,9 +479,10 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
         public List _getValueAliases(String valueAlias, List result) {
             if (result == null) result = new ArrayList();
             int type = getType() & CORE_MASK;
-            if (type == STRING || type == MISC) return result;
-            else if (type == NUMERIC) return result;
-            else if (type == BINARY) {
+            if (type == STRING || type == MISC || type == NUMERIC) {
+            	UnicodeProperty.addUnique(valueAlias, result);
+            	return result;
+            } else if (type == BINARY) {
                 UnicodeProperty.addUnique(valueAlias, result);
                 return lookup(valueAlias, UCD_Names.YN_TABLE_LONG, UCD_Names.YN_TABLE, null, result);
             } else if (type == ENUMERATED || type == CATALOG) {
diff --git a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java
index 6bc06639392..c15ed90343d 100644
--- a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java
+++ b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java
@@ -15,15 +15,17 @@ public class UnicodeDataFile {
     private String newFile;
     private String batName;
     private String mostRecent;
+    private String filename;
     private UnicodeDataFile(){};
     
     public static UnicodeDataFile openAndWriteHeader(String directory, String filename) throws IOException {
         UnicodeDataFile result = new UnicodeDataFile();
         result.newFile = directory + filename + UnicodeDataFile.getFileSuffix(true);
-        result.out = Utility.openPrintWriter(result.newFile, Utility.LATIN1_UNIX);
+        result.out = Utility.openPrintWriter(result.newFile, Utility.UTF8_UNIX);
         String[] batName = {""};
         result.mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
         result.batName = batName[0];
+    	result.filename = filename;
         
         result.out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
         result.out.println(generateDateLine());
@@ -50,6 +52,9 @@ public class UnicodeDataFile {
     }
     
     public void close() throws IOException {
+        try { // append "<filename>Footer.txt" to the output before closing it
+            Utility.appendFile(filename + "Footer.txt", Utility.LATIN1, out); // NOTE(review): LATIN1 here, but the old SpecialCasing footer call used UTF8 - confirm
+        } catch (FileNotFoundException e) {} // presumably the footer is optional: a missing file is silently skipped
         out.close();           
         Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName);
     }