ICU-5222 updates for UCD/A 5.0.0

X-SVN-Rev: 19697
2025-04-14 01:11:02 +00:00 · 2006-06-08 18:16:40 +00:00 · 2006-06-08 18:16:40 +00:00 · 1d7d7f00ba
commit 1d7d7f00ba
parent 18f81012d0
5 changed files with 153 additions and 54 deletions
--- a/tools/unicodetools/com/ibm/text/UCA/UCA.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ 
-* $Date: 2005/05/02 15:39:54 $ 
-* $Revision: 1.25 $
+* $Date: 2006/06/08 18:16:40 $ 
+* $Revision: 1.26 $
 *
 *******************************************************************************
 */
@ -1128,7 +1128,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
        Normalizer skipDecomps;
        Normalizer nfd;
        Normalizer nfkd;
-        Iterator enum = null;
+        Iterator enum1 = null;
        byte ceLimit;
        int currentRange = SAMPLE_RANGES.length; // set to ZERO to enable
        int startOfRange = SAMPLE_RANGES[0][0];
@ -1197,9 +1197,9 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
            }
            
            // contractions
-            if (enum == null) enum = ucaData.getContractions();
-            while (enum.hasNext()) {
-                result = (String)enum.next();
+            if (enum1 == null) enum1 = ucaData.getContractions();
+            while (enum1.hasNext()) {
+                result = (String)enum1.next();
                if (result.length() == 1 && UTF16.isLeadSurrogate(result.charAt(0))) {
                    //System.out.println("Skipping " + ucd.getCodeAndName(result));
                    continue; // try again
@ -1500,9 +1500,9 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
        Map missingStrings = new HashMap();
        Map tempMap = new HashMap();
        
-        Iterator enum = ucaData.getContractions();
-        while (enum.hasNext()) {
-            String sequence = (String)enum.next();
+        Iterator enum1 = ucaData.getContractions();
+        while (enum1.hasNext()) {
+            String sequence = (String)enum1.next();
            //System.out.println("Contraction: " + Utility.hex(sequence));
            for (int i = sequence.length()-1; i > 0; --i) {
                String shorter = sequence.substring(0,i);
@ -1520,26 +1520,26 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
        
        // now add them. We couldn't before because we were iterating over it.
        
-        enum = tempMap.keySet().iterator();
-        while (enum.hasNext()) {
-            String shorter = (String) enum.next();
+        enum1 = tempMap.keySet().iterator();
+        while (enum1.hasNext()) {
+            String shorter = (String) enum1.next();
            IntStack tempStack = (IntStack) tempMap.get(shorter);
            ucaData.add(shorter, tempStack);
        }
        
        
-        enum = missingStrings.keySet().iterator();
+        enum1 = missingStrings.keySet().iterator();
        if (missingStrings.size() != 0) {
            /**
-            while (enum.hasMoreElements()) {
-                String sequence = (String)enum.nextElement();
+            while (enum1.hasMoreElements()) {
+                String sequence = (String)enum1.nextElement();
                getCE(sequence);
                FIX LATER;
            }
            */
            String errorMessage = "";
-            while (enum.hasNext()) {
-                String missing = (String)enum.next();
+            while (enum1.hasNext()) {
+                String missing = (String)enum1.next();
                if (errorMessage.length() != 0) errorMessage += ", ";
                errorMessage += "\"" + missing + "\"";
            }
--- a/tools/unicodetools/com/ibm/text/UCA/UCA_Data.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA_Data.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $ 
-* $Date: 2005/05/02 15:39:54 $ 
-* $Revision: 1.3 $
+* $Date: 2006/06/08 18:16:40 $ 
+* $Revision: 1.4 $
 *
 *******************************************************************************
 */
@ -294,9 +294,9 @@ public class UCA_Data implements UCA_Types {
            if (collationElements[i] == CONTRACTING) ceSet.add(i);
        }
        UnicodeSet ceSet2 = new UnicodeSet();
-        Iterator enum = contractingTable.keySet().iterator();
-        while (enum.hasNext()) {
-            String sequence = (String)enum.next();
+        Iterator enum1 = contractingTable.keySet().iterator();
+        while (enum1.hasNext()) {
+            String sequence = (String)enum1.next();
            ceSet2.add(sequence.charAt(0));
        }
        
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
-* $Date: 2005/04/06 08:48:17 $
-* $Revision: 1.21 $
+* $Date: 2006/06/08 18:16:40 $
+* $Revision: 1.22 $
 *
 *******************************************************************************
 */
@ -266,10 +266,12 @@ public class WriteCharts implements UCD_Types {

        String[] replacement = new String[] {"%%%", "Normalization Charts"};
        String folder = "charts\\normalization\\";
+        
+        //System.out.println("File: " + new File(".").getCanonicalPath());

-        Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
-        Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
-        Utility.copyTextFile("norm_help.html", Utility.UTF8, folder + "help.html");
+        Utility.copyTextFile("com/ibm/text/UCA/index.html", Utility.UTF8, folder + "index.html", replacement);
+        Utility.copyTextFile("com/ibm/text/UCA/charts.css", Utility.LATIN1, folder + "charts.css");
+        Utility.copyTextFile("com/ibm/text/UCA/norm_help.html", Utility.UTF8, folder + "help.html");

        indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
        Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
@ -375,9 +377,9 @@ public class WriteCharts implements UCD_Types {
        String[] replacement = new String[] {"%%%", "Case Charts"};
        String folder = "charts\\case\\";

-        Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
-        Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
-        Utility.copyTextFile("case_help.html", Utility.UTF8, folder + "help.html");
+        Utility.copyTextFile("com/ibm/text/UCA/index.html", Utility.UTF8, folder + "index.html", replacement);
+        Utility.copyTextFile("com/ibm/text/UCA/charts.css", Utility.LATIN1, folder + "charts.css");
+        Utility.copyTextFile("com/ibm/text/UCA/case_help.html", Utility.UTF8, folder + "help.html");

        indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
        Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
@ -487,9 +489,9 @@ public class WriteCharts implements UCD_Types {
 			String[] replacement = new String[] {"%%%", "Script Charts"};
 			String folder = "charts\\script\\";

-			Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
-			Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
-			Utility.copyTextFile("script_help.html", Utility.UTF8, folder + "help.html");
+			Utility.copyTextFile("com/ibm/text/UCA/index.html", Utility.UTF8, folder + "index.html", replacement);
+			Utility.copyTextFile("com/ibm/text/UCA/charts.css", Utility.LATIN1, folder + "charts.css");
+			Utility.copyTextFile("com/ibm/text/UCA/script_help.html", Utility.UTF8, folder + "help.html");

 			indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
 			Utility.appendFile(WORKING_DIR + "script_index_header.html", Utility.UTF8, indexFile, replacement);
@ -609,9 +611,9 @@ public class WriteCharts implements UCD_Types {
        String[] replacement = new String[] {"%%%", "Name Charts"};
        String folder = "charts\\name\\";

-        Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
-        Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
-        Utility.copyTextFile("name_help.html", Utility.UTF8, folder + "help.html");
+        Utility.copyTextFile("com/ibm/text/UCA/index.html", Utility.UTF8, folder + "index.html", replacement);
+        Utility.copyTextFile("com/ibm/text/UCA/charts.css", Utility.LATIN1, folder + "charts.css");
+        Utility.copyTextFile("com/ibm/text/UCA/name_help.html", Utility.UTF8, folder + "help.html");

        indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
        Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ 
-* $Date: 2006/04/05 22:12:46 $ 
-* $Revision: 1.43 $
+* $Date: 2006/06/08 18:16:40 $ 
+* $Revision: 1.44 $
 *
 *******************************************************************************
 */
@ -2301,7 +2301,7 @@ F900..FAFF; CJK Compatibility Ideographs
        // NOTE: we add the back map based on the string value; the smallest (UTF-16 order) string wins
        Object key = new ArrayWrapper((int[])(ces.clone()),0, len);
        if (false) {
-            Object value = backMap.get(key);
+            String value = (String)backMap.get(key);
            if (value == null) return;
            if (s.compareTo(value) >= 0) return;
        }
@ -4099,23 +4099,23 @@ F900..FAFF; CJK Compatibility Ideographs
        writeDuplicates();
        writeOverlap();
        
-        log.println("<h2>Coverage</h2>");
+        log.println("<h2>11. Coverage</h2>");
        BagFormatter bf = new BagFormatter();
        bf.setLineSeparator("<br>\r\n");
-        ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+        ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(Default.ucdVersion());
        bf.setUnicodePropertyFactory(ups);
        bf.setShowLiteral(TransliteratorUtilities.toHTML);
        bf.setFixName(TransliteratorUtilities.toHTML);
        UCD ucd = Default.ucd();
        UnicodeProperty cat = ups.getProperty("gc");
-        UnicodeSet ucd410 = cat.getSet("Cn")
+        UnicodeSet ucdCharacters = cat.getSet("Cn")
 		.addAll(cat.getSet("Co"))
 		.addAll(cat.getSet("Cs"))
 		.complement()
 		//.addAll(ups.getSet("Noncharactercodepoint=true"))
 		//.addAll(ups.getSet("Default_Ignorable_Code_Point=true"))
 		;
-        bf.showSetDifferences(log, "UCD4.1.0", ucd410, "UCA4.1.0", coverage, 3);
+        bf.showSetDifferences(log, "UCD" + Default.ucdVersion(), ucdCharacters, collator.getFileVersion(), coverage, 3);

        log.println("</body></html>");
        log.close();
--- a/tools/unicodetools/readme.html
+++ b/tools/unicodetools/readme.html
@ -4,6 +4,11 @@
 <meta http-equiv="Content-Language" content="en-us">
 <meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
 <title>New Page 18</title>
+<style>
+<!--
+li           { margin-top: 0.5em; margin-bottom: 0.5em }
+-->
+</style>
 </head>

 <body>
@ -170,14 +175,14 @@ CopyrightYear: <b>2006</b> <i> // Pick the current year</i></pre>
 		</li>
 		<li>Run&gt;Run As...<ol>
 			<li>Choose Java Application<ul>
-				<li>it will fail, don't worry; you need to set some parameters</li>
+				<li>it will fail, don't worry; you need to set some parameters.</li>
 			</ul>
 			</li>
 		</ol>
 		</li>
 		<li>Run&gt;Run...<ul>
 			<li>Select the Arguments tab, and fill in the following<ul>
-				<li>Program arguments:<pre>build 5.0 MakeUnicodeFiles</pre>
+				<li>Program arguments:<pre>build 5.0<span style="background-color: #FFFF00">.0</span> MakeUnicodeFiles</pre>
 				</li>
 				<li>VM arguments: 
 				<pre>-Xms512m -Xmx512m</pre>
@ -229,6 +234,14 @@ UNCHANGED-Diff_PropertyValueAliases-5.0.0d10.txt.bat</pre>
 		<li>On Windows you can run these BATs to compare files:</li>
 	</ol>
 	</li>
+	<li><span style="background-color: #FFFF00">NFSkippable</span><ol>
+	<li><span style="background-color: #FFFF00">ICU also needs a file that is 
+	generated with the same tool. Use the input parameter &quot;NFSkippable&quot; to 
+	generate the file NFSafeSets.txt, which is also written to </span>
+	<a href="file:///C:/DATA/GEN"><span style="background-color: #FFFF00">
+	file:///C:/DATA/GEN</span></a></li>
+</ol>
+	</li>
 </ol>
 <h3>5. Invariant Checking</h3>
 <ol>
@ -242,9 +255,63 @@ UNCHANGED-Diff_PropertyValueAliases-5.0.0d10.txt.bat</pre>
 		</li>
 		<li>Run&gt;Run As... Java Application<br>
 		Will create the following file of results:<pre><a href="file:///C:/DATA/GEN/UnicodeInvariantResults.txt/">C:\DATA\GEN\UnicodeInvariantResults.txt\</a></pre>
+		<p>The console will also report whether any problems were found. For example, in 
+		the following case there was one failure:</p>
+		<pre>ParseErrorCount=0
+TestFailureCount=1</pre>
+		</li>
+		<li>The header of the result file explains the syntax of the tests.</li>
+		<li>Open that file and search for &quot;**** START Error Info ****&quot;. Each such 
+		point provides a dump of comparison information.<ol>
+		<li>Failures print a list of differences between two sets being 
+		compared. So if A and B are being compared, it prints all the items in 
+		A-B, then in B-A, then in A&amp;B.</li>
+		<li>For example, here is a listing of a problem that must be corrected. 
+		Note that usually there is a comment that explains what the following 
+		line or lines are supposed to test. Next comes FALSE (indicating 
+		that the test failed), followed by the detailed error report.<pre><span style="font-size: 9pt"># Canonical decompositions (minus exclusions) must be identical across releases
+[$Decomposition_Type:Canonical - $Full_Composition_Exclusion] = [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion]
+
+FALSE
+**** START Error Info ****
+
+In [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion], but not in [$Decomposition_Type:Canonical - $Full_Composition_Exclusion] :
+
+# Total code points: 0
+
+Not in [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion], but in [$Decomposition_Type:Canonical - $Full_Composition_Exclusion] :
+1B06           # Lo       BALINESE LETTER AKARA TEDUNG
+1B08           # Lo       BALINESE LETTER IKARA TEDUNG
+1B0A           # Lo       BALINESE LETTER UKARA TEDUNG
+1B0C           # Lo       BALINESE LETTER RA REPA TEDUNG
+1B0E           # Lo       BALINESE LETTER LA LENGA TEDUNG
+1B12           # Lo       BALINESE LETTER OKARA TEDUNG
+1B3B           # Mc       BALINESE VOWEL SIGN RA REPA TEDUNG
+1B3D           # Mc       BALINESE VOWEL SIGN LA LENGA TEDUNG
+1B40..1B41     # Mc   [2] BALINESE VOWEL SIGN TALING TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG
+1B43           # Mc       BALINESE VOWEL SIGN PEPET TEDUNG
+
+# Total code points: 11
+
+In both [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion], and in [$Decomposition_Type:Canonical - $Full_Composition_Exclusion] :
+00C0..00C5     # L&amp;   [6] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER A WITH RING ABOVE
+00C7..00CF     # L&amp;   [9] LATIN CAPITAL LETTER C WITH CEDILLA..LATIN CAPITAL LETTER I WITH DIAERESIS
+00D1..00D6     # L&amp;   [6] LATIN CAPITAL LETTER N WITH TILDE..LATIN CAPITAL LETTER O WITH DIAERESIS
+...
+30F7..30FA     # Lo   [4] KATAKANA LETTER VA..KATAKANA LETTER VO
+30FE           # Lm       KATAKANA VOICED ITERATION MARK
+AC00..D7A3     # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
+
+# Total code points: 12089
+**** END Error Info ****</span></pre>
+		</li>
+	</ol>
+		</li>
+		<li>Options:<ol>
+		<li>-r&nbsp;&nbsp;&nbsp; Print the failures as a range list.</li>
+		<li>-fxxx&nbsp;&nbsp;&nbsp; Use a different input file, such as -fInvariantTest.txt</li>
+	</ol>
 		</li>
-		<li>Open that file and search for &quot;**** START Error Info ****&quot; Each such 
-		point provides a dump of comparison information.</li>
 	</ol>
 	</li>
 </ol>
@ -265,18 +332,48 @@ UNCHANGED-Diff_PropertyValueAliases-5.0.0d10.txt.bat</pre>
 <h3>5. UCA</h3>
 <ol>
 	<li>
-	<h3>You will use com.ibm.text.UCA.Main as your main class, creating along 
-	the same lines as above.</h3></li>
+	You will use com.ibm.text.UCA.Main as your main class, creating along 
+	the same lines as above.</li>
+	<li>To test whether the UCA files are valid, use the following
+	<span style="font-weight: 400">option (<i>note: you should also build the ICU 
+	files below, since they test other aspects</i>).</span><pre>writeCollationValidityLog</pre>
+	<p>It will create a file:</p>
+	<pre><a href="file:///C:/DATA/GEN/collation/5.0.0/CheckCollationValidity.html">C:\DATA\GEN\collation\5.0.0\CheckCollationValidity.html</a></pre>
+	<ol>
+		<li>Review this file. It will list errors. Some of those are actually 
+	warnings, and indicate possible problems (this is indicated in the text, 
+	such as by: &quot;These are not necessarily errors, but should be examined for 
+		<i>possible</i> errors&quot;). In those cases, the items should be reviewed to make 
+	sure that there are no inadvertent problems.</li>
+		<li>If it is not so marked, it is a true error, and must be fixed.</li>
+		<li>At the end, there is a section <b>11. Coverage</b>. It has two parts:<ol>
+			<li>In UCDxxx, but not in allkeys. Check this over to make sure that these 
+	are all the characters that should get <b><i>implicit</i></b> weights.</li>
+			<li>In allkeys, but not in UCD. These should be <b><i>only</i></b> 
+	contractions. Check them over to make sure they look right also.</li>
+		</ol></li>
+	</ol></li>
 	<li>
-	<h4>To build all the UCA files used by ICU, use the Program arguments:</h4>
-	<pre>Main ICU</pre>
+	<h4><span style="font-weight: 400">To build all the charts, use the options:
+	</span> </h4>
+	<pre>normalizationChart caseChart scriptChart indexChart</pre>
 	</li>
 	<li>
-	<h4>To build all the charts, use the UCA project, with options: </h4>
-	<pre>normalizationChart caseChart scriptChart indexChart</pre>
+	<h4><span style="font-weight: 400">To build all the UCA files used by ICU, use the 
+	option:</span></h4>
+	<pre>ICU</pre>
+	</li>
+	<li>You should then build a set of the ICU files for the previous version, 
+	if you don't have them. The key file is UCA_Rules_NoCE.txt. It contains the 
+	rules expressed in ICU format, which allows for comparison across versions 
+	of UCA.<ol>
+	<li>Do a Diff, and verify that all the differences are either new 
+	characters, or were authorized to be changed by the UTC.</li>
+</ol>
+
 	</li>
 </ol>

 </body>

-</html>
+</html>