ICU-5222 updates for UCD/A 5.0.0

X-SVN-Rev: 19697
This commit is contained in:
Mark Davis 2006-06-08 18:16:40 +00:00
parent 18f81012d0
commit 1d7d7f00ba
5 changed files with 153 additions and 54 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2005/05/02 15:39:54 $
* $Revision: 1.25 $
* $Date: 2006/06/08 18:16:40 $
* $Revision: 1.26 $
*
*******************************************************************************
*/
@ -1128,7 +1128,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
Normalizer skipDecomps;
Normalizer nfd;
Normalizer nfkd;
Iterator enum = null;
Iterator enum1 = null;
byte ceLimit;
int currentRange = SAMPLE_RANGES.length; // set to ZERO to enable
int startOfRange = SAMPLE_RANGES[0][0];
@ -1197,9 +1197,9 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
}
// contractions
if (enum == null) enum = ucaData.getContractions();
while (enum.hasNext()) {
result = (String)enum.next();
if (enum1 == null) enum1 = ucaData.getContractions();
while (enum1.hasNext()) {
result = (String)enum1.next();
if (result.length() == 1 && UTF16.isLeadSurrogate(result.charAt(0))) {
//System.out.println("Skipping " + ucd.getCodeAndName(result));
continue; // try again
@ -1500,9 +1500,9 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
Map missingStrings = new HashMap();
Map tempMap = new HashMap();
Iterator enum = ucaData.getContractions();
while (enum.hasNext()) {
String sequence = (String)enum.next();
Iterator enum1 = ucaData.getContractions();
while (enum1.hasNext()) {
String sequence = (String)enum1.next();
//System.out.println("Contraction: " + Utility.hex(sequence));
for (int i = sequence.length()-1; i > 0; --i) {
String shorter = sequence.substring(0,i);
@ -1520,26 +1520,26 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
// now add them. We couldn't before because we were iterating over it.
enum = tempMap.keySet().iterator();
while (enum.hasNext()) {
String shorter = (String) enum.next();
enum1 = tempMap.keySet().iterator();
while (enum1.hasNext()) {
String shorter = (String) enum1.next();
IntStack tempStack = (IntStack) tempMap.get(shorter);
ucaData.add(shorter, tempStack);
}
enum = missingStrings.keySet().iterator();
enum1 = missingStrings.keySet().iterator();
if (missingStrings.size() != 0) {
/**
while (enum.hasMoreElements()) {
String sequence = (String)enum.nextElement();
while (enum1.hasMoreElements()) {
String sequence = (String)enum1.nextElement();
getCE(sequence);
FIX LATER;
}
*/
String errorMessage = "";
while (enum.hasNext()) {
String missing = (String)enum.next();
while (enum1.hasNext()) {
String missing = (String)enum1.next();
if (errorMessage.length() != 0) errorMessage += ", ";
errorMessage += "\"" + missing + "\"";
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $
* $Date: 2005/05/02 15:39:54 $
* $Revision: 1.3 $
* $Date: 2006/06/08 18:16:40 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -294,9 +294,9 @@ public class UCA_Data implements UCA_Types {
if (collationElements[i] == CONTRACTING) ceSet.add(i);
}
UnicodeSet ceSet2 = new UnicodeSet();
Iterator enum = contractingTable.keySet().iterator();
while (enum.hasNext()) {
String sequence = (String)enum.next();
Iterator enum1 = contractingTable.keySet().iterator();
while (enum1.hasNext()) {
String sequence = (String)enum1.next();
ceSet2.add(sequence.charAt(0));
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
* $Date: 2005/04/06 08:48:17 $
* $Revision: 1.21 $
* $Date: 2006/06/08 18:16:40 $
* $Revision: 1.22 $
*
*******************************************************************************
*/
@ -266,10 +266,12 @@ public class WriteCharts implements UCD_Types {
String[] replacement = new String[] {"%%%", "Normalization Charts"};
String folder = "charts\\normalization\\";
//System.out.println("File: " + new File(".").getCanonicalPath());
Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
Utility.copyTextFile("norm_help.html", Utility.UTF8, folder + "help.html");
Utility.copyTextFile("com/ibm/text/UCA/index.html", Utility.UTF8, folder + "index.html", replacement);
Utility.copyTextFile("com/ibm/text/UCA/charts.css", Utility.LATIN1, folder + "charts.css");
Utility.copyTextFile("com/ibm/text/UCA/norm_help.html", Utility.UTF8, folder + "help.html");
indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
@ -375,9 +377,9 @@ public class WriteCharts implements UCD_Types {
String[] replacement = new String[] {"%%%", "Case Charts"};
String folder = "charts\\case\\";
Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
Utility.copyTextFile("case_help.html", Utility.UTF8, folder + "help.html");
Utility.copyTextFile("com/ibm/text/UCA/index.html", Utility.UTF8, folder + "index.html", replacement);
Utility.copyTextFile("com/ibm/text/UCA/charts.css", Utility.LATIN1, folder + "charts.css");
Utility.copyTextFile("com/ibm/text/UCA/case_help.html", Utility.UTF8, folder + "help.html");
indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
@ -487,9 +489,9 @@ public class WriteCharts implements UCD_Types {
String[] replacement = new String[] {"%%%", "Script Charts"};
String folder = "charts\\script\\";
Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
Utility.copyTextFile("script_help.html", Utility.UTF8, folder + "help.html");
Utility.copyTextFile("com/ibm/text/UCA/index.html", Utility.UTF8, folder + "index.html", replacement);
Utility.copyTextFile("com/ibm/text/UCA/charts.css", Utility.LATIN1, folder + "charts.css");
Utility.copyTextFile("com/ibm/text/UCA/script_help.html", Utility.UTF8, folder + "help.html");
indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
Utility.appendFile(WORKING_DIR + "script_index_header.html", Utility.UTF8, indexFile, replacement);
@ -609,9 +611,9 @@ public class WriteCharts implements UCD_Types {
String[] replacement = new String[] {"%%%", "Name Charts"};
String folder = "charts\\name\\";
Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
Utility.copyTextFile("name_help.html", Utility.UTF8, folder + "help.html");
Utility.copyTextFile("com/ibm/text/UCA/index.html", Utility.UTF8, folder + "index.html", replacement);
Utility.copyTextFile("com/ibm/text/UCA/charts.css", Utility.LATIN1, folder + "charts.css");
Utility.copyTextFile("com/ibm/text/UCA/name_help.html", Utility.UTF8, folder + "help.html");
indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2006/04/05 22:12:46 $
* $Revision: 1.43 $
* $Date: 2006/06/08 18:16:40 $
* $Revision: 1.44 $
*
*******************************************************************************
*/
@ -2301,7 +2301,7 @@ F900..FAFF; CJK Compatibility Ideographs
// NOTE: we add the back map based on the string value; the smallest (UTF-16 order) string wins
Object key = new ArrayWrapper((int[])(ces.clone()),0, len);
if (false) {
Object value = backMap.get(key);
String value = (String)backMap.get(key);
if (value == null) return;
if (s.compareTo(value) >= 0) return;
}
@ -4099,23 +4099,23 @@ F900..FAFF; CJK Compatibility Ideographs
writeDuplicates();
writeOverlap();
log.println("<h2>Coverage</h2>");
log.println("<h2>11. Coverage</h2>");
BagFormatter bf = new BagFormatter();
bf.setLineSeparator("<br>\r\n");
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(Default.ucdVersion());
bf.setUnicodePropertyFactory(ups);
bf.setShowLiteral(TransliteratorUtilities.toHTML);
bf.setFixName(TransliteratorUtilities.toHTML);
UCD ucd = Default.ucd();
UnicodeProperty cat = ups.getProperty("gc");
UnicodeSet ucd410 = cat.getSet("Cn")
UnicodeSet ucdCharacters = cat.getSet("Cn")
.addAll(cat.getSet("Co"))
.addAll(cat.getSet("Cs"))
.complement()
//.addAll(ups.getSet("Noncharactercodepoint=true"))
//.addAll(ups.getSet("Default_Ignorable_Code_Point=true"))
;
bf.showSetDifferences(log, "UCD4.1.0", ucd410, "UCA4.1.0", coverage, 3);
bf.showSetDifferences(log, "UCD" + Default.ucdVersion(), ucdCharacters, collator.getFileVersion(), coverage, 3);
log.println("</body></html>");
log.close();

View file

@ -4,6 +4,11 @@
<meta http-equiv="Content-Language" content="en-us">
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
<title>New Page 18</title>
<style>
<!--
li { margin-top: 0.5em; margin-bottom: 0.5em }
-->
</style>
</head>
<body>
@ -170,14 +175,14 @@ CopyrightYear: <b>2006</b> <i> // Pick the current year</i></pre>
</li>
<li>Run&gt;Run As...<ol>
<li>Choose Java Application<ul>
<li>it will fail, don't worry; you need to set some parameters</li>
<li>it will fail, don't worry; you need to set some parameters.</li>
</ul>
</li>
</ol>
</li>
<li>Run&gt;Run...<ul>
<li>Select the Arguments tab, and fill in the following<ul>
<li>Program arguments:<pre>build 5.0 MakeUnicodeFiles</pre>
<li>Program arguments:<pre>build 5.0<span style="background-color: #FFFF00">.0</span> MakeUnicodeFiles</pre>
</li>
<li>VM arguments:
<pre>-Xms512m -Xmx512m</pre>
@ -229,6 +234,14 @@ UNCHANGED-Diff_PropertyValueAliases-5.0.0d10.txt.bat</pre>
<li>On Windows you can run these BATs to compare files:</li>
</ol>
</li>
<li><span style="background-color: #FFFF00">NFSkippable</span><ol>
<li><span style="background-color: #FFFF00">ICU also needs a file that is
generated with the same tool. Use the input parameter &quot;NFSkippable&quot; to
generate the file NFSafeSets.txt, which is also located in </span>
<a href="file:///C:/DATA/GEN"><span style="background-color: #FFFF00">
file:///C:/DATA/GEN</span></a></li>
</ol>
</li>
</ol>
<h3>5. Invariant Checking</h3>
<ol>
@ -242,9 +255,63 @@ UNCHANGED-Diff_PropertyValueAliases-5.0.0d10.txt.bat</pre>
</li>
<li>Run&gt;Run As... Java Application<br>
This will create the following file of results:<pre><a href="file:///C:/DATA/GEN/UnicodeInvariantResults.txt">C:\DATA\GEN\UnicodeInvariantResults.txt</a></pre>
<p>The console output will also report whether any problems were found. For
example, in the following case there was one failure:</p>
<pre>ParseErrorCount=0
TestFailureCount=1</pre>
</li>
<li>The header of the result file explains the syntax of the tests.</li>
<li>Open that file and search for &quot;**** START Error Info ****&quot;. Each such
point provides a dump of comparison information.<ol>
<li>Failures print a list of differences between two sets being
compared. So if A and B are being compared, it prints all the items in
A-B, then in B-A, then in A&amp;B.</li>
<li>For example, here is a listing of a problem that must be corrected.
Note that there is usually a comment that explains what the following
line or lines are supposed to test. Next comes FALSE (indicating
that the test failed), followed by the detailed error report.<pre><span style="font-size: 9pt"># Canonical decompositions (minus exclusions) must be identical across releases
[$Decomposition_Type:Canonical - $Full_Composition_Exclusion] = [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion]
FALSE
**** START Error Info ****
In [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion], but not in [$Decomposition_Type:Canonical - $Full_Composition_Exclusion] :
# Total code points: 0
Not in [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion], but in [$Decomposition_Type:Canonical - $Full_Composition_Exclusion] :
1B06 # Lo BALINESE LETTER AKARA TEDUNG
1B08 # Lo BALINESE LETTER IKARA TEDUNG
1B0A # Lo BALINESE LETTER UKARA TEDUNG
1B0C # Lo BALINESE LETTER RA REPA TEDUNG
1B0E # Lo BALINESE LETTER LA LENGA TEDUNG
1B12 # Lo BALINESE LETTER OKARA TEDUNG
1B3B # Mc BALINESE VOWEL SIGN RA REPA TEDUNG
1B3D # Mc BALINESE VOWEL SIGN LA LENGA TEDUNG
1B40..1B41 # Mc [2] BALINESE VOWEL SIGN TALING TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG
1B43 # Mc BALINESE VOWEL SIGN PEPET TEDUNG
# Total code points: 11
In both [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion], and in [$Decomposition_Type:Canonical - $Full_Composition_Exclusion] :
00C0..00C5 # L&amp; [6] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER A WITH RING ABOVE
00C7..00CF # L&amp; [9] LATIN CAPITAL LETTER C WITH CEDILLA..LATIN CAPITAL LETTER I WITH DIAERESIS
00D1..00D6 # L&amp; [6] LATIN CAPITAL LETTER N WITH TILDE..LATIN CAPITAL LETTER O WITH DIAERESIS
...
30F7..30FA # Lo [4] KATAKANA LETTER VA..KATAKANA LETTER VO
30FE # Lm KATAKANA VOICED ITERATION MARK
AC00..D7A3 # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
# Total code points: 12089
**** END Error Info ****</span></pre>
</li>
</ol>
</li>
<li>Options:<ol>
<li>-r&nbsp;&nbsp;&nbsp; Print the failures as a range list.</li>
<li>-fxxx&nbsp;&nbsp;&nbsp; Use a different input file, such as -fInvariantTest.txt</li>
</ol>
</li>
<li>Open that file and search for &quot;**** START Error Info ****&quot; Each such
point provides a dump of comparison information.</li>
</ol>
</li>
</ol>
@ -265,18 +332,48 @@ UNCHANGED-Diff_PropertyValueAliases-5.0.0d10.txt.bat</pre>
<h3>6. UCA</h3>
<ol>
<li>
<h3>You will use com.ibm.text.UCA.Main as your main class, creating along
the same lines as above.</h3></li>
You will use com.ibm.text.UCA.Main as your main class, creating along
the same lines as above.</li>
<li>To test whether the UCA files are valid, use the
<span style="font-weight: 400">options (<i>note: you should also build the ICU
files below, since they test other aspects</i>).</span><pre>writeCollationValidityLog</pre>
<p>It will create a file:</p>
<pre><a href="file:///C:/DATA/GEN/collation/5.0.0/CheckCollationValidity.html">C:\DATA\GEN\collation\5.0.0\CheckCollationValidity.html</a></pre>
<ol>
<li>Review this file. It will list errors. Some of those are actually
warnings, and indicate possible problems (this is indicated in the text,
such as by: &quot;These are not necessarily errors, but should be examined for
<i>possible</i> errors&quot;). In those cases, the items should be reviewed to make
sure that there are no inadvertent problems.</li>
<li>If it is not so marked, it is a true error, and must be fixed.</li>
<li>At the end, there is a section titled <b>11. Coverage</b>. It has two parts:<ol>
<li>In UCDxxx, but not in allkeys. Check this over to make sure that these
are all the characters that should get <b><i>implicit</i></b> weights.</li>
<li>In allkeys, but not in UCD. These should be <b><i>only</i></b>
contractions. Check them over to make sure they look right also.</li>
</ol></li>
</ol></li>
<li>
<h4>To build all the UCA files used by ICU, use the Program arguments:</h4>
<pre>Main ICU</pre>
<h4><span style="font-weight: 400">To build all the charts, use the options:
</span> </h4>
<pre>normalizationChart caseChart scriptChart indexChart</pre>
</li>
<li>
<h4>To build all the charts, use the UCA project, with options: </h4>
<pre>normalizationChart caseChart scriptChart indexChart</pre>
<h4><span style="font-weight: 400">To build all the UCA files used by ICU, use the
option:</span></h4>
<pre>ICU</pre>
</li>
<li>You should then build a set of the ICU files for the previous version,
if you don't have them. The key file is UCA_Rules_NoCE.txt. It contains the
rules expressed in ICU format, which allows for comparison across versions
of UCA.<ol>
<li>Do a Diff, and verify that all the differences are either new
characters, or were authorized to be changed by the UTC.</li>
</ol>
</li>
</ol>
</body>
</html>
</html>