diff --git a/tools/unicodetools/com/ibm/text/UCA/CEList.java b/tools/unicodetools/com/ibm/text/UCA/CEList.java
index 24f0073b3a2..ce2511664fb 100644
--- a/tools/unicodetools/com/ibm/text/UCA/CEList.java
+++ b/tools/unicodetools/com/ibm/text/UCA/CEList.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $
-* $Date: 2001/08/31 00:20:40 $
-* $Revision: 1.2 $
+* $Date: 2001/09/19 23:32:21 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
@@ -102,7 +102,8 @@ public final class CEList implements java.lang.Comparable, UCD_Types {
for (int i = startOffset; i < min; ++i) {
if (contents[i] != that.contents[i + delta]) {
- if (contents[i] < that.contents[i + delta]) return -1;
+ if ((contents[i] & 0xFFFFFFFFL)
+ < (that.contents[i + delta] & 0xFFFFFFFFL)) return -1;
return 1;
}
}
@@ -158,7 +159,9 @@ public final class CEList implements java.lang.Comparable, UCD_Types {
public static String toString(int ce) {
return "[" + Utility.hex(UCA.getPrimary(ce)) + "."
+ Utility.hex(UCA.getSecondary(ce)) + "."
- + Utility.hex(UCA.getTertiary(ce)) + "](" + NAME3[UCA.getTertiary(ce)] + ")";
+ + Utility.hex(UCA.getTertiary(ce)) + "]"
+ // the NAME3 tertiary-name suffix is no longer appended: + "(" + NAME3[UCA.getTertiary(ce)] + ")"
+ ;
}
static final String[] NAME3 = {
diff --git a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
index 30b5ab84e4a..42a64314b40 100644
--- a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
+++ b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
-* $Date: 2001/09/06 01:30:31 $
-* $Revision: 1.3 $
+* $Date: 2001/09/19 23:32:21 $
+* $Revision: 1.4 $
*
*******************************************************************************
*/
@@ -43,7 +43,7 @@ public class GenOverlap implements UCD_Types {
nfd = new Normalizer(Normalizer.NFD);
nfkd = new Normalizer(Normalizer.NFKD);
- UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
+ UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
// store data for faster lookup
@@ -307,7 +307,7 @@ public class GenOverlap implements UCD_Types {
nfd = new Normalizer(Normalizer.NFD);
nfkd = new Normalizer(Normalizer.NFKD);
- UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
+ UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
// store data for faster lookup
@@ -505,7 +505,7 @@ public class GenOverlap implements UCD_Types {
//nfd = new Normalizer(Normalizer.NFD);
//nfkd = new Normalizer(Normalizer.NFKD);
- UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
+ UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
nfd = new Normalizer(Normalizer.NFD);
nfkd = new Normalizer(Normalizer.NFKD);
diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java
new file mode 100644
index 00000000000..4968b192849
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCA/Main.java
@@ -0,0 +1,20 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
+* $Date: 2001/09/19 23:31:50 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCA;
+
+public class Main {
+ public static void main(String args[]) throws Exception {
+ WriteCollationData.main(args); // for now this only delegates to WriteCollationData; TODO: pull the logic from there to here.
+ }
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java
index 6446431c460..3855c3cf0c9 100644
--- a/tools/unicodetools/com/ibm/text/UCA/UCA.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
-* $Date: 2001/09/06 01:30:31 $
-* $Revision: 1.3 $
+* $Date: 2001/09/19 23:32:21 $
+* $Revision: 1.4 $
*
*******************************************************************************
*/
@@ -765,15 +765,6 @@ final public class UCA {
*/
static final int EXCEPTION_CE_MASK = 0xFFC00000;
- /**
- * Any unsupported characters (those not in the UCA data tables)
- * are marked with a exception bit combination
- * so that they can be treated specially.
- * There are at least 34 values, so that we can use a range for surrogates
- * However, we do add to the first weight if we have surrogate pairs!
- */
- static final int UNSUPPORTED = 0xFFC20101;
-
/**
* Used to composed Hangul and Han characters
*/
@@ -781,6 +772,18 @@ final public class UCA {
static final int NEUTRAL_SECONDARY = 0x20;
static final int NEUTRAL_TERTIARY = 0x02;
+ /**
+ * Any unsupported characters (those not in the UCA data tables)
+ * are marked with an exception bit combination
+ * so that they can be treated specially.
+ * There are at least 34 values, so that we can use a range for surrogates
+ * However, we do add to the first weight if we have surrogate pairs!
+ */
+ static final int UNSUPPORTED_P = 0xFFC2;
+ static final int UNSUPPORTED = makeKey(UNSUPPORTED_P, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
+
+ // was 0xFFC20101;
+
/**
* Contracting characters are marked with a exception bit combination
* in the collationElement table.
@@ -968,9 +971,14 @@ final public class UCA {
// in code order.
// add bottom 5 bits to UNSUPPORTED, and push rest
//return UNSUPPORTED + (bigChar & 0xFFFF0000); // top bits added
+ expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0)); // pushed CE: primary = bottom 15 bits of bigChar with the 0x8000 bit (the bit just above them) turned on.
+ // secondary and tertiary are both zero
+ return makeKey(UNSUPPORTED_P + (bigChar >> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); // returned CE: primary = UNSUPPORTED_P plus the bits of bigChar above the bottom 15 (top 34 values plus UNSUPPORTED)
+ /*
expandingStack.push(((bigChar & 0x7FFF) << 16) | 0x10000000); // primary = bottom 15 bits plus turn bottom bit on.
// secondary and tertiary are both zero
return UNSUPPORTED + ((bigChar << 1) & 0xFFFF0000); // top 34 values plus UNSUPPORTED
+ */
}
if (ce == CONTRACTING) {
// Contracting is probably the most interesting (read "tricky") part
@@ -1127,11 +1135,11 @@ final public class UCA {
return new Hashtable(multiTable);
}
- public CollationContents getCollationContents(byte ceLimit, Normalizer skipDecomps) {
- return new CollationContents(ceLimit, skipDecomps);
+ public UCAContents getContents(byte ceLimit, Normalizer skipDecomps) { // skipDecomps may be null: then no character is skipped for having a decomposition
+ return new UCAContents(ceLimit, skipDecomps); // the iterator skips characters whose getCEType is >= ceLimit
}
- public class CollationContents {
+ public class UCAContents {
int current = -1;
Normalizer skipDecomps = new Normalizer(Normalizer.NFD);
Iterator enum = null;
@@ -1140,16 +1148,15 @@ final public class UCA {
/**
* use FIXED_CE as the limit
*/
- CollationContents(byte ceLimit, Normalizer skipDecomps) {
+ UCAContents(byte ceLimit, Normalizer skipDecomps) {
this.ceLimit = ceLimit;
this.skipDecomps = skipDecomps;
}
-
+
/**
- * returns a string and its ces
+ * returns a string
*/
- public String next(int[] ces, int[] len) {
-
+ public String next() {
String result = null; // null if done
// normal case
@@ -1158,7 +1165,6 @@ final public class UCA {
if (getCEType(ch) >= ceLimit) continue;
if (skipDecomps != null && skipDecomps.hasDecomposition(ch)) continue;
result = String.valueOf(ch);
- len[0] = getCEs(result, true, ces);
return result;
}
@@ -1166,11 +1172,36 @@ final public class UCA {
if (enum == null) enum = multiTable.keySet().iterator();
if (enum.hasNext()) {
result = (String)enum.next();
- len[0] = getCEs(result, true, ces);
}
return result;
}
+
+
+ /**
+ * Returns the next string from next(), or null if done; when it is non-null, len[0] is set to the value of getCEs(result, true, ces).
+ */
+ public String next(int[] ces, int[] len) {
+
+ String result = next(); // null if done
+ if (result != null) {
+ len[0] = getCEs(result, true, ces); // len is used as a one-element out-parameter; it is left untouched when done
+ }
+ return result;
+ }
+
+ int[] lengthBuffer = new int[1];
+
+ /**
+ * Fetches the next string into result: result.second is the string, result.first a new CEList built from its CEs; returns false (result untouched) when done.
+ */
+ public boolean next(Pair result) {
+ String s = next(ceListBuffer, lengthBuffer); // fills the shared buffers; lengthBuffer[0] receives the getCEs result
+ if (s == null) return false;
+ result.first = new CEList(ceListBuffer, 0, lengthBuffer[0]);
+ result.second = s;
+ return true;
+ }
}
/**
diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
new file mode 100644
index 00000000000..f61ddac467a
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
@@ -0,0 +1,213 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
+* $Date: 2001/09/19 23:31:50 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCA;
+
+import java.util.*;
+
+import java.io.*;
+import com.ibm.text.UCD.*;
+import com.ibm.text.utility.*;
+import com.ibm.text.UTF16;
+
+public class WriteCharts implements UCD_Types {
+
+ static UCD ucd;
+
+ static public void test(UCA uca) throws IOException {
+
+ uca.setAlternate(UCA.NON_IGNORABLE);
+
+ ucd = UCD.make();
+ Normalizer nfd = new Normalizer(Normalizer.NFD);
+
+ UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps
+
+ Set set = new TreeSet();
+
+ while (true) {
+ String x = cc.next();
+ if (x == null) break;
+ set.add(new Pair(uca.getSortKey(x), x));
+ }
+
+ PrintWriter output = null;
+
+ Iterator it = set.iterator();
+
+ int oldScript = -999;
+
+ int[] scriptCount = new int[LIMIT_SCRIPT];
+
+ int counter = 0;
+
+ int lastPrimary = -1;
+
+ String lastSortKey = null;
+
+ int high = uca.getSortKey("a").charAt(0);
+ int variable = UCA.getPrimary(uca.getVariableHigh());
+
+ int columnCount = 0;
+
+ indexFile = Utility.openPrintWriter("CollationCharts\\index_list.html");
+
+ indexFile.println("
";
+ columnCount = 0;
+ }
+ output.println(breaker + CLASSNAME[strength] + s
+ + "" + Utility.hex(s)
+ //+ " " + script
+ //+ " " + UCA.toString(sortKey)
+ + " ");
+ ++columnCount;
+ }
+
+ closeFile(output);
+ indexFile.println("");
+ indexFile.close();
+ }
+
+ static final String[] CLASSNAME = {
+ "",
+ " ",
+ " ",
+ " ",
+ " ",
+ " ",
+ " "};
+
+
+ static PrintWriter indexFile;
+
+ static PrintWriter openFile(int count, byte script) throws IOException {
+ String scriptName = getChunkName(script);
+ scriptName = ucd.getCase(scriptName, FULL, TITLE);
+
+ String fileName = "chart_" + scriptName + (count > 1 ? count + "" : "") + ".html";
+ PrintWriter output = Utility.openPrintWriter("CollationCharts\\" + fileName);
+ Utility.fixDot();
+ System.out.println("Writing: " + scriptName);
+
+ indexFile.println(" | " + scriptName + " ");
+ String title = "UCA: " + scriptName;
+ output.println(" ");
+ output.println("" + title + " ");
+ output.println(" ");
+ output.println("" + scriptName + " ");
+ output.println("");
+ return output;
+ }
+
+ static String getChunkName(byte script) { // name of a chart chunk; openFile uses it for the chart file name and title
+ if (script == -3) return "NULL"; // the negative codes get fixed names instead of a UCD script lookup
+ else if (script == -2) return "IGNORABLE";
+ else if (script == -1) return "VARIABLE";
+ else if (script == HIRAGANA_SCRIPT) return "KATAKANA-HIRAGANA"; // HIRAGANA_SCRIPT is labelled with a combined kana name
+ else return ucd.getScriptID_fromIndex(script); // every other value is passed to the UCD as a script index
+ }
+
+ static void closeFile(PrintWriter output) {
+ if (output == null) return;
+ output.println("
");
+ output.close();
+ }
+}
+
+
+
+ /*
+ static final IntStack p1 = new IntStack(30);
+ static final IntStack s1 = new IntStack(30);
+ static final IntStack t1 = new IntStack(30);
+ static final IntStack p2 = new IntStack(30);
+ static final IntStack s2 = new IntStack(30);
+ static final IntStack t2 = new IntStack(30);
+
+ static int getStrengthDifference(CEList ceList, CEList lastCEList) {
+ extractNonzeros(ceList, p1, s1, t1);
+ extractNonzeros(lastCEList, p2, s2, t2);
+ int temp = p1.compareTo(p2);
+ if (temp != 0) return 3;
+ temp = s1.compareTo(s2);
+ if (temp != 0) return 2;
+ temp = t1.compareTo(t2);
+ if (temp != 0) return 1;
+ return 0;
+ }
+
+ static void extractNonzeros(CEList ceList, IntStack primaries, IntStack secondaries, IntStack tertiaries) {
+ primaries.clear();
+ secondaries.clear();
+ tertiaries.clear();
+
+ for (int i = 0; i < ceList.length(); ++i) {
+ int ce = ceList.at(i);
+ int temp = UCA.getPrimary(ce);
+ if (temp != 0) primaries.push(temp);
+ temp = UCA.getSecondary(ce);
+ if (temp != 0) secondaries.push(temp);
+ temp = UCA.getTertiary(ce);
+ if (temp != 0) tertiaries.push(temp);
+ }
+ }
+ */
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
index 826bab5b94f..aa04e472ab6 100644
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
-* $Date: 2001/09/06 01:30:30 $
-* $Revision: 1.3 $
+* $Date: 2001/09/19 23:32:21 $
+* $Revision: 1.4 $
*
*******************************************************************************
*/
@@ -34,7 +34,6 @@ public class WriteCollationData implements UCD_Types {
static final boolean EXCLUDE_UNSUPPORTED = true;
static final boolean GENERATED_NFC_MISMATCHES = true;
static final boolean DO_CHARTS = true;
- static final boolean WRITE_NAME_IN_CONFORMANCE = true;
static UCA collator;
@@ -58,12 +57,13 @@ public class WriteCollationData implements UCD_Types {
ucd = UCD.make("");
if (args.length == 0) args = new String[] {"?"}; // force the help comment
- boolean hex = false;
+ boolean shortPrint = false;
for (int i = 0; i < args.length; ++i) {
String arg = args[i];
if (arg.equalsIgnoreCase("WriteRulesWithNames")) writeRules(WITH_NAMES);
else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(collator);
+ else if (arg.equalsIgnoreCase("WriteCharts")) WriteCharts.test(collator);
else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(collator);
else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(collator);
@@ -72,15 +72,15 @@ public class WriteCollationData implements UCD_Types {
else if (arg.equalsIgnoreCase("checkDisjointIgnorables")) checkDisjointIgnorables();
else if (arg.equalsIgnoreCase("writeContractions")) writeContractions();
else if (arg.equalsIgnoreCase("FractionalUCA")) writeFractionalUCA("FractionalUCA");
- else if (arg.equalsIgnoreCase("writeConformance")) writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE, hex);
- else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED, hex);
+ else if (arg.equalsIgnoreCase("writeConformance")) writeConformance("CollationTest_NON_IGNORABLE", UCA.NON_IGNORABLE, shortPrint);
+ else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) writeConformance("CollationTest_SHIFTED", UCA.SHIFTED, shortPrint);
else if (arg.equalsIgnoreCase("testCompatibilityCharacters")) testCompatibilityCharacters();
else if (arg.equalsIgnoreCase("writeCollationValidityLog")) writeCollationValidityLog();
else if (arg.equalsIgnoreCase("writeCaseExceptions")) writeCaseExceptions();
else if (arg.equalsIgnoreCase("writeJavascriptInfo")) writeJavascriptInfo();
else if (arg.equalsIgnoreCase("writeCaseFolding")) writeCaseFolding();
else if (arg.equalsIgnoreCase("javatest")) javatest();
- else if (arg.equalsIgnoreCase("hex")) hex = true;
+ else if (arg.equalsIgnoreCase("short")) shortPrint = true;
else {
System.out.println();
System.out.println("UNKNOWN OPTION (" + arg + "): must be one of the following (case-insensitive)");
@@ -339,15 +339,17 @@ public class WriteCollationData implements UCD_Types {
}
- static void writeConformance(String filename, byte option, boolean hex) throws IOException {
- UCD ucd30 = UCD.make("300");
+ static void writeConformance(String filename, byte option, boolean shortPrint) throws IOException {
+ UCD ucd30 = UCD.make("3.0.0");
- PrintWriter log = Utility.openPrintWriter(filename);
- if (!hex) log.write('\uFEFF');
+ PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt");
+ if (!shortPrint) log.write('\uFEFF');
System.out.println("Sorting");
+ int counter = 0;
for (int i = 0; i <= 0x10FFFF; ++i) {
+ Utility.dot(counter++);
if (!ucd.isRepresented(i)) continue;
addStringX(UTF32.valueOf32(i), option);
}
@@ -355,11 +357,14 @@ public class WriteCollationData implements UCD_Types {
Hashtable multiTable = collator.getContracting();
Enumeration enum = multiTable.keys();
while (enum.hasMoreElements()) {
+ Utility.dot(counter++);
addStringX((String)enum.nextElement(), option);
}
for (int i = 0; i < extraConformanceTests.length; ++i) { // put in sample non-characters
+ Utility.dot(counter++);
String s = UTF32.valueOf32(extraConformanceTests[i]);
+ Utility.fixDot();
System.out.println("Adding: " + Utility.hex(s));
addStringX(s, option);
}
@@ -367,6 +372,7 @@ public class WriteCollationData implements UCD_Types {
for (int i = 0; ; ++i) { // add first unallocated character
if (!ucd.isAssigned(i)) {
String s = UTF32.valueOf32(i);
+ Utility.fixDot();
System.out.println("Adding: " + Utility.hex(s));
addStringX(s, option);
break;
@@ -375,6 +381,7 @@ public class WriteCollationData implements UCD_Types {
for (int i = 0; i < extraConformanceRanges.length; ++i) {
+ Utility.dot(counter++);
int start = extraConformanceRanges[i][0];
int end = extraConformanceRanges[i][1];
int increment = ((end - start + 1) / 303) + 1;
@@ -388,6 +395,7 @@ public class WriteCollationData implements UCD_Types {
addStringX(end, option);
}
+ Utility.fixDot();
System.out.println("Total: " + sortedD.size());
Iterator it;
@@ -399,6 +407,7 @@ public class WriteCollationData implements UCD_Types {
String lastKey = "";
while (it.hasNext()) {
+ Utility.dot(counter);
String key = (String) it.next();
String source = (String) sortedD.get(key);
int fluff = key.charAt(key.length() - 1);
@@ -408,14 +417,12 @@ public class WriteCollationData implements UCD_Types {
//log.println(source);
String clipped = source.substring(0, source.length()-1);
String stren = source.substring(source.length()-1);
- if (hex) {
+ if (!shortPrint) {
log.print(Utility.hex(source));
- } else {
- log.print(source + "\t" + Utility.hex(clipped));
- }
- if (WRITE_NAME_IN_CONFORMANCE) {
log.print(
";\t#" + ucd.getName(clipped)+ "\t" + UCA.toString(key));
+ } else {
+ log.print(source + "\t" + Utility.hex(clipped));
}
log.println();
}
@@ -754,7 +761,7 @@ public class WriteCollationData implements UCD_Types {
int[] ces = new int[50];
- UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
+ UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
int[] lenArray = new int[1];
diLog.println("# Contractions");
@@ -819,7 +826,7 @@ public class WriteCollationData implements UCD_Types {
String s = String.valueOf(ch);
int len = collator.getCEs(s, true, ces);
*/
- UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
+ UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
int[] lenArray = new int[1];
Set sortedCodes = new TreeSet();
@@ -987,7 +994,7 @@ public class WriteCollationData implements UCD_Types {
String s = String.valueOf(ch);
int len = collator.getCEs(s, true, ces);
*/
- UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd);
+ UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd);
int[] lenArray = new int[1];
Set sortedCodes = new TreeSet();
@@ -1179,7 +1186,7 @@ public class WriteCollationData implements UCD_Types {
java.util.Comparator cm = new RuleComparator();
Map ordered = new TreeMap(cm);
- UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE,
+ UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE,
SKIP_CANONICAL_DECOMPOSIBLES ? nfd : null);
int[] lenArray = new int[1];
diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
index e4901d5ff36..15834c01ca1 100644
--- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
-* $Date: 2001/09/06 01:29:48 $
-* $Revision: 1.3 $
+* $Date: 2001/09/19 23:33:16 $
+* $Revision: 1.4 $
*
*******************************************************************************
*/
@@ -58,8 +58,10 @@ public class DerivedProperty implements UCD_Types {
DefaultIgnorable = 26,
GraphemeExtend = 27,
GraphemeBase = 28,
+
+ FC_NFC_Closure = 29,
- LIMIT = 29;
+ LIMIT = 30;
public DerivedProperty(UCD ucd) {
@@ -156,8 +158,8 @@ public class DerivedProperty implements UCD_Types {
compName = "NFD for the character";
}
header = "# Derived Property: " + name
- + "\r\n# Normalized form " + NAME[i-GenNFD] + ", where DIFFERENT from " + compName + "."
- + "\r\n# HANGUL SYLLABLES are algorithmically decomposed, and not listed explicitly."
+ + "\r\n# Lists characters in normalized form " + NAME[i-GenNFD] + "." // NAME is indexed by the offset of i from GenNFD
+ + "\r\n# Only those characters whose normalized forms are DIFFERENT from " + compName + " are listed!"
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
+ "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
}
@@ -422,6 +424,25 @@ of characters, the first of which has a non-zero combining class.
boolean hasProperty(int cp) { return getProperty(cp).length() != 0; }
};
+ dprops[FC_NFC_Closure] = new DProp() {
+ {
+ name = "FC_NFC_Closure";
+ header = "# Derived Property: " + name
+ + "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));"
+ + "\r\n# Then if (c != b) add the mapping from a to c to the set of"
+ + "\r\n# mappings that constitute the FC_NFC_Closure list";
+ }
+ public boolean propertyVaries() {return true;} // default
+ public String getProperty(int cp) {
+ if (!ucdData.isRepresented(cp)) return ""; // empty string means "no mapping" (see hasProperty)
+ String b = nfc.normalize(fold(cp)); // b = NFC(Fold(a)), with a = cp
+ String c = nfc.normalize(fold(b)); // c = NFC(Fold(b))
+ if (c.equals(b)) return ""; // a second fold+NFC pass changes nothing: no mapping
+ return "FN; " + Utility.hex(c); // otherwise report c, in hex, as the mapping for cp
+ } // default
+ boolean hasProperty(int cp) { return getProperty(cp).length() != 0; } // true exactly when getProperty returns a mapping
+ };
+
for (int i = QuickNFD; i <= QuickNFKC; ++i) {
dprops[i] = new QuickDProp(i);
}
diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java
index 190d1473bfd..6d6329c6e98 100644
--- a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
-* $Date: 2001/09/06 01:29:48 $
-* $Revision: 1.4 $
+* $Date: 2001/09/19 23:33:16 $
+* $Revision: 1.5 $
*
*******************************************************************************
*/
@@ -27,7 +27,7 @@ final class DerivedPropertyLister extends PropertyLister {
int width;
boolean varies;
- public DerivedPropertyLister(UCD ucd, int propMask, PrintStream output) {
+ public DerivedPropertyLister(UCD ucd, int propMask, PrintWriter output) {
this.propMask = propMask;
this.output = output;
this.ucdData = ucd;
@@ -87,7 +87,7 @@ final class DerivedPropertyLister extends PropertyLister {
String last;
public byte status(int cp) {
- if (!ucdData.isAssigned(cp)) return EXCLUDE;
+ if (!ucdData.isAssigned(cp) && propMask != DerivedProperty.DefaultIgnorable) return EXCLUDE;
if (!varies) {
return dprop.hasProperty(cp, propMask) ? INCLUDE : EXCLUDE;
}
diff --git a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
index c58fdf345d6..6710c92effa 100644
--- a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
-* $Date: 2001/08/31 00:30:17 $
-* $Revision: 1.2 $
+* $Date: 2001/09/19 23:33:16 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
@@ -17,14 +17,11 @@ import java.io.*;
class DiffPropertyLister extends PropertyLister {
private UCD oldUCD;
- public DiffPropertyLister(String oldUCDName, String newUCDName, PrintStream output) {
+ public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output) {
this.output = output;
this.ucdData = UCD.make(newUCDName);
if (oldUCDName != null) this.oldUCD = UCD.make(oldUCDName);
- }
-
- public byte status (int cp) {
- return INCLUDE;
+ breakByCategory = false;
}
public String propertyName(int cp) {
@@ -42,14 +39,23 @@ class DiffPropertyLister extends PropertyLister {
*/
- public byte status(int lastCp, int cp) {
+ public byte status(int cp) { // INCLUDE iff cp is allocated in ucdData and, when oldUCD is set, not allocated in oldUCD
/*if (cp == 0xFFFF) {
System.out.println("# " + Utility.hex(cp));
}
*/
return ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp)) ? INCLUDE : EXCLUDE;
}
-
+
+ public String headerString() { // header line: names both versions when diffing, otherwise only the new one
+ if (oldUCD != null) {
+ return "# Differences between " + ucdData.getVersion() + " and " + oldUCD.getVersion();
+ } else { // oldUCD stays unset when the constructor is given a null oldUCDName
+ return "# Allocated as of " + ucdData.getVersion();
+ }
+ }
+
+ /*
public int print() {
String status;
if (oldUCD != null) {
@@ -73,6 +79,7 @@ class DiffPropertyLister extends PropertyLister {
output.println();
return count;
}
+ */
}
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
index 6c2a5ad16e6..779a33a2ecd 100644
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
-* $Date: 2001/08/31 00:30:17 $
-* $Revision: 1.2 $
+* $Date: 2001/09/19 23:33:16 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
@@ -20,7 +20,7 @@ import com.ibm.text.utility.*;
public class GenerateCaseFolding implements UCD_Types {
public static boolean DEBUG = false;
- public static UCD ucd = UCD.make("310");
+ public static UCD ucd = UCD.make("");
public static void main(String[] args) throws java.io.IOException {
makeCaseFold();
@@ -285,71 +285,4 @@ public class GenerateCaseFolding implements UCD_Types {
}
return result + "}";
}
-
- static final void getAge() throws IOException {
- PrintStream log = new PrintStream(
- new BufferedOutputStream (
- new FileOutputStream("UnicodeAge.txt"),
- 4*1024));
- try {
- log.println("# Derived file showing when various code points were allocated in Unicode");
- log.println("# author: M. Davis");
- log.println("# generated: " + new Date());
- log.println("# Notes:");
- log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 110 listing.");
- log.println("# - The supplementary private use code points, although allocated earlier,");
- log.println("# were NOT specifically listed in the UCD until 3.0.1, and are not included until then.");
- new DiffPropertyLister(null, "110", log).print();
- new DiffPropertyLister("110", "200", log).print();
- new DiffPropertyLister("200", "210", log).print();
- new DiffPropertyLister("210", "300", log).print();
- new DiffPropertyLister("300", "310", log).print();
- /*
- printDiff("110", "200");
- UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
- UnicodeSet u20 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.0.txt", false);
- UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false);
- UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false);
- UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false);
-
- log.println();
- log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): "
- + n.format(u11.count()));
- log.println();
- u11.print(log, false, false, "1.1");
-
- UnicodeSet u20m = new UnicodeSet(u20).remove(u11);
- log.println();
- log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): "
- + n.format(u20m.count()));
- log.println();
- u20m.print(log, false, false, "2.0");
-
- UnicodeSet u21m = new UnicodeSet(u21).remove(u20);
- log.println();
- log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): "
- + n.format(u21m.count()));
- log.println();
- u21m.print(log, false, false, "2.1");
-
- UnicodeSet u30m = new UnicodeSet(u30).remove(u21);
- log.println();
- log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): "
- + n.format(u30m.count()));
- log.println();
- u30m.print(log, false, false, "3.0");
-
- UnicodeSet u31m = new UnicodeSet(u31).remove(u30);
- log.println();
- log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): "
- + n.format(u31m.count()));
- log.println();
- u31m.print(log, false, false, "3.1");
- */
- } finally {
- if (log != null) log.close();
- }
-
- }
-
}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
index f1202517a67..878d6899abe 100644
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
-* $Date: 2001/09/06 01:29:48 $
-* $Revision: 1.5 $
+* $Date: 2001/09/19 23:33:16 $
+* $Revision: 1.6 $
*
*******************************************************************************
*/
@@ -22,9 +22,9 @@ import com.ibm.text.utility.*;
public class GenerateData implements UCD_Types {
- public static void main (String[] args) throws IOException {
+ public static void main (String inVersion, String[] args) throws IOException {
System.out.println("START");
- ucd = UCD.make();
+ ucd = UCD.make(inVersion);
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
String version = ucd.getVersion();
@@ -36,10 +36,7 @@ public class GenerateData implements UCD_Types {
Utility.fixDot();
System.out.println("Argument: " + args[i]);
- if (arg.equalsIgnoreCase("version")) {
- version = args[++i];
- ucd = UCD.make(version);
- } else if (arg.equalsIgnoreCase("partition")) {
+ if (arg.equalsIgnoreCase("partition")) {
partitionProperties();
} else if (arg.equalsIgnoreCase("list")) {
listProperties();
@@ -91,9 +88,12 @@ public class GenerateData implements UCD_Types {
} else if (arg.equalsIgnoreCase("DerivedCoreProperties")) {
mask = Utility.setBits(0, DerivedProperty.PropMath, DerivedProperty.Mod_ID_Continue_NO_Cf);
- mask = Utility.setBits(mask, DerivedProperty.DefaultIgnorable, DerivedProperty.LIMIT-1);
+ mask = Utility.setBits(mask, DerivedProperty.DefaultIgnorable, DerivedProperty.FC_NFC_Closure-1);
generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-" + version );
+ } else if (arg.equalsIgnoreCase("DerivedAge")) {
+ generateAge("DerivedAge-" + version );
+
} else if (arg.equalsIgnoreCase("DerivedLineBreak")) {
generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED,
"DerivedLineBreak-" + version );
@@ -181,7 +181,7 @@ public class GenerateData implements UCD_Types {
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
- public static void doHeader(String fileName, PrintStream output, int headerChoice) {
+ public static void doHeader(String fileName, PrintWriter output, int headerChoice) {
output.println("# " + fileName + ".txt");
output.println("#");
if (headerChoice == HEADER_SCRIPTS) {
@@ -203,7 +203,7 @@ public class GenerateData implements UCD_Types {
}
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
- PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName + "dX.txt"));
+ PrintWriter output = Utility.openPrintWriter(fileName + "dX.txt");
doHeader(fileName, output, headerChoice);
for (int i = 0; i < DerivedProperty.LIMIT; ++i) {
if ((bitMask & (1<
+ * See UTR#15 for details.
+ * Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.
+ * The Unicode Consortium makes no expressed or implied warranty of any
+ * kind, and assumes no liability for errors or omissions.
+ * No liability is assumed for incidental and consequential damages
+ * in connection with or arising out of the use of the information here.
+ * @author Mark Davis
+ */
+
+public class NormalizerSample implements UCD_Types {
+ static final String copyright = "Copyright (C) 2001, IBM Corp. and Unicode Inc. All Rights Reserved.";
+
+ public static boolean SHOW_PROGRESS = false;
+
+ /**
+ * Create a normalizer for a given form, using the data of one Unicode version.
+ * @param form the form selector; its COMPOSITION_MASK and COMPATIBILITY_MASK
+ * bits are tested to set the two mode flags
+ * @param unicodeVersion version handed to getData; an empty string selects
+ * UCD.latestVersion there
+ */
+ public NormalizerSample(byte form, String unicodeVersion) {
+ this.composition = (form & COMPOSITION_MASK) != 0;
+ this.compatibility = (form & COMPATIBILITY_MASK) != 0;
+ this.data = getData(unicodeVersion);
+ }
+
+ /**
+ * Create a normalizer for a given form, passing an empty version string
+ * on to the two-argument constructor.
+ * @param form the form selector (see NFD, NFKD, NFC, NFKC)
+ */
+ public NormalizerSample(byte form) {
+ this(form,"");
+ }
+
+ /**
+ * Masks for the form selector
+ */
+ public static final byte
+ COMPATIBILITY_MASK = 1,
+ COMPOSITION_MASK = 2;
+
+ /**
+ * Normalization Form Selector
+ */
+ public static final byte
+ NFD = 0 ,
+ NFKD = COMPATIBILITY_MASK,
+ NFC = COMPOSITION_MASK,
+ NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
+
+ /**
+ * Normalizes text according to the chosen form,
+ * replacing contents of the target buffer.
+ * @param source the original text, unnormalized
+ * @param target the buffer that receives the normalized text; any previous
+ * contents are discarded
+ * @return the target buffer
+ */
+ public StringBuffer normalize(String source, StringBuffer target) {
+     // Honor the documented contract and start from an empty buffer.
+     // Decomposition inserts relative to the end of target and composition
+     // rescans target from offset 0, so stale contents would otherwise be
+     // kept (empty source) or mixed into the result.
+     target.setLength(0);
+
+     // First decompose the source into target,
+     // then compose if the form requires.
+     if (source.length() != 0) {
+         internalDecompose(source, target);
+         if (composition) {
+             internalCompose(target);
+         }
+     }
+     return target;
+ }
+
+ /**
+ * Normalizes text according to the chosen form, using a fresh buffer.
+ * @param source the original text, unnormalized
+ * @return the resulting normalized text
+ */
+ public String normalize(String source) {
+     StringBuffer scratch = new StringBuffer();
+     StringBuffer normalized = normalize(source, scratch);
+     return normalized.toString();
+ }
+
+ /**
+ * Normalizes a single code point according to the chosen form
+ * @param cp the code point to normalize
+ * @return the normalized text for that code point
+ */
+ public String normalize(int cp) {
+ return normalize(UTF16.valueOf(cp));
+ }
+
+ /**
+ */
+ private StringBuffer hasDecompositionBuffer = new StringBuffer();
+
+ /**
+ * Tests whether normalizing a single code point changes it.
+ * Reuses this instance's hasDecompositionBuffer as scratch space.
+ * @param cp the code point to test
+ * @return true if the normalized form is anything other than cp itself
+ */
+ public boolean hasDecomposition(int cp) {
+     hasDecompositionBuffer.setLength(0);
+     normalize(UTF16.valueOf(cp), hasDecompositionBuffer);
+     // Compare in code points, not chars: an unchanged supplementary code
+     // point occupies two UTF-16 units, so a fixed length of 1 would flag
+     // every supplementary character as decomposing.
+     if (hasDecompositionBuffer.length() != UTF16.getCharCount(cp)) return true;
+     return cp != UTF16.charAt(hasDecompositionBuffer, 0);
+ }
+
+ /**
+ * Utility: Checks whether there is a recursive decomposition of a character from the
+ * Unicode Character Database. It is compatibility or canonical according to the particular
+ * normalizer.
+ * @param ch the source character
+ * @return what the underlying data reports for this normalizer's composition
+ * and compatibility settings
+ */
+ public boolean normalizationDiffers(int ch) {
+ return data.normalizationDiffers(ch, composition, compatibility);
+ }
+
+ /**
+ * Utility: Gets recursive decomposition of a character from the
+ * Unicode Character Database. The kind of decomposition follows this
+ * normalizer's compatibility setting: if false the recursive canonical
+ * decomposition is selected, otherwise the recursive compatibility AND
+ * canonical decomposition.
+ * @param ch the source character
+ * @param buffer buffer that the decomposition is appended to
+ */
+ public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
+ data.getRecursiveDecomposition(ch, buffer, compatibility);
+ }
+
+
+ // ======================================
+ // PRIVATES
+ // ======================================
+
+ /**
+ * The current form.
+ */
+ private boolean composition;
+ private boolean compatibility;
+
+ /**
+ * Decomposes text, either canonical or compatibility according to this
+ * normalizer's compatibility flag, and adds the result to the target buffer.
+ * Characters with a non-zero canonical class are inserted before any
+ * trailing characters of the target that have a higher class, so existing
+ * target contents take part in the reordering.
+ * @param source the original text, unnormalized
+ * @param target the buffer receiving the decomposed text
+ */
+ private void internalDecompose(String source, StringBuffer target) {
+ StringBuffer buffer = new StringBuffer();
+ int ch32;
+ // step through source one code point (1 or 2 chars) at a time
+ for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
+ buffer.setLength(0);
+ ch32 = UTF16.charAt(source, i);
+ data.getRecursiveDecomposition(ch32, buffer, compatibility);
+
+ // add all of the characters in the decomposition.
+ // (may be just the original character, if there was
+ // no decomposition mapping)
+
+ int ch;
+ for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
+ ch = UTF16.charAt(buffer, j);
+ int chClass = data.getCanonicalClass(ch);
+ int k = target.length(); // insertion point
+ if (chClass != 0) {
+
+ // bubble-sort combining marks as necessary
+ // (walk k backwards while the preceding character has a higher class)
+
+ int ch2;
+ for (; k > 0; k -= UTF16.getCharCount(ch2)) {
+ ch2 = UTF16.charAt(target, k-1);
+ if (data.getCanonicalClass(ch2) <= chClass) break;
+ }
+ }
+ target.insert(k, UTF16.valueOf(ch));
+ }
+ }
+ }
+
+ /**
+ * Composes text in place. Target must already
+ * have been decomposed, and must not be empty: the first code point is
+ * read unconditionally.
+ * Uses UTF16, which is a utility class for supplementary character support in Java.
+ * @param target input: decomposed text.
+ * output: the resulting normalized text.
+ */
+ private void internalCompose(StringBuffer target) {
+ // starterPos/starterCh: where the current starter sits and its (possibly
+ // already composed) value; initially the first code point, whatever its class
+ int starterPos = 0;
+ int starterCh = UTF16.charAt(target,0);
+ // compPos: write position for the next character that is kept uncomposed
+ int compPos = UTF16.getCharCount(starterCh); // length of last composition
+ int lastClass = data.getCanonicalClass(starterCh);
+ // 256 is presumably above every real combining class, which makes the
+ // "lastClass < chClass" test below fail for this leading mark
+ if (lastClass != 0) lastClass = 256; // fix for strings starting with a combining mark
+ int oldLen = target.length();
+
+ // Loop on the decomposed characters, combining where possible
+
+ int ch;
+ for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
+ ch = UTF16.charAt(target, decompPos);
+ if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
+ + ", decompPos: " + decompPos
+ + ", compPos: " + compPos
+ + ", ch: " + Utility.hex(ch)
+ );
+ int chClass = data.getCanonicalClass(ch);
+ int composite = data.getPairwiseComposition(starterCh, ch);
+ // compose only if the pair has a composite and the previously kept
+ // character has class 0 or a lower class than ch
+ if (composite != data.NOT_COMPOSITE
+ && (lastClass < chClass || lastClass == 0)) {
+ UTF16.setCharAt(target, starterPos, composite);
+ // we know that we will only be replacing non-supplementaries by non-supplementaries
+ // so we don't have to adjust the decompPos
+ starterCh = composite;
+ } else {
+ // ch is kept: a class-0 character becomes the new starter
+ if (chClass == 0) {
+ starterPos = compPos;
+ starterCh = ch;
+ }
+ lastClass = chClass;
+ // copy ch down to the write position
+ UTF16.setCharAt(target, compPos, ch);
+ if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
+ System.out.println("ADJUSTING: " + Utility.hex(target));
+ decompPos += target.length() - oldLen;
+ oldLen = target.length();
+ }
+ compPos += UTF16.getCharCount(ch);
+ }
+ }
+ // drop everything past the last kept character
+ target.setLength(compPos);
+ }
+
+ // The following class makes use of the UCD class, which accesses data in the Unicode Character Database
+
+ /**
+ * Normalization data for one version of the Unicode Character Database:
+ * the pairwise composition table and the bit sets filled while building it.
+ */
+ static class Stub {
+     private UCD ucd;
+     private HashMap compTable = new HashMap();
+     private BitSet isSecond = new BitSet();
+     private BitSet canonicalRecompose = new BitSet();
+     private BitSet compatibilityRecompose = new BitSet();
+     static final int NOT_COMPOSITE = 0xFFFF;
+
+     /**
+      * Builds the tables by scanning all code points of the given UCD version.
+      * @param version version string handed to UCD.make
+      */
+     Stub(String version) {
+         ucd = UCD.make(version);
+         // inclusive bound: the scan covers U+10FFFF as well
+         for (int i = 0; i <= 0x10FFFF; ++i) {
+             if (!ucd.isAssigned(i)) continue;
+             if (ucd.isPUA(i)) continue;
+             if (ucd.isTrailingJamo(i)) isSecond.set(i);
+             byte dt = ucd.getDecompositionType(i);
+             if (dt != CANONICAL) continue;
+             if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
+                 try {
+                     String s = ucd.getDecompositionMapping(i);
+                     int len = UTF16.countCodePoint(s);
+                     if (len != 2) {
+                         if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
+                         continue;
+                     }
+                     int a = UTF16.charAt(s, 0);
+                     // only pairs that begin with a class-0 character are recorded
+                     if (ucd.getCombiningClass(a) != 0) continue;
+
+                     int b = UTF16.charAt(s, UTF16.getCharCount(a));
+                     isSecond.set(b);
+
+                     // have a recomposition, so set the bit
+                     canonicalRecompose.set(i);
+
+                     // set the compatibility recomposition bit
+                     // ONLY if the component characters
+                     // don't compatibility decompose
+                     if (ucd.getDecompositionType(a) <= CANONICAL
+                             && ucd.getDecompositionType(b) <= CANONICAL) {
+                         compatibilityRecompose.set(i);
+                     }
+
+                     compTable.put(new Long(pairKey(a, b)), new Integer(i));
+                 } catch (Exception e) {
+                     throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
+                 }
+             }
+         }
+     }
+
+     /** Packs two code points into the key used by compTable. */
+     private static long pairKey(int first, int second) {
+         return (((long)first)<<32) | second;
+     }
+
+     /** Returns the combining class the UCD reports for cp. */
+     short getCanonicalClass(int cp) {
+         return ucd.getCombiningClass(cp);
+     }
+
+     /**
+      * Returns true if cp was marked in isSecond: a trailing Jamo, or the
+      * second character of a recorded two-character canonical mapping.
+      */
+     boolean isTrailing(int cp) {
+         return isSecond.get(cp);
+     }
+
+     /**
+      * Answers from the decomposition type alone whether cp is affected by
+      * the selected form.
+      * @param cp the code point to test
+      * @param composition true for the composing forms
+      * @param compatibility true for the compatibility forms
+      */
+     boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
+         byte dt = ucd.getDecompositionType(cp);
+         if (!composition) {
+             if (compatibility) return dt >= CANONICAL;
+             else return dt == CANONICAL;
+         } else {
+             // almost the same, except that we add back in the characters
+             // that RECOMPOSE
+             if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
+             else return dt == CANONICAL && !canonicalRecompose.get(cp);
+         }
+     }
+
+     /**
+      * Appends the recursive decomposition of cp to buffer.
+      * @param cp the code point to decompose
+      * @param buffer receives the result; nothing is removed from it
+      * @param compatibility if true, mappings of type above CANONICAL are applied too
+      */
+     public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
+         byte dt = ucd.getDecompositionType(cp);
+         // we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
+         if (dt == CANONICAL || dt > CANONICAL && compatibility) {
+             String s = ucd.getDecompositionMapping(cp);
+             // cp is reused as the loop's current code point
+             for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+                 cp = UTF16.charAt(s, i);
+                 getRecursiveDecomposition(cp, buffer, compatibility);
+             }
+         } else {
+             UTF16.append(buffer, cp);
+         }
+     }
+
+     /**
+      * Looks up the composite of a starter and a following character.
+      * @return the composite, or NOT_COMPOSITE if the pair does not compose
+      */
+     int getPairwiseComposition(int starterCh, int ch) {
+         int hangulPoss = UCD.composeHangul(starterCh, ch);
+         // 0xFFFF is treated as composeHangul's "no result" value
+         if (hangulPoss != 0xFFFF) return hangulPoss;
+         Object obj = compTable.get(new Long(pairKey(starterCh, ch)));
+         if (obj == null) return NOT_COMPOSITE;
+         return ((Integer)obj).intValue();
+     }
+
+ }
+
+ /**
+ * Contains normalization data from the Unicode Character Database.
+ * use false for the minimal set, true for the real set.
+ */
+ private Stub data;
+
+ private static HashMap versionCache = new HashMap();
+
+ // Returns the Stub for a version, building and caching it on first request;
+ // an empty version string stands for UCD.latestVersion.
+ private static Stub getData (String version) {
+     String key = (version.length() == 0) ? UCD.latestVersion : version;
+     Object cached = versionCache.get(key);
+     if (cached != null) {
+         return (Stub) cached;
+     }
+     Stub created = new Stub(key);
+     versionCache.put(key, created);
+     return created;
+ }
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java
index aff3ad03a2d..5ef990e8811 100644
--- a/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
-* $Date: 2001/08/31 00:30:17 $
-* $Revision: 1.2 $
+* $Date: 2001/09/19 23:33:16 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
@@ -15,6 +15,7 @@ package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
+import java.text.NumberFormat;
abstract public class PropertyLister implements UCD_Types {
@@ -24,9 +25,10 @@ abstract public class PropertyLister implements UCD_Types {
protected UCD ucdData;
- protected PrintStream output;
+ protected PrintWriter output;
protected boolean showOnConsole;
protected boolean usePropertyComment = true;
+ protected boolean breakByCategory = true;
protected int firstRealCp = -2;
protected int lastRealCp = -2;
protected boolean alwaysBreaks = false; // set to true if property only breaks
@@ -51,7 +53,7 @@ abstract public class PropertyLister implements UCD_Types {
}
public String optionalComment(int cp) {
- if (!usePropertyComment) return "";
+ if (!usePropertyComment || !breakByCategory) return "";
int cat = ucdData.getCategory(cp);
if (cat == Lt || cat == Ll || cat == Lu) return "L&";
return ucdData.getCategoryID(cp);
@@ -167,7 +169,7 @@ abstract public class PropertyLister implements UCD_Types {
if (s == INCLUDE && firstRealCp != -1) {
byte cat = ucdData.getCategory(cp);
if (cat == Lt || cat == Ll) cat = Lu;
- if (cat != firstRealCpCat) s = BREAK;
+ if (breakByCategory && cat != firstRealCpCat) s = BREAK;
}
switch(s) {
@@ -208,9 +210,12 @@ abstract public class PropertyLister implements UCD_Types {
}
if (count == 0) System.out.println("WARNING -- ZERO COUNT FOR " + header);
+ NumberFormat nf = NumberFormat.getInstance();
+ nf.setMaximumFractionDigits(0);
output.println();
- output.println("# Total code points: " + count);
+ output.println("# Total code points: " + nf.format(count));
output.println();
return count;
}
+
}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java
index 173f4d8fc4c..8608921214d 100644
--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2001/09/01 00:06:15 $
-* $Revision: 1.3 $
+* $Date: 2001/09/19 23:33:16 $
+* $Revision: 1.4 $
*
*******************************************************************************
*/
@@ -145,7 +145,7 @@ public class TestData implements UCD_Types {
static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2;
- public static void doHeader(String fileName, PrintStream output, int headerChoice) {
+ public static void doHeader(String fileName, PrintWriter output, int headerChoice) {
output.println("# " + fixFile(fileName));
output.println("#");
if (headerChoice == HEADER_SCRIPTS) {
@@ -167,8 +167,8 @@ public class TestData implements UCD_Types {
}
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
- ucd = UCD.make("310");
- PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName));
+ ucd = UCD.make("3.1.0");
+ PrintWriter output = Utility.openPrintWriter(fileName);
doHeader(fileName, output, headerChoice);
for (int i = 0; i < 32; ++i) {
if ((bitMask & (1< 0xFFFF) return false;
return true; // Noncharacter
}
+ if (major >= 2 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true;
if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && major >= 3 && minor >= 1) return true;
return false;
}
@@ -438,6 +440,21 @@ public final class UCD implements UCD_Types {
public byte getScript(int codePoint) {
return get(codePoint, false).script;
}
+
+
+ /**
+ * Gets a script for a whole string: the script of its last code point
+ * whose script is not INHERITED_SCRIPT.
+ * @param s the string to examine; may be null
+ * @return COMMON_SCRIPT if s is null, empty, or holds only inherited-script
+ * code points; otherwise the script of the last non-inherited code point
+ */
+ public byte getScript(String s) {
+ byte result = COMMON_SCRIPT;
+ if (s == null || s.length() == 0) return result;
+ int cp;
+ for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
+ cp = UTF32.char32At(s, i);
+ byte script = getScript(cp);
+ if (script == INHERITED_SCRIPT) continue;
+ // each later non-inherited script overwrites the earlier one
+ result = script;
+ }
+ return result;
+ }
+
public byte getAge(int codePoint) {
return get(codePoint, false).age;
diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
index d09b0a59dc0..53175278cea 100644
--- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
+++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
-* $Date: 2001/08/31 00:29:50 $
-* $Revision: 1.2 $
+* $Date: 2001/09/19 23:33:16 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
@@ -15,8 +15,8 @@ package com.ibm.text.UCD;
public interface UCD_Types {
public static final String DATA_DIR = "C:\\DATA\\";
- public static final String BIN_DIR = DATA_DIR + "\\BIN\\";
- public static final String GEN_DIR = DATA_DIR + "\\GEN\\";
+ public static final String BIN_DIR = DATA_DIR + "BIN\\";
+ public static final String GEN_DIR = DATA_DIR + "GEN\\";
static final byte BINARY_FORMAT = 5; // bumped if binary format of UCD changes
diff --git a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
index 663b81bf99a..c7da3dd5fdd 100644
--- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
-* $Date: 2001/09/06 01:29:48 $
-* $Revision: 1.4 $
+* $Date: 2001/09/19 23:33:15 $
+* $Revision: 1.5 $
*
*******************************************************************************
*/
@@ -20,6 +20,7 @@ import java.math.BigDecimal;
import java.util.*;
import java.io.*;
//import java.text.*;
+import com.ibm.text.*;
import com.ibm.text.utility.*;
@@ -331,6 +332,7 @@ public class VerifyUCD implements UCD_Types {
System.out.println("Checking Prohibited and Unassigned");
System.out.println();
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
+ Utility.dot(cp);
if (mappedOut.get(cp)) continue;
boolean ucdUnassigned = !ucd.isAllocated(cp);
@@ -339,33 +341,89 @@ public class VerifyUCD implements UCD_Types {
boolean idnProhibited = prohibited.get(cp);
if (ucdUnassigned && !idnUnassigned) {
- showError("UCD Unassigned but not IDN Unassigned: ", cp);
+ showError("?UCD Unassigned but not IDN Unassigned", cp, "");
++errorCount;
} else if (!ucdUnassigned && idnUnassigned) {
- showError("Not UCD Unassigned but IDN Unassigned: ", cp);
+ showError("?Not UCD Unassigned but IDN Unassigned", cp, "");
++errorCount;
}
if (idnProhibited && unassigned.get(cp)) {
- showError("Both IDN Unassigned AND IDN Prohibited: ", cp);
+ showError("?Both IDN Unassigned AND IDN Prohibited", cp, "");
++errorCount;
}
if (guess && !idnProhibited) {
- showError("UCD ?prohibited? but not IDN Prohibited: ", cp);
+ showError("?UCD ?prohibited? but not IDN Prohibited ", cp, "");
++errorCount;
} else if (!guess && idnProhibited) {
- showError("Not UCD ?prohibited? but IDN Prohibited: ", cp);
+ showError("?Not UCD ?prohibited? but IDN Prohibited ", cp, "");
++errorCount;
}
+
+ if (cp == 0x3131) {
+ System.out.println("Debug: " + idnProhibited
+ + ", " + idnUnassigned
+ + ", " + nfkc.hasDecomposition(cp)
+ + ", " + ucd.getCodeAndName(nfkc.normalize(cp))
+ + ", " + ucd.getCodeAndName(nfc.normalize(cp)));
+ }
+
+ if (!idnProhibited && ! idnUnassigned && nfkc.hasDecomposition(cp)) {
+ String kc = nfkc.normalize(cp);
+ String c = nfc.normalize(cp);
+ if (kc.equals(c)) continue;
+ int cp2;
+ boolean excluded = false;
+ for (int j = 0; j < kc.length(); j += UTF16.getCharCount(cp2)) {
+ cp2 = UTF16.charAt(kc, j);
+ if (prohibited.get(cp2)) {
+ showError("Prohibited with NFKC, but output with NFC", cp, "");
+ excluded = true;
+ break;
+ }
+ }
+ if (!excluded) {
+ showError("Remapped to core abstract character with NFKC (but not NFC)", cp, ""); // , "\t=> " + ucd.getCodeAndName(kc));
+ }
+ }
}
- System.out.println();
- System.out.println("Total Errors: " + errorCount);
+ System.out.println("Writing IDNCheck.txt");
+
+
+ PrintWriter log = Utility.openPrintWriter("IDNCheck.txt");
+ log.println("IDN Check");
+ log.println("Total Errors: " + errorCount);
+
+ Iterator it = idnMap.keySet().iterator();
+ while (it.hasNext()) {
+ String description = (String) it.next();
+ Map map = (Map) idnMap.get(description);
+ log.println();
+ log.println(description);
+ log.println("Total: " + map.size());
+ log.println();
+
+ Iterator it2 = map.keySet().iterator();
+ while (it2.hasNext()) {
+ Object key = it2.next();
+ String line = (String) map.get(key);
+ log.println(" " + line);
+ }
+ }
+ log.close();
}
+
+ static Map idnMap = new HashMap();
- static void showError(String description, int cp) {
- System.out.println(description + ucd.getCodeAndName(cp) + " (" + ucd.getCategoryID(cp) + ")");
+ /**
+ * Records one line for a code point in idnMap instead of printing it.
+ * Lines are grouped by description; each group is a TreeMap keyed by the
+ * code point, created the first time the description is seen.
+ * @param description the group the line is filed under
+ * @param cp the code point being reported
+ * @param option text appended to the recorded line
+ */
+ static void showError(String description, int cp, String option) {
+ Map probe = (Map) idnMap.get(description);
+ if (probe == null) {
+ probe = new TreeMap();
+ idnMap.put(description, probe);
+ }
+ probe.put(new Integer(cp), ucd.getCodeAndName(cp) + " (" + ucd.getCategoryID(cp) + ")" + option);
}
@@ -611,8 +669,7 @@ E0020-E007F; [TAGGING CHARACTERS]
if (reason.equals("Map out")) {
value = Utility.fromHex(parts[1]);
Utility.fixDot();
- System.out.println("Note, Mapping Out: " + ucd.getCodeAndName(cp)
- + ", " + ucd.getCodeAndName(value) + ", " + ucd.getCategoryID(cp));
+ showError("Mapping Out: ", cp, "");
mappedOut.set(cp);
}
idnFold.put(key, value);
@@ -1033,26 +1090,37 @@ E0020-E007F; [TAGGING CHARACTERS]
int sum = 0;
long start, end;
+ java.text.NumberFormat nf = java.text.NumberFormat.getPercentInstance();
+
+ start = System.currentTimeMillis();
+ for (int i = count; i >= 0; --i) {
+ sum += dummy0(i).length();
+ }
+ end = System.currentTimeMillis();
+ double base = end - start;
+
+ System.out.println("unsynchronized static char[]: " + nf.format((end - start)/base));
+
start = System.currentTimeMillis();
for (int i = count; i >= 0; --i) {
sum += dummy2(i).length();
}
end = System.currentTimeMillis();
- System.out.println("synchronized: " + (end - start));
+ System.out.println("synchronized static char[]: " + nf.format((end - start)/base));
start = System.currentTimeMillis();
for (int i = count; i >= 0; --i) {
sum += dummy1(i).length();
}
end = System.currentTimeMillis();
- System.out.println("char[] each time: " + (end - start));
+ System.out.println("char[] each time: " + nf.format((end - start)/base));
start = System.currentTimeMillis();
for (int i = count; i >= 0; --i) {
sum += dummy3(i).length();
}
end = System.currentTimeMillis();
- System.out.println("String +: " + (end - start));
+ System.out.println("two valueofs: " + nf.format((end - start)/base));
System.out.println(sum);
}
@@ -1074,6 +1142,12 @@ E0020-E007F; [TAGGING CHARACTERS]
}
}
+ // Timing helper: splits a into its upper and lower 16 bits, stores them
+ // in temp2 without synchronization, and returns them as a String.
+ static String dummy0(int a) {
+     final char upper = (char)(a >>> 16);
+     final char lower = (char)a;
+     temp2[0] = upper;
+     temp2[1] = lower;
+     return new String(temp2);
+ }
+
static String dummy3(int a) {
return String.valueOf((char)(a >>> 16)) + (char)a;
}
diff --git a/tools/unicodetools/com/ibm/text/utility/IntStack.java b/tools/unicodetools/com/ibm/text/utility/IntStack.java
index 94d14016971..5fdca1f1f9f 100644
--- a/tools/unicodetools/com/ibm/text/utility/IntStack.java
+++ b/tools/unicodetools/com/ibm/text/utility/IntStack.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/IntStack.java,v $
-* $Date: 2001/08/31 00:19:16 $
-* $Revision: 1.2 $
+* $Date: 2001/09/19 23:33:52 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
@@ -17,7 +17,7 @@ package com.ibm.text.utility;
// Simple stack mechanism, with push, pop and access
// =============================================================
-public final class IntStack {
+public final class IntStack implements Comparable {
private int[] values;
private int top = 0;
@@ -51,4 +51,31 @@ public final class IntStack {
public boolean isEmpty() {
return top == 0;
}
+
+ // Empties the stack by resetting the top index; the values array is left untouched.
+ public void clear() {
+ top = 0;
+ }
+
+ /**
+ * Orders stacks element by element from the bottom; if one stack is a
+ * prefix of the other, the shorter one sorts first.
+ * @param other the IntStack to compare with
+ * @return a negative number, zero, or a positive number
+ * @throws ClassCastException if other is not an IntStack
+ */
+ public int compareTo(Object other) {
+     IntStack that = (IntStack) other;
+     // Only the slots below BOTH tops hold live values, so the common
+     // length is the smaller top (taking the larger one would read stale
+     // slots or run past the end of the shorter array).
+     int min = top;
+     if (min > that.top) min = that.top;
+     for (int i = 0; i < min; ++i) {
+         // explicit comparison: a subtraction could overflow and flip the sign
+         if (values[i] != that.values[i]) {
+             return values[i] < that.values[i] ? -1 : 1;
+         }
+     }
+     return top - that.top;
+ }
+
+ /**
+ * Two stacks are equal when they hold the same values in the same order.
+ * @param other the object to compare with; may be null or of another type
+ * @return true only if other is an IntStack that compares as equal
+ */
+ public boolean equals(Object other) {
+     if (this == other) return true;
+     // equals must answer false, not throw, for null or foreign objects
+     if (!(other instanceof IntStack)) return false;
+     return compareTo(other) == 0;
+ }
+
+ // Hash over the live part of the stack: seeded with top, then folded
+ // with multiplier 37 over values[0..top-1].
+ public int hashCode() {
+     int hash = top;
+     int index = 0;
+     while (index < top) {
+         hash = hash * 37 + values[index];
+         index++;
+     }
+     return hash;
+ }
}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/utility/Pair.java b/tools/unicodetools/com/ibm/text/utility/Pair.java
index 55fdf15ade3..1941aa102e3 100644
--- a/tools/unicodetools/com/ibm/text/utility/Pair.java
+++ b/tools/unicodetools/com/ibm/text/utility/Pair.java
@@ -5,15 +5,15 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Pair.java,v $
-* $Date: 2001/08/31 00:19:16 $
-* $Revision: 1.2 $
+* $Date: 2001/09/19 23:33:52 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text.utility;
-public final class Pair implements java.lang.Comparable {
+public final class Pair implements java.lang.Comparable, Cloneable {
public Comparable first, second;
@@ -41,4 +41,12 @@ public final class Pair implements java.lang.Comparable {
if (trial != 0) return trial;
return second.compareTo(that.second);
}
+
+ /**
+ * Creates a shallow copy of this pair: the copy refers to the same
+ * first and second objects.
+ * @return the copy, never null
+ */
+ public Object clone() {
+     try {
+         return super.clone();
+     } catch (CloneNotSupportedException e) {
+         // Cannot happen, since Pair implements Cloneable. Fail loudly
+         // rather than hand callers a null to trip over later.
+         throw new InternalError(e.toString());
+     }
+ }
}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java b/tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java
index 41e7687adb4..1c8a03170bd 100644
--- a/tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java
+++ b/tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java,v $
-* $Date: 2001/08/31 00:19:16 $
-* $Revision: 1.2 $
+* $Date: 2001/09/19 23:33:52 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
@@ -56,7 +56,7 @@ public final class UTF8StreamWriter extends Writer {
TRAILING_TOP = 0x80;
private static final int MAGIC = 0x10000 + ((0 - 0xD800) << 10) + (0 - 0xDC00);
-
+
public final void write(char[] buffer, int cStart, int cLength) throws IOException {
int cEnd = cStart + cLength;
while (cStart < cEnd) {
@@ -71,6 +71,8 @@ public final class UTF8StreamWriter extends Writer {
// get code point
int utf32 = buffer[cStart++];
+
+ if (utf32 == 0x0D) continue; // skip write
// special check for surrogates
diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java
index 50b07793a6c..e05b43a65de 100644
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2001/09/06 01:29:03 $
-* $Revision: 1.3 $
+* $Date: 2001/09/19 23:33:52 $
+* $Revision: 1.4 $
*
*******************************************************************************
*/
@@ -408,12 +408,15 @@ public final class Utility { // COMMON UTILITIES
private static final String[] searchPath = {
"EXTRAS",
- "3.1.2",
+ "3.2.0",
"3.1.1",
"3.1.0",
"3.0.1",
"3.0.0",
"2.1.9",
+ "2.1.8",
+ "2.1.5",
+ "2.1.2",
"2.0.0",
"1.1.0",
};