diff --git a/tools/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java b/tools/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java
index 4ebaa37b7ea..3620ebfdb23 100644
--- a/tools/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java
+++ b/tools/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java,v $
-* $Date: 2003/03/17 23:00:20 $
-* $Revision: 1.1 $
+* $Date: 2004/02/06 18:32:04 $
+* $Revision: 1.2 $
*
*******************************************************************************
*/
@@ -24,7 +24,7 @@ import com.ibm.text.UCD.Normalizer;
import com.ibm.text.UCD.UCD;
import com.ibm.text.utility.*;
import com.ibm.text.UCD.UnifiedBinaryProperty;
-import com.ibm.text.UCD.UnicodeProperty;
+import com.ibm.text.UCD.UCDProperty;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java
index 9c2d870f8e0..c0c3bedc8da 100644
--- a/tools/unicodetools/com/ibm/text/UCA/UCA.java
+++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
-* $Date: 2003/08/21 07:32:52 $
-* $Revision: 1.22 $
+* $Date: 2004/02/06 18:32:03 $
+* $Revision: 1.23 $
*
*******************************************************************************
*/
@@ -24,7 +24,7 @@ import com.ibm.text.UCD.Normalizer;
import com.ibm.text.UCD.UCD;
import com.ibm.text.utility.*;
import com.ibm.text.UCD.UnifiedBinaryProperty;
-import com.ibm.text.UCD.UnicodeProperty;
+import com.ibm.text.UCD.UCDProperty;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
@@ -1418,7 +1418,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
*/
private void cleanup() {
- UnicodeProperty ubp = UnifiedBinaryProperty.make(
+ UCDProperty ubp = UnifiedBinaryProperty.make(
UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd);
UnicodeSet desiredSet = ubp.getSet();
diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
index 97749a7744b..9d6c983f4a7 100644
--- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
+++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
-* $Date: 2003/08/22 16:51:21 $
-* $Revision: 1.17 $
+* $Date: 2004/02/06 18:32:03 $
+* $Revision: 1.18 $
*
*******************************************************************************
*/
@@ -175,32 +175,9 @@ public class WriteCharts implements UCD_Types {
String classname = primaryCount > 1 ? XCLASSNAME[strength] : CLASSNAME[strength];
- String name = Default.ucd.getName(s);
+ String outline = showCell2(sortKey, s, script, classname);
-
- if (s.equals("\u1eaf")) {
- System.out.println("debug");
- }
-
- String comp = Default.nfc.normalize(s);
-
- String outline = breaker + classname
- + " title='"
- + (script != UNSUPPORTED
- ? Utility.quoteXML(name, true) + ": "
- : "")
- + UCA.toString(sortKey) + "'>"
- + Utility.quoteXML(comp, true)
- + "
"
- + Utility.hex(s)
- //+ "
" + script
- + ""
- + (script == UNSUPPORTED
- ? "
" + Utility.quoteXML(name, true) + " | "
- : "")
- ;
-
- output.println(outline);
+ output.println(breaker + outline);
++columnCount;
}
@@ -208,6 +185,46 @@ public class WriteCharts implements UCD_Types {
closeIndexFile(indexFile, "
UCA: " + uca.getDataVersion(), COLLATION);
}
+ /**
+ * Builds the markup text for one collation-chart cell and returns it; nothing is written here.
+ * The caller prepends its own row/cell breaker before printing the result.
+ *
+ * @param sortKey sort key of s; rendered into the title attribute via UCA.toString
+ * @param s the string (character or sequence) shown in the cell
+ * @param script script code; only compared against UNSUPPORTED to decide where the name goes
+ * @param classname text placed verbatim at the start of the result (supplied by the caller)
+ * @return the assembled cell text
+ */
+ // NOTE(review): several string literals below are split across lines or empty in this copy of
+ // the patch; markup appears to have been stripped in transit. Confirm against the repository.
+ private static String showCell2(
+ String sortKey,
+ String s,
+ byte script,
+ String classname) {
+ String name = Default.ucd.getName(s);
+
+
+ // Leftover debugging hook for U+1EAF; it only prints a marker line.
+ if (s.equals("\u1eaf")) {
+ System.out.println("debug");
+ }
+
+ // The cell displays the NFC form of s.
+ String comp = Default.nfc.normalize(s);
+ int cat = Default.ucd.getCategory(UTF16.charAt(comp,0));
+ // If the displayed text starts with a combining mark (Mn, Mc or Me), prefix U+25CC
+ // DOTTED CIRCLE, presumably so the mark has a visible base to attach to.
+ if (cat == Mn || cat == Mc || cat == Me) {
+ comp = '\u25CC' + comp;
+ // Leftover debugging output for U+0300.
+ if (s.equals("\u0300")) {
+ System.out.println(Default.ucd.getCodeAndName(comp));
+ }
+ }
+ // TODO: merge with showCell
+
+ // Title: the character name plus ": " (omitted for UNSUPPORTED scripts), then the sort key.
+ // Cell text: the quoted display form, then the hex code of s; for UNSUPPORTED scripts the
+ // name is appended to the cell text instead of appearing in the title.
+ String outline = classname
+ + " title='"
+ + (script != UNSUPPORTED
+ ? Utility.quoteXML(name, true) + ": "
+ : "")
+ + UCA.toString(sortKey) + "'>"
+ + Utility.quoteXML(comp, true)
+ + "
"
+ + Utility.hex(s)
+ //+ "
" + script
+ + ""
+ + (script == UNSUPPORTED
+ ? "" + Utility.quoteXML(name, true) + " | "
+ : "")
+ ;
+ return outline;
+ }
+
static public void normalizationChart() throws IOException {
Default.setUCD();
HACK_KANA = false;
@@ -642,9 +659,20 @@ public class WriteCharts implements UCD_Types {
closeIndexFile(indexFile, "", CASE);
}
- static void showCell(PrintWriter output, String s, String prefix, String extra, boolean skipName) {
+ static void showCell(PrintWriter output, String s,
+ String prefix, String extra, boolean skipName) {
+ if (s.equals("\u0300")) {
+ System.out.println();
+ }
String name = Default.ucd.getName(s);
String comp = Default.nfc.normalize(s);
+ int cat = Default.ucd.getCategory(UTF16.charAt(comp,0));
+ if (cat == Mn || cat == Mc || cat == Me) {
+ comp = '\u25CC' + comp;
+ if (s.equals("\u0300")) {
+ System.out.println(Default.ucd.getCodeAndName(comp));
+ }
+ }
String outline = prefix
+ (skipName ? "" : " title='" + Utility.quoteXML(name, true) + "'")
diff --git a/tools/unicodetools/com/ibm/text/UCD/CheckICU.java b/tools/unicodetools/com/ibm/text/UCD/CheckICU.java
new file mode 100644
index 00000000000..c28d0a8c41c
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/CheckICU.java
@@ -0,0 +1,218 @@
+package com.ibm.text.UCD;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.UnicodeProperty;
+import com.ibm.icu.dev.test.util.ICUPropertyFactory;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.text.utility.Utility;
+
+public class CheckICU {
+ static final BagFormatter bf = new BagFormatter();
+
+ /**
+ * Command-line entry point: prints "Start", runs test(), then prints "End".
+ * The command-line arguments are not used.
+ *
+ * @param args ignored
+ * @throws IOException if test() fails with an I/O error
+ */
+ public static void main(String[] args) throws IOException {
+ System.out.println("Start");
+ test();
+ System.out.println("End");
+ }
+
+ static UnicodeSet itemFailures;
+ static ICUPropertyFactory icuFactory;
+ static ToolUnicodePropertySource toolFactory;
+
+ /**
+  * Runs the ICU-versus-tool property comparison.
+  * <p>
+  * First writes the checkUCD() report and creates the two property factories (the tool side is
+  * pinned to Unicode 4.0.0). Then every property on the short list is tested with no type
+  * filter. While the short list is non-empty the method stops there; the exhaustive comparison
+  * over all shared property aliases, grouped by property type, only runs when the list is empty.
+  *
+  * @throws IOException if checkUCD() cannot write its report
+  */
+ public static void test() throws IOException {
+     checkUCD();
+     itemFailures = new UnicodeSet();
+     icuFactory = ICUPropertyFactory.make();
+     toolFactory = ToolUnicodePropertySource.make("4.0.0");
+
+     // Properties to spot-check. Other candidates that have been used here:
+     // "Script", "Bidi_Mirroring_Glyph", "Case_Folding", "Numeric_Value"
+     final String[] shortlist = { "Name" };
+     for (int k = 0; k < shortlist.length; ++k) {
+         testProperty(shortlist[k], -1);
+     }
+     if (shortlist.length != 0) {
+         return;
+     }
+
+     // Full run: report alias differences, then test everything both sides know about.
+     final Collection toolNames = toolFactory.getAvailablePropertyAliases(new TreeSet());
+     final Collection icuNames = icuFactory.getAvailablePropertyAliases(new TreeSet());
+     System.out.println(showDifferences("Property Aliases", "ICU", icuNames, "Tool", toolNames));
+
+     final Collection shared = new TreeSet(icuNames);
+     shared.retainAll(toolNames);
+
+     for (int type = UnicodeProperty.BINARY; type < UnicodeProperty.LIMIT_TYPE; ++type) {
+         System.out.println();
+         System.out.println(UnicodeProperty.getTypeName(type));
+         for (Iterator names = shared.iterator(); names.hasNext();) {
+             testProperty((String) names.next(), type);
+         }
+     }
+ }
+
+ /**
+  * Writes the diagnostic report "Trailing.txt" into UCD_Types.GEN_DIR.
+  * <p>
+  * Every code point from 0 to 0x10FFFF is classified under Unicode 4.0.0 as a starter
+  * (combining class 0), as NFC-trailing and/or as NFC-leading. The report then lists five
+  * combinations of those sets, each under the label passed to showSetNames.
+  * <p>
+  * The writer is closed in a finally block so that a failure while writing one of the
+  * sections does not leak the open file.
+  *
+  * @throws IOException if the report file cannot be opened
+  */
+ private static void checkUCD() throws IOException {
+     UCD myUCD = UCD.make("4.0.0");
+     Normalizer nfc = new Normalizer(Normalizer.NFC, "4.0.0");
+     UnicodeSet leading = new UnicodeSet();
+     UnicodeSet trailing = new UnicodeSet();
+     UnicodeSet starter = new UnicodeSet();
+     for (int i = 0; i <= 0x10FFFF; ++i) {
+         if (myUCD.getCombiningClass(i) == 0) starter.add(i);
+         if (nfc.isTrailing(i)) trailing.add(i);
+         if (nfc.isLeading(i)) leading.add(i);
+     }
+     PrintWriter pw = bf.openUTF8Writer(UCD_Types.GEN_DIR, "Trailing.txt");
+     try {
+         // Each section works on a fresh copy because the UnicodeSet operations mutate in place.
+         bf.showSetNames(pw, "+Trailing+Starter", new UnicodeSet(trailing).retainAll(starter));
+         bf.showSetNames(pw, "+Trailing-Starter", new UnicodeSet(trailing).removeAll(starter));
+         bf.showSetNames(pw, "-Trailing-Starter", new UnicodeSet(trailing).complement().removeAll(starter));
+         bf.showSetNames(pw, "+Trailing+Leading", new UnicodeSet(trailing).retainAll(leading));
+         bf.showSetNames(pw, "+Trailing-Leading", new UnicodeSet(trailing).removeAll(leading));
+     } finally {
+         pw.close();
+     }
+ }
+ /*
+ * int icuType;
+ int toolType;
+ Collection icuAliases;
+ Collection toolAliases;
+ String firstDiffICU;
+ String firstDiffTool;
+ String firstDiffCP;
+ String icuProp;
+ String toolProp;
+
+ */
+
+ private static void testProperty(String prop, int typeFilter) {
+ UnicodeProperty icuProp = icuFactory.getProperty(prop);
+ int icuType = icuProp.getPropertyType();
+
+ if (typeFilter >= 0 && icuType != typeFilter) return;
+
+ System.out.println();
+ System.out.println("Testing: " + prop);
+ UnicodeProperty toolProp = toolFactory.getProperty(prop);
+
+ int toolType = toolProp.getPropertyType();
+ if (icuType != toolType) {
+ System.out.println("FAILURE Type: ICU: " + UnicodeProperty.getTypeName(icuType)
+ + "\tTool: " + UnicodeProperty.getTypeName(toolType));
+ }
+
+ Collection icuAliases = icuProp.getPropertyAliases(new ArrayList());
+ Collection toolAliases = toolProp.getPropertyAliases(new ArrayList());
+ System.out.println(showDifferences("Aliases", "ICU", icuAliases, "Tool", toolAliases));
+
+ icuAliases = icuProp.getAvailablePropertyValueAliases(new ArrayList());
+ toolAliases = toolProp.getAvailablePropertyValueAliases(new ArrayList());
+ System.out.println(showDifferences("Value Aliases", "ICU", icuAliases, "Tool", toolAliases));
+
+ // TODO do property value aliases
+ itemFailures.clear();
+ String firstDiffICU = null, firstDiffTool = null, firstDiffCP = null;
+ for (int i = 0; i <= 0x10FFFF; ++i) {
+ /*if (i == 0x0237) {
+ System.out.println();
+ }
+ */
+ String icuValue = icuProp.getPropertyValue(i);
+ String toolValue = toolProp.getPropertyValue(i);
+ if (!equals(icuValue, toolValue)) {
+ itemFailures.add(i);
+ if (firstDiffCP == null) {
+ firstDiffICU = icuValue;
+ firstDiffTool = toolValue;
+ firstDiffCP = Utility.hex(i);
+ }
+ }
+ }
+ if (itemFailures.size() != 0) {
+ System.out.println("FAILURE " + itemFailures.size() + " Differences: ");
+ System.out.println(itemFailures.toPattern(true));
+ if (firstDiffICU != null) firstDiffICU = bf.hex.transliterate(firstDiffICU);
+ if (firstDiffTool != null) firstDiffTool = bf.hex.transliterate(firstDiffTool);
+ System.out.println(firstDiffCP
+ + "\tICU: <" + firstDiffICU
+ + ">\tTool: <" + firstDiffTool + ">");
+ }
+ System.out.println("done");
+
+ // do values later, and their aliases
+ /*
+ System.out.println("-Values");
+ UnicodeSet
+ System.out.println(showDifferences("ICU", availableICU, "Tool", availableTool));
+ */
+ }
+
+ /**
+  * Null-safe equality: two nulls are equal, a null never equals a non-null,
+  * and otherwise the decision is delegated to a.equals(b).
+  */
+ static boolean equals(Object a, Object b) {
+     return a == null ? b == null : a.equals(b);
+ }
+
+ /**
+  * Describes how two collections differ, as text for a report.
+  * <p>
+  * When the collections match, a single line "title: name1 == name2: items" is returned.
+  * Otherwise the result starts with "title\tFAILURE", lists both collections, and then
+  * lists whichever of these are non-empty: the common items, the items only in set1, and
+  * the items only in set2.
+  * <p>
+  * The comparison goes through a TreeSet, so order is ignored. A collection that contains
+  * duplicates has more elements than the intersection and is therefore reported as a
+  * difference. Both collections are checked against the intersection; checking only set1
+  * would report "==" whenever set1 is merely a subset of set2.
+  *
+  * @param title label for what is being compared
+  * @param name1 display name of the first source
+  * @param set1 items from the first source; elements must be mutually comparable
+  * @param name2 display name of the second source
+  * @param set2 items from the second source
+  * @return the report text; lines in the failure form end with "\r\n"
+  */
+ static public String showDifferences(
+     String title,
+     String name1,
+     Collection set1,
+     String name2,
+     Collection set2) {
+
+     // Collection has no clone, so a Set is used as scratch space,
+     // even though that does not preserve order and duplicates.
+     Collection temp = new TreeSet(set1);
+     temp.retainAll(set2);
+
+     if (set1.size() == temp.size() && set2.size() == temp.size()) {
+         return title + ": " + name1 + " == " + name2 + ": " + bf.join(set1);
+     }
+
+     StringBuffer result = new StringBuffer();
+     result.append(title + "\tFAILURE\r\n");
+     result.append("\t" + name1 + " = " + bf.join(set1) + "\r\n");
+     result.append("\t" + name2 + " = " + bf.join(set2) + "\r\n");
+
+     // Items present on both sides.
+     appendDifferenceSection(result, name2 + " & " + name1, temp);
+
+     // Items only in set1.
+     temp.clear();
+     temp.addAll(set1);
+     temp.removeAll(set2);
+     appendDifferenceSection(result, name1 + " - " + name2, temp);
+
+     // Items only in set2.
+     temp.clear();
+     temp.addAll(set2);
+     temp.removeAll(set1);
+     appendDifferenceSection(result, name2 + " - " + name1, temp);
+
+     return result.toString();
+ }
+
+ /** Appends a "heading:" line followed by the joined items; appends nothing when items is empty. */
+ private static void appendDifferenceSection(StringBuffer result, String heading, Collection items) {
+     if (items.size() == 0) {
+         return;
+     }
+     result.append("\t" + heading + ":\r\n");
+     result.append("\t" + bf.join(items));
+     result.append("\r\n");
+ }
+
+
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/ChineseFrequency.java b/tools/unicodetools/com/ibm/text/UCD/ChineseFrequency.java
new file mode 100644
index 00000000000..6412d2d0c10
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/ChineseFrequency.java
@@ -0,0 +1,81 @@
+package com.ibm.text.UCD;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.text.DecimalFormat;
+import com.ibm.icu.text.NumberFormat;
+import com.ibm.icu.text.UTF16;
+import com.ibm.text.utility.Pair;
+import com.ibm.text.utility.Utility;
+
+public class ChineseFrequency {
+ static final String DICT_DIR = "C:\\DATA\\dict\\";
+ static NumberFormat percent = new DecimalFormat("0.000000%");
+ static NumberFormat percent3 = new DecimalFormat("000.000000%");
+ static NumberFormat number = new DecimalFormat("#,##0");
+
+ static class InverseCompareTo implements Comparator {
+ public int compare(Object o1, Object o2) {
+ return -((Comparable)o1).compareTo(o2);
+ }
+ }
+
+ /**
+  * Converts the frequency data in DICT_DIR/kHYPLCDPF.txt into a ranked table,
+  * DICT_DIR/kHYPLCDPF_frequency.txt.
+  * <p>
+  * Each input line is read as two tab-separated fields: a hexadecimal code point and a
+  * comma-separated list of entries, each carrying a count in parentheses. The counts of a
+  * line are summed to give that character's total. The output starts with a byte order mark
+  * and a header line, followed by one line per character in descending order of the
+  * (total, code point) pairs (ordering as defined by Pair). Each line holds the rank, the
+  * share of the grand total, the running share, the hex code point and the character.
+  * <p>
+  * Both files are closed in finally blocks so that a malformed line or a write failure
+  * does not leak an open handle.
+  *
+  * @throws IOException if either file cannot be opened or read
+  */
+ public static void test() throws IOException{
+     Set freq_char = new TreeSet(new InverseCompareTo());
+     double grandTotal = 0.0;
+     BufferedReader br = BagFormatter.openUTF8Reader(DICT_DIR, "kHYPLCDPF.txt");
+     try {
+         while (true) {
+             String line = br.readLine();
+             if (line == null) break;
+             String[] pieces = Utility.split(line,'\t');
+             int cp = Integer.parseInt(pieces[0],16);
+             String[] says = Utility.split(pieces[1],',');
+             long total = 0;
+             for (int i = 0; i < says.length; ++i) {
+                 // The count is the text between the parentheses of each entry.
+                 int start = says[i].indexOf('(');
+                 int end = says[i].indexOf(')');
+                 long count = Long.parseLong(says[i].substring(start+1, end));
+                 total += count;
+             }
+             grandTotal += total;
+             freq_char.add(new Pair(new Long(total), new Integer(cp)));
+         }
+     } finally {
+         br.close();
+     }
+
+     PrintWriter pw = BagFormatter.openUTF8Writer(DICT_DIR,"kHYPLCDPF_frequency.txt");
+     try {
+         pw.write("\uFEFF");
+         pw.println("No.\tPercentage\tAccummulated\tHex\tChar");
+
+         Iterator it = freq_char.iterator();
+         int counter = 0;
+         double cummulativePercentage = 0;
+         while (it.hasNext()) {
+             Pair item = (Pair)it.next();
+             Long total = (Long) item.first;
+             Integer cp = (Integer) item.second;
+             double current = total.longValue();
+             double percentage = current / grandTotal;
+             cummulativePercentage += percentage;
+             pw.println(
+                 ++counter
+                 + "\t" + percent.format(percentage)
+                 + "\t" + percent3.format(cummulativePercentage)
+                 + "\t" + Integer.toHexString(cp.intValue()).toUpperCase()
+                 + "\t" + UTF16.valueOf(cp.intValue()));
+         }
+     } finally {
+         pw.close();
+     }
+ }
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/CompareProperties.java b/tools/unicodetools/com/ibm/text/UCD/CompareProperties.java
index c9dc96a64fb..d480cf32ae4 100644
--- a/tools/unicodetools/com/ibm/text/UCD/CompareProperties.java
+++ b/tools/unicodetools/com/ibm/text/UCD/CompareProperties.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CompareProperties.java,v $
-* $Date: 2003/07/21 15:50:07 $
-* $Revision: 1.2 $
+* $Date: 2004/02/06 18:30:23 $
+* $Revision: 1.3 $
*
*******************************************************************************
*/
@@ -89,7 +89,7 @@ public class CompareProperties implements UCD_Types {
}
}
- public final class UnicodeSetComparator implements Comparator {
+ public final static class UnicodeSetComparator implements Comparator {
/**
* Compares two UnicodeSets, producing a transitive ordering.
* @return -1 if first is smaller (in size) than second,
@@ -121,7 +121,7 @@ public class CompareProperties implements UCD_Types {
boolean isPartitioned = false;
- UnicodeProperty[] props = new UnicodeProperty[500];
+ UCDProperty[] props = new UCDProperty[500];
UnicodeSet[] sets = new UnicodeSet[500];
int count = 0;
BitSet[] disjoints = new BitSet[500];
@@ -147,7 +147,7 @@ public class CompareProperties implements UCD_Types {
if (!Default.ucd.isAllocated(cp)) continue;
for (int i = 0; i < count; ++i) {
- UnicodeProperty up = props[i];
+ UCDProperty up = props[i];
boolean iProp = up.hasValue(cp);
if (iProp) {
probe.set(i);
@@ -177,7 +177,7 @@ public class CompareProperties implements UCD_Types {
if (i == 0x0900) {
System.out.println("debug");
}
- UnicodeProperty up = UnifiedBinaryProperty.make(i, Default.ucd);
+ UCDProperty up = UnifiedBinaryProperty.make(i, Default.ucd);
if (up == null) continue;
if (up.getValueType() < BINARY_PROP) {
System.out.println("\tSkipping " + up.getName() + "; value varies");
@@ -378,7 +378,7 @@ public class CompareProperties implements UCD_Types {
return getPropName(props[propertyIndex]);
}
- private String getPropName(UnicodeProperty ubp) {
+ private String getPropName(UCDProperty ubp) {
return Utility.getUnskeleton(ubp.getFullName(LONG), true);
}
@@ -395,7 +395,7 @@ public class CompareProperties implements UCD_Types {
for (int i = 1; i < UCD_Types.LIMIT_ENUM; ++i) {
int iType = i & 0xFF00;
if (iType == UCD_Types.JOINING_GROUP || iType == UCD_Types.AGE || iType == UCD_Types.COMBINING_CLASS || iType == UCD_Types.SCRIPT) continue;
- UnicodeProperty upi = UnifiedBinaryProperty.make(i, Default.ucd);
+ UCDProperty upi = UnifiedBinaryProperty.make(i, Default.ucd);
if (upi == null) continue;
if (!upi.isStandard()) {
System.out.println("Skipping " + upi.getName() + "; not standard");
@@ -419,7 +419,7 @@ public class CompareProperties implements UCD_Types {
int jType = j & 0xFF00;
if (jType == UCD_Types.JOINING_GROUP || jType == UCD_Types.AGE || jType == UCD_Types.COMBINING_CLASS || jType == UCD_Types.SCRIPT
|| (jType == iType && jType != UCD_Types.BINARY_PROPERTIES)) continue;
- UnicodeProperty upj = UnifiedBinaryProperty.make(j, Default.ucd);
+ UCDProperty upj = UnifiedBinaryProperty.make(j, Default.ucd);
if (upj == null) continue;
if (!upj.isStandard()) continue;
if (upj.getValueType() < UCD_Types.BINARY_PROP) continue;
diff --git a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java
index a2debacc829..adc03b0eff9 100644
--- a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java
+++ b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
-* $Date: 2003/07/21 15:50:06 $
-* $Revision: 1.12 $
+* $Date: 2004/02/06 18:30:23 $
+* $Revision: 1.13 $
*
*******************************************************************************
*/
@@ -27,12 +27,14 @@ import java.io.*;
public final class ConvertUCD implements UCD_Types {
public static final boolean SHOW = false;
public static final boolean DEBUG = false;
+ static final boolean SHOW_SAMPLE = false;
- public static int major;
- public static int minor;
- public static int update;
- static String version;
+ int major;
+ int minor;
+ int update;
+
+ String version;
// varies by version
/*
@@ -79,6 +81,47 @@ public final class ConvertUCD implements UCD_Types {
/*
//*/
};
+ static HashMap isHex = new HashMap();
+ static HashMap defaults = new HashMap();
+
+ static {
+ for (int j = 0; j < labelList.length; ++j) {
+ String[] labels = labelList[j];
+
+ for (int i = 1; i < labels.length; ++i) {
+ boolean hex = false;
+ String def = null;
+ //char appendChar = '\u0000';
+
+ // pull off "*": hex interpretation
+ if (labels[i].charAt(0) == '*') { // HEX value
+ hex = true;
+ labels[i] = labels[i].substring(1);
+ }
+
+ /*
+ // pull off "$": append duplicates
+ if (labels[i].charAt(0) == '$') { // HEX value
+ appendChar = labels[i].charAt(1);
+ labels[i] = labels[i].substring(2);
+ }
+
+ // pull off default values
+ int pos = labels[i].indexOf('-');
+ if (pos >= 0) {
+ def = labels[i].substring(pos+1);
+ labels[i] = labels[i].substring(0,pos);
+ }
+ */
+ // store results
+ // we do this after all processing, so that the label is clean!!
+
+ if (hex) isHex.put(labels[i], "");
+ //if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar));
+ defaults.put(labels[i], def);
+ }
+ }
+ }
/*
static String[][] labelList31 = {
// Labels for the incoming files. Labels MUST match field order in file.
@@ -212,15 +255,10 @@ public final class ConvertUCD implements UCD_Types {
try {
for (int i = 0; i < args.length; ++i) {
- version = args[i];
+ String version = args[i];
if (version.length() == 0) version = UCD.latestVersion;
- String[] parts = new String[3];
- Utility.split(version, '.', parts);
- major = Integer.parseInt(parts[0]);
- minor = Integer.parseInt(parts[1]);
- update = Integer.parseInt(parts[2]);
- toJava();
+ new ConvertUCD().toJava(version);
}
} finally {
log.close();
@@ -242,7 +280,13 @@ public final class ConvertUCD implements UCD_Types {
}
*/
- static void toJava() throws Exception {
+ void toJava(String version) throws Exception {
+ this.version = version;
+ String[] parts = new String[3];
+ Utility.split(version, '.', parts);
+ major = Integer.parseInt(parts[0]);
+ minor = Integer.parseInt(parts[1]);
+ update = Integer.parseInt(parts[2]);
System.out.println("Building " + version);
// Blocks is special
// Unihan is special
@@ -264,10 +308,13 @@ public final class ConvertUCD implements UCD_Types {
UData ud;
ud = getEntry(0x5e);
System.out.println("SPOT-CHECK: 5e: " + ud);
-
+
ud = getEntry(0x130);
System.out.println("SPOT-CHECK: 130: " + ud);
+ ud = getEntry(0x1f6);
+ System.out.println("SPOT-CHECK: 1f6: " + ud);
+
ud = getEntry(0x2A6D6);
System.out.println("SPOT-CHECK: 2A6D6: " + ud);
@@ -285,51 +332,10 @@ public final class ConvertUCD implements UCD_Types {
* "OMIT" is special -- means don't record
*/
- static HashMap isHex = new HashMap();
- static HashMap defaults = new HashMap();
- static {
- for (int j = 0; j < labelList.length; ++j) {
- String[] labels = labelList[j];
+ List blockData = new LinkedList();
- for (int i = 1; i < labels.length; ++i) {
- boolean hex = false;
- String def = null;
- //char appendChar = '\u0000';
-
- // pull off "*": hex interpretation
- if (labels[i].charAt(0) == '*') { // HEX value
- hex = true;
- labels[i] = labels[i].substring(1);
- }
-
- /*
- // pull off "$": append duplicates
- if (labels[i].charAt(0) == '$') { // HEX value
- appendChar = labels[i].charAt(1);
- labels[i] = labels[i].substring(2);
- }
-
- // pull off default values
- int pos = labels[i].indexOf('-');
- if (pos >= 0) {
- def = labels[i].substring(pos+1);
- labels[i] = labels[i].substring(0,pos);
- }
- */
- // store results
- // we do this after all processing, so that the label is clean!!
-
- if (hex) isHex.put(labels[i], "");
- //if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar));
- defaults.put(labels[i], def);
- }
- }
- }
-
- static List blockData = new LinkedList();
-
- static void readBlocks() throws Exception {
+ void readBlocks() throws Exception {
System.out.println("Reading 'Blocks'");
BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, Utility.LATIN1);
String line = "";
@@ -363,9 +369,9 @@ public final class ConvertUCD implements UCD_Types {
}
}
- static Set properties = new TreeSet();
+ Set properties = new TreeSet();
- static void readSemi(String[] labels) throws Exception {
+ void readSemi(String[] labels) throws Exception {
System.out.println();
System.out.println("Reading '" + labels[0] + "'");
if (major < 3 || (major == 3 && minor < 1)) {
@@ -554,8 +560,9 @@ public final class ConvertUCD implements UCD_Types {
System.out.println(";");
}
- static Map charData = new TreeMap();
+ Map charData = new TreeMap();
+ /*
static void writeXML() throws IOException {
System.out.println("Writing 'UCD-Main.xml'");
BufferedWriter output = new BufferedWriter(
@@ -604,7 +611,7 @@ public final class ConvertUCD implements UCD_Types {
String value = Utility.quoteXML((String) data.get(label));
output.write(" " + label + "='" + value + "'");
}
- */
+ *//*
output.write("/>\r\n");
}
@@ -615,8 +622,9 @@ public final class ConvertUCD implements UCD_Types {
output.close();
}
}
-
- static void writeJavaData() throws IOException {
+ */
+
+ void writeJavaData() throws IOException {
Iterator it = charData.keySet().iterator();
int codePoint = -1;
System.out.println("Writing " + dataFilePrefix + version);
@@ -665,13 +673,13 @@ public final class ConvertUCD implements UCD_Types {
}
}
- static String[] xsSplit = new String[40];
+ //static String[] xsSplit = new String[40];
// Cache a little bit for speed
- static int getEntryCodePoint = -1;
- static UData getEntryUData = null;
+ int getEntryCodePoint = -1;
+ UData getEntryUData = null;
- static UData getEntryIfExists(int cp) {
+ UData getEntryIfExists(int cp) {
if (cp == getEntryCodePoint) return getEntryUData;
Integer cc = new Integer(cp);
UData charEntry = (UData) charData.get(cc);
@@ -683,7 +691,7 @@ public final class ConvertUCD implements UCD_Types {
/* Get entry in table for cc
*/
- static UData getEntry(int cp) {
+ UData getEntry(int cp) {
if (cp == getEntryCodePoint) return getEntryUData;
Integer cc = new Integer(cp);
UData charEntry = (UData) charData.get(cc);
@@ -699,12 +707,12 @@ public final class ConvertUCD implements UCD_Types {
/** Adds the character data. Signals duplicates with an exception
*/
- static void setBinaryProperty(int cp, int binProp) {
+ void setBinaryProperty(int cp, int binProp) {
UData charEntry = getEntry(cp);
charEntry.binaryProperties |= (1L << binProp);
}
- static void appendCharProperties(int cp, String key) {
+ void appendCharProperties(int cp, String key) {
int ind;
//if (true || NEWPROPS) {
ind = Utility.lookup(key, UCD_Names.BP, true);
@@ -716,14 +724,12 @@ public final class ConvertUCD implements UCD_Types {
setBinaryProperty(cp, ind);
}
- static Set jtSet = new TreeSet();
- static Set jgSet = new TreeSet();
+ Set jtSet = new TreeSet();
+ Set jgSet = new TreeSet();
- static final boolean SHOW_SAMPLE = false;
-
/** Adds the character data. Signals duplicates with an exception
*/
- static void addCharData(int cp, String key, String value) {
+ void addCharData(int cp, String key, String value) {
//if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
UData charEntry = getEntry(cp);
//if (cp < 10) System.out.println(" " + charEntry);
@@ -794,7 +800,7 @@ public final class ConvertUCD implements UCD_Types {
}
- static public void setField(UData uData, String fieldName, String fieldValue) {
+ public void setField(UData uData, String fieldName, String fieldValue) {
try {
if (fieldName.equals("n")) {
uData.name = fieldValue;
diff --git a/tools/unicodetools/com/ibm/text/UCD/Default.java b/tools/unicodetools/com/ibm/text/UCD/Default.java
index 60f04992cb2..b0755ec6aca 100644
--- a/tools/unicodetools/com/ibm/text/UCD/Default.java
+++ b/tools/unicodetools/com/ibm/text/UCD/Default.java
@@ -8,7 +8,7 @@ import java.util.TimeZone;
public final class Default implements UCD_Types {
- public static String ucdVersion = UCD.latestVersion;
+ private static String ucdVersion = UCD.latestVersion;
public static UCD ucd;
public static Normalizer nfc;
public static Normalizer nfd;
@@ -21,16 +21,16 @@ public final class Default implements UCD_Types {
}
public static void setUCD(String version) {
- ucdVersion = version;
+ setUcdVersion(version);
setUCD();
}
public static void setUCD() {
- ucd = UCD.make(ucdVersion);
- nfd = nf[NFD] = new Normalizer(Normalizer.NFD, ucdVersion);
- nfc = nf[NFC] = new Normalizer(Normalizer.NFC, ucdVersion);
- nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdVersion);
- nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdVersion);
+ ucd = UCD.make(getUcdVersion());
+ nfd = nf[NFD] = new Normalizer(Normalizer.NFD, getUcdVersion());
+ nfc = nf[NFC] = new Normalizer(Normalizer.NFC, getUcdVersion());
+ nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, getUcdVersion());
+ nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, getUcdVersion());
System.out.println("Loaded UCD" + ucd.getVersion() + " " + (new Date(ucd.getDate())));
}
@@ -43,4 +43,12 @@ public final class Default implements UCD_Types {
return myDateFormat.format(new Date());
}
+ /**
+ * Records the UCD version string to be used by the next call to setUCD().
+ * This only stores the value; the already-loaded ucd and normalizer instances are not
+ * rebuilt until setUCD() is called.
+ *
+ * @param ucdVersion version string, stored as given
+ */
+ public static void setUcdVersion(String ucdVersion) {
+ Default.ucdVersion = ucdVersion;
+ }
+
+ /**
+ * Returns the currently recorded UCD version string: UCD.latestVersion unless
+ * setUcdVersion or setUCD(String) has stored a different one.
+ */
+ public static String getUcdVersion() {
+ return ucdVersion;
+ }
+
}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
index 37ec045c0e7..0408e544ca0 100644
--- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
-* $Date: 2003/07/21 15:50:06 $
-* $Revision: 1.22 $
+* $Date: 2004/02/06 18:30:22 $
+* $Revision: 1.23 $
*
*******************************************************************************
*/
@@ -31,11 +31,11 @@ public final class DerivedProperty implements UCD_Types {
// ADD CONSTANT to UCD_TYPES
- static public UnicodeProperty make(int derivedPropertyID) {
+ static public UCDProperty make(int derivedPropertyID) {
return make(derivedPropertyID, Default.ucd);
}
- static public UnicodeProperty make(int derivedPropertyID, UCD ucd) {
+ static public UCDProperty make(int derivedPropertyID, UCD ucd) {
if (derivedPropertyID < 0 || derivedPropertyID >= DERIVED_PROPERTY_LIMIT) return null;
DerivedProperty dp = getCached(ucd);
return dp.dprops[derivedPropertyID];
@@ -96,14 +96,14 @@ public final class DerivedProperty implements UCD_Types {
return dprops[propNumber].getValue(int cp);
}
*/
- private UnicodeProperty[] dprops = new UnicodeProperty[50];
+ private UCDProperty[] dprops = new UCDProperty[50];
static final String[] CaseNames = {
"Uppercase",
"Lowercase",
"Mixedcase"};
- class ExDProp extends UnicodeProperty {
+ class ExDProp extends UCDProperty {
Normalizer nfx;
ExDProp(int i) {
type = DERIVED_NORMALIZATION;
@@ -124,7 +124,7 @@ public final class DerivedProperty implements UCD_Types {
}
};
- class NF_UnsafeStartProp extends UnicodeProperty {
+ class NF_UnsafeStartProp extends UCDProperty {
Normalizer nfx;
//int prop;
@@ -180,7 +180,7 @@ public final class DerivedProperty implements UCD_Types {
*/
- class NFC_Prop extends UnicodeProperty {
+ class NFC_Prop extends UCDProperty {
BitSet bitset;
boolean filter = false;
boolean keepNonZero = true;
@@ -224,7 +224,7 @@ public final class DerivedProperty implements UCD_Types {
};
};
- class GenDProp extends UnicodeProperty {
+ class GenDProp extends UCDProperty {
Normalizer nfx;
Normalizer nfComp = null;
@@ -281,7 +281,7 @@ public final class DerivedProperty implements UCD_Types {
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
- class CaseDProp extends UnicodeProperty {
+ class CaseDProp extends UCDProperty {
byte val;
CaseDProp (int i) {
type = DERIVED_CORE;
@@ -301,7 +301,7 @@ public final class DerivedProperty implements UCD_Types {
}
};
- class QuickDProp extends UnicodeProperty {
+ class QuickDProp extends UCDProperty {
String NO;
String MAYBE;
Normalizer nfx;
@@ -357,7 +357,7 @@ public final class DerivedProperty implements UCD_Types {
dprops[i] = new NF_UnsafeStartProp(i-NFD_UnsafeStart);
}
- dprops[ID_Start] = new UnicodeProperty() {
+ dprops[ID_Start] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "ID_Start";
@@ -371,7 +371,7 @@ public final class DerivedProperty implements UCD_Types {
}
};
- dprops[ID_Continue_NO_Cf] = new UnicodeProperty() {
+ dprops[ID_Continue_NO_Cf] = new UCDProperty() {
{
name = "ID_Continue";
type = DERIVED_CORE;
@@ -441,7 +441,7 @@ public final class DerivedProperty implements UCD_Types {
if (status != 0) XID_Continue_Set.add(cp);
}
- dprops[Mod_ID_Start] = new UnicodeProperty() {
+ dprops[Mod_ID_Start] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "XID_Start";
@@ -457,7 +457,7 @@ public final class DerivedProperty implements UCD_Types {
}
};
- dprops[Mod_ID_Continue_NO_Cf] = new UnicodeProperty() {
+ dprops[Mod_ID_Continue_NO_Cf] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "XID_Continue";
@@ -474,7 +474,7 @@ public final class DerivedProperty implements UCD_Types {
}
};
- dprops[PropMath] = new UnicodeProperty() {
+ dprops[PropMath] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "Math";
@@ -490,7 +490,7 @@ public final class DerivedProperty implements UCD_Types {
}
};
- dprops[PropAlphabetic] = new UnicodeProperty() {
+ dprops[PropAlphabetic] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "Alphabetic";
@@ -506,7 +506,7 @@ public final class DerivedProperty implements UCD_Types {
}
};
- dprops[PropLowercase] = new UnicodeProperty() {
+ dprops[PropLowercase] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "Lowercase";
@@ -522,7 +522,7 @@ public final class DerivedProperty implements UCD_Types {
}
};
- dprops[PropUppercase] = new UnicodeProperty() {
+ dprops[PropUppercase] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "Uppercase";
@@ -549,7 +549,7 @@ including all characters whose canonical decomposition consists of a single char
file by including all characters whose canonical decomposition consists of a sequence
of characters, the first of which has a non-zero combining class.
*/
- dprops[FullCompExclusion] = new UnicodeProperty() {
+ dprops[FullCompExclusion] = new UCDProperty() {
{
type = DERIVED_NORMALIZATION;
name = "Full_Composition_Exclusion";
@@ -577,7 +577,7 @@ of characters, the first of which has a non-zero combining class.
*/
};
- dprops[FullCompInclusion] = new UnicodeProperty() {
+ dprops[FullCompInclusion] = new UCDProperty() {
{
isStandard = false;
type = DERIVED_NORMALIZATION;
@@ -598,7 +598,7 @@ of characters, the first of which has a non-zero combining class.
}
};
- dprops[FC_NFKC_Closure] = new UnicodeProperty() {
+ dprops[FC_NFKC_Closure] = new UCDProperty() {
{
type = DERIVED_NORMALIZATION;
setValueType(STRING_PROP);
@@ -621,7 +621,7 @@ of characters, the first of which has a non-zero combining class.
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
- dprops[FC_NFC_Closure] = new UnicodeProperty() {
+ dprops[FC_NFC_Closure] = new UCDProperty() {
{
type = DERIVED_NORMALIZATION;
isStandard = false;
@@ -649,33 +649,47 @@ of characters, the first of which has a non-zero combining class.
dprops[i] = new QuickDProp(i - QuickNFD);
}
- dprops[DefaultIgnorable] = new UnicodeProperty() {
+ dprops[DefaultIgnorable] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "Default_Ignorable_Code_Point";
hasUnassigned = true;
shortName = "DI";
- header = header = "# Derived Property: " + name
- + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Variation_Selector"
- + "\r\n# + Noncharacter_Code_Point + Cf + Cc + Cs) - White_Space"
- //+ "\r\n# - U+0600..U+0603 - U+06DD - U+070F"
- ;
+ header = null;
+
}
+ public String getHeader() {
+ if (ucdData.getCompositeVersion() > 0x040000) return "# Derived Property: " + name
+ + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Variation_Selector"
+ + "\r\n# + Noncharacter_Code_Point + Cf + Cc + Cs) - White_Space"
+ + "\r\n# - U+FFF9..U+FFFB// INTERLINEAR ANNOTATION characters";
+ //+ "\r\n# - U+0600..U+0603 - U+06DD - U+070F"
+ return "# Derived Property: " + name
+ + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Cf + Cc + Cs) - White_Space";
+ }
+
public boolean hasValue(int cp) {
if (ucdData.getBinaryProperty(cp, White_space)) return false;
+ if (ucdData.getBinaryProperty(cp, Other_Default_Ignorable_Code_Point)) return true;
+
+ if (ucdData.getCompositeVersion() > 0x040000 && cp >= 0xFFF9 && cp <= 0xFFFB) return false;
+
+ byte cat = ucdData.getCategory(cp);
+ if (cat == Cf || cat == Cs || cat == Cc) return true;
+
+ if (ucdData.getCompositeVersion() <= 0x040000) return false;
+
+ //if (cp >= 0xFFF9 && cp <= 0xFFFB) return false;
//if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true;
//if (0x0600 <= cp && cp <= 0x0603 || 0x06DD == cp || 0x070F == cp) return false;
- if (ucdData.getBinaryProperty(cp, Other_Default_Ignorable_Code_Point)) return true;
if (ucdData.getBinaryProperty(cp, Variation_Selector)) return true;
if (ucdData.getBinaryProperty(cp, Noncharacter_Code_Point)) return true;
- byte cat = ucdData.getCategory(cp);
- if (cat == Cf || cat == Cs || cat == Cc) return true;
return false;
}
};
- dprops[Case_Sensitive] = new UnicodeProperty() {
+ dprops[Case_Sensitive] = new UCDProperty() {
{
type = DERIVED_CORE;
isStandard = false;
@@ -763,7 +777,7 @@ of characters, the first of which has a non-zero combining class.
}
};
- dprops[Other_Case_Ignorable] = new UnicodeProperty() {
+ dprops[Other_Case_Ignorable] = new UCDProperty() {
{
name = "Other_Case_Ignorable";
shortName = "OCI";
@@ -785,7 +799,7 @@ of characters, the first of which has a non-zero combining class.
}
};
- dprops[Type_i] = new UnicodeProperty() {
+ dprops[Type_i] = new UCDProperty() {
{
type = DERIVED_CORE;
isStandard = false;
@@ -819,7 +833,7 @@ of characters, the first of which has a non-zero combining class.
}
};
- dprops[Case_Ignorable] = new UnicodeProperty() {
+ dprops[Case_Ignorable] = new UCDProperty() {
{
name = "Case_Ignorable";
isStandard = false;
@@ -842,7 +856,7 @@ of characters, the first of which has a non-zero combining class.
# GraphemeBase :=
*/
- dprops[GraphemeExtend] = new UnicodeProperty() {
+ dprops[GraphemeExtend] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "Grapheme_Extend";
@@ -865,7 +879,7 @@ of characters, the first of which has a non-zero combining class.
}
};
- dprops[GraphemeBase] = new UnicodeProperty() {
+ dprops[GraphemeBase] = new UCDProperty() {
{
type = DERIVED_CORE;
name = "Grapheme_Base";
@@ -888,7 +902,7 @@ of characters, the first of which has a non-zero combining class.
};
for (int i = 0; i < dprops.length; ++i) {
- UnicodeProperty up = dprops[i];
+ UCDProperty up = dprops[i];
if (up == null) continue;
if (up.getValueType() != BINARY_PROP) continue;
up.setValue(NUMBER, "1");
diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java
index 04d13a29c37..464cafc2b86 100644
--- a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
-* $Date: 2003/07/21 15:50:06 $
-* $Revision: 1.11 $
+* $Date: 2004/02/06 18:30:22 $
+* $Revision: 1.12 $
*
*******************************************************************************
*/
@@ -24,7 +24,7 @@ final class DerivedPropertyLister extends PropertyLister {
//private int propMask;
//private DerivedProperty dprop;
- private UnicodeProperty uprop;
+ private UCDProperty uprop;
int width;
boolean varies;
diff --git a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
index 21cabdc3eac..7da5fc17922 100644
--- a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
+++ b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
-* $Date: 2003/02/25 23:38:23 $
-* $Revision: 1.8 $
+* $Date: 2004/02/06 18:30:22 $
+* $Revision: 1.9 $
*
*******************************************************************************
*/
@@ -56,8 +56,8 @@ class DiffPropertyLister extends PropertyLister {
}
*/
- UnicodeProperty newProp = null;
- UnicodeProperty oldProp = null;
+ UCDProperty newProp = null;
+ UCDProperty oldProp = null;
String value = "";
public String optionalComment(int cp) {
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
index 88ec39c1201..cd61eafb2c1 100644
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
-* $Date: 2003/04/23 20:18:43 $
-* $Revision: 1.7 $
+* $Date: 2004/02/06 18:30:22 $
+* $Revision: 1.8 $
*
*******************************************************************************
*/
@@ -24,92 +24,28 @@ abstract public class GenerateBreakTest implements UCD_Types {
static boolean DEBUG = false;
static final boolean SHOW_TYPE = false;
+ UCD ucd;
+ Normalizer nfd;
+ Normalizer nfkd;
UnicodeMap sampleMap = null;
+ UnicodeMap map = new UnicodeMap();
// ====================== Main ===========================
public static void main(String[] args) throws IOException {
System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61");
//Default.setUCD();
-
- if (false) {
-
- PrintWriter log = Utility.openPrintWriter("Diff.txt", Utility.UTF8_WINDOWS);
- UnicodeSet Term = new UnicodeSet(
- "[\\u0021\\u003F\\u0589\\u061F\\u06D4\\u0700\\u0701\\u0702\\u0964\\u1362\\u1367"
- + "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049"
- + "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]");
- UnicodeSet terminal_punctuation = getSet(BINARY_PROPERTIES, Terminal_Punctuation);
- UnicodeMap names = new UnicodeMap();
- names.add("Pd", getSet(CATEGORY, Pd));
- names.add("Ps", getSet(CATEGORY, Ps));
- names.add("Pe", getSet(CATEGORY, Pe));
- names.add("Pc", getSet(CATEGORY, Pc));
- names.add("Po", getSet(CATEGORY, Po));
- names.add("Pi", getSet(CATEGORY, Pi));
- names.add("Pf", getSet(CATEGORY, Pf));
-
- Utility.showSetDifferences(log, "Term", Term, "Terminal_Punctuation", terminal_punctuation, true, true, names, Default.ucd);
- Utility.showSetDifferences(log, "Po", getSet(CATEGORY, Po), "Terminal_Punctuation", terminal_punctuation, true, true, names, Default.ucd);
- log.close();
-
- if (true) return;
-
- UnicodeSet whitespace = getSet(BINARY_PROPERTIES, White_space);
- UnicodeSet space = getSet(CATEGORY, Zs).addAll(getSet(CATEGORY, Zp)).addAll(getSet(CATEGORY, Zl));
- Utility.showSetDifferences("White_Space", whitespace, "Z", space, true, Default.ucd);
-
- UnicodeSet isSpace = new UnicodeSet();
- UnicodeSet isSpaceChar = new UnicodeSet();
- UnicodeSet isWhitespace = new UnicodeSet();
- for (int i = 0; i <= 0xFFFF; ++i) {
- if (Character.isSpace((char)i)) isSpace.add(i);
- if (Character.isSpaceChar((char)i)) isSpaceChar.add(i);
- if (Character.isWhitespace((char)i)) isWhitespace.add(i);
- }
- Utility.showSetDifferences("White_Space", whitespace, "isSpace", isSpace, true, Default.ucd);
- Utility.showSetDifferences("White_Space", whitespace, "isSpaceChar", isSpaceChar, true, Default.ucd);
- Utility.showSetDifferences("White_Space", whitespace, "isWhitespace", isWhitespace, true, Default.ucd);
- return;
- }
-
- if (DEBUG) {
- checkDecomps();
-
- Utility.showSetNames("", new UnicodeSet("[\u034F\u00AD\u1806[:DI:]-[:Cs:]-[:Cn:]]"), true, Default.ucd);
-
- System.out.println("*** Extend - Cf");
-
- generateTerminalClosure();
-
- GenerateWordBreakTest gwb = new GenerateWordBreakTest();
- PrintWriter systemPrintWriter = new PrintWriter(System.out);
- gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false);
- systemPrintWriter.flush();
- //showSet("sepSet", GenerateSentenceBreakTest.sepSet);
- //showSet("atermSet", GenerateSentenceBreakTest.atermSet);
- //showSet("termSet", GenerateSentenceBreakTest.termSet);
- }
-
- if (true) {
- GenerateBreakTest foo = new GenerateLineBreakTest();
- //foo.isBreak("(\"Go.\") (He did)", 5, true);
- foo.isBreak("\u4e00\u4300", 1, true);
- /*
- GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest();
- //foo.isBreak("(\"Go.\") (He did)", 5, true);
- foo.isBreak("3.4", 2, true);
- */
- }
-
- new GenerateGraphemeBreakTest().run();
- new GenerateWordBreakTest().run();
- new GenerateLineBreakTest().run();
- new GenerateSentenceBreakTest().run();
-
- //if (true) return; // cut short for now
-
+ new GenerateGraphemeBreakTest(Default.ucd).run();
+ new GenerateWordBreakTest(Default.ucd).run();
+ new GenerateLineBreakTest(Default.ucd).run();
+ new GenerateSentenceBreakTest(Default.ucd).run();
+ }
+
+ GenerateBreakTest(UCD ucd) {
+ this.ucd = ucd;
+ nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
+ nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion());
}
// COMMON STUFF for Hangul
@@ -119,11 +55,11 @@ abstract public class GenerateBreakTest implements UCD_Types {
static byte getHangulType(int cp) {
- if (Default.ucd.isLeadingJamo(cp)) return hL;
- if (Default.ucd.isVowelJamo(cp)) return hV;
- if (Default.ucd.isTrailingJamo(cp)) return hT;
- if (Default.ucd.isHangulSyllable(cp)) {
- if (Default.ucd.isDoubleHangul(cp)) return hLV;
+ if (ucd.isLeadingJamo(cp)) return hL;
+ if (ucd.isVowelJamo(cp)) return hV;
+ if (ucd.isTrailingJamo(cp)) return hT;
+ if (ucd.isHangulSyllable(cp)) {
+ if (ucd.isDoubleHangul(cp)) return hLV;
return hLVT;
}
return hNot;
@@ -131,7 +67,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
*/
/* static {
- Default.setUCD();
+ setUCD();
}
*/
@@ -144,11 +80,11 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
// finds the first base character, or the first character if there is no base
- public static int findFirstBase(String source, int start, int limit) {
+ public int findFirstBase(String source, int start, int limit) {
int cp;
for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
- byte cat = Default.ucd.getCategory(cp);
+ byte cat = ucd.getCategory(cp);
if (((1< " + showData(decomp, INFOPROPS, "\r\n\t"));
+ System.out.println(showData(ucd, UTF16.valueOf(i), INFOPROPS, "\r\n\t"));
+ System.out.println(" => " + showData(ucd, decomp, INFOPROPS, "\r\n\t"));
shown = true;
}
System.out.println(j + ": " + tests[k].fileName);
@@ -203,13 +140,13 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
}
- static String showData(String source, UnicodeProperty[] props, String separator) {
+ static String showData(UCD ucd, String source, UCDProperty[] props, String separator) {
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
if (i != 0) result.append(separator);
- result.append(Default.ucd.getCodeAndName(cp));
+ result.append(ucd.getCodeAndName(cp));
for (int j = 0; j < props.length; ++j) {
result.append(", ");
result.append(props[j].getProperty(SHORT)).append('=').append(props[j].getValue(cp,SHORT));
@@ -218,20 +155,18 @@ abstract public class GenerateBreakTest implements UCD_Types {
return result.toString();
}
- static void showSet(String title, UnicodeSet set) {
+ void showSet(String title, UnicodeSet set) {
System.out.println(title + ": " + set.toPattern(true));
- Utility.showSetNames("", set, false, Default.ucd);
+ Utility.showSetNames("", set, false, ucd);
}
-
-
// determines if string is of form Base NSM*
- static boolean isBaseNSMStar(String source) {
+ boolean isBaseNSMStar(String source) {
int cp;
int status = 0;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
- byte cat = Default.ucd.getCategory(cp);
+ byte cat = ucd.getCategory(cp);
int catMask = 1<");
out.println("" + fileName + " Break Chart");
out.println("