diff --git a/tools/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java b/tools/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java index 4ebaa37b7ea..3620ebfdb23 100644 --- a/tools/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java +++ b/tools/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/AbbreviatedUnicodeSetIterator.java,v $ -* $Date: 2003/03/17 23:00:20 $ -* $Revision: 1.1 $ +* $Date: 2004/02/06 18:32:04 $ +* $Revision: 1.2 $ * ******************************************************************************* */ @@ -24,7 +24,7 @@ import com.ibm.text.UCD.Normalizer; import com.ibm.text.UCD.UCD; import com.ibm.text.utility.*; import com.ibm.text.UCD.UnifiedBinaryProperty; -import com.ibm.text.UCD.UnicodeProperty; +import com.ibm.text.UCD.UCDProperty; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index 9c2d870f8e0..c0c3bedc8da 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2003/08/21 07:32:52 $ -* $Revision: 1.22 $ +* $Date: 2004/02/06 18:32:03 $ +* $Revision: 1.23 $ * ******************************************************************************* */ @@ -24,7 +24,7 @@ import com.ibm.text.UCD.Normalizer; import com.ibm.text.UCD.UCD; import com.ibm.text.utility.*; import com.ibm.text.UCD.UnifiedBinaryProperty; -import com.ibm.text.UCD.UnicodeProperty; +import com.ibm.text.UCD.UCDProperty; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; @@ -1418,7 +1418,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] */ private void cleanup() { - UnicodeProperty ubp = UnifiedBinaryProperty.make( + UCDProperty ubp = UnifiedBinaryProperty.make( UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd); UnicodeSet desiredSet = ubp.getSet(); diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java index 97749a7744b..9d6c983f4a7 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $ -* $Date: 2003/08/22 16:51:21 $ -* $Revision: 1.17 $ +* $Date: 2004/02/06 18:32:03 $ +* $Revision: 1.18 $ * ******************************************************************************* */ @@ -175,32 +175,9 @@ public class WriteCharts implements UCD_Types { String classname = primaryCount > 1 ? XCLASSNAME[strength] : CLASSNAME[strength]; - String name = Default.ucd.getName(s); + String outline = showCell2(sortKey, s, script, classname); - - if (s.equals("\u1eaf")) { - System.out.println("debug"); - } - - String comp = Default.nfc.normalize(s); - - String outline = breaker + classname - + " title='" - + (script != UNSUPPORTED - ? Utility.quoteXML(name, true) + ": " - : "") - + UCA.toString(sortKey) + "'>" - + Utility.quoteXML(comp, true) - + "
" - + Utility.hex(s) - //+ "
" + script - + "
" - + (script == UNSUPPORTED - ? "" + Utility.quoteXML(name, true) + "" - : "") - ; - - output.println(outline); + output.println(breaker + outline); ++columnCount; } @@ -208,6 +185,46 @@ public class WriteCharts implements UCD_Types { closeIndexFile(indexFile, "
UCA: " + uca.getDataVersion(), COLLATION); } + private static String showCell2( + String sortKey, + String s, + byte script, + String classname) { + String name = Default.ucd.getName(s); + + + if (s.equals("\u1eaf")) { + System.out.println("debug"); + } + + String comp = Default.nfc.normalize(s); + int cat = Default.ucd.getCategory(UTF16.charAt(comp,0)); + if (cat == Mn || cat == Mc || cat == Me) { + comp = '\u25CC' + comp; + if (s.equals("\u0300")) { + System.out.println(Default.ucd.getCodeAndName(comp)); + } + } + // TODO: merge with showCell + + String outline = classname + + " title='" + + (script != UNSUPPORTED + ? Utility.quoteXML(name, true) + ": " + : "") + + UCA.toString(sortKey) + "'>" + + Utility.quoteXML(comp, true) + + "
" + + Utility.hex(s) + //+ "
" + script + + "
" + + (script == UNSUPPORTED + ? "" + Utility.quoteXML(name, true) + "" + : "") + ; + return outline; + } + static public void normalizationChart() throws IOException { Default.setUCD(); HACK_KANA = false; @@ -642,9 +659,20 @@ public class WriteCharts implements UCD_Types { closeIndexFile(indexFile, "", CASE); } - static void showCell(PrintWriter output, String s, String prefix, String extra, boolean skipName) { + static void showCell(PrintWriter output, String s, + String prefix, String extra, boolean skipName) { + if (s.equals("\u0300")) { + System.out.println(); + } String name = Default.ucd.getName(s); String comp = Default.nfc.normalize(s); + int cat = Default.ucd.getCategory(UTF16.charAt(comp,0)); + if (cat == Mn || cat == Mc || cat == Me) { + comp = '\u25CC' + comp; + if (s.equals("\u0300")) { + System.out.println(Default.ucd.getCodeAndName(comp)); + } + } String outline = prefix + (skipName ? "" : " title='" + Utility.quoteXML(name, true) + "'") diff --git a/tools/unicodetools/com/ibm/text/UCD/CheckICU.java b/tools/unicodetools/com/ibm/text/UCD/CheckICU.java new file mode 100644 index 00000000000..c28d0a8c41c --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/CheckICU.java @@ -0,0 +1,218 @@ +package com.ibm.text.UCD; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; +import java.util.TreeSet; + +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.UnicodeProperty; +import com.ibm.icu.dev.test.util.ICUPropertyFactory; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.text.utility.Utility; + +public class CheckICU { + static final BagFormatter bf = new BagFormatter(); + + public static void main(String[] args) throws IOException { + System.out.println("Start"); + test(); + System.out.println("End"); + } + + static UnicodeSet itemFailures; + static ICUPropertyFactory icuFactory; + static ToolUnicodePropertySource toolFactory; + + public static void test() throws IOException { + checkUCD(); + itemFailures = new UnicodeSet(); + icuFactory = ICUPropertyFactory.make(); + toolFactory = ToolUnicodePropertySource.make("4.0.0"); + + String[] quickList = { + "Name", + // "Script", "Bidi_Mirroring_Glyph", "Case_Folding", + //"Numeric_Value" + }; + for (int i = 0; i < quickList.length; ++i) { + testProperty(quickList[i], -1); + } + if (quickList.length > 0) return; + + Collection availableTool = toolFactory.getAvailablePropertyAliases(new TreeSet()); + + Collection availableICU = icuFactory.getAvailablePropertyAliases(new TreeSet()); + System.out.println(showDifferences("Property Aliases", "ICU", availableICU, "Tool", availableTool)); + Collection common = new TreeSet(availableICU); + common.retainAll(availableTool); + + for (int j = UnicodeProperty.BINARY; j < UnicodeProperty.LIMIT_TYPE; ++j) { + System.out.println(); + System.out.println(UnicodeProperty.getTypeName(j)); + Iterator it = common.iterator(); + while (it.hasNext()) { + String prop = (String)it.next(); + testProperty(prop, j); + } + } + } + + private static void checkUCD() throws IOException { + UCD myUCD = UCD.make("4.0.0"); + Normalizer nfc = new Normalizer(Normalizer.NFC, "4.0.0"); + UnicodeSet leading = new UnicodeSet(); + UnicodeSet trailing = new UnicodeSet(); + UnicodeSet starter = new UnicodeSet(); + for (int i = 0; i <= 0x10FFFF; ++i) { + if (myUCD.getCombiningClass(i) == 0) starter.add(i); + if (nfc.isTrailing(i)) trailing.add(i); + if (nfc.isLeading(i)) leading.add(i); + } + PrintWriter pw = bf.openUTF8Writer(UCD_Types.GEN_DIR, "Trailing.txt"); + bf.showSetNames(pw, "+Trailing+Starter", new UnicodeSet(trailing).retainAll(starter)); + bf.showSetNames(pw, "+Trailing-Starter", new UnicodeSet(trailing).removeAll(starter)); + bf.showSetNames(pw, "-Trailing-Starter", new UnicodeSet(trailing).complement().removeAll(starter)); + bf.showSetNames(pw, "+Trailing+Leading", new UnicodeSet(trailing).retainAll(leading)); + bf.showSetNames(pw, "+Trailing-Leading", new UnicodeSet(trailing).removeAll(leading)); + pw.close(); + } + /* + * int icuType; + int toolType; + Collection icuAliases; + Collection toolAliases; + String firstDiffICU; + String firstDiffTool; + String firstDiffCP; + String icuProp; + String toolProp; + + */ + + private static void testProperty(String prop, int typeFilter) { + UnicodeProperty icuProp = icuFactory.getProperty(prop); + int icuType = icuProp.getPropertyType(); + + if (typeFilter >= 0 && icuType != typeFilter) return; + + System.out.println(); + System.out.println("Testing: " + prop); + UnicodeProperty toolProp = toolFactory.getProperty(prop); + + int toolType = toolProp.getPropertyType(); + if (icuType != toolType) { + System.out.println("FAILURE Type: ICU: " + UnicodeProperty.getTypeName(icuType) + + "\tTool: " + UnicodeProperty.getTypeName(toolType)); + } + + Collection icuAliases = icuProp.getPropertyAliases(new ArrayList()); + Collection toolAliases = toolProp.getPropertyAliases(new ArrayList()); + System.out.println(showDifferences("Aliases", "ICU", icuAliases, "Tool", toolAliases)); + + icuAliases = icuProp.getAvailablePropertyValueAliases(new ArrayList()); + toolAliases = toolProp.getAvailablePropertyValueAliases(new ArrayList()); + System.out.println(showDifferences("Value Aliases", "ICU", icuAliases, "Tool", toolAliases)); + + // TODO do property value aliases + itemFailures.clear(); + String firstDiffICU = null, firstDiffTool = null, firstDiffCP = null; + for (int i = 0; i <= 0x10FFFF; ++i) { + /*if (i == 0x0237) { + System.out.println(); + } + */ + String icuValue = icuProp.getPropertyValue(i); + String toolValue = toolProp.getPropertyValue(i); + if (!equals(icuValue, toolValue)) { + itemFailures.add(i); + if (firstDiffCP == null) { + firstDiffICU = icuValue; + firstDiffTool = toolValue; + firstDiffCP = Utility.hex(i); + } + } + } + if (itemFailures.size() != 0) { + System.out.println("FAILURE " + itemFailures.size() + " Differences: "); + System.out.println(itemFailures.toPattern(true)); + if (firstDiffICU != null) firstDiffICU = bf.hex.transliterate(firstDiffICU); + if (firstDiffTool != null) firstDiffTool = bf.hex.transliterate(firstDiffTool); + System.out.println(firstDiffCP + + "\tICU: <" + firstDiffICU + + ">\tTool: <" + firstDiffTool + ">"); + } + System.out.println("done"); + + // do values later, and their aliases + /* + System.out.println("-Values"); + UnicodeSet + System.out.println(showDifferences("ICU", availableICU, "Tool", availableTool)); + */ + } + + static boolean equals(Object a, Object b) { + if (a == null) return b == null; + return a.equals(b); + } + + static public String showDifferences( + String title, + String name1, + Collection set1, + String name2, + Collection set2) { + + Collection temp = new TreeSet(set1); + temp.retainAll(set2); + + if (set1.size() == temp.size()) { + return title + ": " + name1 + " == " + name2 + ": " + bf.join(set1); + } + + StringBuffer result = new StringBuffer(); + result.append(title + "\tFAILURE\r\n"); + result.append("\t" + name1 + " = " + bf.join(set1) + "\r\n"); + result.append("\t" + name2 + " = " + bf.join(set2) + "\r\n"); + + // damn'd collection doesn't have a clone, so + // we go with Set, even though that + // may not preserve order and duplicates + if (temp.size() != 0) { + result.append("\t" + name2 + " & " + name1 + ":\r\n"); + result.append("\t" + bf.join(temp)); + result.append("\r\n"); + } + + + temp.clear(); + temp.addAll(set1); + temp.removeAll(set2); + if (temp.size() != 0) { + result.append("\t" + name1 + " - " + name2 + ":\r\n"); + result.append("\t" + bf.join(temp)); + result.append("\r\n"); + } + + temp.clear(); + temp.addAll(set2); + temp.removeAll(set1); + if (temp.size() != 0) { + result.append("\t" + name2 + " - " + name1 + ":\r\n"); + result.append("\t" + bf.join(temp)); + result.append("\r\n"); + } + + + return result.toString(); + } + + +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/ChineseFrequency.java b/tools/unicodetools/com/ibm/text/UCD/ChineseFrequency.java new file mode 100644 index 00000000000..6412d2d0c10 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/ChineseFrequency.java @@ -0,0 +1,81 @@ +package com.ibm.text.UCD; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.text.DecimalFormat; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UTF16; +import com.ibm.text.utility.Pair; +import com.ibm.text.utility.Utility; + +public class ChineseFrequency { + static final String DICT_DIR = "C:\\DATA\\dict\\"; + static NumberFormat percent = new DecimalFormat("0.000000%"); + static NumberFormat percent3 = new DecimalFormat("000.000000%"); + static NumberFormat number = new DecimalFormat("#,##0"); + + static class InverseCompareTo implements Comparator { + public int compare(Object o1, Object o2) { + return -((Comparable)o1).compareTo(o2); + } + } + + public static void test() throws IOException{ + Set freq_char = new TreeSet(new InverseCompareTo()); + BufferedReader br = BagFormatter.openUTF8Reader(DICT_DIR, "kHYPLCDPF.txt"); + double grandTotal = 0.0; + while (true) { + String line = br.readLine(); + if (line == null) break; + String[] pieces = Utility.split(line,'\t'); + int cp = Integer.parseInt(pieces[0],16); + String[] says = Utility.split(pieces[1],','); + long total = 0; + for (int i = 0; i < says.length; ++i) { + int start = says[i].indexOf('('); + int end = says[i].indexOf(')'); + long count = Long.parseLong(says[i].substring(start+1, end)); + total += count; + } + grandTotal += total; + freq_char.add(new Pair(new Long(total), new Integer(cp))); + } + br.close(); + PrintWriter pw = BagFormatter.openUTF8Writer(DICT_DIR,"kHYPLCDPF_frequency.txt"); + pw.write("\uFEFF"); + pw.println("No.\tPercentage\tAccummulated\tHex\tChar"); + + Iterator it = freq_char.iterator(); + int counter = 0; + double cummulative = 0; + double cummulativePercentage = 0; + while (it.hasNext()) { + Pair item = (Pair)it.next(); + Long total = (Long) item.first; + Integer cp = (Integer) item.second; + double current = total.longValue(); + cummulative += current; + double percentage = current / grandTotal; + cummulativePercentage += percentage; + pw.println( + ++counter + //+ "\t" + number.format(current) + //+ "\t" + number.format(cummulative) + + "\t" + percent.format(percentage) + + "\t" + percent3.format(cummulativePercentage) + + "\t" + Integer.toHexString(cp.intValue()).toUpperCase() + + "\t" + UTF16.valueOf(cp.intValue())); + } + //pw.println("Grand total: " + (long)grandTotal); + pw.close(); + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/CompareProperties.java b/tools/unicodetools/com/ibm/text/UCD/CompareProperties.java index c9dc96a64fb..d480cf32ae4 100644 --- a/tools/unicodetools/com/ibm/text/UCD/CompareProperties.java +++ b/tools/unicodetools/com/ibm/text/UCD/CompareProperties.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CompareProperties.java,v $ -* $Date: 2003/07/21 15:50:07 $ -* $Revision: 1.2 $ +* $Date: 2004/02/06 18:30:23 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -89,7 +89,7 @@ public class CompareProperties implements UCD_Types { } } - public final class UnicodeSetComparator implements Comparator { + public final static class UnicodeSetComparator implements Comparator { /** * Compares two UnicodeSets, producing a transitive ordering. * @return -1 if first is smaller (in size) than second, @@ -121,7 +121,7 @@ public class CompareProperties implements UCD_Types { boolean isPartitioned = false; - UnicodeProperty[] props = new UnicodeProperty[500]; + UCDProperty[] props = new UCDProperty[500]; UnicodeSet[] sets = new UnicodeSet[500]; int count = 0; BitSet[] disjoints = new BitSet[500]; @@ -147,7 +147,7 @@ public class CompareProperties implements UCD_Types { if (!Default.ucd.isAllocated(cp)) continue; for (int i = 0; i < count; ++i) { - UnicodeProperty up = props[i]; + UCDProperty up = props[i]; boolean iProp = up.hasValue(cp); if (iProp) { probe.set(i); @@ -177,7 +177,7 @@ public class CompareProperties implements UCD_Types { if (i == 0x0900) { System.out.println("debug"); } - UnicodeProperty up = UnifiedBinaryProperty.make(i, Default.ucd); + UCDProperty up = UnifiedBinaryProperty.make(i, Default.ucd); if (up == null) continue; if (up.getValueType() < BINARY_PROP) { System.out.println("\tSkipping " + up.getName() + "; value varies"); @@ -378,7 +378,7 @@ public class CompareProperties implements UCD_Types { return getPropName(props[propertyIndex]); } - private String getPropName(UnicodeProperty ubp) { + private String getPropName(UCDProperty ubp) { return Utility.getUnskeleton(ubp.getFullName(LONG), true); } @@ -395,7 +395,7 @@ public class CompareProperties implements UCD_Types { for (int i = 1; i < UCD_Types.LIMIT_ENUM; ++i) { int iType = i & 0xFF00; if (iType == UCD_Types.JOINING_GROUP || iType == UCD_Types.AGE || iType == UCD_Types.COMBINING_CLASS || iType == UCD_Types.SCRIPT) continue; - UnicodeProperty upi = UnifiedBinaryProperty.make(i, Default.ucd); + UCDProperty upi = UnifiedBinaryProperty.make(i, Default.ucd); if (upi == null) continue; if (!upi.isStandard()) { System.out.println("Skipping " + upi.getName() + "; not standard"); @@ -419,7 +419,7 @@ public class CompareProperties implements UCD_Types { int jType = j & 0xFF00; if (jType == UCD_Types.JOINING_GROUP || jType == UCD_Types.AGE || jType == UCD_Types.COMBINING_CLASS || jType == UCD_Types.SCRIPT || (jType == iType && jType != UCD_Types.BINARY_PROPERTIES)) continue; - UnicodeProperty upj = UnifiedBinaryProperty.make(j, Default.ucd); + UCDProperty upj = UnifiedBinaryProperty.make(j, Default.ucd); if (upj == null) continue; if (!upj.isStandard()) continue; if (upj.getValueType() < UCD_Types.BINARY_PROP) continue; diff --git a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java index a2debacc829..adc03b0eff9 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $ -* $Date: 2003/07/21 15:50:06 $ -* $Revision: 1.12 $ +* $Date: 2004/02/06 18:30:23 $ +* $Revision: 1.13 $ * ******************************************************************************* */ @@ -27,12 +27,14 @@ import java.io.*; public final class ConvertUCD implements UCD_Types { public static final boolean SHOW = false; public static final boolean DEBUG = false; + static final boolean SHOW_SAMPLE = false; - public static int major; - public static int minor; - public static int update; - static String version; + int major; + int minor; + int update; + + String version; // varies by version /* @@ -79,6 +81,47 @@ public final class ConvertUCD implements UCD_Types { /* //*/ }; + static HashMap isHex = new HashMap(); + static HashMap defaults = new HashMap(); + + static { + for (int j = 0; j < labelList.length; ++j) { + String[] labels = labelList[j]; + + for (int i = 1; i < labels.length; ++i) { + boolean hex = false; + String def = null; + //char appendChar = '\u0000'; + + // pull off "*": hex interpretation + if (labels[i].charAt(0) == '*') { // HEX value + hex = true; + labels[i] = labels[i].substring(1); + } + + /* + // pull off "$": append duplicates + if (labels[i].charAt(0) == '$') { // HEX value + appendChar = labels[i].charAt(1); + labels[i] = labels[i].substring(2); + } + + // pull off default values + int pos = labels[i].indexOf('-'); + if (pos >= 0) { + def = labels[i].substring(pos+1); + labels[i] = labels[i].substring(0,pos); + } + */ + // store results + // we do this after all processing, so that the label is clean!! + + if (hex) isHex.put(labels[i], ""); + //if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar)); + defaults.put(labels[i], def); + } + } + } /* static String[][] labelList31 = { // Labels for the incoming files. Labels MUST match field order in file. @@ -212,15 +255,10 @@ public final class ConvertUCD implements UCD_Types { try { for (int i = 0; i < args.length; ++i) { - version = args[i]; + String version = args[i]; if (version.length() == 0) version = UCD.latestVersion; - String[] parts = new String[3]; - Utility.split(version, '.', parts); - major = Integer.parseInt(parts[0]); - minor = Integer.parseInt(parts[1]); - update = Integer.parseInt(parts[2]); - toJava(); + new ConvertUCD().toJava(version); } } finally { log.close(); @@ -242,7 +280,13 @@ public final class ConvertUCD implements UCD_Types { } */ - static void toJava() throws Exception { + void toJava(String version) throws Exception { + this.version = version; + String[] parts = new String[3]; + Utility.split(version, '.', parts); + major = Integer.parseInt(parts[0]); + minor = Integer.parseInt(parts[1]); + update = Integer.parseInt(parts[2]); System.out.println("Building " + version); // Blocks is special // Unihan is special @@ -264,10 +308,13 @@ public final class ConvertUCD implements UCD_Types { UData ud; ud = getEntry(0x5e); System.out.println("SPOT-CHECK: 5e: " + ud); - + ud = getEntry(0x130); System.out.println("SPOT-CHECK: 130: " + ud); + ud = getEntry(0x1f6); + System.out.println("SPOT-CHECK: 1f6: " + ud); + ud = getEntry(0x2A6D6); System.out.println("SPOT-CHECK: 2A6D6: " + ud); @@ -285,51 +332,10 @@ public final class ConvertUCD implements UCD_Types { * "OMIT" is special -- means don't record */ - static HashMap isHex = new HashMap(); - static HashMap defaults = new HashMap(); - static { - for (int j = 0; j < labelList.length; ++j) { - String[] labels = labelList[j]; + List blockData = new LinkedList(); - for (int i = 1; i < labels.length; ++i) { - boolean hex = false; - String def = null; - //char appendChar = '\u0000'; - - // pull off "*": hex interpretation - if (labels[i].charAt(0) == '*') { // HEX value - hex = true; - labels[i] = labels[i].substring(1); - } - - /* - // pull off "$": append duplicates - if (labels[i].charAt(0) == '$') { // HEX value - appendChar = labels[i].charAt(1); - labels[i] = labels[i].substring(2); - } - - // pull off default values - int pos = labels[i].indexOf('-'); - if (pos >= 0) { - def = labels[i].substring(pos+1); - labels[i] = labels[i].substring(0,pos); - } - */ - // store results - // we do this after all processing, so that the label is clean!! - - if (hex) isHex.put(labels[i], ""); - //if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar)); - defaults.put(labels[i], def); - } - } - } - - static List blockData = new LinkedList(); - - static void readBlocks() throws Exception { + void readBlocks() throws Exception { System.out.println("Reading 'Blocks'"); BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, Utility.LATIN1); String line = ""; @@ -363,9 +369,9 @@ public final class ConvertUCD implements UCD_Types { } } - static Set properties = new TreeSet(); + Set properties = new TreeSet(); - static void readSemi(String[] labels) throws Exception { + void readSemi(String[] labels) throws Exception { System.out.println(); System.out.println("Reading '" + labels[0] + "'"); if (major < 3 || (major == 3 && minor < 1)) { @@ -554,8 +560,9 @@ public final class ConvertUCD implements UCD_Types { System.out.println(";"); } - static Map charData = new TreeMap(); + Map charData = new TreeMap(); + /* static void writeXML() throws IOException { System.out.println("Writing 'UCD-Main.xml'"); BufferedWriter output = new BufferedWriter( @@ -604,7 +611,7 @@ public final class ConvertUCD implements UCD_Types { String value = Utility.quoteXML((String) data.get(label)); output.write(" " + label + "='" + value + "'"); } - */ + *//* output.write("/>\r\n"); } @@ -615,8 +622,9 @@ public final class ConvertUCD implements UCD_Types { output.close(); } } - - static void writeJavaData() throws IOException { + */ + + void writeJavaData() throws IOException { Iterator it = charData.keySet().iterator(); int codePoint = -1; System.out.println("Writing " + dataFilePrefix + version); @@ -665,13 +673,13 @@ public final class ConvertUCD implements UCD_Types { } } - static String[] xsSplit = new String[40]; + //static String[] xsSplit = new String[40]; // Cache a little bit for speed - static int getEntryCodePoint = -1; - static UData getEntryUData = null; + int getEntryCodePoint = -1; + UData getEntryUData = null; - static UData getEntryIfExists(int cp) { + UData getEntryIfExists(int cp) { if (cp == getEntryCodePoint) return getEntryUData; Integer cc = new Integer(cp); UData charEntry = (UData) charData.get(cc); @@ -683,7 +691,7 @@ public final class ConvertUCD implements UCD_Types { /* Get entry in table for cc */ - static UData getEntry(int cp) { + UData getEntry(int cp) { if (cp == getEntryCodePoint) return getEntryUData; Integer cc = new Integer(cp); UData charEntry = (UData) charData.get(cc); @@ -699,12 +707,12 @@ public final class ConvertUCD implements UCD_Types { /** Adds the character data. Signals duplicates with an exception */ - static void setBinaryProperty(int cp, int binProp) { + void setBinaryProperty(int cp, int binProp) { UData charEntry = getEntry(cp); charEntry.binaryProperties |= (1L << binProp); } - static void appendCharProperties(int cp, String key) { + void appendCharProperties(int cp, String key) { int ind; //if (true || NEWPROPS) { ind = Utility.lookup(key, UCD_Names.BP, true); @@ -716,14 +724,12 @@ public final class ConvertUCD implements UCD_Types { setBinaryProperty(cp, ind); } - static Set jtSet = new TreeSet(); - static Set jgSet = new TreeSet(); + Set jtSet = new TreeSet(); + Set jgSet = new TreeSet(); - static final boolean SHOW_SAMPLE = false; - /** Adds the character data. Signals duplicates with an exception */ - static void addCharData(int cp, String key, String value) { + void addCharData(int cp, String key, String value) { //if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value)); UData charEntry = getEntry(cp); //if (cp < 10) System.out.println(" " + charEntry); @@ -794,7 +800,7 @@ public final class ConvertUCD implements UCD_Types { } - static public void setField(UData uData, String fieldName, String fieldValue) { + public void setField(UData uData, String fieldName, String fieldValue) { try { if (fieldName.equals("n")) { uData.name = fieldValue; diff --git a/tools/unicodetools/com/ibm/text/UCD/Default.java b/tools/unicodetools/com/ibm/text/UCD/Default.java index 60f04992cb2..b0755ec6aca 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Default.java +++ b/tools/unicodetools/com/ibm/text/UCD/Default.java @@ -8,7 +8,7 @@ import java.util.TimeZone; public final class Default implements UCD_Types { - public static String ucdVersion = UCD.latestVersion; + private static String ucdVersion = UCD.latestVersion; public static UCD ucd; public static Normalizer nfc; public static Normalizer nfd; @@ -21,16 +21,16 @@ public final class Default implements UCD_Types { } public static void setUCD(String version) { - ucdVersion = version; + setUcdVersion(version); setUCD(); } public static void setUCD() { - ucd = UCD.make(ucdVersion); - nfd = nf[NFD] = new Normalizer(Normalizer.NFD, ucdVersion); - nfc = nf[NFC] = new Normalizer(Normalizer.NFC, ucdVersion); - nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdVersion); - nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdVersion); + ucd = UCD.make(getUcdVersion()); + nfd = nf[NFD] = new Normalizer(Normalizer.NFD, getUcdVersion()); + nfc = nf[NFC] = new Normalizer(Normalizer.NFC, getUcdVersion()); + nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, getUcdVersion()); + nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, getUcdVersion()); System.out.println("Loaded UCD" + ucd.getVersion() + " " + (new Date(ucd.getDate()))); } @@ -43,4 +43,12 @@ public final class Default implements UCD_Types { return myDateFormat.format(new Date()); } + public static void setUcdVersion(String ucdVersion) { + Default.ucdVersion = ucdVersion; + } + + public static String getUcdVersion() { + return ucdVersion; + } + } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java index 37ec045c0e7..0408e544ca0 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $ -* $Date: 2003/07/21 15:50:06 $ -* $Revision: 1.22 $ +* $Date: 2004/02/06 18:30:22 $ +* $Revision: 1.23 $ * ******************************************************************************* */ @@ -31,11 +31,11 @@ public final class DerivedProperty implements UCD_Types { // ADD CONSTANT to UCD_TYPES - static public UnicodeProperty make(int derivedPropertyID) { + static public UCDProperty make(int derivedPropertyID) { return make(derivedPropertyID, Default.ucd); } - static public UnicodeProperty make(int derivedPropertyID, UCD ucd) { + static public UCDProperty make(int derivedPropertyID, UCD ucd) { if (derivedPropertyID < 0 || derivedPropertyID >= DERIVED_PROPERTY_LIMIT) return null; DerivedProperty dp = getCached(ucd); return dp.dprops[derivedPropertyID]; @@ -96,14 +96,14 @@ public final class DerivedProperty implements UCD_Types { return dprops[propNumber].getValue(int cp); } */ - private UnicodeProperty[] dprops = new UnicodeProperty[50]; + private UCDProperty[] dprops = new UCDProperty[50]; static final String[] CaseNames = { "Uppercase", "Lowercase", "Mixedcase"}; - class ExDProp extends UnicodeProperty { + class ExDProp extends UCDProperty { Normalizer nfx; ExDProp(int i) { type = DERIVED_NORMALIZATION; @@ -124,7 +124,7 @@ public final class DerivedProperty implements UCD_Types { } }; - class NF_UnsafeStartProp extends UnicodeProperty { + class NF_UnsafeStartProp extends UCDProperty { Normalizer nfx; //int prop; @@ -180,7 +180,7 @@ public final class DerivedProperty implements UCD_Types { */ - class NFC_Prop extends UnicodeProperty { + class NFC_Prop extends UCDProperty { BitSet bitset; boolean filter = false; boolean keepNonZero = true; @@ -224,7 +224,7 @@ public final class DerivedProperty implements UCD_Types { }; }; - class GenDProp extends UnicodeProperty { + class GenDProp extends UCDProperty { Normalizer nfx; Normalizer nfComp = null; @@ -281,7 +281,7 @@ public final class DerivedProperty implements UCD_Types { public boolean hasValue(int cp) { return getValue(cp).length() != 0; } }; - class CaseDProp extends UnicodeProperty { + class CaseDProp extends UCDProperty { byte val; CaseDProp (int i) { type = DERIVED_CORE; @@ -301,7 +301,7 @@ public final class DerivedProperty implements UCD_Types { } }; - class QuickDProp extends UnicodeProperty { + class QuickDProp extends UCDProperty { String NO; String MAYBE; Normalizer nfx; @@ -357,7 +357,7 @@ public final class DerivedProperty implements UCD_Types { dprops[i] = new NF_UnsafeStartProp(i-NFD_UnsafeStart); } - dprops[ID_Start] = new UnicodeProperty() { + dprops[ID_Start] = new UCDProperty() { { type = DERIVED_CORE; name = "ID_Start"; @@ -371,7 +371,7 @@ public final class DerivedProperty implements UCD_Types { } }; - dprops[ID_Continue_NO_Cf] = new UnicodeProperty() { + dprops[ID_Continue_NO_Cf] = new UCDProperty() { { name = "ID_Continue"; type = DERIVED_CORE; @@ -441,7 +441,7 @@ public final class DerivedProperty implements UCD_Types { if (status != 0) XID_Continue_Set.add(cp); } - dprops[Mod_ID_Start] = new UnicodeProperty() { + dprops[Mod_ID_Start] = new UCDProperty() { { type = DERIVED_CORE; name = "XID_Start"; @@ -457,7 +457,7 @@ public final class DerivedProperty implements UCD_Types { } }; - dprops[Mod_ID_Continue_NO_Cf] = new UnicodeProperty() { + dprops[Mod_ID_Continue_NO_Cf] = new UCDProperty() { { type = DERIVED_CORE; name = "XID_Continue"; @@ -474,7 +474,7 @@ public final class DerivedProperty implements UCD_Types { } }; - dprops[PropMath] = new UnicodeProperty() { + dprops[PropMath] = new UCDProperty() { { type = DERIVED_CORE; name = "Math"; @@ -490,7 +490,7 @@ public final class DerivedProperty implements UCD_Types { } }; - dprops[PropAlphabetic] = new UnicodeProperty() { + dprops[PropAlphabetic] = new UCDProperty() { { type = DERIVED_CORE; name = "Alphabetic"; @@ -506,7 +506,7 @@ public final class DerivedProperty implements UCD_Types { } }; - dprops[PropLowercase] = new UnicodeProperty() { + dprops[PropLowercase] = new UCDProperty() { { type = DERIVED_CORE; name = "Lowercase"; @@ -522,7 +522,7 @@ public final class DerivedProperty implements UCD_Types { } }; - dprops[PropUppercase] = new UnicodeProperty() { + dprops[PropUppercase] = new UCDProperty() { { type = DERIVED_CORE; name = "Uppercase"; @@ -549,7 +549,7 @@ including all characters whose canonical decomposition consists of a single char file by including all characters whose canonical decomposition consists of a sequence of characters, the first of which has a non-zero combining class. */ - dprops[FullCompExclusion] = new UnicodeProperty() { + dprops[FullCompExclusion] = new UCDProperty() { { type = DERIVED_NORMALIZATION; name = "Full_Composition_Exclusion"; @@ -577,7 +577,7 @@ of characters, the first of which has a non-zero combining class. */ }; - dprops[FullCompInclusion] = new UnicodeProperty() { + dprops[FullCompInclusion] = new UCDProperty() { { isStandard = false; type = DERIVED_NORMALIZATION; @@ -598,7 +598,7 @@ of characters, the first of which has a non-zero combining class. } }; - dprops[FC_NFKC_Closure] = new UnicodeProperty() { + dprops[FC_NFKC_Closure] = new UCDProperty() { { type = DERIVED_NORMALIZATION; setValueType(STRING_PROP); @@ -621,7 +621,7 @@ of characters, the first of which has a non-zero combining class. public boolean hasValue(int cp) { return getValue(cp).length() != 0; } }; - dprops[FC_NFC_Closure] = new UnicodeProperty() { + dprops[FC_NFC_Closure] = new UCDProperty() { { type = DERIVED_NORMALIZATION; isStandard = false; @@ -649,33 +649,47 @@ of characters, the first of which has a non-zero combining class. dprops[i] = new QuickDProp(i - QuickNFD); } - dprops[DefaultIgnorable] = new UnicodeProperty() { + dprops[DefaultIgnorable] = new UCDProperty() { { type = DERIVED_CORE; name = "Default_Ignorable_Code_Point"; hasUnassigned = true; shortName = "DI"; - header = header = "# Derived Property: " + name - + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Variation_Selector" - + "\r\n# + Noncharacter_Code_Point + Cf + Cc + Cs) - White_Space" - //+ "\r\n# - U+0600..U+0603 - U+06DD - U+070F" - ; + header = null; + } + public String getHeader() { + if (ucdData.getCompositeVersion() > 0x040000) return "# Derived Property: " + name + + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Variation_Selector" + + "\r\n# + Noncharacter_Code_Point + Cf + Cc + Cs) - White_Space" + + "\r\n# - U+FFF9..U+FFFB// INTERLINEAR ANNOTATION characters"; + //+ "\r\n# - U+0600..U+0603 - U+06DD - U+070F" + return "# Derived Property: " + name + + "\r\n# Generated from (Other_Default_Ignorable_Code_Point + Cf + Cc + Cs) - White_Space"; + } + public boolean hasValue(int cp) { if (ucdData.getBinaryProperty(cp, White_space)) return false; + if (ucdData.getBinaryProperty(cp, Other_Default_Ignorable_Code_Point)) return true; + + if (ucdData.getCompositeVersion() > 0x040000 && cp >= 0xFFF9 && cp <= 0xFFFB) return false; + + byte cat = ucdData.getCategory(cp); + if (cat == Cf || cat == Cs || cat == Cc) return true; + + if (ucdData.getCompositeVersion() <= 0x040000) return false; + + //if (cp >= 0xFFF9 && cp <= 0xFFFB) return false; //if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true; //if (0x0600 <= cp && cp <= 0x0603 || 0x06DD == cp || 0x070F == cp) return false; - if (ucdData.getBinaryProperty(cp, Other_Default_Ignorable_Code_Point)) return true; if (ucdData.getBinaryProperty(cp, Variation_Selector)) return true; if (ucdData.getBinaryProperty(cp, Noncharacter_Code_Point)) return true; - byte cat = ucdData.getCategory(cp); - if (cat == Cf || cat == Cs || cat == Cc) return true; return false; } }; - dprops[Case_Sensitive] = new UnicodeProperty() { + dprops[Case_Sensitive] = new UCDProperty() { { type = DERIVED_CORE; isStandard = false; @@ -763,7 +777,7 @@ of characters, the first of which has a non-zero combining class. } }; - dprops[Other_Case_Ignorable] = new UnicodeProperty() { + dprops[Other_Case_Ignorable] = new UCDProperty() { { name = "Other_Case_Ignorable"; shortName = "OCI"; @@ -785,7 +799,7 @@ of characters, the first of which has a non-zero combining class. } }; - dprops[Type_i] = new UnicodeProperty() { + dprops[Type_i] = new UCDProperty() { { type = DERIVED_CORE; isStandard = false; @@ -819,7 +833,7 @@ of characters, the first of which has a non-zero combining class. } }; - dprops[Case_Ignorable] = new UnicodeProperty() { + dprops[Case_Ignorable] = new UCDProperty() { { name = "Case_Ignorable"; isStandard = false; @@ -842,7 +856,7 @@ of characters, the first of which has a non-zero combining class. # GraphemeBase := */ - dprops[GraphemeExtend] = new UnicodeProperty() { + dprops[GraphemeExtend] = new UCDProperty() { { type = DERIVED_CORE; name = "Grapheme_Extend"; @@ -865,7 +879,7 @@ of characters, the first of which has a non-zero combining class. } }; - dprops[GraphemeBase] = new UnicodeProperty() { + dprops[GraphemeBase] = new UCDProperty() { { type = DERIVED_CORE; name = "Grapheme_Base"; @@ -888,7 +902,7 @@ of characters, the first of which has a non-zero combining class. }; for (int i = 0; i < dprops.length; ++i) { - UnicodeProperty up = dprops[i]; + UCDProperty up = dprops[i]; if (up == null) continue; if (up.getValueType() != BINARY_PROP) continue; up.setValue(NUMBER, "1"); diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java index 04d13a29c37..464cafc2b86 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $ -* $Date: 2003/07/21 15:50:06 $ -* $Revision: 1.11 $ +* $Date: 2004/02/06 18:30:22 $ +* $Revision: 1.12 $ * ******************************************************************************* */ @@ -24,7 +24,7 @@ final class DerivedPropertyLister extends PropertyLister { //private int propMask; //private DerivedProperty dprop; - private UnicodeProperty uprop; + private UCDProperty uprop; int width; boolean varies; diff --git a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java index 21cabdc3eac..7da5fc17922 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $ -* $Date: 2003/02/25 23:38:23 $ -* $Revision: 1.8 $ +* $Date: 2004/02/06 18:30:22 $ +* $Revision: 1.9 $ * ******************************************************************************* */ @@ -56,8 +56,8 @@ class DiffPropertyLister extends PropertyLister { } */ - UnicodeProperty newProp = null; - UnicodeProperty oldProp = null; + UCDProperty newProp = null; + UCDProperty oldProp = null; String value = ""; public String optionalComment(int cp) { diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java index 88ec39c1201..cd61eafb2c1 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $ -* $Date: 2003/04/23 20:18:43 $ -* $Revision: 1.7 $ +* $Date: 2004/02/06 18:30:22 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -24,92 +24,28 @@ abstract public class GenerateBreakTest implements UCD_Types { static boolean DEBUG = false; static final boolean SHOW_TYPE = false; + UCD ucd; + Normalizer nfd; + Normalizer nfkd; UnicodeMap sampleMap = null; + UnicodeMap map = new UnicodeMap(); // ====================== Main =========================== public static void main(String[] args) throws IOException { System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); //Default.setUCD(); - - if (false) { - - PrintWriter log = Utility.openPrintWriter("Diff.txt", Utility.UTF8_WINDOWS); - UnicodeSet Term = new UnicodeSet( - "[\\u0021\\u003F\\u0589\\u061F\\u06D4\\u0700\\u0701\\u0702\\u0964\\u1362\\u1367" - + "\\u1368\\u104A\\u104B\\u166E\\u1803\\u1809\\u203C\\u203D\\u2047\\u2048\\u2049" - + "\\u3002\\uFE52\\uFE57\\uFF01\\uFF0E\\uFF1F\\uFF61]"); - UnicodeSet terminal_punctuation = getSet(BINARY_PROPERTIES, Terminal_Punctuation); - UnicodeMap names = new UnicodeMap(); - names.add("Pd", getSet(CATEGORY, Pd)); - names.add("Ps", getSet(CATEGORY, Ps)); - names.add("Pe", getSet(CATEGORY, Pe)); - names.add("Pc", getSet(CATEGORY, Pc)); - names.add("Po", getSet(CATEGORY, Po)); - names.add("Pi", getSet(CATEGORY, Pi)); - names.add("Pf", getSet(CATEGORY, Pf)); - - Utility.showSetDifferences(log, "Term", Term, "Terminal_Punctuation", terminal_punctuation, true, true, names, Default.ucd); - Utility.showSetDifferences(log, "Po", getSet(CATEGORY, Po), "Terminal_Punctuation", terminal_punctuation, true, true, names, Default.ucd); - log.close(); - - if (true) return; - - UnicodeSet whitespace = getSet(BINARY_PROPERTIES, White_space); - UnicodeSet space = getSet(CATEGORY, Zs).addAll(getSet(CATEGORY, Zp)).addAll(getSet(CATEGORY, Zl)); - Utility.showSetDifferences("White_Space", whitespace, "Z", space, true, Default.ucd); - - UnicodeSet isSpace = new UnicodeSet(); - UnicodeSet isSpaceChar = new UnicodeSet(); - UnicodeSet isWhitespace = new UnicodeSet(); - for (int i = 0; i <= 0xFFFF; ++i) { - if (Character.isSpace((char)i)) isSpace.add(i); - if (Character.isSpaceChar((char)i)) isSpaceChar.add(i); - if (Character.isWhitespace((char)i)) isWhitespace.add(i); - } - Utility.showSetDifferences("White_Space", whitespace, "isSpace", isSpace, true, Default.ucd); - Utility.showSetDifferences("White_Space", whitespace, "isSpaceChar", isSpaceChar, true, Default.ucd); - Utility.showSetDifferences("White_Space", whitespace, "isWhitespace", isWhitespace, true, Default.ucd); - return; - } - - if (DEBUG) { - checkDecomps(); - - Utility.showSetNames("", new UnicodeSet("[\u034F\u00AD\u1806[:DI:]-[:Cs:]-[:Cn:]]"), true, Default.ucd); - - System.out.println("*** Extend - Cf"); - - generateTerminalClosure(); - - GenerateWordBreakTest gwb = new GenerateWordBreakTest(); - PrintWriter systemPrintWriter = new PrintWriter(System.out); - gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false); - systemPrintWriter.flush(); - //showSet("sepSet", GenerateSentenceBreakTest.sepSet); - //showSet("atermSet", GenerateSentenceBreakTest.atermSet); - //showSet("termSet", GenerateSentenceBreakTest.termSet); - } - - if (true) { - GenerateBreakTest foo = new GenerateLineBreakTest(); - //foo.isBreak("(\"Go.\") (He did)", 5, true); - foo.isBreak("\u4e00\u4300", 1, true); - /* - GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest(); - //foo.isBreak("(\"Go.\") (He did)", 5, true); - foo.isBreak("3.4", 2, true); - */ - } - - new GenerateGraphemeBreakTest().run(); - new GenerateWordBreakTest().run(); - new GenerateLineBreakTest().run(); - new GenerateSentenceBreakTest().run(); - - //if (true) return; // cut short for now - + new GenerateGraphemeBreakTest(Default.ucd).run(); + new GenerateWordBreakTest(Default.ucd).run(); + new GenerateLineBreakTest(Default.ucd).run(); + new GenerateSentenceBreakTest(Default.ucd).run(); + } + + GenerateBreakTest(UCD ucd) { + this.ucd = ucd; + nfd = new Normalizer(Normalizer.NFD, ucd.getVersion()); + nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion()); } // COMMON STUFF for Hangul @@ -119,11 +55,11 @@ abstract public class GenerateBreakTest implements UCD_Types { static byte getHangulType(int cp) { - if (Default.ucd.isLeadingJamo(cp)) return hL; - if (Default.ucd.isVowelJamo(cp)) return hV; - if (Default.ucd.isTrailingJamo(cp)) return hT; - if (Default.ucd.isHangulSyllable(cp)) { - if (Default.ucd.isDoubleHangul(cp)) return hLV; + if (ucd.isLeadingJamo(cp)) return hL; + if (ucd.isVowelJamo(cp)) return hV; + if (ucd.isTrailingJamo(cp)) return hT; + if (ucd.isHangulSyllable(cp)) { + if (ucd.isDoubleHangul(cp)) return hLV; return hLVT; } return hNot; @@ -131,7 +67,7 @@ abstract public class GenerateBreakTest implements UCD_Types { */ /* static { - Default.setUCD(); + setUCD(); } */ @@ -144,11 +80,11 @@ abstract public class GenerateBreakTest implements UCD_Types { } // finds the first base character, or the first character if there is no base - public static int findFirstBase(String source, int start, int limit) { + public int findFirstBase(String source, int start, int limit) { int cp; for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(source, i); - byte cat = Default.ucd.getCategory(cp); + byte cat = ucd.getCategory(cp); if (((1< " + showData(decomp, INFOPROPS, "\r\n\t")); + System.out.println(showData(ucd, UTF16.valueOf(i), INFOPROPS, "\r\n\t")); + System.out.println(" => " + showData(ucd, decomp, INFOPROPS, "\r\n\t")); shown = true; } System.out.println(j + ": " + tests[k].fileName); @@ -203,13 +140,13 @@ abstract public class GenerateBreakTest implements UCD_Types { } } - static String showData(String source, UnicodeProperty[] props, String separator) { + static String showData(UCD ucd, String source, UCDProperty[] props, String separator) { StringBuffer result = new StringBuffer(); int cp; for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(source, i); if (i != 0) result.append(separator); - result.append(Default.ucd.getCodeAndName(cp)); + result.append(ucd.getCodeAndName(cp)); for (int j = 0; j < props.length; ++j) { result.append(", "); result.append(props[j].getProperty(SHORT)).append('=').append(props[j].getValue(cp,SHORT)); @@ -218,20 +155,18 @@ abstract public class GenerateBreakTest implements UCD_Types { return result.toString(); } - static void showSet(String title, UnicodeSet set) { + void showSet(String title, UnicodeSet set) { System.out.println(title + ": " + set.toPattern(true)); - Utility.showSetNames("", set, false, Default.ucd); + Utility.showSetNames("", set, false, ucd); } - - // determines if string is of form Base NSM* - static boolean isBaseNSMStar(String source) { + boolean isBaseNSMStar(String source) { int cp; int status = 0; for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(source, i); - byte cat = Default.ucd.getCategory(cp); + byte cat = ucd.getCategory(cp); int catMask = 1<"); out.println("" + fileName + " Break Chart"); out.println("