From 953d673966016d3fe5de7b3c9a51d63ee20d34ce Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Tue, 11 Oct 2005 19:39:16 +0000 Subject: [PATCH] ICU-4677 update tools X-SVN-Rev: 18658 --- .../ibm/icu/dev/demo/chart/UnicodeChart.java | 146 +++-- .../dev/test/util/CollectionUtilities.java | 31 +- .../ibm/icu/dev/test/util/TestUtilities.java | 2 +- .../icu/dev/test/util/UnicodeProperty.java | 17 +- .../com/ibm/text/UCD/GenerateData.java | 78 +-- .../UCD/GenerateStandardizedVariants.java | 10 +- tools/unicodetools/com/ibm/text/UCD/Main.java | 23 +- .../com/ibm/text/UCD/QuickTest.java | 87 ++- .../com/ibm/text/utility/Counter.java | 32 +- .../com/ibm/text/utility/TestUtility.java | 507 +++++++++++++++++- .../com/ibm/text/utility/UnicodeDataFile.java | 2 +- 11 files changed, 782 insertions(+), 153 deletions(-) diff --git a/icu4j/src/com/ibm/icu/dev/demo/chart/UnicodeChart.java b/icu4j/src/com/ibm/icu/dev/demo/chart/UnicodeChart.java index aed90038533..a3ee1d377ee 100644 --- a/icu4j/src/com/ibm/icu/dev/demo/chart/UnicodeChart.java +++ b/icu4j/src/com/ibm/icu/dev/demo/chart/UnicodeChart.java @@ -7,19 +7,33 @@ package com.ibm.icu.dev.demo.chart; import java.io.*; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + import com.ibm.icu.dev.test.util.*; +import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.lang.*; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.VersionInfo; public class UnicodeChart { + static int surrogateType = UCharacter.getType('\ud800'); + static int privateUseType = UCharacter.getType('\ue000'); + public static void main(String[] args) throws IOException { //int rowWidth = 256; - PrintWriter pw = BagFormatter.openUTF8Writer("", "UnicodeChart.html"); + VersionInfo vi = UCharacter.getUnicodeVersion(); + String version = vi.getMajor() + "." + vi.getMinor() + "." 
+ vi.getMilli(); + PrintWriter pw = BagFormatter.openUTF8Writer("C:\\DATA\\GEN\\", "UnicodeChart.html"); pw.println(""); pw.println(""); pw.println(""); - pw.println("Unicode 4.0 Chart"); + pw.println("Unicode " + version + " Chart"); pw.println(""); - pw.println(""); + pw.println("

Unicode 4.0 Chart

"); /*pw.println(""); for (int j = 0; j < rowWidth; ++j) { @@ -29,58 +43,118 @@ public class UnicodeChart { */ // TODO: fix Utility to take ints - int surrogateType = UCharacter.getType('\ud800'); - int privateUseType = UCharacter.getType('\ue000'); - System.out.println("Surrogate Type: Java=" + Character.SURROGATE + ", ICU=" + surrogateType); - System.out.println("Private-Use Type: Java=" + Character.PRIVATE_USE + ", ICU=" + privateUseType); + System.out.println("//Surrogate Type: Java=" + Character.SURROGATE + ", ICU=" + surrogateType); + System.out.println("//Private-Use Type: Java=" + Character.PRIVATE_USE + ", ICU=" + privateUseType); //boolean gotOne = true; int columns = 0; - int limit = 0x10FFFF; + int limit = 0x110000/16; char lastType = 'x'; int lastCount = 0; pw.println("

Unicode " + version + " Chart

"); pw.println(""); pw.println(""); pw.println(""); - pw.println(""); - pw.println(""); - pw.println(""); + pw.println(""); + pw.println(""); + pw.println(""); pw.println("
Key
XGraphic characters
\u00A0Whitespace
 Other Default Ignorable
 Undefined, Private Use, or Surrogates
 Noncharacter
\u00A0Other Default Ignorable
\u00A0Undefined, Private Use, or Surrogates
\u00A0Noncharacter
"); pw.println("

Copyright \u00A9 2003, Mark Davis. All Rights Reserved."); pw.close(); - System.out.println("columns: " + columns); + System.out.println("//columns: " + columns); } + + private static char getType(int i) { + char type = 'v'; + int cat = UCharacter.getType(i); + if (UCharacter.hasBinaryProperty(i, UProperty.NONCHARACTER_CODE_POINT)) { + type = 'n'; + } else if (cat == Character.UNASSIGNED || cat == surrogateType || cat == privateUseType) { + type = 'u'; + } else if (UCharacter.isUWhiteSpace(i)) { + type = 'w'; + } else if (UCharacter.hasBinaryProperty(i, UProperty.DEFAULT_IGNORABLE_CODE_POINT)) { + type = 'i'; + } else { + type = 'v'; + } + return type; + } static String hex(int i, int padTo) { String result = Integer.toHexString(i).toUpperCase(java.util.Locale.ENGLISH); diff --git a/icu4j/src/com/ibm/icu/dev/test/util/CollectionUtilities.java b/icu4j/src/com/ibm/icu/dev/test/util/CollectionUtilities.java index bd41b1a04b8..9b449dcb369 100644 --- a/icu4j/src/com/ibm/icu/dev/test/util/CollectionUtilities.java +++ b/icu4j/src/com/ibm/icu/dev/test/util/CollectionUtilities.java @@ -206,29 +206,32 @@ public final class CollectionUtilities { } /** - * Returns an int with bits set. 
- * Bit 4: a - b != {} - * Bit 2: a * b != {} // * is intersects - * Bit 1: b - a != {} - * Thus the bits can be used to get the following relations, plus - * for A_SUPERSET_B, use (x & NOT_A_SUPERSET_B) == 0 - * for A_SUBSET_B, use (x & NOT_A_SUBSET_B) == 0 - * for A_EQUALS_B, use (x & A_PROPER_DISJOINT_B) == 0 - * for A_DISJOINT_B, use (x & NOT_A_DISJOINT_B) == 0 - * for A_OVERLAPS_B, use (x & NOT_A_DISJOINT_B) == 1 + * Used for results of getContainmentRelation */ - static final int - // ContainmentRelation + public static final int ALL_EMPTY = 0, NOT_A_SUPERSET_B = 1, NOT_A_DISJOINT_B = 2, NOT_A_SUBSET_B = 4, + NOT_A_EQUALS_B = NOT_A_SUBSET_B | NOT_A_SUPERSET_B, A_PROPER_SUBSET_OF_B = NOT_A_DISJOINT_B | NOT_A_SUPERSET_B, - A_PROPER_DISJOINT_B = NOT_A_SUBSET_B | NOT_A_SUPERSET_B, A_PROPER_SUPERSET_B = NOT_A_SUBSET_B | NOT_A_DISJOINT_B, A_PROPER_OVERLAPS_B = NOT_A_SUBSET_B | NOT_A_DISJOINT_B | NOT_A_SUPERSET_B; - public static int getContainmentRelation(Collection a, Collection b) { + /** + * Assesses all the possible containment relations between collections A and B with one call.
+ * Returns an int with bits set, according to a "Venn Diagram" view of A vs B.
+ * NOT_A_SUPERSET_B: b - a != {}
+ * NOT_A_DISJOINT_B: a * b != {} // * is intersection
+ * NOT_A_SUBSET_B: a - b != {}
+ * Thus the bits can be used to get the following relations:
+ * for A_SUPERSET_B, use (x & CollectionUtilities.NOT_A_SUPERSET_B) == 0
+ * for A_SUBSET_B, use (x & CollectionUtilities.NOT_A_SUBSET_B) == 0
+ * for A_EQUALS_B, use (x & CollectionUtilities.NOT_A_EQUALS_B) == 0
+ * for A_DISJOINT_B, use (x & CollectionUtilities.NOT_A_DISJOINT_B) == 0
+ * for A_OVERLAPS_B, use (x & CollectionUtilities.NOT_A_DISJOINT_B) != 0
+ */ + public static int getContainmentRelation(Collection a, Collection b) { if (a.size() == 0) { return (b.size() == 0) ? ALL_EMPTY : NOT_A_SUPERSET_B; } else if (b.size() == 0) { diff --git a/icu4j/src/com/ibm/icu/dev/test/util/TestUtilities.java b/icu4j/src/com/ibm/icu/dev/test/util/TestUtilities.java index 29901b4cd01..1c1ccb6af65 100644 --- a/icu4j/src/com/ibm/icu/dev/test/util/TestUtilities.java +++ b/icu4j/src/com/ibm/icu/dev/test/util/TestUtilities.java @@ -185,7 +185,7 @@ public class TestUtilities extends TestFmwk { case CollectionUtilities.A_PROPER_SUBSET_OF_B: checkContainment(b.containsAll(a) && !a.equals(b), a, relation, b); break; - case CollectionUtilities.A_PROPER_DISJOINT_B: + case CollectionUtilities.NOT_A_EQUALS_B: checkContainment(!CollectionUtilities.containsSome(a, b) && a.size() != 0 && b.size() != 0, a, relation, b); break; case CollectionUtilities.A_PROPER_SUPERSET_B: diff --git a/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java b/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java index a33c1682e13..5723e85a57c 100644 --- a/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java +++ b/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java @@ -318,8 +318,23 @@ Name: Unicode_1_Name * @return the unicode map */ public UnicodeMap getUnicodeMap() { - return (UnicodeMap) getUnicodeMap_internal().clone(); + return getUnicodeMap(false); } + + /** + * @return the unicode map + */ + public UnicodeMap getUnicodeMap(boolean getShortest) { + if (!getShortest) return (UnicodeMap) getUnicodeMap_internal().clone(); + UnicodeMap result = new UnicodeMap(); + for (int i = 0; i <= 0x10FFFF; ++i) { + //if (DEBUG && i == 0x41) System.out.println(i + "\t" + getValue(i)); + String value = getValue(i,true); + result.put(i, value); + } + return result; + } + /** * @return the unicode map diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index 4019fec01fa..dd863002008 100644 
--- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2005/03/26 05:40:04 $ -* $Revision: 1.38 $ +* $Date: 2005/10/11 19:39:15 $ +* $Revision: 1.39 $ * ******************************************************************************* */ @@ -24,7 +24,7 @@ import com.ibm.icu.text.UnicodeSet; public class GenerateData implements UCD_Types { - static final boolean DEBUG = false; + /* static final boolean DEBUG = false; static final String HORIZONTAL_LINE = "# ================================================"; @@ -156,8 +156,7 @@ public class GenerateData implements UCD_Types { System.out.println("New File: " + newFile); PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); String[] batName = {""}; - String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName); - System.out.println("Most recent: " + mostRecent); + org.unicode.cldr.util.Utility.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName); doHeader(fileName + UnicodeDataFile.getFileSuffix(false), output, headerChoice); for (int i = 0; i < DERIVED_PROPERTY_LIMIT; ++i) { @@ -180,7 +179,7 @@ public class GenerateData implements UCD_Types { Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); } - /* + public static void listStrings(String file, int type, int subtype) throws IOException { Default.ucd = UCD.make("3.1.0"); UCD ucd30 = UCD.make("3.0.0"); @@ -199,14 +198,14 @@ public class GenerateData implements UCD_Types { } output.close(); } - */ + public static void generateCompExclusions() throws IOException { String newFile = "DerivedData/CompositionExclusions" + UnicodeDataFile.getFileSuffix(true); PrintWriter output = 
Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); String[] batName = {""}; - String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "CompositionExclusions", UnicodeDataFile.getFileSuffix(true), batName); + String mostRecent = org.unicode.cldr.util.Utility.generateBat("DerivedData/", "CompositionExclusions", UnicodeDataFile.getFileSuffix(true), batName); output.println("# CompositionExclusions" + UnicodeDataFile.getFileSuffix(false)); output.println(UnicodeDataFile.generateDateLine()); @@ -280,11 +279,11 @@ public class GenerateData implements UCD_Types { } public String optionalComment(int cp) { return ""; } - /* + public String valueName(int cp) { return UTF32.length32(ucdData.getDecompositionMapping(cp)) + ""; } - */ + public byte status(int cp) { if (getType(cp) == type) return INCLUDE; return EXCLUDE; @@ -356,12 +355,12 @@ public class GenerateData implements UCD_Types { if (extra != null) checkDuplicate(duplicates, accumulation, extra, "General_Category=" + value); } - /* + addLine(sorted, "xx; T ; True"); checkDuplicate(duplicates, accumulation, "T", "xx=True"); addLine(sorted, "xx; F ; False"); checkDuplicate(duplicates, accumulation, "F", "xx=False"); - */ + addLine(sorted, "qc", UCD_Names.YN_TABLE[1], UCD_Names.YN_TABLE_LONG[1], null); checkDuplicate(duplicates, accumulation, UCD_Names.YN_TABLE[1], "qc=" + UCD_Names.YN_TABLE_LONG[1]); addLine(sorted, "qc", UCD_Names.YN_TABLE[0], UCD_Names.YN_TABLE_LONG[0], null); @@ -440,7 +439,7 @@ public class GenerateData implements UCD_Types { valueAbb = "n/a"; } - /* + String elide = ""; if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{" + valueAbb @@ -458,9 +457,9 @@ public class GenerateData implements UCD_Types { + value + "}"; System.out.println("" + elide + "" + abb + "" + norm + ""); - */ - /* + + if (type == BINARY_PROPERTIES || type == DERIVED) { //if (value.equals(YN_TABLE_LONG[1])) continue; addLine(sorted, PROP_TYPE_NAMES[BINARY][1], valueAbb, value); @@ -468,7 
+467,7 @@ public class GenerateData implements UCD_Types { if (!value.equalsIgnoreCase(valueAbb)) checkDuplicate(duplicates, accumulation, valueAbb, value); continue; } - */ + if (type == COMBINING_CLASS) { String num = up.getValue(NUMBER); @@ -487,20 +486,20 @@ public class GenerateData implements UCD_Types { while (blockIterator.hasNext()) { addLine(sorted, "blk", "n/a", (String)blockIterator.next(), null); } - /* + UCD.BlockData blockData = new UCD.BlockData(); int blockId = 0; while (Default.ucd().getBlockData(blockId++, blockData)) { addLine(sorted, "blk", "n/a", blockData.name); } - */ + String filename = "PropertyAliases"; String newFile = "DerivedData/" + filename + UnicodeDataFile.getFileSuffix(true); PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); String[] batName = {""}; - String mostRecent = UnicodeDataFile.generateBat("DerivedData/", filename, UnicodeDataFile.getFileSuffix(true), batName); + String mostRecent = org.unicode.cldr.util.Utility.generateBat("DerivedData/", filename, UnicodeDataFile.getFileSuffix(true), batName); log.println("# " + filename + UnicodeDataFile.getFileSuffix(false)); log.println(UnicodeDataFile.generateDateLine()); @@ -520,7 +519,7 @@ public class GenerateData implements UCD_Types { filename = "PropertyValueAliases"; newFile = "DerivedData/" + filename + UnicodeDataFile.getFileSuffix(true); log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); - mostRecent = UnicodeDataFile.generateBat("DerivedData/", filename, UnicodeDataFile.getFileSuffix(true), batName); + mostRecent = org.unicode.cldr.util.Utility.generateBat("DerivedData/", filename, UnicodeDataFile.getFileSuffix(true), batName); log.println("# " + filename + UnicodeDataFile.getFileSuffix(false)); log.println(UnicodeDataFile.generateDateLine()); @@ -536,7 +535,7 @@ public class GenerateData implements UCD_Types { filename = "PropertyAliasSummary"; newFile = "OtherData/" + filename + UnicodeDataFile.getFileSuffix(true); log = 
Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); - mostRecent = UnicodeDataFile.generateBat("OtherData/", filename, UnicodeDataFile.getFileSuffix(true), batName); + mostRecent = org.unicode.cldr.util.Utility.generateBat("OtherData/", filename, UnicodeDataFile.getFileSuffix(true), batName); log.println(); log.println(HORIZONTAL_LINE); @@ -650,13 +649,13 @@ public class GenerateData implements UCD_Types { } // accumulate differences - /* + String acc = (String)accumulation.get(toCheck); if (acc == null) { acc = "# \"" + toCheck + "\":\t" + originalComment; } acc += ";\t" + result; - */ + result.add(comment); accumulation.add("# " + result.toString() + ":\t" + toCheck); } else { @@ -673,7 +672,7 @@ public class GenerateData implements UCD_Types { String newFile = directory + file + UnicodeDataFile.getFileSuffix(true); PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); String[] batName = {""}; - String mostRecent = UnicodeDataFile.generateBat(directory, file, UnicodeDataFile.getFileSuffix(true), batName); + String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, file, UnicodeDataFile.getFileSuffix(true), batName); doHeader(file + UnicodeDataFile.getFileSuffix(false), output, headerChoice); int last = -1; @@ -682,7 +681,7 @@ public class GenerateData implements UCD_Types { if (up == null) continue; if (up.skipInDerivedListing()) continue; - /* + if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE || i == (BINARY_PROPERTIES | Non_break) || i == (BINARY_PROPERTIES | CaseFoldTurkishI) @@ -690,11 +689,11 @@ public class GenerateData implements UCD_Types { || i == (JOINING_TYPE | JT_U) || i == (JOINING_GROUP | NO_SHAPING) ) continue; // skip zero case - */ - /*if (skipSpecial == SKIP_SPECIAL + + if (skipSpecial == SKIP_SPECIAL && i >= (BINARY_PROPERTIES | CompositionExclusion) && i < (AGE + NEXT_ENUM)) continue; - */ + if ((last & 0xFF00) != (i & 0xFF00) && (i <= BINARY_PROPERTIES || i >= SCRIPT)) { output.println(); 
output.println(HORIZONTAL_LINE); @@ -741,7 +740,8 @@ public class GenerateData implements UCD_Types { Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); System.out.println(); } - + + */ static public void writeNormalizerTestSuite(String directory, String fileName) throws IOException { UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, fileName); @@ -750,13 +750,13 @@ public class GenerateData implements UCD_Types { String newFile = directory + fileName + UnicodeDataFile.getFileSuffix(true); //PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX); //String[] batName = {""}; - //String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName); + //String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName); String[] example = new String[256]; //log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false)); //log.println(UnicodeDataFile.generateDateLine()); - /*log.println("#"); + log.println("#"); log.println("# Normalization Test Suite"); log.println("# Format:"); log.println("#"); @@ -790,7 +790,7 @@ public class GenerateData implements UCD_Types { log.println("#"); log.println("@Part0 # Specific cases"); - log.println("#");*/ + log.println("#"); for (int j = 0; j < testSuiteCases.length; ++j) { writeLine(testSuiteCases[j], log, false); @@ -897,6 +897,7 @@ public class GenerateData implements UCD_Types { fc.close(); //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); } + /* static void handleIdentical() throws IOException { DirectoryIterator target = new DirectoryIterator(GEN_DIR + File.separator + "DerivedData"); @@ -916,6 +917,7 @@ public class GenerateData implements UCD_Types { } } +*/ static void writeLine(String cc, PrintWriter log, boolean check) { String c = Default.nfc().normalize(cc); String d = Default.nfd().normalize(cc); @@ -982,14 
+984,14 @@ public class GenerateData implements UCD_Types { "\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD" }; - + /* static final void backwardsCompat(String directory, String filename, int[] list) throws IOException { String newFile = directory + filename + UnicodeDataFile.getFileSuffix(true); PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); String[] batName = {""}; - String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName); + String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName); DiffPropertyLister dpl; UnicodeSet cummulative = new UnicodeSet(); @@ -1072,7 +1074,7 @@ public class GenerateData implements UCD_Types { String newFile = directory + filename + UnicodeDataFile.getFileSuffix(true); PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); String[] batName = {""}; - String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName); + String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName); try { log.println("# " + filename + UnicodeDataFile.getFileSuffix(false)); log.println(UnicodeDataFile.generateDateLine()); @@ -1116,7 +1118,7 @@ public class GenerateData implements UCD_Types { log.println(HORIZONTAL_LINE); log.println(); new DiffPropertyLister("3.2.0", "4.0.0", log).print(); - /* + printDiff("110", "200"); UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false); UnicodeSet u20 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.0.txt", false); @@ -1157,7 +1159,7 @@ public class GenerateData implements UCD_Types { + n.format(u31m.count())); log.println(); u31m.print(log, false, false, "3.1"); - */ + } finally { if (log != null) { log.close(); @@ -1326,5 +1328,5 @@ public class GenerateData implements UCD_Types { + 
(start != end ? ".." + Default.ucd().getName(end) : "")); } System.out.println("TrailingZero count: " + result.size()); - } + }*/ } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java b/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java index 3ba8181c18f..496d8042cc4 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java,v $ -* $Date: 2004/02/12 08:23:15 $ -* $Revision: 1.5 $ +* $Date: 2005/10/11 19:39:15 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -99,8 +99,8 @@ public final class GenerateStandardizedVariants implements UCD_Types { String directory = "DerivedData/"; String filename = directory + "StandardizedVariants" + UnicodeDataFile.getHTMLFileSuffix(true); PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX); - String[] batName = {""}; - String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName); + //String[] batName = {""}; + //String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName); String version = Default.ucd().getVersion(); int lastDot = version.lastIndexOf('.'); @@ -118,6 +118,6 @@ public final class GenerateStandardizedVariants implements UCD_Types { Utility.appendFile("StandardizedVariants-Template.html", Utility.UTF8, out, replacementList); out.close(); - Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]); + //Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]); } } diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java 
b/tools/unicodetools/com/ibm/text/UCD/Main.java index dfaad4d993a..c059d8d1c74 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2004/10/14 17:54:56 $ -* $Revision: 1.35 $ +* $Date: 2005/10/11 19:39:15 $ +* $Revision: 1.36 $ * ******************************************************************************* */ @@ -143,7 +143,7 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null); else if (arg.equalsIgnoreCase("checkcollator")) CheckCollator.main(null); - else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit(); + //else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit(); else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity(); else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test(); @@ -157,7 +157,7 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("VerifyIDN")) VerifyUCD.VerifyIDN(); else if (arg.equalsIgnoreCase("NFTest")) VerifyUCD.NFTest(); else if (arg.equalsIgnoreCase("test1")) VerifyUCD.test1(); - else if (arg.equalsIgnoreCase("TrailingZeros")) GenerateData.genTrailingZeros(); + //else if (arg.equalsIgnoreCase("TrailingZeros")) GenerateData.genTrailingZeros(); else if (arg.equalsIgnoreCase("GenerateThaiBreaks")) GenerateThaiBreaks.main(null); else if (arg.equalsIgnoreCase("TestData")) TestData.main(new String[]{args[++i]}); @@ -168,9 +168,9 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null); else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned(); else if (arg.equalsIgnoreCase("TestDirectoryIterator")) DirectoryIterator.test(); - else if (arg.equalsIgnoreCase("checkIdentical")) 
GenerateData.handleIdentical(); + //else if (arg.equalsIgnoreCase("checkIdentical")) GenerateData.handleIdentical(); else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.checkNameList(); - else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0"); + //else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0"); else if (arg.equalsIgnoreCase("Compare14652")) Compare14652.main(null); @@ -182,7 +182,7 @@ public final class Main implements UCD_Types { GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt"); */ // EXTRACTED PROPERTIES - + /* else if (arg.equalsIgnoreCase("DerivedBidiClass")) { GenerateData.generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED, "DerivedData/extracted/", "DerivedBidiClass"); @@ -230,8 +230,9 @@ public final class Main implements UCD_Types { } else if (arg.equalsIgnoreCase("DerivedNumericValues")) { GenerateData.generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, GenerateData.HEADER_DERIVED, "DerivedData/extracted/", "DerivedNumericValues" ); - - } else if (arg.equalsIgnoreCase("StandardizedVariants")) { + } + */ + else if (arg.equalsIgnoreCase("StandardizedVariants")) { GenerateStandardizedVariants.generate(); // OTHER STANDARD PROPERTIES @@ -244,7 +245,7 @@ public final class Main implements UCD_Types { GenerateCaseFolding.generateSpecialCasing(true); GenerateCaseFolding.generateSpecialCasing(false); - } else if (arg.equalsIgnoreCase("CompositionExclusions")) { + /* } else if (arg.equalsIgnoreCase("CompositionExclusions")) { GenerateData.generateCompExclusions(); } else if (arg.equalsIgnoreCase("DerivedAge")) { @@ -305,7 +306,7 @@ public final class Main implements UCD_Types { } else if (arg.equalsIgnoreCase("listKatakana")) { GenerateData.listKatakana(); - +*/ /* } else if (arg.equalsIgnoreCase("DerivedFullNormalization")) { mask = Utility.setBits(0, DerivedProperty.GenNFD, DerivedProperty.GenNFKC); diff --git 
a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java index 48eba3058aa..fbb4bab907b 100644 --- a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $ -* $Date: 2005/06/24 23:51:52 $ -* $Revision: 1.6 $ +* $Date: 2005/10/11 19:39:15 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -17,6 +17,11 @@ import java.util.*; import java.io.*; import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.UnicodeMap; +import com.ibm.icu.dev.test.util.UnicodeProperty; +import com.ibm.icu.dev.test.util.UnicodePropertySource; +import com.ibm.icu.dev.test.util.UnicodeMap.MapIterator; +import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; @@ -24,7 +29,77 @@ import com.ibm.icu.text.UnicodeSetIterator; import com.ibm.text.utility.*; public class QuickTest implements UCD_Types { + public static void main(String[] args) throws IOException { + getBidiMirrored(); + if (true) return; + getLengths("NFC", Default.nfc()); + getLengths("NFD", Default.nfd()); + getLengths("NFKC", Default.nfkc()); + getLengths("NFKD", Default.nfkd()); + System.out.println("Done"); + } + + + private static void getBidiMirrored() { + ToolUnicodePropertySource foo = ToolUnicodePropertySource.make(""); + UnicodeMap status = new UnicodeMap(); + status.putAll(foo.getSet("generalcategory=ps"), "*open/close*"); + status.putAll(foo.getSet("generalcategory=pe"), "*open/close*"); + status.putAll(foo.getSet("generalcategory=pi"), "*open/close*"); + status.putAll(foo.getSet("generalcategory=pf"), "*open/close*"); + + UnicodeSet bidiMirroredSet = foo.getSet("bidimirrored=true"); 
+ status.putAll(bidiMirroredSet, "*core*"); + UnicodeSet bidiMirroringSet = new UnicodeSet(); + UnicodeProperty x = foo.getProperty("bidimirroringglyph"); + for (int i = 0; i < 0x10FFFF; ++i) { + String s = x.getValue(i); + if (!s.equals(UTF16.valueOf(i))) bidiMirroringSet.add(i); + } + status.putAll(new UnicodeSet(bidiMirroredSet).removeAll(bidiMirroringSet), "no bidi mirroring"); + UnicodeSet mathSet = foo.getSet("generalcategory=sm"); + status.putAll(mathSet, "math"); + + UnicodeSet special = new UnicodeSet("[<>]"); + for (UnicodeSetIterator it = new UnicodeSetIterator(mathSet); it.next();) { + String s = Default.nfkd().normalize(it.codepoint); + if (special.containsSome(s)) status.put(it.codepoint, "*special*"); + } + //showStatus(status); + // close under nfd + for (int i = 0; i < 0x10FFFF; ++i) { + if (!Default.ucd().isAssigned(i)) continue; + if (!Default.ucd().isPUA(i)) continue; + if (Default.nfkc().isNormalized(i)) continue; + String oldValue = (String) status.getValue(i); + if (oldValue != null) continue; + String s = Default.nfkc().normalize(i); + if (UTF16.countCodePoint(s) != 1) continue; + int cp = UTF16.charAt(s, 0); + String value = (String)status.getValue(cp); + if (value != null) status.put(i, "nfc-closure-" + value); + } + showStatus(status, bidiMirroredSet); + } + + static BagFormatter bf = new BagFormatter(); + private static void showStatus(UnicodeMap status, UnicodeSet x) { + Collection list = new TreeSet(status.getAvailableValues()); + for (Iterator it = list.iterator(); it.hasNext(); ) { + String value = (String) it.next(); + if (value == null) continue; + UnicodeSet set = status.getSet(value); + for (UnicodeSetIterator umi = new UnicodeSetIterator(set); umi.next();) { + System.out.println(Utility.hex(umi.codepoint) + + ";\t" + value + + ";\t" + (x.contains(umi.codepoint) ? 
"O" : "") + + ";\t" + Default.ucd().getName(umi.codepoint)); + } + } + } + + public static class Length { String title; int bytesPerCodeUnit; @@ -50,14 +125,6 @@ public class QuickTest implements UCD_Types { } } - public static void main(String[] args) throws IOException { - getLengths("NFC", Default.nfc()); - getLengths("NFD", Default.nfd()); - getLengths("NFKC", Default.nfkc()); - getLengths("NFKD", Default.nfkd()); - System.out.println("Done"); - } - static final int skip = (1< 2) return false; + if (len == 1) return i == value.charAt(0); + if (i <= 0xFFFF) return false; + return i == UTF16.charAt(value,0); + } + + /** + * + */ + private static void testHanProp(int iterations, int total, String pname, String type) throws IOException, ClassNotFoundException { + System.out.println(); + UnicodeMap umap = Default.ucd().getHanValue(pname); + System.out.println(umap); + umap.setMissing("na"); + System.out.print("Name:\t" + pname + "\tType:\t" + type); + total = testUnicodeMapSerialization(iterations, total, pname, umap); + } + + static String outdircore = "C:\\DATA\\bin\\UCD_Data"; + static String outdir = outdircore + "4.1.0\\"; + /** + * @param pname + * + */ + private static int testUnicodeMapSerialization(int iterations, int total, String pname, UnicodeMap umap) throws IOException, ClassNotFoundException { + System.out.print("\tValue Count:\t" + umap.getAvailableValues().size()); + + String filename = outdir + pname + ".bin"; + OutputStream out; + ByteArrayOutputStream baout = null; + if (USE_FILE) { + out = new FileOutputStream(filename); + } else { + out = baout = new ByteArrayOutputStream(); + } + out = new GZIPOutputStream(out); + ObjectOutputStream oos = new ObjectOutputStream(out); + //Random rand = new Random(); + +/* if (false) { + oos.writeObject(umap); + oos.close(); + buffer = baout.toByteArray(); + in = new ByteArrayInputStream(buffer, 0, baout.size()); + ois = new ObjectInputStream(in); + reverseMap = (UnicodeMap) ois.readObject(); + } +*/ + // 
UnicodeMap.StreamCompressor sc = new UnicodeMap.StreamCompressor(); + // int test = (int)Math.abs(rand.nextGaussian()*100000); + // System.out.print(Integer.toString(test, 16).toUpperCase()); + // sc.writeInt(out, test); + // out.close(); + //oos.writeBoolean(true); + //oos.writeUTF("abcdefg"); + oos.writeObject(umap); + oos.close(); + + + long size; + byte[] buffer; + if (USE_FILE) { + size = new File(filename).length(); + } else { + size = baout.size(); + buffer = baout.toByteArray(); + if (DEBUG) System.out.println(showBuffer(buffer, size)); + } + System.out.print("\t"+"Size:\t" + size); + + + // only measure read time + UnicodeMap reverseMap = null; + long start = System.currentTimeMillis(); + for (int i = iterations; i > 0; --i) { + InputStream in; + if (USE_FILE) { + in = new FileInputStream(filename); + } else { + in = new ByteArrayInputStream(buffer, 0, (int)size); + } + in = new GZIPInputStream(in); + // int x = sc.readInt(in); + // if (x != test) System.out.println("Failure"); + // System.out.println("\t=> " + Integer.toString(x, 16).toUpperCase()); + ObjectInputStream ois = new ObjectInputStream(in); + //System.out.println(ois.readBoolean()); + //System.out.println(ois.readUTF()); + + try { + reverseMap = (UnicodeMap) ois.readObject(); + } catch (java.io.OptionalDataException e1) { + System.out.println(e1.eof + "\t" + e1.length); + // TODO Auto-generated catch block + e1.printStackTrace(); + } + ois.close(); + } + long end = System.currentTimeMillis(); + + if (!reverseMap.equals(umap)) { + System.out.println("Failed roundtrip"); + for (int i = 0; i <= 0x10FFFF; ++i) { + String main = (String) umap.getValue(i); + String rev = (String) reverseMap.getValue(i); + if (UnicodeMap.areEqual(main, rev)) + continue; + System.out.println(Utility.hex(i) + "\t'" + main + "',\t'" + + rev + "'"); + } + } + //out.toByteArray(); + total += size; + System.out.print("\tTime:\t" + (end - start) / (iterations * 1.0) + + "\tmsecs (raw:\t" + ((end - start) / 1000.0) + 
"\tsecs)"); + /* with Vanilla Serialization + * Size: 24131 + * Time: 1.9488 msecs (raw: 9.744 secs) + * With my serialization + * Size: 19353 + * Time: 0.8652 msecs (raw: 4.326 secs) + * With my serialization, and compression of ints + * Size: 8602 + * Time: 2.784 msecs (raw: 1.392 secs) + * With delta encoding + * Size: 5226 + * Time: 1.924 msecs (raw: 0.962 secs) + * Name: + * Size: 776926 + * Time: 180.3 msecs (raw: 1.803 secs) + */ + return total; + } + + /** + * + */ + private static String showBuffer(byte[] buffer, long size) { + StringBuffer result = new StringBuffer(); + for (int j = 0; j < size; ++j) { + if (j != 0) result.append(' '); + result.append(Utility.hex(buffer[j]&0xFF,2)); + } + return result.toString(); + } + + /** + * + */ + private static void testStreamCompressor() throws IOException { + Object[] tests = { + UTF16.valueOf(0x10FFFF),"\u1234", "abc", + new Long(-3), new Long(12345), + new Short(Short.MAX_VALUE), new Short(Short.MIN_VALUE), + new Integer(Integer.MAX_VALUE), new Integer(Integer.MIN_VALUE), + new Long(Long.MIN_VALUE), new Long(Long.MAX_VALUE)}; + + for (int i = 0; i < tests.length; ++i) { + Object source = tests[i]; + ByteArrayOutputStream out = new ByteArrayOutputStream(100); + ObjectOutputStream out2 = new ObjectOutputStream(out); + ByteArrayInputStream in; + ObjectInputStream ois; + byte[] buffer; + DataOutputCompressor sc = new DataOutputCompressor(out2); + long y = 0; + if (source instanceof String) { + sc.writeUTF((String)source); + } else { + y = ((Number)source).longValue(); + sc.writeLong(y); + } + out2.close(); + buffer = out.toByteArray(); + showBytes(buffer, out.size()); + System.out.println(); + in = new ByteArrayInputStream(buffer, 0, out.size()); + ObjectInputStream in2 = new ObjectInputStream(in); + DataInputCompressor isc = new DataInputCompressor(in2); + boolean success = false; + Object result; + boolean isString = source instanceof String; + long x = 0; + if (isString) { + result = isc.readUTF(); + 
System.out.println(i + "\t" + source + + "\t" + result + + (source.equals(result) ? "\tSuccess" : "\tBitter Failure")); + } else { + x = isc.readLong(); + result = new Long(x); + System.out.println(i + "\t" + y + + x + + "\t" + Utility.hex(y) + + "\t" + Utility.hex(x) + + (x == y ? "\tSuccess" : "\tBitter Failure")); + } + + in2.close(); + } + } + + /** + * + */ + private static void showBytes(byte[] buffer, int len) { + for (int i = 0; i < len; ++i) { + System.out.print(Utility.hex(buffer[i]&0xFF,2) + " "); + } + } + + /** + * + */ + private static UnicodeMap fixNameMap(BreakIterator bk, UnicodeMap umap) { + UnicodeMap temp = new UnicodeMap(); + Counter counter = new Counter(); + for (int i = 0; i < 0x10FFFF; ++i) { + String name = (String) umap.getValue(i); + if (name == null) + continue; + if (name.startsWith("CJK UNIFIED IDEOGRAPH-")) + name = "*"; + else if (name.startsWith("CJK COMPATIBILITY IDEOGRAPH-")) + name = "#"; + else if (name.startsWith("HANGUL SYLLABLE ")) name = "@"; + bk.setText(name); + int start = 0; + while (true) { + int end = bk.next(); + if (end == bk.DONE) + break; + String word = name.substring(start, end); + counter.add(word, Math.max(0, word.length() - 2)); + start = end; + } + temp.put(i, name); + } + if (false) { + Map m = counter.getSortedByCount(); + int count = 0; + int running = 0; + for (Iterator it = m.keySet().iterator(); it.hasNext();) { + Counter.RWInteger c = (Counter.RWInteger) it.next(); + String value = (String) m.get(c); + running += c.value; + System.out.println(count++ + "\t" + c + "\t" + running + + "\t" + value); + } + for (UnicodeMap.MapIterator it2 = new UnicodeMap.MapIterator( + temp); it2.nextRange();) { + System.out.println(Utility.hex(it2.codepoint) + "\t" + + Utility.hex(it2.codepointEnd) + "\t" + + it2.value); + } + } + umap = temp; + return umap; + } + + /** + * + */ + private static void tryFileUnicodeProperty() { + UnicodeProperty.Factory factory = FileUnicodeProperty.Factory.make("4.1.0"); + 
System.out.println(factory.getAvailableNames()); + UnicodeProperty prop = factory.getProperty("White_Space"); + System.out.println(prop.getUnicodeMap()); + prop = factory.getProperty("kRSUnicode"); + System.out.println(); + System.out.println(prop.getUnicodeMap()); + } + + public static class FileUnicodeProperty extends UnicodeProperty { + private File file; + private String version; + private UnicodeMap map; + + private FileUnicodeProperty(File file, String version) { + this.file = file; + this.version = version; + String base = file.getName(); + setName(base.substring(0, base.length()-4)); // subtract .bin + } + + public static class Factory extends UnicodeProperty.Factory { + private Factory() {} + public static Factory make(String version) { + Factory result = new Factory(); + File f = new File(outdircore + version + "\\"); + File[] files = f.listFiles(); + for (int i = 0; i < files.length; ++i) { + result.add(new FileUnicodeProperty(files[i], version)); + } + return result; + } + } + + protected List _getAvailableValues(List result) { + if (map == null) make(); + return (List) map.getAvailableValues(result); + } + + protected String _getVersion() { + return version; + } + + /* (non-Javadoc) + * @see com.ibm.icu.dev.test.util.UnicodeProperty#_getValue(int) + */ + protected String _getValue(int codepoint) { + if (map == null) make(); + return (String)map.getValue(codepoint); + } + + /** + * + */ + private void make() { + try { + InputStream in = new FileInputStream(file.getCanonicalPath()); + ObjectInputStream ois = new ObjectInputStream(in); + map = (UnicodeMap) ois.readObject(); + ois.close(); + } catch (Exception e) { + throw (InternalError)new InternalError("Can't create property").initCause(e); + } + } + + protected List _getNameAliases(List result) { + result.add(getName()); + return result; + } + + protected List _getValueAliases(String valueAlias, List result) { + return result; } } - */ } \ No newline at end of file diff --git 
a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java index f2fe443767e..881197e45f0 100644 --- a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java +++ b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java @@ -35,7 +35,7 @@ public class UnicodeDataFile { String[] batName2 = {""}; mostRecent = UnicodeDataFile.generateBat(directory, filename, newSuffix, fileType, batName2); batName = batName2[0]; - filename = filename; + this.filename = filename; if (!isHTML) { out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));