diff --git a/tools/unicodetools/com/ibm/text/UCA/CEList.java b/tools/unicodetools/com/ibm/text/UCA/CEList.java index 24f0073b3a2..ce2511664fb 100644 --- a/tools/unicodetools/com/ibm/text/UCA/CEList.java +++ b/tools/unicodetools/com/ibm/text/UCA/CEList.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $ -* $Date: 2001/08/31 00:20:40 $ -* $Revision: 1.2 $ +* $Date: 2001/09/19 23:32:21 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -102,7 +102,8 @@ public final class CEList implements java.lang.Comparable, UCD_Types { for (int i = startOffset; i < min; ++i) { if (contents[i] != that.contents[i + delta]) { - if (contents[i] < that.contents[i + delta]) return -1; + if ((contents[i] & 0xFFFFFFFFL) + < (that.contents[i + delta] & 0xFFFFFFFFL)) return -1; return 1; } } @@ -158,7 +159,9 @@ public final class CEList implements java.lang.Comparable, UCD_Types { public static String toString(int ce) { return "[" + Utility.hex(UCA.getPrimary(ce)) + "." + Utility.hex(UCA.getSecondary(ce)) + "." - + Utility.hex(UCA.getTertiary(ce)) + "](" + NAME3[UCA.getTertiary(ce)] + ")"; + + Utility.hex(UCA.getTertiary(ce)) + "]" + // + "(" + NAME3[UCA.getTertiary(ce)] + ")" + ; } static final String[] NAME3 = { diff --git a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java index 30b5ab84e4a..42a64314b40 100644 --- a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java +++ b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $ -* $Date: 2001/09/06 01:30:31 $ -* $Revision: 1.3 $ +* $Date: 2001/09/19 23:32:21 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -43,7 +43,7 @@ public class GenOverlap implements UCD_Types { nfd = new Normalizer(Normalizer.NFD); nfkd = new Normalizer(Normalizer.NFKD); - UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd); + UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd); // store data for faster lookup @@ -307,7 +307,7 @@ public class GenOverlap implements UCD_Types { nfd = new Normalizer(Normalizer.NFD); nfkd = new Normalizer(Normalizer.NFKD); - UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd); + UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd); // store data for faster lookup @@ -505,7 +505,7 @@ public class GenOverlap implements UCD_Types { //nfd = new Normalizer(Normalizer.NFD); //nfkd = new Normalizer(Normalizer.NFKD); - UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd); + UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd); nfd = new Normalizer(Normalizer.NFD); nfkd = new Normalizer(Normalizer.NFKD); diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java new file mode 100644 index 00000000000..4968b192849 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCA/Main.java @@ -0,0 +1,20 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ +* $Date: 2001/09/19 23:31:50 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCA; + +public class Main { + public static void main(String args[]) throws Exception { + WriteCollationData.main(args); // TODO, pull from there to here. + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index 6446431c460..3855c3cf0c9 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2001/09/06 01:30:31 $ -* $Revision: 1.3 $ +* $Date: 2001/09/19 23:32:21 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -765,15 +765,6 @@ final public class UCA { */ static final int EXCEPTION_CE_MASK = 0xFFC00000; - /** - * Any unsupported characters (those not in the UCA data tables) - * are marked with a exception bit combination - * so that they can be treated specially.
- * There are at least 34 values, so that we can use a range for surrogates - * However, we do add to the first weight if we have surrogate pairs! - */ - static final int UNSUPPORTED = 0xFFC20101; - /** * Used to composed Hangul and Han characters */ @@ -781,6 +772,18 @@ final public class UCA { static final int NEUTRAL_SECONDARY = 0x20; static final int NEUTRAL_TERTIARY = 0x02; + /** + * Any unsupported characters (those not in the UCA data tables) + * are marked with a exception bit combination + * so that they can be treated specially.
+ * There are at least 34 values, so that we can use a range for surrogates + * However, we do add to the first weight if we have surrogate pairs! + */ + static final int UNSUPPORTED_P = 0xFFC2; + static final int UNSUPPORTED = makeKey(UNSUPPORTED_P, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); + + // was 0xFFC20101; + /** * Contracting characters are marked with a exception bit combination * in the collationElement table. @@ -968,9 +971,14 @@ final public class UCA { // in code order. // add bottom 5 bits to UNSUPPORTED, and push rest //return UNSUPPORTED + (bigChar & 0xFFFF0000); // top bits added + expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0)); // primary = bottom 15 bits plus turn bottom bit on. + // secondary and tertiary are both zero + return makeKey(UNSUPPORTED_P + (bigChar >> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); // top 34 values plus UNSUPPORTED + /* expandingStack.push(((bigChar & 0x7FFF) << 16) | 0x10000000); // primary = bottom 15 bits plus turn bottom bit on. // secondary and tertiary are both zero return UNSUPPORTED + ((bigChar << 1) & 0xFFFF0000); // top 34 values plus UNSUPPORTED + */ } if (ce == CONTRACTING) { // Contracting is probably the most interesting (read "tricky") part @@ -1127,11 +1135,11 @@ final public class UCA { return new Hashtable(multiTable); } - public CollationContents getCollationContents(byte ceLimit, Normalizer skipDecomps) { - return new CollationContents(ceLimit, skipDecomps); + public UCAContents getContents(byte ceLimit, Normalizer skipDecomps) { + return new UCAContents(ceLimit, skipDecomps); } - public class CollationContents { + public class UCAContents { int current = -1; Normalizer skipDecomps = new Normalizer(Normalizer.NFD); Iterator enum = null; @@ -1140,16 +1148,15 @@ final public class UCA { /** * use FIXED_CE as the limit */ - CollationContents(byte ceLimit, Normalizer skipDecomps) { + UCAContents(byte ceLimit, Normalizer skipDecomps) { this.ceLimit = ceLimit; this.skipDecomps = skipDecomps; } - + /** - * returns a string and its ces + * returns a string */ - public String next(int[] ces, int[] len) { - + public String next() { String result = null; // null if done // normal case @@ -1158,7 +1165,6 @@ final public class UCA { if (getCEType(ch) >= ceLimit) continue; if (skipDecomps != null && skipDecomps.hasDecomposition(ch)) continue; result = String.valueOf(ch); - len[0] = getCEs(result, true, ces); return result; } @@ -1166,11 +1172,36 @@ final public class UCA { if (enum == null) enum = multiTable.keySet().iterator(); if (enum.hasNext()) { result = (String)enum.next(); - len[0] = getCEs(result, true, ces); } return result; } + + + /** + * returns a string and its ces + */ + public String next(int[] ces, int[] len) { + + String result = next(); // null if done + if (result != null) { + len[0] = getCEs(result, true, ces); + } + return result; + } + + int[] lengthBuffer = new int[1]; + + /** + * returns a string and its ces + */ + public boolean next(Pair result) { + String s = next(ceListBuffer, lengthBuffer); + if (s == null) return false; + result.first = new CEList(ceListBuffer, 0, lengthBuffer[0]); + result.second = s; + return true; + } } /** diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java new file mode 100644 index 00000000000..f61ddac467a --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java @@ -0,0 +1,213 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $ +* $Date: 2001/09/19 23:31:50 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCA; + +import java.util.*; + +import java.io.*; +import com.ibm.text.UCD.*; +import com.ibm.text.utility.*; +import com.ibm.text.UTF16; + +public class WriteCharts implements UCD_Types { + + static UCD ucd; + + static public void test(UCA uca) throws IOException { + + uca.setAlternate(UCA.NON_IGNORABLE); + + ucd = UCD.make(); + Normalizer nfd = new Normalizer(Normalizer.NFD); + + UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps + + Set set = new TreeSet(); + + while (true) { + String x = cc.next(); + if (x == null) break; + set.add(new Pair(uca.getSortKey(x), x)); + } + + PrintWriter output = null; + + Iterator it = set.iterator(); + + int oldScript = -999; + + int[] scriptCount = new int[LIMIT_SCRIPT]; + + int counter = 0; + + int lastPrimary = -1; + + String lastSortKey = null; + + int high = uca.getSortKey("a").charAt(0); + int variable = UCA.getPrimary(uca.getVariableHigh()); + + int columnCount = 0; + + indexFile = Utility.openPrintWriter("CollationCharts\\index_list.html"); + + indexFile.println(""); + indexFile.println("UCA Default Collation Table"); + indexFile.println(""); + indexFile.println("

UCA Default Collation Table

"); + indexFile.println("

Help"); + + while (it.hasNext()) { + Utility.dot(counter); + + Pair p = (Pair) it.next(); + String sortKey = (String) p.first; + String s = (String) p.second; + + int cp = UTF16.charAt(s,0); + byte script = ucd.getScript(cp); + if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT; + else if (script == INHERITED_SCRIPT) script = COMMON_SCRIPT; + + // get first non-zero primary + int primary = sortKey.charAt(0); + if (sortKey.length() < 4) script = -3; + else if (primary == 0) script = -2; + else if (primary < variable) script = -1; + else if (primary < high) script = COMMON_SCRIPT; + + if (script != oldScript + && (oldScript < COMMON_SCRIPT || script != COMMON_SCRIPT && script != INHERITED_SCRIPT)) { + closeFile(output); + output = null; + } + if (output == null) { + ++scriptCount[script+3]; + if (scriptCount[script+3] > 1) { + System.out.println("\t\tFAIL: " + scriptCount[script+3] + ", " + + ucd.getScriptID_fromIndex(script) + ", " + ucd.getCodeAndName(s)); + } + output = openFile(scriptCount[script+3], script); + oldScript = script; + } + + int strength = 6; + if (lastSortKey != null && sortKey.charAt(0) == lastSortKey.charAt(0)) { + strength = uca.strengthDifference(sortKey, lastSortKey); + if (strength < 0) strength = -strength; + } + lastSortKey = sortKey; + String breaker = ""; + if (columnCount > 10 || strength > 5) { + if (strength <= 5) breaker = ""; + else breaker = ""; + columnCount = 0; + } + output.println(breaker + CLASSNAME[strength] + s + + "
" + Utility.hex(s) + //+ "
" + script + //+ "
" + UCA.toString(sortKey) + + "
"); + ++columnCount; + } + + closeFile(output); + indexFile.println(""); + indexFile.close(); + } + + static final String[] CLASSNAME = { + "", + "", + "", + "", + "", + "", + ""}; + + + static PrintWriter indexFile; + + static PrintWriter openFile(int count, byte script) throws IOException { + String scriptName = getChunkName(script); + scriptName = ucd.getCase(scriptName, FULL, TITLE); + + String fileName = "chart_" + scriptName + (count > 1 ? count + "" : "") + ".html"; + PrintWriter output = Utility.openPrintWriter("CollationCharts\\" + fileName); + Utility.fixDot(); + System.out.println("Writing: " + scriptName); + + indexFile.println(" | " + scriptName + ""); + String title = "UCA: " + scriptName; + output.println(""); + output.println("" + title + ""); + output.println(""); + output.println("

" + scriptName + "

"); + output.println(""); + return output; + } + + static String getChunkName(byte script) { + if (script == -3) return "NULL"; + else if (script == -2) return "IGNORABLE"; + else if (script == -1) return "VARIABLE"; + else if (script == HIRAGANA_SCRIPT) return "KATAKANA-HIRAGANA"; + else return ucd.getScriptID_fromIndex(script); + } + + static void closeFile(PrintWriter output) { + if (output == null) return; + output.println("
"); + output.close(); + } +} + + + + /* + static final IntStack p1 = new IntStack(30); + static final IntStack s1 = new IntStack(30); + static final IntStack t1 = new IntStack(30); + static final IntStack p2 = new IntStack(30); + static final IntStack s2 = new IntStack(30); + static final IntStack t2 = new IntStack(30); + + static int getStrengthDifference(CEList ceList, CEList lastCEList) { + extractNonzeros(ceList, p1, s1, t1); + extractNonzeros(lastCEList, p2, s2, t2); + int temp = p1.compareTo(p2); + if (temp != 0) return 3; + temp = s1.compareTo(s2); + if (temp != 0) return 2; + temp = t1.compareTo(t2); + if (temp != 0) return 1; + return 0; + } + + static void extractNonzeros(CEList ceList, IntStack primaries, IntStack secondaries, IntStack tertiaries) { + primaries.clear(); + secondaries.clear(); + tertiaries.clear(); + + for (int i = 0; i < ceList.length(); ++i) { + int ce = ceList.at(i); + int temp = UCA.getPrimary(ce); + if (temp != 0) primaries.push(temp); + temp = UCA.getSecondary(ce); + if (temp != 0) secondaries.push(temp); + temp = UCA.getTertiary(ce); + if (temp != 0) tertiaries.push(temp); + } + } + */ \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index 826bab5b94f..aa04e472ab6 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2001/09/06 01:30:30 $ -* $Revision: 1.3 $ +* $Date: 2001/09/19 23:32:21 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -34,7 +34,6 @@ public class WriteCollationData implements UCD_Types { static final boolean EXCLUDE_UNSUPPORTED = true; static final boolean GENERATED_NFC_MISMATCHES = true; static final boolean DO_CHARTS = true; - static final boolean WRITE_NAME_IN_CONFORMANCE = true; static UCA collator; @@ -58,12 +57,13 @@ public class WriteCollationData implements UCD_Types { ucd = UCD.make(""); if (args.length == 0) args = new String[] {"?"}; // force the help comment - boolean hex = false; + boolean shortPrint = false; for (int i = 0; i < args.length; ++i) { String arg = args[i]; if (arg.equalsIgnoreCase("WriteRulesWithNames")) writeRules(WITH_NAMES); else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(collator); + else if (arg.equalsIgnoreCase("WriteCharts")) WriteCharts.test(collator); else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(collator); else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(collator); @@ -72,15 +72,15 @@ public class WriteCollationData implements UCD_Types { else if (arg.equalsIgnoreCase("checkDisjointIgnorables")) checkDisjointIgnorables(); else if (arg.equalsIgnoreCase("writeContractions")) writeContractions(); else if (arg.equalsIgnoreCase("FractionalUCA")) writeFractionalUCA("FractionalUCA"); - else if (arg.equalsIgnoreCase("writeConformance")) writeConformance("CollationTest_NON_IGNORABLE.txt", UCA.NON_IGNORABLE, hex); - else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) writeConformance("CollationTest_SHIFTED.txt", UCA.SHIFTED, hex); + else if (arg.equalsIgnoreCase("writeConformance")) writeConformance("CollationTest_NON_IGNORABLE", UCA.NON_IGNORABLE, shortPrint); + else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) writeConformance("CollationTest_SHIFTED", UCA.SHIFTED, shortPrint); else if (arg.equalsIgnoreCase("testCompatibilityCharacters")) testCompatibilityCharacters(); else if (arg.equalsIgnoreCase("writeCollationValidityLog")) writeCollationValidityLog(); else if (arg.equalsIgnoreCase("writeCaseExceptions")) writeCaseExceptions(); else if (arg.equalsIgnoreCase("writeJavascriptInfo")) writeJavascriptInfo(); else if (arg.equalsIgnoreCase("writeCaseFolding")) writeCaseFolding(); else if (arg.equalsIgnoreCase("javatest")) javatest(); - else if (arg.equalsIgnoreCase("hex")) hex = true; + else if (arg.equalsIgnoreCase("short")) shortPrint = true; else { System.out.println(); System.out.println("UNKNOWN OPTION (" + arg + "): must be one of the following (case-insensitive)"); @@ -339,15 +339,17 @@ public class WriteCollationData implements UCD_Types { } - static void writeConformance(String filename, byte option, boolean hex) throws IOException { - UCD ucd30 = UCD.make("300"); + static void writeConformance(String filename, byte option, boolean shortPrint) throws IOException { + UCD ucd30 = UCD.make("3.0.0"); - PrintWriter log = Utility.openPrintWriter(filename); - if (!hex) log.write('\uFEFF'); + PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt"); + if (!shortPrint) log.write('\uFEFF'); System.out.println("Sorting"); + int counter = 0; for (int i = 0; i <= 0x10FFFF; ++i) { + Utility.dot(counter++); if (!ucd.isRepresented(i)) continue; addStringX(UTF32.valueOf32(i), option); } @@ -355,11 +357,14 @@ public class WriteCollationData implements UCD_Types { Hashtable multiTable = collator.getContracting(); Enumeration enum = multiTable.keys(); while (enum.hasMoreElements()) { + Utility.dot(counter++); addStringX((String)enum.nextElement(), option); } for (int i = 0; i < extraConformanceTests.length; ++i) { // put in sample non-characters + Utility.dot(counter++); String s = UTF32.valueOf32(extraConformanceTests[i]); + Utility.fixDot(); System.out.println("Adding: " + Utility.hex(s)); addStringX(s, option); } @@ -367,6 +372,7 @@ public class WriteCollationData implements UCD_Types { for (int i = 0; ; ++i) { // add first unallocated character if (!ucd.isAssigned(i)) { String s = UTF32.valueOf32(i); + Utility.fixDot(); System.out.println("Adding: " + Utility.hex(s)); addStringX(s, option); break; @@ -375,6 +381,7 @@ public class WriteCollationData implements UCD_Types { for (int i = 0; i < extraConformanceRanges.length; ++i) { + Utility.dot(counter++); int start = extraConformanceRanges[i][0]; int end = extraConformanceRanges[i][1]; int increment = ((end - start + 1) / 303) + 1; @@ -388,6 +395,7 @@ public class WriteCollationData implements UCD_Types { addStringX(end, option); } + Utility.fixDot(); System.out.println("Total: " + sortedD.size()); Iterator it; @@ -399,6 +407,7 @@ public class WriteCollationData implements UCD_Types { String lastKey = ""; while (it.hasNext()) { + Utility.dot(counter); String key = (String) it.next(); String source = (String) sortedD.get(key); int fluff = key.charAt(key.length() - 1); @@ -408,14 +417,12 @@ public class WriteCollationData implements UCD_Types { //log.println(source); String clipped = source.substring(0, source.length()-1); String stren = source.substring(source.length()-1); - if (hex) { + if (!shortPrint) { log.print(Utility.hex(source)); - } else { - log.print(source + "\t" + Utility.hex(clipped)); - } - if (WRITE_NAME_IN_CONFORMANCE) { log.print( ";\t#" + ucd.getName(clipped)+ "\t" + UCA.toString(key)); + } else { + log.print(source + "\t" + Utility.hex(clipped)); } log.println(); } @@ -754,7 +761,7 @@ public class WriteCollationData implements UCD_Types { int[] ces = new int[50]; - UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd); + UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd); int[] lenArray = new int[1]; diLog.println("# Contractions"); @@ -819,7 +826,7 @@ public class WriteCollationData implements UCD_Types { String s = String.valueOf(ch); int len = collator.getCEs(s, true, ces); */ - UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd); + UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd); int[] lenArray = new int[1]; Set sortedCodes = new TreeSet(); @@ -987,7 +994,7 @@ public class WriteCollationData implements UCD_Types { String s = String.valueOf(ch); int len = collator.getCEs(s, true, ces); */ - UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, nfd); + UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, nfd); int[] lenArray = new int[1]; Set sortedCodes = new TreeSet(); @@ -1179,7 +1186,7 @@ public class WriteCollationData implements UCD_Types { java.util.Comparator cm = new RuleComparator(); Map ordered = new TreeMap(cm); - UCA.CollationContents cc = collator.getCollationContents(UCA.FIXED_CE, + UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, SKIP_CANONICAL_DECOMPOSIBLES ? nfd : null); int[] lenArray = new int[1]; diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java index e4901d5ff36..15834c01ca1 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $ -* $Date: 2001/09/06 01:29:48 $ -* $Revision: 1.3 $ +* $Date: 2001/09/19 23:33:16 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -58,8 +58,10 @@ public class DerivedProperty implements UCD_Types { DefaultIgnorable = 26, GraphemeExtend = 27, GraphemeBase = 28, + + FC_NFC_Closure = 29, - LIMIT = 29; + LIMIT = 30; public DerivedProperty(UCD ucd) { @@ -156,8 +158,8 @@ public class DerivedProperty implements UCD_Types { compName = "NFD for the character"; } header = "# Derived Property: " + name - + "\r\n# Normalized form " + NAME[i-GenNFD] + ", where DIFFERENT from " + compName + "." - + "\r\n# HANGUL SYLLABLES are algorithmically decomposed, and not listed explicitly." + + "\r\n# Lists characters in normalized form " + NAME[i-GenNFD] + "." + + "\r\n# Only those characters whith normalized forms are DIFFERENT from " + compName + " are listed!" + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact." + "\r\n# It is NOT sufficient to replace characters one-by-one with these results!"; } @@ -422,6 +424,25 @@ of characters, the first of which has a non-zero combining class. boolean hasProperty(int cp) { return getProperty(cp).length() != 0; } }; + dprops[FC_NFC_Closure] = new DProp() { + { + name = "FC_NFC_Closure"; + header = "# Derived Property: " + name + + "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));" + + "\r\n# Then if (c != b) add the mapping from a to c to the set of" + + "\r\n# mappings that constitute the FC_NFC_Closure list"; + } + public boolean propertyVaries() {return true;} // default + public String getProperty(int cp) { + if (!ucdData.isRepresented(cp)) return ""; + String b = nfc.normalize(fold(cp)); + String c = nfc.normalize(fold(b)); + if (c.equals(b)) return ""; + return "FN; " + Utility.hex(c); + } // default + boolean hasProperty(int cp) { return getProperty(cp).length() != 0; } + }; + for (int i = QuickNFD; i <= QuickNFKC; ++i) { dprops[i] = new QuickDProp(i); } diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java index 190d1473bfd..6d6329c6e98 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $ -* $Date: 2001/09/06 01:29:48 $ -* $Revision: 1.4 $ +* $Date: 2001/09/19 23:33:16 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -27,7 +27,7 @@ final class DerivedPropertyLister extends PropertyLister { int width; boolean varies; - public DerivedPropertyLister(UCD ucd, int propMask, PrintStream output) { + public DerivedPropertyLister(UCD ucd, int propMask, PrintWriter output) { this.propMask = propMask; this.output = output; this.ucdData = ucd; @@ -87,7 +87,7 @@ final class DerivedPropertyLister extends PropertyLister { String last; public byte status(int cp) { - if (!ucdData.isAssigned(cp)) return EXCLUDE; + if (!ucdData.isAssigned(cp) && propMask != DerivedProperty.DefaultIgnorable) return EXCLUDE; if (!varies) { return dprop.hasProperty(cp, propMask) ? INCLUDE : EXCLUDE; } diff --git a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java index c58fdf345d6..6710c92effa 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $ -* $Date: 2001/08/31 00:30:17 $ -* $Revision: 1.2 $ +* $Date: 2001/09/19 23:33:16 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -17,14 +17,11 @@ import java.io.*; class DiffPropertyLister extends PropertyLister { private UCD oldUCD; - public DiffPropertyLister(String oldUCDName, String newUCDName, PrintStream output) { + public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output) { this.output = output; this.ucdData = UCD.make(newUCDName); if (oldUCDName != null) this.oldUCD = UCD.make(oldUCDName); - } - - public byte status (int cp) { - return INCLUDE; + breakByCategory = false; } public String propertyName(int cp) { @@ -42,14 +39,23 @@ class DiffPropertyLister extends PropertyLister { */ - public byte status(int lastCp, int cp) { + public byte status(int cp) { /*if (cp == 0xFFFF) { System.out.println("# " + Utility.hex(cp)); } */ return ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp)) ? INCLUDE : EXCLUDE; } - + + public String headerString() { + if (oldUCD != null) { + return "# Differences between " + ucdData.getVersion() + " and " + oldUCD.getVersion(); + } else { + return "# Allocated as of " + ucdData.getVersion(); + } + } + + /* public int print() { String status; if (oldUCD != null) { @@ -73,6 +79,7 @@ class DiffPropertyLister extends PropertyLister { output.println(); return count; } + */ } diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java index 6c2a5ad16e6..779a33a2ecd 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $ -* $Date: 2001/08/31 00:30:17 $ -* $Revision: 1.2 $ +* $Date: 2001/09/19 23:33:16 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -20,7 +20,7 @@ import com.ibm.text.utility.*; public class GenerateCaseFolding implements UCD_Types { public static boolean DEBUG = false; - public static UCD ucd = UCD.make("310"); + public static UCD ucd = UCD.make(""); public static void main(String[] args) throws java.io.IOException { makeCaseFold(); @@ -285,71 +285,4 @@ public class GenerateCaseFolding implements UCD_Types { } return result + "}"; } - - static final void getAge() throws IOException { - PrintStream log = new PrintStream( - new BufferedOutputStream ( - new FileOutputStream("UnicodeAge.txt"), - 4*1024)); - try { - log.println("# Derived file showing when various code points were allocated in Unicode"); - log.println("# author: M. Davis"); - log.println("# generated: " + new Date()); - log.println("# Notes:"); - log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 110 listing."); - log.println("# - The supplementary private use code points, although allocated earlier,"); - log.println("# were NOT specifically listed in the UCD until 3.0.1, and are not included until then."); - new DiffPropertyLister(null, "110", log).print(); - new DiffPropertyLister("110", "200", log).print(); - new DiffPropertyLister("200", "210", log).print(); - new DiffPropertyLister("210", "300", log).print(); - new DiffPropertyLister("300", "310", log).print(); - /* - printDiff("110", "200"); - UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false); - UnicodeSet u20 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.0.txt", false); - UnicodeSet u21 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-2.1.txt", false); - UnicodeSet u30 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.0.txt", false); - UnicodeSet u31 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-3.1.txt", false); - - log.println(); - log.println("# Code points assigned in Unicode 1.1 (minus Hangul Syllables): " - + n.format(u11.count())); - log.println(); - u11.print(log, false, false, "1.1"); - - UnicodeSet u20m = new UnicodeSet(u20).remove(u11); - log.println(); - log.println("# Code points assigned in Unicode 2.0 (minus Unicode 1.1): " - + n.format(u20m.count())); - log.println(); - u20m.print(log, false, false, "2.0"); - - UnicodeSet u21m = new UnicodeSet(u21).remove(u20); - log.println(); - log.println("# Code points assigned in Unicode 2.1 (minus Unicode 2.0): " - + n.format(u21m.count())); - log.println(); - u21m.print(log, false, false, "2.1"); - - UnicodeSet u30m = new UnicodeSet(u30).remove(u21); - log.println(); - log.println("# Code points assigned in Unicode 3.0 (minus Unicode 2.1): " - + n.format(u30m.count())); - log.println(); - u30m.print(log, false, false, "3.0"); - - UnicodeSet u31m = new UnicodeSet(u31).remove(u30); - log.println(); - log.println("# Code points assigned in Unicode 3.1 (minus Unicode 3.0): " - + n.format(u31m.count())); - log.println(); - u31m.print(log, false, false, "3.1"); - */ - } finally { - if (log != null) log.close(); - } - - } - } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index f1202517a67..878d6899abe 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2001/09/06 01:29:48 $ -* $Revision: 1.5 $ +* $Date: 2001/09/19 23:33:16 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -22,9 +22,9 @@ import com.ibm.text.utility.*; public class GenerateData implements UCD_Types { - public static void main (String[] args) throws IOException { + public static void main (String inVersion, String[] args) throws IOException { System.out.println("START"); - ucd = UCD.make(); + ucd = UCD.make(inVersion); System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate()))); String version = ucd.getVersion(); @@ -36,10 +36,7 @@ public class GenerateData implements UCD_Types { Utility.fixDot(); System.out.println("Argument: " + args[i]); - if (arg.equalsIgnoreCase("version")) { - version = args[++i]; - ucd = UCD.make(version); - } else if (arg.equalsIgnoreCase("partition")) { + if (arg.equalsIgnoreCase("partition")) { partitionProperties(); } else if (arg.equalsIgnoreCase("list")) { listProperties(); @@ -91,9 +88,12 @@ public class GenerateData implements UCD_Types { } else if (arg.equalsIgnoreCase("DerivedCoreProperties")) { mask = Utility.setBits(0, DerivedProperty.PropMath, DerivedProperty.Mod_ID_Continue_NO_Cf); - mask = Utility.setBits(mask, DerivedProperty.DefaultIgnorable, DerivedProperty.LIMIT-1); + mask = Utility.setBits(mask, DerivedProperty.DefaultIgnorable, DerivedProperty.FC_NFC_Closure-1); generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-" + version ); + } else if (arg.equalsIgnoreCase("DerivedAge")) { + generateAge("DerivedAge-" + version ); + } else if (arg.equalsIgnoreCase("DerivedLineBreak")) { generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, KEEP_SPECIAL, HEADER_DERIVED, "DerivedLineBreak-" + version ); @@ -181,7 +181,7 @@ public class GenerateData implements UCD_Types { static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2; - public static void doHeader(String fileName, PrintStream output, int headerChoice) { + public static void doHeader(String fileName, PrintWriter output, int headerChoice) { output.println("# " + fileName + ".txt"); output.println("#"); if (headerChoice == HEADER_SCRIPTS) { @@ -203,7 +203,7 @@ public class GenerateData implements UCD_Types { } public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException { - PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName + "dX.txt")); + PrintWriter output = Utility.openPrintWriter(fileName + "dX.txt"); doHeader(fileName, output, headerChoice); for (int i = 0; i < DerivedProperty.LIMIT; ++i) { if ((bitMask & (1< + * See UTR#15 for details.
+ * Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.
+ * The Unicode Consortium makes no expressed or implied warranty of any + * kind, and assumes no liability for errors or omissions. + * No liability is assumed for incidental and consequential damages + * in connection with or arising out of the use of the information here. + * @author Mark Davis + */ + +public class NormalizerSample implements UCD_Types { + static final String copyright = "Copyright (C) 2001, IBM Corp. and Unicode Inc. All Rights Reserved."; + + public static boolean SHOW_PROGRESS = false; + + /** + * Create a normalizer for a given form. + */ + public NormalizerSample(byte form, String unicodeVersion) { + this.composition = (form & COMPOSITION_MASK) != 0; + this.compatibility = (form & COMPATIBILITY_MASK) != 0; + this.data = getData(unicodeVersion); + } + + /** + * Create a normalizer for a given form. + */ + public NormalizerSample(byte form) { + this(form,""); + } + + /** + * Masks for the form selector + */ + public static final byte + COMPATIBILITY_MASK = 1, + COMPOSITION_MASK = 2; + + /** + * Normalization Form Selector + */ + public static final byte + NFD = 0 , + NFKD = COMPATIBILITY_MASK, + NFC = COMPOSITION_MASK, + NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK); + + /** + * Normalizes text according to the chosen form, + * replacing contents of the target buffer. + * @param source the original text, unnormalized + * @param target the resulting normalized text + */ + public StringBuffer normalize(String source, StringBuffer target) { + + // First decompose the source into target, + // then compose if the form requires. + + if (source.length() != 0) { + internalDecompose(source, target); + if (composition) { + internalCompose(target); + } + } + return target; + } + + /** + * Normalizes text according to the chosen form + * @param source the original text, unnormalized + * @return target the resulting normalized text + */ + public String normalize(String source) { + return normalize(source, new StringBuffer()).toString(); + } + + /** + * Normalizes text according to the chosen form + * @param source the original text, unnormalized + * @return target the resulting normalized text + */ + public String normalize(int cp) { + return normalize(UTF16.valueOf(cp)); + } + + /** + */ + private StringBuffer hasDecompositionBuffer = new StringBuffer(); + + public boolean hasDecomposition(int cp) { + hasDecompositionBuffer.setLength(0); + normalize(UTF16.valueOf(cp), hasDecompositionBuffer); + if (hasDecompositionBuffer.length() != 1) return true; + return cp != hasDecompositionBuffer.charAt(0); + } + + /** + * Utility: Checks whether there is a recursive decomposition of a character from the + * Unicode Character Database. It is compatibility or canonical according to the particular + * normalizer. + * @param ch the source character + */ + public boolean normalizationDiffers(int ch) { + return data.normalizationDiffers(ch, composition, compatibility); + } + + /** + * Utility: Gets recursive decomposition of a character from the + * Unicode Character Database. + * @param compatibility If false selects the recursive + * canonical decomposition, otherwise selects + * the recursive compatibility AND canonical decomposition. + * @param ch the source character + * @param buffer buffer to be filled with the decomposition + */ + public void getRecursiveDecomposition(char ch, StringBuffer buffer) { + data.getRecursiveDecomposition(ch, buffer, compatibility); + } + + + // ====================================== + // PRIVATES + // ====================================== + + /** + * The current form. + */ + private boolean composition; + private boolean compatibility; + + /** + * Decomposes text, either canonical or compatibility, + * replacing contents of the target buffer. + * @param form the normalization form. If COMPATIBILITY_MASK + * bit is on in this byte, then selects the recursive + * compatibility decomposition, otherwise selects + * the recursive canonical decomposition. + * @param source the original text, unnormalized + * @param target the resulting normalized text + */ + private void internalDecompose(String source, StringBuffer target) { + StringBuffer buffer = new StringBuffer(); + int ch32; + for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) { + buffer.setLength(0); + ch32 = UTF16.charAt(source, i); + data.getRecursiveDecomposition(ch32, buffer, compatibility); + + // add all of the characters in the decomposition. + // (may be just the original character, if there was + // no decomposition mapping) + + int ch; + for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) { + ch = UTF16.charAt(buffer, j); + int chClass = data.getCanonicalClass(ch); + int k = target.length(); // insertion point + if (chClass != 0) { + + // bubble-sort combining marks as necessary + + int ch2; + for (; k > 0; k -= UTF16.getCharCount(ch2)) { + ch2 = UTF16.charAt(target, k-1); + if (data.getCanonicalClass(ch2) <= chClass) break; + } + } + target.insert(k, UTF16.valueOf(ch)); + } + } + } + + /** + * Composes text in place. Target must already + * have been decomposed. + * Uses UTF16, which is a utility class for supplementary character support in Java. + * @param target input: decomposed text. + * output: the resulting normalized text. + */ + private void internalCompose(StringBuffer target) { + int starterPos = 0; + int starterCh = UTF16.charAt(target,0); + int compPos = UTF16.getCharCount(starterCh); // length of last composition + int lastClass = data.getCanonicalClass(starterCh); + if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark + int oldLen = target.length(); + + // Loop on the decomposed characters, combining where possible + + int ch; + for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) { + ch = UTF16.charAt(target, decompPos); + if (SHOW_PROGRESS) System.out.println(Utility.hex(target) + + ", decompPos: " + decompPos + + ", compPos: " + compPos + + ", ch: " + Utility.hex(ch) + ); + int chClass = data.getCanonicalClass(ch); + int composite = data.getPairwiseComposition(starterCh, ch); + if (composite != data.NOT_COMPOSITE + && (lastClass < chClass || lastClass == 0)) { + UTF16.setCharAt(target, starterPos, composite); + // we know that we will only be replacing non-supplementaries by non-supplementaries + // so we don't have to adjust the decompPos + starterCh = composite; + } else { + if (chClass == 0) { + starterPos = compPos; + starterCh = ch; + } + lastClass = chClass; + UTF16.setCharAt(target, compPos, ch); + if (target.length() != oldLen) { // MAY HAVE TO ADJUST! + System.out.println("ADJUSTING: " + Utility.hex(target)); + decompPos += target.length() - oldLen; + oldLen = target.length(); + } + compPos += UTF16.getCharCount(ch); + } + } + target.setLength(compPos); + } + + // The following class makes use of the UCD class, which accesses data in the Unicode Character Database + + static class Stub { + private UCD ucd; + private HashMap compTable = new HashMap(); + private BitSet isSecond = new BitSet(); + private BitSet canonicalRecompose = new BitSet(); + private BitSet compatibilityRecompose = new BitSet(); + static final int NOT_COMPOSITE = 0xFFFF; + + Stub(String version) { + ucd = UCD.make(version); + for (int i = 0; i < 0x10FFFF; ++i) { + if (!ucd.isAssigned(i)) continue; + if (ucd.isPUA(i)) continue; + if (ucd.isTrailingJamo(i)) isSecond.set(i); + byte dt = ucd.getDecompositionType(i); + if (dt != CANONICAL) continue; + if (!ucd.getBinaryProperty(i, CompositionExclusion)) { + try { + String s = ucd.getDecompositionMapping(i); + int len = UTF16.countCodePoint(s); + if (len != 2) { + if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i)); + continue; + } + int a = UTF16.charAt(s, 0); + if (ucd.getCombiningClass(a) != 0) continue; + + int b = UTF16.charAt(s, UTF16.getCharCount(a)); + isSecond.set(b); + + // have a recomposition, so set the bit + canonicalRecompose.set(i); + + // set the compatibility recomposition bit + // ONLY if the component characters + // don't compatibility decompose + if (ucd.getDecompositionType(a) <= CANONICAL + && ucd.getDecompositionType(b) <= CANONICAL) { + compatibilityRecompose.set(i); + } + + long key = (((long)a)<<32) | b; + + compTable.put(new Long(key), new Integer(i)); + } catch (Exception e) { + throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e); + } + } + } + } + + short getCanonicalClass(int cp) { + return ucd.getCombiningClass(cp); + } + + boolean isTrailing(int cp) { + return isSecond.get(cp); + } + + boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) { + byte dt = ucd.getDecompositionType(cp); + if (!composition) { + if (compatibility) return dt >= CANONICAL; + else return dt == CANONICAL; + } else { + // almost the same, except that we add back in the characters + // that RECOMPOSE + if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp); + else return dt == CANONICAL && !canonicalRecompose.get(cp); + } + } + + public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) { + byte dt = ucd.getDecompositionType(cp); + // we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE. + if (dt == CANONICAL || dt > CANONICAL && compatibility) { + String s = ucd.getDecompositionMapping(cp); + for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(s, i); + getRecursiveDecomposition(cp, buffer, compatibility); + } + } else { + UTF16.append(buffer, cp); + } + } + + int getPairwiseComposition(int starterCh, int ch) { + int hangulPoss = UCD.composeHangul(starterCh, ch); + if (hangulPoss != 0xFFFF) return hangulPoss; + Object obj = compTable.get(new Long((((long)starterCh)<<32) | ch)); + if (obj == null) return 0xFFFF; + return ((Integer)obj).intValue(); + } + + } + + /** + * Contains normalization data from the Unicode Character Database. + * use false for the minimal set, true for the real set. + */ + private Stub data; + + private static HashMap versionCache = new HashMap(); + + private static Stub getData (String version) { + if (version.length() == 0) version = UCD.latestVersion; + Stub result = (Stub)versionCache.get(version); + if (result == null) { + result = new Stub(version); + versionCache.put(version, result); + } + return result; + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java index aff3ad03a2d..5ef990e8811 100644 --- a/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $ -* $Date: 2001/08/31 00:30:17 $ -* $Revision: 1.2 $ +* $Date: 2001/09/19 23:33:16 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -15,6 +15,7 @@ package com.ibm.text.UCD; import java.io.*; import com.ibm.text.utility.*; +import java.text.NumberFormat; abstract public class PropertyLister implements UCD_Types { @@ -24,9 +25,10 @@ abstract public class PropertyLister implements UCD_Types { protected UCD ucdData; - protected PrintStream output; + protected PrintWriter output; protected boolean showOnConsole; protected boolean usePropertyComment = true; + protected boolean breakByCategory = true; protected int firstRealCp = -2; protected int lastRealCp = -2; protected boolean alwaysBreaks = false; // set to true if property only breaks @@ -51,7 +53,7 @@ abstract public class PropertyLister implements UCD_Types { } public String optionalComment(int cp) { - if (!usePropertyComment) return ""; + if (!usePropertyComment || !breakByCategory) return ""; int cat = ucdData.getCategory(cp); if (cat == Lt || cat == Ll || cat == Lu) return "L&"; return ucdData.getCategoryID(cp); @@ -167,7 +169,7 @@ abstract public class PropertyLister implements UCD_Types { if (s == INCLUDE && firstRealCp != -1) { byte cat = ucdData.getCategory(cp); if (cat == Lt || cat == Ll) cat = Lu; - if (cat != firstRealCpCat) s = BREAK; + if (breakByCategory && cat != firstRealCpCat) s = BREAK; } switch(s) { @@ -208,9 +210,12 @@ abstract public class PropertyLister implements UCD_Types { } if (count == 0) System.out.println("WARNING -- ZERO COUNT FOR " + header); + NumberFormat nf = NumberFormat.getInstance(); + nf.setMaximumFractionDigits(0); output.println(); - output.println("# Total code points: " + count); + output.println("# Total code points: " + nf.format(count)); output.println(); return count; } + } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java index 173f4d8fc4c..8608921214d 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestData.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $ -* $Date: 2001/09/01 00:06:15 $ -* $Revision: 1.3 $ +* $Date: 2001/09/19 23:33:16 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -145,7 +145,7 @@ public class TestData implements UCD_Types { static final int HEADER_EXTEND = 0, HEADER_DERIVED = 1, HEADER_SCRIPTS = 2; - public static void doHeader(String fileName, PrintStream output, int headerChoice) { + public static void doHeader(String fileName, PrintWriter output, int headerChoice) { output.println("# " + fixFile(fileName)); output.println("#"); if (headerChoice == HEADER_SCRIPTS) { @@ -167,8 +167,8 @@ public class TestData implements UCD_Types { } public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException { - ucd = UCD.make("310"); - PrintStream output = new PrintStream(new FileOutputStream(GEN_DIR + fileName)); + ucd = UCD.make("3.1.0"); + PrintWriter output = Utility.openPrintWriter(fileName); doHeader(fileName, output, headerChoice); for (int i = 0; i < 32; ++i) { if ((bitMask & (1< 0xFFFF) return false; return true; // Noncharacter } + if (major >= 2 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true; if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && major >= 3 && minor >= 1) return true; return false; } @@ -438,6 +440,21 @@ public final class UCD implements UCD_Types { public byte getScript(int codePoint) { return get(codePoint, false).script; } + + + public byte getScript(String s) { + byte result = COMMON_SCRIPT; + if (s == null || s.length() == 0) return result; + int cp; + for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { + cp = UTF32.char32At(s, i); + byte script = getScript(cp); + if (script == INHERITED_SCRIPT) continue; + result = script; + } + return result; + } + public byte getAge(int codePoint) { return get(codePoint, false).age; diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index d09b0a59dc0..53175278cea 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2001/08/31 00:29:50 $ -* $Revision: 1.2 $ +* $Date: 2001/09/19 23:33:16 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -15,8 +15,8 @@ package com.ibm.text.UCD; public interface UCD_Types { public static final String DATA_DIR = "C:\\DATA\\"; - public static final String BIN_DIR = DATA_DIR + "\\BIN\\"; - public static final String GEN_DIR = DATA_DIR + "\\GEN\\"; + public static final String BIN_DIR = DATA_DIR + "BIN\\"; + public static final String GEN_DIR = DATA_DIR + "GEN\\"; static final byte BINARY_FORMAT = 5; // bumped if binary format of UCD changes diff --git a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java index 663b81bf99a..c7da3dd5fdd 100644 --- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $ -* $Date: 2001/09/06 01:29:48 $ -* $Revision: 1.4 $ +* $Date: 2001/09/19 23:33:15 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -20,6 +20,7 @@ import java.math.BigDecimal; import java.util.*; import java.io.*; //import java.text.*; +import com.ibm.text.*; import com.ibm.text.utility.*; @@ -331,6 +332,7 @@ public class VerifyUCD implements UCD_Types { System.out.println("Checking Prohibited and Unassigned"); System.out.println(); for (int cp = 0; cp <= 0x10FFFF; ++cp) { + Utility.dot(cp); if (mappedOut.get(cp)) continue; boolean ucdUnassigned = !ucd.isAllocated(cp); @@ -339,33 +341,89 @@ public class VerifyUCD implements UCD_Types { boolean idnProhibited = prohibited.get(cp); if (ucdUnassigned && !idnUnassigned) { - showError("UCD Unassigned but not IDN Unassigned: ", cp); + showError("?UCD Unassigned but not IDN Unassigned", cp, ""); ++errorCount; } else if (!ucdUnassigned && idnUnassigned) { - showError("Not UCD Unassigned but IDN Unassigned: ", cp); + showError("?Not UCD Unassigned but IDN Unassigned", cp, ""); ++errorCount; } if (idnProhibited && unassigned.get(cp)) { - showError("Both IDN Unassigned AND IDN Prohibited: ", cp); + showError("?Both IDN Unassigned AND IDN Prohibited", cp, ""); ++errorCount; } if (guess && !idnProhibited) { - showError("UCD ?prohibited? but not IDN Prohibited: ", cp); + showError("?UCD ?prohibited? but not IDN Prohibited ", cp, ""); ++errorCount; } else if (!guess && idnProhibited) { - showError("Not UCD ?prohibited? but IDN Prohibited: ", cp); + showError("?Not UCD ?prohibited? but IDN Prohibited ", cp, ""); ++errorCount; } + + if (cp == 0x3131) { + System.out.println("Debug: " + idnProhibited + + ", " + idnUnassigned + + ", " + nfkc.hasDecomposition(cp) + + ", " + ucd.getCodeAndName(nfkc.normalize(cp)) + + ", " + ucd.getCodeAndName(nfc.normalize(cp))); + } + + if (!idnProhibited && ! idnUnassigned && nfkc.hasDecomposition(cp)) { + String kc = nfkc.normalize(cp); + String c = nfc.normalize(cp); + if (kc.equals(c)) continue; + int cp2; + boolean excluded = false; + for (int j = 0; j < kc.length(); j += UTF16.getCharCount(cp2)) { + cp2 = UTF16.charAt(kc, j); + if (prohibited.get(cp2)) { + showError("Prohibited with NFKC, but output with NFC", cp, ""); + excluded = true; + break; + } + } + if (!excluded) { + showError("Remapped to core abstract character with NFKC (but not NFC)", cp, ""); // , "\t=> " + ucd.getCodeAndName(kc)); + } + } } - System.out.println(); - System.out.println("Total Errors: " + errorCount); + System.out.println("Writing IDNCheck.txt"); + + + PrintWriter log = Utility.openPrintWriter("IDNCheck.txt"); + log.println("IDN Check"); + log.println("Total Errors: " + errorCount); + + Iterator it = idnMap.keySet().iterator(); + while (it.hasNext()) { + String description = (String) it.next(); + Map map = (Map) idnMap.get(description); + log.println(); + log.println(description); + log.println("Total: " + map.size()); + log.println(); + + Iterator it2 = map.keySet().iterator(); + while (it2.hasNext()) { + Object key = it2.next(); + String line = (String) map.get(key); + log.println(" " + line); + } + } + log.close(); } + + static Map idnMap = new HashMap(); - static void showError(String description, int cp) { - System.out.println(description + ucd.getCodeAndName(cp) + " (" + ucd.getCategoryID(cp) + ")"); + static void showError(String description, int cp, String option) { + Map probe = (Map) idnMap.get(description); + if (probe == null) { + probe = new TreeMap(); + idnMap.put(description, probe); + } + probe.put(new Integer(cp), ucd.getCodeAndName(cp) + " (" + ucd.getCategoryID(cp) + ")" + option); } @@ -611,8 +669,7 @@ E0020-E007F; [TAGGING CHARACTERS] if (reason.equals("Map out")) { value = Utility.fromHex(parts[1]); Utility.fixDot(); - System.out.println("Note, Mapping Out: " + ucd.getCodeAndName(cp) - + ", " + ucd.getCodeAndName(value) + ", " + ucd.getCategoryID(cp)); + showError("Mapping Out: ", cp, ""); mappedOut.set(cp); } idnFold.put(key, value); @@ -1033,26 +1090,37 @@ E0020-E007F; [TAGGING CHARACTERS] int sum = 0; long start, end; + java.text.NumberFormat nf = java.text.NumberFormat.getPercentInstance(); + + start = System.currentTimeMillis(); + for (int i = count; i >= 0; --i) { + sum += dummy0(i).length(); + } + end = System.currentTimeMillis(); + double base = end - start; + + System.out.println("unsynchronized static char[]: " + nf.format((end - start)/base)); + start = System.currentTimeMillis(); for (int i = count; i >= 0; --i) { sum += dummy2(i).length(); } end = System.currentTimeMillis(); - System.out.println("synchronized: " + (end - start)); + System.out.println("synchronized static char[]: " + nf.format((end - start)/base)); start = System.currentTimeMillis(); for (int i = count; i >= 0; --i) { sum += dummy1(i).length(); } end = System.currentTimeMillis(); - System.out.println("char[] each time: " + (end - start)); + System.out.println("char[] each time: " + nf.format((end - start)/base)); start = System.currentTimeMillis(); for (int i = count; i >= 0; --i) { sum += dummy3(i).length(); } end = System.currentTimeMillis(); - System.out.println("String +: " + (end - start)); + System.out.println("two valueofs: " + nf.format((end - start)/base)); System.out.println(sum); } @@ -1074,6 +1142,12 @@ E0020-E007F; [TAGGING CHARACTERS] } } + static String dummy0(int a) { + temp2[0] = (char)(a >>> 16); + temp2[1] = (char)a; + return new String(temp2); + } + static String dummy3(int a) { return String.valueOf((char)(a >>> 16)) + (char)a; } diff --git a/tools/unicodetools/com/ibm/text/utility/IntStack.java b/tools/unicodetools/com/ibm/text/utility/IntStack.java index 94d14016971..5fdca1f1f9f 100644 --- a/tools/unicodetools/com/ibm/text/utility/IntStack.java +++ b/tools/unicodetools/com/ibm/text/utility/IntStack.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/IntStack.java,v $ -* $Date: 2001/08/31 00:19:16 $ -* $Revision: 1.2 $ +* $Date: 2001/09/19 23:33:52 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -17,7 +17,7 @@ package com.ibm.text.utility; // Simple stack mechanism, with push, pop and access // ============================================================= -public final class IntStack { +public final class IntStack implements Comparable { private int[] values; private int top = 0; @@ -51,4 +51,31 @@ public final class IntStack { public boolean isEmpty() { return top == 0; } + + public void clear() { + top = 0; + } + + public int compareTo(Object other) { + IntStack that = (IntStack) other; + int min = top; + if (min < that.top) min = that.top; + for (int i = 0; i < min; ++i) { + int result = values[i] - that.values[i]; + if (result != 0) return result; + } + return top - that.top; + } + + public boolean equals(Object other) { + return compareTo(other) == 0; + } + + public int hashCode() { + int result = top; + for (int i = 0; i < top; ++i) { + result = result * 37 + values[i]; + } + return result; + } } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/utility/Pair.java b/tools/unicodetools/com/ibm/text/utility/Pair.java index 55fdf15ade3..1941aa102e3 100644 --- a/tools/unicodetools/com/ibm/text/utility/Pair.java +++ b/tools/unicodetools/com/ibm/text/utility/Pair.java @@ -5,15 +5,15 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Pair.java,v $ -* $Date: 2001/08/31 00:19:16 $ -* $Revision: 1.2 $ +* $Date: 2001/09/19 23:33:52 $ +* $Revision: 1.3 $ * ******************************************************************************* */ package com.ibm.text.utility; -public final class Pair implements java.lang.Comparable { +public final class Pair implements java.lang.Comparable, Cloneable { public Comparable first, second; @@ -41,4 +41,12 @@ public final class Pair implements java.lang.Comparable { if (trial != 0) return trial; return second.compareTo(that.second); } + + public Object clone() { + try { + return super.clone(); + } catch (CloneNotSupportedException e) { + return null; + } + } } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java b/tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java index 41e7687adb4..1c8a03170bd 100644 --- a/tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java +++ b/tools/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF8StreamWriter.java,v $ -* $Date: 2001/08/31 00:19:16 $ -* $Revision: 1.2 $ +* $Date: 2001/09/19 23:33:52 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -56,7 +56,7 @@ public final class UTF8StreamWriter extends Writer { TRAILING_TOP = 0x80; private static final int MAGIC = 0x10000 + ((0 - 0xD800) << 10) + (0 - 0xDC00); - + public final void write(char[] buffer, int cStart, int cLength) throws IOException { int cEnd = cStart + cLength; while (cStart < cEnd) { @@ -71,6 +71,8 @@ public final class UTF8StreamWriter extends Writer { // get code point int utf32 = buffer[cStart++]; + + if (utf32 == 0x0D) continue; // skip write // special check for surrogates diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index 50b07793a6c..e05b43a65de 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2001/09/06 01:29:03 $ -* $Revision: 1.3 $ +* $Date: 2001/09/19 23:33:52 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -408,12 +408,15 @@ public final class Utility { // COMMON UTILITIES private static final String[] searchPath = { "EXTRAS", - "3.1.2", + "3.2.0", "3.1.1", "3.1.0", "3.0.1", "3.0.0", "2.1.9", + "2.1.8", + "2.1.5", + "2.1.2", "2.0.0", "1.1.0", };