diff --git a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java index 734b39f2703..5ce495c30aa 100644 --- a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java +++ b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $ -* $Date: 2002/05/31 01:41:03 $ -* $Revision: 1.9 $ +* $Date: 2002/09/25 06:40:13 $ +* $Revision: 1.10 $ * ******************************************************************************* */ @@ -164,8 +164,8 @@ public class GenOverlap implements UCD_Types { static boolean PROGRESS = false; static void fullCheck() throws IOException { - PrintWriter log = Utility.openPrintWriter("Overlap.html"); - PrintWriter simpleList = Utility.openPrintWriter("Overlap.txt"); + PrintWriter log = Utility.openPrintWriter("Overlap.html", Utility.UTF8_WINDOWS); + PrintWriter simpleList = Utility.openPrintWriter("Overlap.txt", Utility.UTF8_WINDOWS); Iterator it = completes.keySet().iterator(); int counter = 0; @@ -448,7 +448,7 @@ public class GenOverlap implements UCD_Types { newKeys.removeAll(joint); oldKeys.removeAll(joint); - PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), false, false); + PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS); Iterator it = list.iterator(); int last = -1; while (it.hasNext()) { @@ -631,7 +631,7 @@ public class GenOverlap implements UCD_Types { System.out.println("Data Gathered"); - PrintWriter log = Utility.openPrintWriter("checkstringsearchhash.html"); + PrintWriter log = Utility.openPrintWriter("checkstringsearchhash.html", Utility.UTF8_WINDOWS); Utility.writeHtmlHeader(log, "Check Hash"); log.println("

Collisions

"); log.println("

Shows collisions among primary values when hashed to table size = " + tableLength + "."); @@ -694,7 +694,7 @@ public class GenOverlap implements UCD_Types { } public static void listCyrillic(UCA collatorIn) throws IOException { - PrintWriter log = Utility.openPrintWriter("ListCyrillic.txt", false, false); + PrintWriter log = Utility.openPrintWriter("ListCyrillic.txt", Utility.UTF8_WINDOWS); Set set = new TreeSet(collatorIn); Set set2 = new TreeSet(collatorIn); ucd = UCD.make(); diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java index 6363604e396..4f789cfa6be 100644 --- a/tools/unicodetools/com/ibm/text/UCA/Main.java +++ b/tools/unicodetools/com/ibm/text/UCA/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ -* $Date: 2002/07/03 02:15:47 $ -* $Revision: 1.9 $ +* $Date: 2002/09/25 06:40:13 $ +* $Revision: 1.10 $ * ******************************************************************************* */ @@ -55,6 +55,7 @@ public class Main { else if (arg.equalsIgnoreCase("indexChart")) WriteCharts.indexChart(); else if (arg.equalsIgnoreCase("special")) WriteCharts.special(); + else if (arg.equalsIgnoreCase("writeCompositionChart")) WriteCharts.writeCompositionChart(); else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(WriteCollationData.collator); else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(WriteCollationData.collator); diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java index 376c7628a05..9c8ad32a214 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java @@ -4,9 +4,9 @@ * others. All Rights Reserved. * ******************************************************************************* * -* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $ -* $Date: 2002/07/03 02:15:47 $ -* $Revision: 1.11 $ +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $ +* $Date: 2002/09/25 06:40:13 $ +* $Revision: 1.12 $ * ******************************************************************************* */ @@ -19,12 +19,17 @@ import java.io.*; import com.ibm.text.UCD.*; import com.ibm.text.utility.*; import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UnicodeSet; + + import java.text.SimpleDateFormat; public class WriteCharts implements UCD_Types { - + static boolean HACK_KANA = false; - + static public void special() { Default.setUCD(); for (int i = 0xE000; i < 0x10000; ++i) { @@ -33,58 +38,58 @@ public class WriteCharts implements UCD_Types { System.out.println(Default.ucd.getCodeAndName(i)); } } - + static public void collationChart(UCA uca) throws IOException { Default.setUCD(uca.getUCDVersion()); HACK_KANA = true; - + uca.setAlternate(UCA.NON_IGNORABLE); - + //Normalizer nfd = new Normalizer(Normalizer.NFD); //Normalizer nfc = new Normalizer(Normalizer.NFC); - + UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps cc.enableSamples(); - + Set set = new TreeSet(); - + while (true) { String x = cc.next(); if (x == null) break; if (x.equals("\u2F00")) { System.out.println("debug"); } - + set.add(new Pair(uca.getSortKey(x), x)); } - + PrintWriter output = null; - + Iterator it = set.iterator(); - + byte oldScript = -127; - + int[] scriptCount = new int[128]; - + int counter = 0; - + String lastSortKey = "\u0000"; - + int high = uca.getSortKey("a").charAt(0); int variable = UCA.getPrimary(uca.getVariableHigh()); - + int columnCount = 0; - + String[] replacement = new String[] {"%%%", "Collation Charts"}; String folder = "charts\\uca\\"; - + Utility.copyTextFile("index.html", true, folder + "index.html", replacement); Utility.copyTextFile("charts.css", false, folder + "charts.css"); Utility.copyTextFile("help.html", true, folder + "help.html"); - - indexFile = Utility.openPrintWriter(folder + "index_list.html", false, false); + + indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS); Utility.appendFile("index_header.html", true, indexFile, replacement); - + /* indexFile.println(""); indexFile.println("UCA Default Collation Table"); @@ -93,22 +98,22 @@ public class WriteCharts implements UCD_Types { indexFile.println("

UCA Default Collation Table

"); indexFile.println("

Help"); */ - + while (it.hasNext()) { Utility.dot(counter); - + Pair p = (Pair) it.next(); String sortKey = (String) p.first; String s = (String) p.second; - + int cp = UTF16.charAt(s,0); - + byte script = Default.ucd.getScript(cp); - + // get first non-zero primary int currentPrimary = getFirstPrimary(sortKey); int primary = currentPrimary >>> 16; - + if (sortKey.length() < 4) script = NULL_ORDER; else if (primary == 0) script = IGNORABLE_ORDER; else if (primary < variable) script = VARIABLE_ORDER; @@ -118,35 +123,35 @@ public class WriteCharts implements UCD_Types { else if (primary < UCA_Types.UNSUPPORTED_OTHER_BASE) script = CJK_AB; else script = UNSUPPORTED; } - + if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT; else if ((script == INHERITED_SCRIPT || script == COMMON_SCRIPT) && oldScript >= 0) script = oldScript; - if (script != oldScript + if (script != oldScript // && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT) ) { closeFile(output); output = null; oldScript = script; } - + if (output == null) { ++scriptCount[script+3]; if (scriptCount[script+3] > 1) { - System.out.println("\t\tFAIL: " + scriptCount[script+3] + ", " + + System.out.println("\t\tFAIL: " + scriptCount[script+3] + ", " + getChunkName(script, LONG) + ", " + Default.ucd.getCodeAndName(s)); } output = openFile(scriptCount[script+3], folder, script); } - + boolean firstPrimaryEquals = currentPrimary == getFirstPrimary(lastSortKey); - + int strength = uca.strengthDifference(sortKey, lastSortKey); if (strength < 0) strength = -strength; lastSortKey = sortKey; - + // find out if this is an expansion: more than one primary weight - + int primaryCount = 0; for (int i = 0; i < sortKey.length(); ++i) { char w = sortKey.charAt(i); @@ -156,7 +161,7 @@ public class WriteCharts implements UCD_Types { } ++ primaryCount; } - + String breaker = ""; if (columnCount > 10 || !firstPrimaryEquals) { columnCount = 0; @@ -166,20 +171,20 @@ public class WriteCharts implements UCD_Types { ++columnCount; } } - + String classname = primaryCount > 1 ? XCLASSNAME[strength] : CLASSNAME[strength]; - + String name = Default.ucd.getName(s); - - + + if (s.equals("\u1eaf")) { System.out.println("debug"); } - + String comp = Default.nfc.normalize(s); - + String outline = breaker + classname - + " title='" + + " title='" + (script != UNSUPPORTED ? Utility.quoteXML(name, true) + ": " : "") @@ -193,21 +198,21 @@ public class WriteCharts implements UCD_Types { ? "" + Utility.quoteXML(name, true) + "" : "") ; - + output.println(outline); ++columnCount; } - + closeFile(output); closeIndexFile(indexFile, "
UCA: " + uca.getDataVersion(), COLLATION); } - + static public void normalizationChart() throws IOException { Default.setUCD(); HACK_KANA = false; - + Set set = new TreeSet(); - + for (int i = 0; i <= 0x10FFFF; ++i) { if (!Default.ucd.isRepresented(i)) { if (i < 0xAC00) continue; @@ -216,35 +221,35 @@ public class WriteCharts implements UCD_Types { } byte cat = Default.ucd.getCategory(i); if (cat == Cs || cat == Co) continue; - + if (Default.nfkd.isNormalized(i)) continue; String decomp = Default.nfkd.normalize(i); - + byte script = getBestScript(decomp); - + set.add(new Pair(new Integer(script == COMMON_SCRIPT ? cat + CAT_OFFSET : script), new Pair(Default.ucd.getCase(decomp, FULL, FOLD), new Integer(i)))); } - + PrintWriter output = null; - + Iterator it = set.iterator(); - + int oldScript = -127; - + int counter = 0; - + String[] replacement = new String[] {"%%%", "Normalization Charts"}; String folder = "charts\\normalization\\"; Utility.copyTextFile("index.html", true, folder + "index.html", replacement); Utility.copyTextFile("charts.css", false, folder + "charts.css"); Utility.copyTextFile("norm_help.html", true, folder + "help.html"); - - indexFile = Utility.openPrintWriter(folder + "index_list.html", false, false); + + indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS); Utility.appendFile("index_header.html", true, indexFile, replacement); - + /* indexFile.println(""); indexFile.println("UCA Default Collation Table"); @@ -253,107 +258,107 @@ public class WriteCharts implements UCD_Types { indexFile.println("

UCA Default Collation Table

"); indexFile.println("

Help"); */ - + while (it.hasNext()) { Utility.dot(counter); - + Pair p = (Pair) it.next(); int script = ((Integer) p.first).intValue(); int cp = ((Integer)((Pair) p.second).second).intValue(); - - if (script != oldScript + + if (script != oldScript // && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT) ) { closeFile(output); output = null; oldScript = script; } - + if (output == null) { output = openFile(0, folder, script); output.println("CodeCDKCKD"); } - + output.println(""); - + String prefix; String code = UTF16.valueOf(cp); String c = Default.nfc.normalize(cp); String d = Default.nfd.normalize(cp); String kc = Default.nfkc.normalize(cp); String kd = Default.nfkd.normalize(cp); - + showCell(output, code, ""); - + } - + closeFile(output); closeIndexFile(indexFile, "", NORMALIZATION); } - + static public void caseChart() throws IOException { Default.setUCD(); HACK_KANA = false; - + Set set = new TreeSet(); - + for (int i = 0; i <= 0x10FFFF; ++i) { if (!Default.ucd.isRepresented(i)) continue; byte cat = Default.ucd.getCategory(i); if (cat == Cs || cat == Co) continue; - + String code = UTF16.valueOf(i); String lower = Default.ucd.getCase(i, FULL, LOWER); String title = Default.ucd.getCase(i, FULL, TITLE); String upper = Default.ucd.getCase(i, FULL, UPPER); String fold = Default.ucd.getCase(i, FULL, FOLD); - + String decomp = Default.nfkd.normalize(i); int script = 0; if (lower.equals(code) && upper.equals(code) && fold.equals(code) && title.equals(code)) { if (!containsCase(decomp)) continue; script = NO_CASE_MAPPING; } - + if (script == 0) script = getBestScript(decomp); - + set.add(new Pair(new Integer(script == COMMON_SCRIPT ? cat + CAT_OFFSET : script), new Pair(Default.ucd.getCase(decomp, FULL, FOLD), new Integer(i)))); } - + PrintWriter output = null; - + Iterator it = set.iterator(); - + int oldScript = -127; - + int counter = 0; String[] replacement = new String[] {"%%%", "Case Charts"}; String folder = "charts\\case\\"; - + Utility.copyTextFile("index.html", true, folder + "index.html", replacement); Utility.copyTextFile("charts.css", false, folder + "charts.css"); Utility.copyTextFile("case_help.html", true, folder + "help.html"); - - indexFile = Utility.openPrintWriter(folder + "index_list.html", false, false); + + indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS); Utility.appendFile("index_header.html", true, indexFile, replacement); - + /* indexFile.println(""); indexFile.println("UCA Default Collation Table"); @@ -362,24 +367,24 @@ public class WriteCharts implements UCD_Types { indexFile.println("

UCA Default Collation Table

"); indexFile.println("

Help"); */ - + int columnCount = 0; - + while (it.hasNext()) { Utility.dot(counter); - + Pair p = (Pair) it.next(); int script = ((Integer) p.first).intValue(); int cp = ((Integer)((Pair) p.second).second).intValue(); - - if (script != oldScript + + if (script != oldScript // && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT) ) { closeFile(output); output = null; oldScript = script; } - + if (output == null) { output = openFile(0, folder, script); if (script == NO_CASE_MAPPING) output.println(""); @@ -387,7 +392,7 @@ public class WriteCharts implements UCD_Types { +"UpperFold"); } - + if (script == NO_CASE_MAPPING) { if (columnCount > 10) { output.println(""); @@ -397,38 +402,38 @@ public class WriteCharts implements UCD_Types { ++columnCount; continue; } - + output.println(""); - + String prefix; String code = UTF16.valueOf(cp); String lower = Default.ucd.getCase(cp, FULL, LOWER); String title = Default.ucd.getCase(cp, FULL, TITLE); String upper = Default.ucd.getCase(cp, FULL, UPPER); String fold = Default.ucd.getCase(cp, FULL, FOLD); - + showCell(output, code, ""); - + } - + closeFile(output); closeIndexFile(indexFile, "", CASE); } - + static public void addMapChar(Map m, Set stoplist, String key, String ch) { if (stoplist.contains(key)) return; for (int i = 0; i < key.length(); ++i) { @@ -442,23 +447,23 @@ public class WriteCharts implements UCD_Types { } result.add(ch); } - + static public void indexChart() throws IOException { Default.setUCD(); HACK_KANA = false; - + Map map = new TreeMap(); Set stoplist = new TreeSet(); - + String[] stops = {"LETTER", "CHARACTER", "AND", "CAPITAL", "SMALL", "COMPATIBILITY", "WITH"}; stoplist.addAll(Arrays.asList(stops)); System.out.println("Stop-list: " + stoplist); - + for (int i = 0; i < LIMIT_SCRIPT; ++i) { stoplist.add(Default.ucd.getScriptID_fromIndex((byte)i)); } System.out.println("Stop-list: " + stoplist); - + for (int i = 0; i <= 0x10FFFF; ++i) { if (!Default.ucd.isRepresented(i)) continue; if (0xAC00 <= i && i <= 0xD7A3) continue; @@ -466,7 +471,7 @@ public class WriteCharts implements UCD_Types { String s = Default.ucd.getName(i); if (s == null) continue; - + if (s.startsWith("<")) { System.out.println("Wierd character at " + Default.ucd.getCodeAndName(i)); } @@ -490,52 +495,52 @@ public class WriteCharts implements UCD_Types { addMapChar(map, stoplist, word, ch); } } - + PrintWriter output = null; - + Iterator it = map.keySet().iterator(); - + int oldScript = -127; - + int counter = 0; String[] replacement = new String[] {"%%%", "Name Charts"}; String folder = "charts\\name\\"; - + Utility.copyTextFile("index.html", true, folder + "index.html", replacement); Utility.copyTextFile("charts.css", false, folder + "charts.css"); Utility.copyTextFile("name_help.html", true, folder + "help.html"); - - indexFile = Utility.openPrintWriter(folder + "index_list.html", false, false); + + indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS); Utility.appendFile("index_header.html", true, indexFile, replacement); - + int columnCount = 0; char lastInitial = 0; - + while (it.hasNext()) { Utility.dot(counter); - + String key = (String) it.next(); - + Set chars = (Set) map.get(key); - + char initial = key.charAt(0); - + if (initial != lastInitial) { closeFile(output); output = null; lastInitial = initial; } - + if (output == null) { output = openFile2(0, folder, String.valueOf(initial)); } - + output.println("" + key + ""); columnCount = 1; - + Iterator sublist = chars.iterator(); while (sublist.hasNext()) { - + String ch = (String) sublist.next(); if (columnCount > 10) { output.println(""); @@ -545,20 +550,20 @@ public class WriteCharts implements UCD_Types { ++columnCount; continue; } - + output.println(""); - + } - + closeFile(output); closeIndexFile(indexFile, "", CASE); } - + static void showCell(PrintWriter output, String s, String prefix, String extra, boolean skipName) { String name = Default.ucd.getName(s); String comp = Default.nfc.normalize(s); - - String outline = prefix + + String outline = prefix + (skipName ? "" : " title='" + Utility.quoteXML(name, true) + "'") + extra + ">" + Utility.quoteXML(comp, true) @@ -566,10 +571,10 @@ public class WriteCharts implements UCD_Types { + Utility.hex(s) //+ "
" + script + "
"; - + output.println(outline); } - + static byte getBestScript(String s) { int cp; byte result = COMMON_SCRIPT; @@ -588,33 +593,33 @@ public class WriteCharts implements UCD_Types { } return (result << 16); } - + static final String[] CLASSNAME = { - "" + scriptName + ""); @@ -626,10 +631,10 @@ public class WriteCharts implements UCD_Types { output.println(""); return output; } - + static PrintWriter openFile2(int count, String directory, String name) throws IOException { String fileName = "chart_" + name + (count > 1 ? count + "" : "") + ".html"; - PrintWriter output = Utility.openPrintWriter(directory + fileName, false, false); + PrintWriter output = Utility.openPrintWriter(directory + fileName, Utility.UTF8_WINDOWS); Utility.fixDot(); System.out.println("Writing: " + name); indexFile.println(" " + name + ""); @@ -641,8 +646,8 @@ public class WriteCharts implements UCD_Types { output.println("
"); return output; } - - static final int + + static final int NULL_ORDER = -3, IGNORABLE_ORDER = -2, VARIABLE_ORDER = -1, @@ -653,7 +658,7 @@ public class WriteCharts implements UCD_Types { CAT_OFFSET = 128, // categories in here NO_CASE_MAPPING = 200; - + static String getChunkName(int script, byte length) { switch(script) { case NO_CASE_MAPPING: return "NoCaseMapping"; @@ -663,7 +668,7 @@ public class WriteCharts implements UCD_Types { case CJK: return "CJK"; case CJK_AB: return "CJK-Extensions"; case UNSUPPORTED: return "Unsupported"; - default: + default: if (script >= CAT_OFFSET) return Default.ucd.getCategoryID_fromIndex((byte)(script - CAT_OFFSET), length); else if (script == HIRAGANA_SCRIPT && HACK_KANA) return length == SHORT ? "Kata-Hira" : "Katakana-Hiragana"; else return Default.ucd.getCase(Default.ucd.getScriptID_fromIndex((byte)script, length), FULL, TITLE); @@ -678,11 +683,11 @@ public class WriteCharts implements UCD_Types { static final byte COLLATION = 0, NORMALIZATION = 1, CASE = 2; - + static void closeIndexFile(PrintWriter indexFile, String extra, byte choice) { SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss"); df.setTimeZone(TimeZone.getTimeZone("GMT")); - + indexFile.println("


"); boolean gotOne = false; if (choice != COLLATION) { @@ -705,12 +710,12 @@ public class WriteCharts implements UCD_Types { indexFile.println("

"); indexFile.close(); } - + static boolean containsCase(String s) { int cp; for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(s, i); - // contains Lu, Lo, Lt, or Lowercase or Uppercase + // contains Lu, Lo, Lt, or Lowercase or Uppercase byte cat = Default.ucd.getCategory(cp); if (cat == Lu || cat == Ll || cat == Lt) return true; if (Default.ucd.getBinaryProperty(cp, Other_Lowercase)) return true; @@ -718,7 +723,204 @@ public class WriteCharts implements UCD_Types { } return false; } - + + static final Transliterator addCircle = Transliterator.createFromRules( + "any-addCircle", "([[:Mn:][:Me:]]) > \u25CC $1", Transliterator.FORWARD); + + public static void writeCompositionChart() throws IOException { + Default.setUCD(); + UCA uca = new UCA(null,""); + + Set letters = new TreeSet(); + Set marks = new TreeSet(uca); + Set totalMarks = new TreeSet(uca); + Map decomposes = new HashMap(); + Set notPrinted = new TreeSet(new UTF16.StringComparator()); + Set printed = new HashSet(); + + // UnicodeSet latin = new UnicodeSet("[:latin:]"); + + PrintWriter out = Utility.openPrintWriter("composition_chart.html", Utility.UTF8_WINDOWS); + try { + out.println(""); + out.println(""); + out.println(""); + out.println("

Composites

"); + + UnicodeSetIterator it = new UnicodeSetIterator(); + + for (byte script = 0; script < UCD_Types.LIMIT_SCRIPT; ++script) { + + String scriptName = ""; + try { + scriptName = Default.ucd.getScriptID_fromIndex(script); + Utility.fixDot(); + System.out.println(scriptName); + } catch (IllegalArgumentException e) { + System.out.println("Failed to create transliterator for: " + scriptName + "(" + script + ")"); + continue; + } + + + letters.clear(); + letters.add(""); // header row + marks.clear(); + notPrinted.clear(); + printed.clear(); + + for (int cp = 0; cp < 0x10FFFF; ++cp) { + byte type = Default.ucd.getCategory(cp); + if (type == Default.ucd.UNASSIGNED || type == Default.ucd.PRIVATE_USE) continue; // skip chaff + Utility.dot(cp); + + byte newScript = Default.ucd.getScript(cp); + if (newScript != script) continue; + + String source = UTF16.valueOf(cp); + String decomp = Default.nfd.normalize(source); + if (decomp.equals(source)) continue; + + // pick up all decompositions + int count = UTF16.getCharCount(UTF16.charAt(decomp, 0)); + + if (count == decomp.length()) { + notPrinted.add(source); + continue; // skip unless marks + } + + if (UCD.isHangulSyllable(cp)) count = 2; + String first = decomp.substring(0, count); + String second = decomp.substring(count); + //if (!markSet.containsAll(second)) continue; // skip unless marks + + letters.add(first); + marks.add(second); + Utility.addToSet(decomposes, decomp, source); + notPrinted.add(source); + if (source.equals("\u212b")) System.out.println("A-RING!"); + } + + if (marks.size() != 0) { + + totalMarks.addAll(marks); + + + out.println("
"); + out.println(""); + + Iterator it2 = letters.iterator(); + while (it2.hasNext()) { + String let = (String)it2.next(); + out.println("" + showCell(Default.nfc.normalize(let), "class='h'")); + Iterator it3 = marks.iterator(); + while (it3.hasNext()) { + String mark = (String)it3.next(); + String merge = let + mark; + if (let.length() != 0 && decomposes.get(merge) == null) { + out.println(""); + continue; + } + String comp; + try { + comp = Default.nfc.normalize(merge); + } catch (Exception e) { + System.out.println("Failed when trying to compose <" + Utility.hex(e) + ">"); + continue; + } + // skip unless single char or header + /*if (let.length() != 0 + && (UTF16.countCodePoint(comp) != 1 || comp.equals(merge))) { + out.println(""); + continue; + } + */ + Set decomps = (Set) decomposes.get(merge); + if (let.length() == 0) { + printed.add(comp); + out.println(showCell(comp, "class='h'")); + } else if (decomps.contains(comp)) { + printed.add(comp); + out.println(showCell(comp, "class='w'")); + } else { + comp = (String) new ArrayList(decomps).get(0); + printed.add(comp); + out.println(showCell(comp, "class='r'")); + } + } + out.println(""); + } + out.println("
" + scriptName + "
(" + letters.size() + " × " + marks.size() + ")
  

"); + + //out.println("
Other LettersOther Marks
"); + //tabulate(out, atomics.iterator(),16); + //out.println(""); + //out.println("
"); + + } + notPrinted.removeAll(printed); + if (notPrinted.size() != 0) { + tabulate(out, scriptName + " Excluded", notPrinted.iterator(), 24, "class='r'"); + out.println("
"); + } + } + + Set otherMarks = new TreeSet(uca); + UnicodeSet markSet = new UnicodeSet("[[:Me:][:Mn:]]"); + it.reset(markSet); + while (it.next()) { + int cp = it.codepoint; + String source = UTF16.valueOf(cp); + if (totalMarks.contains(source)) continue; // skip all that we have already + otherMarks.add(source); + } + tabulate(out, "Marks that never combine", otherMarks.iterator(), 24, "class='b'"); + + out.println(""); + + } finally { + out.close(); + } + } + + public static void tabulate(PrintWriter out, String caption, Iterator it2, int limit, String classType) { + int count = 0; + out.println(""); + if (caption != null && caption.length() != 0) { + out.println(""); + } + while (it2.hasNext()) { + if (++count > limit) { + out.println(""); + count = 1; + } + + out.println(showCell((String)it2.next(), classType)); + } + out.println("
" + caption + "
"); + } + + public static String showCell(String comp, String classType) { + if (comp == null) { + return " "; + } + return "" + addCircle.transliterate(comp) + + "
" + Utility.hex(comp) + ""; + } + + } @@ -730,7 +932,7 @@ public class WriteCharts implements UCD_Types { static final IntStack p2 = new IntStack(30); static final IntStack s2 = new IntStack(30); static final IntStack t2 = new IntStack(30); - + static int getStrengthDifference(CEList ceList, CEList lastCEList) { extractNonzeros(ceList, p1, s1, t1); extractNonzeros(lastCEList, p2, s2, t2); @@ -742,12 +944,12 @@ public class WriteCharts implements UCD_Types { if (temp != 0) return 1; return 0; } - + static void extractNonzeros(CEList ceList, IntStack primaries, IntStack secondaries, IntStack tertiaries) { primaries.clear(); secondaries.clear(); tertiaries.clear(); - + for (int i = 0; i < ceList.length(); ++i) { int ce = ceList.at(i); int temp = UCA.getPrimary(ce); diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index 6ca6b598295..2307d32dd49 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2002/07/15 15:23:01 $ -* $Revision: 1.26 $ +* $Date: 2002/09/25 06:40:14 $ +* $Revision: 1.27 $ * ******************************************************************************* */ @@ -144,7 +144,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types { BufferedReader in = Utility.openUnicodeFile("CaseFolding", UNICODE_VERSION, true, false); // new BufferedReader(new FileReader(DIR31 + "CaseFolding-3.d3.alpha.txt"), 64*1024); // log = new PrintWriter(new FileOutputStream("CaseFolding_data.js")); - log = Utility.openPrintWriter("CaseFolding_data.js", false, false); + log = Utility.openPrintWriter("CaseFolding_data.js", Utility.UTF8_WINDOWS); log.println("var CF = new Object();"); int count = 0; while (true) { @@ -189,7 +189,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types { Normalizer normKD = new Normalizer(Normalizer.NFKD, UNICODE_VERSION); Normalizer normD = new Normalizer(Normalizer.NFD, UNICODE_VERSION); //log = new PrintWriter(new FileOutputStream("Normalization_data.js")); - log = Utility.openPrintWriter("Normalization_data.js", false, false); + log = Utility.openPrintWriter("Normalization_data.js", Utility.LATIN1_WINDOWS); int count = 0; @@ -318,7 +318,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON } } - PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, true); + PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", Utility.UTF8_WINDOWS); //if (!shortPrint) log.write('\uFEFF'); log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion()); log.println("# Generated: " + getNormalDate()); @@ -702,7 +702,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON }*/ static void testCompatibilityCharacters() throws IOException { - log = Utility.openPrintWriter("UCA_CompatComparison.txt"); + log = Utility.openPrintWriter("UCA_CompatComparison.txt", Utility.UTF8_WINDOWS); int[] kenCes = new int[50]; int[] markCes = new int[50]; @@ -1196,7 +1196,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON "UTF8"), 32*1024)); */ - PrintWriter diLog = Utility.openPrintWriter("UCA_Contractions.txt", false, false); + PrintWriter diLog = Utility.openPrintWriter("UCA_Contractions.txt", Utility.UTF8_WINDOWS); diLog.write('\uFEFF'); @@ -1234,7 +1234,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON "UTF8"), 32*1024)); */ - PrintWriter diLog = Utility.openPrintWriter("DisjointIgnorables.js", false, false); + PrintWriter diLog = Utility.openPrintWriter("DisjointIgnorables.js", Utility.LATIN1_WINDOWS); diLog.write('\uFEFF'); @@ -1413,7 +1413,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON "UTF8"), 32*1024)); */ - PrintWriter diLog = Utility.openPrintWriter("DisjointIgnorables2.js", false, false); + PrintWriter diLog = Utility.openPrintWriter("DisjointIgnorables2.js", Utility.LATIN1_WINDOWS); diLog.write('\uFEFF'); @@ -1660,7 +1660,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON int[] lenArray = new int[1]; Set alreadyDone = new HashSet(); - PrintWriter log2 = Utility.openPrintWriter("UCARules-log.txt", false, false); + PrintWriter log2 = Utility.openPrintWriter("UCARules-log.txt", Utility.LATIN1_WINDOWS); while (true) { String s = cc.next(ces, lenArray); @@ -1784,7 +1784,7 @@ F900..FAFF; CJK Compatibility Ideographs if (shortPrint) filename += "_SHORT"; if (option == IN_XML) filename += ".xml"; else filename += ".txt"; - log = Utility.openPrintWriter(filename, false, false); + log = Utility.openPrintWriter(filename, Utility.LATIN1_WINDOWS); String[] commentText = { "UCA Rules", @@ -3951,7 +3951,7 @@ static int swapCJK(int i) { Default.setUCD(); //log = new PrintWriter(new FileOutputStream("CheckCollationValidity.html")); - log = Utility.openPrintWriter("CheckCollationValidity.html", false, false); + log = Utility.openPrintWriter("CheckCollationValidity.html", Utility.UTF8_WINDOWS); log.println(""); log.println("UCA Validity Log"); @@ -4618,7 +4618,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; static PrintWriter writeHead(int counter, int end, String title, String other, String version, boolean show) throws IOException { - PrintWriter out = Utility.openPrintWriter(title + pad(counter) + ".html"); + PrintWriter out = Utility.openPrintWriter(title + pad(counter) + ".html", Utility.UTF8_WINDOWS); copyFile(out, "HTML-Part1.txt"); /* diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index 836fa6b1dde..b3a9f551dee 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2002/08/09 23:56:24 $ -* $Revision: 1.22 $ +* $Date: 2002/09/25 06:40:13 $ +* $Revision: 1.23 $ * ******************************************************************************* */ @@ -73,11 +73,15 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry(); + else if (arg.equalsIgnoreCase("quicktest")) QuickTest.test(); + else if (arg.equalsIgnoreCase("TernaryStore")) TernaryStore.test(); + else if (arg.equalsIgnoreCase("checkBIDI")) VerifyUCD.checkBIDI(); else if (arg.equalsIgnoreCase("Buildnames")) BuildNames.main(null); else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null); + else if (arg.equalsIgnoreCase("GenerateCaseTest")) GenerateCaseTest.main(null); else if (arg.equalsIgnoreCase("checkDecompFolding")) VerifyUCD.checkDecompFolding(); else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null); diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 324ea0a7f43..6ce3c43055a 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2002/08/04 21:38:45 $ -* $Revision: 1.17 $ +* $Date: 2002/09/25 06:40:13 $ +* $Revision: 1.18 $ * ******************************************************************************* */ @@ -964,6 +964,9 @@ to guarantee identifier closure. public boolean hasComputableName(int codePoint) { if (codePoint >= 0xF900 && codePoint <= 0xFA2D) return true; + if (codePoint >= 0x2800 && codePoint <= 0x28FF) return true; + if (codePoint >= 0x2F800 && codePoint <= 0x2FA1D) return true; + int rangeStart = mapToRepresentative(codePoint, major < 2); switch (rangeStart) { default: diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index 67fc9fcb655..57c8a170b2d 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2002/08/04 21:38:44 $ -* $Revision: 1.24 $ +* $Date: 2002/09/25 06:40:14 $ +* $Revision: 1.25 $ * ******************************************************************************* */ @@ -18,11 +18,16 @@ import java.text.*; import java.io.*; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.Replaceable; +import com.ibm.icu.text.ReplaceableString; +import com.ibm.icu.text.UnicodeMatcher; + import com.ibm.text.UCD.*; public final class Utility implements UCD_Types { // COMMON UTILITIES static final boolean UTF8 = true; // TODO -- make argument + public static final char BOM = '\uFEFF'; public static String[] append(String[] array1, String[] array2) { String[] temp = new String[array1.length + array2.length]; @@ -334,6 +339,83 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES } return output.toString(); } + + + public static final class Position { + public int start, limit; + } + + /** + * Finds the next position in the text that matches. + * @param divider A UnicodeMatcher, such as a UnicodeSet. + * @text obvious + * @offset starting offset + * @output start and limit of the piece found. If the return is false, then start,limit = length + * @return true iff match found + */ + public static boolean next(UnicodeMatcher matcher, Replaceable text, int offset, + Position output) { + int[] io = new int[1]; // TODO replace later; extra object creation + int limit = text.length(); + // don't worry about surrogates; matcher will handle + for (int i = offset; i <= limit; ++i) { + io[0] = i; + if (matcher.matches(text, io, limit, false) == UnicodeMatcher.U_MATCH) { + // a hit, return + output.start = i; + output.limit = io[0]; + return true; + } + } + output.start = output.limit = limit; + return false; + } + + /** + * Finds the next position in the text that matches. + * @param divider A UnicodeMatcher, such as a UnicodeSet. + * @text obvious + * @offset starting offset + * @output start and limit of the piece found. If the return is false, then start,limit = 0 + * @return true iff match found + */ + public static boolean previous(UnicodeMatcher matcher, Replaceable text, int offset, + Position output) { + int[] io = new int[1]; // TODO replace later; extra object creation + int limit = 0; + // don't worry about surrogates; matcher will handle + for (int i = offset; i >= limit; --i) { + io[0] = i; + if (matcher.matches(text, io, offset, false) == UnicodeMatcher.U_MATCH) { + // a hit, return + output.start = i; + output.limit = io[0]; + return true; + } + } + output.start = output.limit = limit; + return false; + } + + /** + * Splits a string containing divider into pieces, storing in output + * and returns the number of pieces. The string does not have to be terminated: + * the segment after the last divider is returned in the last output element. + * Thus if the string has no dividers, then the whole string is returned in output[0] + * with a return value of 1. + * @param divider A UnicodeMatcher, such as a UnicodeSet. + * @param s the text to be divided + * @param output where the resulting pieces go + * @return the number of items put into output + */ + public static int split(UnicodeMatcher divider, Replaceable text, Position[] output) { + int index = 0; + for (int offset = 0;; offset = output[index-1].limit) { + if (output[index] == null) output[index] = new Position(); + boolean matches = next(divider, text, offset, output[index++]); + if (!matches) return index; + } + } /** * Splits a string containing divider into pieces, storing in output @@ -358,14 +440,14 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES } public static String[] split(String s, char divider) { - String[] result = new String[100]; + String[] result = new String[100]; // HACK int count = split(s, divider, result); return extract(result, 0, count); } - public static String[] extract(String[] source, int start, int end) { - String[] result = new String[end-start]; - System.arraycopy(source, start, result, 0, end - start); + public static String[] extract(String[] source, int start, int limit) { + String[] result = new String[limit-start]; + System.arraycopy(source, start, result, 0, limit - start); return result; } @@ -564,7 +646,8 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES // Or if they are UTF8, use true, false public static PrintWriter openPrintWriter(String filename, byte options) throws IOException { File file = new File(getOutputName(filename)); - System.out.println("Creating File: " + file); + Utility.fixDot(); + System.out.println("Creating File: " + file.getCanonicalPath()); File parent = new File(file.getParent()); //System.out.println("Creating File: "+ parent); parent.mkdirs(); @@ -609,6 +692,28 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES } } + public static void print(PrintWriter pw, Map c, String pairSeparator, String separator, Breaker b) { + Iterator it = c.keySet().iterator(); + boolean first = true; + Object last = null; + while (it.hasNext()) { + Object obj = it.next(); + Object result = c.get(obj); + if (b != null && !b.filter(obj)) continue; + if (first) { + first = false; + } else { + pw.print(separator); + } + if (b != null) { + pw.print(b.get(obj, last) + pairSeparator + result); + } else { + pw.print(obj + pairSeparator + result); + } + last = obj; + } + } + public static void appendFile(String filename, boolean utf8, PrintWriter output) throws IOException { appendFile(filename, utf8, output, null); } @@ -870,19 +975,35 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES static PrintWriter showSetNamesPw; public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) { - if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out); - showSetNames(showSetNamesPw, prefix, set, separateLines, false, ucd); - showSetNamesPw.flush(); + showSetNames(prefix, set, separateLines, false, false, ucd); + } + + public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) { + showSetNames(prefix, set, separateLines, IDN, false, ucd); } public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) { + showSetNames( pw, prefix, set, separateLines, IDN, false, ucd); + } + + public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, boolean IDN, boolean withChar, UCD ucd) { + if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out); + showSetNames(showSetNamesPw, prefix, set, separateLines, IDN, withChar, ucd); + showSetNamesPw.flush(); + } + + public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, + boolean withChar, UCD ucd) { int count = set.getRangeCount(); for (int i = 0; i < count; ++i) { int start = set.getRangeStart(i); int end = set.getRangeEnd(i); if (separateLines || (IDN && isSeparateLineIDN(start,end,ucd))) { for (int cp = start; cp <= end; ++cp) { - if (!IDN) pw.println(prefix + ucd.getCodeAndName(cp)); + if (!IDN) pw.println(prefix + ucd.getCode(cp) + + "\t# " + + (withChar ? " (" + UTF16.valueOf(cp) + ") " : "") + + ucd.getName(cp)); else { pw.println(prefix + Utility.hex(cp,4) + "; " + ucd.getName(cp)); } @@ -891,7 +1012,10 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES if (!IDN) { pw.println(prefix + ucd.getCode(start) + ((start != end) ? (".." + ucd.getCode(end)) : "") - + "\t# " + ucd.getName(start) + ((start != end) ? (".." + ucd.getName(end)) : "") + + "\t# " + + (withChar ? " (" + UTF16.valueOf(start) + + ((start != end) ? (".." + UTF16.valueOf(end)) : "") + ") " : "") + + ucd.getName(start) + ((start != end) ? (".." + ucd.getName(end)) : "") ); } else {