diff --git a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java index 42a64314b40..97278a46bb9 100644 --- a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java +++ b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $ -* $Date: 2001/09/19 23:32:21 $ -* $Revision: 1.4 $ +* $Date: 2001/10/25 20:35:42 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -30,6 +30,33 @@ public class GenOverlap implements UCD_Types { static Normalizer nfd; static Normalizer nfkd; + /** Scans code points 0 through 0x10FFFF and, for each represented one whose decomposition type is at least UCD.COMPATIBILITY, compares the CE list built from its NFKD form (decomp flag true, tertiaries remapped) with the CE list fetched for the character itself (decomp flag false). Every mismatch is printed to System.out as the code and name followed by both lists. Also reassigns the static collator, ucd, nfd and nfkd fields. */ public static void validateUCA(UCA collatorIn) throws Exception { + collator = collatorIn; + ucd = UCD.make(); + + nfd = new Normalizer(Normalizer.NFD); + nfkd = new Normalizer(Normalizer.NFKD); + + for (int cp = 0x0; cp <= 0x10FFFF; ++cp) { + Utility.dot(cp); + if (!ucd.isRepresented(cp)) continue; + byte decompType = ucd.getDecompositionType(cp); + if (decompType >= UCD.COMPATIBILITY) { + String decomp = nfkd.normalize(cp); + CEList celistDecomp = getCEList(cp, decomp, true, decompType); + CEList celistNormal = getCEList(UTF16.valueOf(cp), false); /* NOTE(review): this second call refills the same static ces array that celistDecomp was built from; confirm that CEList copies its input */ + if (!celistNormal.equals(celistDecomp)) { + Utility.fixDot(); + System.out.println(); + System.out.println(ucd.getCodeAndName(cp)); + System.out.println(celistNormal); + System.out.println(celistDecomp); + } + } + } + + } + public static void test(UCA collatorIn) throws Exception { collator = collatorIn; @@ -68,7 +95,7 @@ public class GenOverlap implements UCD_Types { byte decompType = ucd.getDecompositionType(cp); if (decompType >= UCD.COMPATIBILITY) { String decomp = nfkd.normalize(cp); - CEList celist = getCEList(cp, decomp, decompType); + CEList celist = getCEList(cp, decomp, true, decompType); addString(decomp, celist); System.out.println("Adding: " + ucd.getCodeAndName(cp) +
"\t" + celist); } @@ -182,16 +209,22 @@ public class GenOverlap implements UCD_Types { } static private CEList getCEList(String s) { - int len = collator.getCEs(s, true, ces); + return getCEList(s, true); + } + + /* Runs collator.getCEs(s, decomp, ces) and wraps the first len entries of the static ces array in a CEList. */ static private CEList getCEList(String s, boolean decomp) { + int len = collator.getCEs(s, decomp, ces); return new CEList(ces, 0, len); } - static private CEList getCEList(int originalChar, String s, byte type) { - int len = collator.getCEs(s, true, ces); - for (int i = 0; i < len; ++i) { - ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]), - UCA.getSecondary(ces[i]), - CEList.remap(originalChar, type, UCA.getTertiary(ces[i]))); + /* Same as the two argument form, except that when decomp is true every CE is rebuilt with UCA.makeKey, keeping its primary and secondary weights and replacing the tertiary by CEList.remap(originalChar, type, tertiary). With decomp false the CEs are wrapped untouched. */ static private CEList getCEList(int originalChar, String s, boolean decomp, byte type) { + int len = collator.getCEs(s, decomp, ces); + if (decomp) { + for (int i = 0; i < len; ++i) { + ces[i] = UCA.makeKey(UCA.getPrimary(ces[i]), + UCA.getSecondary(ces[i]), + CEList.remap(originalChar, type, UCA.getTertiary(ces[i]))); + } } return new CEList(ces, 0, len); } @@ -290,7 +323,7 @@ public class GenOverlap implements UCD_Types { } public static void generateRevision (UCA collatorIn) throws Exception { - generateRevision(collatorIn, false); + //generateRevision(collatorIn, false); generateRevision(collatorIn, true); } @@ -336,7 +369,7 @@ public class GenOverlap implements UCD_Types { int cp; for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(str, i); - if (0xFF67 <= cp && cp <= 0xFF6F) { + if (0xFF3F == cp) { System.out.println("debug"); } boolean mashLast = false; @@ -351,7 +384,7 @@ public class GenOverlap implements UCD_Types { int s = UCA.getSecondary(ces[j]); boolean needsFix = (s != 0x20 && p != 0); if (needsFix) ++len; - int t = (doMax && len > 1 && j == len-1 ? 0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j]))); + int t = (doMax && j > 0 ?
0x1F : CEList.remap(cp, type, UCA.getTertiary(ces[j]))); if (needsFix) { ces[j++] = UCA.makeKey(p, 0x20, t); // Set Extra System.arraycopy(ces, j, ces, j+1, len - j); // Insert HOLE! @@ -413,7 +446,7 @@ public class GenOverlap implements UCD_Types { newKeys.removeAll(joint); oldKeys.removeAll(joint); - PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt")); + PrintWriter log = Utility.openPrintWriter("UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), false); Iterator it = list.iterator(); int last = -1; while (it.hasNext()) { @@ -657,4 +690,51 @@ public class GenOverlap implements UCD_Types { + "" + nf.format(sd) + ""); } + + /** Writes ListCyrillic.txt. For every represented char below 0xFFFF whose script is CYRILLIC_SCRIPT, takes the NFD form, removes every char of category Mn, and collects the stripped string in one set and, when stripping changed it, the unstripped form in a second set; both sets are ordered by collatorIn. Each entry of the first set is printed with a "# " prefix and each entry of the second with "### ", followed by the name of its first char with "CYRILLIC " removed. Reassigns the static ucd and nfd fields. */ public static void listCyrillic(UCA collatorIn) throws IOException { + PrintWriter log = Utility.openPrintWriter("ListCyrillic.txt", false); + Set set = new TreeSet(collatorIn); + Set set2 = new TreeSet(collatorIn); + ucd = UCD.make(); + + nfd = new Normalizer(Normalizer.NFD); + + for (char i = 0; i < 0xFFFF; ++i) { /* char counter: the bound must stay exclusive, an inclusive 0xFFFF would wrap to 0 and never terminate */ + Utility.dot(i); + if (!ucd.isRepresented(i)) continue; + if (ucd.getScript(i) != CYRILLIC_SCRIPT) continue; + + String decomp = nfd.normalize(String.valueOf(i)); + String oldDecomp = decomp; + for (int j = 0; j < decomp.length(); ++j) { + if (ucd.getCategory(decomp.charAt(j)) == Mn) { + decomp = decomp.substring(0,j) + decomp.substring(j+1); --j; /* stay on this index: the following char has shifted into it and may be another mark */ + } + } + if (decomp.length() == 0) continue; + + set.add(decomp); + if (!decomp.equals(oldDecomp)) set2.add(oldDecomp); + } + + Iterator it = set.iterator(); + while (it.hasNext()) { + String s = (String) it.next(); + String name = ucd.getName(s.charAt(0)); + name = Utility.replace(name, "CYRILLIC ", ""); /* keep the result: Strings are immutable. NOTE(review): assumes Utility.replace returns the new String */ + log.println("# " + s + " <> XXX ; # " + name); + } + + it = set2.iterator(); + while (it.hasNext()) { + String s = (String) it.next(); + String name = ucd.getName(s.charAt(0)); + name = Utility.replace(name, "CYRILLIC ", ""); /* keep the result, as in the first loop */ + log.println("### " + s + " <> XXX ; # " + name); + } + + log.close(); + } + + } \ No newline at end of file diff --git
a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index 3855c3cf0c9..22055cba399 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2001/09/19 23:32:21 $ -* $Revision: 1.4 $ +* $Date: 2001/10/25 20:35:41 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -61,9 +61,13 @@ This is because of shared characters between scripts with different directions, like French with Arabic or Greek. */ -final public class UCA { +final public class UCA implements Comparator { public static final String copyright = "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved."; + + /** Comparator entry point: casts both arguments to String, obtains each one's sort key through getSortKey, and returns the compareTo result of the first key against the second. An argument that is not a String fails the cast with ClassCastException. */ public int compare(Object a, Object b) { + return getSortKey((String) a).compareTo(getSortKey((String) b)); + } /** * Version of the UCA tables to use diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java index f61ddac467a..293a8dd3d7a 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $ -* $Date: 2001/09/19 23:31:50 $ -* $Revision: 1.1 $ +* $Date: 2001/10/25 20:35:41 $ +* $Revision: 1.2 $ * ******************************************************************************* */ @@ -30,6 +30,7 @@ public class WriteCharts implements UCD_Types { ucd = UCD.make(); Normalizer nfd = new Normalizer(Normalizer.NFD); + Normalizer nfc = new Normalizer(Normalizer.NFC); UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps @@ -53,18
+54,23 @@ public class WriteCharts implements UCD_Types { int lastPrimary = -1; - String lastSortKey = null; + String lastSortKey = "\u0000"; int high = uca.getSortKey("a").charAt(0); int variable = UCA.getPrimary(uca.getVariableHigh()); int columnCount = 0; + Utility.copyTextFile("index.html", true, "CollationCharts\\index.html"); + Utility.copyTextFile("charts.css", false, "CollationCharts\\charts.css"); + Utility.copyTextFile("help.html", true, "CollationCharts\\help.html"); + indexFile = Utility.openPrintWriter("CollationCharts\\index_list.html"); indexFile.println(""); indexFile.println("UCA Default Collation Table"); indexFile.println(""); + indexFile.println(""); indexFile.println("

UCA Default Collation Table

"); indexFile.println("

Help"); @@ -102,19 +108,31 @@ public class WriteCharts implements UCD_Types { oldScript = script; } - int strength = 6; - if (lastSortKey != null && sortKey.charAt(0) == lastSortKey.charAt(0)) { - strength = uca.strengthDifference(sortKey, lastSortKey); - if (strength < 0) strength = -strength; - } + boolean firstPrimaryEquals = sortKey.charAt(0) == lastSortKey.charAt(0); + + int strength = uca.strengthDifference(sortKey, lastSortKey); + if (strength < 0) strength = -strength; lastSortKey = sortKey; + + // find out if this is an expansion: more than one primary weight + + int primaryCount = 0; + for (int i = 0; i < sortKey.length(); ++i) { + char w = sortKey.charAt(i); + if (w == 0) break; + ++ primaryCount; + } + String breaker = ""; - if (columnCount > 10 || strength > 5) { - if (strength <= 5) breaker = ""; - else breaker = ""; + if (columnCount > 10 || !firstPrimaryEquals) { + if (!firstPrimaryEquals) breaker = ""; + else breaker = ""; // indent 1 cell columnCount = 0; } - output.println(breaker + CLASSNAME[strength] + s + + String classname = primaryCount > 1 ? XCLASSNAME[strength] : CLASSNAME[strength]; + + output.println(breaker + classname + nfc.normalize(s) + "
" + Utility.hex(s) //+ "
" + script //+ "
" + UCA.toString(sortKey) @@ -133,8 +151,15 @@ public class WriteCharts implements UCD_Types { "", "", "", - "", - ""}; + ""}; + + static final String[] XCLASSNAME = { + "", + "", + "", + "", + "", + ""}; static PrintWriter indexFile; diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index aa04e472ab6..da2345bd325 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2001/09/19 23:32:21 $ -* $Revision: 1.4 $ +* $Date: 2001/10/25 20:35:41 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -63,9 +63,13 @@ public class WriteCollationData implements UCD_Types { String arg = args[i]; if (arg.equalsIgnoreCase("WriteRulesWithNames")) writeRules(WITH_NAMES); else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(collator); + else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(collator); + else if (arg.equalsIgnoreCase("writeNonspacingDifference")) writeNonspacingDifference(); + else if (arg.equalsIgnoreCase("WriteCharts")) WriteCharts.test(collator); else if (arg.equalsIgnoreCase("CheckHash")) GenOverlap.checkHash(collator); else if (arg.equalsIgnoreCase("generateRevision")) GenOverlap.generateRevision(collator); + else if (arg.equalsIgnoreCase("listCyrillic")) GenOverlap.listCyrillic(collator); else if (arg.equalsIgnoreCase("WriteRules")) writeRules(WITHOUT_NAMES); else if (arg.equalsIgnoreCase("WriteRulesXML")) writeRules(IN_XML); @@ -748,6 +752,47 @@ public class WriteCollationData implements UCD_Types { return len; } + static void writeNonspacingDifference() throws IOException { + PrintWriter diLog = new PrintWriter( + new 
BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(GEN_DIR + "UCA_Nonspacing.txt"), + "UTF8"), + 32*1024)); + /* The log is UCA_Nonspacing.txt under GEN_DIR, written as UTF8 and starting with the character U+FEFF. */ diLog.write('\uFEFF'); + + Normalizer nfd = new Normalizer(Normalizer.NFD); /* NOTE(review): nfd is not used anywhere in the visible body */ + + Set sorted = new TreeSet(); + + for (int i = 0; i <= 0x10FFFF; ++i) { /* inclusive bound so the last code point 0x10FFFF is scanned too */ + Utility.dot(i); + if (!ucd.isRepresented(i)) continue; + byte cat = ucd.getCategory(i); + boolean isNonSpacing = cat == Mn || cat == Me; + CEList celist = collator.getCEList(UTF32.valueOf32(i), true); + boolean isPrimaryIgnorable = true; /* stays true only while every CE of the code point has a zero primary */ + for (int j = 0; j < celist.length(); ++j) { + int ce = celist.at(j); + int primary = collator.getPrimary(ce); + if (primary != 0) { + isPrimaryIgnorable = false; + break; + } + } + + if (isNonSpacing != isPrimaryIgnorable) { /* record only code points where category Mn or Me and all zero primaries disagree */ + sorted.add(ucd.getCategoryID(i) + + "\t" + celist + + "\t" + ucd.getCodeAndName(i)); + } + } + + Utility.print(diLog, sorted, "\r\n"); + + diLog.close(); + } + static void writeContractions() throws IOException { PrintWriter diLog = new PrintWriter( new BufferedWriter(