diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java index 89f93cc0298..5726e81eb2f 100644 --- a/tools/unicodetools/com/ibm/text/UCA/Main.java +++ b/tools/unicodetools/com/ibm/text/UCA/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ -* $Date: 2002/06/22 21:02:16 $ -* $Revision: 1.7 $ +* $Date: 2002/06/28 01:59:58 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -18,7 +18,7 @@ import com.ibm.text.utility.*; public class Main { static final String UCDVersion = ""; - static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA", + static final String[] ICU_FILES = {"writeCollationValidityLog", "writeFractionalUCA", "WriteRules", "WriteRulesWithNames", "WriteRulesXML", "writeconformance", "writeconformanceshifted", "short", "writeconformance", "writeconformanceshifted", @@ -65,7 +65,7 @@ public class Main { else if (arg.equalsIgnoreCase("WriteRulesXML")) WriteCollationData.writeRules(WriteCollationData.IN_XML); else if (arg.equalsIgnoreCase("checkDisjointIgnorables")) WriteCollationData.checkDisjointIgnorables(); else if (arg.equalsIgnoreCase("writeContractions")) WriteCollationData.writeContractions(); - else if (arg.equalsIgnoreCase("FractionalUCA")) WriteCollationData.writeFractionalUCA("FractionalUCA"); + else if (arg.equalsIgnoreCase("writeFractionalUCA")) WriteCollationData.writeFractionalUCA("FractionalUCA"); else if (arg.equalsIgnoreCase("writeConformance")) WriteCollationData.writeConformance("CollationTest_NON_IGNORABLE", UCA.NON_IGNORABLE, shortPrint); else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) WriteCollationData.writeConformance("CollationTest_SHIFTED", UCA.SHIFTED, shortPrint); else if (arg.equalsIgnoreCase("testCompatibilityCharacters")) WriteCollationData.testCompatibilityCharacters(); @@ -80,7 +80,7 @@ public class Main { System.out.println("UNKNOWN OPTION (" + arg + "): must be one of the following (case-insensitive)"); System.out.println("\tWriteRulesXML, WriteRulesWithNames, WriteRules,"); System.out.println("\tcheckDisjointIgnorables, writeContractions,"); - System.out.println("\tFractionalUCA, writeConformance, writeConformanceSHIFTED, testCompatibilityCharacters,"); + System.out.println("\twriteFractionalUCA, writeConformance, writeConformanceSHIFTED, testCompatibilityCharacters,"); System.out.println("\twriteCollationValidityLog, writeCaseExceptions, writeJavascriptInfo, writeCaseFolding"); System.out.println("\tjavatest, hex (used for conformance)"); } diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index d64497980bb..77b2c04e35c 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2002/06/24 15:25:10 $ -* $Revision: 1.15 $ +* $Date: 2002/06/28 01:59:58 $ +* $Revision: 1.16 $ * ******************************************************************************* */ @@ -75,7 +75,8 @@ final public class UCA implements Comparator, UCA_Types { * Version of the UCA tables to use */ //private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7"; - public static final String VERSION = "-3.1.1d1"; // ""; // "-2.1.9d7"; + public static final String UCA_BASE = "3.1.1"; // ""; // "-2.1.9d7"; + public static final String VERSION = "-" + UCA_BASE + "d5"; // ""; // "-2.1.9d7"; public static final String ALLFILES = "allkeys"; // null if not there /** @@ -240,7 +241,9 @@ final public class UCA implements Comparator, UCA_Types { // add weights char w = getPrimary(ce); if (DEBUG) System.out.println("\tCE: " + Utility.hex(ce)); - if (w != 0) primaries.append(w); + if (w != 0) { + primaries.append(w); + } w = getSecondary(ce); if (w != 0) { @@ -252,9 +255,13 @@ final public class UCA implements Comparator, UCA_Types { } w = getTertiary(ce); - if (w != 0) tertiaries.append(w); + if (w != 0) { + tertiaries.append(w); + } - if (weight4 != 0) quaternaries.append(weight4); + if (weight4 != 0) { + quaternaries.append(weight4); + } } // Produce weight strings @@ -263,13 +270,13 @@ final public class UCA implements Comparator, UCA_Types { StringBuffer result = primaries; if (strength >= 2) { - result.append('\u0000'); // separator + result.append(LEVEL_SEPARATOR); // separator result.append(secondaries); if (strength >= 3) { - result.append('\u0000'); // separator + result.append(LEVEL_SEPARATOR); // separator result.append(tertiaries); if (strength >= 4) { - result.append('\u0000'); // separator + result.append(LEVEL_SEPARATOR); // separator if (alternate == SHIFTED_TRIMMED) { int q; for (q = quaternaries.length()-1; q >= 0; --q) { @@ -303,7 +310,7 @@ final public class UCA implements Comparator, UCA_Types { char c2 = sortKey2.charAt(i); if (c1 < c2) return -strength; if (c1 > c2) return strength; - if (c1 == '\u0000') --strength; // Separator! + if (c1 == LEVEL_SEPARATOR) --strength; // Separator! } if (len1 < len2) return -strength; if (len1 > len2) return strength; @@ -399,15 +406,21 @@ final public class UCA implements Comparator, UCA_Types { * @param source Normal UTF-16 (Java) string * @return sort key (as string) * @author Markus Scherer (cast into Java by MD) + * NOTE: changed to be longer, but handle isolated surrogates */ public static StringBuffer appendInCodePointOrder(String source, StringBuffer target) { - for (int i = 0; i < source.length(); ++i) { - int ch = source.charAt(i); + int cp; + for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(source, i); + target.append((char)((cp >> 15) | 0x8000)); + target.append((char)(cp | 0x8000)); + /* if (ch <= 1) { // hack to avoid nulls target.append('\u0001'); target.append((char)(ch+1)); } target.append((char)(ch + utf16CodePointOrder[ch>>11])); + */ } return target; } @@ -659,9 +672,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] */ /** - * Returns implicit value as pair, first part in high word; second part in low word - * So to get first part use (x >>> 16) -- remember the >>>! - * and to get the second part use (x & 0xFFFF) + * Returns implicit value */ void CodepointToImplicit(int cp, int[] output) { @@ -673,9 +684,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] } /** - * Takes implicit value as pair, first part in high word; second part in low word - * So to get first part use (x >>> 16) -- remember the >>>! - * and to get the second part use (x & 0xFFFF) + * Takes implicit value */ static int ImplicitToCodePoint(int leadImplicit, int trailImplicit) { @@ -997,7 +1006,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] // push BBBB - expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY)); + expandingStack.push(makeKey(implicit[1], 0, 0)); // return AAAA @@ -1127,7 +1136,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] // normal case while (current++ < 0x10FFFF) { - if (current == 0x406) { + if (DEBUG && current == 0xdbff) { System.out.println("DEBUG"); } //char ch = (char)current; diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index 808943150b3..4ca5ac6c342 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2002/06/24 15:25:10 $ -* $Revision: 1.23 $ +* $Date: 2002/06/28 01:59:53 $ +* $Revision: 1.24 $ * ******************************************************************************* */ @@ -25,6 +25,8 @@ import java.io.*; import java.text.RuleBasedCollator; import java.text.CollationElementIterator; import java.text.Collator; +import java.text.DateFormat; +import java.text.SimpleDateFormat; import com.ibm.text.UCD.*; import com.ibm.text.UCD.UCD_Types; @@ -34,7 +36,7 @@ import com.ibm.text.UCD.Normalizer; public class WriteCollationData implements UCD_Types, UCA_Types { static final boolean DEBUG = false; - static final boolean DEBUG_SHOW_ITERATION = true; + static final boolean DEBUG_SHOW_ITERATION = false; @@ -299,18 +301,27 @@ public class WriteCollationData implements UCD_Types, UCA_Types { U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON => U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS, U+0304 COMBINING MACRON */ - String[] testList = {"\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"}; - for (int jj = 0; jj < testList.length; ++jj) { - String t = testList[jj]; - System.out.println(ucd.getCodeAndName(t)); - String test = collator.getSortKey(t, UCA.NON_IGNORABLE); - System.out.println("Decomp: " + collator.toString(test)); - test = collator.getSortKey(t, UCA.NON_IGNORABLE, false); - System.out.println("No Dec: " + collator.toString(test)); + if (DEBUG) { + String[] testList = {"\u3192", "\u3220", "\u0344", "\u0385", "\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"}; + for (int jj = 0; jj < testList.length; ++jj) { + String t = testList[jj]; + System.out.println(ucd.getCodeAndName(t)); + + CEList ces = collator.getCEList(t, true); + System.out.println("CEs: " + ces); + + String test = collator.getSortKey(t, option); + System.out.println("Decomp: " + collator.toString(test)); + + test = collator.getSortKey(t, option, false); + System.out.println("No Dec: " + collator.toString(test)); + } } - PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, false); - if (!shortPrint) log.write('\uFEFF'); + PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, true); + //if (!shortPrint) log.write('\uFEFF'); + log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion()); + log.println("# Generated: " + getNormalDate()); System.out.println("Sorting"); int counter = 0; @@ -333,7 +344,6 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON } Utility.dot(counter++); addStringX(s, option); - // TODO: add other accents with Cyrillic } UnicodeSet found = collator.found; @@ -472,7 +482,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON for (int j = 0; j < CONTRACTION_TEST.length; ++j) { String extra = s.substring(0,i) + CONTRACTION_TEST[j] + s.substring(i); addStringY(extra + 'a', option); - System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(extra)); + if (DEBUG) System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(extra)); } } } @@ -488,31 +498,51 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON sortedD.put(colDbase, s); } - + static UCD ucd_uca_base = null; /** * Check that the primaries are the same as the compatibility decomposition. */ static void checkBadDecomps(int strength, boolean decomposition, UnicodeSet alreadySeen) { + if (ucd_uca_base == null) { + ucd_uca_base = UCD.make(UCA.UCA_BASE); + } int oldStrength = collator.getStrength(); collator.setStrength(strength); Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION); - if (strength == 1) { - log.println("

3. Primaries Incompatible with Decompositions

"); - } else { - log.println("

4. Secondaries Incompatible with Decompositions

"); + Normalizer nfc = new Normalizer(Normalizer.NFC, UNICODE_VERSION); + switch (strength) { + case 1: log.println("

3. Primaries Incompatible with Decompositions

"); break; + case 2: log.println("

4. Secondaries Incompatible with Decompositions

"); break; + case 3: log.println("

5. Tertiaries Incompatible with Decompositions

"); + log.println("

Note: Tertiary differences are not really errors; these are just warnings

"); + break; + default: throw new IllegalArgumentException("bad strength: " + strength); } + log.println("

Warning: only checking characters defined in base: " + ucd_uca_base.getVersion() + "

"); + log.println("
"); log.println(""); + + int errorCount = 0; + + UnicodeSet skipSet = new UnicodeSet(); + for (int ch = 0; ch < 0x10FFFF; ++ch) { - if (!ucd.isAllocated(ch)) continue; + if (!ucd_uca_base.isAllocated(ch)) continue; if (nfkd.isNormalized(ch)) continue; if (ch > 0xAC00 && ch < 0xD7A3) continue; // skip most of Hangul if (alreadySeen.contains(ch)) continue; Utility.dot(ch); String decomp = nfkd.normalize(ch); - if (ch != ' ' && decomp.charAt(0) == ' ') continue; // skip wierd decomps - if (ch != '\u0640' && decomp.charAt(0) == '\u0640') continue; // skip wierd decomps + if (ch != ' ' && decomp.charAt(0) == ' ') { + skipSet.add(ch); + continue; // skip wierd decomps + } + if (ch != '\u0640' && decomp.charAt(0) == '\u0640') { + skipSet.add(ch); + continue; // skip wierd decomps + } String sortKey = collator.getSortKey(UTF16.valueOf(ch), UCA.NON_IGNORABLE, decomposition); @@ -521,21 +551,97 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON sortKey = remove(sortKey, '\u0020'); decompSortKey = remove(decompSortKey, '\u0020'); } - if (!sortKey.equals(decompSortKey)) { - log.println("" - ); - alreadySeen.add(ch); + + if (sortKey.equals(decompSortKey)) continue; // no problem! + + // fix key in the case of strength 3 + + if (strength == 3) { + String newSortKey = remapSortKey(ch, decomposition); + if (!sortKey.equals(newSortKey)) { + System.out.println("Fixing: " + ucd.getCodeAndName(ch)); + System.out.println(" Old:" + collator.toString(decompSortKey)); + System.out.println(" New: " + collator.toString(newSortKey)); + System.out.println(" Tgt: " + collator.toString(sortKey)); + } + decompSortKey = newSortKey; } + + if (sortKey.equals(decompSortKey)) continue; // no problem! + + log.println("" + ); + alreadySeen.add(ch); + errorCount++; } log.println("
CodeSort KeyDecomposed Sort KeyName
" + Utility.hex(ch) - + "" + UCA.toString(sortKey) - + "" + UCA.toString(decompSortKey) - + "" + ucd.getName(ch) - + "
" + Utility.hex(ch) + + "" + UCA.toString(sortKey) + + "" + UCA.toString(decompSortKey) + + "" + ucd.getName(ch) + + "
"); + log.println("

Errors: " + errorCount + "

"); + log.println("

Space/Tatweel exceptions: " + skipSet.toPattern(true) + "

"); collator.setStrength(oldStrength); Utility.fixDot(); } + static String remapSortKey(int cp, boolean decomposition) { + if (toD.isNormalized(cp)) return remapCanSortKey(cp, decomposition); + + // we know that it is not NFKD. + String canDecomp = toD.normalize(cp); + String result = ""; + int ch; + for (int j = 0; j < canDecomp.length(); j += UTF16.getCharCount(ch)) { + ch = UTF16.charAt(canDecomp, j); + System.out.println("* " + Default.ucd.getCodeAndName(ch)); + String newSortKey = remapCanSortKey(ch, decomposition); + System.out.println("* " + UCA.toString(newSortKey)); + result = mergeSortKeys(result, newSortKey); + System.out.println("= " + UCA.toString(result)); + } + return result; + } + + static String remapCanSortKey(int ch, boolean decomposition) { + String compatDecomp = Default.nfkd.normalize(ch); + String decompSortKey = collator.getSortKey(compatDecomp, UCA.NON_IGNORABLE, decomposition); + + byte type = ucd.getDecompositionType(ch); + int pos = decompSortKey.indexOf(UCA.LEVEL_SEPARATOR) + 1; // after first separator + pos = decompSortKey.indexOf(UCA.LEVEL_SEPARATOR, pos) + 1; // after second separator + String newSortKey = decompSortKey.substring(0, pos); + for (int i = pos; i < decompSortKey.length(); ++i) { + int weight = decompSortKey.charAt(i); + int newWeight = CEList.remap(ch, type, weight); + if (i > pos + 1) newWeight = 0x1F; + newSortKey += (char)newWeight; + } + return newSortKey; + } + + // keys must be of the same strength + static String mergeSortKeys(String key1, String key2) { + StringBuffer result = new StringBuffer(); + int end1 = 0, end2 = 0; + while (true) { + int pos1 = key1.indexOf(UCA.LEVEL_SEPARATOR, end1); + int pos2 = key2.indexOf(UCA.LEVEL_SEPARATOR, end2); + if (pos1 < 0) { + result.append(key1.substring(end1)).append(key2.substring(end2)); + return result.toString(); + } + if (pos2 < 0) { + result.append(key1.substring(end1, pos1)).append(key2.substring(end2)).append(key1.substring(pos1)); + return result.toString(); + } + result.append(key1.substring(end1, pos1)).append(key2.substring(end2, pos2)).append(UCA.LEVEL_SEPARATOR); + end1 = pos1 + 1; + end2 = pos2 + 1; + } + } + + static final String remove (String s, char ch) { StringBuffer buf = new StringBuffer(); for (int i = 0; i < s.length(); ++i) { @@ -630,7 +736,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON Iterator it = forLater.keySet().iterator(); byte oldType = (byte)0xFF; // anything unique int caseCount = 0; - log.println("Generated: " + new Date()); + log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion()); + log.println("Generated: " + getNormalDate()); while (it.hasNext()) { String key = (String) it.next(); byte type = (byte)key.charAt(0); @@ -863,7 +970,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON int[] lenArray = new int[1]; diLog.println("# Contractions"); - diLog.println("# Generated " + new Date()); + diLog.println("# Generated " + getNormalDate()); + diLog.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion()); while (true) { String s = cc.next(ces, lenArray); if (s == null) break; @@ -1409,6 +1517,9 @@ F900..FAFF; CJK Compatibility Ideographs log = Utility.openPrintWriter(filename, false, false); String[] commentText = { + "UCA Rules", + "This file contains the UCA tables for the given version, but transformed into rule syntax.", + "Generated: " + getNormalDate(), "NOTE: Since UCA handles canonical equivalents, no composites are necessary", "(except in extensions).", "For syntax description, see: http://oss.software.ibm.com/icu/userguide/Collate_Intro.html" @@ -1833,6 +1944,24 @@ F900..FAFF; CJK Compatibility Ideographs System.out.println("Test case: " + Utility.hex(s) + ", " + CEList.toString(ces, len)); } backMap.put(new ArrayWrapper((int[])(ces.clone()), 0, len), s); + /* + // HACK until Ken fixes + for (int i = 0; i < len; ++i) { + int ce = ces[i]; + if (collator.isImplicitLeadCE(ce)) { + ++i; + ce = ces[i]; + if (DEBUG + && (UCA.getPrimary(ce) == 0 || UCA.getSecondary(ce) != 0 || UCA.getTertiary(ce) != 0)) { + System.out.println("WEIRD 2nd IMPLICIT: " + + CEList.toString(ces, len) + + ", " + ucd.getCodeAndName(s)); + } + ces[i] = UCA.makeKey(UCA.getPrimary(ce), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); + } + } + backMap.put(new ArrayWrapper((int[])(ces.clone()), 0, len), s); + */ } static int[] ignorableList = { @@ -1915,6 +2044,12 @@ F900..FAFF; CJK Compatibility Ideographs // Try stomping the value to different tertiaries int probe = ces[i]; + if (UCA.isImplicitLeadCE(probe)) { + s = UTF16.valueOf(UCA.ImplicitToCodePoint(UCA.getPrimary(probe), UCA.getPrimary(ces[i+1]))); + ++i; // skip over next item!! + break; + } + char primary = collator.getPrimary(probe); char secondary = collator.getSecondary(probe); @@ -2115,6 +2250,7 @@ F900..FAFF; CJK Compatibility Ideographs static int[] primaryDelta; static void writeFractionalUCA(String filename) throws IOException { + Default.setUCD(); checkImplicit(); checkFixes(); @@ -2345,13 +2481,14 @@ F900..FAFF; CJK Compatibility Ideographs EquivalenceClass secEq = new EquivalenceClass("\r\n#", 2, true); EquivalenceClass terEq = new EquivalenceClass("\r\n#", 2, true); String[] sampleEq = new String[500]; + int[] sampleLen = new int[500]; Iterator it = ordered.keySet().iterator(); int oldFirstPrimary = UCA.getPrimary(UCA.TERMINATOR); boolean wasVariable = false; log.println("# Fractional UCA Table, generated from standard UCA"); - log.println("# M. Davis, " + new Date()); + log.println("# " + getNormalDate()); log.println("# VERSION: UCA=" + collator.getDataVersion() + ", UCD=" + collator.getUCDVersion()); log.println(); log.println("# Generated processed version, as described in ICU design document."); @@ -2389,6 +2526,8 @@ F900..FAFF; CJK Compatibility Ideographs FCE firstTrailing = new FCE(false); FCE lastTrailing = new FCE(true); + + Map backMap = new TreeMap(); while (it.hasNext()) { Object sortKey = it.next(); @@ -2399,19 +2538,19 @@ F900..FAFF; CJK Compatibility Ideographs int firstPrimary = UCA.getPrimary(ces[0]); if (firstPrimary != oldFirstPrimary) { log.println(); - oldFirstPrimary = firstPrimary; boolean isVariable = collator.isVariable(ces[0]); if (isVariable != wasVariable) { if (isVariable) { log.println("# START OF VARIABLE SECTION!!!"); summary.println("# START OF VARIABLE SECTION!!!"); } else { - log.println("[variable top = " + Utility.hex(primaryDelta[firstPrimary]) + "] # END OF VARIABLE SECTION!!!"); + log.println("[variable top = " + Utility.hex(primaryDelta[oldFirstPrimary]) + "] # END OF VARIABLE SECTION!!!"); doVariable = true; } log.println(); } wasVariable = isVariable; + oldFirstPrimary = firstPrimary; } oldStr.setLength(0); chr.getChars(0, chr.length(), codeUnits, 0); @@ -2473,8 +2612,24 @@ F900..FAFF; CJK Compatibility Ideographs if (ter != 0x2) { boolean changed = terEq.add(new Integer(ter), new Integer((pri << 16) | sec)); } - if (sampleEq[sec] == null) sampleEq[sec] = chr; - if (sampleEq[ter] == null) sampleEq[ter] = chr; + + if (sampleEq[sec] == null || sampleLen[sec] > len) { + sampleEq[sec] = chr; + sampleLen[sec] = len; + } + if (sampleEq[ter] == null || sampleLen[sec] > len) { + sampleEq[ter] = chr; + sampleLen[sec] = len; + } + + if ((pri & MARK_CODE_POINT) == 0 && pri == 0) { + Integer key = new Integer(ces[q]); + Pair value = (Pair) backMap.get(key); + if (value == null + || (len < ((Integer)(value.first)).intValue())) { + backMap.put(key, new Pair(new Integer(len), chr)); + } + } // int oldPrimaryValue = UCA.getPrimary(ces[q]); int np = fixPrimary(pri); @@ -2508,38 +2663,76 @@ F900..FAFF; CJK Compatibility Ideographs + "]"); // RECORD STATS + // but ONLY if we are not part of an implicit - if (np == 0 && ns == 0) { - firstSecondaryIgnorable.setValue(np, ns, nt); - lastSecondaryIgnorable.setValue(np, ns, nt); - } else if (np == 0) { - firstPrimaryIgnorable.setValue(np, ns, nt); - lastPrimaryIgnorable.setValue(np, ns, nt); - } else if (collator.isVariable(ces[q])) { - firstVariable.setValue(np, ns, nt); - lastVariable.setValue(np, ns, nt); - } else if (UCA.getPrimary(ces[q]) > UNSUPPORTED_LIMIT) { // Trailing (none currently) - System.out.println("Trailing: " + CEList.toString(ces[q]) - + ", " + Utility.hex(pri) + ", " + Utility.hex(UNSUPPORTED_LIMIT)); - firstTrailing.setValue(np, ns, nt); - lastTrailing.setValue(np, ns, nt); - } else if ((pri & MARK_CODE_POINT) == 0) { // skip implicits - firstNonIgnorable.setValue(np, ns, nt); - lastNonIgnorable.setValue(np, ns, nt); + if ((pri & MARK_CODE_POINT) == 0) { + if (np == 0 && ns == 0) { + firstSecondaryIgnorable.setValue(np, ns, nt); + lastSecondaryIgnorable.setValue(np, ns, nt); + } else if (np == 0) { + firstPrimaryIgnorable.setValue(np, ns, nt); + lastPrimaryIgnorable.setValue(np, ns, nt); + } else if (collator.isVariable(ces[q])) { + firstVariable.setValue(np, ns, nt); + lastVariable.setValue(np, ns, nt); + } else if (UCA.getPrimary(ces[q]) > UNSUPPORTED_LIMIT) { // Trailing (none currently) + System.out.println("Trailing: " + + ucd.getCodeAndName(chr) + ", " + + CEList.toString(ces[q]) + ", " + + Utility.hex(pri) + ", " + + Utility.hex(UNSUPPORTED_LIMIT)); + firstTrailing.setValue(np, ns, nt); + lastTrailing.setValue(np, ns, nt); + } else { + firstNonIgnorable.setValue(np, ns, nt); + lastNonIgnorable.setValue(np, ns, nt); + } } } if (nonePrinted) { log.print("[,,]"); oldStr.append(CEList.toString(0)); } - longLog.print(" # " + oldStr + " # " + ucd.getName(UTF16.charAt(chr, 0))); + longLog.print("\t# " + oldStr + "\t* " + ucd.getName(UTF16.charAt(chr, 0))); log.println(); lastChr = chr; } + // ADD HOMELESS COLLATION ELEMENTS + log.println(); + log.println("# HOMELESS COLLATION ELEMENTS"); + char fakeTrail = 'a'; + Iterator it3 = backMap.keySet().iterator(); + while (it3.hasNext()) { + Integer key = (Integer) it3.next(); + Pair pair = (Pair) backMap.get(key); + if (((Integer)pair.first).intValue() < 2) continue; + String sample = (String)pair.second; + + int ce = key.intValue(); + + int np = fixPrimary(UCA.getPrimary(ce)); + int ns = fixSecondary(UCA.getSecondary(ce)); + int nt = fixTertiary(UCA.getTertiary(ce)); + + newPrimary.setLength(0); + newSecondary.setLength(0); + newTertiary.setLength(0); + + hexBytes(np, newPrimary); + hexBytes(ns, newSecondary); + hexBytes(nt, newTertiary); + + log.print(Utility.hex('\uFDD0' + "" + (char)(fakeTrail++)) + "; " + + "[, " + newSecondary + ", " + newTertiary + "]"); + longLog.print("\t# " + collator.getCEList(sample, true) + "\t* " + ucd.getCodeAndName(sample)); + log.println(); + } + int firstImplicit = getImplicitPrimary(CJK_BASE); int lastImplicit = getImplicitPrimary(0x10FFFF); + log.println(); log.println("# VALUES BASED ON UCA"); log.println("[first tertiary ignorable " + new FCE(false,0,0, 0).formatFCE() + "]"); @@ -2580,16 +2773,17 @@ F900..FAFF; CJK Compatibility Ideographs log.println("[first trailing " + firstTrailing.formatFCE() + "]"); log.println("[last trailing " + lastTrailing.formatFCE() + "]"); + log.println(); log.println("# FIXED VALUES"); - log.println("[top " + Utility.hex(0xA0,2) + "]"); - log.println("[first implicit byte " + Utility.hex(IMPLICIT_BASE_BYTE,2) + "]"); - log.println("[last implicit byte " + Utility.hex(IMPLICIT_LIMIT_BYTE,2) + "]"); - log.println("[first trail byte" + Utility.hex(IMPLICIT_LIMIT_BYTE+1,2) + "]"); - log.println("[last implicit byte" + Utility.hex(SPECIAL_BASE-1,2) + "]"); - log.println("[first special byte" + Utility.hex(SPECIAL_BASE,2) + "]"); - log.println("[last special byte" + Utility.hex(0xFF,2) + "]"); - + log.println("# superceded! [top " + lastNonIgnorable.formatFCE() + "]"); + log.println("[fixed first implicit byte " + Utility.hex(IMPLICIT_BASE_BYTE,2) + "]"); + log.println("[fixed last implicit byte " + Utility.hex(IMPLICIT_LIMIT_BYTE,2) + "]"); + log.println("[fixed first trail byte " + Utility.hex(IMPLICIT_LIMIT_BYTE+1,2) + "]"); + log.println("[fixed last trail byte " + Utility.hex(SPECIAL_BASE-1,2) + "]"); + log.println("[fixed first special byte " + Utility.hex(SPECIAL_BASE,2) + "]"); + log.println("[fixed last special byte " + Utility.hex(0xFF,2) + "]"); + summary.println("Last: " + Utility.hex(lastNp) + ", " + ucd.getCodeAndName(UTF16.charAt(lastChr, 0))); @@ -2636,6 +2830,7 @@ F900..FAFF; CJK Compatibility Ideographs summary.println(); summary.println("# UCA : (FRAC) CODE [ UCA CE ] Name"); summary.println(); + for (int i = 0; i < sampleEq.length; ++i) { if (sampleEq[i] == null) continue; if (i == 0x20) { @@ -2653,6 +2848,7 @@ F900..FAFF; CJK Compatibility Ideographs summary.print(CEList.toString(ces[q])); } summary.println(" " + ucd.getName(sampleEq[i])); + } log.close(); summary.close(); @@ -3379,6 +3575,13 @@ static int swapCJK(int i) { } + static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd','HH:mm:ss' GMT'"); + + static String getNormalDate() { + return myDateFormat.format(new Date()) + " [MD]"; + } + + static void setSingle(char ch, int[] ces) { collator.getCEs(String.valueOf(ch), true, ces); singles.set(UCA.getPrimary(ces[0])); @@ -3396,12 +3599,18 @@ static int swapCJK(int i) { input.close(); } + static UnicodeSet compatibilityExceptions = new UnicodeSet("[\u0CCB\u0DDD\u017F\u1E9B\uFB05]"); + static void writeCollationValidityLog() throws IOException { + Default.setUCD(); //log = new PrintWriter(new FileOutputStream("CheckCollationValidity.html")); log = Utility.openPrintWriter("CheckCollationValidity.html", false, false); - log.println(""); + log.println(""); + log.println("UCA Validity Log"); + log.println(""); + log.println(""); //collator = new UCA(null); @@ -3412,14 +3621,14 @@ static int swapCJK(int i) { } System.out.println("Sorting"); - - for (int i = 0; i <= 0xFFFF; ++i) { + /* + for (int i = 0; i <= 0x10FFFF; ++i) { if (EXCLUDE_UNSUPPORTED && !collator.found.contains(i)) continue; if (0xD800 <= i && i <= 0xF8FF) continue; // skip surrogates and private use //if (0xA000 <= c && c <= 0xA48F) continue; // skip YI addString(UTF32.valueOf32(i), option); } - + */ UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null); cc.enableSamples(); @@ -3469,18 +3678,24 @@ static int swapCJK(int i) { System.out.println("Writing"); String version = collator.getDataVersion(); + log.println("

Collation Validity Checks

"); + log.println(""); + log.println("
Generated: " + getNormalDate() + "
File Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion() + "
"); + + if (GENERATED_NFC_MISMATCHES) showMismatches(); removeAdjacentDuplicates2(); - UnicodeSet exceptions = new UnicodeSet("[\u0CCB\u0DDD\u017F\u1E9B\uFB05]"); - UnicodeSet alreadySeen = new UnicodeSet(exceptions); + UnicodeSet alreadySeen = new UnicodeSet(compatibilityExceptions); checkBadDecomps(1, false, alreadySeen); // if decomposition is off, all primaries should be identical - checkBadDecomps(2, true, alreadySeen); // if decomposition is ON, all primaries and secondaries should be identical + checkBadDecomps(2, false, alreadySeen); // if decomposition is ON, all primaries and secondaries should be identical + checkBadDecomps(3, false, alreadySeen); // if decomposition is ON, all primaries and secondaries should be identical + //checkBadDecomps(2, true, alreadySeen); // if decomposition is ON, all primaries and secondaries should be identical log.println("

Note: characters with decompositions to space + X, and tatweel + X are excluded," - + " as are a few special characters: " + exceptions.toPattern(true) + "

"); + + " as are a few special characters: " + compatibilityExceptions.toPattern(true) + "

"); if (DO_CHARTS) { System.out.println("Charts"); @@ -3564,6 +3779,7 @@ static int swapCJK(int i) { } checkWellformedTable(); + addClosure(); log.println(""); log.close(); @@ -3572,10 +3788,83 @@ static int swapCJK(int i) { } + static void addClosure() { + int canCount = 0; + System.out.println("Add missing decomposibles"); + log.println("

7. Comparing Other Equivalents

"); + log.println("

These are not necessarily errors, but should be examined for possible errors

"); + log.println("

Each of the three strings is canonically equivalent, but has different sort keys

"); + log.println(""); + log.println(""); + + + Set contentsForCanonicalIteration = new TreeSet(); + UCA.UCAContents ucac = collator.getContents(UCA.FIXED_CE, null); // NFD + int ccounter = 0; + while (true) { + Utility.dot(ccounter++); + String s = ucac.next(); + if (s == null) break; + contentsForCanonicalIteration.add(s); + } + + Set additionalSet = new HashSet(); + + System.out.println("Loading canonical iterator"); + if (canIt == null) canIt = new CanonicalIterator("."); + Iterator it2 = contentsForCanonicalIteration.iterator(); + System.out.println("Adding any FCD equivalents that have different sort keys"); + while (it2.hasNext()) { + String key = (String)it2.next(); + if (key == null) { + System.out.println("Null Key"); + continue; + } + canIt.setSource(key); + String nfdKey = toD.normalize(key); + + boolean first = true; + while (true) { + String s = canIt.next(); + if (s == null) break; + if (s.equals(key)) continue; + if (contentsForCanonicalIteration.contains(s)) continue; + if (additionalSet.contains(s)) continue; + + + // Skip anything that is not FCD. + if (!NFD.isFCD(s)) continue; + + // We ONLY add if the sort key would be different + // Than what we would get if we didn't decompose!! + String sortKey = collator.getSortKey(s, UCA.NON_IGNORABLE); + String nonDecompSortKey = collator.getSortKey(s, UCA.NON_IGNORABLE, false); + if (sortKey.equals(nonDecompSortKey)) continue; + + if (DEBUG && first) { + System.out.println(" " + ucd.getCodeAndName(key)); + first = false; + } + log.println(""); + log.println(""); + log.println(""); + log.println(""); + log.println(""); + log.println(""); + log.println(""); + log.println(""); + log.println(""); + additionalSet.add(s); + } + } + log.println("
CountNameCodeSort Keys
" + (++canCount) + "" + Utility.replace(ucd.getName(key), ", ", ",
") + "
" + Utility.hex(key) + "" + collator.toString(sortKey) + "
" + Utility.replace(ucd.getName(nfdKey), ", ", ",
") + "
" + Utility.hex(nfdKey) + "" + collator.toString(sortKey) + "
" + Utility.replace(ucd.getName(s), ", ", ",
") + "
" + Utility.hex(s) + "" + collator.toString(nonDecompSortKey) + "
"); + log.println("

Items: " + canCount + "

"); + } + static void checkWellformedTable() throws IOException { System.out.println("Checking for well-formedness"); - log.println("

5. Checking for well-formedness

"); + log.println("

6. Checking for well-formedness

"); Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION); @@ -3620,6 +3909,7 @@ static int swapCJK(int i) { cc = collator.getContents(UCA.FIXED_CE, nfd); log.println(""); + int lastPrimary = 0; while (true) { String str = cc.next(ces, lenArray); @@ -3632,40 +3922,64 @@ static int swapCJK(int i) { int s = UCA.getSecondary(ce); int t = UCA.getTertiary(ce); + // IF we are at the start of an implicit, then just check that the implicit is in range + // CHECK implicit + if (collator.isImplicitLeadPrimary(lastPrimary)) { + try { + if (s != 0 || t != 0) throw new Exception("Second implicit must be [X,0,0]"); + collator.ImplicitToCodePoint(lastPrimary, p); // throws exception if bad + } catch (Exception e) { + log.println(""); + } + // zap the primary, since we worry about the last REAL primary: + lastPrimary = 0; + continue; + } + + // IF we are in the trailing range, something is wrong. + if (p >= UCA_Types.UNSUPPORTED_LIMIT) { + log.println(""); + lastPrimary = p; + continue; + } + // Check WF#1 if (p != 0 && s == 0) { - log.println(""); - errorCount++; } if (s != 0 && t == 0) { - log.println(""); - errorCount++; } // Check WF#2 if (p != 0) { if (s > minps) { - log.println(""); - errorCount++; } } if (s != 0) { if (t > minpst) { - log.println(""); - errorCount++; } } else { } + + lastPrimary = p; + } } log.println("
" + (++errorCount) + ". BAD IMPLICIT: " + e.getMessage() + + "" + CEList.toString(ces, len) + + "" + ucd.getCodeAndName(str) + "
" + (++errorCount) + ". > " + Utility.hex(UCA_Types.UNSUPPORTED_LIMIT,4) + + "" + CEList.toString(ces, len) + + "" + ucd.getCodeAndName(str) + "
WF1.1" + log.println("
" + (++errorCount) + ". WF1.1" + "" + CEList.toString(ces, len) + "" + ucd.getCodeAndName(str) + "
WF1.2" + log.println("
" + (++errorCount) + ". WF1.2" + "" + CEList.toString(ces, len) + "" + ucd.getCodeAndName(str) + "
WF2.2" + log.println("
" + (++errorCount) + ". WF2.2" + "" + CEList.toString(ces, len) + "" + ucd.getCodeAndName(str) + "
WF2.3" + log.println("
" + (++errorCount) + ". WF2.3" + "" + CEList.toString(ces, len) + "" + ucd.getCodeAndName(str) + "
"); @@ -3679,9 +3993,7 @@ static int swapCJK(int i) { } - if (errorCount > 0) { - log.println("

Well-formedness errors: " + errorCount + "

"); - } + log.println("

Errors: " + errorCount + "

"); } @@ -3738,7 +4050,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; String colDbase = collator.getSortKey(ch, option, true); String colNbase = collator.getSortKey(ch, option, false); String colCbase = collator.getSortKey(toC.normalize(ch), option, false); - if (!colNbase.equals(colCbase)) { + if (!colNbase.equals(colCbase) || !colNbase.equals(colDbase) ) { /*System.out.println(Utility.hex(ch)); System.out.println(printableKey(colNbase)); System.out.println(printableKey(colNbase)); @@ -3770,10 +4082,11 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; String lastChar = ""; int countRem = 0; int countDups = 0; + int errorCount = 0; Iterator it1 = sortedD.keySet().iterator(); Iterator it2 = sortedN.keySet().iterator(); Differ differ = new Differ(250,3); - log.println("

2. Differences in Ordering

"); + log.println("

2. Differences in Ordering

"); log.println("

Codes and names are in the white rows: bold means that the NO-NFD sort key differs from UCA key.

"); log.println("

Keys are in the light blue rows: green is the bad key, blue is UCA, black is where they equal.

"); log.println(""); @@ -3818,6 +4131,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; } log.println(""); } + errorCount++; } //differ.flush(); @@ -3826,6 +4140,8 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; log.println("
"); + log.println("

Errors: " + errorCount + "

"); + //log.println("Removed " + countRem + " adjacent duplicates.
"); System.out.println("Left " + countDups + " conflicts.
"); log.println("Left " + countDups + " conflicts.
"); @@ -3835,10 +4151,12 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; String lastChar = ""; int countRem = 0; int countDups = 0; + int errorCount = 0; Iterator it = sortedD.keySet().iterator(); log.println("

2. Differences in Ordering

"); log.println("

Codes and names are in the white rows: bold means that the NO-NFD sort key differs from UCA key.

"); log.println("

Keys are in the light blue rows: green is the bad key, blue is UCA, black is where they equal.

"); + log.println("

Note: so black lines are generally ok.

"); log.println(""); log.println(""); @@ -3876,9 +4194,11 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; if (!showedLast) { log.println(""); showLine(count-1, lastCh, lastCol, lastColN); + errorCount++; } showedLast = true; showLine(count,ch, col, colN); + errorCount++; } lastCol = col; lastColN = colN; @@ -3886,6 +4206,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; } log.println("
File OrderCode and DecompKey and Decomp-Key
"); + log.println("

Errors: " + errorCount + "

"); } static int compareMinusLast(String a, String b) { @@ -3919,39 +4240,36 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; static final String[] alternateName = {"SHIFTED", "ZEROED", "NON_IGNORABLE", "SHIFTED_TRIMMED"}; static void showMismatches() { - MLStreamWriter out = new MLStreamWriter(log); - out.el("h1").tx("1. Mismatches when NFD is OFF").cl(); - out.el("h2").tx("Date:" + new Date()).cl(); - out.el("h2").tx("File Version:" + UCA.VERSION).cl(); - out.el("p").tx("Alternate Handling = " + alternateName[option]).cl(); - out.el("table").at("border",1); - out.el("caption").tx("Mismatches in UCA-NOD: Plain vs NFC: ").tx(MismatchedC.size()).cl("caption"); - out.el("tr"); - out.el("th").tx("Code").cl(); - out.el("th").tx("Type").cl(); - out.el("th").tx("CC?").cl(); - out.el("th").tx("Key").cl(); - out.cl("tr"); + log.println("

1. Mismatches when NFD is OFF

"); + log.println("

Alternate Handling = " + alternateName[option] + "

"); + log.println("

NOTE: NFD form is used by UCA," + + "so if other forms are different there are ignored. This may indicate a problem, e.g. missing contraction.

"); + log.println(""); + log.println(""); Iterator it = MismatchedC.keySet().iterator(); + int errorCount = 0; while (it.hasNext()) { String ch = (String)it.next(); String MN = (String)MismatchedN.get(ch); String MC = (String)MismatchedC.get(ch); + String MD = (String)MismatchedD.get(ch); String chInC = toC.normalize(ch); - out.el("tr"); - out.el("th").at("rowSpan",2).at("align","right").tx16(ch).tx(' ').tx(ucd.getName(ch)); - out.el("br").cl().tx("NFC=").tx16(chInC).cl(); - out.el("th").tx("Plain").cl(); - out.el("th").tx(containsCombining(ch) ? "y" : "n").cl(); - out.el("td").tx(printableKey(MN)).cl(); - out.cl("tr"); - out.el("tr"); - out.el("th").tx("NFC").cl(); - out.el("th").tx(containsCombining(chInC) ? "Y" : "ERROR").cl(); - out.el("td").tx(printableKey(MC)).cl(); - out.cl("tr"); + String chInD = toD.normalize(ch); + + log.println(""); + + log.println(""); + + log.println(""); + + errorCount++; } - out.closeAllElements(); + log.println("
NameTypeUnicodeKey
" + Utility.replace(ucd.getName(ch), ", ", ",
") + + "
NFD" + Utility.hex(chInD) + + "" + printableKey(MD) + "
NFC" + Utility.hex(chInC) + + "" + printableKey(MC) + "
Plain" + Utility.hex(ch) + + "" + printableKey(MN) + "
"); + log.println("

Errors: " + errorCount + "

"); log.println("
"); }