diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java index 89f93cc0298..5726e81eb2f 100644 --- a/tools/unicodetools/com/ibm/text/UCA/Main.java +++ b/tools/unicodetools/com/ibm/text/UCA/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ -* $Date: 2002/06/22 21:02:16 $ -* $Revision: 1.7 $ +* $Date: 2002/06/28 01:59:58 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -18,7 +18,7 @@ import com.ibm.text.utility.*; public class Main { static final String UCDVersion = ""; - static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA", + static final String[] ICU_FILES = {"writeCollationValidityLog", "writeFractionalUCA", "WriteRules", "WriteRulesWithNames", "WriteRulesXML", "writeconformance", "writeconformanceshifted", "short", "writeconformance", "writeconformanceshifted", @@ -65,7 +65,7 @@ public class Main { else if (arg.equalsIgnoreCase("WriteRulesXML")) WriteCollationData.writeRules(WriteCollationData.IN_XML); else if (arg.equalsIgnoreCase("checkDisjointIgnorables")) WriteCollationData.checkDisjointIgnorables(); else if (arg.equalsIgnoreCase("writeContractions")) WriteCollationData.writeContractions(); - else if (arg.equalsIgnoreCase("FractionalUCA")) WriteCollationData.writeFractionalUCA("FractionalUCA"); + else if (arg.equalsIgnoreCase("writeFractionalUCA")) WriteCollationData.writeFractionalUCA("FractionalUCA"); else if (arg.equalsIgnoreCase("writeConformance")) WriteCollationData.writeConformance("CollationTest_NON_IGNORABLE", UCA.NON_IGNORABLE, shortPrint); else if (arg.equalsIgnoreCase("writeConformanceSHIFTED")) WriteCollationData.writeConformance("CollationTest_SHIFTED", UCA.SHIFTED, shortPrint); else if (arg.equalsIgnoreCase("testCompatibilityCharacters")) WriteCollationData.testCompatibilityCharacters(); @@ -80,7 +80,7 @@ public class Main { System.out.println("UNKNOWN OPTION (" + arg + "): must be one of the following (case-insensitive)"); System.out.println("\tWriteRulesXML, WriteRulesWithNames, WriteRules,"); System.out.println("\tcheckDisjointIgnorables, writeContractions,"); - System.out.println("\tFractionalUCA, writeConformance, writeConformanceSHIFTED, testCompatibilityCharacters,"); + System.out.println("\twriteFractionalUCA, writeConformance, writeConformanceSHIFTED, testCompatibilityCharacters,"); System.out.println("\twriteCollationValidityLog, writeCaseExceptions, writeJavascriptInfo, writeCaseFolding"); System.out.println("\tjavatest, hex (used for conformance)"); } diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index d64497980bb..77b2c04e35c 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2002/06/24 15:25:10 $ -* $Revision: 1.15 $ +* $Date: 2002/06/28 01:59:58 $ +* $Revision: 1.16 $ * ******************************************************************************* */ @@ -75,7 +75,8 @@ final public class UCA implements Comparator, UCA_Types { * Version of the UCA tables to use */ //private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7"; - public static final String VERSION = "-3.1.1d1"; // ""; // "-2.1.9d7"; + public static final String UCA_BASE = "3.1.1"; // ""; // "-2.1.9d7"; + public static final String VERSION = "-" + UCA_BASE + "d5"; // ""; // "-2.1.9d7"; public static final String ALLFILES = "allkeys"; // null if not there /** @@ -240,7 +241,9 @@ final public class UCA implements Comparator, UCA_Types { // add weights char w = getPrimary(ce); if (DEBUG) System.out.println("\tCE: " + Utility.hex(ce)); - if (w != 0) primaries.append(w); + if (w != 0) { + primaries.append(w); + } w = getSecondary(ce); if (w != 0) { @@ -252,9 +255,13 @@ final public class UCA implements Comparator, UCA_Types { } w = getTertiary(ce); - if (w != 0) tertiaries.append(w); + if (w != 0) { + tertiaries.append(w); + } - if (weight4 != 0) quaternaries.append(weight4); + if (weight4 != 0) { + quaternaries.append(weight4); + } } // Produce weight strings @@ -263,13 +270,13 @@ final public class UCA implements Comparator, UCA_Types { StringBuffer result = primaries; if (strength >= 2) { - result.append('\u0000'); // separator + result.append(LEVEL_SEPARATOR); // separator result.append(secondaries); if (strength >= 3) { - result.append('\u0000'); // separator + result.append(LEVEL_SEPARATOR); // separator result.append(tertiaries); if (strength >= 4) { - result.append('\u0000'); // separator + result.append(LEVEL_SEPARATOR); // separator if (alternate == SHIFTED_TRIMMED) { int q; for (q = quaternaries.length()-1; q >= 0; --q) { @@ -303,7 +310,7 @@ final public class UCA implements Comparator, UCA_Types { char c2 = sortKey2.charAt(i); if (c1 < c2) return -strength; if (c1 > c2) return strength; - if (c1 == '\u0000') --strength; // Separator! + if (c1 == LEVEL_SEPARATOR) --strength; // Separator! } if (len1 < len2) return -strength; if (len1 > len2) return strength; @@ -399,15 +406,21 @@ final public class UCA implements Comparator, UCA_Types { * @param source Normal UTF-16 (Java) string * @return sort key (as string) * @author Markus Scherer (cast into Java by MD) + * NOTE: changed to be longer, but handle isolated surrogates */ public static StringBuffer appendInCodePointOrder(String source, StringBuffer target) { - for (int i = 0; i < source.length(); ++i) { - int ch = source.charAt(i); + int cp; + for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(source, i); + target.append((char)((cp >> 15) | 0x8000)); + target.append((char)(cp | 0x8000)); + /* if (ch <= 1) { // hack to avoid nulls target.append('\u0001'); target.append((char)(ch+1)); } target.append((char)(ch + utf16CodePointOrder[ch>>11])); + */ } return target; } @@ -659,9 +672,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] */ /** - * Returns implicit value as pair, first part in high word; second part in low word - * So to get first part use (x >>> 16) -- remember the >>>! - * and to get the second part use (x & 0xFFFF) + * Returns implicit value */ void CodepointToImplicit(int cp, int[] output) { @@ -673,9 +684,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] } /** - * Takes implicit value as pair, first part in high word; second part in low word - * So to get first part use (x >>> 16) -- remember the >>>! - * and to get the second part use (x & 0xFFFF) + * Takes implicit value */ static int ImplicitToCodePoint(int leadImplicit, int trailImplicit) { @@ -997,7 +1006,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] // push BBBB - expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY)); + expandingStack.push(makeKey(implicit[1], 0, 0)); // return AAAA @@ -1127,7 +1136,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] // normal case while (current++ < 0x10FFFF) { - if (current == 0x406) { + if (DEBUG && current == 0xdbff) { System.out.println("DEBUG"); } //char ch = (char)current; diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index 808943150b3..4ca5ac6c342 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2002/06/24 15:25:10 $ -* $Revision: 1.23 $ +* $Date: 2002/06/28 01:59:53 $ +* $Revision: 1.24 $ * ******************************************************************************* */ @@ -25,6 +25,8 @@ import java.io.*; import java.text.RuleBasedCollator; import java.text.CollationElementIterator; import java.text.Collator; +import java.text.DateFormat; +import java.text.SimpleDateFormat; import com.ibm.text.UCD.*; import com.ibm.text.UCD.UCD_Types; @@ -34,7 +36,7 @@ import com.ibm.text.UCD.Normalizer; public class WriteCollationData implements UCD_Types, UCA_Types { static final boolean DEBUG = false; - static final boolean DEBUG_SHOW_ITERATION = true; + static final boolean DEBUG_SHOW_ITERATION = false; @@ -299,18 +301,27 @@ public class WriteCollationData implements UCD_Types, UCA_Types { U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON => U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS, U+0304 COMBINING MACRON */ - String[] testList = {"\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"}; - for (int jj = 0; jj < testList.length; ++jj) { - String t = testList[jj]; - System.out.println(ucd.getCodeAndName(t)); - String test = collator.getSortKey(t, UCA.NON_IGNORABLE); - System.out.println("Decomp: " + collator.toString(test)); - test = collator.getSortKey(t, UCA.NON_IGNORABLE, false); - System.out.println("No Dec: " + collator.toString(test)); + if (DEBUG) { + String[] testList = {"\u3192", "\u3220", "\u0344", "\u0385", "\uF934", "U", "U\u0308", "\u00DC", "\u00DC\u0304", "U\u0308\u0304"}; + for (int jj = 0; jj < testList.length; ++jj) { + String t = testList[jj]; + System.out.println(ucd.getCodeAndName(t)); + + CEList ces = collator.getCEList(t, true); + System.out.println("CEs: " + ces); + + String test = collator.getSortKey(t, option); + System.out.println("Decomp: " + collator.toString(test)); + + test = collator.getSortKey(t, option, false); + System.out.println("No Dec: " + collator.toString(test)); + } } - PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, false); - if (!shortPrint) log.write('\uFEFF'); + PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, true); + //if (!shortPrint) log.write('\uFEFF'); + log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion()); + log.println("# Generated: " + getNormalDate()); System.out.println("Sorting"); int counter = 0; @@ -333,7 +344,6 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON } Utility.dot(counter++); addStringX(s, option); - // TODO: add other accents with Cyrillic } UnicodeSet found = collator.found; @@ -472,7 +482,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON for (int j = 0; j < CONTRACTION_TEST.length; ++j) { String extra = s.substring(0,i) + CONTRACTION_TEST[j] + s.substring(i); addStringY(extra + 'a', option); - System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(extra)); + if (DEBUG) System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(extra)); } } } @@ -488,31 +498,51 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON sortedD.put(colDbase, s); } - + static UCD ucd_uca_base = null; /** * Check that the primaries are the same as the compatibility decomposition. */ static void checkBadDecomps(int strength, boolean decomposition, UnicodeSet alreadySeen) { + if (ucd_uca_base == null) { + ucd_uca_base = UCD.make(UCA.UCA_BASE); + } int oldStrength = collator.getStrength(); collator.setStrength(strength); Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION); - if (strength == 1) { - log.println("
Code | Sort Key | Decomposed Sort Key | Name |
---|---|---|---|
" + Utility.hex(ch) - + " | " + UCA.toString(sortKey) - + " | " + UCA.toString(decompSortKey) - + " | " + ucd.getName(ch) - + " |
" + Utility.hex(ch) + + " | " + UCA.toString(sortKey) + + " | " + UCA.toString(decompSortKey) + + " | " + ucd.getName(ch) + + " |
Errors: " + errorCount + "
"); + log.println("Space/Tatweel exceptions: " + skipSet.toPattern(true) + "
"); collator.setStrength(oldStrength); Utility.fixDot(); } + static String remapSortKey(int cp, boolean decomposition) { + if (toD.isNormalized(cp)) return remapCanSortKey(cp, decomposition); + + // we know that it is not NFKD. + String canDecomp = toD.normalize(cp); + String result = ""; + int ch; + for (int j = 0; j < canDecomp.length(); j += UTF16.getCharCount(ch)) { + ch = UTF16.charAt(canDecomp, j); + System.out.println("* " + Default.ucd.getCodeAndName(ch)); + String newSortKey = remapCanSortKey(ch, decomposition); + System.out.println("* " + UCA.toString(newSortKey)); + result = mergeSortKeys(result, newSortKey); + System.out.println("= " + UCA.toString(result)); + } + return result; + } + + static String remapCanSortKey(int ch, boolean decomposition) { + String compatDecomp = Default.nfkd.normalize(ch); + String decompSortKey = collator.getSortKey(compatDecomp, UCA.NON_IGNORABLE, decomposition); + + byte type = ucd.getDecompositionType(ch); + int pos = decompSortKey.indexOf(UCA.LEVEL_SEPARATOR) + 1; // after first separator + pos = decompSortKey.indexOf(UCA.LEVEL_SEPARATOR, pos) + 1; // after second separator + String newSortKey = decompSortKey.substring(0, pos); + for (int i = pos; i < decompSortKey.length(); ++i) { + int weight = decompSortKey.charAt(i); + int newWeight = CEList.remap(ch, type, weight); + if (i > pos + 1) newWeight = 0x1F; + newSortKey += (char)newWeight; + } + return newSortKey; + } + + // keys must be of the same strength + static String mergeSortKeys(String key1, String key2) { + StringBuffer result = new StringBuffer(); + int end1 = 0, end2 = 0; + while (true) { + int pos1 = key1.indexOf(UCA.LEVEL_SEPARATOR, end1); + int pos2 = key2.indexOf(UCA.LEVEL_SEPARATOR, end2); + if (pos1 < 0) { + result.append(key1.substring(end1)).append(key2.substring(end2)); + return result.toString(); + } + if (pos2 < 0) { + result.append(key1.substring(end1, pos1)).append(key2.substring(end2)).append(key1.substring(pos1)); + return result.toString(); + } + result.append(key1.substring(end1, pos1)).append(key2.substring(end2, pos2)).append(UCA.LEVEL_SEPARATOR); + end1 = pos1 + 1; + end2 = pos2 + 1; + } + } + + static final String remove (String s, char ch) { StringBuffer buf = new StringBuffer(); for (int i = 0; i < s.length(); ++i) { @@ -630,7 +736,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON Iterator it = forLater.keySet().iterator(); byte oldType = (byte)0xFF; // anything unique int caseCount = 0; - log.println("Generated: " + new Date()); + log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion()); + log.println("Generated: " + getNormalDate()); while (it.hasNext()) { String key = (String) it.next(); byte type = (byte)key.charAt(0); @@ -863,7 +970,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON int[] lenArray = new int[1]; diLog.println("# Contractions"); - diLog.println("# Generated " + new Date()); + diLog.println("# Generated " + getNormalDate()); + diLog.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion()); while (true) { String s = cc.next(ces, lenArray); if (s == null) break; @@ -1409,6 +1517,9 @@ F900..FAFF; CJK Compatibility Ideographs log = Utility.openPrintWriter(filename, false, false); String[] commentText = { + "UCA Rules", + "This file contains the UCA tables for the given version, but transformed into rule syntax.", + "Generated: " + getNormalDate(), "NOTE: Since UCA handles canonical equivalents, no composites are necessary", "(except in extensions).", "For syntax description, see: http://oss.software.ibm.com/icu/userguide/Collate_Intro.html" @@ -1833,6 +1944,24 @@ F900..FAFF; CJK Compatibility Ideographs System.out.println("Test case: " + Utility.hex(s) + ", " + CEList.toString(ces, len)); } backMap.put(new ArrayWrapper((int[])(ces.clone()), 0, len), s); + /* + // HACK until Ken fixes + for (int i = 0; i < len; ++i) { + int ce = ces[i]; + if (collator.isImplicitLeadCE(ce)) { + ++i; + ce = ces[i]; + if (DEBUG + && (UCA.getPrimary(ce) == 0 || UCA.getSecondary(ce) != 0 || UCA.getTertiary(ce) != 0)) { + System.out.println("WEIRD 2nd IMPLICIT: " + + CEList.toString(ces, len) + + ", " + ucd.getCodeAndName(s)); + } + ces[i] = UCA.makeKey(UCA.getPrimary(ce), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); + } + } + backMap.put(new ArrayWrapper((int[])(ces.clone()), 0, len), s); + */ } static int[] ignorableList = { @@ -1915,6 +2044,12 @@ F900..FAFF; CJK Compatibility Ideographs // Try stomping the value to different tertiaries int probe = ces[i]; + if (UCA.isImplicitLeadCE(probe)) { + s = UTF16.valueOf(UCA.ImplicitToCodePoint(UCA.getPrimary(probe), UCA.getPrimary(ces[i+1]))); + ++i; // skip over next item!! + break; + } + char primary = collator.getPrimary(probe); char secondary = collator.getSecondary(probe); @@ -2115,6 +2250,7 @@ F900..FAFF; CJK Compatibility Ideographs static int[] primaryDelta; static void writeFractionalUCA(String filename) throws IOException { + Default.setUCD(); checkImplicit(); checkFixes(); @@ -2345,13 +2481,14 @@ F900..FAFF; CJK Compatibility Ideographs EquivalenceClass secEq = new EquivalenceClass("\r\n#", 2, true); EquivalenceClass terEq = new EquivalenceClass("\r\n#", 2, true); String[] sampleEq = new String[500]; + int[] sampleLen = new int[500]; Iterator it = ordered.keySet().iterator(); int oldFirstPrimary = UCA.getPrimary(UCA.TERMINATOR); boolean wasVariable = false; log.println("# Fractional UCA Table, generated from standard UCA"); - log.println("# M. Davis, " + new Date()); + log.println("# " + getNormalDate()); log.println("# VERSION: UCA=" + collator.getDataVersion() + ", UCD=" + collator.getUCDVersion()); log.println(); log.println("# Generated processed version, as described in ICU design document."); @@ -2389,6 +2526,8 @@ F900..FAFF; CJK Compatibility Ideographs FCE firstTrailing = new FCE(false); FCE lastTrailing = new FCE(true); + + Map backMap = new TreeMap(); while (it.hasNext()) { Object sortKey = it.next(); @@ -2399,19 +2538,19 @@ F900..FAFF; CJK Compatibility Ideographs int firstPrimary = UCA.getPrimary(ces[0]); if (firstPrimary != oldFirstPrimary) { log.println(); - oldFirstPrimary = firstPrimary; boolean isVariable = collator.isVariable(ces[0]); if (isVariable != wasVariable) { if (isVariable) { log.println("# START OF VARIABLE SECTION!!!"); summary.println("# START OF VARIABLE SECTION!!!"); } else { - log.println("[variable top = " + Utility.hex(primaryDelta[firstPrimary]) + "] # END OF VARIABLE SECTION!!!"); + log.println("[variable top = " + Utility.hex(primaryDelta[oldFirstPrimary]) + "] # END OF VARIABLE SECTION!!!"); doVariable = true; } log.println(); } wasVariable = isVariable; + oldFirstPrimary = firstPrimary; } oldStr.setLength(0); chr.getChars(0, chr.length(), codeUnits, 0); @@ -2473,8 +2612,24 @@ F900..FAFF; CJK Compatibility Ideographs if (ter != 0x2) { boolean changed = terEq.add(new Integer(ter), new Integer((pri << 16) | sec)); } - if (sampleEq[sec] == null) sampleEq[sec] = chr; - if (sampleEq[ter] == null) sampleEq[ter] = chr; + + if (sampleEq[sec] == null || sampleLen[sec] > len) { + sampleEq[sec] = chr; + sampleLen[sec] = len; + } + if (sampleEq[ter] == null || sampleLen[sec] > len) { + sampleEq[ter] = chr; + sampleLen[sec] = len; + } + + if ((pri & MARK_CODE_POINT) == 0 && pri == 0) { + Integer key = new Integer(ces[q]); + Pair value = (Pair) backMap.get(key); + if (value == null + || (len < ((Integer)(value.first)).intValue())) { + backMap.put(key, new Pair(new Integer(len), chr)); + } + } // int oldPrimaryValue = UCA.getPrimary(ces[q]); int np = fixPrimary(pri); @@ -2508,38 +2663,76 @@ F900..FAFF; CJK Compatibility Ideographs + "]"); // RECORD STATS + // but ONLY if we are not part of an implicit - if (np == 0 && ns == 0) { - firstSecondaryIgnorable.setValue(np, ns, nt); - lastSecondaryIgnorable.setValue(np, ns, nt); - } else if (np == 0) { - firstPrimaryIgnorable.setValue(np, ns, nt); - lastPrimaryIgnorable.setValue(np, ns, nt); - } else if (collator.isVariable(ces[q])) { - firstVariable.setValue(np, ns, nt); - lastVariable.setValue(np, ns, nt); - } else if (UCA.getPrimary(ces[q]) > UNSUPPORTED_LIMIT) { // Trailing (none currently) - System.out.println("Trailing: " + CEList.toString(ces[q]) - + ", " + Utility.hex(pri) + ", " + Utility.hex(UNSUPPORTED_LIMIT)); - firstTrailing.setValue(np, ns, nt); - lastTrailing.setValue(np, ns, nt); - } else if ((pri & MARK_CODE_POINT) == 0) { // skip implicits - firstNonIgnorable.setValue(np, ns, nt); - lastNonIgnorable.setValue(np, ns, nt); + if ((pri & MARK_CODE_POINT) == 0) { + if (np == 0 && ns == 0) { + firstSecondaryIgnorable.setValue(np, ns, nt); + lastSecondaryIgnorable.setValue(np, ns, nt); + } else if (np == 0) { + firstPrimaryIgnorable.setValue(np, ns, nt); + lastPrimaryIgnorable.setValue(np, ns, nt); + } else if (collator.isVariable(ces[q])) { + firstVariable.setValue(np, ns, nt); + lastVariable.setValue(np, ns, nt); + } else if (UCA.getPrimary(ces[q]) > UNSUPPORTED_LIMIT) { // Trailing (none currently) + System.out.println("Trailing: " + + ucd.getCodeAndName(chr) + ", " + + CEList.toString(ces[q]) + ", " + + Utility.hex(pri) + ", " + + Utility.hex(UNSUPPORTED_LIMIT)); + firstTrailing.setValue(np, ns, nt); + lastTrailing.setValue(np, ns, nt); + } else { + firstNonIgnorable.setValue(np, ns, nt); + lastNonIgnorable.setValue(np, ns, nt); + } } } if (nonePrinted) { log.print("[,,]"); oldStr.append(CEList.toString(0)); } - longLog.print(" # " + oldStr + " # " + ucd.getName(UTF16.charAt(chr, 0))); + longLog.print("\t# " + oldStr + "\t* " + ucd.getName(UTF16.charAt(chr, 0))); log.println(); lastChr = chr; } + // ADD HOMELESS COLLATION ELEMENTS + log.println(); + log.println("# HOMELESS COLLATION ELEMENTS"); + char fakeTrail = 'a'; + Iterator it3 = backMap.keySet().iterator(); + while (it3.hasNext()) { + Integer key = (Integer) it3.next(); + Pair pair = (Pair) backMap.get(key); + if (((Integer)pair.first).intValue() < 2) continue; + String sample = (String)pair.second; + + int ce = key.intValue(); + + int np = fixPrimary(UCA.getPrimary(ce)); + int ns = fixSecondary(UCA.getSecondary(ce)); + int nt = fixTertiary(UCA.getTertiary(ce)); + + newPrimary.setLength(0); + newSecondary.setLength(0); + newTertiary.setLength(0); + + hexBytes(np, newPrimary); + hexBytes(ns, newSecondary); + hexBytes(nt, newTertiary); + + log.print(Utility.hex('\uFDD0' + "" + (char)(fakeTrail++)) + "; " + + "[, " + newSecondary + ", " + newTertiary + "]"); + longLog.print("\t# " + collator.getCEList(sample, true) + "\t* " + ucd.getCodeAndName(sample)); + log.println(); + } + int firstImplicit = getImplicitPrimary(CJK_BASE); int lastImplicit = getImplicitPrimary(0x10FFFF); + log.println(); log.println("# VALUES BASED ON UCA"); log.println("[first tertiary ignorable " + new FCE(false,0,0, 0).formatFCE() + "]"); @@ -2580,16 +2773,17 @@ F900..FAFF; CJK Compatibility Ideographs log.println("[first trailing " + firstTrailing.formatFCE() + "]"); log.println("[last trailing " + lastTrailing.formatFCE() + "]"); + log.println(); log.println("# FIXED VALUES"); - log.println("[top " + Utility.hex(0xA0,2) + "]"); - log.println("[first implicit byte " + Utility.hex(IMPLICIT_BASE_BYTE,2) + "]"); - log.println("[last implicit byte " + Utility.hex(IMPLICIT_LIMIT_BYTE,2) + "]"); - log.println("[first trail byte" + Utility.hex(IMPLICIT_LIMIT_BYTE+1,2) + "]"); - log.println("[last implicit byte" + Utility.hex(SPECIAL_BASE-1,2) + "]"); - log.println("[first special byte" + Utility.hex(SPECIAL_BASE,2) + "]"); - log.println("[last special byte" + Utility.hex(0xFF,2) + "]"); - + log.println("# superceded! [top " + lastNonIgnorable.formatFCE() + "]"); + log.println("[fixed first implicit byte " + Utility.hex(IMPLICIT_BASE_BYTE,2) + "]"); + log.println("[fixed last implicit byte " + Utility.hex(IMPLICIT_LIMIT_BYTE,2) + "]"); + log.println("[fixed first trail byte " + Utility.hex(IMPLICIT_LIMIT_BYTE+1,2) + "]"); + log.println("[fixed last trail byte " + Utility.hex(SPECIAL_BASE-1,2) + "]"); + log.println("[fixed first special byte " + Utility.hex(SPECIAL_BASE,2) + "]"); + log.println("[fixed last special byte " + Utility.hex(0xFF,2) + "]"); + summary.println("Last: " + Utility.hex(lastNp) + ", " + ucd.getCodeAndName(UTF16.charAt(lastChr, 0))); @@ -2636,6 +2830,7 @@ F900..FAFF; CJK Compatibility Ideographs summary.println(); summary.println("# UCA : (FRAC) CODE [ UCA CE ] Name"); summary.println(); + for (int i = 0; i < sampleEq.length; ++i) { if (sampleEq[i] == null) continue; if (i == 0x20) { @@ -2653,6 +2848,7 @@ F900..FAFF; CJK Compatibility Ideographs summary.print(CEList.toString(ces[q])); } summary.println(" " + ucd.getName(sampleEq[i])); + } log.close(); summary.close(); @@ -3379,6 +3575,13 @@ static int swapCJK(int i) { } + static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd','HH:mm:ss' GMT'"); + + static String getNormalDate() { + return myDateFormat.format(new Date()) + " [MD]"; + } + + static void setSingle(char ch, int[] ces) { collator.getCEs(String.valueOf(ch), true, ces); singles.set(UCA.getPrimary(ces[0])); @@ -3396,12 +3599,18 @@ static int swapCJK(int i) { input.close(); } + static UnicodeSet compatibilityExceptions = new UnicodeSet("[\u0CCB\u0DDD\u017F\u1E9B\uFB05]"); + static void writeCollationValidityLog() throws IOException { + Default.setUCD(); //log = new PrintWriter(new FileOutputStream("CheckCollationValidity.html")); log = Utility.openPrintWriter("CheckCollationValidity.html", false, false); - log.println(""); + log.println(""); + log.println("Generated: | " + getNormalDate() + " |
File Version: | " + collator.getDataVersion() + "/" + collator.getUCDVersion() + " |
Note: characters with decompositions to space + X, and tatweel + X are excluded," - + " as are a few special characters: " + exceptions.toPattern(true) + "
"); + + " as are a few special characters: " + compatibilityExceptions.toPattern(true) + ""); if (DO_CHARTS) { System.out.println("Charts"); @@ -3564,6 +3779,7 @@ static int swapCJK(int i) { } checkWellformedTable(); + addClosure(); log.println(""); log.close(); @@ -3572,10 +3788,83 @@ static int swapCJK(int i) { } + static void addClosure() { + int canCount = 0; + System.out.println("Add missing decomposibles"); + log.println("These are not necessarily errors, but should be examined for possible errors
"); + log.println("Each of the three strings is canonically equivalent, but has different sort keys
"); + log.println("Count | Name | Code | Sort Keys |
---|---|---|---|
" + (++canCount) + " | " + Utility.replace(ucd.getName(key), ", ", ", ") + " | ");
+ log.println("" + Utility.hex(key) + " | "); + log.println("" + collator.toString(sortKey) + " |
" + Utility.replace(ucd.getName(nfdKey), ", ", ", ") + " | ");
+ log.println("" + Utility.hex(nfdKey) + " | "); + log.println("" + collator.toString(sortKey) + " | |
" + Utility.replace(ucd.getName(s), ", ", ", ") + " | ");
+ log.println("" + Utility.hex(s) + " | "); + log.println("" + collator.toString(nonDecompSortKey) + " |
Items: " + canCount + "
"); + } + static void checkWellformedTable() throws IOException { System.out.println("Checking for well-formedness"); - log.println("" + (++errorCount) + ". BAD IMPLICIT: " + e.getMessage() + + " | " + CEList.toString(ces, len) + + " | " + ucd.getCodeAndName(str) + " |
" + (++errorCount) + ". > " + Utility.hex(UCA_Types.UNSUPPORTED_LIMIT,4) + + " | " + CEList.toString(ces, len) + + " | " + ucd.getCodeAndName(str) + " |
WF1.1" + log.println(" | ||
" + (++errorCount) + ". WF1.1" + " | " + CEList.toString(ces, len) + " | " + ucd.getCodeAndName(str) + " |
WF1.2" + log.println(" | ||
" + (++errorCount) + ". WF1.2" + " | " + CEList.toString(ces, len) + " | " + ucd.getCodeAndName(str) + " |
WF2.2" + log.println(" | ||
" + (++errorCount) + ". WF2.2" + " | " + CEList.toString(ces, len) + " | " + ucd.getCodeAndName(str) + " |
WF2.3" + log.println(" | ||
" + (++errorCount) + ". WF2.3" + " | " + CEList.toString(ces, len) + " | " + ucd.getCodeAndName(str) + " |
Well-formedness errors: " + errorCount + "
"); - } + log.println("Errors: " + errorCount + "
"); } @@ -3738,7 +4050,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; String colDbase = collator.getSortKey(ch, option, true); String colNbase = collator.getSortKey(ch, option, false); String colCbase = collator.getSortKey(toC.normalize(ch), option, false); - if (!colNbase.equals(colCbase)) { + if (!colNbase.equals(colCbase) || !colNbase.equals(colDbase) ) { /*System.out.println(Utility.hex(ch)); System.out.println(printableKey(colNbase)); System.out.println(printableKey(colNbase)); @@ -3770,10 +4082,11 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; String lastChar = ""; int countRem = 0; int countDups = 0; + int errorCount = 0; Iterator it1 = sortedD.keySet().iterator(); Iterator it2 = sortedN.keySet().iterator(); Differ differ = new Differ(250,3); - log.println("Codes and names are in the white rows: bold means that the NO-NFD sort key differs from UCA key.
"); log.println("Keys are in the light blue rows: green is the bad key, blue is UCA, black is where they equal.
"); log.println("Errors: " + errorCount + "
"); + //log.println("Removed " + countRem + " adjacent duplicates.Codes and names are in the white rows: bold means that the NO-NFD sort key differs from UCA key.
"); log.println("Keys are in the light blue rows: green is the bad key, blue is UCA, black is where they equal.
"); + log.println("Note: so black lines are generally ok.
"); log.println("File Order | Code and Decomp | Key and Decomp-Key |
---|---|---|
Errors: " + errorCount + "
"); } static int compareMinusLast(String a, String b) { @@ -3919,39 +4240,36 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; static final String[] alternateName = {"SHIFTED", "ZEROED", "NON_IGNORABLE", "SHIFTED_TRIMMED"}; static void showMismatches() { - MLStreamWriter out = new MLStreamWriter(log); - out.el("h1").tx("1. Mismatches when NFD is OFF").cl(); - out.el("h2").tx("Date:" + new Date()).cl(); - out.el("h2").tx("File Version:" + UCA.VERSION).cl(); - out.el("p").tx("Alternate Handling = " + alternateName[option]).cl(); - out.el("table").at("border",1); - out.el("caption").tx("Mismatches in UCA-NOD: Plain vs NFC: ").tx(MismatchedC.size()).cl("caption"); - out.el("tr"); - out.el("th").tx("Code").cl(); - out.el("th").tx("Type").cl(); - out.el("th").tx("CC?").cl(); - out.el("th").tx("Key").cl(); - out.cl("tr"); + log.println("Alternate Handling = " + alternateName[option] + "
"); + log.println("NOTE: NFD form is used by UCA," + + "so if other forms are different there are ignored. This may indicate a problem, e.g. missing contraction.
"); + log.println("Name | Type | Unicode | Key |
---|---|---|---|
" + Utility.replace(ucd.getName(ch), ", ", ", ") + + " | NFD | " + Utility.hex(chInD) + + " | " + printableKey(MD) + " |
NFC | " + Utility.hex(chInC) + + " | " + printableKey(MC) + " | |
Plain | " + Utility.hex(ch) + + " | " + printableKey(MN) + " |
Errors: " + errorCount + "
"); log.println("