From ac3cc9119ba6328bac86fb16732b94a6f0e71cb4 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Wed, 6 Apr 2005 08:48:17 +0000 Subject: [PATCH] ICU-0 updates for uca 4.1.0 X-SVN-Rev: 17468 --- .../ibm/icu/dev/test/util/BagFormatter.java | 81 ++++++++++++---- .../icu/dev/test/util/UnicodeProperty.java | 93 ++++++++++++------ .../com/ibm/text/UCA/GenOverlap.java | 14 +-- .../com/ibm/text/UCA/Implicit.java | 10 +- tools/unicodetools/com/ibm/text/UCA/Main.java | 71 +++++++++++--- tools/unicodetools/com/ibm/text/UCA/UCA.java | 96 +++++++++++++++---- .../com/ibm/text/UCA/UCA_Types.java | 12 +-- .../com/ibm/text/UCA/WriteCharts.java | 31 +++--- .../com/ibm/text/UCA/WriteCollationData.java | 83 +++++++++++----- .../com/ibm/text/UCD/TestData.java | 80 ++++++++++++---- .../com/ibm/text/UCD/idn-charsHeader.html | 19 +++- 11 files changed, 440 insertions(+), 150 deletions(-) diff --git a/icu4j/src/com/ibm/icu/dev/test/util/BagFormatter.java b/icu4j/src/com/ibm/icu/dev/test/util/BagFormatter.java index 1c7dd1b80c0..206d2f5fc89 100644 --- a/icu4j/src/com/ibm/icu/dev/test/util/BagFormatter.java +++ b/icu4j/src/com/ibm/icu/dev/test/util/BagFormatter.java @@ -46,7 +46,10 @@ public class BagFormatter { "'>' > '>' ;"; private static final String HTML_RULES = BASE_RULES + CONTENT_RULES + - "'\"' > '"' ; "; + "'\"' > '"' ; "; + + private static final String HTML_RULES_CONTROLS = HTML_RULES + + "([[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]]) > &hex/unicode($1) ; "; private static final String XML_RULES = HTML_RULES + "'' > ''' ; "; @@ -94,6 +97,8 @@ the double-quote character (") as """. public static final Transliterator toHTML = Transliterator.createFromRules( "any-html", HTML_RULES, Transliterator.FORWARD); + public static final Transliterator toHTMLControl = Transliterator.createFromRules( + "any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD); public static final Transliterator fromHTML = Transliterator.createFromRules( "html-any", HTML_RULES, Transliterator.REVERSE); @@ -151,6 +156,14 @@ the double-quote character (") as """. return result.getBuffer().toString(); } + public void showSetDifferences( + PrintWriter pw, + String name1, + UnicodeSet set1, + String name2, + UnicodeSet set2) { + showSetDifferences(pw, name1, set1, name2, set2, -1); + } /** * Compare two UnicodeSets, and show the differences * @param name1 name of first set to be compared @@ -164,24 +177,37 @@ the double-quote character (") as """. String name1, UnicodeSet set1, String name2, - UnicodeSet set2) { + UnicodeSet set2, + int flags) + { if (pw == null) pw = CONSOLE; String[] names = { name1, name2 }; - UnicodeSet temp = new UnicodeSet(set1).removeAll(set2); - pw.println(); - pw.println(inOut.format(names)); - showSetNames(pw, temp); + UnicodeSet temp; + + if ((flags&1) != 0) { + temp = new UnicodeSet(set1).removeAll(set2); + pw.print(lineSeparator); + pw.print(inOut.format(names)); + pw.print(lineSeparator); + showSetNames(pw, temp); + } - temp = new UnicodeSet(set2).removeAll(set1); - pw.println(); - pw.println(outIn.format(names)); - showSetNames(pw, temp); + if ((flags&2) != 0) { + temp = new UnicodeSet(set2).removeAll(set1); + pw.print(lineSeparator); + pw.print(outIn.format(names)); + pw.print(lineSeparator); + showSetNames(pw, temp); + } - temp = new UnicodeSet(set2).retainAll(set1); - pw.println(); - pw.println(inIn.format(names)); - showSetNames(pw, temp); + if ((flags&4) != 0) { + temp = new UnicodeSet(set2).retainAll(set1); + pw.print(lineSeparator); + pw.print(inIn.format(names)); + pw.print(lineSeparator); + showSetNames(pw, temp); + } pw.flush(); } @@ -397,12 +423,14 @@ the double-quote character (") as """. // refactored public String getName(int codePoint, boolean withCodePoint) { - return getNameSource().getValue(codePoint, !withCodePoint); + String result = getNameSource().getValue(codePoint, !withCodePoint); + return fixName == null ? result : fixName.transliterate(result); } public String getName(String s, boolean withCodePoint) { - return getNameSource().getValue(s, separator, !withCodePoint); - } + String result = getNameSource().getValue(s, separator, !withCodePoint); + return fixName == null ? result : fixName.transliterate(result); + } public String hex(String s) { return hex(s,separator); @@ -445,6 +473,7 @@ the double-quote character (") as """. private boolean mergeRanges = true; private Transliterator showLiteral = null; + private Transliterator fixName = null; private boolean showSetAlso = false; private RangeFinder rf = new RangeFinder(); @@ -580,10 +609,16 @@ the double-quote character (") as """. doAt((Visitor.CodePointRange) o); } else { String thing = o.toString(); + String value = getValueSource() == UnicodeLabel.NULL ? "" : getValueSource().getValue(thing, ",", true); + if (value.length() != 0) value = "\t; " + value; + String label = getLabelSource(true).getValue(thing, ",", true); + if (label.length() != 0) label = " " + label; output.print( myTabber.process( hex(thing) + + value + commentSeparator + + label + insertLiteral(thing) + "\t" + getName(thing)) @@ -1095,4 +1130,16 @@ the double-quote character (") as """. return this; } + /** + * @return Returns the fixName. + */ + public Transliterator getFixName() { + return fixName; + } + /** + * @param fixName The fixName to set. + */ + public void setFixName(Transliterator fixName) { + this.fixName = fixName; + } } diff --git a/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java b/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java index c6deb9d47e8..a78f9fb4de0 100644 --- a/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java +++ b/icu4j/src/com/ibm/icu/dev/test/util/UnicodeProperty.java @@ -121,7 +121,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { public List getValueAliases(String valueAlias, List result) { if (result == null) result = new ArrayList(1); result = _getValueAliases(valueAlias, result); - if (!result.contains(valueAlias) && type < NUMERIC) { + if (!result.contains(valueAlias) ) { // FIX && type < NUMERIC result = _getValueAliases(valueAlias, result); // for debugging throw new IllegalArgumentException( "Internal error: " + getName() + " doesn't contain " + valueAlias @@ -609,6 +609,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { } private class PropertySymbolTable implements SymbolTable { + static final boolean DEBUG = false; private String prefix; RegexMatcher regexMatcher = new RegexMatcher(); @@ -698,7 +699,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { int i; for (i = start; i < limit; i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(text, i); - if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) { + if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp) && cp != '.') { break; } } @@ -876,7 +877,9 @@ public abstract class UnicodeProperty extends UnicodeLabel { public static abstract class BaseProperty extends UnicodeProperty { protected List propertyAliases = new ArrayList(1); - String version; + protected Map toValueAliases; + protected String version; + public BaseProperty setMain(String alias, String shortAlias, int propertyType, String version) { setName(alias); @@ -893,12 +896,56 @@ public abstract class UnicodeProperty extends UnicodeLabel { addAllUnique(propertyAliases, result); return result; } + public BaseProperty addValueAliases(String[][] valueAndAlternates) { + if (toValueAliases == null) _fixValueAliases(); + for (int i = 0; i < valueAndAlternates.length; ++i) { + for (int j = 1; j < valueAndAlternates[0].length; ++j) { + addValueAlias(valueAndAlternates[i][0], valueAndAlternates[i][j]); + } + } + return this; + } + public void addValueAlias(String value, String valueAlias) { + List result = (List) toValueAliases.get(value); + addUnique(value, result); + addUnique(valueAlias, result); + } + protected List _getValueAliases(String valueAlias, List result) { + if (toValueAliases == null) _fixValueAliases(); + List a = (List) toValueAliases.get(valueAlias); + if (a != null) addAllUnique(a, result); + return result; + } + + protected void _fixValueAliases() { + if (toValueAliases == null) toValueAliases = new HashMap(1); + for (Iterator it = getAvailableValues().iterator(); it.hasNext();) { + Object value = it.next(); + List result; + _ensureValueInAliases(value); + } + } + protected void _ensureValueInAliases(Object value) { + List result = (List) toValueAliases.get(value); + if (result == null) toValueAliases.put(value, result = new ArrayList(1)); + addUnique(value, result); + } + public BaseProperty swapFirst2ValueAliases() { + for (Iterator it = toValueAliases.keySet().iterator(); it.hasNext();) { + List list = (List) toValueAliases.get(it.next()); + if (list.size() < 2) continue; + Object first = list.get(0); + list.set(0, list.get(1)); + list.set(1, first); + } + return this; + } + } public static abstract class SimpleProperty extends BaseProperty { List values; - Map toValueAliases = new HashMap(1); public SimpleProperty addName(String alias) { propertyAliases.add(alias); @@ -918,62 +965,52 @@ public abstract class UnicodeProperty extends UnicodeLabel { } return this; } - + public SimpleProperty setValues(List valueAliases) { this.values = new ArrayList(valueAliases); for (Iterator it = this.values.iterator(); it.hasNext(); ) { - _addToValues(it.next(), null); + _addToValues((String)it.next(), null); } return this; } - public List _getValueAliases(String valueAlias, List result) { - if (toValueAliases == null) _fillValues(); - List a = (List) toValueAliases.get(valueAlias); - if (a != null) addAllUnique(a, result); - return result; - } - public List _getAvailableValues(List result) { if (values == null) _fillValues(); result.addAll(values); return result; } - private void _fillValues() { + + protected void _fillValues() { List newvalues = (List) getUnicodeMap().getAvailableValues(new ArrayList()); for (Iterator it = newvalues.iterator(); it.hasNext();) { - _addToValues(it.next(), null); + _addToValues((String)it.next(), null); } } - - private void _addToValues(Object item, Object alias) { + + private void _addToValues(String item, String alias) { if (values == null) values = new ArrayList(1); + if (toValueAliases == null) _fixValueAliases(); addUnique(item, values); - List aliases = (List) toValueAliases.get(item); - if (aliases == null) { - aliases = new ArrayList(1); - toValueAliases.put(item, aliases); - } - addUnique(alias, aliases); - addUnique(item, aliases); + _ensureValueInAliases(item); + addValueAlias(item, alias); } - public String _getVersion() { +/* public String _getVersion() { return version; } - } +*/ } public static class UnicodeMapProperty extends BaseProperty { protected UnicodeMap unicodeMap; protected String _getValue(int codepoint) { return (String) unicodeMap.getValue(codepoint); } - protected List _getValueAliases(String valueAlias, List result) { +/* protected List _getValueAliases(String valueAlias, List result) { if (!unicodeMap.getAvailableValues().contains(valueAlias)) return result; result.add(valueAlias); return result; // no other aliases } - protected List _getAvailableValues(List result) { +*/ protected List _getAvailableValues(List result) { return (List) unicodeMap.getAvailableValues(result); } } diff --git a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java index b8594df26bf..d9898edf514 100644 --- a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java +++ b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $ -* $Date: 2004/02/07 01:01:12 $ -* $Revision: 1.12 $ +* $Date: 2005/04/06 08:48:16 $ +* $Revision: 1.13 $ * ******************************************************************************* */ @@ -164,8 +164,8 @@ public class GenOverlap implements UCD_Types, UCA_Types { static boolean PROGRESS = false; static void fullCheck() throws IOException { - PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "Overlap.html", Utility.UTF8_WINDOWS); - PrintWriter simpleList = Utility.openPrintWriter(UCA_GEN_DIR, "Overlap.txt", Utility.UTF8_WINDOWS); + PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.html", Utility.UTF8_WINDOWS); + PrintWriter simpleList = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.txt", Utility.UTF8_WINDOWS); Iterator it = completes.keySet().iterator(); int counter = 0; @@ -448,7 +448,7 @@ public class GenOverlap implements UCD_Types, UCA_Types { newKeys.removeAll(joint); oldKeys.removeAll(joint); - PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS); + PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS); Iterator it = list.iterator(); int last = -1; while (it.hasNext()) { @@ -631,7 +631,7 @@ public class GenOverlap implements UCD_Types, UCA_Types { System.out.println("Data Gathered"); - PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "checkstringsearchhash.html", Utility.UTF8_WINDOWS); + PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "checkstringsearchhash.html", Utility.UTF8_WINDOWS); Utility.writeHtmlHeader(log, "Check Hash"); log.println("

Collisions

"); log.println("

Shows collisions among primary values when hashed to table size = " + tableLength + "."); @@ -694,7 +694,7 @@ public class GenOverlap implements UCD_Types, UCA_Types { } public static void listCyrillic(UCA collatorIn) throws IOException { - PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "ListCyrillic.txt", Utility.UTF8_WINDOWS); + PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "ListCyrillic.txt", Utility.UTF8_WINDOWS); Set set = new TreeSet(collatorIn); Set set2 = new TreeSet(collatorIn); ucd = UCD.make(); diff --git a/tools/unicodetools/com/ibm/text/UCA/Implicit.java b/tools/unicodetools/com/ibm/text/UCA/Implicit.java index 23db9b7b730..9850719a22c 100644 --- a/tools/unicodetools/com/ibm/text/UCA/Implicit.java +++ b/tools/unicodetools/com/ibm/text/UCA/Implicit.java @@ -168,7 +168,7 @@ public class Implicit implements UCD_Types { */ public Implicit(int minPrimary, int maxPrimary) { // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. - this(minPrimary, maxPrimary, 0x03, 0xFE, 1, 1); + this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1); } /** @@ -181,6 +181,14 @@ public class Implicit implements UCD_Types { * @param primaries3count number of 3-byte primarys we can use (normally 1) */ public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) { + if (DEBUG) { + System.out.println("minPrimary: " + Utility.hex(minPrimary)); + System.out.println("maxPrimary: " + Utility.hex(maxPrimary)); + System.out.println("minTrail: " + Utility.hex(minTrail)); + System.out.println("maxTrail: " + Utility.hex(maxTrail)); + System.out.println("gap3: " + Utility.hex(gap3)); + System.out.println("primaries3count: " + primaries3count); + } // some simple parameter checks if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) throw new IllegalArgumentException("bad lead bytes"); if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) throw new IllegalArgumentException("bad trail bytes"); diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java index 779c0e8d40c..f914671af27 100644 --- a/tools/unicodetools/com/ibm/text/UCA/Main.java +++ b/tools/unicodetools/com/ibm/text/UCA/Main.java @@ -5,19 +5,24 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ -* $Date: 2004/01/15 01:08:30 $ -* $Revision: 1.18 $ +* $Date: 2005/04/06 08:48:16 $ +* $Revision: 1.19 $ * ******************************************************************************* */ package com.ibm.text.UCA; +import java.io.File; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.CanonicalIterator; +import com.ibm.icu.text.UTF16; import com.ibm.text.UCD.*; import com.ibm.text.utility.*; public class Main { - static final String UCDVersion = "4.0.0"; + //static final String UCDVersion = "4.0.0"; static final String[] ICU_FILES = {"writeCollationValidityLog", "writeFractionalUCA", "WriteRules", "WriteRulesXML", "writeconformance", "writeconformanceshifted", "short", @@ -28,18 +33,10 @@ public class Main { }; public static void main(String args[]) throws Exception { - + //checkCanonicalIterator(); // NOTE: so far, we don't need to build the UCA with anything but the latest versions. // A few changes would need to be made to the code to do older versions. try { - System.out.println("Building UCA"); - Default.setUCD(UCDVersion); - WriteCollationData.collator = new UCA(null, UCDVersion); - System.out.println("Built version " + WriteCollationData.collator.getDataVersion() - + "/ucd: " + WriteCollationData.collator.getUCDVersion()); - - System.out.println("Building UCD data"); - WriteCollationData.ucd = UCD.make(WriteCollationData.collator.getUCDVersion()); if (args.length == 0) args = new String[] {"?"}; // force the help comment boolean shortPrint = false; @@ -54,7 +51,22 @@ public class Main { args = Utility.append(ICU_FILES, Utility.subarray(args, i+1)); i = -1; continue; - } + } + if (arg.equalsIgnoreCase("version")) { + Default.setUCD(args[++i]); // get next arg + continue; + } + if (WriteCollationData.collator == null) { + System.out.println("Building UCA"); + String file = Utility.searchDirectory(new File(UCD_Types.BASE_DIR + "UCA\\" + Default.ucdVersion() + "\\"), "allkeys", true, ".txt"); + WriteCollationData.collator = new UCA(file, Default.ucdVersion()); + System.out.println("Built version " + WriteCollationData.collator.getDataVersion() + + "/ucd: " + WriteCollationData.collator.getUCDVersion()); + + System.out.println("Building UCD data"); + WriteCollationData.ucd = UCD.make(WriteCollationData.collator.getUCDVersion()); + + } if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(WriteCollationData.collator); else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(WriteCollationData.collator); //else if (arg.equalsIgnoreCase("writeNonspacingDifference")) WriteCollationData.writeNonspacingDifference(); @@ -125,4 +137,37 @@ public class Main { */ } } + + /** + * + */ + private static void checkCanonicalIterator() { + + int firstImplicit = WriteCollationData.getImplicitPrimary(UCD_Types.CJK_BASE); + System.out.println("UCD_Types.CJK_BASE: " + Utility.hex(UCD_Types.CJK_BASE)); + System.out.println("first implicit: " + Utility.hex((long)(firstImplicit & 0xFFFFFFFFL))); + + CanonicalIterator it = new CanonicalIterator(""); + String[] tests = new String[] {"\uF900"}; + for (int j = 0; j < tests.length; ++j) { + System.out.println(tests[j]); + it.setSource(tests[j]); + String ss; + for (int i = 0; (ss = it.next()) != null; ++i) { + System.out.println(i + "\t" + Utility.hex(ss)); + } + } + if (true) throw new IllegalArgumentException(); + for (int i = 0; i < 0x10FFFF; ++i) { + int cat = UCharacter.getType(i); + if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) continue; + String s = UTF16.valueOf(i); + try { + it.setSource(s); + } catch (RuntimeException e) { + System.out.println("Failure with U+" + Utility.hex(i)); + e.printStackTrace(); + } + } + } } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index c0c3bedc8da..1516c7e7be4 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2004/02/06 18:32:03 $ -* $Revision: 1.23 $ +* $Date: 2005/04/06 08:48:16 $ +* $Revision: 1.24 $ * ******************************************************************************* */ @@ -14,6 +14,8 @@ package com.ibm.text.UCA; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.io.BufferedReader; import java.io.Reader; import java.io.PrintWriter; @@ -108,13 +110,16 @@ final public class UCA implements Comparator, UCA_Types { // Main Methods // ============================================================= + private String fileVersion = "??"; + /** * Initializes the collation from a stream of rules in the normal formal. * If the source is null, uses the normal Unicode data files, which * need to be in BASE_DIR. */ - public UCA(BufferedReader source, String unicodeVersion) throws java.io.IOException { - fullData = source == null; + public UCA(String sourceFile, String unicodeVersion) throws java.io.IOException { + fullData = sourceFile == null; + fileVersion = sourceFile; // load the normalizer if (toD == null) { @@ -127,15 +132,19 @@ final public class UCA implements Comparator, UCA_Types { ucaData = new UCA_Data(toD, ucd); // either get the full sources, or just a demo set - if (fullData) { +/* if (fullData) { for (int i = 0; i < KEYS.length; ++i) { BufferedReader in = new BufferedReader( new FileReader(KEYS[i]), BUFFER_SIZE); addCollationElements(in); in.close(); } - } else { - addCollationElements(source); + } else */ + { + BufferedReader in = new BufferedReader( + new FileReader(sourceFile), BUFFER_SIZE); + addCollationElements(in); + in.close(); } cleanup(); } @@ -830,16 +839,17 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] /** * List of files to use for constructing the CE data, used by build() */ - private static final String[] KEYS = { + +/* private static final String[] KEYS = { //"D:\\UnicodeData\\testkeys.txt", - BASE_DIR + "Collation\\allkeys" + VERSION + ".txt", - /* + BASE_DIR + "UCA\\allkeys" + VERSION + ".txt", + BASE_DIR + "UnicodeData\\Collation\\basekeys" + VERSION + ".txt", BASE_DIR + "UnicodeData\\Collation\\compkeys" + VERSION + ".txt", BASE_DIR + "UnicodeData\\Collation\\ctrckeys" + VERSION + ".txt", - */ + }; - +*/ /** * File buffer size, used to make reads faster. */ @@ -1089,6 +1099,13 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] static boolean haveUnspecified = false; static UnicodeSet unspecified = new UnicodeSet(); + UnicodeSet variantSecondaries = new UnicodeSet(0x0153,0x0154); + UnicodeSet digitSecondaries = new UnicodeSet(0x155,0x017F); + UnicodeSet homelessSecondaries; + + // static UnicodeSet homelessSecondaries = new UnicodeSet(0x0176, 0x0198); + // 0x0153..0x017F + public class UCAContents { int current = -1; @@ -1130,9 +1147,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] /** * use FIXED_CE as the limit + * @param newValue TODO */ - public void enableSamples() { - doSamples = true; + public void setDoEnableSamples(boolean newValue) { + doSamples = newValue; } /** @@ -1179,7 +1197,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] if (!haveUnspecified) { if (DEBUG) System.out.println("Specified = " + unspecified.toPattern(true)); UnicodeSet temp = new UnicodeSet(); - for (int i = 0; i < 0x10ffff; ++i) { + for (int i = 0; i <= 0x10ffff; ++i) { if (!ucd.isAllocated(i)) continue; if (!unspecified.contains(i)) { temp.add(i); @@ -1265,6 +1283,12 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] return true; } + /** + * @return Returns the doSamples. + */ + public boolean isDoSamples() { + return doSamples; + } } static final int[][] SAMPLE_RANGES = { @@ -1312,6 +1336,14 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] while (true) try { inputLine = in.readLine(); if (inputLine == null) break; // means file is done + + // HACK + if (inputLine.startsWith("# Variant secondaries:")) { + variantSecondaries = extractSet(inputLine); + } else if (inputLine.startsWith("# Digit secondaries:")) { + digitSecondaries = extractSet(inputLine); + } + String line = cleanLine(inputLine); // remove comments, extra whitespace if (line.length() == 0) continue; // skip empty lines @@ -1407,7 +1439,18 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] } } - /* + /** + * + */ + private UnicodeSet extractSet(String inputLine) { + //# Variant secondaries: 0177..017B (5) + //# Digit secondaries: 017C..0198 (29) + Matcher m = Pattern.compile(".*:\\s*([0-9A-Fa-f]+)\\.\\.([0-9A-Fa-f]+).*").matcher(""); + if (!m.reset(inputLine).matches()) throw new IllegalArgumentException("Failed to recognized special Ken lines: " + inputLine); + return new UnicodeSet(Integer.parseInt(m.group(1),16), Integer.parseInt(m.group(2),16)); + } + + /* private void concat(int[] ces1, int[] ces2) { } @@ -1737,4 +1780,25 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] uniqueTable.put(ceObj, new Character(value)); } } +/** + * @return Returns the fileVersion. + */ +public String getFileVersion() { + return fileVersion; +} +/** + * @return Returns the uCA_GEN_DIR. + */ +public String getUCA_GEN_DIR() { + return BASE_UCA_GEN_DIR + getDataVersion() + "\\"; +} + + + /** + * @return Returns the homelessSecondaries. + */ + public UnicodeSet getHomelessSecondaries() { + if (homelessSecondaries == null) homelessSecondaries = new UnicodeSet(variantSecondaries).addAll(digitSecondaries); + return homelessSecondaries; + } } diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA_Types.java b/tools/unicodetools/com/ibm/text/UCA/UCA_Types.java index cfc07810d98..bf700a7ea94 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA_Types.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Types.java,v $ -* $Date: 2004/01/13 18:32:11 $ -* $Revision: 1.6 $ +* $Date: 2005/04/06 08:48:17 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -20,11 +20,11 @@ public interface UCA_Types { * Version of the UCA tables to use */ //private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7"; - public static final String UCA_BASE = "4.0.0"; // "3.1.1"; // ; // ""; // "-2.1.9d7"; - public static final String VERSION = "-" + UCA_BASE; // + "d6" ""; // "-2.1.9d7"; + //public static final String UCA_BASE = "4.1.0"; // "3.1.1"; // ; // ""; // "-2.1.9d7"; + //public static final String VERSION = "-" + UCA_BASE; // + "d6" ""; // "-2.1.9d7"; public static final String ALLFILES = "allkeys"; // null if not there - public static final String UCA_GEN_DIR = UCD_Types.GEN_DIR + "collation_" + UCA_BASE + "\\"; + public static final String BASE_UCA_GEN_DIR = UCD_Types.GEN_DIR + "collation" + "\\"; public static final char LEVEL_SEPARATOR = '\u0000'; /** * Expanding characters are marked with a exception bit combination @@ -94,5 +94,5 @@ public interface UCA_Types { CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7, FIXED_CE = 3; // SURROGATE_CE = 6, - + } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java index b73744279a4..5f3642241f6 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $ -* $Date: 2004/02/12 08:23:19 $ -* $Revision: 1.20 $ +* $Date: 2005/04/06 08:48:17 $ +* $Revision: 1.21 $ * ******************************************************************************* */ @@ -29,6 +29,7 @@ import java.text.SimpleDateFormat; public class WriteCharts implements UCD_Types { + static String WORKING_DIR = ".\\com\\ibm\\text\\UCA\\"; static boolean HACK_KANA = false; static public void special() { @@ -50,7 +51,7 @@ public class WriteCharts implements UCD_Types { //Normalizer nfc = new Normalizer(Normalizer.NFC); UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps - cc.enableSamples(); + cc.setDoEnableSamples(true); Set set = new TreeSet(); @@ -84,12 +85,12 @@ public class WriteCharts implements UCD_Types { String[] replacement = new String[] {"%%%", "Collation Charts"}; String folder = "charts\\uca\\"; - Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement); - Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css"); - Utility.copyTextFile("help.html", Utility.UTF8, folder + "help.html"); + Utility.copyTextFile(WORKING_DIR + "index.html", Utility.UTF8, folder + "index.html", replacement); + Utility.copyTextFile(WORKING_DIR + "charts.css", Utility.LATIN1, folder + "charts.css"); + Utility.copyTextFile(WORKING_DIR + "help.html", Utility.UTF8, folder + "help.html"); indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS); - Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement); + Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement); /* indexFile.println(""); @@ -100,6 +101,7 @@ public class WriteCharts implements UCD_Types { indexFile.println("

Help"); */ + int lastCp = -1; while (it.hasNext()) { Utility.dot(counter); @@ -110,6 +112,7 @@ public class WriteCharts implements UCD_Types { int cp = UTF16.charAt(s,0); byte script = Default.ucd().getScript(cp); + if (cp == 0x1DBF) script = UCD.GREEK_SCRIPT; // 4.1.0 hack // get first non-zero primary int currentPrimary = getFirstPrimary(sortKey); @@ -128,6 +131,7 @@ public class WriteCharts implements UCD_Types { if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT; else if ((script == INHERITED_SCRIPT || script == COMMON_SCRIPT) && oldScript >= 0) script = oldScript; + int veryOldScript = oldScript; if (script != oldScript // && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT) ) { @@ -140,7 +144,9 @@ public class WriteCharts implements UCD_Types { ++scriptCount[script+3]; if (scriptCount[script+3] > 1) { System.out.println("\t\tFAIL: " + scriptCount[script+3] + ", " + - getChunkName(script, LONG) + ", " + Default.ucd().getCodeAndName(s)); + getChunkName(script, LONG) + ", " + Default.ucd().getCodeAndName(s) + + " - last char: " + + getChunkName(veryOldScript, LONG) + ", " + Default.ucd().getCodeAndName(lastCp)); } output = openFile(scriptCount[script+3], folder, script); } @@ -179,6 +185,7 @@ public class WriteCharts implements UCD_Types { output.println(breaker + outline); ++columnCount; + lastCp = cp; } closeFile(output); @@ -265,7 +272,7 @@ public class WriteCharts implements UCD_Types { Utility.copyTextFile("norm_help.html", Utility.UTF8, folder + "help.html"); indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS); - Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement); + Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement); /* indexFile.println(""); @@ -373,7 +380,7 @@ public class WriteCharts implements UCD_Types { Utility.copyTextFile("case_help.html", Utility.UTF8, folder + "help.html"); indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS); - Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement); + Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement); /* indexFile.println(""); @@ -485,7 +492,7 @@ public class WriteCharts implements UCD_Types { Utility.copyTextFile("script_help.html", Utility.UTF8, folder + "help.html"); indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS); - Utility.appendFile("script_index_header.html", Utility.UTF8, indexFile, replacement); + Utility.appendFile(WORKING_DIR + "script_index_header.html", Utility.UTF8, indexFile, replacement); /* indexFile.println(""); @@ -607,7 +614,7 @@ public class WriteCharts implements UCD_Types { Utility.copyTextFile("name_help.html", Utility.UTF8, folder + "help.html"); indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS); - Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement); + Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement); int columnCount = 0; char lastInitial = 0; diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index 1c25f6469c3..5eb49d77a44 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2004/02/07 01:01:11 $ -* $Revision: 1.39 $ +* $Date: 2005/04/06 08:48:17 $ +* $Revision: 1.40 $ * ******************************************************************************* */ @@ -17,6 +17,9 @@ import java.util.*; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.CanonicalIterator; +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.UnicodeProperty; +import com.ibm.icu.dev.test.util.UnicodePropertySource; import com.ibm.icu.impl.UCharacterProperty; import java.io.*; @@ -36,6 +39,8 @@ import com.ibm.text.UCD.Normalizer; public class WriteCollationData implements UCD_Types, UCA_Types { + // may require fixing + static final boolean DEBUG = false; static final boolean DEBUG_SHOW_ITERATION = false; @@ -145,7 +150,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types { BufferedReader in = Utility.openUnicodeFile("CaseFolding", UNICODE_VERSION, true, Utility.LATIN1); // new BufferedReader(new FileReader(DIR31 + "CaseFolding-3.d3.alpha.txt"), 64*1024); // log = new PrintWriter(new FileOutputStream("CaseFolding_data.js")); - log = Utility.openPrintWriter(UCA_GEN_DIR, "CaseFolding_data.js", Utility.UTF8_WINDOWS); + log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "CaseFolding_data.js", Utility.UTF8_WINDOWS); log.println("var CF = new Object();"); int count = 0; while (true) { @@ -190,7 +195,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types { //Normalizer normKD = new Normalizer(Normalizer.NFKD, UNICODE_VERSION); //Normalizer normD = new Normalizer(Normalizer.NFD, UNICODE_VERSION); //log = new PrintWriter(new FileOutputStream("Normalization_data.js")); - log = Utility.openPrintWriter(UCA_GEN_DIR, "Normalization_data.js", Utility.LATIN1_WINDOWS); + log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Normalization_data.js", Utility.LATIN1_WINDOWS); int count = 0; @@ -319,7 +324,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON } String fullFileName = filename + (shortPrint ? "_SHORT" : "") + ".txt"; - PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS); + PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), fullFileName, Utility.UTF8_WINDOWS); //if (!shortPrint) log.write('\uFEFF'); writeVersionAndDate(log, fullFileName); @@ -327,7 +332,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON int counter = 0; UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null); - cc.enableSamples(); + cc.setDoEnableSamples(true); UnicodeSet found2 = new UnicodeSet(); while (true) { @@ -711,7 +716,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON static void testCompatibilityCharacters() throws IOException { String fullFileName = "UCA_CompatComparison.txt"; - log = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS); + log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), fullFileName, Utility.UTF8_WINDOWS); int[] kenCes = new int[50]; int[] markCes = new int[50]; @@ -1191,7 +1196,13 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON while (it.hasNext()) { if (result.length() != 0) result.append(";
"); Object item = it.next(); - if (m != null) item = m.get(item); + if (m != null) { + Object item2 = m.get(item); + if (item2 != null) item = item2; + else { + System.out.println("Missing Item: " + item); + } + } if (useName) item = ucd.getCodeAndName(item.toString()); result.append(item); } @@ -1207,7 +1218,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON 32*1024)); */ String fullFileName = "UCA_Contractions.txt"; - PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS); + PrintWriter diLog = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), fullFileName, Utility.UTF8_WINDOWS); diLog.write('\uFEFF'); @@ -1246,7 +1257,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON "UTF8"), 32*1024)); */ - PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, "DisjointIgnorables.js", Utility.UTF8_WINDOWS); + PrintWriter diLog = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "DisjointIgnorables.js", Utility.UTF8_WINDOWS); diLog.write('\uFEFF'); @@ -1425,7 +1436,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON "UTF8"), 32*1024)); */ - PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, "DisjointIgnorables2.js", Utility.UTF8_WINDOWS); + PrintWriter diLog = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "DisjointIgnorables2.js", Utility.UTF8_WINDOWS); diLog.write('\uFEFF'); @@ -1637,7 +1648,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON if (UCA.isImplicitLeadCE(ces[0])) { expansionStart = 2; // move up if first is double-ce } - if (len > expansionStart && homelessSecondaries.contains(UCA.getSecondary(ces[expansionStart]))) { + if (len > expansionStart && collator.getHomelessSecondaries().contains(UCA.getSecondary(ces[expansionStart]))) { if (log2 != null) log2.println("Homeless: " + CEList.toString(ces, len)); ++expansionStart; // move up if *second* is homeless ignoreable } @@ -1674,7 +1685,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON int[] lenArray = new int[1]; Set alreadyDone = new HashSet(); - log2 = Utility.openPrintWriter(UCA_GEN_DIR, "UCARules-log.txt", Utility.UTF8_WINDOWS); + log2 = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "UCARules-log.txt", Utility.UTF8_WINDOWS); while (true) { String s = cc.next(ces, lenArray); @@ -1799,7 +1810,7 @@ F900..FAFF; CJK Compatibility Ideographs if (noCE) filename += "_NoCE"; if (option == IN_XML) filename += ".xml"; else filename += ".txt"; - log = Utility.openPrintWriter(UCA_GEN_DIR, filename, Utility.UTF8_WINDOWS); + log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), filename, Utility.UTF8_WINDOWS); String[] commentText = { "UCA Rules", @@ -2316,8 +2327,6 @@ F900..FAFF; CJK Compatibility Ideographs } - static UnicodeSet homelessSecondaries = new UnicodeSet(0x0153,0x017F); - /*static int[] ignorableList = new int[homelessSecondaries.size()]; static { @@ -2396,7 +2405,7 @@ F900..FAFF; CJK Compatibility Ideographs } if (s == null) { do { - if (homelessSecondaries.contains(UCA.getSecondary(ces[i]))) { + if (collator.getHomelessSecondaries().contains(UCA.getSecondary(ces[i]))) { s = ""; if (rel[0] > 1) rel[0] = 1; // HACK break; @@ -2846,11 +2855,11 @@ F900..FAFF; CJK Compatibility Ideographs Utility.fixDot(); System.out.println("Writing"); - PrintWriter shortLog = new PrintWriter(new BufferedWriter(new FileWriter(UCA_GEN_DIR + filename + "_SHORT.txt"), 32*1024)); - PrintWriter longLog = new PrintWriter(new BufferedWriter(new FileWriter(UCA_GEN_DIR + filename + ".txt"), 32*1024)); + PrintWriter shortLog = new PrintWriter(new BufferedWriter(new FileWriter(collator.getUCA_GEN_DIR() + filename + "_SHORT.txt"), 32*1024)); + PrintWriter longLog = new PrintWriter(new BufferedWriter(new FileWriter(collator.getUCA_GEN_DIR() + filename + ".txt"), 32*1024)); log = new PrintWriter(new DualWriter(shortLog, longLog)); - PrintWriter summary = new PrintWriter(new BufferedWriter(new FileWriter(UCA_GEN_DIR + filename + "_summary.txt"), 32*1024)); + PrintWriter summary = new PrintWriter(new BufferedWriter(new FileWriter(collator.getUCA_GEN_DIR() + filename + "_summary.txt"), 32*1024)); //log.println("[Variable Low = " + UCA.toString(collator.getVariableLow()) + "]"); //log.println("[Variable High = " + UCA.toString(collator.getVariableHigh()) + "]"); @@ -3976,7 +3985,7 @@ F900..FAFF; CJK Compatibility Ideographs static void writeCollationValidityLog() throws IOException { //log = new PrintWriter(new FileOutputStream("CheckCollationValidity.html")); - log = Utility.openPrintWriter(UCA_GEN_DIR, "CheckCollationValidity.html", Utility.UTF8_WINDOWS); + log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "CheckCollationValidity.html", Utility.UTF8_WINDOWS); log.println(""); log.println("UCA Validity Log"); @@ -4002,15 +4011,18 @@ F900..FAFF; CJK Compatibility Ideographs */ UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null); - cc.enableSamples(); + //cc.setDoEnableSamples(true); + UnicodeSet coverage = new UnicodeSet(); while (true) { String s = cc.next(); if (s == null) break; addString(s, option); + coverage.add(s); } - + System.out.println("Total: " + sortedD.size()); + Iterator it; //ucd.init(); @@ -4051,7 +4063,10 @@ F900..FAFF; CJK Compatibility Ideographs log.println("

Collation Validity Checks

"); log.println(""); - log.println("
Generated: " + getNormalDate() + "
File Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion() + "
"); + log.println("Unicode Version: " + collator.getUCDVersion()); + log.println("UCA Data Version (@version in file): " + collator.getDataVersion()); + log.println("UCA File Name: " + collator.getFileVersion()); + log.println(""); if (collator.getDataVersion() == UCA.BADVERSION) { log.println(SERIOUS_ERROR); @@ -4076,6 +4091,24 @@ F900..FAFF; CJK Compatibility Ideographs addClosure(); writeDuplicates(); writeOverlap(); + + log.println("

Coverage

"); + BagFormatter bf = new BagFormatter(); + bf.setLineSeparator("
\r\n"); + ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(""); + bf.setUnicodePropertyFactory(ups); + bf.setShowLiteral(bf.toHTML); + bf.setFixName(bf.toHTML); + UCD ucd = Default.ucd(); + UnicodeProperty cat = ups.getProperty("gc"); + UnicodeSet ucd410 = cat.getSet("Cn") + .addAll(cat.getSet("Co")) + .addAll(cat.getSet("Cs")) + .complement() + //.addAll(ups.getSet("Noncharactercodepoint=true")) + //.addAll(ups.getSet("Default_Ignorable_Code_Point=true")) + ; + bf.showSetDifferences(log, "UCD4.1.0", ucd410, "UCA4.1.0", coverage, 3); log.println(""); log.close(); @@ -4670,7 +4703,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;; static PrintWriter writeHead(int counter, int end, String title, String other, String version, boolean show) throws IOException { - PrintWriter out = Utility.openPrintWriter(UCA_GEN_DIR, title + pad(counter) + ".html", Utility.UTF8_WINDOWS); + PrintWriter out = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), title + pad(counter) + ".html", Utility.UTF8_WINDOWS); copyFile(out, "HTML-Part1.txt"); /* diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java index 73400fb19af..35a780541bb 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestData.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $ -* $Date: 2005/03/30 17:19:32 $ -* $Revision: 1.20 $ +* $Date: 2005/04/06 08:48:17 $ +* $Revision: 1.21 $ * ******************************************************************************* */ @@ -21,6 +21,7 @@ import java.text.SimpleDateFormat; import com.ibm.icu.dev.test.util.BagFormatter; import com.ibm.icu.dev.test.util.ICUPropertyFactory; import com.ibm.icu.dev.test.util.UnicodeLabel; +import com.ibm.icu.dev.test.util.UnicodeMap; import com.ibm.icu.dev.test.util.UnicodeProperty; import com.ibm.icu.impl.ICUData; import com.ibm.icu.impl.ICUResourceBundle; @@ -153,17 +154,23 @@ public class TestData implements UCD_Types { static class GenStringPrep { UnicodeSet[] coreChars = new UnicodeSet[100]; UnicodeSet decomposable = new UnicodeSet(); + UnicodeMap suspect = new UnicodeMap(); ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(""); //UnicodeSet id_continue = ups.getSet("ID_Continue=true"); - UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()) - .retainAll(ups.getSet("gc=Sk")) - .addAll(new UnicodeSet("[\u0027 \u002D \u002E \u003A \u00B7 \u058A \u05F3" + - " \u05F4 \u200C \u200D \u2010 \u2019 \u2027 \u30A0]")); + UnicodeSet xid_continue = ups.getSet("XID_Continue=true"); + UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()); + { + wordChars.retainAll(ups.getSet("gc=Sk")); + wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" + + " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0]")); + //wordChars.removeAll(xid_continue); + } UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars); + UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement(); - UnicodeSet not_xid_continue = ups.getSet("XID_Continue=true").complement().removeAll(wordChars); + UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars); //UnicodeSet[] decompChars = new UnicodeSet[100]; UCD ucd = Default.ucd(); @@ -180,7 +187,8 @@ public class TestData implements UCD_Types { "[[:Bidi_Class=AL:][:Bidi_Class=R:]]"); UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]"); - UnicodeSet hasUpper = new UnicodeSet(); + UnicodeSet hasNoUpper = new UnicodeSet(); + UnicodeSet hasNoUpperMinus = new UnicodeSet(); BagFormatter bf = new BagFormatter(); UnicodeSet inIDN = new UnicodeSet(); @@ -200,16 +208,16 @@ public class TestData implements UCD_Types { if (!Default.nfd().isNormalized(cp)) decomposable.add(cp); int idnaType = getIDNAType(cp); idnaTypeSet[idnaType].add(cp); + String str = UTF16.valueOf(cp); + if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp); int script = ucd.getScript(cp); if (coreChars[script] == null) coreChars[script] = new UnicodeSet(); coreChars[script].add(cp); } - // find characters with no uppercase - for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) { - String str = UTF16.valueOf(it.codepoint); - if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(it.codepoint); - } + // fix characters with no uppercase + hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars); + System.out.println(bf.showSetNames(hasNoUpper)); Utility.fixDot(); PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html"); @@ -245,6 +253,23 @@ public class TestData implements UCD_Types { showCodes(htmlOut, textOut, INHERITED_SCRIPT); htmlOut.println(""); htmlOut.close(); + bf.setMergeRanges(false); + + textOut.println(); + textOut.println("# *** WORD CHARACTERS ADDED ***"); + bf.setValueSource("word-chars"); + bf.showSetNames(textOut, wordChars); + + textOut.println(); + textOut.println("# *** FOR REVIEW (collected from above) ***"); + bf.setLabelSource(UnicodeLabel.NULL); + for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) { + textOut.println(); + String value = (String)it.next(); + bf.setValueSource(value); + bf.showSetNames(textOut, suspect.getSet(value)); + } + textOut.close(); } UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT]; @@ -302,25 +327,38 @@ public class TestData implements UCD_Types { UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core); UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core); + UnicodeSet remappedIsNFKC = extract(isNFKC, remapped); + UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC); + UnicodeSet decomp = extract(decomposable, core); UnicodeSet pattern = extract(patternProp, core); UnicodeSet non_id = extract(not_xid_continue, core); - UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper); - core.removeAll(otherCore); - if (core.size() == 0) { - UnicodeSet temp = core; - core = otherCore; - otherCore = temp; + UnicodeSet bicameralNoupper = new UnicodeSet(); + if (!hasNoUpper.containsAll(core)) { + bicameralNoupper = extract(hasNoUpperMinus, core); + } + + UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id); + for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) { + String cat = Default.ucd().getCategoryID(it.codepoint); + String name = Default.ucd().getName(it.codepoint); + if (name.indexOf("MUSICAL SYMBOL") >= 0 + || name.indexOf("DINGBA") >= 0 + || name.indexOf("RADICAL ") >= 0 + ) cat = "XX"; + suspect.put(it.codepoint, cat); } if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode); - if (otherCore.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", otherCore, scriptCode); + if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode); if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode); if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode); if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode); - if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped", remapped, scriptCode); + if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode); + if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Decomposable", remappedIsNFKCDecomp, scriptCode); + if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode); if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode); if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode); } diff --git a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html index b989b5d28b1..5ae33e840a4 100644 --- a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html +++ b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html @@ -14,7 +14,9 @@ .Non-XID { background-color: #FFCCCC } .Decomposable { background-color: #FFFFCC } .Pattern_Syntax { background-color: #FFCCFF } -.IDN-Remapped { background-color: #FF6666 } +.IDN-Remapped-Case-Atomic { background-color: #CCFFFF } +.IDN-Remapped-Case-Decomposable { background-color: #66FFFF } +.IDN-Remapped-Compat { background-color: #FF6666 } .IDN-Deleted { background-color: #66FF66 } .IDN-Illegal { background-color: #6666FF } th { text-align: left } @@ -25,7 +27,7 @@ th { text-align: left }

IDN Character Categorization

-

$Date: 2005/03/30 17:19:32 $, MED

+

$Date: 2005/04/06 08:48:17 $, MED

This page lists all of the valid output IDN characters broken down by category. By "output" IDN characters, we mean ones that can result from nameprep. Characters are grouped first by script, and then by subcategory. Within each subcategory characters are sorted according to the default @@ -69,8 +71,17 @@ and name (in enabled browsers).

Characters with NFC decompositions. - IDN-Remapped - Characters remapped by IDN. + IDN-Remapped + Characters remapped by IDN due to case folding + + + IDN-Remapped + Characters remapped by IDN due to case folding, that are decomposable. + + IDN-Remapped-Case-Decomposable + + IDN-Remapped + Characters remapped by IDN due to compatibility mapping. IDN-Deleted