diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java b/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java new file mode 100644 index 00000000000..d5c1146ae19 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java @@ -0,0 +1,480 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $ +* $Date: 2005/05/27 21:40:51 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +import com.ibm.icu.dev.test.util.ArrayComparator; +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.UnicodeMap; +import com.ibm.icu.dev.test.util.UnicodePropertySource; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.text.utility.Utility; + +public class GenerateConfusables { + static PrintWriter log; + static final String ARROW = "\u2192"; + + static class Data2 { + String source; + String target; + int count; + Data2(String target, int count) { + this.target = target; + this.count = count; + } + } + + static ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(""); + static UnicodeSet skipSet = ups.getSet("gc=Cn").addAll(ups.getSet("gc=Co")).addAll(ups.getSet("gc=Cc")).addAll(ups.getSet("gc=Cf")); + + static class Data implements Comparable { + String source; + String target; + 
String type; + Data(String source, String target, String type) { + this.source = source; + this.target = target; + this.type = type; + } + public int compareTo(Object o) { + int result; + Data that = (Data)o; + if (0 != (result = target.compareTo(that.target))) return result; + if (0 != (result = source.compareTo(that.source))) return result; + if (0 != (result = type.compareTo(that.type))) return result; + return 0; + } + } + + static UnicodeSet controls = new UnicodeSet("[:Cc:]"); + + static class DataSet { + Set dataSet = new TreeSet(); + Map dataMap = new TreeMap(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(), new UTF16.StringComparator()})); + + public DataSet add(String source, String target, String type, String errorLine) { + if (skipSet.containsAll(source) || skipSet.containsAll(target)) return this; + String nsource = Default.nfkd().normalize(source); + String ntarget = Default.nfkd().normalize(target); + + // if it is just a compatibility match, return + if (nsource.equals(ntarget)) return this; + + if (type.startsWith("confusables-")) type = type.substring("confusables-".length()); + if (type.endsWith(".txt")) type = type.substring(0,type.length() - ".txt".length()); + + // if it is base + combining sequence => base2 + same combining sequence, do just the base + int nsourceFirst = UTF16.charAt(nsource,0); + String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst)); + int ntargetFirst = UTF16.charAt(ntarget,0); + String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst)); + if (nsourceRest.length() != 0 && nsourceRest.equals(ntargetRest)) { + source = UTF16.valueOf(nsourceFirst); + target = UTF16.valueOf(ntargetFirst); + type += "-base"; + } + + // swap order + if (preferSecondAsSource(source, target)) { + String temp = target; + target = source; + source = temp; + } + if (target.indexOf('\u203D') >= 0) type += "-skip"; + Data newData = new Data(source, target, type); + return add(newData, errorLine); + } + 
/** + * @param errorLine TODO + * + */ + private DataSet add(Data newData, String errorLine) { + if (controls.containsSome(newData.source) || controls.containsSome(newData.target)) { + System.out.println("Problem with " + errorLine); + System.out.println(getCodeCharName(newData.source) + " => " + getCodeCharName(newData.target)); + } + String[] key = {newData.source, newData.target}; + Data old = (Data) dataMap.get(key); + if (old == null) { + dataSet.add(newData); + dataMap.put(key, newData); + }else { + old.type = old.type + "/" + newData.type; + } + return this; + } + // Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt" + static final int NORMAL = 0, FOLDING = 1, OLD = 2; + + public DataSet addFile(String directory, String filename) throws IOException { + BufferedReader in = BagFormatter.openUTF8Reader(directory, filename); + int kind = NORMAL; + if (filename.indexOf("Folding") >= 0) kind = FOLDING; + else if (false && filename.indexOf("-old") >= 0) kind = OLD; + while (true) { + String line = Utility.readDataLine(in); + if (line == null) break; + if (line.length() == 0) continue; + String[] pieces = Utility.split(line,';'); + if (pieces.length < 2) { + System.out.println("Error on: " + line); + continue; + } + String type = filename; + if (kind==FOLDING) { + String source = Utility.fromHex(pieces[0].trim(),true); + String target = Utility.fromHex(pieces[1].trim(),true); + String nsource = Default.nfkd().normalize(source); + String first = UTF16.valueOf(UTF16.charAt(nsource, 0)); + if (!first.equals(target)) { + add(source, target, type, line); + } + } else if (kind == OLD) { + String target = pieces[0].trim(); + for (int i = 1; i < pieces.length; ++i) { + add(pieces[i].trim(), target, type, line); + } + } else { + String source = Utility.fromHex(pieces[0].trim(),true); + String target = Utility.fromHex(pieces[1].trim(),true); + if (pieces.length > 2) type = pieces[2].trim(); + add(source, target, type, line); + } + } + in.close(); + return this; + } + 
public void write(String directory, String filename, boolean appendFile) throws IOException { + PrintWriter out = BagFormatter.openUTF8Writer(directory, filename); + if (appendFile) { + String[] replacements = {"%date%", Default.getDate()}; + Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt", + Utility.UTF8_WINDOWS, out, replacements); + } + for (Iterator it = dataSet.iterator(); it.hasNext();) { + Data item = (Data) it.next(); + out.println( + Utility.hex(item.source) + + " ;\t" + Utility.hex(item.target) + + " ;\t" + item.type + + "\t# " + + "( " + item.source + " " + ARROW + " " + item.target + ") " + + Default.ucd().getName(item.source) + " " + ARROW + " " + + Default.ucd().getName(item.target)); + + } + out.close(); + } + /** + * + */ + public void add(DataSet ds) { + for (Iterator it = ds.dataSet.iterator(); it.hasNext();) { + add((Data)it.next(), ""); + } + } + public DataSet clean() { + // remove all skips + DataSet tempSet = new DataSet(); + Map m = new HashMap(); + for (Iterator it = dataSet.iterator(); it.hasNext();) { + Data d = (Data) it.next(); + if (d.type.indexOf("skip") >= 0) continue; + String newTarget = Default.nfkd().normalize(d.target); + String newSource = Default.nfkd().normalize(d.source); + String type = d.type; + if (!d.target.equals(newTarget) || !d.source.equals(newSource)) { + type += "-nf"; + log.println("Norm:\t" + getCodeCharName(d.source) + " " + ARROW + " " + getCodeCharName(newSource)); + log.println("\t" + getCodeCharName(d.target) + " " + ARROW + " " + getCodeCharName(newTarget) + " \t" + type); + continue; + } + // swap order + if (preferSecondAsSource(newSource, newTarget)) { + String temp = newTarget; + newTarget = newSource; + newSource = temp; + } + + Data already = (Data) m.get(newSource); + if (already != null && !newTarget.equals(already.target)) { + log.println("X " + getCodeCharName(newSource) + " " + ARROW); + log.println("\t" + getCodeCharName(newTarget) + " \t" + type); + log.println("\t" + 
getCodeCharName(already.target) + " \t" + already.type); + if (preferSecondAsSource(already.target, newTarget)) { + // just fix new guy + type += "[" + newSource + "]" + already.type; + newSource = newTarget; + newTarget = already.target; + } else { + // need to fix new guy, AND fix old guy. + tempSet.remove(already); + type += "[" + newSource + "]" + already.type; + newSource = already.target; + already.type += "[" + already.target + "]" + type; + already.target = newTarget; + tempSet.add(already, ""); + } + } + Data newData = new Data(newSource, newTarget, type); + m.put(newSource, newData); + tempSet.add(newData, ""); + } + // now recursively apply + DataSet s = new DataSet(); + for (Iterator it = tempSet.dataSet.iterator(); it.hasNext();) { + Data d = (Data) it.next(); + int cp = 0; + StringBuffer result = new StringBuffer(); + for (int i = 0; i < d.target.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(d.target, i); + String src = UTF16.valueOf(cp); + while (true) { + Data rep = (Data) m.get(src); + if (rep == null) break; + src = rep.target; + } + result.append(src); + } + String newTarget = result.toString(); + newTarget = Default.nfkd().normalize(newTarget); + s.add(d.source, newTarget, d.type + (newTarget.equals(newTarget) ? 
"" : "-rec"), ""); + } + return s; + } + /** + * + */ + private void remove(Data already) { + String[] key = {already.source, already.target}; + dataMap.remove(key); + dataSet.remove(already); + } + } + public static void main(String[] args) throws IOException { + String indir = Utility.BASE_DIR + "confusables/"; + String outdir = Utility.GEN_DIR + "confusables/"; + log = BagFormatter.openUTF8Writer(outdir, "log.txt"); + //fixMichel(indir, outdir); + generateConfusables(indir, outdir); + log.close(); + System.out.println("Done"); + } + /** + * @throws IOException + * + */ + private static void fixMichel(String indir, String outdir) throws IOException { + BufferedReader in = BagFormatter.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt"); + PrintWriter out = BagFormatter.openUTF8Writer(outdir, "new-tr36comments-annex.txt"); + while (true) { + String line = Utility.readDataLine(in); + if (line == null) break; + String[] pieces = Utility.split(line,'\t'); + if (pieces.length < 2) { + out.println(line); + continue; + } + String source = Utility.fromHex(pieces[0].trim()); + if (Default.nfkd().isNormalized(source)) { + out.println(line); + } + } + in.close(); + out.close(); + } + /** + * + */ + private static void generateConfusables(String indir, String outdir) throws IOException { + File dir = new File(indir); + String[] names = dir.list(); + DataSet total = new DataSet(); + for (int i = 0; i < names.length; ++i) { + if (new File(indir + names[i]).isDirectory()) continue; + System.out.println(names[i]); + DataSet ds = new DataSet(); + ds.addFile(indir, names[i]); + ds.write(outdir, "new-" + names[i], false); + total.add(ds); + } + total.write(outdir, "confusables-raw.txt", false); + DataSet clean = total.clean(); + clean.write(outdir, "confusables.txt", true); + } + /* + BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"); + Set set = new TreeSet(new ArrayComparator(new Comparator[] {new 
UTF16.StringComparator(), + new UTF16.StringComparator()})); + while (true) { + String line = Utility.readDataLine(in); + if (line == null) break; + if (line.length() == 0) continue; + String[] pieces = Utility.split(line,';'); + if (pieces.length < 2) { + System.out.println("Error on: " + line); + continue; + } + String source = Utility.fromHex(pieces[0].trim()); + String target = Utility.fromHex(pieces[1].trim()); + String nsource = Default.nfkd().normalize(source); + String first = UTF16.valueOf(UTF16.charAt(nsource, 0)); + if (!first.equals(target)) { + set.add(new String[]{source, target}); + } + } + in.close(); + + } + public static void gen() throws IOException { + Map m = new TreeMap(); + BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables.txt"); + while (true) { + String line = in.readLine(); + if (line == null) break; + String[] pieces = Utility.split(line,';'); + if (pieces.length < 3) { + System.out.println("Error on: " + line); + continue; + } + int codepoint = Integer.parseInt(pieces[1], 16); + int cat = Default.ucd().getCategory(codepoint); + if (cat == UCD_Types.Co || cat == UCD_Types.Cn) continue; // skip private use + if (!Default.nfkd().isNormalized(codepoint)) continue; //skip non NFKC + String result = Utility.fromHex(pieces[0]); + if (!Default.nfkd().isNormalized(result)) continue; //skip non NFKC + int count = Integer.parseInt(pieces[2]); + String source = UTF16.valueOf(codepoint); + add(m, source, result, count); + } + in.close(); + + in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables2.txt"); + while (true) { + String line = in.readLine(); + if (line == null) break; + line = line.trim(); + int pos = line.indexOf("#"); + if (pos >= 0) line = line.substring(0,pos).trim(); + if (line.length() == 0) continue; + if (line.startsWith("@")) continue; + String[] pieces = Utility.split(line,';'); + if (pieces.length < 2) { + System.out.println("Error on: " + line); + continue; + 
} + String source = pieces[0].trim(); + for (int i = 1; i < pieces.length; ++i) { + add(m, source, pieces[i].trim(), -1); + } + } + in.close(); + + boolean gotOne; + // close the set + do { + gotOne = false; + for (Iterator it = m.keySet().iterator(); it.hasNext();) { + String source = (String) it.next(); + Data2 data = (Data2) m.get(source); + Data2 data2 = (Data2) m.get(data.target); + if (data2 == null) continue; + data.target = data2.target; + gotOne = true; + break; + } + } while (gotOne); + // put into different sorting order + Set s = new TreeSet(); + for (Iterator it = m.keySet().iterator(); it.hasNext();) { + String source = (String) it.next(); + Data2 data = (Data2) m.get(source); + s.add(new Data(source, data.target, data.count)); + } + // write it out + PrintWriter out = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "confusables.txt"); + String[] replacements = {"%date%", Default.getDate()}; + Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt", + Utility.UTF8_WINDOWS, out, replacements); + for (Iterator it = s.iterator(); it.hasNext();) { + Data d = (Data) it.next(); + if (d == null) continue; + out.println(formatLine(d.source, d.target, d.count)); + } + + out.close(); + System.out.println("Done"); + } + /** + * + */ + private static String formatLine(String source, String target, int count) { + return Utility.hex(source) + " ; " + Utility.hex(target," ") + + " ; " + count + + " # " + + "(" + source + " " + ARROW + " " + target + ") " + + Default.ucd().getName(source) + + " " + ARROW + " " + Default.ucd().getName(target); + } + /** + * + */ + private static void add(Map m, String source, String target, int count) { + if (source.length() == 0 || target.length() == 0) return; + if (preferSecondAsSource(source, target)) { + String temp = target; + target = source; + source = temp; + } + Data2 other = (Data2) m.get(source); + if (other != null) { + if (target.equals(other.target)) return; + System.out.println("conflict"); + 
System.out.println(formatLine(source, target, count)); + System.out.println(formatLine(source, other.target, other.count)); + // skip adding this, and instead add result -> other.target + add(m, target, other.target, count); + } else { + m.put(source, new Data2(target, count)); + } + }; + + static private boolean preferSecondAsSource(String a, String b) { + // if first is longer, prefer second + int ca = UTF16.countCodePoint(a); + int cb = UTF16.countCodePoint(b); + if (ca != cb) { + return ca > cb; + } + // if first is lower, prefer second + return a.compareTo(b) < 0; + } + + static String getCodeCharName(String a) { + return Default.ucd().getCode(a) + "( " + a + " ) " + Default.ucd().getName(a); + } + +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java b/tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java new file mode 100644 index 00000000000..d6f563623a4 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java @@ -0,0 +1,125 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java,v $ +* $Date: 2005/05/27 21:40:51 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; +import com.ibm.text.utility.*; +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import java.util.*; +import java.io.*; + +public final class GenerateNamedSequences implements UCD_Types { + + static final boolean DEBUG = false; + + static public String showVarGlyphs(String code0, String code1, String shape, String description) { + if (DEBUG) System.out.println(code0 + ", " + code1 + ", [" + shape + "]"); + + String abbShape = ""; + if (shape.length() != 0) { + abbShape = '-' + shape.substring(0,4); + if (description.indexOf("feminine") >= 0) abbShape += "fem"; + } + + return "U+" + code0 + "+U+" + code1 + "/" + shape 
+            + ""; + } + +/* +# Field 0: the variation sequence +# Field 1: the description of the desired appearance +# Field 2: where the appearance is only different in in particular shaping environments +# this field lists them. The possible values are: isolated, initial, medial, final. +# If more than one is present, there are spaces between them. +*/ + static public void generate() throws IOException { + + + // read the data and compose the table + + String table = ""; + + String[] splits = new String[4]; + String[] codes = new String[20]; + String[] shapes = new String[4]; + + BufferedReader in = Utility.openUnicodeFile("NamedSequences", Default.ucdVersion(), true, Utility.LATIN1); + Transliterator unicodexml = Transliterator.getInstance("hex/xml"); + while (true) { + String line = Utility.readDataLine(in); + if (line == null) break; + line = line.trim(); + if (line.length() == 0) continue; + + int count = Utility.split(line, ';', splits); + String name = splits[0]; + int codeCount = Utility.split(splits[1], ' ', codes); + StringBuffer codeBuffer = new StringBuffer(); + for (int i = 0; i < codeCount; ++i) { + UTF16.append(codeBuffer, Integer.parseInt(codes[i],16)); + } + String codeWithHyphens = splits[1].replaceAll("\\s", "-"); + String codeAlt = "U+" + splits[1].replaceAll("\\s", " U+"); + String codeString = unicodexml.transliterate(codeBuffer.toString()); + + // 03E2 + + //table += "\n"; + String imageName = "images/U" + codeWithHyphens + ".gif"; + if (splits[1].compareTo("1780") >= 0 && splits[1].compareTo("1800") < 0) { + String codeNoSpaces2 = splits[1].replaceAll("\\s", ""); + imageName = "http://www.unicode.org/reports/tr28/images/" + codeNoSpaces2 + ".gif"; + } + table += "" + + "" + + "" + + "" + + "" + + "\n"; + System.out.println(splits[1] + "\t" + codeString); + } + in.close(); + table += "
Rep GlyphHex SequenceNameCopyable
U+" + codes[0] + "
(" + codeAlt + ")
" + + splits[1] + "
" + splits[1] + "" + name + "" + codeString + "
"; + + // now write out the results + + String directory = "DerivedData/"; + String filename = directory + "NamedSequences" + UnicodeDataFile.getHTMLFileSuffix(true); + PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX); + /* + String[] batName = {""}; + String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName); + + String version = Default.ucd().getVersion(); + int lastDot = version.lastIndexOf('.'); + String updateDirectory = version.substring(0,lastDot) + "-Update"; + int updateV = version.charAt(version.length()-1) - '0'; + if (updateV != 0) updateDirectory += (char)('1' + updateV); + if (DEBUG) System.out.println("updateDirectory: " + updateDirectory); + */ + + String[] replacementList = { + "@revision@", Default.ucd().getVersion(), + //"@updateDirectory@", updateDirectory, + "@date@", Default.getDate(), + "@table@", table}; + + Utility.appendFile("NamedSequences-Template.html", Utility.UTF8, out, replacementList); + + out.close(); + //Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]); + } +} diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java b/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java new file mode 100644 index 00000000000..2a1230f7f0c --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java @@ -0,0 +1,515 @@ +/* + * Created on May 3, 2005 + * Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others. 
+ * For terms of use, see http://www.unicode.org/terms_of_use.html + */ +package com.ibm.text.UCD; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.CollectionUtilities; +import com.ibm.icu.dev.test.util.UnicodeLabel; +import com.ibm.icu.dev.test.util.UnicodeMap; +import com.ibm.icu.dev.test.util.UnicodeMap.Composer; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.IDNA; +import com.ibm.icu.text.StringPrepParseException; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.text.UTF16.StringComparator; +import com.ibm.icu.util.ULocale; +import com.ibm.text.UCD.GenerateHanTransliterator.MultiComparator; +import com.ibm.text.UCD.TestData.RegexMatcher; +import com.ibm.text.utility.Utility; + + +class GenerateStringPrep implements UCD_Types { + + public static void main (String[] args) throws IOException { + //checkChars(false); + new GenerateStringPrep().genStringPrep(); + System.out.println("Done"); + } + + UnicodeSet[] coreChars = new UnicodeSet[100]; + UnicodeSet decomposable = new UnicodeSet(); + UnicodeMap suspect = new UnicodeMap(); + + ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(""); + ToolUnicodePropertySource ups32 = ToolUnicodePropertySource.make("3.2.0"); + //UnicodeSet id_continue = ups.getSet("ID_Continue=true"); + UnicodeSet xid_continue = ups.getSet("XID_Continue=true"); + UnicodeSet wordChars = new UnicodeSet(); + { + if (false) { + wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher())); + wordChars.retainAll(ups.getSet("gc=Sk")); + } + wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" + + " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 
\\u2027 \\u30A0 \\u04C0" + + " \\u055A \\u02B9 \\u02BA]")); + //wordChars.removeAll(xid_continue); + } + + UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars); + UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement(); + UnicodeSet non_spacing = new UnicodeSet(ups.getSet("gc=Me")) + .addAll(ups.getSet("gc=Mn")) + .removeAll(ups.getSet("Default_Ignorable_Code_Point=true")); + + UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars); + + //UnicodeSet[] decompChars = new UnicodeSet[100]; + UCD ucd = Default.ucd(); + + static Collator uca0 = Collator.getInstance(ULocale.ENGLISH); + { + uca0.setStrength(Collator.IDENTICAL); + } + static GenerateHanTransliterator.MultiComparator uca + = new GenerateHanTransliterator.MultiComparator(new Comparator[] { + uca0, new UTF16.StringComparator()}); + + UnicodeSet bidiR = new UnicodeSet( + "[[:Bidi_Class=AL:][:Bidi_Class=R:]]"); + + UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]"); + UnicodeSet hasNoUpper = new UnicodeSet(); + UnicodeSet hasNoUpperMinus = new UnicodeSet(); + BagFormatter bf = new BagFormatter(); + UnicodeSet inIDN = new UnicodeSet(); + UnicodeSet isCaseFolded = new UnicodeSet(); + + void genStringPrep() throws IOException { + //showScriptToBlock(); + bf.setShowLiteral(BagFormatter.toHTMLControl); + bf.setUnicodePropertyFactory(ups); + //bf.setValueSource(UnicodeLabel.NULL); + if (false) { + + System.out.println("word chars: " + bf.showSetNames(wordChars)); + System.out.println("pat: " + bf.showSetNames(patternProp)); + System.out.println("xid: " + bf.showSetNames(not_xid_continue)); + } + for (int cp = 0; cp <= 0x10FFFF; ++cp) { + Utility.dot(cp); + int cat = Default.ucd().getCategory(cp); + if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue; + if (!Default.nfd().isNormalized(cp)) decomposable.add(cp); + // get IDNA + int idnaType = getIDNAType(cp); + idnaTypeSet[idnaType].add(cp); + + String str = UTF16.valueOf(cp); + if 
(str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp); + if (str.equals(ucd.getCase(str, FULL, FOLD))) isCaseFolded.add(cp); + + // scripts + int script = ucd.getScript(cp); + if (coreChars[script] == null) + coreChars[script] = new UnicodeSet(); + coreChars[script].add(cp); + } + // fix characters with no uppercase + hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars); + System.out.println(bf.showSetNames(hasNoUpper)); + + Utility.fixDot(); + PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html"); + PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html"); + PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt"); + textOut.println('\uFEFF'); + textOut.println("For documentation, see idn-chars.html"); + + Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut, + new String[] {"%date%", Default.getDate()}); + /* + out + .println(""); + out.println("IDN Characters"); + */ + htmlOut.println("
"); + htmlOut2.println("
"); + + for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) { + if (scriptCode == COMMON_SCRIPT + || scriptCode == INHERITED_SCRIPT) + continue; + showCodes(htmlOut, textOut, scriptCode, htmlOut2); + } + showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2); + showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2); + + showCodes(htmlOut, textOut, non_spacing); + htmlOut.println("
"); + htmlOut.close(); + htmlOut2.println(""); + htmlOut2.close(); + bf.setMergeRanges(false); + + textOut.println(); + textOut.println("# *** ADDITIONAL WORD CHARACTERS ***"); + textOut.println(); + bf.setValueSource("word-chars"); + bf.showSetNames(textOut, wordChars); + + textOut.println(); + textOut.println("# *** FOR REVIEW ***"); + bf.setLabelSource(UnicodeLabel.NULL); + for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) { + textOut.println(); + String value = (String)it.next(); + bf.setValueSource(value); + bf.showSetNames(textOut, suspect.getSet(value)); + } + textOut.close(); + textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn_vs_cfnfkcid.txt"); + bf = new BagFormatter(); + bf.setUnicodePropertyFactory(ups); + textOut.println(); + textOut.println("# *** Comparison of IDN with CF_NFKC_ID (case-folded, NFKC, XID), U3.2 only ***"); + UnicodeSet U32 = ups32.getSet("gc=cn").complement(); + UnicodeSet CF_NFKC_ID = new UnicodeSet(xid_continue).retainAll(isNFKC).retainAll(isCaseFolded).retainAll(U32); + bf.showSetDifferences(textOut, "CF_NFKC_ID", CF_NFKC_ID, "IDN", idnaTypeSet[OK]); + textOut.close(); + + } + + /** + * + */ + private void showScriptToBlock() { + UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap(); + UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap(); + UnicodeMap.Composer myCompose = new UnicodeMap.Composer() { + public Object compose(Object a, Object b) { + return a + "\t" + b; + } + }; + UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose); + for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) { + System.out.println(it.next()); + } + throw new IllegalArgumentException(); + } + + Map scriptToGif = CollectionUtilities.asMap(script_to_gif); + + static String[][] script_to_gif = { + + {"Common","common.gif"}, //Miscellaneous_Symbols + {"Inherited","combiningdiacritics.gif"}, 
//Combining_Diacritical_Marks + {"Arabic","arabic.gif"}, //Arabic + {"Armenian","armenian.gif"}, //Armenian + {"Bengali","bengali.gif"}, //Bengali + {"Bopomofo","bopomofo.gif"}, //Bopomofo + {"Braille","braillesymbols.gif"}, //Braille_Patterns + {"Buginese","buginese.gif"}, //Buginese + {"Buhid","buhid.gif"}, //Buhid + {"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics + {"Cherokee","cherokee.gif"}, //Cherokee + {"Coptic","coptic.gif"}, //Coptic + {"Cypriot","cypriot.gif"}, //Cypriot_Syllabary + {"Cyrillic","cyrillic.gif"}, //Cyrillic + {"Deseret","deseret.gif"}, //Deseret + {"Devanagari","devanagari.gif"}, //Devanagari + {"Ethiopic","ethiopic.gif"}, //Ethiopic + {"Georgian","georgian.gif"}, //Georgian + {"Glagolitic","glagolitic.gif"}, //Glagolitic + {"Gothic","gothic.gif"}, //Gothic + {"Greek","greek.gif"}, //Greek_and_Coptic + {"Gujarati","gujarati.gif"}, //Gujarati + {"Gurmukhi","gurmukhi.gif"}, //Gurmukhi + {"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs + {"Han","kangxiradicals.gif"}, //Kangxi_Radicals + {"Hangul","hangulsyllables.gif"}, //Hangul_Syllables + {"Hanunoo","hanunoo.gif"}, //Hanunoo + {"Hebrew","hebrew.gif"}, //Hebrew + {"Hiragana","hiragana.gif"}, //Hiragana + {"Kannada","kannada.gif"}, //Kannada + {"Katakana","katakana.gif"}, //Katakana + {"Kharoshthi","kharoshthi.gif"}, //Kharoshthi + {"Khmer","khmer.gif"}, //Khmer + {"Lao","lao.gif"}, //Lao + {"Latin","latin.gif"}, //Basic_Latin + {"Limbu","limbu.gif"}, //Limbu + {"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary + {"Malayalam","malayalam.gif"}, //Malayalam + {"Mongolian","mongolian.gif"}, //Mongolian + {"Myanmar","myanmar.gif"}, //Myanmar + {"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue + {"Ogham","ogham.gif"}, //Ogham + {"Old_Italic","olditalic.gif"}, //Old_Italic + {"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian + {"Oriya","oriya.gif"}, //Oriya + {"Osmanya","osmanya.gif"}, //Osmanya + {"Runic","runic.gif"}, //Runic + 
{"Shavian","shavian.gif"}, //Shavian + {"Sinhala","sinhala.gif"}, //Sinhala + {"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri + {"Syriac","syriac.gif"}, //Syriac + {"Tagalog","tagalog.gif"}, //Tagalog + {"Tagbanwa","tagbanwa.gif"}, //Tagbanwa + {"Tai_Le","taile.gif"}, //Tai_Le + {"Tamil","tamil.gif"}, //Tamil + {"Telugu","telugu.gif"}, //Telugu + {"Thaana","thaana.gif"}, //Thaana + {"Thai","thai.gif"}, //Thai + {"Tibetan","tibetan.gif"}, //Tibetan + {"Tifinagh","tifinagh.gif"}, //Tifinagh + {"Ugaritic","ugaritic.gif"}, //Ugaritic + {"Yi","yi.gif"}, //Yi_Syllables + + }; + + UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT]; + { + for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet(); + } + static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4; + /** + * + */ + private int getIDNAType(int cp) { + inbuffer.setLength(0); + UTF16.append(inbuffer, cp); + try { + intermediate = IDNA.convertToASCII(inbuffer, + IDNA.DEFAULT); // USE_STD3_RULES + if (intermediate.length() == 0) + return DELETED; + outbuffer = IDNA.convertToUnicode(intermediate, + IDNA.USE_STD3_RULES); + } catch (StringPrepParseException e) { + return ILLEGAL; + } catch (Exception e) { + System.out.println("Failure at: " + Utility.hex(cp)); + return ILLEGAL; + } + if (!TestData.equals(inbuffer, outbuffer)) + return REMAPPED; + return OK; + } + StringBuffer inbuffer = new StringBuffer(); + StringBuffer intermediate, outbuffer; + + UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]"); + + /** + * @param htmlOut + * @param textOut TODO + * @param scriptCode + * @param htmlOut2 TODO + * @param ucd + * @param coreChars + * @param decompChars + */ + private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) { + if (coreChars[scriptCode] == null) return; + String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode); + script = Utility.getUnskeleton(script.toLowerCase(),true); + 
System.out.println(script); + + htmlOut.println(); + String scriptLine = " Script: " + script + ""; + htmlOut.println(scriptLine); + htmlOut2.println(scriptLine); + textOut.println(); + textOut.println("#*** Script: " + script + " ***"); + UnicodeSet core = new UnicodeSet(coreChars[scriptCode]); + + UnicodeSet deleted = extract(idnaTypeSet[DELETED], core); + UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core); + UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core); + + UnicodeSet remappedIsNFKC = extract(isNFKC, remapped); + UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC); + + UnicodeSet decomp = extract(decomposable, core); + UnicodeSet pattern = extract(patternProp, core); + UnicodeSet non_id = extract(not_xid_continue, core); + + UnicodeSet bicameralNoupper = new UnicodeSet(); + if (!hasNoUpper.containsAll(core)) { + bicameralNoupper = extract(hasNoUpperMinus, core); + } + + UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id); + for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) { + String cat = Default.ucd().getCategoryID(it.codepoint); + String name = Default.ucd().getName(it.codepoint); + if (name.indexOf("MUSICAL SYMBOL") >= 0 + || name.indexOf("DINGBA") >= 0 + || name.indexOf("RADICAL ") >= 0 + ) cat = "XX"; + suspect.put(it.codepoint, cat); + } + + if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode, uca); + if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode, uca); + if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode, uca); + if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode, uca); + if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode, uca); + + if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", 
remappedIsNFKC, scriptCode, uca); + if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode, uca); + if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode, uca); + if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode, uca); + if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode, uca); + } + + private void showCodes(PrintWriter htmlOut, PrintWriter textOut, UnicodeSet uset) throws IOException { + String script = Default.ucd().getScriptID_fromIndex((byte) INHERITED_SCRIPT); + script = Utility.getUnskeleton(script.toLowerCase(),true); + String scriptLine = " Script: " + script + ""; + htmlOut.println(scriptLine); + UnicodeMap m = getPositions(); + + for (Iterator it = m.getAvailableValues(new TreeSet(uca)).iterator(); it.hasNext(); ) { + String type = (String) it.next(); + UnicodeSet current = m.getSet(type).retainAll(non_spacing); + if (current.size() == 0) continue; + printlnSet(htmlOut, textOut, script, "Visible_Combining_Marks_" + type, current, INHERITED_SCRIPT, positionComparator); + } + } + + /** + * @throws IOException + * + */ + private UnicodeMap getPositions() throws IOException { + UnicodeMap result = new UnicodeMap(); + BufferedReader in = bf.openUTF8Reader("C:\\DATA\\confusables\\", "positions.txt"); + String type="Undetermined"; + while (true) { + String line = Utility.readDataLine(in); + if (line == null) break; + if (line.length() == 0) continue; + if (line.startsWith("@")) { + type = line.substring(1); + continue; + } + String[] pieces = Utility.split(line, ';'); + String code = Utility.fromHex(pieces[0]); + result.put(UTF16.charAt(code,0), type); + } + return result; + } + + static Comparator positionComparator = new Comparator() { + public int compare(Object o1, Object o2) { + String s1 = (String)o1; + String s2 = 
(String)o2; + return Default.ucd().getName(s1).compareTo(Default.ucd().getName(s2)); + } + }; + + /** + * + */ + private UnicodeSet extract(UnicodeSet other, UnicodeSet core) { + UnicodeSet decomp = new UnicodeSet(core).retainAll(other); + core.removeAll(decomp); + return decomp; + } + + /** + * @param htmlOut + * @param textOut TODO + * @param script TODO + * @param unicodeset + * @param scriptCode + * @param comparator TODO + * @param uca + */ + private void printlnSet(PrintWriter htmlOut, PrintWriter textOut, + String script, String title, UnicodeSet unicodeset, int scriptCode, Comparator comparator) { + if (unicodeset == null) + return; + int size = unicodeset.size(); + String dir = unicodeset.containsSome(bidiR) + && unicodeset.containsNone(bidiL) ? " dir='rtl'" : ""; + htmlOut.println("" + title + " (" + + TestData.nf.format(size) + ")"); + htmlOut.print(""); + // categorization + textOut.println(); + textOut.println("# " + title); + bf.setValueSource(script + " ; " + title); + UnicodeSetIterator usi = new UnicodeSetIterator(); + if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) { + usi.reset(unicodeset); + while (usi.nextRange()) { + if (usi.codepoint == usi.codepointEnd) { + htmlOut.print(formatCode(UTF16 + .valueOf(usi.codepoint))); + } else { + htmlOut.print(formatCode(UTF16 + .valueOf(usi.codepoint)) + + ".. 
" + + formatCode(UTF16 + .valueOf(usi.codepointEnd))); + } + } + bf.showSetNames(textOut, unicodeset); + } else { + Set reordered = new TreeSet(comparator); + usi.reset(unicodeset); + while (usi.next()) { + String x = usi.getString(); + boolean foo = reordered.add(x); + if (!foo) + throw new IllegalArgumentException("Collision with " + + Default.ucd().getCodeAndName(x)); + } + for (Iterator it = reordered.iterator(); it.hasNext();) { + Object key = it.next(); + htmlOut.print(formatCode((String)key)); + } + bf.showSetNames(textOut, reordered); + } + htmlOut.println(""); + } + + /** + * @param string + * @return + */ + private String formatCode(String string) { + int cat = ucd.getCategory(UTF16.charAt(string,0)); + String pad = "\u00A0", pad1 = pad; + if (cat == Me || cat == Mn) { + pad = "\u00A0\u00A0"; + pad1 = "\u00A0\u00A0\u25cc"; + } + return "" + + pad1 + + BagFormatter.toHTMLControl.transliterate(string) + + pad + + " "; + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html b/tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html new file mode 100644 index 00000000000..58056d24f71 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html @@ -0,0 +1,153 @@ + + + + + + + + + + +Named Sequences + + + + + + + + + + + + + +
+ [Unicode]  Unicode + Character Database
 
+
+ + + + +
+

L2-XXX

+

To: UTC
+ From: Mark Davis
+ Date: 2005-04-28

+

One of the original ideas for Unicode 4.1.0 was to produce a NamedSequences.html, + following the pattern of StandardizedVariants.html. This document was generated along those + lines, but not added into U4.1.0. My suggestion instead is to add this file (with suitable + style modifications, of course) as a chart someplace accessible under + http://unicode.org/charts/.

+

Alternatively, we could also combine this with the StandardizedVariants.html to provide + a unified chart of sequences, again someplace under + http://unicode.org/charts/.

+

Note: we don't have some of the glyphs quite right yet, but it should be + sufficient for discussing the format. One of the innovations is having a separate column of + text that is for copy&amp;paste; that needs discussion also.

+

 PROPOSED WORKING DRAFT
+
Named Sequences

+ + + + + + + + + + + + + + + + + + + + + + + + + +
Revision@revision@
AuthorsMembers of the Editorial Committee
Date@date@
This Version + + http://www.unicode.org/Public/@updateDirectory@/NamedSequences-@revision@.html
Previous Versionn/a
Latest Versionn/a
+


+ Summary

+
+

This file provides a visual display of the named sequences derived from NamedSequences.txt. The + proposal is to add this file as a chart under http://unicode.org/charts/.

+
+

Status

+
+

The file and the files described herein are part of the + Unicode Character Database (UCD) and are governed by + the UCD Terms of Use stated at the end.

+
+
+

Introduction

+

The tables here exhaustively list the valid, registered named sequences. The columns include a + representative glyph, the sequence of code points in hex, and the name of the sequence. In + addition, there is a last column entitled Copyable, which contains the literal text forming + the sequence. That text can be copied and pasted elsewhere. The display of the text in this + column is up to the capabilities of the browser and the set of available fonts. For more + information, see Display Problems?.

+
+

Note: The representative glyphs used to show the named sequences + are often derived from different physical fonts than the representative glyphs in the standard. + They may therefore exhibit minor differences in size, proportion, style, or weight.

+
+

@table@

+
+

UCD Terms of Use

+

Disclaimer

+
+

The Unicode Character Database is provided as is by Unicode, Inc. No claims are made as to + fitness for any particular purpose. No warranties of any kind are expressed or implied. The + recipient agrees to determine applicability of information provided. If this file has been + purchased on magnetic or optical media from Unicode, Inc., the sole remedy for any claim will be + exchange of defective media within 90 days of receipt.

+

This disclaimer is applicable for all other data files accompanying the Unicode Character + Database, some of which have been compiled by the Unicode Consortium, and some of which have + been supplied by other sources.

+
+

Limitations on Rights to Redistribute This Data

+
+

Recipient is granted the right to make copies in any form for internal distribution and to + freely use the information supplied in the creation of products supporting the UnicodeTM + Standard. The files in the Unicode Character Database can be redistributed to third parties or + other organizations (whether for profit or not) as long as this notice and the disclaimer notice + are retained. Information can be extracted from these files and used in documentation or + programs, as long as there is an accompanying notice indicating the source.

+
+
+
+
+ + + + +
+ Access to Copyright and terms of use
+ +
+
+
+
+
+ + + + diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java index f30eb8c8b1b..324c311fba2 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestData.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $ -* $Date: 2005/05/02 15:39:53 $ -* $Revision: 1.22 $ +* $Date: 2005/05/27 21:38:51 $ +* $Revision: 1.23 $ * ******************************************************************************* */ @@ -46,8 +46,6 @@ public class TestData implements UCD_Types { public static void main (String[] args) throws IOException { //checkChars(false); - new GenStringPrep().genStringPrep(); - if (true) return; System.out.println("main: " + Default.getDate()); upf = ICUPropertyFactory.make(); @@ -152,404 +150,6 @@ public class TestData implements UCD_Types { } Matcher m; - static class GenStringPrep { - - UnicodeSet[] coreChars = new UnicodeSet[100]; - UnicodeSet decomposable = new UnicodeSet(); - UnicodeMap suspect = new UnicodeMap(); - - ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(""); - //UnicodeSet id_continue = ups.getSet("ID_Continue=true"); - UnicodeSet xid_continue = ups.getSet("XID_Continue=true"); - UnicodeSet wordChars = new UnicodeSet(); - { - if (false) { - wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher())); - wordChars.retainAll(ups.getSet("gc=Sk")); - } - wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" + - " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" + - " \\u055A \\u02B9 \\u02BA]")); - //wordChars.removeAll(xid_continue); - } - - UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars); - UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement(); - - UnicodeSet not_xid_continue = new 
UnicodeSet(xid_continue).complement().removeAll(wordChars); - - //UnicodeSet[] decompChars = new UnicodeSet[100]; - UCD ucd = Default.ucd(); - - Collator uca0 = Collator.getInstance(ULocale.ENGLISH); - { - uca0.setStrength(Collator.IDENTICAL); - } - GenerateHanTransliterator.MultiComparator uca - = new GenerateHanTransliterator.MultiComparator(new Comparator[] { - uca0, new UTF16.StringComparator()}); - - UnicodeSet bidiR = new UnicodeSet( - "[[:Bidi_Class=AL:][:Bidi_Class=R:]]"); - - UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]"); - UnicodeSet hasNoUpper = new UnicodeSet(); - UnicodeSet hasNoUpperMinus = new UnicodeSet(); - BagFormatter bf = new BagFormatter(); - UnicodeSet inIDN = new UnicodeSet(); - - void genStringPrep() throws IOException { - //showScriptToBlock(); - bf.setShowLiteral(BagFormatter.toHTMLControl); - //bf.setValueSource(UnicodeLabel.NULL); - if (false) { - - System.out.println("word chars: " + bf.showSetNames(wordChars)); - System.out.println("pat: " + bf.showSetNames(patternProp)); - System.out.println("xid: " + bf.showSetNames(not_xid_continue)); - } - for (int cp = 0; cp <= 0x10FFFF; ++cp) { - Utility.dot(cp); - int cat = Default.ucd().getCategory(cp); - if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue; - if (!Default.nfd().isNormalized(cp)) decomposable.add(cp); - int idnaType = getIDNAType(cp); - idnaTypeSet[idnaType].add(cp); - String str = UTF16.valueOf(cp); - if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp); - int script = ucd.getScript(cp); - if (coreChars[script] == null) - coreChars[script] = new UnicodeSet(); - coreChars[script].add(cp); - } - // fix characters with no uppercase - hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars); - System.out.println(bf.showSetNames(hasNoUpper)); - - Utility.fixDot(); - PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html"); - PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html"); - PrintWriter 
textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt"); - textOut.println('\uFEFF'); - textOut.println("For documentation, see idn-chars.html"); - - Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut, - new String[] {"%date%", Default.getDate()}); - /* - out - .println(""); - out.println("IDN Characters"); - */ - htmlOut.println("
"); - htmlOut2.println("
"); - - for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) { - if (scriptCode == COMMON_SCRIPT - || scriptCode == INHERITED_SCRIPT) - continue; - showCodes(htmlOut, textOut, scriptCode, htmlOut2); - } - showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2); - showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2); - htmlOut.println("
"); - htmlOut.close(); - htmlOut2.println(""); - htmlOut2.close(); - bf.setMergeRanges(false); - - textOut.println(); - textOut.println("# *** ADDITIONAL WORD CHARACTERS ***"); - textOut.println(); - bf.setValueSource("word-chars"); - bf.showSetNames(textOut, wordChars); - - textOut.println(); - textOut.println("# *** FOR REVIEW ***"); - bf.setLabelSource(UnicodeLabel.NULL); - for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) { - textOut.println(); - String value = (String)it.next(); - bf.setValueSource(value); - bf.showSetNames(textOut, suspect.getSet(value)); - } - textOut.close(); - } - - /** - * - */ - private void showScriptToBlock() { - UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap(); - UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap(); - UnicodeMap.Composer myCompose = new UnicodeMap.Composer() { - public Object compose(Object a, Object b) { - return a + "\t" + b; - } - }; - UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose); - for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) { - System.out.println(it.next()); - } - throw new IllegalArgumentException(); - } - - Map scriptToGif = CollectionUtilities.asMap(script_to_gif); - - static String[][] script_to_gif = { - - {"Common","common.gif"}, //Miscellaneous_Symbols - {"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks - {"Arabic","arabic.gif"}, //Arabic - {"Armenian","armenian.gif"}, //Armenian - {"Bengali","bengali.gif"}, //Bengali - {"Bopomofo","bopomofo.gif"}, //Bopomofo - {"Braille","braillesymbols.gif"}, //Braille_Patterns - {"Buginese","buginese.gif"}, //Buginese - {"Buhid","buhid.gif"}, //Buhid - {"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics - {"Cherokee","cherokee.gif"}, //Cherokee - {"Coptic","coptic.gif"}, //Coptic - {"Cypriot","cypriot.gif"}, 
//Cypriot_Syllabary - {"Cyrillic","cyrillic.gif"}, //Cyrillic - {"Deseret","deseret.gif"}, //Deseret - {"Devanagari","devanagari.gif"}, //Devanagari - {"Ethiopic","ethiopic.gif"}, //Ethiopic - {"Georgian","georgian.gif"}, //Georgian - {"Glagolitic","glagolitic.gif"}, //Glagolitic - {"Gothic","gothic.gif"}, //Gothic - {"Greek","greek.gif"}, //Greek_and_Coptic - {"Gujarati","gujarati.gif"}, //Gujarati - {"Gurmukhi","gurmukhi.gif"}, //Gurmukhi - {"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs - {"Han","kangxiradicals.gif"}, //Kangxi_Radicals - {"Hangul","hangulsyllables.gif"}, //Hangul_Syllables - {"Hanunoo","hanunoo.gif"}, //Hanunoo - {"Hebrew","hebrew.gif"}, //Hebrew - {"Hiragana","hiragana.gif"}, //Hiragana - {"Kannada","kannada.gif"}, //Kannada - {"Katakana","katakana.gif"}, //Katakana - {"Kharoshthi","kharoshthi.gif"}, //Kharoshthi - {"Khmer","khmer.gif"}, //Khmer - {"Lao","lao.gif"}, //Lao - {"Latin","latin.gif"}, //Basic_Latin - {"Limbu","limbu.gif"}, //Limbu - {"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary - {"Malayalam","malayalam.gif"}, //Malayalam - {"Mongolian","mongolian.gif"}, //Mongolian - {"Myanmar","myanmar.gif"}, //Myanmar - {"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue - {"Ogham","ogham.gif"}, //Ogham - {"Old_Italic","olditalic.gif"}, //Old_Italic - {"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian - {"Oriya","oriya.gif"}, //Oriya - {"Osmanya","osmanya.gif"}, //Osmanya - {"Runic","runic.gif"}, //Runic - {"Shavian","shavian.gif"}, //Shavian - {"Sinhala","sinhala.gif"}, //Sinhala - {"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri - {"Syriac","syriac.gif"}, //Syriac - {"Tagalog","tagalog.gif"}, //Tagalog - {"Tagbanwa","tagbanwa.gif"}, //Tagbanwa - {"Tai_Le","taile.gif"}, //Tai_Le - {"Tamil","tamil.gif"}, //Tamil - {"Telugu","telugu.gif"}, //Telugu - {"Thaana","thaana.gif"}, //Thaana - {"Thai","thai.gif"}, //Thai - {"Tibetan","tibetan.gif"}, //Tibetan - {"Tifinagh","tifinagh.gif"}, //Tifinagh - 
{"Ugaritic","ugaritic.gif"}, //Ugaritic - {"Yi","yi.gif"}, //Yi_Syllables - - }; - - UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT]; - { - for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet(); - } - static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4; - /** - * - */ - private int getIDNAType(int cp) { - inbuffer.setLength(0); - UTF16.append(inbuffer, cp); - try { - intermediate = IDNA.convertToASCII(inbuffer, - IDNA.DEFAULT); // USE_STD3_RULES - if (intermediate.length() == 0) - return DELETED; - outbuffer = IDNA.convertToUnicode(intermediate, - IDNA.USE_STD3_RULES); - } catch (StringPrepParseException e) { - return ILLEGAL; - } catch (Exception e) { - System.out.println("Failure at: " + Utility.hex(cp)); - return ILLEGAL; - } - if (!TestData.equals(inbuffer, outbuffer)) - return REMAPPED; - return OK; - } - StringBuffer inbuffer = new StringBuffer(); - StringBuffer intermediate, outbuffer; - - UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]"); - - /** - * @param htmlOut - * @param textOut TODO - * @param scriptCode - * @param htmlOut2 TODO - * @param ucd - * @param coreChars - * @param decompChars - */ - private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) { - if (coreChars[scriptCode] == null) return; - String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode); - script = Utility.getUnskeleton(script.toLowerCase(),true); - System.out.println(script); - - htmlOut.println(); - String scriptLine = " Script: " + script + ""; - htmlOut.println(scriptLine); - htmlOut2.println(scriptLine); - textOut.println(); - textOut.println("#*** Script: " + script + " ***"); - UnicodeSet core = new UnicodeSet(coreChars[scriptCode]); - - UnicodeSet deleted = extract(idnaTypeSet[DELETED], core); - UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core); - UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core); - - UnicodeSet remappedIsNFKC = 
extract(isNFKC, remapped); - UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC); - - UnicodeSet decomp = extract(decomposable, core); - UnicodeSet pattern = extract(patternProp, core); - UnicodeSet non_id = extract(not_xid_continue, core); - - UnicodeSet bicameralNoupper = new UnicodeSet(); - if (!hasNoUpper.containsAll(core)) { - bicameralNoupper = extract(hasNoUpperMinus, core); - } - - UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id); - for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) { - String cat = Default.ucd().getCategoryID(it.codepoint); - String name = Default.ucd().getName(it.codepoint); - if (name.indexOf("MUSICAL SYMBOL") >= 0 - || name.indexOf("DINGBA") >= 0 - || name.indexOf("RADICAL ") >= 0 - ) cat = "XX"; - suspect.put(it.codepoint, cat); - } - - if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode); - if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode); - if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode); - if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode); - if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode); - - if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode); - if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode); - if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode); - if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode); - if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode); - } - - /** - * - */ - private UnicodeSet 
extract(UnicodeSet other, UnicodeSet core) { - UnicodeSet decomp = new UnicodeSet(core).retainAll(other); - core.removeAll(decomp); - return decomp; - } - - /** - * @param htmlOut - * @param textOut TODO - * @param script TODO - * @param unicodeset - * @param scriptCode - * @param uca - */ - private void printlnSet(PrintWriter htmlOut, PrintWriter textOut, - String script, String title, UnicodeSet unicodeset, int scriptCode) { - if (unicodeset == null) - return; - int size = unicodeset.size(); - String dir = unicodeset.containsSome(bidiR) - && unicodeset.containsNone(bidiL) ? " dir='rtl'" : ""; - htmlOut.println("" + title + " (" - + nf.format(size) + ")"); - htmlOut.print(""); - // categorization - textOut.println(); - textOut.println("# " + title); - bf.setValueSource(script + " ; " + title); - UnicodeSetIterator usi = new UnicodeSetIterator(); - if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) { - usi.reset(unicodeset); - while (usi.nextRange()) { - if (usi.codepoint == usi.codepointEnd) { - htmlOut.print(formatCode(UTF16 - .valueOf(usi.codepoint))); - } else { - htmlOut.print(formatCode(UTF16 - .valueOf(usi.codepoint)) - + ".. " - + formatCode(UTF16 - .valueOf(usi.codepointEnd))); - } - } - bf.showSetNames(textOut, unicodeset); - } else { - Set reordered = new TreeSet(uca); - usi.reset(unicodeset); - while (usi.next()) { - String x = usi.getString(); - boolean foo = reordered.add(x); - if (!foo) - throw new IllegalArgumentException("Collision with " - + Default.ucd().getCodeAndName(x)); - } - for (Iterator it = reordered.iterator(); it.hasNext();) { - Object key = it.next(); - htmlOut.print(formatCode((String)key)); - } - bf.showSetNames(textOut, reordered); - } - htmlOut.println(""); - } - - /** - * @param string - * @return - */ - private String formatCode(String string) { - int cat = ucd.getCategory(UTF16.charAt(string,0)); - return "" - + (cat == Me || cat == Mn ? 
"\u00A0" : "") //\u25cc - + BagFormatter.toHTMLControl.transliterate(string) - + " "; - } - } - /** * @param inbuffer * @param outbuffer diff --git a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java index 873aa8a7284..b5a9bf024e8 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java +++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java @@ -240,7 +240,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { {"Control", "CN"}, {"Extend", "EX"}, {"Other", "XX"}, - }).swapFirst2ValueAliases()); + }, true).swapFirst2ValueAliases()); add(new UnicodeProperty.UnicodeMapProperty() { { @@ -283,7 +283,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { {"Numeric", "NU"}, {"ExtendNumLet", "EX"}, {"Other", "XX"}, - }).swapFirst2ValueAliases()); + }, true).swapFirst2ValueAliases()); add(new UnicodeProperty.UnicodeMapProperty() { { @@ -335,7 +335,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { {"STerm", "ST"}, {"Close", "CL"}, {"Other", "XX"}, - }).swapFirst2ValueAliases()); + }, false).swapFirst2ValueAliases()); } static String[] YES_NO_MAYBE = {"N", "M", "Y"}; diff --git a/tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt b/tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt new file mode 100644 index 00000000000..97d32b358fa --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt @@ -0,0 +1,34 @@ +# Confusables.txt +# Generated: %date%, MED +# This is a draft list of visually confusable characters, for use in conjunction with the +# recommendations in http://www.unicode.org/reports/tr36/ +# +# To fold using this list, first perform NFKD (if not already performed), +# then map each source character to the target character(s), then perform NFKD again. +# +# The format the standard Unicode semicolon-delimited hex. 
+# ; ; # +# +# The characters may be visually distinguishable in many fonts, or at larger sizes. +# Some anomalies are also introduced by 'closure'. That is, there may be a sequence of +# characters where each is visually confusable with the next, but the start and end are +# visually distinguishable. But when the set is closed, these will all map together. +# +# This is unlike normalization data. There may be no connection between characters other +# than visual confusability. This data should not be used except in assessing visual confusability. +# +# This list is not limited to Unicode Identifier characters (XID_Continue) although the primary +# application will be to such characters. It is also not limited to lowercase characters, +# although the recommendations are to lowercase for security. +# +# Note that some characters have unusual characteristics, and are not yet accounted for. +# For example, U+302E (?) HANGUL SINGLE DOT TONE MARK and U+302F (?) HANGUL DOUBLE DOT TONE MARK +# appear to the left of the previous character. So what looks like "a:b" can actually be "ab\u302F" +# +# WARNING: The data is not final; it is very much a draft at this point, put together from different +# sources that need to be reviewed for accuracy and completeness of the mappings. +# There are still clear errors in the data; do not use this in any implementations. +# Ignore the internal_info field; it will be removed. +# +# Thanks especially to Eric van der Poel for collecting information about fonts using shared glyphs. +# ================================= \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html index 43c5dd69dd1..f7aa80a7eba 100644 --- a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html +++ b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html @@ -86,6 +86,10 @@ Within each subcategory characters are sorted according to the default +

Characters that are normally invisible are represented in the chart by their Unicode number, such as "U+FE00".

+

At the end of this document, there is an additional section that lists all visible non-spacing marks. +These are sorted first by combining character class (modified), then by script, then by code point.

+

For comparison of Indic characters, see indic-trans.html.

Additional Word Characters

This is a draft list of characters based on Section 4 Word Boundaries of UAX# 29, in the diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index 54c04b73e57..de05b700ac1 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2005/03/30 17:19:32 $ -* $Revision: 1.48 $ +* $Date: 2005/05/27 21:39:03 $ +* $Revision: 1.49 $ * ******************************************************************************* */ @@ -336,6 +336,10 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES } public static String fromHex(String p) { + return fromHex(p, false); + } + + public static String fromHex(String p, boolean acceptChars) { StringBuffer output = new StringBuffer(); int value = 0; int count = 0; @@ -357,13 +361,31 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES default: int type = Character.getType(ch); if (type != Character.SPACE_SEPARATOR) { + if (acceptChars) { + if (count >= 4 && count <= 6) { + UTF32.append32(output, value); + count = 0; + value = 0; + } else if (count != 0) { + output.append(p.substring(i-count, i)); // TODO fix supplementary characters + } + UTF32.append32(output, ch); + continue main; + + } throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"", new Object[] {String.valueOf(ch), new Integer(i), p}); } // fall through!! 
case ' ': case ',': case ';': // do SPACE here, just for speed if (count != 0) { - UTF32.append32(output, value); + if (count < 4 || count > 6) { + if (acceptChars) output.append(p.substring(i-count, i)); + else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"", + new Object[] {String.valueOf(ch), new Integer(i), p}); + } else { + UTF32.append32(output, value); + } } count = 0; value = 0; @@ -378,7 +400,13 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES count++; } if (count != 0) { - UTF32.append32(output, value); + if (count < 4 || count > 6) { + if (acceptChars) output.append(p.substring(p.length()-count, p.length())); + else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"", + new Object[] {"EOS", new Integer(p.length()), p}); + } else { + UTF32.append32(output, value); + } } return output.toString(); }