diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java b/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java
new file mode 100644
index 00000000000..d5c1146ae19
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java
@@ -0,0 +1,480 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
+* $Date: 2005/05/27 21:40:51 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import com.ibm.icu.dev.test.util.ArrayComparator;
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.UnicodeMap;
+import com.ibm.icu.dev.test.util.UnicodePropertySource;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.text.utility.Utility;
+
+public class GenerateConfusables {
+ // Writer for the diagnostic log; assigned in main() before generateConfusables() runs.
+ static PrintWriter log;
+ // Arrow character (U+2192) placed between source and target in generated comments and log lines.
+ static final String ARROW = "\u2192";
+
+ /**
+  * Mutable holder for a mapping target and the count recorded with it.
+  * The constructor does not set the source field.
+  */
+ static class Data2 {
+     String source;
+     String target;
+     int count;
+
+     Data2(String target, int count) {
+         this.count = count;
+         this.target = target;
+     }
+ }
+
+ // Property source created with an empty version string.
+ static ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+ // Union of general categories Cn, Co, Cc and Cf; DataSet.add ignores strings made up only of these.
+ // NOTE(review): addAll is called directly on the set returned by ups.getSet("gc=Cn") - confirm getSet hands out a fresh, unshared set.
+ static UnicodeSet skipSet = ups.getSet("gc=Cn").addAll(ups.getSet("gc=Co")).addAll(ups.getSet("gc=Cc")).addAll(ups.getSet("gc=Cf"));
+
+ /**
+  * One confusable mapping: a source string, a target string and a type label.
+  * The natural order compares target first, then source, then type.
+  */
+ static class Data implements Comparable {
+     String source;
+     String target;
+     String type;
+
+     Data(String source, String target, String type) {
+         this.source = source;
+         this.target = target;
+         this.type = type;
+     }
+
+     /**
+      * @param o the other Data; any other class causes a ClassCastException
+      * @return negative, zero or positive following target, then source, then type
+      */
+     public int compareTo(Object o) {
+         Data other = (Data) o;
+         int byTarget = target.compareTo(other.target);
+         if (byTarget != 0) {
+             return byTarget;
+         }
+         int bySource = source.compareTo(other.source);
+         if (bySource != 0) {
+             return bySource;
+         }
+         return type.compareTo(other.type);
+     }
+ }
+
+ // Characters of general category Cc; DataSet.add prints a warning for mappings containing any of them.
+ static UnicodeSet controls = new UnicodeSet("[:Cc:]");
+
+ static class DataSet {
+ // Every stored mapping (Data), kept in Data's natural order.
+ Set dataSet = new TreeSet();
+ // Index from a {source, target} string pair to its stored Data; used to merge repeated pairs.
+ Map dataMap = new TreeMap(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(), new UTF16.StringComparator()}));
+
+ /**
+  * Filters and canonicalizes one confusable pair, then stores it.
+  * The pair is ignored when either string consists only of skipSet characters or when both
+  * strings have the same NFKD form. A "confusables-" prefix and ".txt" suffix are stripped
+  * from the type. When the NFKD forms differ only in their first code point, the pair is
+  * reduced to those first code points and "-base" is appended to the type.
+  *
+  * @param source one side of the pair
+  * @param target the other side of the pair
+  * @param type label for the mapping, typically the name of the file it came from
+  * @param errorLine text echoed in warnings to identify where the pair came from
+  * @return this data set
+  */
+ public DataSet add(String source, String target, String type, String errorLine) {
+     if (skipSet.containsAll(source) || skipSet.containsAll(target)) {
+         return this;
+     }
+     String nsource = Default.nfkd().normalize(source);
+     String ntarget = Default.nfkd().normalize(target);
+
+     // a pure compatibility match is not recorded
+     if (nsource.equals(ntarget)) {
+         return this;
+     }
+
+     final String typePrefix = "confusables-";
+     final String typeSuffix = ".txt";
+     if (type.startsWith(typePrefix)) {
+         type = type.substring(typePrefix.length());
+     }
+     if (type.endsWith(typeSuffix)) {
+         type = type.substring(0, type.length() - typeSuffix.length());
+     }
+
+     // base + combining sequence => base2 + the same combining sequence: keep only the bases
+     int sourceLead = UTF16.charAt(nsource, 0);
+     String sourceTail = nsource.substring(UTF16.getCharCount(sourceLead));
+     int targetLead = UTF16.charAt(ntarget, 0);
+     String targetTail = ntarget.substring(UTF16.getCharCount(targetLead));
+     boolean sameTail = sourceTail.length() != 0 && sourceTail.equals(targetTail);
+     if (sameTail) {
+         source = UTF16.valueOf(sourceLead);
+         target = UTF16.valueOf(targetLead);
+         type += "-base";
+     }
+
+     // orient the pair
+     if (preferSecondAsSource(source, target)) {
+         String swapped = source;
+         source = target;
+         target = swapped;
+     }
+     if (target.indexOf('\u203D') >= 0) {
+         type += "-skip";
+     }
+     return add(new Data(source, target, type), errorLine);
+ }
+ /**
+  * Stores a mapping, or merges its type into an existing entry with the same source and target.
+  * A warning is printed when either side contains a control character; the mapping is stored anyway.
+  *
+  * @param newData mapping to store
+  * @param errorLine text echoed in the warning to identify where the mapping came from
+  * @return this data set
+  */
+ private DataSet add(Data newData, String errorLine) {
+     boolean hasControl = controls.containsSome(newData.source)
+             || controls.containsSome(newData.target);
+     if (hasControl) {
+         System.out.println("Problem with " + errorLine);
+         System.out.println(getCodeCharName(newData.source) + " => " + getCodeCharName(newData.target));
+     }
+     String[] key = new String[] {newData.source, newData.target};
+     Data existing = (Data) dataMap.get(key);
+     if (existing != null) {
+         // same pair seen before: remember every type it was reported with
+         existing.type = existing.type + "/" + newData.type;
+         return this;
+     }
+     dataSet.add(newData);
+     dataMap.put(key, newData);
+     return this;
+ }
+ // Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"
+ // Input file layouts that addFile distinguishes by file name.
+ static final int NORMAL = 0, FOLDING = 1, OLD = 2;
+
+ /**
+  * Reads one data file and adds every mapping it contains to this set.
+  * Each data line is split on ';'. A file whose name contains "Folding" is read as
+  * "hex source; hex target", and a mapping is skipped when the target equals the first code
+  * point of the NFKD form of the source. Any other file is read as
+  * "hex source; hex target; optional type". The type defaults to the file name.
+  * The reader is closed even when reading or parsing fails.
+  *
+  * @param directory directory containing the file
+  * @param filename name of the file; also the default mapping type
+  * @return this data set
+  * @throws IOException propagated from opening, reading or closing the file
+  */
+ public DataSet addFile(String directory, String filename) throws IOException {
+     BufferedReader in = BagFormatter.openUTF8Reader(directory, filename);
+     try {
+         int kind = NORMAL;
+         if (filename.indexOf("Folding") >= 0) kind = FOLDING;
+         // the "-old" layout is deliberately switched off by the constant false
+         else if (false && filename.indexOf("-old") >= 0) kind = OLD;
+         while (true) {
+             String line = Utility.readDataLine(in);
+             if (line == null) break;
+             if (line.length() == 0) continue;
+             String[] pieces = Utility.split(line,';');
+             if (pieces.length < 2) {
+                 System.out.println("Error on: " + line);
+                 continue;
+             }
+             String type = filename;
+             if (kind==FOLDING) {
+                 String source = Utility.fromHex(pieces[0].trim(),true);
+                 String target = Utility.fromHex(pieces[1].trim(),true);
+                 String nsource = Default.nfkd().normalize(source);
+                 String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
+                 if (!first.equals(target)) {
+                     add(source, target, type, line);
+                 }
+             } else if (kind == OLD) {
+                 // first field is the target, every further field is a source
+                 String target = pieces[0].trim();
+                 for (int i = 1; i < pieces.length; ++i) {
+                     add(pieces[i].trim(), target, type, line);
+                 }
+             } else {
+                 String source = Utility.fromHex(pieces[0].trim(),true);
+                 String target = Utility.fromHex(pieces[1].trim(),true);
+                 if (pieces.length > 2) type = pieces[2].trim();
+                 add(source, target, type, line);
+             }
+         }
+     } finally {
+         // release the file handle on every path, including exceptions
+         in.close();
+     }
+     return this;
+ }
+ /**
+  * Writes every mapping of this set to a UTF-8 file, one line per mapping: hex source,
+  * hex target, type, then a comment with the literal strings and their character names.
+  * The writer is closed even when writing fails.
+  *
+  * @param directory output directory
+  * @param filename output file name
+  * @param appendFile when true, com/ibm/text/UCD/confusablesHeader.txt is passed through
+  *        Utility.appendFile with a "%date%" replacement before the mappings are written
+  * @throws IOException propagated from opening the writer or copying the header
+  */
+ public void write(String directory, String filename, boolean appendFile) throws IOException {
+     PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
+     try {
+         if (appendFile) {
+             String[] replacements = {"%date%", Default.getDate()};
+             Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
+                     Utility.UTF8_WINDOWS, out, replacements);
+         }
+         for (Iterator it = dataSet.iterator(); it.hasNext();) {
+             Data item = (Data) it.next();
+             out.println(
+                     Utility.hex(item.source)
+                     + " ;\t" + Utility.hex(item.target)
+                     + " ;\t" + item.type
+                     + "\t# "
+                     + "( " + item.source + " " + ARROW + " " + item.target + ") "
+                     + Default.ucd().getName(item.source) + " " + ARROW + " "
+                     + Default.ucd().getName(item.target));
+         }
+     } finally {
+         // flush and release the file on every path, including exceptions
+         out.close();
+     }
+ }
+ /**
+ *
+ */
+ public void add(DataSet ds) {
+ for (Iterator it = ds.dataSet.iterator(); it.hasNext();) {
+ add((Data)it.next(), "");
+ }
+ }
+ /**
+  * Produces a cleaned copy of this data set; the entries of this set are only read.
+  * <p>
+  * First pass: entries whose type contains "skip" are dropped, entries whose source or target
+  * is not in NFKD form are logged and dropped, pairs are oriented with preferSecondAsSource,
+  * and a source that would map to two different targets is re-pointed (details go to the log).
+  * Second pass: every code point of each target is replaced by the end of its mapping chain and
+  * the result is NFKD-normalized; the type gains "-rec" when that changed the target.
+  *
+  * @return a new DataSet holding the cleaned mappings
+  */
+ public DataSet clean() {
+     // remove all skips
+     DataSet tempSet = new DataSet();
+     // source string -> the Data currently chosen for that source
+     Map m = new HashMap();
+     for (Iterator it = dataSet.iterator(); it.hasNext();) {
+         Data d = (Data) it.next();
+         if (d.type.indexOf("skip") >= 0) continue;
+         String newTarget = Default.nfkd().normalize(d.target);
+         String newSource = Default.nfkd().normalize(d.source);
+         String type = d.type;
+         if (!d.target.equals(newTarget) || !d.source.equals(newSource)) {
+             // not in NFKD form: log the pair and leave it out of the result
+             type += "-nf";
+             log.println("Norm:\t" + getCodeCharName(d.source) + " " + ARROW + " " + getCodeCharName(newSource));
+             log.println("\t" + getCodeCharName(d.target) + " " + ARROW + " " + getCodeCharName(newTarget) + " \t" + type);
+             continue;
+         }
+         // swap order
+         if (preferSecondAsSource(newSource, newTarget)) {
+             String temp = newTarget;
+             newTarget = newSource;
+             newSource = temp;
+         }
+
+         Data already = (Data) m.get(newSource);
+         if (already != null && !newTarget.equals(already.target)) {
+             // the same source already maps to a different target
+             log.println("X " + getCodeCharName(newSource) + " " + ARROW);
+             log.println("\t" + getCodeCharName(newTarget) + " \t" + type);
+             log.println("\t" + getCodeCharName(already.target) + " \t" + already.type);
+             if (preferSecondAsSource(already.target, newTarget)) {
+                 // just fix new guy
+                 type += "[" + newSource + "]" + already.type;
+                 newSource = newTarget;
+                 newTarget = already.target;
+             } else {
+                 // need to fix new guy, AND fix old guy.
+                 // remove before mutating: the set and index are keyed on source/target
+                 tempSet.remove(already);
+                 type += "[" + newSource + "]" + already.type;
+                 newSource = already.target;
+                 already.type += "[" + already.target + "]" + type;
+                 already.target = newTarget;
+                 tempSet.add(already, "");
+             }
+         }
+         Data newData = new Data(newSource, newTarget, type);
+         m.put(newSource, newData);
+         tempSet.add(newData, "");
+     }
+     // now recursively apply
+     DataSet s = new DataSet();
+     for (Iterator it = tempSet.dataSet.iterator(); it.hasNext();) {
+         Data d = (Data) it.next();
+         int cp = 0;
+         StringBuffer result = new StringBuffer();
+         for (int i = 0; i < d.target.length(); i += UTF16.getCharCount(cp)) {
+             cp = UTF16.charAt(d.target, i);
+             String src = UTF16.valueOf(cp);
+             // follow the chain of mappings until a string with no further mapping is reached
+             // NOTE(review): this loop only ends on a missing entry, so a cycle in m would never terminate - confirm cycles cannot occur.
+             while (true) {
+                 Data rep = (Data) m.get(src);
+                 if (rep == null) break;
+                 src = rep.target;
+             }
+             result.append(src);
+         }
+         String newTarget = result.toString();
+         newTarget = Default.nfkd().normalize(newTarget);
+         // compare against the target before replacement; comparing newTarget with itself
+         // was always true, so "-rec" was never appended
+         s.add(d.source, newTarget, d.type + (newTarget.equals(d.target) ? "" : "-rec"), "");
+     }
+     return s;
+ }
+ /**
+  * Drops a mapping from both the ordered set and the {source, target} index.
+  *
+  * @param already mapping to remove; looked up by its current source and target
+  */
+ private void remove(Data already) {
+     String[] pairKey = new String[] {already.source, already.target};
+     dataMap.remove(pairKey);
+     dataSet.remove(already);
+ }
+ }
+ /**
+  * Entry point: opens log.txt in the generated confusables directory, runs
+  * generateConfusables, and prints "Done" on success.
+  * The log is closed even when generation fails, so what was logged up to that point is not lost.
+  *
+  * @param args unused
+  * @throws IOException propagated from opening the log or from generation
+  */
+ public static void main(String[] args) throws IOException {
+     String indir = Utility.BASE_DIR + "confusables/";
+     String outdir = Utility.GEN_DIR + "confusables/";
+     log = BagFormatter.openUTF8Writer(outdir, "log.txt");
+     try {
+         //fixMichel(indir, outdir);
+         generateConfusables(indir, outdir);
+     } finally {
+         log.close();
+     }
+     System.out.println("Done");
+ }
+ /**
+  * Copies michel/tr36comments-annex.txt to new-tr36comments-annex.txt, dropping every
+  * tab-separated data line whose first field (hex) is not in NFKD form. Lines with fewer than
+  * two fields are copied unchanged. Both files are closed even when copying fails.
+  *
+  * @param indir input base directory
+  * @param outdir output directory
+  * @throws IOException propagated from opening, reading or closing the files
+  */
+ private static void fixMichel(String indir, String outdir) throws IOException {
+     BufferedReader in = BagFormatter.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt");
+     try {
+         PrintWriter out = BagFormatter.openUTF8Writer(outdir, "new-tr36comments-annex.txt");
+         try {
+             while (true) {
+                 String line = Utility.readDataLine(in);
+                 if (line == null) break;
+                 String[] pieces = Utility.split(line,'\t');
+                 if (pieces.length < 2) {
+                     out.println(line);
+                     continue;
+                 }
+                 String source = Utility.fromHex(pieces[0].trim());
+                 if (Default.nfkd().isNormalized(source)) {
+                     out.println(line);
+                 }
+             }
+         } finally {
+             out.close();
+         }
+     } finally {
+         in.close();
+     }
+ }
+ /**
+  * Processes every plain file in the input directory: each is parsed into its own DataSet,
+  * rewritten as "new-" + name, and merged into a total. The total is written as
+  * confusables-raw.txt and its cleaned form as confusables.txt (with the header template).
+  *
+  * @param indir input directory; expected to end with a path separator because file names are appended to it
+  * @param outdir output directory
+  * @throws IOException if indir cannot be listed, or propagated from reading and writing
+  */
+ private static void generateConfusables(String indir, String outdir) throws IOException {
+     File dir = new File(indir);
+     String[] names = dir.list();
+     if (names == null) {
+         // File.list() returns null when the path is not a directory or cannot be read
+         throw new IOException("Cannot list input directory: " + indir);
+     }
+     DataSet total = new DataSet();
+     for (int i = 0; i < names.length; ++i) {
+         if (new File(indir + names[i]).isDirectory()) continue;
+         System.out.println(names[i]);
+         DataSet ds = new DataSet();
+         ds.addFile(indir, names[i]);
+         ds.write(outdir, "new-" + names[i], false);
+         total.add(ds);
+     }
+     total.write(outdir, "confusables-raw.txt", false);
+     DataSet clean = total.clean();
+     clean.write(outdir, "confusables.txt", true);
+ }
+ /*
+ BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt");
+ Set set = new TreeSet(new ArrayComparator(new Comparator[] {new UTF16.StringComparator(),
+ new UTF16.StringComparator()}));
+ while (true) {
+ String line = Utility.readDataLine(in);
+ if (line == null) break;
+ if (line.length() == 0) continue;
+ String[] pieces = Utility.split(line,';');
+ if (pieces.length < 2) {
+ System.out.println("Error on: " + line);
+ continue;
+ }
+ String source = Utility.fromHex(pieces[0].trim());
+ String target = Utility.fromHex(pieces[1].trim());
+ String nsource = Default.nfkd().normalize(source);
+ String first = UTF16.valueOf(UTF16.charAt(nsource, 0));
+ if (!first.equals(target)) {
+ set.add(new String[]{source, target});
+ }
+ }
+ in.close();
+
+ }
+ public static void gen() throws IOException {
+ Map m = new TreeMap();
+ BufferedReader in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables.txt");
+ while (true) {
+ String line = in.readLine();
+ if (line == null) break;
+ String[] pieces = Utility.split(line,';');
+ if (pieces.length < 3) {
+ System.out.println("Error on: " + line);
+ continue;
+ }
+ int codepoint = Integer.parseInt(pieces[1], 16);
+ int cat = Default.ucd().getCategory(codepoint);
+ if (cat == UCD_Types.Co || cat == UCD_Types.Cn) continue; // skip private use
+ if (!Default.nfkd().isNormalized(codepoint)) continue; //skip non NFKC
+ String result = Utility.fromHex(pieces[0]);
+ if (!Default.nfkd().isNormalized(result)) continue; //skip non NFKC
+ int count = Integer.parseInt(pieces[2]);
+ String source = UTF16.valueOf(codepoint);
+ add(m, source, result, count);
+ }
+ in.close();
+
+ in = BagFormatter.openUTF8Reader(Utility.BASE_DIR + "confusables/", "confusables2.txt");
+ while (true) {
+ String line = in.readLine();
+ if (line == null) break;
+ line = line.trim();
+ int pos = line.indexOf("#");
+ if (pos >= 0) line = line.substring(0,pos).trim();
+ if (line.length() == 0) continue;
+ if (line.startsWith("@")) continue;
+ String[] pieces = Utility.split(line,';');
+ if (pieces.length < 2) {
+ System.out.println("Error on: " + line);
+ continue;
+ }
+ String source = pieces[0].trim();
+ for (int i = 1; i < pieces.length; ++i) {
+ add(m, source, pieces[i].trim(), -1);
+ }
+ }
+ in.close();
+
+ boolean gotOne;
+ // close the set
+ do {
+ gotOne = false;
+ for (Iterator it = m.keySet().iterator(); it.hasNext();) {
+ String source = (String) it.next();
+ Data2 data = (Data2) m.get(source);
+ Data2 data2 = (Data2) m.get(data.target);
+ if (data2 == null) continue;
+ data.target = data2.target;
+ gotOne = true;
+ break;
+ }
+ } while (gotOne);
+ // put into different sorting order
+ Set s = new TreeSet();
+ for (Iterator it = m.keySet().iterator(); it.hasNext();) {
+ String source = (String) it.next();
+ Data2 data = (Data2) m.get(source);
+ s.add(new Data(source, data.target, data.count));
+ }
+ // write it out
+ PrintWriter out = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "confusables.txt");
+ String[] replacements = {"%date%", Default.getDate()};
+ Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
+ Utility.UTF8_WINDOWS, out, replacements);
+ for (Iterator it = s.iterator(); it.hasNext();) {
+ Data d = (Data) it.next();
+ if (d == null) continue;
+ out.println(formatLine(d.source, d.target, d.count));
+ }
+
+ out.close();
+ System.out.println("Done");
+ }
+ /**
+ *
+ */
+ /**
+  * Formats one mapping as a data line: hex source, hex target, count, then a comment holding
+  * the literal strings and their character names joined by ARROW.
+  */
+ private static String formatLine(String source, String target, int count) {
+     StringBuffer line = new StringBuffer();
+     line.append(Utility.hex(source)).append(" ; ").append(Utility.hex(target," "));
+     line.append(" ; ").append(count);
+     line.append(" # ");
+     line.append("(").append(source).append(" ").append(ARROW).append(" ").append(target).append(") ");
+     line.append(Default.ucd().getName(source));
+     line.append(" ").append(ARROW).append(" ").append(Default.ucd().getName(target));
+     return line.toString();
+ }
+ /**
+  * Records source -> target in the map after orienting the pair with preferSecondAsSource.
+  * Empty strings are ignored. When the source already maps to a different target, the conflict
+  * is printed and target -> existing target is added instead.
+  *
+  * @param m map from source string to Data2
+  * @param source one side of the pair
+  * @param target the other side of the pair
+  * @param count count stored with a newly created entry
+  */
+ private static void add(Map m, String source, String target, int count) {
+     if (source.length() == 0 || target.length() == 0) {
+         return;
+     }
+     if (preferSecondAsSource(source, target)) {
+         String swapped = source;
+         source = target;
+         target = swapped;
+     }
+     Data2 existing = (Data2) m.get(source);
+     if (existing == null) {
+         m.put(source, new Data2(target, count));
+         return;
+     }
+     if (target.equals(existing.target)) {
+         return;
+     }
+     System.out.println("conflict");
+     System.out.println(formatLine(source, target, count));
+     System.out.println(formatLine(source, existing.target, existing.count));
+     // skip adding this, and instead add result -> other.target
+     add(m, target, existing.target, count);
+ }
+
+ /**
+  * Decides whether a pair should be swapped so that b becomes the source.
+  *
+  * @return true when a has more code points than b, or, with equal counts, when a compares
+  *         lower than b under String.compareTo
+  */
+ static private boolean preferSecondAsSource(String a, String b) {
+     int countA = UTF16.countCodePoint(a);
+     int countB = UTF16.countCodePoint(b);
+     if (countA == countB) {
+         // same length: a lower first string means the second is preferred
+         return a.compareTo(b) < 0;
+     }
+     // different lengths: a longer first string means the second is preferred
+     return countA > countB;
+ }
+
+ /**
+  * Describes a string for log output: its code, the string itself in parentheses, and its name.
+  */
+ static String getCodeCharName(String a) {
+     StringBuffer description = new StringBuffer();
+     description.append(Default.ucd().getCode(a));
+     description.append("( ").append(a).append(" ) ");
+     description.append(Default.ucd().getName(a));
+     return description.toString();
+ }
+
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java b/tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java
new file mode 100644
index 00000000000..d6f563623a4
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java
@@ -0,0 +1,125 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java,v $
+* $Date: 2005/05/27 21:40:51 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+import com.ibm.text.utility.*;
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import java.util.*;
+import java.io.*;
+
+public final class GenerateNamedSequences implements UCD_Types {
+
+ // When true, showVarGlyphs prints its arguments to System.out.
+ static final boolean DEBUG = false;
+
+ /**
+  * Derives an abbreviated shape suffix from the arguments, but as written returns a single space.
+  * NOTE(review): abbShape is computed and never used, and the returned literal may have lost
+  * markup - confirm against the original source.
+  * NOTE(review): shape.substring(0,4) throws for a non-empty shape shorter than four characters.
+  */
+ static public String showVarGlyphs(String code0, String code1, String shape, String description) {
+ if (DEBUG) System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
+
+ String abbShape = "";
+ if (shape.length() != 0) {
+ // '-' followed by the first four characters of the shape name
+ abbShape = '-' + shape.substring(0,4);
+ if (description.indexOf("feminine") >= 0) abbShape += "fem";
+ }
+
+ return " ";
+ }
+
+/*
+# Field 0: the variation sequence
+# Field 1: the description of the desired appearance
+# Field 2: where the appearance is only different in particular shaping environments
+# this field lists them. The possible values are: isolated, initial, medial, final.
+# If more than one is present, there are spaces between them.
+*/
+ /**
+  * Reads the NamedSequences data file, accumulates one table row per data line, and writes an
+  * HTML page under DerivedData/ by filling NamedSequences-Template.html.
+  * NOTE(review): several string literals below look truncated (markup appears to be missing and
+  * two literals span lines) - confirm against the original source before compiling.
+  *
+  * @throws IOException declared for the file reading and writing helpers used here
+  */
+ static public void generate() throws IOException {
+
+
+ // read the data and compose the table
+
+ String table = "
Rep Glyph Hex Sequence Name Copyable ";
+
+ String[] splits = new String[4];
+ String[] codes = new String[20];
+ String[] shapes = new String[4];
+
+ BufferedReader in = Utility.openUnicodeFile("NamedSequences", Default.ucdVersion(), true, Utility.LATIN1);
+ Transliterator unicodexml = Transliterator.getInstance("hex/xml");
+ while (true) {
+ String line = Utility.readDataLine(in);
+ if (line == null) break;
+ line = line.trim();
+ if (line.length() == 0) continue;
+
+ // splits[0] is used as the sequence name, splits[1] as the space-separated hex code points
+ int count = Utility.split(line, ';', splits);
+ String name = splits[0];
+ int codeCount = Utility.split(splits[1], ' ', codes);
+ StringBuffer codeBuffer = new StringBuffer();
+ for (int i = 0; i < codeCount; ++i) {
+ UTF16.append(codeBuffer, Integer.parseInt(codes[i],16));
+ }
+ String codeWithHyphens = splits[1].replaceAll("\\s", "-");
+ String codeAlt = "U+" + splits[1].replaceAll("\\s", " U+");
+ String codeString = unicodexml.transliterate(codeBuffer.toString());
+
+ //
+
+ //table += " \n";
+ String imageName = "images/U" + codeWithHyphens + ".gif";
+ // string comparison on the hex text: sequences starting in 1780..17FF use the tr28 image location
+ if (splits[1].compareTo("1780") >= 0 && splits[1].compareTo("1800") < 0) {
+ String codeNoSpaces2 = splits[1].replaceAll("\\s", "");
+ imageName = "http://www.unicode.org/reports/tr28/images/" + codeNoSpaces2 + ".gif";
+ }
+ table += ""
+ + ""
+ + splits[1] + " "
+ + "" + splits[1] + " "
+ + "" + name + " "
+ + "" + codeString + " "
+ + " \n";
+ System.out.println(splits[1] + "\t" + codeString);
+ }
+ in.close();
+ table += "
";
+
+ // now write out the results
+
+ String directory = "DerivedData/";
+ String filename = directory + "NamedSequences" + UnicodeDataFile.getHTMLFileSuffix(true);
+ PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
+ /*
+ String[] batName = {""};
+ String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
+
+ String version = Default.ucd().getVersion();
+ int lastDot = version.lastIndexOf('.');
+ String updateDirectory = version.substring(0,lastDot) + "-Update";
+ int updateV = version.charAt(version.length()-1) - '0';
+ if (updateV != 0) updateDirectory += (char)('1' + updateV);
+ if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
+ */
+
+ // alternating placeholder / replacement pairs handed to Utility.appendFile
+ String[] replacementList = {
+ "@revision@", Default.ucd().getVersion(),
+ //"@updateDirectory@", updateDirectory,
+ "@date@", Default.getDate(),
+ "@table@", table};
+
+ Utility.appendFile("NamedSequences-Template.html", Utility.UTF8, out, replacementList);
+
+ out.close();
+ //Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
+ }
+}
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java b/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java
new file mode 100644
index 00000000000..2a1230f7f0c
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java
@@ -0,0 +1,515 @@
+/*
+ * Created on May 3, 2005
+ * Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
+ * For terms of use, see http://www.unicode.org/terms_of_use.html
+ */
+package com.ibm.text.UCD;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.ibm.icu.dev.test.util.BagFormatter;
+import com.ibm.icu.dev.test.util.CollectionUtilities;
+import com.ibm.icu.dev.test.util.UnicodeLabel;
+import com.ibm.icu.dev.test.util.UnicodeMap;
+import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.IDNA;
+import com.ibm.icu.text.StringPrepParseException;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.text.UTF16.StringComparator;
+import com.ibm.icu.util.ULocale;
+import com.ibm.text.UCD.GenerateHanTransliterator.MultiComparator;
+import com.ibm.text.UCD.TestData.RegexMatcher;
+import com.ibm.text.utility.Utility;
+
+
+class GenerateStringPrep implements UCD_Types {
+
+ public static void main (String[] args) throws IOException {
+ //checkChars(false);
+ new GenerateStringPrep().genStringPrep();
+ System.out.println("Done");
+ }
+
+ // Per-script character sets, indexed by the script code returned by ucd.getScript.
+ // NOTE(review): fixed capacity of 100 - confirm no script code can reach that value.
+ UnicodeSet[] coreChars = new UnicodeSet[100];
+ // Code points that are not NFD-normalized (filled in genStringPrep).
+ UnicodeSet decomposable = new UnicodeSet();
+ // Code point -> category label for characters listed under "FOR REVIEW".
+ UnicodeMap suspect = new UnicodeMap();
+
+ // Property sources: one made with an empty version string, one made with "3.2.0".
+ ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+ ToolUnicodePropertySource ups32 = ToolUnicodePropertySource.make("3.2.0");
+ //UnicodeSet id_continue = ups.getSet("ID_Continue=true");
+ UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
+ // Extra characters written under "ADDITIONAL WORD CHARACTERS"; populated by the initializer below.
+ UnicodeSet wordChars = new UnicodeSet();
+ {
+ // disabled: modifier letters of category Sk
+ if (false) {
+ wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
+ wordChars.retainAll(ups.getSet("gc=Sk"));
+ }
+ wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
+ " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
+ " \\u055A \\u02B9 \\u02BA]"));
+ //wordChars.removeAll(xid_continue);
+ }
+
+ // NOTE(review): removeAll and complement below are called directly on sets returned by ups.getSet - confirm getSet returns fresh sets.
+ UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
+ UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
+ // Categories Me and Mn, minus Default_Ignorable_Code_Point.
+ UnicodeSet non_spacing = new UnicodeSet(ups.getSet("gc=Me"))
+ .addAll(ups.getSet("gc=Mn"))
+ .removeAll(ups.getSet("Default_Ignorable_Code_Point=true"));
+
+ // Everything outside XID_Continue, except the word characters.
+ UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
+
+ //UnicodeSet[] decompChars = new UnicodeSet[100];
+ UCD ucd = Default.ucd();
+
+ // English collator used as the first comparator of uca.
+ static Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
+ static {
+     // uca0 is static, so it is configured in a static initializer: an instance initializer
+     // would leave the strength unset until the first GenerateStringPrep is constructed.
+     uca0.setStrength(Collator.IDENTICAL);
+ }
+ // Comparator built from uca0 followed by a UTF16.StringComparator.
+ static GenerateHanTransliterator.MultiComparator uca
+     = new GenerateHanTransliterator.MultiComparator(new Comparator[] {
+         uca0, new UTF16.StringComparator()});
+
+ // Characters with Bidi_Class AL or R.
+ UnicodeSet bidiR = new UnicodeSet(
+ "[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
+
+ // Characters with Bidi_Class L.
+ UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
+ // Code points equal to their own full uppercase mapping (filled in genStringPrep).
+ UnicodeSet hasNoUpper = new UnicodeSet();
+ // hasNoUpper without wordChars (assigned in genStringPrep).
+ UnicodeSet hasNoUpperMinus = new UnicodeSet();
+ BagFormatter bf = new BagFormatter();
+ UnicodeSet inIDN = new UnicodeSet();
+ // Code points equal to their own full case folding (filled in genStringPrep).
+ UnicodeSet isCaseFolded = new UnicodeSet();
+
+ /**
+  * Classifies every assigned, non-private-use, non-surrogate code point (IDNA type, case
+  * behavior, script) and writes idn-chars.html, script-chars.html, idn-chars.txt and
+  * idn_vs_cfnfkcid.txt into GEN_DIR.
+  * NOTE(review): some println literals below look truncated (markup appears to be missing and
+  * two literals span lines) - confirm against the original source before compiling.
+  *
+  * @throws IOException declared for the file helpers used here
+  */
+ void genStringPrep() throws IOException {
+ //showScriptToBlock();
+ bf.setShowLiteral(BagFormatter.toHTMLControl);
+ bf.setUnicodePropertyFactory(ups);
+ //bf.setValueSource(UnicodeLabel.NULL);
+ if (false) {
+
+ System.out.println("word chars: " + bf.showSetNames(wordChars));
+ System.out.println("pat: " + bf.showSetNames(patternProp));
+ System.out.println("xid: " + bf.showSetNames(not_xid_continue));
+ }
+ // pass over the whole code space, filling the classification sets
+ for (int cp = 0; cp <= 0x10FFFF; ++cp) {
+ Utility.dot(cp);
+ int cat = Default.ucd().getCategory(cp);
+ if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
+ if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
+ // get IDNA
+ int idnaType = getIDNAType(cp);
+ idnaTypeSet[idnaType].add(cp);
+
+ String str = UTF16.valueOf(cp);
+ if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
+ if (str.equals(ucd.getCase(str, FULL, FOLD))) isCaseFolded.add(cp);
+
+ // scripts
+ int script = ucd.getScript(cp);
+ if (coreChars[script] == null)
+ coreChars[script] = new UnicodeSet();
+ coreChars[script].add(cp);
+ }
+ // fix characters with no uppercase
+ hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
+ System.out.println(bf.showSetNames(hasNoUpper));
+
+ Utility.fixDot();
+ PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
+ PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
+ PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
+ textOut.println('\uFEFF');
+ textOut.println("For documentation, see idn-chars.html");
+
+ Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
+ new String[] {"%date%", Default.getDate()});
+ /*
+ out
+ .println(" ");
+ out.println("IDN Characters ");
+ */
+ htmlOut.println("");
+ htmlOut2.println("");
+
+ // every script except Common and Inherited first, then those two last
+ for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
+ if (scriptCode == COMMON_SCRIPT
+ || scriptCode == INHERITED_SCRIPT)
+ continue;
+ showCodes(htmlOut, textOut, scriptCode, htmlOut2);
+ }
+ showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
+ showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
+
+ showCodes(htmlOut, textOut, non_spacing);
+ htmlOut.println("
");
+ htmlOut.close();
+ htmlOut2.println("
");
+ htmlOut2.close();
+ bf.setMergeRanges(false);
+
+ textOut.println();
+ textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
+ textOut.println();
+ bf.setValueSource("word-chars");
+ bf.showSetNames(textOut, wordChars);
+
+ textOut.println();
+ textOut.println("# *** FOR REVIEW ***");
+ bf.setLabelSource(UnicodeLabel.NULL);
+ // one section per distinct label recorded in suspect
+ for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
+ textOut.println();
+ String value = (String)it.next();
+ bf.setValueSource(value);
+ bf.showSetNames(textOut, suspect.getSet(value));
+ }
+ textOut.close();
+ // textOut is reused for a second file, with a fresh formatter
+ textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn_vs_cfnfkcid.txt");
+ bf = new BagFormatter();
+ bf.setUnicodePropertyFactory(ups);
+ textOut.println();
+ textOut.println("# *** Comparison of IDN with CF_NFKC_ID (case-folded, NFKC, XID), U3.2 only ***");
+ UnicodeSet U32 = ups32.getSet("gc=cn").complement();
+ UnicodeSet CF_NFKC_ID = new UnicodeSet(xid_continue).retainAll(isNFKC).retainAll(isCaseFolded).retainAll(U32);
+ bf.showSetDifferences(textOut, "CF_NFKC_ID", CF_NFKC_ID, "IDN", idnaTypeSet[OK]);
+ textOut.close();
+
+ }
+
+ /**
+  * Debugging aid: prints each available "script TAB block" combination of the composed map,
+  * then always throws IllegalArgumentException.
+  */
+ private void showScriptToBlock() {
+     UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
+     UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
+     UnicodeMap.Composer joinWithTab = new UnicodeMap.Composer() {
+         public Object compose(Object a, Object b) {
+             return a + "\t" + b;
+         }
+     };
+     UnicodeMap scriptAndBlock = ((UnicodeMap)scripts.clone()).composeWith(blocks, joinWithTab);
+     Iterator values = scriptAndBlock.getAvailableValues(new TreeSet()).iterator();
+     while (values.hasNext()) {
+         System.out.println(values.next());
+     }
+     throw new IllegalArgumentException();
+ }
+
+ // Lookup built from script_to_gif by CollectionUtilities.asMap.
+ // NOTE(review): "Han" appears twice in the table; which image is kept depends on asMap - confirm.
+ Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
+
+ // Rows of {script name, image file name}; the trailing comment on each row names a block.
+ static String[][] script_to_gif = {
+
+ {"Common","common.gif"}, //Miscellaneous_Symbols
+ {"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
+ {"Arabic","arabic.gif"}, //Arabic
+ {"Armenian","armenian.gif"}, //Armenian
+ {"Bengali","bengali.gif"}, //Bengali
+ {"Bopomofo","bopomofo.gif"}, //Bopomofo
+ {"Braille","braillesymbols.gif"}, //Braille_Patterns
+ {"Buginese","buginese.gif"}, //Buginese
+ {"Buhid","buhid.gif"}, //Buhid
+ {"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
+ {"Cherokee","cherokee.gif"}, //Cherokee
+ {"Coptic","coptic.gif"}, //Coptic
+ {"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
+ {"Cyrillic","cyrillic.gif"}, //Cyrillic
+ {"Deseret","deseret.gif"}, //Deseret
+ {"Devanagari","devanagari.gif"}, //Devanagari
+ {"Ethiopic","ethiopic.gif"}, //Ethiopic
+ {"Georgian","georgian.gif"}, //Georgian
+ {"Glagolitic","glagolitic.gif"}, //Glagolitic
+ {"Gothic","gothic.gif"}, //Gothic
+ {"Greek","greek.gif"}, //Greek_and_Coptic
+ {"Gujarati","gujarati.gif"}, //Gujarati
+ {"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
+ {"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
+ {"Han","kangxiradicals.gif"}, //Kangxi_Radicals
+ {"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
+ {"Hanunoo","hanunoo.gif"}, //Hanunoo
+ {"Hebrew","hebrew.gif"}, //Hebrew
+ {"Hiragana","hiragana.gif"}, //Hiragana
+ {"Kannada","kannada.gif"}, //Kannada
+ {"Katakana","katakana.gif"}, //Katakana
+ {"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
+ {"Khmer","khmer.gif"}, //Khmer
+ {"Lao","lao.gif"}, //Lao
+ {"Latin","latin.gif"}, //Basic_Latin
+ {"Limbu","limbu.gif"}, //Limbu
+ {"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
+ {"Malayalam","malayalam.gif"}, //Malayalam
+ {"Mongolian","mongolian.gif"}, //Mongolian
+ {"Myanmar","myanmar.gif"}, //Myanmar
+ {"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
+ {"Ogham","ogham.gif"}, //Ogham
+ {"Old_Italic","olditalic.gif"}, //Old_Italic
+ {"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
+ {"Oriya","oriya.gif"}, //Oriya
+ {"Osmanya","osmanya.gif"}, //Osmanya
+ {"Runic","runic.gif"}, //Runic
+ {"Shavian","shavian.gif"}, //Shavian
+ {"Sinhala","sinhala.gif"}, //Sinhala
+ {"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
+ {"Syriac","syriac.gif"}, //Syriac
+ {"Tagalog","tagalog.gif"}, //Tagalog
+ {"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
+ {"Tai_Le","taile.gif"}, //Tai_Le
+ {"Tamil","tamil.gif"}, //Tamil
+ {"Telugu","telugu.gif"}, //Telugu
+ {"Thaana","thaana.gif"}, //Thaana
+ {"Thai","thai.gif"}, //Thai
+ {"Tibetan","tibetan.gif"}, //Tibetan
+ {"Tifinagh","tifinagh.gif"}, //Tifinagh
+ {"Ugaritic","ugaritic.gif"}, //Ugaritic
+ {"Yi","yi.gif"}, //Yi_Syllables
+
+ };
+
+ // One set per IDNA classification, indexed by OK, DELETED, ILLEGAL or REMAPPED.
+ UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
+ {
+ // start every classification with an empty set
+ for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
+ }
+ // Classification codes returned by getIDNAType; IDNA_TYPE_LIMIT is the number of codes.
+ static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
+ /**
+  * Classifies a single code point by round-tripping it through IDNA.convertToASCII and
+  * IDNA.convertToUnicode, reusing the inbuffer, intermediate and outbuffer fields.
+  *
+  * @param cp code point to classify
+  * @return DELETED when the ASCII form is empty, ILLEGAL when either conversion throws,
+  *         REMAPPED when the round trip differs from the input, otherwise OK
+  */
+ private int getIDNAType(int cp) {
+     inbuffer.setLength(0);
+     UTF16.append(inbuffer, cp);
+     try {
+         intermediate = IDNA.convertToASCII(inbuffer, IDNA.DEFAULT); // USE_STD3_RULES
+         if (intermediate.length() == 0) {
+             return DELETED;
+         }
+         outbuffer = IDNA.convertToUnicode(intermediate, IDNA.USE_STD3_RULES);
+     } catch (StringPrepParseException rejected) {
+         return ILLEGAL;
+     } catch (Exception unexpected) {
+         // anything other than a StringPrep rejection is reported, then treated as illegal
+         System.out.println("Failure at: " + Utility.hex(cp));
+         return ILLEGAL;
+     }
+     return TestData.equals(inbuffer, outbuffer) ? OK : REMAPPED;
+ }
+ // Scratch buffers reused by getIDNAType: the input code point, its ASCII form, and the round-tripped result.
+ StringBuffer inbuffer = new StringBuffer();
+ StringBuffer intermediate, outbuffer;
+
+ // Characters with the Lowercase property.
+ UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
+
+ /**
+ * Writes the report section for one script. Does nothing when no characters
+ * were recorded for the script (coreChars[scriptCode] is null). Otherwise it
+ * prints the script name to System.out, writes a script heading to both HTML
+ * writers and to the text writer, and then emits one printlnSet group for
+ * each non-empty category.
+ *
+ * The categories are carved out of a copy of coreChars[scriptCode] by
+ * successive calls to extract, each of which removes what it returns from
+ * its second argument; the order of those calls therefore decides which
+ * category a character ends up in.
+ *
+ * Characters in the no-uppercase and Non-XID categories are additionally
+ * recorded in suspect together with their general category ID, or "XX" when
+ * the character name contains "MUSICAL SYMBOL", "DINGBA" or "RADICAL ".
+ *
+ * @param htmlOut HTML output; receives the heading and every category group
+ * @param textOut text output; receives the heading and every category group
+ * @param scriptCode index of the script in coreChars
+ * @param htmlOut2 second HTML output; receives only the script heading here
+ */
+ private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
+ if (coreChars[scriptCode] == null) return;
+ String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
+ script = Utility.getUnskeleton(script.toLowerCase(),true);
+ System.out.println(script);
+
+ htmlOut.println();
+ String scriptLine = " Script: " + script + " ";
+ htmlOut.println(scriptLine);
+ htmlOut2.println(scriptLine);
+ textOut.println();
+ textOut.println("#*** Script: " + script + " ***");
+ UnicodeSet core = new UnicodeSet(coreChars[scriptCode]); // working copy; the extract calls below shrink it
+
+ UnicodeSet deleted = extract(idnaTypeSet[DELETED], core); // order matters: each extract removes its result from core
+ UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
+ UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
+
+ UnicodeSet remappedIsNFKC = extract(isNFKC, remapped); // taken out of remapped, not out of core
+ UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC); // taken out of remappedIsNFKC
+
+ UnicodeSet decomp = extract(decomposable, core);
+ UnicodeSet pattern = extract(patternProp, core);
+ UnicodeSet non_id = extract(not_xid_continue, core);
+
+ UnicodeSet bicameralNoupper = new UnicodeSet();
+ if (!hasNoUpper.containsAll(core)) { // skipped when every remaining character is in hasNoUpper
+ bicameralNoupper = extract(hasNoUpperMinus, core);
+ }
+
+ UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id); // characters to record in suspect
+ for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
+ String cat = Default.ucd().getCategoryID(it.codepoint);
+ String name = Default.ucd().getName(it.codepoint);
+ if (name.indexOf("MUSICAL SYMBOL") >= 0
+ || name.indexOf("DINGBA") >= 0
+ || name.indexOf("RADICAL ") >= 0
+ ) cat = "XX";
+ suspect.put(it.codepoint, cat);
+ }
+
+ if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode, uca); // core now holds only what no extract call claimed
+ if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode, uca);
+ if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode, uca);
+ if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode, uca);
+ if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode, uca);
+
+ if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode, uca);
+ if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode, uca);
+ if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode, uca);
+ if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode, uca);
+ if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode, uca);
+ }
+ /**
+ * Writes the combining-mark groups under the heading of the script with
+ * index INHERITED_SCRIPT. The heading is written to htmlOut only. For each
+ * type value in the map returned by getPositions, the characters of that
+ * type that are also in non_spacing are passed to printlnSet with the title
+ * "Visible_Combining_Marks_" + type and positionComparator as the ordering.
+ * Types for which no characters remain are skipped.
+ * @param htmlOut HTML output
+ * @param textOut text output, handed on to printlnSet
+ * @param uset not referenced by this method
+ * @throws IOException propagated from getPositions
+ */
+ private void showCodes(PrintWriter htmlOut, PrintWriter textOut, UnicodeSet uset) throws IOException {
+ String script = Default.ucd().getScriptID_fromIndex((byte) INHERITED_SCRIPT);
+ script = Utility.getUnskeleton(script.toLowerCase(),true);
+ String scriptLine = " Script: " + script + " ";
+ htmlOut.println(scriptLine);
+ UnicodeMap m = getPositions();
+
+ for (Iterator it = m.getAvailableValues(new TreeSet(uca)).iterator(); it.hasNext(); ) { // values collected into a TreeSet ordered by uca
+ String type = (String) it.next();
+ UnicodeSet current = m.getSet(type).retainAll(non_spacing); // keep only members of non_spacing
+ if (current.size() == 0) continue;
+ printlnSet(htmlOut, textOut, script, "Visible_Combining_Marks_" + type, current, INHERITED_SCRIPT, positionComparator);
+ }
+ }
+
+ /**
+ * Loads positions.txt into a map from code point to type string.
+ * A line starting with "@" sets the type used for the lines that follow
+ * (the type is "Undetermined" until the first such line). Every other
+ * non-empty line contributes one entry: its first ';'-separated field is
+ * parsed as hex and the first code point of the result is mapped to the
+ * current type.
+ * The reader is always closed before this method returns or throws.
+ * @return the map from code point to type
+ * @throws IOException if the file cannot be opened, read or closed
+ */
+ private UnicodeMap getPositions() throws IOException {
+     UnicodeMap result = new UnicodeMap();
+     BufferedReader in = bf.openUTF8Reader("C:\\DATA\\confusables\\", "positions.txt");
+     try {
+         String type="Undetermined";
+         while (true) {
+             String line = Utility.readDataLine(in);
+             if (line == null) break;
+             if (line.length() == 0) continue;
+             if (line.startsWith("@")) {
+                 // section header: everything after '@' names the type for following lines
+                 type = line.substring(1);
+                 continue;
+             }
+             String[] pieces = Utility.split(line, ';');
+             String code = Utility.fromHex(pieces[0]);
+             result.put(UTF16.charAt(code,0), type);
+         }
+     } finally {
+         // release the file handle even when a line fails to parse
+         in.close();
+     }
+     return result;
+ }
+
+ /**
+ * Orders strings by the name that Default.ucd().getName returns for them.
+ * Both arguments must be Strings.
+ */
+ static Comparator positionComparator = new Comparator() {
+     public int compare(Object o1, Object o2) {
+         String first = (String) o1;
+         String second = (String) o2;
+         String firstName = Default.ucd().getName(first);
+         String secondName = Default.ucd().getName(second);
+         return firstName.compareTo(secondName);
+     }
+ };
+
+ /**
+ * Moves the characters that core shares with other out of core.
+ * @param other the set to intersect with; not modified
+ * @param core the set to split; the shared characters are removed from it
+ * @return a new set holding the characters that were in both sets
+ */
+ private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
+     UnicodeSet shared = new UnicodeSet(core);
+     shared.retainAll(other);
+     core.removeAll(shared);
+     return shared;
+ }
+
+ /**
+ * Writes one titled group of characters to both outputs. Returns at once
+ * when unicodeset is null.
+ *
+ * The HTML output gets the title with the set size (formatted by
+ * TestData.nf) followed by every character run through formatCode. The
+ * text output gets a "# title" comment line followed by the listing
+ * produced by bf.showSetNames, after bf's value source has been set to
+ * script + " ; " + title.
+ *
+ * For HAN_SCRIPT and HANGUL_SCRIPT the set is walked range by range and a
+ * multi-character range is shown as its first and last code point joined
+ * by ".. "; the comparator is not used. For every other script the
+ * characters are first collected into a TreeSet ordered by comparator and
+ * then printed one by one in that order.
+ *
+ * @param htmlOut HTML output
+ * @param textOut text output
+ * @param script script name used in the value source label
+ * @param title group title
+ * @param unicodeset characters to print; may be null
+ * @param scriptCode script index; selects the range-based or sorted layout
+ * @param comparator ordering for the sorted layout
+ * @throws IllegalArgumentException in the sorted layout, if the TreeSet
+ * refuses an element because comparator treats it as equal to one
+ * already added
+ */
+ private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
+ String script, String title, UnicodeSet unicodeset, int scriptCode, Comparator comparator) {
+ if (unicodeset == null)
+ return;
+ int size = unicodeset.size();
+ String dir = unicodeset.containsSome(bidiR) // NOTE(review): dir is not referenced again in this method as shown - confirm whether it was meant to go into the HTML output
+ && unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
+ htmlOut.println("" + title + " ("
+ + TestData.nf.format(size) + ") ");
+ htmlOut.print("");
+ // categorization
+ textOut.println();
+ textOut.println("# " + title);
+ bf.setValueSource(script + " ; " + title);
+ UnicodeSetIterator usi = new UnicodeSetIterator();
+ if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
+ usi.reset(unicodeset);
+ while (usi.nextRange()) { // one iteration per contiguous range
+ if (usi.codepoint == usi.codepointEnd) {
+ htmlOut.print(formatCode(UTF16
+ .valueOf(usi.codepoint)));
+ } else {
+ htmlOut.print(formatCode(UTF16
+ .valueOf(usi.codepoint))
+ + ".. "
+ + formatCode(UTF16
+ .valueOf(usi.codepointEnd)));
+ }
+ }
+ bf.showSetNames(textOut, unicodeset);
+ } else {
+ Set reordered = new TreeSet(comparator);
+ usi.reset(unicodeset);
+ while (usi.next()) {
+ String x = usi.getString();
+ boolean foo = reordered.add(x); // false when the comparator sees x as equal to an earlier element
+ if (!foo)
+ throw new IllegalArgumentException("Collision with "
+ + Default.ucd().getCodeAndName(x));
+ }
+ for (Iterator it = reordered.iterator(); it.hasNext();) {
+ Object key = it.next();
+ htmlOut.print(formatCode((String)key));
+ }
+ bf.showSetNames(textOut, reordered);
+ }
+ htmlOut.println(" ");
+ }
+
+ /**
+ * Formats one character string for the HTML chart: the text is passed
+ * through BagFormatter.toHTMLControl and surrounded by no-break spaces.
+ * When the first code point has category Me or Mn, the padding is doubled
+ * and U+25CC is placed directly before the text.
+ * @param string the character (or string) to format; its first code point
+ * decides the padding
+ * @return the padded text
+ */
+ private String formatCode(String string) {
+     int cat = ucd.getCategory(UTF16.charAt(string,0));
+     boolean isMark = cat == Me || cat == Mn;
+     String leading = isMark ? "\u00A0\u00A0\u25cc" : "\u00A0";
+     String trailing = isMark ? "\u00A0\u00A0" : "\u00A0";
+     String shown = BagFormatter.toHTMLControl.transliterate(string);
+     return "" + leading + shown + trailing + " ";
+ }
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html b/tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html
new file mode 100644
index 00000000000..58056d24f71
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/NamedSequences-Template.html
@@ -0,0 +1,153 @@
+
+
+
+
+
+
+
+
+
+
+Named Sequences
+
+
+
+
+
+
+
+
+
+
+
+ L2-XXX
+ To: UTC
+ From: Mark Davis
+ Date: 2005-04-28
+ One of the original ideas for Unicode 4.1.0 was to produce a NamedSequences.html,
+ following the pattern of StandardizedVariants.html. This document was generated along those
+ lines, but not added into U4.1.0. My suggestion instead is to add this file (with suitable
+ style modifications, of course) as a chart someplace accessible under
+ http://unicode.org/charts/ .
+ Alternatively, we could also combine this with the StandardizedVariants.html to provide
+ a unified chart of sequences, again someplace under
+ http://unicode.org/charts/ .
+ Note: we don't have some of the glyphs quite right yet, but it should be
+ sufficient for discussing the format. One of the innovations is having a separate column of
+ text that is for copy&paste; that needs discussion also.
+
+
+
PROPOSED WORKING DRAFT
+ Named Sequences
+
+
+ Summary
+
+ This file provides a visual display of the named sequences derived from NamedSequences.txt. The
+ proposal is to add this,
+
+
Status
+
+ The file and the files described herein are part of the
+ Unicode Character Database (UCD) and are governed by
+ the UCD Terms of Use stated at the end.
+
+
+
Introduction
+
The tables here exhaustively list the valid, registered named sequences. The columns include a
+ representative glyph, the sequence of code points in hex, and the name of the sequence. In
+ addition, there is a last column entitled Copyable , which contains the literal text forming
+ the sequence. That text can be copied and pasted in elsewhere. The display of the text in this
+ column is up to the capabilities of the browser and the set of available fonts. For more
+ information, see Display Problems? .
+
+ Note: The representative glyphs used to show the named sequences
+ are often derived from different physical fonts than the representative glyphs in the standard.
+ They may therefore exhibit minor differences in size, proportion, style, or weight.
+
+
@table@
+
+
+
Disclaimer
+
+ The Unicode Character Database is provided as is by Unicode, Inc. No claims are made as to
+ fitness for any particular purpose. No warranties of any kind are expressed or implied. The
+ recipient agrees to determine applicability of information provided. If this file has been
+ purchased on magnetic or optical media from Unicode, Inc., the sole remedy for any claim will be
+ exchange of defective media within 90 days of receipt.
+ This disclaimer is applicable for all other data files accompanying the Unicode Character
+ Database, some of which have been compiled by the Unicode Consortium, and some of which have
+ been supplied by other sources.
+
+
Limitations on Rights to Redistribute This Data
+
+ Recipient is granted the right to make copies in any form for internal distribution and to
+ freely use the information supplied in the creation of products supporting the UnicodeTM
+ Standard. The files in the Unicode Character Database can be redistributed to third parties or
+ other organizations (whether for profit or not) as long as this notice and the disclaimer notice
+ are retained. Information can be extracted from these files and used in documentation or
+ programs, as long as there is an accompanying notice indicating the source.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java
index f30eb8c8b1b..324c311fba2 100644
--- a/tools/unicodetools/com/ibm/text/UCD/TestData.java
+++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
-* $Date: 2005/05/02 15:39:53 $
-* $Revision: 1.22 $
+* $Date: 2005/05/27 21:38:51 $
+* $Revision: 1.23 $
*
*******************************************************************************
*/
@@ -46,8 +46,6 @@ public class TestData implements UCD_Types {
public static void main (String[] args) throws IOException {
//checkChars(false);
- new GenStringPrep().genStringPrep();
- if (true) return;
System.out.println("main: " + Default.getDate());
upf = ICUPropertyFactory.make();
@@ -152,404 +150,6 @@ public class TestData implements UCD_Types {
}
Matcher m;
- static class GenStringPrep {
-
- UnicodeSet[] coreChars = new UnicodeSet[100];
- UnicodeSet decomposable = new UnicodeSet();
- UnicodeMap suspect = new UnicodeMap();
-
- ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
- //UnicodeSet id_continue = ups.getSet("ID_Continue=true");
- UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
- UnicodeSet wordChars = new UnicodeSet();
- {
- if (false) {
- wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher()));
- wordChars.retainAll(ups.getSet("gc=Sk"));
- }
- wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
- " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" +
- " \\u055A \\u02B9 \\u02BA]"));
- //wordChars.removeAll(xid_continue);
- }
-
- UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
- UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
-
- UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
-
- //UnicodeSet[] decompChars = new UnicodeSet[100];
- UCD ucd = Default.ucd();
-
- Collator uca0 = Collator.getInstance(ULocale.ENGLISH);
- {
- uca0.setStrength(Collator.IDENTICAL);
- }
- GenerateHanTransliterator.MultiComparator uca
- = new GenerateHanTransliterator.MultiComparator(new Comparator[] {
- uca0, new UTF16.StringComparator()});
-
- UnicodeSet bidiR = new UnicodeSet(
- "[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
-
- UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
- UnicodeSet hasNoUpper = new UnicodeSet();
- UnicodeSet hasNoUpperMinus = new UnicodeSet();
- BagFormatter bf = new BagFormatter();
- UnicodeSet inIDN = new UnicodeSet();
-
- void genStringPrep() throws IOException {
- //showScriptToBlock();
- bf.setShowLiteral(BagFormatter.toHTMLControl);
- //bf.setValueSource(UnicodeLabel.NULL);
- if (false) {
-
- System.out.println("word chars: " + bf.showSetNames(wordChars));
- System.out.println("pat: " + bf.showSetNames(patternProp));
- System.out.println("xid: " + bf.showSetNames(not_xid_continue));
- }
- for (int cp = 0; cp <= 0x10FFFF; ++cp) {
- Utility.dot(cp);
- int cat = Default.ucd().getCategory(cp);
- if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
- if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
- int idnaType = getIDNAType(cp);
- idnaTypeSet[idnaType].add(cp);
- String str = UTF16.valueOf(cp);
- if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
- int script = ucd.getScript(cp);
- if (coreChars[script] == null)
- coreChars[script] = new UnicodeSet();
- coreChars[script].add(cp);
- }
- // fix characters with no uppercase
- hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
- System.out.println(bf.showSetNames(hasNoUpper));
-
- Utility.fixDot();
- PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
- PrintWriter htmlOut2 = BagFormatter.openUTF8Writer(GEN_DIR, "script-chars.html");
- PrintWriter textOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.txt");
- textOut.println('\uFEFF');
- textOut.println("For documentation, see idn-chars.html");
-
- Utility.appendFile("./com/ibm/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut,
- new String[] {"%date%", Default.getDate()});
- /*
- out
- .println(" ");
- out.println("IDN Characters ");
- */
- htmlOut.println("");
- htmlOut2.println("");
-
- for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) {
- if (scriptCode == COMMON_SCRIPT
- || scriptCode == INHERITED_SCRIPT)
- continue;
- showCodes(htmlOut, textOut, scriptCode, htmlOut2);
- }
- showCodes(htmlOut, textOut, COMMON_SCRIPT, htmlOut2);
- showCodes(htmlOut, textOut, INHERITED_SCRIPT, htmlOut2);
- htmlOut.println("
");
- htmlOut.close();
- htmlOut2.println("
");
- htmlOut2.close();
- bf.setMergeRanges(false);
-
- textOut.println();
- textOut.println("# *** ADDITIONAL WORD CHARACTERS ***");
- textOut.println();
- bf.setValueSource("word-chars");
- bf.showSetNames(textOut, wordChars);
-
- textOut.println();
- textOut.println("# *** FOR REVIEW ***");
- bf.setLabelSource(UnicodeLabel.NULL);
- for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
- textOut.println();
- String value = (String)it.next();
- bf.setValueSource(value);
- bf.showSetNames(textOut, suspect.getSet(value));
- }
- textOut.close();
- }
-
- /**
- *
- */
- private void showScriptToBlock() {
- UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap();
- UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap();
- UnicodeMap.Composer myCompose = new UnicodeMap.Composer() {
- public Object compose(Object a, Object b) {
- return a + "\t" + b;
- }
- };
- UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose);
- for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) {
- System.out.println(it.next());
- }
- throw new IllegalArgumentException();
- }
-
- Map scriptToGif = CollectionUtilities.asMap(script_to_gif);
-
- static String[][] script_to_gif = {
-
- {"Common","common.gif"}, //Miscellaneous_Symbols
- {"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks
- {"Arabic","arabic.gif"}, //Arabic
- {"Armenian","armenian.gif"}, //Armenian
- {"Bengali","bengali.gif"}, //Bengali
- {"Bopomofo","bopomofo.gif"}, //Bopomofo
- {"Braille","braillesymbols.gif"}, //Braille_Patterns
- {"Buginese","buginese.gif"}, //Buginese
- {"Buhid","buhid.gif"}, //Buhid
- {"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics
- {"Cherokee","cherokee.gif"}, //Cherokee
- {"Coptic","coptic.gif"}, //Coptic
- {"Cypriot","cypriot.gif"}, //Cypriot_Syllabary
- {"Cyrillic","cyrillic.gif"}, //Cyrillic
- {"Deseret","deseret.gif"}, //Deseret
- {"Devanagari","devanagari.gif"}, //Devanagari
- {"Ethiopic","ethiopic.gif"}, //Ethiopic
- {"Georgian","georgian.gif"}, //Georgian
- {"Glagolitic","glagolitic.gif"}, //Glagolitic
- {"Gothic","gothic.gif"}, //Gothic
- {"Greek","greek.gif"}, //Greek_and_Coptic
- {"Gujarati","gujarati.gif"}, //Gujarati
- {"Gurmukhi","gurmukhi.gif"}, //Gurmukhi
- {"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs
- {"Han","kangxiradicals.gif"}, //Kangxi_Radicals
- {"Hangul","hangulsyllables.gif"}, //Hangul_Syllables
- {"Hanunoo","hanunoo.gif"}, //Hanunoo
- {"Hebrew","hebrew.gif"}, //Hebrew
- {"Hiragana","hiragana.gif"}, //Hiragana
- {"Kannada","kannada.gif"}, //Kannada
- {"Katakana","katakana.gif"}, //Katakana
- {"Kharoshthi","kharoshthi.gif"}, //Kharoshthi
- {"Khmer","khmer.gif"}, //Khmer
- {"Lao","lao.gif"}, //Lao
- {"Latin","latin.gif"}, //Basic_Latin
- {"Limbu","limbu.gif"}, //Limbu
- {"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary
- {"Malayalam","malayalam.gif"}, //Malayalam
- {"Mongolian","mongolian.gif"}, //Mongolian
- {"Myanmar","myanmar.gif"}, //Myanmar
- {"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue
- {"Ogham","ogham.gif"}, //Ogham
- {"Old_Italic","olditalic.gif"}, //Old_Italic
- {"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian
- {"Oriya","oriya.gif"}, //Oriya
- {"Osmanya","osmanya.gif"}, //Osmanya
- {"Runic","runic.gif"}, //Runic
- {"Shavian","shavian.gif"}, //Shavian
- {"Sinhala","sinhala.gif"}, //Sinhala
- {"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri
- {"Syriac","syriac.gif"}, //Syriac
- {"Tagalog","tagalog.gif"}, //Tagalog
- {"Tagbanwa","tagbanwa.gif"}, //Tagbanwa
- {"Tai_Le","taile.gif"}, //Tai_Le
- {"Tamil","tamil.gif"}, //Tamil
- {"Telugu","telugu.gif"}, //Telugu
- {"Thaana","thaana.gif"}, //Thaana
- {"Thai","thai.gif"}, //Thai
- {"Tibetan","tibetan.gif"}, //Tibetan
- {"Tifinagh","tifinagh.gif"}, //Tifinagh
- {"Ugaritic","ugaritic.gif"}, //Ugaritic
- {"Yi","yi.gif"}, //Yi_Syllables
-
- };
-
- UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
- {
- for (int i = 0; i < idnaTypeSet.length; ++i) idnaTypeSet[i] = new UnicodeSet();
- }
- static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;
- /**
- *
- */
- private int getIDNAType(int cp) {
- inbuffer.setLength(0);
- UTF16.append(inbuffer, cp);
- try {
- intermediate = IDNA.convertToASCII(inbuffer,
- IDNA.DEFAULT); // USE_STD3_RULES
- if (intermediate.length() == 0)
- return DELETED;
- outbuffer = IDNA.convertToUnicode(intermediate,
- IDNA.USE_STD3_RULES);
- } catch (StringPrepParseException e) {
- return ILLEGAL;
- } catch (Exception e) {
- System.out.println("Failure at: " + Utility.hex(cp));
- return ILLEGAL;
- }
- if (!TestData.equals(inbuffer, outbuffer))
- return REMAPPED;
- return OK;
- }
- StringBuffer inbuffer = new StringBuffer();
- StringBuffer intermediate, outbuffer;
-
- UnicodeSet lowercase = new UnicodeSet("[:Lowercase:]");
-
- /**
- * @param htmlOut
- * @param textOut TODO
- * @param scriptCode
- * @param htmlOut2 TODO
- * @param ucd
- * @param coreChars
- * @param decompChars
- */
- private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) {
- if (coreChars[scriptCode] == null) return;
- String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
- script = Utility.getUnskeleton(script.toLowerCase(),true);
- System.out.println(script);
-
- htmlOut.println();
- String scriptLine = " Script: " + script + " ";
- htmlOut.println(scriptLine);
- htmlOut2.println(scriptLine);
- textOut.println();
- textOut.println("#*** Script: " + script + " ***");
- UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
-
- UnicodeSet deleted = extract(idnaTypeSet[DELETED], core);
- UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
- UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
-
- UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
- UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
-
- UnicodeSet decomp = extract(decomposable, core);
- UnicodeSet pattern = extract(patternProp, core);
- UnicodeSet non_id = extract(not_xid_continue, core);
-
- UnicodeSet bicameralNoupper = new UnicodeSet();
- if (!hasNoUpper.containsAll(core)) {
- bicameralNoupper = extract(hasNoUpperMinus, core);
- }
-
- UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
- for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
- String cat = Default.ucd().getCategoryID(it.codepoint);
- String name = Default.ucd().getName(it.codepoint);
- if (name.indexOf("MUSICAL SYMBOL") >= 0
- || name.indexOf("DINGBA") >= 0
- || name.indexOf("RADICAL ") >= 0
- ) cat = "XX";
- suspect.put(it.codepoint, cat);
- }
-
- if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
- if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode);
- if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
- if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
- if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "NFD-Decomposable", decomp, scriptCode);
-
- if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode);
- if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode);
- if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode);
- if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
- if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Prohibited", illegal, scriptCode);
- }
-
- /**
- *
- */
- private UnicodeSet extract(UnicodeSet other, UnicodeSet core) {
- UnicodeSet decomp = new UnicodeSet(core).retainAll(other);
- core.removeAll(decomp);
- return decomp;
- }
-
- /**
- * @param htmlOut
- * @param textOut TODO
- * @param script TODO
- * @param unicodeset
- * @param scriptCode
- * @param uca
- */
- private void printlnSet(PrintWriter htmlOut, PrintWriter textOut,
- String script, String title, UnicodeSet unicodeset, int scriptCode) {
- if (unicodeset == null)
- return;
- int size = unicodeset.size();
- String dir = unicodeset.containsSome(bidiR)
- && unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
- htmlOut.println("" + title + " ("
- + nf.format(size) + ") ");
- htmlOut.print("");
- // categorization
- textOut.println();
- textOut.println("# " + title);
- bf.setValueSource(script + " ; " + title);
- UnicodeSetIterator usi = new UnicodeSetIterator();
- if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
- usi.reset(unicodeset);
- while (usi.nextRange()) {
- if (usi.codepoint == usi.codepointEnd) {
- htmlOut.print(formatCode(UTF16
- .valueOf(usi.codepoint)));
- } else {
- htmlOut.print(formatCode(UTF16
- .valueOf(usi.codepoint))
- + ".. "
- + formatCode(UTF16
- .valueOf(usi.codepointEnd)));
- }
- }
- bf.showSetNames(textOut, unicodeset);
- } else {
- Set reordered = new TreeSet(uca);
- usi.reset(unicodeset);
- while (usi.next()) {
- String x = usi.getString();
- boolean foo = reordered.add(x);
- if (!foo)
- throw new IllegalArgumentException("Collision with "
- + Default.ucd().getCodeAndName(x));
- }
- for (Iterator it = reordered.iterator(); it.hasNext();) {
- Object key = it.next();
- htmlOut.print(formatCode((String)key));
- }
- bf.showSetNames(textOut, reordered);
- }
- htmlOut.println(" ");
- }
-
- /**
- * @param string
- * @return
- */
- private String formatCode(String string) {
- int cat = ucd.getCategory(UTF16.charAt(string,0));
- return ""
- + (cat == Me || cat == Mn ? "\u00A0" : "") //\u25cc
- + BagFormatter.toHTMLControl.transliterate(string)
- + " ";
- }
- }
-
/**
* @param inbuffer
* @param outbuffer
diff --git a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
index 873aa8a7284..b5a9bf024e8 100644
--- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
+++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java
@@ -240,7 +240,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
{"Control", "CN"},
{"Extend", "EX"},
{"Other", "XX"},
- }).swapFirst2ValueAliases());
+ }, true).swapFirst2ValueAliases());
add(new UnicodeProperty.UnicodeMapProperty() {
{
@@ -283,7 +283,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
{"Numeric", "NU"},
{"ExtendNumLet", "EX"},
{"Other", "XX"},
- }).swapFirst2ValueAliases());
+ }, true).swapFirst2ValueAliases());
add(new UnicodeProperty.UnicodeMapProperty() {
{
@@ -335,7 +335,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
{"STerm", "ST"},
{"Close", "CL"},
{"Other", "XX"},
- }).swapFirst2ValueAliases());
+ }, false).swapFirst2ValueAliases());
}
static String[] YES_NO_MAYBE = {"N", "M", "Y"};
diff --git a/tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt b/tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt
new file mode 100644
index 00000000000..97d32b358fa
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/confusablesHeader.txt
@@ -0,0 +1,34 @@
+# Confusables.txt
+# Generated: %date%, MED
+# This is a draft list of visually confusable characters, for use in conjunction with the
+# recommendations in http://www.unicode.org/reports/tr36/
+#
+# To fold using this list, first perform NFKD (if not already performed),
+# then map each source character to the target character(s), then perform NFKD again.
+#
+# The format is the standard Unicode semicolon-delimited hex.
+# ; ; #
+#
+# The characters may be visually distinguishable in many fonts, or at larger sizes.
+# Some anomalies are also introduced by 'closure'. That is, there may be a sequence of
+# characters where each is visually confusable from the next, but the start and end are
+# visually distinguishable. But when the set is closed, these will all map together.
+#
+# This is unlike normalization data. There may be no connection between characters other
+# than visual confusability. This data should not be used except in assessing visual confusability.
+#
+# This list is not limited to Unicode Identifier characters (XID_Continue) although the primary
+# application will be to such characters. It is also not limited to lowercase characters,
+# although the recommendations are to lowercase for security.
+#
+# Note that some characters have unusual characteristics, and are not yet accounted for.
+# For example, U+302E (〮) HANGUL SINGLE DOT TONE MARK and U+302F (〯) HANGUL DOUBLE DOT TONE MARK
+# appear to the left of the previous character. So what looks like "a:b" can actually be "ab\u302F"
+#
+# WARNING: The data is not final; it is very draft at this point, put together from different
+# sources that need to be reviewed for accuracy and completeness of the mappings.
+# There are still clear errors in the data; do not use this in any implementations.
+# Ignore the internal_info field; it will be removed.
+#
+# Thanks especially to Eric van der Poel for collecting information about fonts using shared glyphs.
+# =================================
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
index 43c5dd69dd1..f7aa80a7eba 100644
--- a/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
+++ b/tools/unicodetools/com/ibm/text/UCD/idn-charsHeader.html
@@ -86,6 +86,10 @@ Within each subcategory characters are sorted according to the default
+Characters that are normally invisible are represented in the chart by their Unicode number, such as "U+FE00".
+At the end of this document, there is an additional section that lists all visible non-spacing marks .
+These are sorted first by combining character class (modified), then by script, then by code point.
+For comparison of Indic characters, see indic-trans.html .
This is a draft list of characters based on Section 4 Word Boundaries of
UAX# 29 , in the
diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java
index 54c04b73e57..de05b700ac1 100644
--- a/tools/unicodetools/com/ibm/text/utility/Utility.java
+++ b/tools/unicodetools/com/ibm/text/utility/Utility.java
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
-* $Date: 2005/03/30 17:19:32 $
-* $Revision: 1.48 $
+* $Date: 2005/05/27 21:39:03 $
+* $Revision: 1.49 $
*
*******************************************************************************
*/
@@ -336,6 +336,10 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
}
public static String fromHex(String p) {
+ return fromHex(p, false);
+ }
+
+ public static String fromHex(String p, boolean acceptChars) {
StringBuffer output = new StringBuffer();
int value = 0;
int count = 0;
@@ -357,13 +361,31 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
default:
int type = Character.getType(ch);
if (type != Character.SPACE_SEPARATOR) {
+ if (acceptChars) { // lenient mode: pass a non-hex character through instead of failing
+     if (count >= 4 && count <= 6) {
+         UTF32.append32(output, value); // pending digits form a code point
+     } else if (count != 0) {
+         output.append(p.substring(i-count, i)); // not 4-6 digits: keep them as literal text. TODO fix supplementary characters
+     }
+     count = 0; // reset in both cases, so digits already emitted are not emitted or re-interpreted again
+     value = 0;
+     UTF32.append32(output, ch);
+     continue main;
+
+ }
throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
new Object[] {String.valueOf(ch), new Integer(i), p});
}
// fall through!!
case ' ': case ',': case ';': // do SPACE here, just for speed
if (count != 0) {
- UTF32.append32(output, value);
+ if (count < 4 || count > 6) {
+ if (acceptChars) output.append(p.substring(i-count, i));
+ else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
+ new Object[] {String.valueOf(ch), new Integer(i), p});
+ } else {
+ UTF32.append32(output, value);
+ }
}
count = 0;
value = 0;
@@ -378,7 +400,13 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
count++;
}
if (count != 0) {
- UTF32.append32(output, value);
+ if (count < 4 || count > 6) {
+ if (acceptChars) output.append(p.substring(p.length()-count, p.length()));
+ else throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"",
+ new Object[] {"EOS", new Integer(p.length()), p});
+ } else {
+ UTF32.append32(output, value);
+ }
}
return output.toString();
}