From b120a3251bb5cadc16443db152e262a791a2129c Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Tue, 1 Nov 2005 00:10:54 +0000 Subject: [PATCH] ICU-4700 Misc tools X-SVN-Rev: 18741 --- .../com/ibm/text/UCD/ConvertUCD.java | 257 ++++----- .../com/ibm/text/UCD/GenerateConfusables.java | 48 +- .../com/ibm/text/UCD/GenerateStringPrep.java | 2 +- .../com/ibm/text/UCD/MakeNamesChart.java | 501 ++++++++++++++++++ .../com/ibm/text/UCD/Normalizer.java | 8 +- .../com/ibm/text/UCD/NormalizerSample.java | 2 +- .../com/ibm/text/UCD/QuickTest.java | 107 +++- .../com/ibm/text/UCD/TestIdentifiers.java | 56 ++ .../ibm/text/UCD/TestUnicodeInvariants.java | 2 +- tools/unicodetools/com/ibm/text/UCD/UCD.java | 12 +- .../com/ibm/text/UCD/UCD_Types.java | 6 +- .../com/ibm/text/UCD/UnicodeInvariants.txt | 15 + .../com/ibm/text/utility/ChainException.java | 26 +- .../com/ibm/text/utility/Utility.java | 7 +- 14 files changed, 853 insertions(+), 196 deletions(-) create mode 100644 tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java diff --git a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java index 44b59a229eb..0a45978f322 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $ -* $Date: 2004/11/12 23:17:15 $ -* $Revision: 1.16 $ +* $Date: 2005/11/01 00:10:53 $ +* $Revision: 1.17 $ * ******************************************************************************* */ @@ -396,142 +396,145 @@ public final class ConvertUCD implements UCD_Types { try { String[] parts = new String[20]; for (int lineNumber = 1; ; ++lineNumber) { - line = input.readLine(); - if (line == null) break; - if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'"); + try { + line = input.readLine(); + if (line == null) break; + if (SHOW && (lineNumber % 500) == 0) System.out.println("//" + lineNumber + ": '" + line + "'"); - String original = line; - String comment = ""; - int commentPos = line.indexOf('#'); - if (commentPos >= 0) { - comment = line.substring(commentPos+1).trim(); - line = line.substring(0, commentPos); - } - line = line.trim(); - if (line.length() == 0) continue; + String original = line; + String comment = ""; + int commentPos = line.indexOf('#'); + if (commentPos >= 0) { + comment = line.substring(commentPos+1).trim(); + line = line.substring(0, commentPos); + } + line = line.trim(); + if (line.length() == 0) continue; - int count = Utility.split(line,';',parts); + int count = Utility.split(line,';',parts); - if (false && parts[0].equals("2801")) { - System.out.println("debug?"); - } + if (false && parts[0].equals("2801")) { + System.out.println("debug?"); + } - // fix malformed or simple lists. + // fix malformed or simple lists. - if (count != labels.length) { - if (count == labels.length + 1 && parts[count-1].equals("")) { - if (!showedSemi) System.out.println("Extra semicolon in: " + original); - showedSemi = true; - } else if (count == 1) { // fix simple list - ++count; - parts[1] = "Y"; - } else if (count < labels.length) { - if (!showedShort) System.out.println("Line shorter than labels: " + original); - showedShort = true; - for (int i = count; i < labels.length; ++i) { - parts[i] = ""; - } - } else { - throw new ChainException("wrong count: {0}", - new Object[] {new Integer(line), new Integer(count)}); - } - } + if (count != labels.length) { + if (count == labels.length + 1 && parts[count-1].equals("")) { + if (!showedSemi) System.out.println("Extra semicolon in: " + original); + showedSemi = true; + } else if (count == 1) { // fix simple list + ++count; + parts[1] = "Y"; + } else if (count < labels.length) { + if (!showedShort) System.out.println("Line shorter than labels: " + original); + showedShort = true; + for (int i = count; i < labels.length; ++i) { + parts[i] = ""; + } + } else { + throw new ChainException("wrong count: {0}", + new Object[] {new Integer(line), new Integer(count)}); + } + } - // store char - // first field is always character OR range. May be UTF-32 - int cpTop; - int cpStart; - int ddot = parts[0].indexOf("."); - if (ddot >= 0) { - cpStart = UTF32.char32At(Utility.fromHex(parts[0].substring(0,ddot)),0); - cpTop = UTF32.char32At(Utility.fromHex(parts[0].substring(ddot+2)),0); - // System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop)); - } else { - cpStart = UTF32.char32At(Utility.fromHex(parts[0]),0); - cpTop = cpStart; - if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0); - } + // store char + // first field is always character OR range. May be UTF-32 + int cpTop; + int cpStart; + int ddot = parts[0].indexOf("."); + if (ddot >= 0) { + cpStart = UTF32.char32At(Utility.fromHex(parts[0].substring(0,ddot)),0); + cpTop = UTF32.char32At(Utility.fromHex(parts[0].substring(ddot+2)),0); + // System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop)); + } else { + cpStart = UTF32.char32At(Utility.fromHex(parts[0]),0); + cpTop = cpStart; + if (labels[1].equals("RANGE")) UTF32.char32At(Utility.fromHex(parts[1]),0); + } + // properties first + if (labels[1].equals("PROP")) { + String prop = parts[2].trim(); + // FIX!! + boolean skipLetters = false; + if (prop.equals("Alphabetic")) { + prop = "Other_Alphabetic"; + skipLetters = true; + } + // END FIX!! + properties.add(prop); + if (Utility.find(prop, UCD_Names.DeletedProperties, true) == -1) { // only undeleted + int end = UTF32.char32At(Utility.fromHex(parts[1]),0); + if (end == 0) end = cpStart; + for (int j = cpStart; j <= end; ++j) { + if (j != UCD.mapToRepresentative(j, Integer.MAX_VALUE)) continue; + if (skipLetters && getEntry(cpStart).isLetter()) continue; + appendCharProperties(j, prop); + } + } + } else { // not range! + String val = ""; + String lastVal; - // properties first - if (labels[1].equals("PROP")) { - String prop = parts[2].trim(); - // FIX!! - boolean skipLetters = false; - if (prop.equals("Alphabetic")) { - prop = "Other_Alphabetic"; - skipLetters = true; - } - // END FIX!! - properties.add(prop); - if (Utility.find(prop, UCD_Names.DeletedProperties, true) == -1) { // only undeleted - int end = UTF32.char32At(Utility.fromHex(parts[1]),0); - if (end == 0) end = cpStart; + for (int i = 1; i < labels.length; ++i) { + String key = labels[i]; + lastVal = val; + if (isHex.get(key) != null) { + val = Utility.fromHex(parts[i]); + } else { + val = parts[i].trim(); + } + if (key.equals("OMIT")) continue; // do after val, so lastVal is correct + if (key.equals("RANGE")) continue; // do after val, so lastVal is correct + if (val.equals("")) continue; // skip empty values, they mean default - for (int j = cpStart; j <= end; ++j) { - if (j != UCD.mapToRepresentative(j, Integer.MAX_VALUE)) continue; - if (skipLetters && getEntry(cpStart).isLetter()) continue; - appendCharProperties(j, prop); - } - } - } else { // not range! - String val = ""; - String lastVal; + for (int cps = cpStart; cps <= cpTop; ++cps) { + if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) continue; // skip condensed ranges - for (int i = 1; i < labels.length; ++i) { - String key = labels[i]; - lastVal = val; - if (isHex.get(key) != null) { - val = Utility.fromHex(parts[i]); - } else { - val = parts[i].trim(); - } - if (key.equals("OMIT")) continue; // do after val, so lastVal is correct - if (key.equals("RANGE")) continue; // do after val, so lastVal is correct - if (val.equals("")) continue; // skip empty values, they mean default - - for (int cps = cpStart; cps <= cpTop; ++cps) { - if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) continue; // skip condensed ranges - - if (key.equals("binary")) { - appendCharProperties(cps, val); - } else if (key.equals("fc")) { - UData data = getEntry(cps); - String type = parts[i-1].trim(); - if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) { - data.fullCaseFolding = val; - //System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val)); - } - if (type.equals("S") || type.equals("C") || type.equals("L")) { - data.simpleCaseFolding = val; - //System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val)); - } - if (type.equals("I")) { - data.simpleCaseFolding = val; - setBinaryProperty(cps, CaseFoldTurkishI); - if (DEBUG) System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " - + Utility.hex(cps) + ": " + Utility.hex(val)); - } - } else if (labels[0].equals("SpecialCasing") // special handling for special casing - && labels[4].equals("sc") - && parts[4].trim().length() > 0) { - if (i < 4) { - if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", " - + Utility.hex(key) + ":" + Utility.hex(val)); - addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val); - } - } else { - /*if (key.equals("sn")) { // SKIP UNDEFINED!! - UData data = getEntryIfExists(cps); - if (data == null || data.generalCategory == Cn) continue; - } - */ - addCharData(cps, key, val); - } - } - } - } + if (key.equals("binary")) { + appendCharProperties(cps, val); + } else if (key.equals("fc")) { + UData data = getEntry(cps); + String type = parts[i-1].trim(); + if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) { + data.fullCaseFolding = val; + //System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val)); + } + if (type.equals("S") || type.equals("C") || type.equals("L")) { + data.simpleCaseFolding = val; + //System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val)); + } + if (type.equals("I")) { + data.simpleCaseFolding = val; + setBinaryProperty(cps, CaseFoldTurkishI); + if (DEBUG) System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " + + Utility.hex(cps) + ": " + Utility.hex(val)); + } + } else if (labels[0].equals("SpecialCasing") // special handling for special casing + && labels[4].equals("sc") + && parts[4].trim().length() > 0) { + if (i < 4) { + if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", " + + Utility.hex(key) + ":" + Utility.hex(val)); + addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val); + } + } else { + /*if (key.equals("sn")) { // SKIP UNDEFINED!! + UData data = getEntryIfExists(cps); + if (data == null || data.generalCategory == Cn) continue; + } + */ + addCharData(cps, key, val); + } + } + } + } + } catch (Exception e) { + System.err.println("*Exception at: " + line + ", " + e.getMessage()); + //System.err.println(e.getMessage()); + } } } catch (Exception e) { System.out.println("Exception at: " + line + ", " + e.getMessage()); diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java b/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java index 2b04fd34aaf..e6e5d860380 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateConfusables.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $ -* $Date: 2005/07/19 17:21:00 $ -* $Revision: 1.7 $ +* $Date: 2005/11/01 00:10:53 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -290,7 +290,7 @@ public class GenerateConfusables { lowerIsBetter.putAll(remainingOutputSet, MARK_ASCII); lowerIsBetter.setMissing(MARK_NOT_NFC); - lowerIsBetter.lock(); + lowerIsBetter.freeze(); // add special values: //lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0)); @@ -321,11 +321,11 @@ public class GenerateConfusables { PROHIBITED + NOT_IN_XID); removals2.setMissing("future?"); - additions.lock(); - remap.lock(); - removals.lock(); - reviews.lock(); - removals2.lock(); + additions.freeze(); + remap.freeze(); + removals.freeze(); + reviews.freeze(); + removals2.freeze(); } /** @@ -431,8 +431,8 @@ public class GenerateConfusables { //reviews.putAll(UNASSIGNED, ""); out.print("\uFEFF"); out.println("# Review List for IDN"); - out.println("# $Revision: 1.7 $"); - out.println("# $Date: 2005/07/19 17:21:00 $"); + out.println("# $Revision: 1.8 $"); + out.println("# $Date: 2005/11/01 00:10:53 $"); out.println(""); UnicodeSet fullSet = reviews.getSet("").complement(); @@ -487,8 +487,8 @@ public class GenerateConfusables { PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt"); out.println("# Recommended Identifier Profiles for IDN"); - out.println("# $Revision: 1.7 $"); - out.println("# $Date: 2005/07/19 17:21:00 $"); + out.println("# $Revision: 1.8 $"); + out.println("# $Date: 2005/11/01 00:10:53 $"); out.println(""); out.println("# Output Characters"); @@ -557,8 +557,8 @@ public class GenerateConfusables { "xidmodifications.txt"); out.println("# Security Profile for General Identifiers"); - out.println("# $Revision: 1.7 $"); - out.println("# $Date: 2005/07/19 17:21:00 $"); + out.println("# $Revision: 1.8 $"); + out.println("# $Date: 2005/11/01 00:10:53 $"); out.println(""); out.println("# Characters restricted"); @@ -614,8 +614,8 @@ public class GenerateConfusables { //someRemovals = removals; out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt"); out.println("# Characters restricted in domain names"); - out.println("# $Revision: 1.7 $"); - out.println("# $Date: 2005/07/19 17:21:00 $"); + out.println("# $Revision: 1.8 $"); + out.println("# $Date: 2005/11/01 00:10:53 $"); out.println("#"); out.println("# This file contains a draft list of characters for use in"); out.println("# UTR #36: Unicode Security Considerations"); @@ -1149,8 +1149,8 @@ public class GenerateConfusables { public void writeSource(String directory, String filename) throws IOException { PrintWriter out = BagFormatter.openUTF8Writer(directory, filename); out.println("# Source File for IDN Confusables"); - out.println("# $Revision: 1.7 $"); - out.println("# $Date: 2005/07/19 17:21:00 $"); + out.println("# $Revision: 1.8 $"); + out.println("# $Date: 2005/11/01 00:10:53 $"); out.println(""); dataMixedAnycase.writeSource(out); out.close(); @@ -1160,8 +1160,8 @@ public class GenerateConfusables { PrintWriter out = BagFormatter.openUTF8Writer(directory, filename); out.print('\uFEFF'); out.println("# Recommended confusable mapping for IDN"); - out.println("# $Revision: 1.7 $"); - out.println("# $Date: 2005/07/19 17:21:00 $"); + out.println("# $Revision: 1.8 $"); + out.println("# $Date: 2005/11/01 00:10:53 $"); out.println(""); if (appendFile) { @@ -1369,8 +1369,8 @@ public class GenerateConfusables { UnicodeSet representable = new UnicodeSet(); out.print('\uFEFF'); out.println("# Summary: Recommended confusable mapping for IDN"); - out.println("# $Revision: 1.7 $"); - out.println("# $Date: 2005/07/19 17:21:00 $"); + out.println("# $Revision: 1.8 $"); + out.println("# $Date: 2005/11/01 00:10:53 $"); out.println(""); MyEquivalenceClass data = dataMixedAnycase; Set items = data.getOrderedExplicitItems(); @@ -1494,8 +1494,8 @@ public class GenerateConfusables { PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename); out.print('\uFEFF'); out.println("# Summary: Whole-Script Confusables"); - out.println("# $Revision: 1.7 $"); - out.println("# $Date: 2005/07/19 17:21:00 $"); + out.println("# $Revision: 1.8 $"); + out.println("# $Date: 2005/11/01 00:10:53 $"); out.println("# This data is used for determining whether a strings is a"); out.println("# whole-script or mixed-script confusable."); out.println("# The mappings here ignore common and inherited script characters,"); diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java b/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java index 87b4e357077..cb6f67a3b01 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateStringPrep.java @@ -206,7 +206,7 @@ class GenerateStringPrep implements UCD_Types { return a + "\t" + b; } }; - UnicodeMap sb = ((UnicodeMap)scripts.clone()).composeWith(blocks, myCompose); + UnicodeMap sb = ((UnicodeMap)scripts.cloneAsThawed()).composeWith(blocks, myCompose); for (Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) { System.out.println(it.next()); } diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java b/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java new file mode 100644 index 00000000000..73a2d6ad8d8 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java @@ -0,0 +1,501 @@ +package com.ibm.text.UCD; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.UnicodeMap; +import com.ibm.icu.dev.test.util.UnicodePropertySource; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Replaceable; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ULocale; +import com.ibm.text.utility.Utility; +import com.ibm.text.utility.Utility.Encoding; + +public class MakeNamesChart { + + static int lastCodePoint = -1; + static boolean lastCodePointIsOld = false; + static int lastDecompType = UCD.NONE; + + static final String chartPrefix = "c_"; + static final String namePrefix = "n_"; + + static UnicodeSet skipChars;// = new UnicodeSet("[[:gc=cn:]-[:noncharactercodepoint:]]"); + static UnicodeSet rtl;// = new UnicodeSet("[[:bidiclass=r:][:bidiclass=al:]]"); + static UnicodeSet usePicture;// = new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]"); + + static UCD ucd41; + + public static void main(String[] args) throws Exception { + //ConvertUCD.main(new String[]{"5.0.0"}); + BlockInfo blockInfo = new BlockInfo("5.0.0", "NamesList.txt"); + // http://www.unicode.org/~book/incoming/kenfiles/U50M051010.lst + Default.setUCD("5.0.0"); + ucd41 = UCD.make("4.1.0"); + ToolUnicodePropertySource up = ToolUnicodePropertySource.make("5.0.0"); + skipChars = new UnicodeSet(up.getSet("gc=cn")).removeAll(up.getSet("gc=cn")); + //"[[:gc=cn:]-[:noncharactercodepoint:]]"); + rtl = new UnicodeSet(up.getSet("bidiclass=r")).addAll(up.getSet("bidiclass=al"));// "[[:bidiclass=r:][:bidiclass=al:]]"); + usePicture = new UnicodeSet(up.getSet("whitespace=true")).addAll(up.getSet("defaultignorablecodepoint=true"));// new UnicodeSet("[[:whitespace:][:defaultignorablecodepoint:]]"); + + List nameList = new ArrayList(); + ArrayList lines = new ArrayList(); + UnicodeSet collectedCodePoints = new UnicodeSet(); + BitSet nameListNew = new BitSet(); + + int limit = Integer.MAX_VALUE; + for (int count = 0; count < limit; ++count) { + if (!blockInfo.next(lines)) break; + String firstLine = (String)lines.get(0); + if (firstLine.startsWith("@@@")) continue; + String[] lineParts = firstLine.split("\t"); + String fileName = lineParts[1] + ".html"; + nameList.add(firstLine); + System.out.println(); + System.out.println("file: " + chartPrefix + fileName); + PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", chartPrefix + fileName); + out.println("" + + BagFormatter.toHTML.transliterate(getHeading(lineParts[2])) + + "" + + ""); + + // header + out.println("
" + + lineParts[1] + + " help" + + getHeading(lineParts[2]) + + "index " + + lineParts[3] + + "
"); + + if ("Unassigned".equals(lineParts[2])) { + System.out.println("debug"); + } + // first pass through and collect all the code points + collectedCodePoints.clear(); + for (int i = 1; i < lines.size(); ++i) { + String line = (String)lines.get(i); + int cp1 = line.charAt(0); + if (cp1 != '@' && cp1 != '\t') { + int cp = Integer.parseInt(line.split("\t")[0],16); + collectedCodePoints.add(cp); + } + } + collectedCodePoints.removeAll(skipChars); + if (collectedCodePoints.size() == 0) { + out.println("

No Names List

"); + } else { + out.println("
"); + int counter = 0; + for (UnicodeSetIterator it = new UnicodeSetIterator(collectedCodePoints); it.next();) { + if ((counter % 16) == 0 && counter != 0) { + out.println(""); + } + String tdclass = "cell"; + if (counter < 16) tdclass = "cellw"; + if (it.codepoint == 0x242) { + System.out.println("debug"); + } + boolean isNew = isNew(it.codepoint); + if (isNew) tdclass += "new"; + String hexcp = Utility.hex(it.codepoint, 4); + String title = ""; + String name = Default.ucd().getName(it.codepoint); + if (name != null) title = " title='" + BagFormatter.toHTML.transliterate(name.toLowerCase()) + "'"; + out.println(""); + counter++; + } + if (counter > 16) { + counter &= 0xF; + if (counter != 0) for (; counter < 16; ++counter) out.println(""); + out.println("
\u00A0" + + showChar(it.codepoint) + "\u00A0
" + + hexcp + "
\u00A0
"); + } + } + out.close(); + out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", namePrefix + fileName); + out.println("" + + ""); + + // now do the characters + boolean inTable = false; + for (int i = 1; i < lines.size(); ++i) { + String line = (String)lines.get(i); + try { + if (line.startsWith("@")) { + finishItem(out); + if (inTable) { + out.println(""); + inTable = false; + } + if (line.startsWith("@+")) { + line = line.substring(2).trim(); + out.println("

" + + line + + "

"); + } else { + line = line.substring(1).trim(); + out.println("

" + + line + + "

"); + } + } else { + if (!inTable) { + out.println(""); + inTable = true; + } + //String line2 = lineParts[1]; + if (line.startsWith("\t")) { + String body = line.trim(); + if (false && line.indexOf(body) != 1) { + System.out.println("Format error: too much inital whitespace: <" + line + ">"); + } + char firstChar = body.charAt(0); + switch (firstChar) { + case '*': body = "\u2022 " + body.substring(2); break; + case ':': body = checkCanonical(lastCodePoint, body); break; + case '#': body = checkCompatibility(lastCodePoint, body); break; + case 'x': body = getOther(body); break; + case '=': break; + default: throw new IllegalArgumentException("Huh? " + body); + } + out.println(""); + } else { + finishItem(out); + lineParts = line.split("\t"); + String x = lineParts[0]; + lastCodePoint = Integer.parseInt(x,16); + boolean lastCodePointIsNew = isNew(lastCodePoint); + if (lastCodePointIsNew) nameListNew.set(nameList.size()-1, true); + out.println("" + x + "" + + nameStyle(showTextConvertingHex(lineParts[1], false)) + ""); + lastDecompType = Default.ucd().getDecompositionType(lastCodePoint); + } + } + } catch (Exception e) { + throw (IllegalArgumentException) new IllegalArgumentException("Error on line: " + line) + .initCause(e); + } + } + finishItem(out); + out.close(); + } + blockInfo.in.close(); + PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", "mainList.html"); + out.println("" + + "Main List" + + "
\u00A0\u00A0" + + maybeNameStyle(showTextConvertingHex(body, firstChar != '='), firstChar == '=') + + "
\u00A0" + + showChar(lastCodePoint) + "\u00A0
"); + for (int i = 0; i < nameList.size(); ++i) { + String line = (String) nameList.get(i); + String[] lineParts = line.split("\t"); + String fileName = lineParts[1] + ".html"; + out.println("" + getHeading(lineParts[2]) + ""); + } + out.println("
" + lineParts[1] + + "" + + lineParts[3] +"
"); + out.close(); + BagFormatter bf = new BagFormatter(); + //System.out.println(bf.showSetDifferences("Has name in decomps", hasName, "Has no name in decomps", hasNoName)); + System.out.println("Name differences: Canonical"); + showNameDifferences(hasNameCan, hasNoNameCan); + System.out.println("Name differences: Compatibility"); + showNameDifferences(hasNameComp, hasNoNameComp); +// System.out.println("Characters with names in decomps: " + hasName.toPattern(true)); +// System.out.println("Characters without names in decomps: " + hasNoName.toPattern(true)); +// System.out.println("Characters sometimes with, sometimes without names in decomps: " + both.toPattern(true)); + System.out.println("Done"); + } + + private static boolean isNew(int codepoint) { + return Default.ucd().isAllocated(codepoint) && !ucd41.isAllocated(codepoint); + } + + private static void showNameDifferences(Map hasName, Map hasNoName) { + Set both = new TreeSet(hasNoName.keySet()); + both.retainAll(hasName.keySet()); + //hasNoName.removeAll(both); + //hasName.removeAll(both); + for (Iterator it = both.iterator(); it.hasNext();) { + String decomp = (String) it.next(); + System.out.println(); + System.out.println("decomp: " + Utility.hex(decomp)); + System.out.println("Has name in: " + Utility.hex((String)hasName.get(decomp))); + System.out.println("Has no name in: " + Utility.hex((String)hasNoName.get(decomp))); + } + System.out.println("Count: " + both.size()); + } + + static TestIdentifiers ti; + static { + try { + ti = new TestIdentifiers("L"); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + private static void finishItem(PrintWriter out) { + if (lastCodePoint < 0) return; + if (lastDecompType != UCD.NONE) { + System.out.println("Alert: missing decomp for " + Utility.hex(lastCodePoint)); + } + String str = UTF16.valueOf(lastCodePoint); + String upper = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.UPPER), "\u2191"); + showForm(out, str, upper, null, Default.ucd().getCase(str,UCD.FULL,UCD.TITLE), "\u2195"); + String lower = showForm(out, str, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.LOWER), "\u2193"); + showForm(out, lower, null, null, Default.ucd().getCase(str,UCD.FULL,UCD.FOLD), "\u2194"); + + String dc = Default.ucd().getDecompositionMapping(lastCodePoint); + String nfd = showForm(out, dc, str, null, Default.nfd().normalize(lastCodePoint), "\u21DB"); + //String nfc = showForm(out, dc, null, Default.nfc().normalize(lastCodePoint), "\u21DB"); + String nfkd = showForm(out, dc, str, nfd, Default.nfkd().normalize(lastCodePoint), "\u21DD"); + + if (nfkd.equals(str)) { + Set s = ti.getConfusables(lastCodePoint, "MA"); + if (s.size() > 1) { + sortedSet.clear(); + for (Iterator it = s.iterator(); it.hasNext();) { + sortedSet.add(Default.nfkd().normalize((String)it.next())); + } + sortedSet.remove(nfkd); // remove me + for (Iterator it = sortedSet.iterator(); it.hasNext();) { + String other = (String)it.next(); + if (nfkd.equals(Default.nfkd().normalize(other))) continue; + out.println("\u00A0\u00A0\u279F\u00A0" + + showTextConvertingHex(Utility.hex(other, 4, " + "), true) + + " " + + Default.ucd().getName(other, UCD.NORMAL, " + ").toLowerCase() + // maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), firstChar == '=') + + ""); + } + } + } + lastCodePoint = -1; + } + + static Set sortedSet = new TreeSet(Collator.getInstance(ULocale.ENGLISH)); + + private static String showForm(PrintWriter out, String str, String str2, String str3, String transformed, String symbol) { + if (!transformed.equals(str) && !transformed.equals(str2) && !transformed.equals(str3)) { + out.println("\u00A0\u00A0" + symbol + "\u00A0" + + showTextConvertingHex(Utility.hex(transformed, 4, " + "), true) + + (UTF16.countCodePoint(transformed) != 1 ? "" : + " " + Default.ucd().getName(transformed, UCD.NORMAL, " + ").toLowerCase()) + // maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), firstChar == '=') + + ""); + } + return transformed; + } + + static public String getHeading(String name) { + int pos = name.lastIndexOf(" ("); + if (pos < 0) return name; + return name.substring(0, pos); + } + + private static String maybeNameStyle(String string, boolean b) { + if (b && string.equals(string.toUpperCase(Locale.ENGLISH))) return nameStyle(string); + return string; + } + + + private static String nameStyle(String string) { + // TODO Auto-generated method stub + String result = "" + Default.ucd().getCase(string, UCD.FULL, UCD.TITLE) + ""; + // if it has any &xxx;, then restore them. + int position = 0; + while (true) { + if (!escapeMatch.reset(result).find(position)) break; + int start = escapeMatch.start(); + position = escapeMatch.end(); + result = result.substring(0,start) + + result.substring(start, position).toLowerCase() + + result.substring(position); + } + return result; + } + + static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher(""); + + private static String showTextConvertingHex(String body, boolean addCharToHex) { + body = BagFormatter.toHTML.transliterate(body); + if (addCharToHex) { + int position = 0; + while (position < body.length()) { + if (!findHex.reset(body).find(position)) break; + position = findHex.end(); + int start = findHex.start(); + int len = position - start; + if (len < 4 || len > 6) continue; + int cp = Integer.parseInt(findHex.group(),16); + if (cp > 0x10FFFF) continue; + String insert = "\u00A0" + showChar(cp); + String beginning = body.substring(0,start) + + "" + body.substring(start, position) + "" + + insert; + body = beginning + body.substring(position); + position = beginning.length(); + } + } + return body; + } + + static Matcher pointer = Pattern.compile("x \\((.*) - ([0-9A-F]+)\\)").matcher(""); + static Matcher pointer2 = Pattern.compile("x ([0-9A-F]{4,6})").matcher(""); + static Matcher findHex = Pattern.compile("[0-9A-F]+").matcher(""); + + private static String getOther(String body) { + // of form: x (hyphenation point - 2027) + // => arrow 2027 X hyphenation point + int cp; + String name = null; + if (pointer.reset(body).matches()) { + cp = Integer.parseInt(pointer.group(2),16); + name = pointer.group(1); + String name2 = Default.ucd().getName(cp); + if (name2 == null) name2 = ""; + if (!name.equalsIgnoreCase(name2)) { + System.out.println("Mismatch in name for " + body + " in " + Utility.hex(lastCodePoint)); + System.out.println("\tName is: " + name2); + } + } else if (pointer2.reset(body).matches()) { + cp = Integer.parseInt(pointer2.group(1),16); + // name = UCharacter.getName(cp).toLowerCase(); + // System.out.println("Irregular format: " + body); + } else { + throw new IllegalArgumentException("Bad format: " + body); + } + return "\u2192 " + Utility.hex(cp,4) /*+ " " + showChar(cp)*/ + (name != null ? " " + name : ""); + } + + static String showChar(int cp) { + if (usePicture.contains(cp)) { + int rep = '\u2588'; + if (cp <= 0x20) rep = 0x2400 + cp; + else if (cp == 0x7F) rep = 0x2421; + return "" + (char)rep + ""; + //String hex = Utility.hex(cp); + //return "" + hex + ""; + } + int type = Default.ucd().getCategory(cp); + String result = BagFormatter.toHTML.transliterate(UTF16.valueOf(cp)); + if (type == UCD.Me || type == UCD.Mn) { + result = "\u25CC" + result; + } else if (rtl.contains(cp)) { + result = "\u200E" + result + "\u200E"; + } + return result; + } + + //static final UnicodeSet noname = new UnicodeSet("[[:ascii:][:ideographic:]]"); + static final Map hasNoNameCan = new TreeMap(); + static final Map hasNameCan = new TreeMap(); + static final Map hasNoNameComp = new TreeMap(); + static final Map hasNameComp = new TreeMap(); + + private static String checkCanonical(int codePoint, String body) { + body = body.substring(2); + if (lastDecompType != UCD.CANONICAL) { + System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint)); + } + String lastDecomp = Default.ucd().getDecompositionMapping(lastCodePoint); + String hexed = Utility.hex(lastDecomp, 4, " "); + String hexed2 = hexed; + if (UTF16.countCodePoint(lastDecomp) == 1) { + hexed2 += " " + Default.ucd().getName(lastDecomp).toLowerCase(); + } + if (hexed.equalsIgnoreCase(body)) { + hasNoNameCan.put(lastDecomp, UTF16.valueOf(codePoint)); + } else if (hexed2.equalsIgnoreCase(body)) { + hasNameCan.put(lastDecomp, UTF16.valueOf(codePoint)); + } else { + System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint)); + System.out.println("\tShould be: " + hexed); + } + lastDecompType = UCD.NONE; + return "\u2261 " + body; + } + + private static String checkCompatibility(int codePoint, String body) { + body = body.substring(2); + if (lastDecompType <= UCD.CANONICAL) { + System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint)); + } + String lastDecomp = Default.ucd().getDecompositionMapping(lastCodePoint); + String hexed = Utility.hex(lastDecomp, 4, " "); + if (lastDecompType != UCD.COMPAT_UNSPECIFIED) { + String lastDecompID = Default.ucd().getDecompositionTypeID(lastCodePoint); + hexed = "<" + lastDecompID + "> " + hexed; + } + String hexed2 = hexed; + if (UTF16.countCodePoint(lastDecomp) == 1) { + hexed2 += " " + Default.ucd().getName(lastDecomp).toLowerCase(); + } + if (hexed.equalsIgnoreCase(body)) { + hasNoNameComp.put(lastDecomp, UTF16.valueOf(codePoint)); + } else if (hexed2.equalsIgnoreCase(body)) { + hasNameComp.put(lastDecomp, UTF16.valueOf(codePoint)); + } else { + System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint)); + System.out.println("\tShould be: " + hexed); + } + lastDecompType = UCD.NONE; + return "\u2248 " + body; + } + + static class BlockInfo { + BufferedReader in; + String lastLine; + BlockInfo (String version, String filename) throws IOException { + in = Utility.openUnicodeFile(filename, version, true, Utility.LATIN1_WINDOWS); + //in = BagFormatter.openUTF8Reader(dir, filename); + } + boolean next(List inout) throws IOException { + inout.clear(); + if (lastLine != null) { + inout.add(lastLine); + lastLine = null; + } + while (true) { + String line = in.readLine(); + if (line == null) break; + if (line.startsWith("@@\t")) { + lastLine = line; + break; + } + inout.add(line); + } + return inout.size() > 0; + } + + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java index 163bf2de87a..766bd60a2bf 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java +++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $ -* $Date: 2004/04/17 18:21:39 $ -* $Revision: 1.16 $ +* $Date: 2005/11/01 00:10:54 $ +* $Revision: 1.17 $ * ******************************************************************************* */ @@ -136,7 +136,7 @@ public final class Normalizer implements UCD_Types { /** * Normalizes text according to the chosen form - * @param source the original text, unnormalized + * @param newLocaleID the original text, unnormalized * @return target the resulting normalized text */ public String normalize(int cp) { @@ -157,7 +157,7 @@ public final class Normalizer implements UCD_Types { /** * Does a quick check to see if the string is in the current form. Checks canonical order and * isAllowed(). - * @param source source text + * @param newLocaleID source text * @return YES, NO, MAYBE */ /* diff --git a/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java b/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java index 9c780e844e4..4e35ec49b6c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java +++ b/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java @@ -86,7 +86,7 @@ public class NormalizerSample implements UCD_Types { /** * Normalizes text according to the chosen form - * @param source the original text, unnormalized + * @param newLocaleID the original text, unnormalized * @return target the resulting normalized text */ public String normalize(int cp) { diff --git a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java index fbb4bab907b..6a2a22a1b06 100644 --- a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $ -* $Date: 2005/10/11 19:39:15 $ -* $Revision: 1.7 $ +* $Date: 2005/11/01 00:10:54 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -21,6 +21,7 @@ import com.ibm.icu.dev.test.util.UnicodeMap; import com.ibm.icu.dev.test.util.UnicodeProperty; import com.ibm.icu.dev.test.util.UnicodePropertySource; import com.ibm.icu.dev.test.util.UnicodeMap.MapIterator; +import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; @@ -30,17 +31,92 @@ import com.ibm.text.utility.*; public class QuickTest implements UCD_Types { public static void main(String[] args) throws IOException { - getBidiMirrored(); - if (true) return; - getLengths("NFC", Default.nfc()); - getLengths("NFD", Default.nfd()); - getLengths("NFKC", Default.nfkc()); - getLengths("NFKD", Default.nfkd()); - System.out.println("Done"); + try { + //getBidiMirrored(); + getCaseFoldingUnstable(); + if (true) return; + getHasAllNormalizations(); + getLengths("NFC", Default.nfc()); + getLengths("NFD", Default.nfd()); + getLengths("NFKC", Default.nfkc()); + getLengths("NFKD", Default.nfkd()); + } finally { + System.out.println("Done"); + } } - - + private static void getCaseFoldingUnstable() { + for (int i = 3; i < com.ibm.text.utility.Utility.searchPath.length - 1; ++i) { + String newName = com.ibm.text.utility.Utility.searchPath[i]; + String oldName = com.ibm.text.utility.Utility.searchPath[i+1]; + showMemoryUsage(); + UCD ucdNew = UCD.make(newName); + showMemoryUsage(); + UCD ucdOld = UCD.make(oldName); + showMemoryUsage(); + UnicodeMap differences = new UnicodeMap(); + UnicodeSet differenceSet = new UnicodeSet(); + for (int j = 0; j < 0x10FFFF; ++j) { + if (!ucdOld.isAssigned(j)) continue; + String oldString = ucdOld.getCase(j, UCD.FULL, UCD.FOLD); + String newString = ucdNew.getCase(j, UCD.FULL, UCD.FOLD); + if (!oldString.equals(newString)) { + differenceSet.add(j); + differences.put(j, new String[]{oldString, newString}); + System.out.println("."); + } + } + if (differenceSet.size() != 0) { + System.out.println("Differences in " + com.ibm.text.utility.Utility.searchPath[i]); + for (UnicodeSetIterator it = new UnicodeSetIterator(differenceSet); it.next();) { + System.out.println(ucdNew.getCodeAndName(it.codepoint)); + String[] strings = (String[]) differences.getValue(it.codepoint); + System.out.println("\t" + oldName + ": " + ucdNew.getCodeAndName(strings[0])); + System.out.println("\t" + newName + ": " + ucdNew.getCodeAndName(strings[1])); + } + } + } + } + + static public void showMemoryUsage() { + System.gc(); System.gc(); System.gc(); System.gc(); + System.gc(); System.gc(); System.gc(); System.gc(); + System.gc(); System.gc(); System.gc(); System.gc(); + System.gc(); System.gc(); System.gc(); System.gc(); + System.out.println("total:\t" + Runtime.getRuntime().totalMemory() + ";\tfree:\t" + + Runtime.getRuntime().freeMemory()); + } + + private static void getHasAllNormalizations() { + UnicodeSet items = new UnicodeSet(); + Set s = new LinkedHashSet(); + for (int i = 0; i <= 0x10FFFF; ++i) { + if (!Default.ucd().isAssigned(i)) continue; + if (Default.ucd().getDecompositionType(i) == UCD.NONE) continue; + String source = UTF16.valueOf(i); + String nfc = Default.nfc().normalize(source); + String nfd = Default.nfd().normalize(source); + String nfkd = Default.nfkd().normalize(source); + String nfkc = Default.nfkc().normalize(source); + s.clear(); + s.add(source); + s.add(nfc); + s.add(nfd); + s.add(nfkd); + s.add(nfkc); + if (s.size() > 3) { + System.out.println(Utility.hex(source) + "\t" + Utility.escape(source) + + "\t" + Default.ucd().getName(source) + + "\tnfd\t" + Utility.hex(nfd) + "\t" + Utility.escape(nfd) + + "\tnfc\t" + Utility.hex(nfc) + "\t" + Utility.escape(nfc) + + "\tnfkd\t" + Utility.hex(nfkd) + "\t" + Utility.escape(nfkd) + + "\tnfkc\t" + Utility.hex(nfkc) + "\t" + Utility.escape(nfkc)); + } + } + } + + + private static void getBidiMirrored() { ToolUnicodePropertySource foo = ToolUnicodePropertySource.make(""); UnicodeMap status = new UnicodeMap(); @@ -92,9 +168,10 @@ public class QuickTest implements UCD_Types { UnicodeSet set = status.getSet(value); for (UnicodeSetIterator umi = new UnicodeSetIterator(set); umi.next();) { System.out.println(Utility.hex(umi.codepoint) - + ";\t" + value - + ";\t" + (x.contains(umi.codepoint) ? "O" : "") - + ";\t" + Default.ucd().getName(umi.codepoint)); + + (value.startsWith("*") ? ";\tBidi_Mirrored" : "") + + "\t#\t" + value + //+ ";\t" + (x.contains(umi.codepoint) ? "O" : "") + + "\t" + Default.ucd().getName(umi.codepoint)); } } } @@ -288,6 +365,6 @@ public class QuickTest implements UCD_Types { System.out.println("\tCount:" + set1.size()); System.out.println("\tSet:" + set1.toPattern(true)); System.out.println("\tDetails:"); - Utility.showSetNames("", set1, false, Default.ucd()); + //Utility.showSetNames("", set1, false, Default.ucd()); } } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/TestIdentifiers.java b/tools/unicodetools/com/ibm/text/UCD/TestIdentifiers.java index 19939e7dbde..aa759b788d5 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestIdentifiers.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestIdentifiers.java @@ -4,10 +4,15 @@ import java.io.BufferedReader; import java.io.IOException; import java.util.ArrayList; import java.util.BitSet; +import java.util.HashMap; +import java.util.Iterator; import java.util.List; +import java.util.Map; +import java.util.Set; import com.ibm.icu.dev.test.util.BagFormatter; import com.ibm.icu.dev.test.util.UnicodeMap; +import com.ibm.icu.dev.test.util.XEquivalenceClass; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.UTF16; @@ -34,6 +39,14 @@ public class TestIdentifiers { System.out.print(folded); ti.testItem(folded); } + for (int j = 0; j < tests[i].length(); ++j) { + int cp = tests[i].charAt(j); + Set s = ti.getConfusables(cp, "MA"); + System.out.println(Default.ucd().getCodeAndName(cp)); + for (Iterator it = s.iterator(); it.hasNext();) { + System.out.println("\t= " + Default.ucd().getCodeAndName((String)it.next())); + } + } } } @@ -141,6 +154,49 @@ public class TestIdentifiers { } br.close(); } + + Map type_equivalences; + + void loadConfusables() throws IOException { + BufferedReader br = BagFormatter.openUTF8Reader(indir, + "confusables.txt"); + String line = null; + type_equivalences = new HashMap(); + try { + while (true) { + line = Utility.readDataLine(br); + if (line == null) + break; + if (line.length() == 0) + continue; + String[] pieces = Utility.split(line, ';'); + // part 0 is source code point + String s = Utility.fromHex(pieces[0].trim()); + // part 1 is script1 + String t = Utility.fromHex(pieces[1].trim()); + + String type = pieces[2].trim(); + XEquivalenceClass ec = (XEquivalenceClass) type_equivalences.get(type); + if (ec == null) type_equivalences.put(type, ec = new XEquivalenceClass("")); + ec.add(s, t); + //System.out.println(type + ": " + Default.ucd().getCodeAndName(s) + " => " + Default.ucd().getCodeAndName(t)); + } + } catch (Exception e) { + throw (RuntimeException) new RuntimeException("Failure on line " + + line).initCause(e); + } + br.close(); + } + + public Set getConfusables(int cp, String type) { + try { + if (type_equivalences == null) loadConfusables(); + } catch (IOException e) { + return null; + } + XEquivalenceClass ec = (XEquivalenceClass) type_equivalences.get(type); + return ec.getEquivalences(UTF16.valueOf(cp)); + } void loadWholeScriptConfusables(String filterType) throws IOException { UnicodeSet[][] script_script_set = new UnicodeSet[UScript.CODE_LIMIT][UScript.CODE_LIMIT]; diff --git a/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java b/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java index 8ba02857d50..76b58b96696 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java @@ -73,7 +73,7 @@ public class TestUnicodeInvariants { int variableCount = 0; PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt"); out.write('\uFEFF'); // BOM - BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt"); + BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", "UnicodeInvariants.txt"); BagFormatter bf = new BagFormatter(); bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make("")); BagFormatter bf2 = new BagFormatter(); diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 915ee78a83e..28267b47dd9 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2005/05/02 15:39:53 $ -* $Revision: 1.39 $ +* $Date: 2005/11/01 00:10:54 $ +* $Revision: 1.40 $ * ******************************************************************************* */ @@ -43,7 +43,7 @@ public final class UCD implements UCD_Types { /** * Used for the default version. */ - public static final String latestVersion = "4.1.0"; + public static final String latestVersion = "5.1.0"; /** * Create singleton instance for default (latest) version @@ -158,12 +158,16 @@ public final class UCD implements UCD_Types { * Get the character names for the code points in a string, separated by ", " */ public String getName(String s, byte style) { + return getName(s, style, ", "); + } + + public String getName(String s, byte style, String separator) { if (s.length() == 1) return getName(s.charAt(0), style); // optimize BMP StringBuffer result = new StringBuffer(); int cp; for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(s, i); - if (i > 0) result.append(", "); + if (i > 0) result.append(separator); result.append(getName(cp, style)); } return result.toString(); diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 905996c1e83..997369dc5a1 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2005/03/10 02:37:20 $ -* $Revision: 1.31 $ +* $Date: 2005/11/01 00:10:54 $ +* $Revision: 1.32 $ * ******************************************************************************* */ @@ -15,7 +15,7 @@ package com.ibm.text.UCD; public interface UCD_Types { - static final byte BINARY_FORMAT = 16; // bumped if binary format of UCD changes. Forces rebuild + static final byte BINARY_FORMAT = 17; // bumped if binary format of UCD changes. Forces rebuild public static final String BASE_DIR = "C:\\DATA\\"; public static final String UCD_DIR = BASE_DIR + "UCD\\"; diff --git a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt index 4c0417fefb8..66f001c23a9 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt +++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt @@ -111,6 +111,21 @@ $XID_Continue ! [$Pattern_Whitespace $Pattern_Syntax] $Pattern_Whitespace ! [$XID_Continue $Pattern_Syntax] $Pattern_Syntax ! [$XID_Continue $Pattern_Whitespace] +# Test SA characters + +# They are limited to certain scripts: +Let $SAScripts = [$script:thai $script:lao $script:myanmar $script:khmer] +$SAScripts ⊇ $LineBreak:SA + +# And in those scripts, they are all the alphabetic spacing characters, plus some odd Cf +[$SAScripts & [$Alphabetic $gc:cf]] = [$SAScripts & [$LineBreak:SA $LineBreak:CM]] + +# Try removing M* from alphabetic, and matching to SA +[$SAScripts & [$Alphabetic $gc:cf - $gcAllMarks]] = $LineBreak:SA + +# Try adding M* to alphabetic, and matching to SA +[$SAScripts & [$Alphabetic $gc:cf $gcAllMarks]] = $LineBreak:SA + # testing # [$Pattern_Whitespace $Pattern_Syntax] ! [[^$WB:Format $WB:Other] \u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A] Let $otherword = [\u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A] diff --git a/tools/unicodetools/com/ibm/text/utility/ChainException.java b/tools/unicodetools/com/ibm/text/utility/ChainException.java index b06e75199ea..b52b7a25e08 100644 --- a/tools/unicodetools/com/ibm/text/utility/ChainException.java +++ b/tools/unicodetools/com/ibm/text/utility/ChainException.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/ChainException.java,v $ -* $Date: 2001/12/06 00:05:52 $ -* $Revision: 1.3 $ +* $Date: 2005/11/01 00:10:53 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -20,7 +20,7 @@ import java.io.*; public class ChainException extends RuntimeException { Object[] keyData; String messageFormat; - Exception chain; + //Exception chain; public ChainException (String messageFormat, Object[] objects) { this.messageFormat = messageFormat; @@ -30,20 +30,20 @@ public class ChainException extends RuntimeException { public ChainException (String messageFormat, Object[] objects, Exception chainedException) { this.messageFormat = messageFormat; keyData = objects == null ? null : (Object[]) objects.clone(); - chain = chainedException; + initCause(chainedException); } public String getMessage() { String chainMsg = ""; - if (chain != null) { - chainMsg = "; " + chain.getClass().getName() - + ", " + chain.getMessage(); - StringWriter w = new StringWriter(); - PrintWriter p = new PrintWriter(w); - chain.printStackTrace(p); - chainMsg += ", " + w.getBuffer(); - p.close(); - } +// if (chain != null) { +// chainMsg = "; " + chain.getClass().getName() +// + ", " + chain.getMessage(); +// StringWriter w = new StringWriter(); +// PrintWriter p = new PrintWriter(w); +// chain.printStackTrace(p); +// chainMsg += ", " + w.getBuffer(); +// p.close(); +// } String main = ""; if (keyData != null) main = MessageFormat.format(messageFormat, keyData); return main + chainMsg; diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index e9d3d6ac725..993d37f836a 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2005/06/21 21:28:31 $ -* $Revision: 1.50 $ +* $Date: 2005/11/01 00:10:53 $ +* $Revision: 1.51 $ * ******************************************************************************* */ @@ -700,8 +700,9 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES return result + "}"; } - private static final String[] searchPath = { + public static final String[] searchPath = { "EXTRAS", + "5.0.0", "4.1.0", "4.0.1", "4.0.0",