diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index 89ed49811a5..6421c50c28b 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2005/06/08 01:44:48 $ -* $Revision: 1.42 $ +* $Date: 2006/04/05 22:12:46 $ +* $Revision: 1.43 $ * ******************************************************************************* */ @@ -18,6 +18,7 @@ import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.CanonicalIterator; import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.TransliteratorUtilities; import com.ibm.icu.dev.test.util.UnicodeProperty; import com.ibm.icu.dev.test.util.UnicodePropertySource; import com.ibm.icu.impl.UCharacterProperty; @@ -33,7 +34,6 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import com.ibm.text.UCD.*; -import com.ibm.text.UCD.UCD_Types; import com.ibm.text.utility.*; import com.ibm.text.UCD.Normalizer; @@ -4104,8 +4104,8 @@ F900..FAFF; CJK Compatibility Ideographs bf.setLineSeparator("
\r\n"); ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(""); bf.setUnicodePropertyFactory(ups); - bf.setShowLiteral(bf.toHTML); - bf.setFixName(bf.toHTML); + bf.setShowLiteral(TransliteratorUtilities.toHTML); + bf.setFixName(TransliteratorUtilities.toHTML); UCD ucd = Default.ucd(); UnicodeProperty cat = ups.getProperty("gc"); UnicodeSet ucd410 = cat.getSet("Cn") diff --git a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt index ef6ad4e18fa..f7d71d141b3 100644 --- a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt @@ -10,7 +10,7 @@ # The data supports both implementations that require simple case foldings # (where string lengths don't change), and implementations that allow full case folding # (where string lengths may grow). Note that where they can be supported, the -# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. +# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. # # All code points not listed in this file map to themselves. 
# diff --git a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java index 0a45978f322..c29405ececb 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $ -* $Date: 2005/11/01 00:10:53 $ -* $Revision: 1.17 $ +* $Date: 2006/04/05 22:12:44 $ +* $Revision: 1.18 $ * ******************************************************************************* */ @@ -840,6 +840,13 @@ public final class ConvertUCD implements UCD_Types { } else if (fieldName.equals("gc")) { uData.generalCategory = Utility.lookup(fieldValue, UCD_Names.GENERAL_CATEGORY, true); +// if (major >= 5 && uData.script == Unknown_Script +// && uData.generalCategory != Cn +// && uData.generalCategory != Cs +// && uData.generalCategory != Co) { +// uData.script = COMMON_SCRIPT; +// System.out.println("Resetting to Common Script: " + Utility.hex(uData.codePoint)); +// } } else if (fieldName.equals("bc")) { uData.bidiClass = Utility.lookup(fieldValue, UCD_Names.BIDI_CLASS, true); } else if (fieldName.equals("dt")) { @@ -878,8 +885,17 @@ public final class ConvertUCD implements UCD_Types { uData.numericValue = Utility.doubleFrom(fieldValue); } else if (fieldName.equals("cc")) { uData.combiningClass = (byte)Utility.intFrom(fieldValue); + if (uData.combiningClass == 9 && major >= 5) { + System.out.println("setting Grapheme_Link " + Utility.hex(uData.codePoint) + "\t" + uData.name); + uData.binaryProperties |= (1<= 5 && (uData.binaryProperties & 1<" + pad1 - + BagFormatter.toHTMLControl.transliterate(string) + + TransliteratorUtilities.toHTMLControl.transliterate(string) + pad + " "; } diff --git a/tools/unicodetools/com/ibm/text/UCD/IDNTester.java b/tools/unicodetools/com/ibm/text/UCD/IDNTester.java new file mode 100644 index 
00000000000..6c47617a913 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/IDNTester.java @@ -0,0 +1,142 @@ +package com.ibm.text.UCD; + +import java.io.IOException; +import java.io.PrintWriter; + +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.impl.PrettyPrinter; +import com.ibm.icu.text.IDNA; +import com.ibm.icu.text.StringPrepParseException; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.text.utility.Utility; + +public class IDNTester { + static StringBuffer inbuffer = new StringBuffer(); + static StringBuffer intermediate, outbuffer; + static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4; + static UnicodeSet IDNInputOnly = new UnicodeSet(); + static UnicodeSet IDNOutput = new UnicodeSet(); + static boolean initialized = false; + static UnicodeSet IDInputOnly32 = new UnicodeSet(); + static UnicodeSet IDOutput32 = new UnicodeSet(); + static UnicodeSet IDInputOnly50 = new UnicodeSet(); + static UnicodeSet IDOutput50 = new UnicodeSet(); + static PrettyPrinter pp = new PrettyPrinter(); + static PrintWriter pw; + + public static void main(String[] args) throws IOException { + initialize(); + pw = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "idnCount.html"); + pw.println(""); + showSet("IDN InputOnly: ", IDNInputOnly); + showSet("IDN Output: ", IDNOutput); + showSet("ID InputOnly, U3.2: ", IDInputOnly32); + showSet("ID Output, U3.2: ", IDOutput32); + + showSet("IDN Output - ID Output, U3.2: ", new UnicodeSet(IDNOutput).removeAll(IDOutput32)); + showSet("IDN Output & ID Output, U3.2: ", new UnicodeSet(IDNOutput).retainAll(IDOutput32)); + showSet("ID Output - IDN Output, U3.2: ", new UnicodeSet(IDOutput32).removeAll(IDNOutput)); + + showSet("ID InputOnly, U5.0: ", IDInputOnly50); + showSet("ID Output, U5.0: ", IDOutput50); + showSet("ID Output, U5.0 - U3.2: ", new UnicodeSet(IDOutput50).removeAll(IDOutput32)); + + pw.println(""); + + pw.close(); + } + + public static void 
showSet(String title, UnicodeSet set) { + pw.println("

" + title + set.size() + "

" + "

" + pp.toPattern(set) + "

"); + pw.println(); + } + + static UnicodeSet getIDNInput() { + if (!initialized) initialize(); + return IDNInputOnly; + } + + static UnicodeSet getIDNOutput() { + if (!initialized) initialize(); + return IDNInputOnly; + } + + private static void initialize() { + UnicodeSet oddballs = new UnicodeSet("[\u034F \u180B-\u180D \uFE00-\uFE0F _]"); + UCD U32 = UCD.make("3.2.0"); + Normalizer nfkc32 = new Normalizer(Normalizer.NFKC, "3.2.0"); + UCDProperty xid32 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf,U32); + UnicodeSet IDInput32 = xid32.getSet(); + IDInput32.add('-').removeAll(oddballs); + + UCD U50 = UCD.make("5.0.0"); + Normalizer nfkc50 = new Normalizer(Normalizer.NFKC, "5.0.0"); + UCDProperty xid50 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf,U50); + UnicodeSet IDInput50 = xid50.getSet(); + IDInput50.add('-').removeAll(oddballs); + + for (int i = 0; i < 0x10FFFF; ++i) { + if ((i & 0xFFF) == 0) { + System.out.println(i); + System.out.flush(); + } + int type = getIDNAType(i); + if (type == OK) { + IDNOutput.add(i); + } else if (type != ILLEGAL) { + IDNInputOnly.add(i); + } + if (IDInput32.contains(i)) { + splitSet(IDInputOnly32, IDOutput32, U32, nfkc32, i); + } + if (IDInput50.contains(i)) { + splitSet(IDInputOnly50, IDOutput50, U50, nfkc50, i); + } + } + initialized = true; + } + + private static void splitSet(UnicodeSet inputOnlySet, UnicodeSet outputSet, UCD ucd, Normalizer nfkc, int i) { + if (i < 0x7F) { + outputSet.add(i); + return; + } + String v = UTF16.valueOf(i); + String s = ucd.getCase(i, UCD.FULL, UCD.FOLD); + if (s.equals(v)) { + s = nfkc.normalize(s); + if (s.equals(v)) { + s = ucd.getCase(s, UCD.FULL, UCD.FOLD); + if (s.equals(v)) { + outputSet.add(i); + return; + } + } + } + inputOnlySet.add(i); + } + + static public int getIDNAType(int cp) { + if (cp == '-') return OK; + inbuffer.setLength(0); + UTF16.append(inbuffer, cp); + try { + intermediate = IDNA.convertToASCII(inbuffer, + IDNA.DEFAULT); // USE_STD3_RULES + if (intermediate.length() 
== 0) + return DELETED; + outbuffer = IDNA.convertToUnicode(intermediate, + IDNA.USE_STD3_RULES); + } catch (StringPrepParseException e) { + return ILLEGAL; + } catch (Exception e) { + System.out.println("Failure at: " + Utility.hex(cp)); + return ILLEGAL; + } + if (!TestData.equals(inbuffer, outbuffer)) + return REMAPPED; + return OK; + } + +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/InvariantTest.txt b/tools/unicodetools/com/ibm/text/UCD/InvariantTest.txt new file mode 100644 index 00000000000..29e73dd2ae8 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/InvariantTest.txt @@ -0,0 +1,75 @@ +Let $letter = [$gc:Lu $gc:Ll $gc:Lt $gc:Lo $gc:Lm]; +Let $number = [$gc:Nd $gc:Nl $gc:No] +Let $mark = [$gc:mn $gc:me $gc:mc] +Let $LMN = [$letter $number $mark] +Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation] +Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol] +Let $nfc = [^$NFC_Quick_Check:No] + +Show $nfc + +Show [$alphabetic - [$mark $letter $number]] + + +Let $oldCJK = [\u1100-\u11FF \u3040-\u30FF \u3130-\u318F \u31F0-\u31FF \u3400-\u4DBF \u4E00-\u9FFF \uAC00-\uD7AF \uF900-\uFAFF \uFF65-\uFFDC] + +Show [$oldCJK & $gc:cn] + +Let $fixedOld = [$oldCJK-$gc:cn] + + +#List the non-alphabetic old items +#Show [$oldCJK-$gc:cn-$alphabetic] + +#Check for differences +#Test $fixedOld = $trialNew + +#ShowEach $mark + +Let $uax29_outliers = [\u3031-\u3035 \u309B-\u309C \u30A0 \u30FC \uFF70 \uFF9E-\uFF9F] +Let $other_outliers = [\u3099-\u309A \u3006 \u303C \u302A-\u302E \u302F \U000E0100-\U000E01EF] + +# ========================================== + +# Outliers from UAX29 +Show $uax29_outliers + +# Additional outliers +Show $other_outliers + +# Take the 5 CJK scripts +Let $trialScripts = [$script:hani $script:hang $script:kana $script:hira $script:bopo] + +# Remove 
the non-LMN +Let $trialNewBase = [$trialScripts & $LMN] + +# Add the outliers +Let $trialNew = [$trialNewBase $uax29_outliers $other_outliers] + +# Show our result +Show $trialNew + +# As a double-check, show script characters we're tossing +Show [$trialScripts - $trialNew] + +# Compare snippets stuff +Let $guessClose = [$lb:QU $lb:Close_Punctuation] +Let $__closing_punc = ["')>\]`\}\u00AB\u00BB\u2018\u2019\u201C\u201D\u2039\u203A\u207E\u208E\u27E7\u27E9\u27EB\u2984\u2986\u2988\u298A\u298C\u298E\u2990\u2992\u2994\u2996\u2998\u29D9\u29DB\u29FD\u3009\u300B\u300D\u300F\u3011\u3015\u3017\u3019\u301B\u301E\u301F\uFD3F\uFE42\uFE44\uFE5A\uFE5C\uFF02\uFF07\uFF09\uFF3D\uFF5D\uFF63] + +$guessClose = $__closing_punc + +Let $guessClose = [$gc:pf $gc:pe $gc:pi] +$guessClose = $__closing_punc + +Let $guessTerm = [$sb:aterm $sb:sterm] +$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? … ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?] + +Let $__issymotherr = [\u00A6\u00A7\u06FD\u06FE\u0F01-\u0F03\u0F13-\u0F17\u0F1A-\u0F1F\u0FBE-\u0FC5\u0FC7-\u0FCC\u2100\u2101\u2104-\u2106\u2108\u2109\u2117\u2118\u211E-\u2121\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u2613\u2619-\u266E\u2670\u2671\u2701-\u2704\u2706-\u2709\u270C-\u2727\u2729-\u274B\u274F-\u2752\u2758-\u275E\u2761-\u2794\u2798-\u27AF\u27B1-\u27BE\u2800-\u28FF\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3012\u3013\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u3200-\u321C\u322A-\u3243\u3260-\u327B\u328A-\u32B0\u32C0-\u32CB\u32D0-\u32FE\u3300-\u3376\u337B-\u33DD\u33E0-\u33FE\uA490-\uA4A1\uA4A4-\uA4B3\uA4B5-\uA4C0\uA4C2-\uA4C4\uFFED\uFFEE\uFFFC\uFFFD] +Let $__issymothers = 
[\u00B6\u0482\u06E9\u09FA\u0B70\u0F34\u0F36\u0F38\u0FCF\u2114\u2123\u2125\u2127\u2129\u212E\u2132\u213A\u21D3\u220E\u2617\u274D\u2756\u3004\u3020\u327F\uA4C6\uFFE4\uFFE8] + +Let $symOther = [$__issymotherr $__issymothers] + +$symOther = $gcAllSymbols + + +[$symOther & $nfc] = [$gcAllSymbols & $nfc] diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index c059d8d1c74..f0b273399f3 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2005/10/11 19:39:15 $ -* $Revision: 1.36 $ +* $Date: 2006/04/05 22:12:44 $ +* $Revision: 1.37 $ * ******************************************************************************* */ @@ -160,8 +160,9 @@ public final class Main implements UCD_Types { //else if (arg.equalsIgnoreCase("TrailingZeros")) GenerateData.genTrailingZeros(); else if (arg.equalsIgnoreCase("GenerateThaiBreaks")) GenerateThaiBreaks.main(null); - else if (arg.equalsIgnoreCase("TestData")) TestData.main(new String[]{args[++i]}); - + else if (arg.equalsIgnoreCase("TestData")) TestData.main(new String[]{args[++i]}); + else if (arg.equalsIgnoreCase("MakeUnicodeFiles")) MakeUnicodeFiles.main(new String[]{}); + //else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo(); else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts(); else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest(); diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java b/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java index e56e21967c2..3895442b7c3 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java +++ b/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java @@ -16,6 +16,7 @@ import java.util.regex.Matcher; import 
java.util.regex.Pattern; import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.TransliteratorUtilities; import com.ibm.icu.dev.test.util.UnicodeMap; import com.ibm.icu.dev.test.util.UnicodePropertySource; import com.ibm.icu.text.Collator; @@ -71,7 +72,7 @@ public class MakeNamesChart { System.out.println("file: " + chartPrefix + fileName); PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", chartPrefix + fileName); out.println("" + - BagFormatter.toHTML.transliterate(getHeading(lineParts[2])) + + TransliteratorUtilities.toHTML.transliterate(getHeading(lineParts[2])) + "" + ""); @@ -117,7 +118,7 @@ public class MakeNamesChart { String hexcp = Utility.hex(it.codepoint, 4); String title = ""; String name = Default.ucd().getName(it.codepoint); - if (name != null) title = " title='" + BagFormatter.toHTML.transliterate(name.toLowerCase()) + "'"; + if (name != null) title = " title='" + TransliteratorUtilities.toHTML.transliterate(name.toLowerCase()) + "'"; out.println("\u00A0" @@ -347,7 +348,7 @@ public class MakeNamesChart { static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher(""); private static String showTextConvertingHex(String body, boolean addCharToHex) { - body = BagFormatter.toHTML.transliterate(body); + body = TransliteratorUtilities.toHTML.transliterate(body); if (addCharToHex) { int position = 0; while (position < body.length()) { @@ -411,7 +412,7 @@ public class MakeNamesChart { if (type == UCD.Cn || type == UCD.Co || type == UCD.Cs) { return "\u2588"; } - String result = BagFormatter.toHTML.transliterate(UTF16.valueOf(cp)); + String result = TransliteratorUtilities.toHTML.transliterate(UTF16.valueOf(cp)); if (type == UCD.Me || type == UCD.Mn) { result = "\u25CC" + result; } else if (rtl.contains(cp)) { diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java index 9bbccd86a38..d2fe6ea2877 100644 --- 
a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java @@ -68,6 +68,7 @@ public class MakeUnicodeFiles { public static void main(String[] args) throws IOException { generateFile(); + System.out.println("DONE"); } static class Format { @@ -294,7 +295,7 @@ public class MakeUnicodeFiles { */ try { BufferedReader br = - Utility.openReadFile("MakeUnicodeFiles.txt", Utility.UTF8); + Utility.openReadFile("com/ibm/text/UCD/MakeUnicodeFiles.txt", Utility.UTF8); String key = null; String file = null, property = null, value = "", comments = ""; while (true) { @@ -594,6 +595,7 @@ public class MakeUnicodeFiles { pw.println(SEPARATOR); pw.println("# Total: " + count); pw.println(); + pw.println("# EOF"); udf.close(); } @@ -710,6 +712,8 @@ public class MakeUnicodeFiles { pw.println(line); } } + pw.println(); + pw.println("# EOF"); udf.close(); } @@ -769,10 +773,16 @@ public class MakeUnicodeFiles { ps.valueStyle = "none"; } - if (ps.noLabel) bf.setLabelSource(null); - if (ps.nameStyle.equals("none")) bf.setPropName(null); - else if (ps.nameStyle.equals("short")) bf.setPropName(prop.getFirstNameAlias()); - else bf.setPropName(name); + if (ps.noLabel) { + bf.setLabelSource(null); + } + if (ps.nameStyle.equals("none")) { + bf.setPropName(null); + } else if (ps.nameStyle.equals("short")) { + bf.setPropName(prop.getFirstNameAlias()); + } else { + bf.setPropName(name); + } if (ps.interleaveValues) { writeInterleavedValues(pw, bf, prop, ps); @@ -784,6 +794,8 @@ public class MakeUnicodeFiles { writeEnumeratedValues(pw, bf, unassigned, prop, ps); } } + pw.println(); + pw.println("# EOF"); udf.close(); } @@ -809,6 +821,15 @@ public class MakeUnicodeFiles { temp2.addAll(aliases); aliases = temp2; } + System.out.println("Check: " + prop.getValue(0xE000)); + String missing = ps.skipUnassigned != null ? 
ps.skipUnassigned : ps.skipValue; + if (missing != null && !missing.equals("False")) { + pw.println(); + String propName = bf.getPropName(); + if (propName == null) propName = ""; + else if (propName.length() != 0) propName = propName + "; "; + pw.println("# @missing: 0000..10FFFF; " + propName + missing); + } for (Iterator it = aliases.iterator(); it.hasNext();) { String value = (String)it.next(); if (DEBUG) System.out.println("Getting value " + value); @@ -891,6 +912,7 @@ public class MakeUnicodeFiles { pw.println(); //if (s.size() != 0) bf.showSetNames(pw, s); + //System.out.println(bf.showSetNames(s)); } } diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt index 7296a6e8a59..75671e023d9 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt @@ -1,6 +1,6 @@ -Generate: NamedSequences +Generate: .* DeltaVersion: 14 -CopyrightYear: 2005 +CopyrightYear: 2006 File: auxiliary/GraphemeBreakProperty Property: Grapheme_Cluster_Break @@ -65,7 +65,10 @@ Value: 4.0 # Newly assigned in Unicode 4.0.0 (April, 2003) Value: 4.1 -# Newly assigned in Unicode 4.1.0 (XXX, 2005) +# Newly assigned in Unicode 4.1.0 (March, 2005) + +Value: 5.0 +# Newly assigned in Unicode 5.0.0 (XXX, 2006) File: extracted/DerivedBidiClass Property: Bidi_Class @@ -158,6 +161,10 @@ Property: Grapheme_Base # Note: depending on an application's interpretation of Co (private use), # they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither. 
+Property: Grapheme_Link +# Derived Property: Grapheme_Link (deprecated) +# Generated from: Canonical_Combining_Class=Virama +# Use Canonical_Combining_Class=Virama directly instead File: extracted/DerivedDecompositionType Property: Decomposition_Type @@ -316,8 +323,6 @@ Property: Noncharacter_Code_Point Property: Other_Grapheme_Extend -Property: Grapheme_Link - Property: IDS_Binary_Operator Property: IDS_Trinary_Operator @@ -353,7 +358,7 @@ Property: SPECIAL File: Scripts Property: Script -Format: nameStyle=none skipUnassigned=Common +Format: nameStyle=none skipValue=Unknown File: SpecialCasing Property: SPECIAL diff --git a/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java b/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java index 630f53770cf..f4831951e01 100644 --- a/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java +++ b/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java @@ -1,6 +1,10 @@ package com.ibm.text.UCD; +import com.ibm.icu.impl.CollectionUtilities; +import com.ibm.icu.text.Collator; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; + import java.util.BitSet; import com.ibm.text.utility.*; import java.io.PrintWriter; @@ -194,6 +198,7 @@ public final class NFSkippable extends UCDProperty { PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt", Utility.UTF8_WINDOWS); + out.println(Utility.BOM); out.println("NFSafeSets"); out.println("Version: " + Default.ucd().getVersion()); out.println("Date: " + Default.getDate()); @@ -212,6 +217,8 @@ public final class NFSkippable extends UCDProperty { out.close(); } + static Collator UCA = Collator.getInstance(ULocale.ROOT); + static void generateSet(PrintWriter out, String label, UCDProperty up) { System.out.println("Generating: " + up.getName(NORMAL)); UnicodeSet result = new UnicodeSet(); @@ -227,11 +234,17 @@ public final class NFSkippable extends UCDProperty { out.println(label + " = new UnicodeSet("); writeStringInPieces(out, rSet, ", false);"); 
- rSet = result.toPattern(false); + if (true) { + rSet = result.toPattern(false); + } else { + rSet = CollectionUtilities.prettyPrint(result, true, null, null, UCA, UCA); + } + out.println("/*Unicode: "); writeStringInPieces(out, rSet, "*/"); out.println(); out.flush(); + System.out.println("Done"); } /* diff --git a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java index 284f3dd2f52..5fd606ca8e1 100644 --- a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java @@ -5,30 +5,42 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $ -* $Date: 2005/11/19 05:39:39 $ -* $Revision: 1.10 $ +* $Date: 2006/04/05 22:12:43 $ +* $Revision: 1.11 $ * ******************************************************************************* */ package com.ibm.text.UCD; -import java.util.*; -import java.io.*; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.io.StreamTokenizer; +import java.io.StringReader; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.StringTokenizer; +import java.util.TreeMap; +import java.util.TreeSet; import com.ibm.icu.dev.demo.translit.CaseIterator; import com.ibm.icu.dev.test.util.BagFormatter; import com.ibm.icu.dev.test.util.UnicodeMap; -import com.ibm.icu.dev.test.util.UnicodeProperty; -import com.ibm.icu.dev.test.util.UnicodePropertySource; -import com.ibm.icu.dev.test.util.UnicodeMap.MapIterator; import com.ibm.icu.impl.PrettyPrinter; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import 
com.ibm.icu.text.CanonicalIterator; import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Normalizer; +//import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.UTF16; @@ -36,27 +48,27 @@ import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; import com.ibm.icu.util.ULocale; -import com.ibm.text.utility.*; - public class QuickTest implements UCD_Types { public static void main(String[] args) throws IOException { try { - + + checkCase(); + if (true) return; + + getCaseFoldingUnstable(); + getCaseLengths("Lower", UCD.LOWER); getCaseLengths("Upper", UCD.UPPER); getCaseLengths("Title", UCD.TITLE); getCaseLengths("Fold", UCD.FOLD); - if (true) return; checkUnicodeSet(); getLengths("NFC", Default.nfc()); getLengths("NFD", Default.nfd()); getLengths("NFKC", Default.nfkc()); getLengths("NFKD", Default.nfkd()); - //getCaseFoldingUnstable(); - checkCase(); if (true) return; tem(); //checkPrettyPrint(); @@ -643,13 +655,13 @@ public class QuickTest implements UCD_Types { if (!text.equals(x)) alpha.put("Lowercase", x); String title = x = UCharacter.toTitleCase(ULocale.ENGLISH,text,null); if (!text.equals(x)) alpha.put("Titlecase", x); - String nfc = x = Normalizer.normalize(text,Normalizer.NFC); + String nfc = x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFC); if (!text.equals(x)) alpha.put("NFC", x); - String nfd = x = Normalizer.normalize(text,Normalizer.NFD); + String nfd = x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFD); if (!text.equals(x)) alpha.put("NFD", x); - x = Normalizer.normalize(text,Normalizer.NFKD); + x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFKD); if (!text.equals(x)) alpha.put("NFKD", x); - x = Normalizer.normalize(text,Normalizer.NFKC); + x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFKC); if (!text.equals(x)) 
alpha.put("NFKC", x); CanonicalIterator ci = new CanonicalIterator(text); diff --git a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt index 77bc82ed92c..5fa73072a7c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt +++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt @@ -70,3 +70,6 @@ # Note: the following case is already in the UnicodeData file. # 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I + +# EOF + diff --git a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt index 0fcfa85a34e..237b9e1beb5 100644 --- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt @@ -31,9 +31,10 @@ # A locale ID is defined by taking any language tag as defined by # RFC 3066 (or its successor), and replacing '-' by '_'. # -# A context for a character C is defined by Section 3.13 Default Case Operations, -# on p. 89-90 of The Unicode Standard, Version 4.0, as amended by Unicode 4.1.0, -# as specified in http://www.unicode.org/versions/Unicode4.1.0/ +# A context for a character C is defined by Section 3.13 Default Case +# Operations, of The Unicode Standard, Version 5.0. +# (This is identical to the context defined by Unicode 4.1.0, +# as specified in http://www.unicode.org/versions/Unicode4.1.0/) # # Parsers of this file must be prepared to deal with future additions to this format: # * Additional contexts diff --git a/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html b/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html index 22dc10f8f8f..3cb84c0ee1c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html +++ b/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html @@ -1,13 +1,10 @@ - - + - + @@ -19,8 +16,9 @@ - + @@ -29,105 +27,78 @@

Standardized Variants

[Unicode]  Unicode - Character Database + [Unicode]  Unicode + Character Database
 
- - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + +
Revision@revision@
AuthorsMembers of the Editorial Committee
Date@date@
This Versionhttp://www.unicode.org/Public/@updateDirectory@/StandardizedVariants-@revision@.html
Previous Versionhttp://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html
Latest Versionhttp://www.unicode.org/Public/UNIDATA/StandardizedVariants.html
Revision@revision@
AuthorsMembers of the Editorial Committee
Date@date@
This Version + http://www.unicode.org/Public/@updateDirectory@/@filename@.html
Previous Version + http://www.unicode.org/Public/4.1.0/ucd/StandardizedVariants.html
Latest Version + http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html


Summary

-

This file provides a visual display of the standard variant sequences - derived from StandardizedVariants.txt.

+

This file provides a visual display of the standard variant sequences derived from + StandardizedVariants.txt.

Status

-

The file and the files described herein are part of the Unicode - Character Database (UCD) and are governed by the UCD - Terms of Use stated at the end.

+

This file and the files described herein are part of the Unicode Character Database and + are governed by the terms of use at + http://www.unicode.org/terms_of_use.html.


Introduction

-

The tables here exhaustively lists the valid, registered - combinations of base character plus variation indicator. All combinations not - listed in StandardizedVariants.txt are unspecified and are reserved for future - standardization; no conformant process may interpret them as standardized - variants. Variation selectors and their use are described in The Unicode - Standard.

-

These mathematical variants are all produced with the addition of Variation - Selector 1 (VS1 or U+FE00) to mathematical operator base characters. There is - no variation according to context. The Mongolian variants use the Mongolian - Variant Selectors, and may vary according to context. That is, if a contextual - shape is not listed below, then the variation sequence has an unmodified +

The tables here exhaustively list the valid, registered combinations of base character + plus variation indicator. All combinations not listed in StandardizedVariants.txt are unspecified + and are reserved for future standardization; no conformant process may interpret them as + standardized variants. Variation selectors and their use are described in The Unicode Standard.

+

These mathematical variants are all produced with the addition of Variation Selector 1 (VS1 or + U+FE00) to mathematical operator base characters. There is no variation according to context. The + Mongolian variants use the Mongolian Variant Selectors, and may vary according to context. That + is, if a contextual shape is not listed below, then the variation sequence has an unmodified appearance. At this time no Han variants exist.

-

Note: The glyphs used to show the variations - are often derived from different physical fonts than the representative - glyphs in the standard. They may therefore exhibit minor differences in - size, proportion, or weight unrelated to the intentional difference - in feature that is the defining element of the variation. Such minor - differences should be ignored. Likewise, in some cases the existing - representative fonts may not yet contain newly encoded characters and hence - some representative glyphs shown in these tables may have a slightly - different style than others.

+

Note: The glyphs used to show the variations are often derived + from different physical fonts than the representative glyphs in the standard. They may therefore + exhibit minor differences in size, proportion, or weight unrelated to the intentional + difference in feature that is the defining element of the variation. Such minor differences + should be ignored. Likewise, in some cases the existing representative fonts may not yet contain + newly encoded characters and hence some representative glyphs shown in these tables may have a + slightly different style than others.

@table@


-

UCD Terms of Use

-

Disclaimer

-
-

The Unicode Character Database is provided as is by Unicode, Inc. No - claims are made as to fitness for any particular purpose. No warranties of - any kind are expressed or implied. The recipient agrees to determine - applicability of information provided. If this file has been purchased on - magnetic or optical media from Unicode, Inc., the sole remedy for any claim - will be exchange of defective media within 90 days of receipt.

-

This disclaimer is applicable for all other data files accompanying - the Unicode Character Database, some of which have been compiled by the - Unicode Consortium, and some of which have been supplied by other sources.

-
-

Limitations on Rights to Redistribute This Data

-
-

Recipient is granted the right to make copies in any form for internal - distribution and to freely use the information supplied in the creation of - products supporting the UnicodeTM Standard. The files in the - Unicode Character Database can be redistributed to third parties or other - organizations (whether for profit or not) as long as this notice and the - disclaimer notice are retained. Information can be extracted from these - files and used in documentation or programs, as long as there is an - accompanying notice indicating the source.

-
-
- +
Access to Copyright and terms of use + Access to Copyright and terms of use
- +
diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java index 78fe9c89398..93801938963 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestData.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $ -* $Date: 2005/11/19 05:39:39 $ -* $Revision: 1.24 $ +* $Date: 2006/04/05 22:12:43 $ +* $Revision: 1.25 $ * ******************************************************************************* */ @@ -27,6 +27,7 @@ import com.ibm.icu.impl.CollectionUtilities; import com.ibm.icu.impl.ICUData; import com.ibm.icu.impl.ICUResourceBundle; import com.ibm.icu.impl.UCharArrayIterator; +import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.NumberFormat; import com.ibm.icu.text.StringPrep; import com.ibm.icu.text.StringPrepParseException; @@ -45,8 +46,17 @@ public class TestData implements UCD_Types { static UnicodeProperty.Factory upf; public static void main (String[] args) throws IOException { - //checkChars(false); - + tryConsole2(); + if (true) return; + + showNonCompatFull(false); + showNonCompatFull(true); + + + checkForCaseStability(false); + //countChars(); + foo(); + System.out.println("main: " + Default.getDate()); upf = ICUPropertyFactory.make(); System.out.println("after factory: " + Default.getDate()); @@ -146,8 +156,152 @@ public class TestData implements UCD_Types { } } finally { log.close(); + } + } + + private static void showNonCompatFull(boolean compat) { + UCD ucd = UCD.make("4.1.0"); + Normalizer nfkc = new Normalizer(Normalizer.NFKC, ucd.getVersion()); + System.out.println(); + System.out.println(compat ? 
"Full Fold = Simple Lower of NFKC" : "Full Fold != Simple Lower of NFKC"); + System.out.println(); + int count = 0; + for (int i = 0; i <= 0x10FFFF; ++i) { + int gc = ucd.getCategory(i); + if (gc == Cn || gc == PRIVATE_USE) continue; + //if (compat == (ucd.getDecompositionType(i) > UCD.CANONICAL)) continue; + String str = UTF16.valueOf(i); + String simpleLower = ucd.getCase(str, SIMPLE, LOWER); + String fullFold = ucd.getCase(str, FULL, FOLD); + + if (!simpleLower.equals(fullFold)) { + String nfkcStr = nfkc.normalize(str); + String simpleLowerNfkc = ucd.getCase(nfkcStr, SIMPLE, LOWER); + if (compat != (fullFold.equals(simpleLowerNfkc))) continue; + System.out.println(ucd.getCodeAndName(i)); + System.out.println("\tSimple Lower:\t" + ucd.getCodeAndName(simpleLower)); + System.out.println("\tFull Fold:\t" + ucd.getCodeAndName(fullFold)); + count++; + } + } + System.out.println("Count:\t" + count); + } + + private static void tryConsole() throws UnsupportedEncodingException { + for (int i = 1; i < 0xFFFF; ++i) { + String s = UTF32.valueOf32(i); + byte[] bytes = s.getBytes("UTF-8"); + String utf8bytes = ""; + for (int j = 0; j < bytes.length; ++j) { + if (j != 0) utf8bytes += " "; + utf8bytes += Utility.hex(bytes[j]&0xFF,2); + } + String name = UCharacter.getExtendedName(i); + System.out.println(Utility.hex(i) + "\t(" + s + ")\t[" + utf8bytes + "]\t" + name); } } + + private static void tryConsole2() throws UnsupportedEncodingException { + UnicodeSet failures = new UnicodeSet(); + check: + for (int i = 1; i <= 0x10FFFF; ++i) { + String s = UTF32.valueOf32(i); + byte[] bytes = s.getBytes("UTF-8"); + for (int j = 0; j < bytes.length; ++j) { + switch (bytes[j]&0xFF) { + case 0x81: case 0x8D: case 0x8F: case 0x90: case 0x9D: + failures.add(i); + continue check; + } + } + } + System.out.println("Total corrupted characters: " + failures.size()); + System.out.println("Percent corrupted characters: " + ((failures.size() + 0.0) / 0x110000 * 100.0 + "%")); + //BagFormatter bf = 
new BagFormatter(); + //System.out.println(bf.showSetNames(failures)); + } + + + private static void countChars() { + int[][] count = new int[AGE_VERSIONS.length][50]; + for (int j = 1; j < AGE_VERSIONS.length; ++j) { + UCD ucd = UCD.make(AGE_VERSIONS[j]); + UCDProperty alpha = DerivedProperty.make(ucd.PropAlphabetic, ucd); + + int alphaCount = 0; + for (int i = 0; i <=0x10FFFF; ++i) { + int type = ucd.getCategory(i); + if (ucd.isNoncharacter(i)) type = LIMIT_CATEGORY; + ++count[j][type]; + if (alpha.hasValue(i) || type == ucd.Nd) ++count[j][LIMIT_CATEGORY+1]; + } + } + + for (byte i = -1; i < LIMIT_CATEGORY+2; ++i) { + switch(i) { + case -1: System.out.print("\t\t"); break; + default: System.out.print(UCD.getCategoryID_fromIndex(i,UCD.LONG) + "\t" + UCD.getCategoryID_fromIndex(i)); break; + case LIMIT_CATEGORY: System.out.print("Noncharacter" + "\t" + "NCCP"); break; + case LIMIT_CATEGORY+1: System.out.print("Alphabetic" + "\t" + "alpha"); break; + } + for (int j = 1; j < AGE_VERSIONS.length; ++j) { + if (i < 0) System.out.print("\t*" + AGE_VERSIONS[j] + "*"); + else System.out.print("\t" + count[j][i]); + } + System.out.println(); + } + + } + + private static void foo() { + String[] test = { + "vicepresident", + "vice president", + "vice-president", + "vice-président", + "vice-president's offices", + "vice-presidents' offices", + "vice-presidents offices", + "vice-presidentsoffices", + }; + RuleBasedCollator col = (RuleBasedCollator) Collator.getInstance(new ULocale("fr")); + col.setStrength(col.QUATERNARY); + col.setAlternateHandlingShifted(false); + + Arrays.sort(test, col); + List s = Arrays.asList(test); + String last = ""; + int[] level = new int[1]; + for (Iterator it = s.iterator(); it.hasNext();) { + String current = (String) it.next(); + int order = levelCompare(col, last, current, level); + //System.out.print(levelStrings[level[0]]); + //System.out.print(order < 0 ? "<" : order == 0 ? 
"=" : ">"); + System.out.println("\t" + current); + last = current; + } + for (int i = 0; i < test.length; ++i) { + System.out.print(test[i] + ";"); + } + System.out.println(); + } + + static String[] levelStrings = {".", "..", "...", "....", "....."}; + + static int levelCompare(RuleBasedCollator col, String a, String b, int[] level) { + int diff = 0; + level[0] = 0; + for (int i = 0; i < 15; ++i) { + col.setStrength(i); + diff = col.compare(a, b); + if (diff != 0) { + level[0] = i; + break; + } + } + return diff; + } + Matcher m; /** @@ -163,12 +317,12 @@ public class TestData implements UCD_Types { return true; } - private static void checkChars(boolean mergeRanges) { + private static void checkForCaseStability(boolean mergeRanges) { UCD ucd = Default.ucd(); ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(""); - UnicodeSet isUpper = ups.getSet("Uppercase=true"); - UnicodeSet isLower = ups.getSet("Lowercase=true"); - UnicodeSet isTitle = ups.getSet("gc=Lt"); + UnicodeSet propUppercase = ups.getSet("Uppercase=true"); + UnicodeSet propLowercase = ups.getSet("Lowercase=true"); + UnicodeSet isGcLt = ups.getSet("gc=Lt"); UnicodeSet otherAlphabetic = ups.getSet("Alphabetic=true").addAll(ups.getSet("gc=Sk")); // create the following UnicodeSet hasFold = new UnicodeSet(); @@ -177,6 +331,10 @@ public class TestData implements UCD_Types { UnicodeSet hasTitle = new UnicodeSet(); UnicodeSet compat = new UnicodeSet(); UnicodeSet bicameralsScripts = new UnicodeSet(); + + UnicodeSet isFUppercase = new UnicodeSet(); + UnicodeSet isFLowercase = new UnicodeSet(); + UnicodeSet isFTitlecase = new UnicodeSet(); UCD u40 = UCD.make("4.0.0"); BitSet scripts = new BitSet(); @@ -184,41 +342,83 @@ public class TestData implements UCD_Types { int gc = ucd.getCategory(i); if (gc == Cn || gc == PRIVATE_USE) continue; String str = UTF16.valueOf(i); - if (!str.equals(ucd.getCase(str, FULL, FOLD))) hasFold.add(i); - if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(i); 
+ if (!str.equals(ucd.getCase(str, FULL, FOLD))) { + hasFold.add(i); + scripts.set(ucd.getScript(i)); + } + if (!str.equals(ucd.getCase(str, FULL, UPPER))) { + hasUpper.add(i); + scripts.set(ucd.getScript(i)); + } else { + isFUppercase.add(i); + } if (!str.equals(ucd.getCase(str, FULL, LOWER))) { hasLower.add(i); scripts.set(ucd.getScript(i)); + } else { + isFLowercase.add(i); + } + if (!str.equals(ucd.getCase(str, FULL, TITLE))) { + hasTitle.add(i); + scripts.set(ucd.getScript(i)); + } else { + isFTitlecase.add(i); } - if (!str.equals(ucd.getCase(str, FULL, TITLE))) hasTitle.add(i); if (!str.equals(Default.nfkd().normalize(str))) compat.add(i); //System.out.println(ucd.getCodeAndName(i) + "\t" + (u40.isAllocated(i) ? "already in 4.0" : "new in 4.1")); } BagFormatter bf = new BagFormatter(); + Transliterator nullTrans = Transliterator.getInstance("null"); + bf.setShowLiteral(nullTrans); bf.setMergeRanges(mergeRanges); bf.setUnicodePropertyFactory(ups); - printItems(bf, compat, "isUpper or isTitle without hasLower", - new UnicodeSet(isUpper).addAll(isTitle).removeAll(hasLower)); - printItems(bf, compat, "hasLower, but not isUpper or isTitle", - new UnicodeSet(hasLower).removeAll(isTitle).removeAll(isUpper)); - printItems(bf, compat, "isLower without hasUpper", - new UnicodeSet(isLower).addAll(isTitle).removeAll(hasUpper)); - printItems(bf, compat, "hasUpper, but not isLower or isTitle", - new UnicodeSet(hasUpper).removeAll(isTitle).removeAll(isLower)); + + UnicodeSet allCased = new UnicodeSet().addAll(hasUpper).addAll(hasLower).addAll(hasTitle); + isFUppercase.retainAll(allCased); + isFLowercase.retainAll(allCased); + isFTitlecase.retainAll(allCased); + System.out.println(Utility.BOM); + + printItems(bf, compat, "Uppercase=true or gc=Lt without hasLower", + new UnicodeSet(propUppercase).addAll(isGcLt).removeAll(hasLower)); + printItems(bf, compat, "hasLower, but not (Uppercase=true or gc=Lt)", + new UnicodeSet(hasLower).removeAll(isGcLt).removeAll(propUppercase)); + 
printItems(bf, compat, "Lowercase=true without hasUpper", + new UnicodeSet(propLowercase).addAll(isGcLt).removeAll(hasUpper)); + printItems(bf, compat, "hasUpper, but not (Lowercase=true or gc=Lt)", + new UnicodeSet(hasUpper).removeAll(isGcLt).removeAll(propLowercase)); + + + printItems(bf, compat, "Functionally Uppercase, but not Uppercase=true", + new UnicodeSet(isFUppercase).removeAll(propUppercase)); + printItems(bf, compat, "Uppercase=true, but not functionally Uppercase", + new UnicodeSet(propUppercase).removeAll(isFUppercase)); + + printItems(bf, compat, "Functionally Lowercase, but not Lowercase=true", + new UnicodeSet(isFLowercase).removeAll(propLowercase)); + printItems(bf, compat, "Lowercase=true, but not functionally Lowercase", + new UnicodeSet(propLowercase).removeAll(isFLowercase)); + UnicodeSet scriptSet = new UnicodeSet(); UnicodeProperty scriptProp = ups.getProperty("Script"); + bf.setMergeRanges(true); + System.out.println(); + System.out.println("Bicameral Scripts: those with at least one functionally cased character."); + System.out.println(); for (int i = 0; i < scripts.size(); ++i) { if (!scripts.get(i)) continue; - if (i == COMMON_SCRIPT) continue; + //if (i == COMMON_SCRIPT) continue; String scriptName = ucd.getScriptID_fromIndex((byte)i); - System.out.println(scriptName); - scriptSet.addAll(scriptProp.getSet(scriptName)); + UnicodeSet scriptUSet = scriptProp.getSet(scriptName); + scriptSet.addAll(scriptUSet); + printItems(bf, compat, "Bicameral Script: " + scriptName, + new UnicodeSet(allCased).retainAll(scriptUSet)); } - UnicodeSet allCased = new UnicodeSet().addAll(isUpper).addAll(isLower).addAll(isTitle); - printItems(bf, compat, "(Bicameral) isAlpha or Symbol Modifier, but not isCased", + bf.setMergeRanges(false); + printItems(bf, compat, "Bicameral Script: isAlpha or Symbol Modifier, but not isCased", new UnicodeSet(scriptSet).retainAll(otherAlphabetic).removeAll(allCased)); - printItems(bf, compat, "(Bicameral) isCased, but not 
isAlpha or Symbol Modifier", + printItems(bf, compat, "Bicameral Script: isCased, but not isAlpha or Symbol Modifier", new UnicodeSet(scriptSet).retainAll(allCased).removeAll(otherAlphabetic)); } @@ -302,21 +502,21 @@ public class TestData implements UCD_Types { } } - public static class RegexMatcher implements UnicodeProperty.Matcher { + public static class RegexMatcher implements UnicodeProperty.PatternMatcher { private Matcher matcher; - public UnicodeProperty.Matcher set(String pattern) { + public UnicodeProperty.PatternMatcher set(String pattern) { matcher = Pattern.compile(pattern).matcher(""); return this; } - public boolean matches(String value) { - matcher.reset(value); + public boolean matches(Object value) { + matcher.reset((String)value); return matcher.matches(); } } static BagFormatter bf = new BagFormatter(); - static UnicodeProperty.Matcher matcher = new RegexMatcher(); + static UnicodeProperty.PatternMatcher matcher = new RegexMatcher(); private static void showPropDiff(String p1, UnicodeSet s1, String p2, UnicodeSet s2) { System.out.println("Property Listing"); diff --git a/tools/unicodetools/com/ibm/text/UCD/TestIdentifiers.java b/tools/unicodetools/com/ibm/text/UCD/TestIdentifiers.java index aa759b788d5..21d156b8c35 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestIdentifiers.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestIdentifiers.java @@ -26,8 +26,16 @@ public class TestIdentifiers { public static void main(String[] args) throws IOException { String[] tests = { "SØS", "façade", "MOPE", "VOP", "scope", "ibm", "vop", "toys-Ñ-us", "1iνе", "back", "boгing" }; + TestIdentifiers ti = new TestIdentifiers("L"); TestIdentifiers tiany = new TestIdentifiers("A"); + ti.loadIdentifiers(); + UnicodeSet idnCharSet = ti.idnChars.getSet("output", new UnicodeSet()); + System.out.println("idnCharSet: " + idnCharSet.size()); + UnicodeSet idnCharNonStarting = ti.nonstarting; + System.out.println("idnCharNonStarting: " + idnCharSet); + if (true) return; + 
for (int i = 0; i < tests.length; ++i) { System.out.print(tests[i]); String folded = UCharacter.foldCase(tests[i], true); diff --git a/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java b/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java index 76b58b96696..2dddd4633c2 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java @@ -10,6 +10,7 @@ import java.util.List; import java.util.Locale; import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.tool.UOption; import com.ibm.icu.text.SymbolTable; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeMatcher; @@ -17,9 +18,26 @@ import com.ibm.icu.text.UnicodeSet; import com.ibm.text.utility.Utility; public class TestUnicodeInvariants { + private static final int + HELP1 = 0, + FILE = 1, + RANGE = 2 + ; + private static final UOption[] options = { + UOption.HELP_H(), + UOption.create("file", 'f', UOption.REQUIRES_ARG), + UOption.create("range", 'r', UOption.NO_ARG), + }; + public static void main(String[] args) throws IOException { - testInvariants(); + UOption.parseArgs(args, options); + + String file = "UnicodeInvariants.txt"; + if (options[FILE].doesOccur) file = options[FILE].value; + boolean doRange = options[RANGE].doesOccur; + + testInvariants(file, doRange); } /** @@ -68,19 +86,19 @@ public class TestUnicodeInvariants { static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\~ \\= \\! \\? 
\\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]"); - public static void testInvariants() throws IOException { + public static void testInvariants(String outputFile, boolean doRange) throws IOException { String[][] variables = new String[100][2]; int variableCount = 0; PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt"); out.write('\uFEFF'); // BOM - BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", "UnicodeInvariants.txt"); + BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", outputFile); BagFormatter bf = new BagFormatter(); bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make("")); BagFormatter bf2 = new BagFormatter(); bf2.setUnicodePropertyFactory(ToolUnicodePropertySource.make("")); - bf2.setMergeRanges(false); + bf2.setMergeRanges(doRange); ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] { - ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"), + ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"), ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")}); ParsePosition pp = new ParsePosition(0); int parseErrorCount = 0; @@ -113,11 +131,20 @@ public class TestUnicodeInvariants { // detect variables if (line.startsWith("Show")) { String part = line.substring(4).trim(); + if (part.startsWith("Each")) { + part = part.substring(4).trim(); + bf2.setMergeRanges(false); + } pp.setIndex(0); UnicodeSet leftSet = new UnicodeSet(part, pp, st); bf2.showSetNames(out, leftSet); + bf2.setMergeRanges(doRange); continue; } + + if (line.startsWith("Test")) { + line = line.substring(4).trim(); + } char relation = 0; String rightSide = null; @@ -166,7 +193,7 @@ public class TestUnicodeInvariants { boolean ok = true; switch(relation) { - case '=': ok = leftSet.equals(rightSet); break; + case '=': case '\u2261': ok = leftSet.equals(rightSet); break; case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && 
!leftSet.equals(rightSet); break; case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break; case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break; diff --git a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java index b5a9bf024e8..cf1f5400f37 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java +++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java @@ -254,18 +254,19 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { , "Katakana"); Object foo = unicodeMap.getSet("Katakana"); UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true"); + UnicodeProperty lineBreak = getProperty("Line_Break"); unicodeMap.putAll(getProperty("Alphabetic").getSet("true") .add(0xA0).add(0x05F3) .removeAll(getProperty("Ideographic").getSet("true")) .removeAll(unicodeMap.getSet("Katakana")) - .removeAll(script.getSet("Thai")) - .removeAll(script.getSet("Lao")) + //.removeAll(script.getSet("Thai")) + //.removeAll(script.getSet("Lao")) + .removeAll(lineBreak.getSet("SA")) .removeAll(script.getSet("Hiragana")) .removeAll(graphemeExtend), "ALetter"); unicodeMap.putAll(new UnicodeSet("[\\u0027\\u00B7\\u05F4\\u2019\\u2027\\u003A]") ,"MidLetter"); - UnicodeProperty lineBreak = getProperty("Line_Break"); unicodeMap.putAll(lineBreak.getSet("Infix_Numeric") .remove(0x003A), "MidNum"); unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric"); diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 28267b47dd9..280e39a05a6 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2005/11/01 00:10:54 $ 
-* $Revision: 1.40 $ +* $Date: 2006/04/05 22:12:44 $ +* $Revision: 1.41 $ * ******************************************************************************* */ @@ -43,7 +43,8 @@ public final class UCD implements UCD_Types { /** * Used for the default version. */ - public static final String latestVersion = "5.1.0"; + public static final String latestVersion = "5.0.0"; + public static final String lastVersion = "4.1.0"; /** * Create singleton instance for default (latest) version @@ -803,6 +804,9 @@ public final class UCD implements UCD_Types { } public byte getScript(int codePoint) { + if (codePoint == 0xE000) { + codePoint += 0; + } return get(codePoint, false).script; } @@ -1398,6 +1402,7 @@ to guarantee identifier closure. } if (isHangul) { if (fixStrings) result.decompositionMapping = getHangulDecompositionPair(codePoint); + if (isLV(codePoint)) result.lineBreak = LB_H2; else result.lineBreak = LB_H3; result.decompositionType = CANONICAL; } return result; @@ -1612,6 +1617,9 @@ to guarantee identifier closure. 
} combiningClassSet.set(uData.combiningClass & 0xFF); + if (cp == 0xE000) { + System.out.println("Check: " + uData.script); + } add(uData); } /* diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index b4c1b5d39b7..6f5f80c9e30 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2005/03/10 02:37:20 $ -* $Revision: 1.31 $ +* $Date: 2006/04/05 22:12:44 $ +* $Revision: 1.32 $ * ******************************************************************************* */ @@ -345,7 +345,12 @@ final class UCD_Names implements UCD_Types { "SYLOTI_NAGRI", "OLD_PERSIAN", "KHAROSHTHI", - + "Balinese", + "Cuneiform", + "Phoenician", + "Phags-pa", + "Nko", + "Unknown" }; public static final Map EXTRA_SCRIPT = new HashMap(); @@ -426,11 +431,14 @@ final class UCD_Names implements UCD_Types { "Sylo", "Xpeo", "Khar", - + "Bali", + "Xsux", + "Phnx", + "Phag", + "Nkoo", + "Zzzz" }; - - static final String[] AGE = { "unassigned", "1.1", @@ -441,9 +449,9 @@ final class UCD_Names implements UCD_Types { "3.2", "4.0", "4.1", + "5.0", }; - static final String[] GENERAL_CATEGORY = { "Cn", // = Other, Not Assigned 0 diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 997369dc5a1..fae6843479a 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2005/11/01 00:10:54 $ -* $Revision: 1.32 $ +* $Date: 2006/04/05 22:12:44 $ +* $Revision: 1.33 $ * 
******************************************************************************* */ @@ -398,8 +398,14 @@ public interface UCD_Types { SYLOTI_NAGRI = 60, OLD_PERSIAN = 61, KHAROSHTHI = 62, + Balinese = 63, + Cuneiform = 64, + Phoenician = 65, + Phags_Pa = 66, + NKo = 67, + Unknown_Script = 68, - LIMIT_SCRIPT = 63; + LIMIT_SCRIPT = 69; static final int UNKNOWN = 0, @@ -411,7 +417,8 @@ public interface UCD_Types { AGE32 = 6, AGE40 = 7, AGE41 = 8, - LIMIT_AGE = 9; + AGE50 = 9, + LIMIT_AGE = 10; static final String[] AGE_VERSIONS = { "?", @@ -422,7 +429,8 @@ public interface UCD_Types { "3.1.0", "3.2.0", "4.0.0", - "4.1.0" + "4.1.0", + "5.0.0" }; public static byte diff --git a/tools/unicodetools/com/ibm/text/UCD/UData.java b/tools/unicodetools/com/ibm/text/UCD/UData.java index dbe897bc8e7..af979be6fd5 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UData.java +++ b/tools/unicodetools/com/ibm/text/UCD/UData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $ -* $Date: 2004/02/12 08:23:16 $ -* $Revision: 1.11 $ +* $Date: 2006/04/05 22:12:44 $ +* $Revision: 1.12 $ * ******************************************************************************* */ @@ -45,7 +45,7 @@ class UData implements UCD_Types { byte lineBreak = LB_XX; byte joiningType = -1; byte joiningGroup = NO_SHAPING; - byte script = COMMON_SCRIPT; + byte script = Unknown_Script; byte age = 0; static final UData UNASSIGNED = new UData(); diff --git a/tools/unicodetools/com/ibm/text/data/chinese_override.txt b/tools/unicodetools/com/ibm/text/data/chinese_override.txt index 9018cfad4bb..4855f2e3bcb 100644 --- a/tools/unicodetools/com/ibm/text/data/chinese_override.txt +++ b/tools/unicodetools/com/ibm/text/data/chinese_override.txt @@ -1,10 +1,4 @@ -#/** -# ******************************************************************************* -# * Copyright (C) 2002-2004, International Business Machines 
Corporation and * -# * others. All Rights Reserved. * -# ******************************************************************************* -# */ -#Override List +#Override List #Format is ()? #Note: the 'code' field is currently discarded; only the char is important. #Note: if there is conflict, the FIRST char wins. diff --git a/tools/unicodetools/com/ibm/text/utility/Counter.java b/tools/unicodetools/com/ibm/text/utility/Counter.java index f866b3a8996..57aa6193b46 100644 --- a/tools/unicodetools/com/ibm/text/utility/Counter.java +++ b/tools/unicodetools/com/ibm/text/utility/Counter.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Counter.java,v $ -* $Date: 2005/10/11 19:39:15 $ -* $Revision: 1.3 $ +* $Date: 2006/04/05 22:12:45 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -21,7 +21,7 @@ import java.io.*; import java.text.*; public final class Counter { - Map map = new HashMap(); + private Map map = new HashMap(); static public final class RWInteger implements Comparable { static int uniqueCount; @@ -92,5 +92,11 @@ public final class Counter { return result; } - + public Set keySet() { + return map.keySet(); + } + + public Map getMap() { + return Collections.unmodifiableMap(map); + } } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java index 881197e45f0..e840c30c5dd 100644 --- a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java +++ b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java @@ -48,7 +48,7 @@ public class UnicodeDataFile { out.println("# For documentation, see UCD.html"); } try { - Utility.appendFile(filename + "Header" + fileType, Utility.UTF8_UNIX, out); + Utility.appendFile("com/ibm/text/UCD/" + filename + "Header" + fileType, Utility.UTF8_UNIX, 
out); } catch (FileNotFoundException e) { /* out.println("# Unicode Character Database: Derived Property Data"); diff --git a/tools/unicodetools/readme.html b/tools/unicodetools/readme.html index 7e844f18886..aef2727248b 100644 --- a/tools/unicodetools/readme.html +++ b/tools/unicodetools/readme.html @@ -77,7 +77,10 @@ exist:
<UCD_DIR>/EXTRAS-Update

2. Download all of the UnicodeData files for each version into UCD_DIR.

The folder names must be of the form: "3.2.0-Update", so rename the folders on the
-Unicode site to this format.

+Unicode site to this format. If the +folder contains ucd, then make the contents of that directory be the contents of +the x.x.x-Update directory. That is, each directory will directly contain files +like PropList....txt

2a Ensure Complete Release

If you are downloading any "incomplete" release (one that does not contain a complete set of data files for that release), you also need to download the previous complete release. Most of the N.M-Update @@ -87,6 +90,7 @@ directories are complete, *except*:

Also, make the following changes to UnicodeData for 1.1.5:

Delete

3400;HANGUL SYLLABLE KIYEOK A;Lo;0;L;1100 1161;;;;N;;;;;
+...
 4DFF;HANGUL SYLLABLE MIEUM WEO RIEUL-THIEUTH;Lo;0;L;1106 116F 11B4;;;;N;;;;;
 4E00;;Lo;0;L;;;;;N;;;;;

Add:

@@ -106,18 +110,19 @@ BASE_DIR + "Collation\allkeys" + VERSION + ".txt".

If you have it in a different location, change that value for KEYS in UCA.java, and
the value for BASE_DIR

-

2c. Here is an example of the default directory structure with files:

+

2c. Here is an example of the default directory structure with files. All of +the yellow ones should exist

C://DATA/
 
         BIN/
     
-        Collation/
+        Collation/
             allkeys-3.1.1.txt
-        
+        
         GEN/
             DerivedData/
                 ExtractedProperties
-        UCD/
+        UCD/
             3.0.0-Update/
                 Unihan-3.2.0.txt
                 ...
@@ -133,69 +138,145 @@ the value for BASE_DIR

ArabicShaping-4.0.0d14b.txt BidiMirroring-4.0.0d1b.txt ... - EXTRAS-Update/
+ EXTRAS-Update/

3. Versions

All of the following have "version X" in the options you give to Java (either on the command line, or in the Eclipse 'run' options). If you want a specific version like 3.1.0, then you would write "version 3.1.0". If you want the latest version (4.1.0), you can omit the "version X".

-

4. Running UCD, you will use com.ibm.text.UCD.Main as your main class.

-

The Working directory has to be C:\ICU4J\unicodetools\com\ibm\text\UCD
-(In Eclipse you can also use ${workspace_loc:UnicodeTools/com/ibm/text/UCD}, which abstracts away -the location.)
-
-The same for UCA:

-

main: com.ibm.text.UCD.Main
-directory: -C:\ICU4J\unicodetools\com\ibm\text\UCA

-

4a. BIN

-

For each version, the tools build a set of binary data in BIN that contain the information for -that release. This is done automatically, or you can manually do it with the options
-
-version X build
-
-This builds an compressed format of all the UCD data (except blocks and Unihan) into the BIN -directory. Don't worry about the voluminous console messages, unless one says "FAIL".
-
-You have to manually do this if you change any of the data files in that -version!!

-

Note: if for any reason you modify the binary format of the BIN files, you also have to bump the -value in that file:
-
-static final byte BINARY_FORMAT = 8; // bumped if binary format of UCD changes

-

4b. To build the Unicode files for a particular version X, run the Main with the following -argument:

-

MakeUnicodeFiles.generateFile

-

This will execute the commands in the file MakeUnicodeFiles.txt.

-

You will edit that file if you want a different 'd' version for the files, OR if you want to -change which files are built. At the top of the file you will see the following text:

-
Generate: 
-
DeltaVersion: 7
-

4c. To change which files are built, put any number of regular expressions separated by spaces -after Generate. Eg,

-
Generate: .*line.* prop.*
-

The matching is case-insensitive.

-

4d. To change the 'd' number that is appended to the generated files names, change the -DeltaVersion.

-

4e. To run basic consistency checking, run:

-

version X verify
-
-Don't worry about any console messages except those that say FAIL.

-

4f. Output

-

The files will be generated in the GEN directories.

-
    -
  • If they are the same as previous files (except for the first line and Date), they will be - renamed to UNCHANGED...
  • -
  • If they are not, then a bat file will be generated in the DIFF directory. Double-clicking on - this file will launch CompareIt, which is a nice diff program. Get CompareIt from - http://www.grigsoft.com/files.htm (be - sure to get the Unicode version); then you can also set it as the diff program in CVS with - Admin/Preferences/WinCVS, External Diff = C:\Program Files\Compare It!\wincmp3.exe (or equiv).
  • -
-

5. Running UCA, you will use com.ibm.text.UCA.Main as your main class.

-

5a. To build all the UCA files used by ICU, use the option:

-

java <UCA>Main ICU

-

6. To build all the charts, use the UCA project, with options: normalizationChart caseChart -scriptChart indexChart

+

4. Building Files

+
    +
  1. Setup
      +
    1. In Eclipse, open the Package Explorer (Use Window>Show View if you + don't see it)
    2. +
    3. Open UnicodeTools
        +
      • com.ibm.text.UCD
          +
        • MakeUnicodeFiles.txt

          This file drives the production of + the derived Unicode files. The first three lines contain + parameters that you may want to modify from time to time:

          +
          Generate: .*script.* // this is a regular expression. Use .* for all files
          +DeltaVersion: 10     // This gets appended to the file name. Pick 1+ the highest value in Public
          +CopyrightYear: 2006  // Pick the current year
          +
        • +
        +
      • +
      +
    4. +
    5. Open in Package Explorer +
        +
      • com.ibm.text.UCD
          +
        • Main
        • +
        +
      • +
      +
    6. +
    7. Run>Run As...
        +
      1. Choose Java Application
          +
        • it will fail, don't worry; you need to set some parameters
        • +
        +
      2. +
      +
    8. +
    9. Run>Run...
        +
      • Select the Arguments tab, and fill in the following
          +
        • Program arguments:
          build 5.0 MakeUnicodeFiles
          +
        • +
        • VM arguments: +
          -Xms512m -Xmx512m
          +
        • +
        +
      • +
      • Close and Save
      • +
      +
    10. +
    +
  2. +
  3. Run
      +
    1. You'll see it build the 5.0 files, with something like the following + results:
      Writing UCD_Data5.0.0
      +Data Size: 109,802
      +Wrote Data 109802
      +
    2. +
    3. For each version, the tools build a set of binary data in BIN that + contain the information for that release. This is done automatically, or + you can manually do it with the Program Arguments
      version X build
      +

      This builds a compressed format of all the UCD data (except blocks + and Unihan) into the BIN directory. Don't worry about the voluminous + console messages, unless one says "FAIL".

      +

      You have to manually do this if you change + any of the data files in that version!

      +

      Note: if for any reason you modify the binary format of the BIN files, you also have to bump the +value in that file:

      +
      static final byte BINARY_FORMAT = 8; // bumped if binary format of UCD changes
      +
    4. +
    +
  4. +
  5. Results in + C:\DATA\GEN\DerivedData
      +
    1. The files will be in this directory.
    2. +
    3. There are also DIFF folders, that contain BAT files that you can run + on Windows with CompareIt. (You can modify the code to build BATs with + another Diff program if you want).
        +
      1. For any file with a significant difference, it will build two + BAT files, such as the first two below.
        Diff_PropList-5.0.0d10.txt.bat
        +OLDER-Diff_PropList-5.0.0d10.txt.bat
        +
        +UNCHANGED-Diff_PropertyValueAliases-5.0.0d10.txt.bat
        +
      2. +
      +
    4. +
    5. Any files without significant changes will have "UNCHANGED" as a + prefix: ignore them.  The OLDER prefix is the comparison to the + last version of Unicode.
    6. +
    7. On Windows you can run these BATs to compare files.
    8. +
    +
  6. +
+

5. Invariant Checking

+
    +
  1. Setup
      +
    1. Open in Package Explorer
        +
      • com.ibm.text.UCD
          +
        • TestUnicodeInvariants.java
        • +
        +
      • +
      +
    2. +
    3. Run>Run As... Java Application
      + Will create the following file of results:
      C:\DATA\GEN\UnicodeInvariantResults.txt
      +
    4. +
    5. Open that file and search for "**** START Error Info ****". Each such + point provides a dump of comparison information.
    6. +
    +
  2. +
+

6. Options

+
    +
  1. If you want to see files that are opened while processing, do the + following:
      +
    1. Run>Run
    2. +
    3. Select the Arguments tab, and add the following
        +
      1. VM arguments: +
        -DSHOW_FILES
        +
      2. +
      +
    4. +
    +
  2. +
+

7. UCA

+
    +
  1. +

    You will use com.ibm.text.UCA.Main as your main class, creating a run configuration along + the same lines as above.

  2. +
  3. +

    To build all the UCA files used by ICU, use the Program arguments:

    +
    Main ICU
    +
  4. +
  5. +

    To build all the charts, use the UCA project, with options:

    +
    normalizationChart caseChart scriptChart indexChart
    +
  6. +