diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index 637f35c53a4..e8b2e3effc2 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2001/10/31 00:02:27 $ -* $Revision: 1.9 $ +* $Date: 2001/11/13 02:31:55 $ +* $Revision: 1.10 $ * ******************************************************************************* */ @@ -424,6 +424,7 @@ public class GenerateData implements UCD_Types { Set accumulation = new TreeSet(java.text.Collator.getInstance()); String spacing; + /* BufferedReader blocks = Utility.openUnicodeFile("Blocks", ucd.getVersion()); String[] parts = new String[10]; while (true) { @@ -442,6 +443,7 @@ public class GenerateData implements UCD_Types { checkDuplicate(duplicates, accumulation, value, "Block=" + value); } blocks.close(); + */ for (int k = 0; k < UCD_Names.NON_ENUMERATED.length; ++k) { propAbb = fixGaps(UCD_Names.NON_ENUMERATED[k][0], false); @@ -456,15 +458,19 @@ public class GenerateData implements UCD_Types { valueAbb = fixGaps(UCD_Names.SUPER_CATEGORIES[k][0], false); value = fixGaps(UCD_Names.SUPER_CATEGORIES[k][1], true); spacing = Utility.repeat(" ", 10-valueAbb.length()); - sorted.add("gc; " + valueAbb + spacing + "; " + value); + String baseLine = "gc; " + valueAbb + spacing + "; " + value; + spacing = Utility.repeat(" ", 50-baseLine.length()); + sorted.add(baseLine + spacing + "# " + UCD_Names.SUPER_CATEGORIES[k][2]); checkDuplicate(duplicates, accumulation, value, "General_Category=" + value); if (!value.equals(valueAbb)) checkDuplicate(duplicates, accumulation, valueAbb, "General_Category=" + value); } + /* sorted.add("xx; T ; True"); checkDuplicate(duplicates, accumulation, "T", "xx=True"); sorted.add("xx; F ; False"); checkDuplicate(duplicates, accumulation, "F", "xx=False"); + */ sorted.add("qc; Y ; Yes"); checkDuplicate(duplicates, accumulation, "Y", "qc=Yes"); sorted.add("qc; N ; No"); @@ -507,6 +513,10 @@ public class GenerateData implements UCD_Types { if (value.startsWith("Fixed_")) { continue; } } + if (type == JOINING_GROUP) { + valueAbb = "n/a"; + } + /* String elide = ""; if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{" @@ -546,7 +556,18 @@ public class GenerateData implements UCD_Types { log.println("# Generated: " + new Date() + ", MD"); log.println(HORIZONTAL_LINE); log.println(); - Utility.print(log, sorted, "\r\n", new MyBreaker()); + Utility.print(log, sorted, "\r\n", new MyBreaker(true)); + log.close(); + + log = Utility.openPrintWriter("PropertyValueAliases-" + ucd.getVersion() + "dX.txt"); + Utility.appendFile("PropertyValueAliasHeader.txt", false, log); + log.println("# Generated: " + new Date() + ", MD"); + log.println(HORIZONTAL_LINE); + log.println(); + Utility.print(log, sorted, "\r\n", new MyBreaker(false)); + log.close(); + + log = Utility.openPrintWriter("PropertyAliasSummary-" + ucd.getVersion() + "dX.txt"); log.println(); log.println(HORIZONTAL_LINE); log.println(); @@ -555,20 +576,43 @@ public class GenerateData implements UCD_Types { log.println("# Note: no two property names can be the same,"); log.println("# nor can two property value names for the same property be the same."); log.println(); - Utility.print(log, accumulation, "\r\n", new MyBreaker()); + Utility.print(log, accumulation, "\r\n", new MyBreaker(false)); log.println(); log.close(); } static class MyBreaker implements Utility.Breaker { + boolean status; + + public MyBreaker(boolean status) { + this.status = status; + } + + public boolean filter(Object current) { + String c = current.toString(); + if (c.startsWith("AA") || c.startsWith("BB") || c.startsWith("ZZ")) return status; + return !status; + } + public String get(Object current, Object old) { - if (old == null) return ""; + if (old == null) { + old = " "; + } String c = current.toString(); String o = old.toString(); - if (c.length() >= 2 && o.length() >= 0 && !c.substring(0,2).equals(o.substring(0,2))) { - return "\r\n"; + String sep = ""; + if (!c.substring(0,2).equals(o.substring(0,2))) { + sep = "\r\n"; + if (status) { + if (c.startsWith("AA")) sep = sep + HORIZONTAL_LINE + sep + "# Non-enumerated Properties" + sep + HORIZONTAL_LINE + sep; + if (c.startsWith("BB")) sep = sep + HORIZONTAL_LINE + sep + "# Enumerated Non-Binary Properties" + sep + HORIZONTAL_LINE + sep; + if (c.startsWith("ZZ")) sep = sep + HORIZONTAL_LINE + sep + "# Binary Properties" + sep + HORIZONTAL_LINE + sep; + } } - return ""; + if (status) { + c = c.substring(4); + } + return sep + c; } } diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index d4b6139e1e4..a63d2dec1f0 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2001/10/25 20:33:46 $ -* $Revision: 1.4 $ +* $Date: 2001/11/13 02:31:55 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -41,7 +41,8 @@ public final class Main { else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test(); else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase(); - else if (arg.equalsIgnoreCase("checkCase2")) VerifyUCD.checkCase2(); + else if (arg.equalsIgnoreCase("checkCaseLong")) VerifyUCD.checkCase2(true); + else if (arg.equalsIgnoreCase("checkCaseShort")) VerifyUCD.checkCase2(false); else if (arg.equalsIgnoreCase("checkCanonicalProperties")) VerifyUCD.checkCanonicalProperties(); else if (arg.equalsIgnoreCase("CheckCaseFold")) VerifyUCD.CheckCaseFold(); else if (arg.equalsIgnoreCase("idn")) VerifyUCD.VerifyIDN(); diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt index 526f7967e3c..9b60f167d0d 100644 --- a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt @@ -1,46 +1,30 @@ # DRAFT # PropertyAliases-3.2.0.txt # -# This file contains aliases for properties and property values used in the UCD. +# This file contains aliases for properties used in the UCD. # These names can be used for XML formats of UCD data, for regular-expression # property tests, and other programmatic textual descriptions of Unicode data. -# The names are not normative, except where they correspond to normative values -# in the UCD. +# The names are not normative, except where they correspond to normative +# properties in the UCD. For information on which properties are normative, +# see UnicodeCharacterDatabase.html. # # The names may be translated in appropriate environments, and additional # aliases may be useful. # # FORMAT # -# Each line has three fields, separated by semicolons. +# Each line has two fields, separated by semicolons. # -# First Field: Where the first field is AA, BB, or ZZ, then the line describes a property name: +# First Field: The first field is an abbreviated name for the property # -# AA - non-enumerated properties -# BB - enumerated, non-binary properties -# ZZ - binary properties and quick-check properties -# -# (The values AA, BB, and ZZ are arbitrary -- they were simply chosen to distinguish -# the different types.) -# -# Where the first field is not one of the above, the line describes a -# property value name. The first field describes the property for which that -# property value name is used. There are two special properties: -# -# xx stands for any binary property -# qc stands for any quick-check property -# -# Second Field: The second field is an abbreviated name. -# If there is no abbreviated name available, the field is marked with "n/a". -# -# Third Field: The third field is a long name. +# Second Field: The second field is a long name # # With loose matching of property names, the case distinctions, whitespace, # and '_' are ignored. # # NOTE: Currently there is at most one abbreviated name and one long name for -# each property and property value. However, in the future additional aliases -# may be added. In such a case, the first line for the property or property value +# each property. However, in the future additional aliases +# may be added. In such a case, the first line for the property # would have the preferred alias for output. # # NOTE: The property value names are NOT unique across properties, especially @@ -53,7 +37,5 @@ # cc means Combining_Class property, and # cc means the General_Category property value Control (cc) # -# Comments at the end of the file show cases of non-unique names. -# # The combination of property value and property name is, however, unique. # For more information, see UTR #24: Regular Expression Guidelines diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index e826d722135..281e7ae24a8 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2001/10/31 00:02:27 $ -* $Revision: 1.6 $ +* $Date: 2001/11/13 02:31:55 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -32,7 +32,10 @@ final class UCD_Names implements UCD_Types { {"stc", "Simple_Titlecase_Mapping"}, {"sfc", "Simple_Case_Folding"}, {"scc", "Special_Case_Condition"}, - {"blk", "Block"} + {"blk", "Block"}, + {"na1", "Unicode_1_Name"}, + {"isc", "ISO_Comment"}, + {"age", "Age"}, }; static final String[] UNIFIED_PROPERTIES = { @@ -406,13 +409,14 @@ final class UCD_Names implements UCD_Types { }; static final String[][] SUPER_CATEGORIES = { - {"L", "Letter"}, - {"M", "Mark"}, - {"N", "Number"}, - {"Z", "Separator"}, - {"C", "Other"}, - {"S", "Symbol"}, - {"P", "Punctuation"}, + {"L", "Letter", "Ll | Lm | Lo | Lt | Lu"}, + {"M", "Mark", "Mc | Me | Mn"}, + {"N", "Number", "Nd | Nl | No"}, + {"Z", "Separator", "Zl | Zp | Zs"}, + {"C", "Other", "Cc | Cf | Cn | Co | Cs"}, + {"S", "Symbol", "Sc | Sk | Sm | So"}, + {"P", "Punctuation", "Pc | Pd | Pe | Pf | Pi | Po | Ps"}, + {"Lc", "Cased Letter", "Ll | Lt | Lu"}, }; diff --git a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java index faf04c2c3e5..e7c7527cdd7 100644 --- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $ -* $Date: 2001/10/25 20:33:46 $ -* $Revision: 1.6 $ +* $Date: 2001/11/13 02:31:55 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -141,43 +141,109 @@ public class VerifyUCD implements UCD_Types { log.close(); } - public static void checkCase2() throws IOException { + public static void checkCase2(boolean longForm) throws IOException { Utility.fixDot(); System.out.println("checkCase"); ucd = UCD.make(Main.ucdVersion); initNormalizers(); - System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE)); + + /*String tx1 = "\u0391\u0342\u0345"; + String ux1 = "\u0391\u0342\u0399"; + String ctx1 = nfc.normalize(tx1); + String ctx2 = nfc.normalize(ux1); // wrong?? + + //System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE)); + */ + + String fileName = "CaseNormalizationDifferences.txt"; PrintWriter log = Utility.openPrintWriter(fileName); log.println("Differences between case(normalize(cp)) and normalize(case(cp))"); log.println("u, l, t - upper, lower, title"); log.println("c, d - nfc, nfd"); + + //Utility.DOTMASK = 0x7F; for (int cp = 0; cp <= 0x10FFFF; ++cp) { Utility.dot(cp); if (!ucd.isRepresented(cp) || ucd.isPUA(cp)) continue; - if (cp == '\u3371') { + if (cp == '\u0130') { System.out.println("debug"); } String x = UTF32.valueOf32(cp); + String dx = nfd.normalize(cp); + String cx = nfc.normalize(cp); String ux = ucd.getCase(x, FULL, UPPER); String lx = ucd.getCase(x, FULL, LOWER); String tx = ucd.getCase(x, FULL, TITLE); - - String dux = nfd.normalize(ux); - String dlx = nfd.normalize(lx); - String dtx = nfd.normalize(tx); + + if (x.equals(dx) && dx.equals(cx) && cx.equals(ux) && ux.equals(lx) && lx.equals(tx)) continue; String cux = nfc.normalize(ux); String clx = nfc.normalize(lx); String ctx = nfc.normalize(tx); + + if (x.equals(cx)) { + boolean needBreak = false; + if (!clx.equals(lx)) needBreak = true; + if (!ctx.equals(tx)) needBreak = true; + if (!cux.equals(ux)) needBreak = true; + + if (needBreak) { + log.println("# Was not NFC:"); + log.println( + "## " + Utility.hex(x) + "; " + + Utility.hex(lx) + "; " + + Utility.hex(tx) + "; " + + Utility.hex(ux) + "; # " + + ucd.getName(x)); + log.println("# should be:"); + log.println( + Utility.hex(x) + "; " + + Utility.hex(clx) + "; " + + Utility.hex(ctx) + "; " + + Utility.hex(cux) + "; # " + + ucd.getName(x)); + log.println(); + } + } + + String dux = nfd.normalize(ux); + String dlx = nfd.normalize(lx); + String dtx = nfd.normalize(tx); + + + + String startdx = getMarks(dx, false); + String enddx = getMarks(dx, true); - String dx = nfd.normalize(cp); - String cx = nfc.normalize(cp); + String startdux = getMarks(dux, false); + String enddux = getMarks(dux, true); + String startdtx = getMarks(dtx, false); + String enddtx = getMarks(dtx, true); + + String startdlx = getMarks(dlx, false); + String enddlx = getMarks(dlx, true); + + // If the new marks don't occur in the old decomposition, we got a problem! + + if (!startdx.startsWith(startdux) || !startdx.startsWith(startdtx) || !startdx.startsWith(startdlx) + || !enddx.endsWith(enddux) || !enddx.endsWith(enddtx) || !enddx.endsWith(enddlx)) { + log.println("Combining Class Difference for " + ucd.getCodeAndName(x)); + log.println("x: " + ucd.getCodeAndName(dx) + ", " + Utility.hex(startdx) + ", " + Utility.hex(enddx)); + log.println("ux: " + ucd.getCodeAndName(dux) + ", " + Utility.hex(startdux) + ", " + Utility.hex(enddux)); + log.println("tx: " + ucd.getCodeAndName(dtx) + ", " + Utility.hex(startdtx) + ", " + Utility.hex(enddtx)); + log.println("lx: " + ucd.getCodeAndName(dlx) + ", " + Utility.hex(startdlx) + ", " + Utility.hex(enddlx)); + log.println(); + } + + + if (!longForm) continue; + String udx = ucd.getCase(dx, FULL, UPPER); String ldx = ucd.getCase(dx, FULL, LOWER); String tdx = ucd.getCase(dx, FULL, TITLE); @@ -286,6 +352,28 @@ public class VerifyUCD implements UCD_Types { log.close(); } + + public static String getMarks(String s, boolean doEnd) { + int cp; + if (!doEnd) { + for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(s, i); + int cc = ucd.getCombiningClass(cp); + if (cc == 0) { + return s.substring(0, i); + } + } + } else { + for (int i = s.length(); i > 0; i -= UTF16.getCharCount(cp)) { + cp = UTF16.charAt(s, i-1); // will go 2 before if necessary + int cc = ucd.getCombiningClass(cp); + if (cc == 0) { + return s.substring(i); + } + } + } + return s; + } static final String names[] = {"LOWER", "TITLE", "UPPER", "(UNC)", "MIXED"}; static final String lowerNames[] = {"", "Other_Lower"}; diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index 97851e139ec..64cc5579212 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2001/10/31 00:02:54 $ -* $Revision: 1.7 $ +* $Date: 2001/11/13 02:31:34 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -30,9 +30,11 @@ public final class Utility { // COMMON UTILITIES } private static boolean needCRLF = false; + + public static int DOTMASK = 0x7FF; public static void dot(int i) { - if ((i % 0x7FF) == 0) { + if ((i % DOTMASK) == 0) { needCRLF = true; System.out.print('.'); } @@ -458,6 +460,7 @@ public final class Utility { // COMMON UTILITIES public interface Breaker { public String get(Object current, Object old); + public boolean filter(Object current); // true is keep } public static void print(PrintWriter pw, Collection c, String separator, Breaker b) { @@ -466,14 +469,17 @@ public final class Utility { // COMMON UTILITIES Object last = null; while (it.hasNext()) { Object obj = it.next(); + if (b != null && !b.filter(obj)) continue; if (first) { first = false; + } else { + pw.print(separator); } - else pw.print(separator); if (b != null) { pw.print(b.get(obj, last)); + } else { + pw.print(obj); } - pw.print(obj); last = obj; } }