diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index 420fec19a78..fe2287ff58e 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2002/07/03 02:15:47 $ -* $Revision: 1.17 $ +* $Date: 2003/03/15 02:36:49 $ +* $Revision: 1.18 $ * ******************************************************************************* */ @@ -1109,7 +1109,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] int itemInRange = startOfRange; int skip = 1; boolean doSamples = false; - UnicodeSetIterator usi = new UnicodeSetIterator(); + AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator(); /** * use FIXED_CE as the limit @@ -1120,8 +1120,8 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] this.nfkd = new Normalizer(Normalizer.NFKD, unicodeVersion); this.skipDecomps = skipDecomps; currentRange = 0; - usi.reset(unspecified); - usi.setAbbreviated(true); + usi.reset(unspecified, true); + //usi.setAbbreviated(true); // FIX SAMPLES if (SAMPLE_RANGES[0][0] == 0) { @@ -1204,8 +1204,8 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.] } } unspecified = temp; - usi.reset(unspecified); - usi.setAbbreviated(true); + usi.reset(unspecified, true); + //usi.setAbbreviated(true); if (DEBUG) System.out.println("Unspecified = " + unspecified.toPattern(true)); haveUnspecified = true; } diff --git a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt index f924ba6ee9c..ca8da1ac349 100644 --- a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt @@ -11,6 +11,8 @@ # (where string lengths may grow). Note that where they can be supported, the # full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. # +# All code points not listed in this file map to themselves. +# # NOTE: case folding does not preserve normalization formats! # # For information on case folding, see diff --git a/tools/unicodetools/com/ibm/text/UCD/Default.java b/tools/unicodetools/com/ibm/text/UCD/Default.java index 00672f35fe0..60f04992cb2 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Default.java +++ b/tools/unicodetools/com/ibm/text/UCD/Default.java @@ -16,6 +16,10 @@ public final class Default implements UCD_Types { public static Normalizer nfkd; public static Normalizer[] nf = new Normalizer[4]; + public static void ensureUCD() { + if (ucd == null) setUCD(); + } + public static void setUCD(String version) { ucdVersion = version; setUCD(); diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index b71986a1d99..a1344dd7002 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2003/03/12 16:01:26 $ -* $Revision: 1.25 $ +* $Date: 2003/03/15 02:36:48 $ +* $Revision: 1.26 $ * ******************************************************************************* */ @@ -141,6 +141,39 @@ public class GenerateData implements UCD_Types { + ".html"; } + public static void checkDifferences (String targetVersion) throws IOException { + System.out.println("Checking Differences"); + UCD target = UCD.make(targetVersion); + + PrintWriter log1 = Utility.openPrintWriter("Log1.xml", Utility.LATIN1_UNIX); + log1.println(""); + + PrintWriter log2 = Utility.openPrintWriter("Log2.xml", Utility.LATIN1_UNIX); + log2.println(""); + + for (int i = 0; i <= 0x10FFFF; ++i) { + if (!target.isAllocated(i)) continue; + Utility.dot(i); + UData t = target.get(i, true); + UData current = Default.ucd.get(i, true); + if (i == 0x5E) { + System.out.println(target.getDecompositionTypeID(i) + + ", " + Utility.hex(target.getDecompositionMapping(i))); + System.out.println(Default.ucd.getDecompositionTypeID(i) + + ", " + Utility.hex(Default.ucd.getDecompositionMapping(i))); + } + if (t.equals(current)) continue; + + // print both for comparison + log1.println(t.toString(target, UData.ABBREVIATED)); + log2.println(current.toString(Default.ucd, UData.ABBREVIATED)); + } + log1.println(""); + log2.println(""); + log1.close(); + log2.close(); + } + public static void generateDerived (byte type, boolean checkTypeAndStandard, int headerChoice, String directory, String fileName) throws IOException { Default.setUCD(); diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index 5c3719852d0..b8b0d2e32a6 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2003/03/12 16:01:26 $ -* $Revision: 1.27 $ +* $Date: 2003/03/15 02:36:48 $ +* $Revision: 1.28 $ * ******************************************************************************* */ @@ -47,11 +47,31 @@ public final class Main implements UCD_Types { public static void main (String[] args) throws Exception { for (int i = 0; i < args.length; ++i) { + + long mask = 0; + String arg = args[i]; if (arg.charAt(0) == '#') return; // skip rest of line Utility.fixDot(); System.out.println("Argument: " + args[i]); + + // Expand string arguments + + if (arg.equalsIgnoreCase("All")) { + args = Utility.append(ALL_FILES, Utility.subarray(args, i+1)); + continue; + } + + // make sure the UCD is set up + + if (arg.equalsIgnoreCase("version")) { + Default.setUCD(args[++i]); + continue; + } + Default.ensureUCD(); + + // Now handle other options if (arg.equalsIgnoreCase("verify")) { VerifyUCD.verify(); @@ -60,7 +80,6 @@ public final class Main implements UCD_Types { VerifyUCD.checkAgainstUInfo(); } else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{Default.ucdVersion}); - else if (arg.equalsIgnoreCase("version")) Default.setUCD(args[++i]); else if (arg.equalsIgnoreCase("statistics")) VerifyUCD.statistics(); else if (arg.equalsIgnoreCase("NFSkippable")) NFSkippable.main(null); else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable(); @@ -123,6 +142,7 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("TestDirectoryIterator")) DirectoryIterator.test(); else if (arg.equalsIgnoreCase("checkIdentical")) GenerateData.handleIdentical(); else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.test(); + else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0"); //else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts(); @@ -130,36 +150,9 @@ public final class Main implements UCD_Types { /*else if (arg.equalsIgnoreCase("writeNormalizerTestSuite")) GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt"); */ - else extras(new String[] {arg}); - } - } - - public static void extras (String[] args) throws Exception { - //ubp = new UnifiedBinaryProperty(ucd); - - boolean expanding = false; - - for (int i = 0; i < args.length; ++i) { - String arg = args[i]; - if (arg.charAt(0) == '#') return; // skip rest of line - long mask = 0; - - Utility.fixDot(); - if (expanding) System.out.println("Argument: " + args[i]); - - if (arg.equalsIgnoreCase("All")) { - // Append all args at end - /* - String[] temp = new String[args.length + ALL_FILES.length]; - System.arraycopy(args, 0, temp, 0, args.length); - System.arraycopy(ALL_FILES, 0, temp, args.length, ALL_FILES.length); - */ - args = Utility.append(args, ALL_FILES); - expanding = true; - // EXTRACTED PROPERTIES - } else if (arg.equalsIgnoreCase("DerivedBidiClass")) { + else if (arg.equalsIgnoreCase("DerivedBidiClass")) { GenerateData.generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED, "DerivedData/extracted/", "DerivedBidiClass"); diff --git a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt index c9ad2c62f1d..ca35c1df91c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt @@ -6,6 +6,8 @@ # characters where they are 1-1, and does not have locale-specific mappings.) # For more information, see the discussion of Case Mappings in the Unicode Standard. # +# All code points not listed in this file that do not have a simple case mappings +# in UnicodeData.txt map to themselves. # ================================================================================ # Format # ================================================================================ diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 48d13d793dc..6bced563b63 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2003/03/12 16:01:26 $ -* $Revision: 1.21 $ +* $Date: 2003/03/15 02:36:48 $ +* $Revision: 1.22 $ * ******************************************************************************* */ @@ -123,7 +123,7 @@ public final class UCD implements UCD_Types { * Return XML version of the data associated with the code point. */ public String toString(int codePoint) { - return get(codePoint, true).toString(FULL); + return get(codePoint, true).toString(this,FULL); } /** @@ -1389,6 +1389,7 @@ to guarantee identifier closure. size = uDataFileCount = dataIn.readInt(); boolean didJoiningHack = false; + System.out.println("Loading UCD " + foundVersion); // records @@ -1396,7 +1397,7 @@ to guarantee identifier closure. UData uData = new UData(); uData.readBytes(dataIn); - if (uData.codePoint == 0x0221) { + if (uData.codePoint == 0x5E) { System.out.println("SPOT-CHECK: " + uData); } diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index f8956d7a666..a389cd385d2 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2003/03/12 16:01:26 $ -* $Revision: 1.17 $ +* $Date: 2003/03/15 02:36:48 $ +* $Revision: 1.18 $ * ******************************************************************************* */ @@ -51,7 +51,9 @@ final class UCD_Names implements UCD_Types { + "#\tAll code points not listed here have the type U", "Joining Group (listing ArabicShaping.txt, field 2)", "BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)", - "Script", + "Script\r\n" + + "#\tThe value for all code points not explicitly listed in this file is COMMON." + , "Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)", "Hangul Syllable Type\r\n# All codepoints not explicitly listed here have the value NA", "Derived" @@ -219,11 +221,11 @@ final class UCD_Names implements UCD_Types { "IS", "PR", "PO", "NU", "AL", "ID", "IN", "HY", "CM", "BB", "BA", "SP", "BK", "CR", "LF", "CB", "SA", "AI", "B2", "SG", "ZW", - "JL", - "JV", - "JT", "NL", "WJ", + //"JL", + //"JV", + //"JT", }; @@ -235,11 +237,11 @@ final class UCD_Names implements UCD_Types { "CombiningMark", "BreakBefore", "BreakAfter", "Space", "MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak", "ComplexContext", "Ambiguous", "BreakBoth", "Surrogate", "ZWSpace", - "Leading_Jamo", - "Vowel_Jamo", - "Trailing_Jamo", "Next_Line", "Word_Joiner" + //"Leading_Jamo", + //"Vowel_Jamo", + //"Trailing_Jamo", }; public static final String[] SCRIPT = { diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 4728431fe6d..c4a614271f4 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2003/03/12 16:01:26 $ -* $Revision: 1.18 $ +* $Date: 2003/03/15 02:36:48 $ +* $Revision: 1.19 $ * ******************************************************************************* */ @@ -15,7 +15,7 @@ package com.ibm.text.UCD; public interface UCD_Types { - public static final int dVersion = 15; // change to fix the generated file D version. If less than zero, no "d" + public static final int dVersion = 18; // change to fix the generated file D version. If less than zero, no "d" public static final String BASE_DIR = "C:\\DATA\\"; public static final String UCD_DIR = BASE_DIR + "UCD\\"; @@ -34,7 +34,7 @@ public interface UCD_Types { CJK_B_BASE = 0x20000, CJK_B_LIMIT = 0x2A6DF+1; - static final byte BINARY_FORMAT = 7; // bumped if binary format of UCD changes + static final byte BINARY_FORMAT = 8; // bumped if binary format of UCD changes // Unicode Property Types static final byte @@ -240,12 +240,12 @@ public interface UCD_Types { LB_IS = 8, LB_PR = 9, LB_PO = 10, LB_NU = 11, LB_AL = 12, LB_ID = 13, LB_IN = 14, LB_HY = 15, LB_CM = 16, LB_BB = 17, LB_BA = 18, LB_SP = 19, LB_BK = 20, LB_CR = 21, LB_LF = 22, LB_CB = 23, LB_SA = 24, LB_AI = 25, LB_B2 = 26, LB_SG = 27, LB_ZW = 28, - LB_JL = 29, - LB_JV = 30, - LB_JT = 31, - LB_NL = 32, - LB_WJ = 33, - LIMIT_LINE_BREAK = 34, + LB_NL = 29, + LB_WJ = 30, + //LB_JL = 29, + //LB_JV = 30, + //LB_JT = 31, + LIMIT_LINE_BREAK = 31, LB_LIMIT = LIMIT_LINE_BREAK; // east asian width diff --git a/tools/unicodetools/com/ibm/text/UCD/UData.java b/tools/unicodetools/com/ibm/text/UCD/UData.java index 1e408d88280..1176cf7fd8c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UData.java +++ b/tools/unicodetools/com/ibm/text/UCD/UData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $ -* $Date: 2003/03/12 16:01:26 $ -* $Revision: 1.7 $ +* $Date: 2003/03/15 02:36:48 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -78,6 +78,9 @@ class UData implements UCD_Types { public boolean equals(Object that) { UData other = (UData) that; + + // use equals for objects + if (!name.equals(other.name)) return false; if (!decompositionMapping.equals(other.decompositionMapping)) return false; if (!simpleUppercase.equals(other.simpleUppercase)) return false; @@ -90,8 +93,12 @@ class UData implements UCD_Types { if (!fullCaseFolding.equals(other.fullCaseFolding)) return false; if (!specialCasing.equals(other.specialCasing)) return false; if (!bidiMirror.equals(other.bidiMirror)) return false; + + // == for primitives + // Warning: doubles have to use special comparison, because of NaN + if (codePoint != other.codePoint) return false; - if (numericValue != other.numericValue) return false; + if (numericValue < other.numericValue || numericValue > other.numericValue) return false; if (binaryProperties != other.binaryProperties) return false; if (generalCategory != other.generalCategory) return false; if (combiningClass != other.combiningClass) return false; @@ -104,6 +111,7 @@ class UData implements UCD_Types { if (joiningGroup != other.joiningGroup) return false; if (script != other.script) return false; if (age != other.age) return false; + return true; } @@ -178,17 +186,17 @@ class UData implements UCD_Types { static final byte ABBREVIATED = 0, FULL = 1; public String toString() { - return toString(FULL); + return toString(Default.ucd, FULL); } - public String toString(byte style) { + public String toString(UCD ucd, byte style) { boolean full = style == FULL; StringBuffer result = new StringBuffer(); String s = UTF32.valueOf32(codePoint); - result.append("