diff --git a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java index 74ecd74f009..a30d9585cef 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $ -* $Date: 2002/03/15 00:34:46 $ -* $Revision: 1.5 $ +* $Date: 2002/03/20 00:21:43 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -25,7 +25,7 @@ import java.io.*; */ public final class ConvertUCD implements UCD_Types { - public static final boolean SHOW = true; + public static final boolean SHOW = false; public static final boolean DEBUG = false; public static int major; @@ -201,7 +201,7 @@ public final class ConvertUCD implements UCD_Types { // MAIN!! public static void main (String[] args) throws Exception { - System.out.println("ConvertUCD"); + System.out.println("Building binary version of UCD"); log = new PrintWriter(new BufferedWriter( new OutputStreamWriter( @@ -260,8 +260,17 @@ public final class ConvertUCD implements UCD_Types { UData value = (UData) charData.get(key); value.compact(); } - UData ud = getEntry(0x2A6D6); + + UData ud; + ud = getEntry(0x5e); + System.out.println("SPOT-CHECK: 5e: " + ud); + + ud = getEntry(0x130); + System.out.println("SPOT-CHECK: 130: " + ud); + + ud = getEntry(0x2A6D6); System.out.println("SPOT-CHECK: 2A6D6: " + ud); + ud = getEntry(0xFFFF); System.out.println("SPOT-CHECK: FFFF: " + ud); @@ -493,7 +502,16 @@ public final class ConvertUCD implements UCD_Types { if (type.equals("I")) { data.simpleCaseFolding = val; setBinaryProperty(cps, CaseFoldTurkishI); - System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val)); + System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " + + Utility.hex(cps) + ": " + Utility.hex(val)); + } + } else if (labels[0].equals("SpecialCasing") // special handling for special casing + && labels[4].equals("sc") + && parts[4].trim().length() > 0) { + if (i < 4) { + if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", " + + Utility.hex(key) + ":" + Utility.hex(val)); + addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val); } } else { /*if (key.equals("sn")) { // SKIP UNDEFINED!! @@ -782,12 +800,16 @@ public final class ConvertUCD implements UCD_Types { } else if (fieldName.equals("su")) { uData.fullUppercase = fieldValue; } else if (fieldName.equals("sl")) { + if (DEBUG) System.out.println("Setting full lowercase to " + Utility.hex(fieldValue) + uData); uData.fullLowercase = fieldValue; } else if (fieldName.equals("st")) { uData.fullTitlecase = fieldValue; } else if (fieldName.equals("sc")) { - uData.specialCasing = fieldValue; + if (uData.specialCasing.length() > 0) { + uData.specialCasing += ";"; + } + uData.specialCasing += fieldValue; } else if (fieldName.equals("xp")) { uData.binaryProperties |= 1 << Utility.lookup(fieldValue, UCD_Names.BP, true); diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java index ab472d32619..fe7ce3ba69d 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $ -* $Date: 2002/03/15 01:57:01 $ -* $Revision: 1.11 $ +* $Date: 2002/03/20 00:21:43 $ +* $Revision: 1.12 $ * ******************************************************************************* */ @@ -285,6 +285,11 @@ public final class DerivedProperty implements UCD_Types { else if (nfx.isTrailing(cp)) return MAYBE; else return ""; } + + public String getListingValue(int cp) { + return getValue(cp, LONG); + } + boolean hasValue(int cp) { return getValue(cp).length() != 0; } }; @@ -460,6 +465,12 @@ of characters, the first of which has a non-zero combining class. if (isCompEx(cp)) return true; return false; } + /* + public String getListingValue(int cp) { + if (getValueType() != BINARY) return getValue(cp, SHORT); + return getProperty(SHORT); + } + */ }; dprops[FullCompInclusion] = new UnicodeProperty() { @@ -537,37 +548,15 @@ of characters, the first of which has a non-zero combining class. hasUnassigned = true; shortName = "DI"; header = header = "# Derived Property: " + name - + "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - White_Space"; + + "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>" + + "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)"; } boolean hasValue(int cp) { + if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true; + if (ucdData.getBinaryProperty(cp,Other_Default_Ignorable_Code_Point)) return true; if (ucdData.getBinaryProperty(cp, White_space)) return false; byte cat = ucdData.getCategory(cp); - if (cat == Cf || cat == Cs || cat == Cc - || ucdData.getBinaryProperty(cp,Reserved_Cf_Code_Point)) return true; - return false; - } - }; - -/* - GraphemeExtend = 27, - GraphemeBase = 28, -# GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink -# GraphemeBase := - -*/ - dprops[GraphemeExtend] = new UnicodeProperty() { - { - type = DERIVED_CORE; - name = "Grapheme_Extend"; - shortName = "GrExt"; - header = header = "# Derived Property: " + name - + "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link"; - } - boolean hasValue(int cp) { - if (ucdData.getBinaryProperty(cp, GraphemeExtend)) return false; - byte cat = ucdData.getCategory(cp); - if (cat == Me || cat == Mn || cat == Mc - || ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true; + if (cat == Cf || cat == Cs || cat == Cc) return true; return false; } }; @@ -576,6 +565,7 @@ of characters, the first of which has a non-zero combining class. { name = "Other_Case_Ignorable"; shortName = "OCI"; + isStandard = false; header = header = "# Binary Property"; } @@ -608,7 +598,7 @@ of characters, the first of which has a non-zero combining class. } boolean hasValue(int cp) { if (hasSoftDot(cp)) return true; - if (!Main.nfkd.hasDecomposition(cp)) return false; + if (!Main.nfkd.normalizationDiffers(cp)) return false; String decomp = Main.nfd.normalize(cp); boolean ok = false; for (int i = decomp.length()-1; i >= 0; --i) { @@ -630,6 +620,7 @@ of characters, the first of which has a non-zero combining class. dprops[Case_Ignorable] = new UnicodeProperty() { { name = "Case_Ignorable"; + isStandard = false; shortName = "CI"; header = header = "# Derived Property: " + name + "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf"; @@ -642,6 +633,33 @@ of characters, the first of which has a non-zero combining class. } }; +/* + GraphemeExtend = 27, + GraphemeBase = 28, +# GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink +# GraphemeBase := + +*/ + dprops[GraphemeExtend] = new UnicodeProperty() { + { + type = DERIVED_CORE; + name = "Grapheme_Extend"; + shortName = "GrExt"; + header = header = "# Derived Property: " + name + + "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link - CGJ" + + "\r\n# (CGJ = U+034F)"; + + } + boolean hasValue(int cp) { + if (cp == 0x034F) return false; + if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false; + byte cat = ucdData.getCategory(cp); + if (cat == Me || cat == Mn || cat == Mc + || ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true; + return false; + } + }; + dprops[GraphemeBase] = new UnicodeProperty() { { type = DERIVED_CORE; @@ -649,9 +667,11 @@ of characters, the first of which has a non-zero combining class. shortName = "GrBase"; header = header = "# Derived Property: " + name - + "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Link - Grapheme_Extend"; + + "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp" + + "\r\n# - Grapheme_Extend - Grapheme_Link - CGJ"; } boolean hasValue(int cp) { + if (cp == 0x034F) return false; byte cat = ucdData.getCategory(cp); if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp || ucdData.getBinaryProperty(cp,GraphemeLink)) return false; diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java index c25cd6ff2ae..23834bc8458 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $ -* $Date: 2002/03/15 00:34:46 $ -* $Revision: 1.9 $ +* $Date: 2002/03/20 00:21:43 $ +* $Revision: 1.10 $ * ******************************************************************************* */ @@ -56,8 +56,7 @@ final class DerivedPropertyLister extends PropertyLister { } public String valueName(int cp) { - if (uprop.getValueType() != BINARY) return uprop.getValue(cp, LONG); - return uprop.getProperty(LONG); + return uprop.getListingValue(cp); } //public String optionalComment(int cp) { diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java index a69dac59e38..534b264ccf9 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $ -* $Date: 2002/03/15 01:57:01 $ -* $Revision: 1.6 $ +* $Date: 2002/03/20 00:21:43 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -24,6 +24,8 @@ public class GenerateCaseFolding implements UCD_Types { public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting public static boolean NF_CLOSURE = false; // picks short value for SIMPLE if in FULL, changes weighting + static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1 + // PICK_SHORT & NF_CLOSURE = false for old style @@ -83,8 +85,14 @@ public class GenerateCaseFolding implements UCD_Types { if (rFull != null && rFull.equals(rSimple) || (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) { String type = "C"; - if (ch == 0x130 || ch == 0x131) type = "I"; - drawLine(out, ch, type, rFull); + if (ch == 0x130) { + drawLine(out, ch, "F", "i\u0307"); + drawLine(out, ch, "I", "\u0130"); + } else if (ch == 0x131) { + drawLine(out, ch, "I", "i"); + } else { + drawLine(out, ch, type, rFull); + } } else { if (rFull != null) { drawLine(out, ch, "F", rFull); @@ -404,7 +412,7 @@ public class GenerateCaseFolding implements UCD_Types { } static boolean isExcluded(int ch) { - if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE + // if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE if (ch == 0x0132 || ch == 0x0133) return true; // skip IJ, ij if (ch == 0x037A) return true; // skip GREEK YPOGEGRAMMENI if (0x249C <= ch && ch <= 0x24B5) return true; // skip PARENTHESIZED LATIN SMALL LETTER A.. @@ -456,7 +464,7 @@ public class GenerateCaseFolding implements UCD_Types { btitle = Main.nfc.normalize(btitle); } - if (ch == -1) {// for debugging, change to actual character + if (ch == CHECK_CHAR) { System.out.println("Code: " + Main.ucd.getCodeAndName(ch)); System.out.println("Decomp: " + Main.ucd.getCodeAndName(decomp)); System.out.println("Base: " + Main.ucd.getCodeAndName(base)); @@ -474,11 +482,17 @@ public class GenerateCaseFolding implements UCD_Types { // presumably if there is a single code point, it would already be in the simple mappings if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1 - && UTF16.countCodePoint(title) == 1) continue; + && UTF16.countCodePoint(title) == 1) { + if (ch == CHECK_CHAR) System.out.println("Skipping single code point: " + Main.ucd.getCodeAndName(ch)); + continue; + } // if there is no change from the base, skip - if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) continue; + if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) { + if (ch == CHECK_CHAR) System.out.println("Skipping equals base: " + Main.ucd.getCodeAndName(ch)); + continue; + } // fix special cases // if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue; @@ -488,20 +502,26 @@ public class GenerateCaseFolding implements UCD_Types { // if there are no changes from the original, or the expanded original, skip - if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) continue; + if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) { + if (ch == CHECK_CHAR) System.out.println("Skipping unchanged: " + Main.ucd.getCodeAndName(ch)); + continue; + } String name = Main.ucd.getName(ch); int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1 - : name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 3 - : name.indexOf("LIGATURE") >= 0 ? 2 - : name.indexOf("GEGRAMMENI") < 0 ? 4 - : UTF16.countCodePoint(ftitle) == 1 ? 5 - : UTF16.countCodePoint(fupper) == 2 ? 6 - : 7; + : ch == 0x130 ? 2 + : name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4 + : name.indexOf("LIGATURE") >= 0 ? 3 + : name.indexOf("GEGRAMMENI") < 0 ? 5 + : UTF16.countCodePoint(ftitle) == 1 ? 6 + : UTF16.countCodePoint(fupper) == 2 ? 7 + : 8; + + if (ch == CHECK_CHAR) System.out.println("Order: " + order + " for " + Main.ucd.getCodeAndName(ch)); // HACK - boolean denormalize = !normalize && order != 5 && order != 6; + boolean denormalize = !normalize && order != 6 && order != 7; String mapping = Utility.hex(ch) + "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Main.nfd.normalize(flower) : flower) @@ -544,12 +564,15 @@ public class GenerateCaseFolding implements UCD_Types { out.println("# The German es-zed is special--the normal mapping is to SS."); out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase())"); break; - case 2: out.println("# Ligatures"); break; - case 3: skipLine = true; break; - case 4: out.println("# No corresponding uppercase precomposed character"); break; - case 5: Utility.appendFile("SpecialCasingIota.txt", true, out); break; - case 6: out.println("# Some characters with YPOGEGRAMMENI are also have no corresponding titlecases"); break; - case 7: skipLine = true; break; + case 2: + out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below."); + break; + case 3: out.println("# Ligatures"); break; + case 4: skipLine = true; break; + case 5: out.println("# No corresponding uppercase precomposed character"); break; + case 6: Utility.appendFile("SpecialCasingIota.txt", true, out); break; + case 7: out.println("# Some characters with YPOGEGRAMMENI are also have no corresponding titlecases"); break; + case 8: skipLine = true; break; } if (!skipLine) out.println(); } diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index 2c2f6a69631..83b000f167e 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2002/03/15 01:57:01 $ -* $Revision: 1.15 $ +* $Date: 2002/03/20 00:21:42 $ +* $Revision: 1.16 $ * ******************************************************************************* */ @@ -1183,7 +1183,7 @@ public class GenerateData implements UCD_Types { Utility.dot(i); if (!Main.ucd.isRepresented(i)) continue; - if (!Main.nfd.hasDecomposition(i)) { + if (!Main.nfd.normalizationDiffers(i)) { if (Main.ucd.getScript(i) == LATIN_SCRIPT) { int cp = i; String hex = "u" + Utility.hex(cp, 4); diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index 85117936b80..a7984d422d1 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2002/03/15 00:34:46 $ -* $Revision: 1.9 $ +* $Date: 2002/03/20 00:21:42 $ +* $Revision: 1.10 $ * ******************************************************************************* */ @@ -77,8 +77,11 @@ public final class Main implements UCD_Types { } else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{ucdVersion}); else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i]; else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null); + else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable(); else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML(); else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed(); + else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability(); + else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main(); else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry(); diff --git a/tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java index 7c64b852b60..17249bcbd0e 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $ -* $Date: 2001/12/13 23:35:57 $ -* $Revision: 1.7 $ +* $Date: 2002/03/20 00:21:42 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -85,7 +85,7 @@ final class MyPropertyLister extends PropertyLister { if (cat == Cn && propMask != (BINARY_PROPERTIES | Noncharacter_Code_Point) - && propMask != (BINARY_PROPERTIES | Reserved_Cf_Code_Point) + && propMask != (BINARY_PROPERTIES | Other_Default_Ignorable_Code_Point) && propMask != (CATEGORY | Cn)) { if (BRIDGE) return CONTINUE; else return EXCLUDE; diff --git a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java index 69faa4382bb..f5409577b0c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java +++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $ -* $Date: 2002/03/15 01:57:01 $ -* $Revision: 1.7 $ +* $Date: 2002/03/20 00:21:42 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -67,6 +67,13 @@ public final class Normalizer implements UCD_Types { return getName(form); } + /** + * Return string name + */ + public String getUCDVersion() { + return data.getUCDVersion(); + } + /** * Does compose? */ @@ -120,7 +127,6 @@ public final class Normalizer implements UCD_Types { } /** - */ private StringBuffer hasDecompositionBuffer = new StringBuffer(); public boolean hasDecomposition(int cp) { @@ -129,6 +135,7 @@ public final class Normalizer implements UCD_Types { if (hasDecompositionBuffer.length() != 1) return true; return cp != hasDecompositionBuffer.charAt(0); } + */ /** * Does a quick check to see if the string is in the current form. Checks canonical order and @@ -427,6 +434,11 @@ public final class Normalizer implements UCD_Types { if (ucd. */ } + + String getUCDVersion() { + return ucd.getVersion(); + } + /* Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL diff --git a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt index 93feb19c801..3cba2ab3a89 100644 --- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt +++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt @@ -48,10 +48,14 @@ # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri # The following rules handle those cases. +0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE +0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE + # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. # This matches the behavior of the canonically equivalent I-dot_above -0307; ; 0307; 0307; After_Soft_Dotted; # COMBINING DOT ABOVE +0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE +0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. @@ -63,7 +67,6 @@ 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I -# Note: the following cases are already in the UnicodeData file. +# Note: the following case is already in the UnicodeData file. # 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I -# 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 9a0cb8a2592..84410be901d 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2001/12/13 23:35:57 $ -* $Revision: 1.9 $ +* $Date: 2002/03/20 00:21:42 $ +* $Revision: 1.10 $ * ******************************************************************************* */ @@ -1027,6 +1027,19 @@ to guarantee identifier closure. } private void fillFromFile(String version) { + try { + fillFromFile2(version); + } catch (ChainException e) { + try { + ConvertUCD.main(new String[]{version}); + } catch (Exception e2) { + throw new ChainException("Can't build data file for {0}", new Object[]{version}, e2); + } + fillFromFile2(version); + } + } + + private void fillFromFile2(String version) { DataInputStream dataIn = null; String fileName = BIN_DIR + "UCD_Data" + version + ".bin"; int uDataFileCount = 0; diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index 4502192e1b0..faab929bdd1 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2002/03/15 00:34:46 $ -* $Revision: 1.12 $ +* $Date: 2002/03/20 00:21:42 $ +* $Revision: 1.13 $ * ******************************************************************************* */ @@ -636,6 +636,7 @@ final class UCD_Names implements UCD_Types { "TEH_MARBUTA", "TETH", "WAW", + "SYRIAC WAW", "YEH", "YEH_BARREE", "YEH_WITH_TAIL", @@ -652,21 +653,21 @@ final class UCD_Names implements UCD_Types { "BEH", "BETH", "DAL", - "DALATH RISH", + "DALATH_RISH", "E", "FEH", - "FINAL SEMKATH", + "FINAL_SEMKATH", "GAF", "GAMAL", "HAH", - "HAMZA ON HEH GOAL", + "HAMZA_ON_HEH_GOAL", "HE", "HEH", - "HEH GOAL", + "HEH_GOAL", "HETH", "KAF", "KAPH", - "KNOTTED HEH", + "KNOTTED_HEH", "LAM", "LAMADH", "MEEM", @@ -677,23 +678,24 @@ final class UCD_Names implements UCD_Types { "QAF", "QAPH", "REH", - "REVERSED PE", + "REVERSED_PE", "SAD", "SADHE", "SEEN", "SEMKATH", "SHIN", - "SWASH KAF", + "SWASH_KAF", "TAH", "TAW", - "TEH MARBUTA", + "TEH_MARBUTA", "TETH", "WAW", + "SYRIAC WAW", "YEH", - "YEH BARREE", - "YEH WITH TAIL", + "YEH_BARREE", + "YEH_WITH_TAIL", "YUDH", - "YUDH HE", + "YUDH_HE", "ZAIN", }; diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 2a76aea26e6..060beaf4eb9 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2002/03/15 00:34:46 $ -* $Revision: 1.9 $ +* $Date: 2002/03/20 00:21:42 $ +* $Revision: 1.10 $ * ******************************************************************************* */ @@ -15,7 +15,7 @@ package com.ibm.text.UCD; public interface UCD_Types { - public static final int dVersion = 7; // change to fix the generated file D version. If less than zero, no "d" + public static final int dVersion = 8; // change to fix the generated file D version. If less than zero, no "d" public static final String BASE_DIR = "C:\\DATA\\"; public static final String UCD_DIR = BASE_DIR + "UCD\\"; @@ -23,7 +23,7 @@ public interface UCD_Types { public static final String GEN_DIR = BASE_DIR + "GEN\\"; - static final byte BINARY_FORMAT = 5; // bumped if binary format of UCD changes + static final byte BINARY_FORMAT = 6; // bumped if binary format of UCD changes // Unicode Property Types static final byte @@ -188,7 +188,7 @@ public interface UCD_Types { IDS_TrinaryOperator = 24, Radical = 25, UnifiedIdeograph = 26, - Reserved_Cf_Code_Point = 27, + Other_Default_Ignorable_Code_Point = 27, Deprecated = 28, Soft_Dotted = 29, Logical_Order_Exception = 30, @@ -407,13 +407,14 @@ public static byte TEH_MARBUTA = 41, TETH = 42, WAW = 43, - YEH = 44, - YEH_BARREE = 45, - YEH_WITH_TAIL = 46, - YUDH = 47, - YUDH_HE = 48, - ZAIN = 49, - LIMIT_JOINING_GROUP = 50; + SYRIAC_WAW = 44, + YEH = 45, + YEH_BARREE = 46, + YEH_WITH_TAIL = 47, + YUDH = 48, + YUDH_HE = 49, + ZAIN = 50, + LIMIT_JOINING_GROUP = 51; static final byte NFD = 0, NFC = 1, NFKD = 2, NFKC = 3; public static final int diff --git a/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java b/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java index ae344277088..027839710c8 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java @@ -137,6 +137,14 @@ public abstract class UnicodeProperty implements UCD_Types { } } + /** + * special hack for NFD/NFKD + */ + public String getListingValue(int cp) { + if (getValueType() != BINARY) return getValue(cp, LONG); + return getProperty(LONG); + } + /** * Does it have the propertyValue? */ diff --git a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java index d10b87d675f..bf2a1b9a4f4 100644 --- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $ -* $Date: 2002/03/15 01:57:01 $ -* $Revision: 1.10 $ +* $Date: 2002/03/20 00:21:42 $ +* $Revision: 1.11 $ * ******************************************************************************* */ @@ -674,12 +674,12 @@ can help you narrow these down. if (cp == 0x3131) { System.out.println("Debug: " + idnProhibited + ", " + idnUnassigned - + ", " + Main.nfkc.hasDecomposition(cp) + + ", " + Main.nfkd.normalizationDiffers(cp) + ", " + Main.ucd.getCodeAndName(Main.nfkc.normalize(cp)) + ", " + Main.ucd.getCodeAndName(Main.nfc.normalize(cp))); } - if (!idnProhibited && ! idnUnassigned && Main.nfkc.hasDecomposition(cp)) { + if (!idnProhibited && ! idnUnassigned && Main.nfkd.normalizationDiffers(cp)) { String kc = Main.nfkc.normalize(cp); String c = Main.nfc.normalize(cp); if (kc.equals(c)) continue; @@ -1045,6 +1045,47 @@ E0020-E007F; [TAGGING CHARACTERS] } return result; } + + /* + + "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>" + + "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)"; + */ + + public static void diffIgnorable () { + Main.setUCD(); + + UnicodeSet control = UnifiedBinaryProperty.make(CATEGORY + Cf, Main.ucd).getSet(); + + System.out.println("Cf"); + Utility.showSetNames("", control, false, Main.ucd); + + control.addAll(UnifiedBinaryProperty.make(CATEGORY + Cc, Main.ucd).getSet()); + + System.out.println("Cf + Cc"); + Utility.showSetNames("", control, false, Main.ucd); + + control.addAll(UnifiedBinaryProperty.make(CATEGORY + Cs, Main.ucd).getSet()); + + System.out.println("Cf + Cc + Cs"); + Utility.showSetNames("", control, false, Main.ucd); + + control.removeAll(UnifiedBinaryProperty.make(BINARY_PROPERTIES + White_space, Main.ucd).getSet()); + + System.out.println("Cf + Cc + Cs - WhiteSpace"); + Utility.showSetNames("", control, false, Main.ucd); + + control.add(0x2060,0x206f).add(0xFFF0,0xFFFB).add(0xE0000,0xE0FFF); + + System.out.println("(Cf + Cc + Cs - WhiteSpace) + ranges"); + Utility.showSetNames("", control, false, Main.ucd); + + UnicodeSet odicp = UnifiedBinaryProperty.make(BINARY_PROPERTIES + Other_Default_Ignorable_Code_Point, Main.ucd).getSet(); + + odicp.removeAll(control); + + System.out.println("Minimal Default Ignorable Code Points"); + Utility.showSetNames("", odicp, true, Main.ucd); + } public static void IdentifierTest() { @@ -1241,6 +1282,95 @@ E0020-E007F; [TAGGING CHARACTERS] if (cat == Lu || cat == Lt || cat == Ll) return "LC"; return Main.ucd.getCategoryID(cp); } + + static public void verifyNormalizationStability() { + Main.setUCD(); + verifyNormalizationStability2("3.1.0"); + verifyNormalizationStability2("3.0.0"); + } + + static public void verifyNormalizationStability2(String version) { + + Main.nfd.normalizationDiffers(0x10300); + + UCD older = UCD.make(version); // Main.ucd.getPreviousVersion(); + + Normalizer oldNFC = new Normalizer(Normalizer.NFC, older.getVersion()); + Normalizer oldNFD = new Normalizer(Normalizer.NFD, older.getVersion()); + Normalizer oldNFKC = new Normalizer(Normalizer.NFKC, older.getVersion()); + Normalizer oldNFKD = new Normalizer(Normalizer.NFKD, older.getVersion()); + + System.out.println("Testing " + Main.nfd.getUCDVersion() + " against " + oldNFD.getUCDVersion()); + + for (int i = 0; i <= 0x10FFFF; ++i) { + Utility.dot(i); + if (!Main.ucd.isAssigned(i)) continue; + byte cat = Main.ucd.getCategory(i); + if (cat == Cs || cat == PRIVATE_USE) continue; + + if (i == 0x5e) { + System.out.println("debug"); + String test1 = Main.nfkd.normalize(i); + String test2 = oldNFKD.normalize(i); + System.out.println("Testing (new/old)" + Main.ucd.getCodeAndName(i)); + System.out.println("\t" + Main.ucd.getCodeAndName(test1)); + System.out.println("\t" + Main.ucd.getCodeAndName(test2)); + } + + if (older.isAssigned(i)) { + + int newCan = Main.ucd.getCombiningClass(i); + int oldCan = older.getCombiningClass(i); + if (newCan != oldCan) { + System.out.println("FAILS CCC STABILITY: " + newCan + " != " + oldCan + + "; " + Main.ucd.getCodeAndName(i)); + } + + verifyEquals(i, "NFD STABILITY (new/old)", Main.nfd.normalize(i), oldNFD.normalize(i)); + verifyEquals(i, "NFC STABILITY (new/old)", Main.nfc.normalize(i), oldNFC.normalize(i)); + verifyEquals(i, "NFKD STABILITY (new/old)", Main.nfkd.normalize(i), oldNFKD.normalize(i)); + verifyEquals(i, "NFKC STABILITY (new/old)", Main.nfkc.normalize(i), oldNFKC.normalize(i)); + + } else { + // not in older version. + // (1) If there is a decomp, and it is composed of all OLD characters, then it must NOT compose + if (Main.nfd.normalizationDiffers(i)) { + String decomp = Main.nfd.normalize(i); + if (noneHaveCategory(decomp, Cn, older)) { + String recomp = Main.nfc.normalize(decomp); + if (recomp.equals(UTF16.valueOf(i))) { + Utility.fixDot(); + System.out.println("FAILS COMP STABILITY: " + Main.ucd.getCodeAndName(i)); + System.out.println("\t" + Main.ucd.getCodeAndName(decomp)); + System.out.println("\t" + Main.ucd.getCodeAndName(recomp)); + System.out.println(); + throw new IllegalArgumentException("Comp stability"); + } + } + } + } + } + } + + public static boolean noneHaveCategory(String s, byte cat, UCD ucd) { + int cp; + for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(s, i); + byte cat2 = ucd.getCategory(i); + if (cat == cat2) return false; + } + return true; + } + + public static void verifyEquals(int cp, String message, String a, String b) { + if (!a.equals(b)) { + Utility.fixDot(); + System.out.println("FAILS " + message + ": " + Main.ucd.getCodeAndName(cp)); + System.out.println("\t" + Main.ucd.getCodeAndName(a)); + System.out.println("\t" + Main.ucd.getCodeAndName(b)); + System.out.println(); + } + } public static void checkAgainstUInfo() { /*