diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java index 15834c01ca1..35601f57d73 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $ -* $Date: 2001/09/19 23:33:16 $ -* $Revision: 1.4 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -14,55 +14,13 @@ package com.ibm.text.UCD; import com.ibm.text.utility.*; import com.ibm.text.*; +import java.util.*; public class DerivedProperty implements UCD_Types { UCD ucdData; - static final int - PropMath = 0, - PropAlphabetic = 1, - PropLowercase = 2, - PropUppercase = 3, - - ID_Start = 4, - ID_Continue_NO_Cf = 5, - - Mod_ID_Start = 6, - Mod_ID_Continue_NO_Cf = 7, - - Missing_Uppercase = 8, - Missing_Lowercase = 9, - Missing_Mixedcase = 10, - - FC_NFKC_Closure = 11, - - FullCompExclusion = 12, - FullCompInclusion = 13, - - QuickNFD = 14, - QuickNFC = 15, - QuickNFKD = 16, - QuickNFKC = 17, - - ExpandsOnNFD = 18, - ExpandsOnNFC = 19, - ExpandsOnNFKD = 20, - ExpandsOnNFKC = 21, - - GenNFD = 22, - GenNFC = 23, - GenNFKD = 24, - GenNFKC = 25, - - DefaultIgnorable = 26, - GraphemeExtend = 27, - GraphemeBase = 28, - - FC_NFC_Closure = 29, - - LIMIT = 30; - + // ADD CONSTANT to UCD_TYPES public DerivedProperty(UCD ucd) { ucdData = ucd; @@ -74,9 +32,9 @@ public class DerivedProperty implements UCD_Types { else return "Unimplemented!!"; } - public String getName(int propNumber) { + public String getName(int propNumber, byte style) { DProp dp = dprops[propNumber]; - if (dp != null) return dp.getName(); + if (dp != null) return dp.getName(style); else return "Unimplemented!!"; } @@ -87,10 +45,17 @@ public class 
DerivedProperty implements UCD_Types { } public boolean isDefined(int propNumber) { + if (propNumber < 0 || propNumber >= dprops.length) return false; return dprops[propNumber] != null; } + public boolean isTest(int propNumber) { + if (!isDefined(propNumber)) return false; + return dprops[propNumber].isTest(); + } + public boolean hasProperty(int cp, int propNumber) { + if (!isDefined(propNumber)) return false; return dprops[propNumber].hasProperty(cp); } @@ -112,9 +77,15 @@ public class DerivedProperty implements UCD_Types { "Mixedcase"}; private abstract class DProp { - String name, header; - String getName() { return name; } + boolean testStatus = false; + byte defaultStyle = LONG; + String name, shortName, header; + String getName(byte style) { + if (style == NORMAL) style = defaultStyle; + return style < LONG ? shortName : name; + } String getHeader() { return header; } + boolean isTest() { return testStatus; } abstract boolean hasProperty(int cp); public boolean propertyVaries() { return false; } public String getProperty(int cp) { return hasProperty(cp) ? name : ""; } @@ -125,6 +96,7 @@ public class DerivedProperty implements UCD_Types { ExDProp(int i) { nfx = nf[i-ExpandsOnNFD]; name = "Expands_On_" + NAME[i-ExpandsOnNFD]; + shortName = "XO_" + NAME[i-ExpandsOnNFD]; header = "# Derived Property: " + name + "\r\n# Generated according to UAX #15." + "\r\n# Characters whose normalized length is not one." @@ -139,11 +111,80 @@ public class DerivedProperty implements UCD_Types { } }; + class NF_UnsafeStartProp extends DProp { + Normalizer nfx; + int prop; + + NF_UnsafeStartProp(int i) { + prop = i-NFD_UnsafeStart; + nfx = nf[prop]; + name = NAME[prop] + "_UnsafeStart"; + shortName = NAME[prop] + "_SS"; + header = "# Derived Property: " + name + + "\r\n# Generated according to UAX #15." + + "\r\n# Characters that are cc==0, BUT which may interact with previous characters." 
+ ; + } + boolean hasProperty(int cp) { + if (ucdData.getCombiningClass(cp) != 0) return false; + String norm = nfx.normalize(cp); + int first = UTF16.charAt(norm, 0); + if (ucdData.getCombiningClass(first) != 0) return true; + if ((prop == 1 || prop == 3) + && dprops[NFC_TrailingZero].hasProperty(first)) return true; // 1,3 == composing + return false; + } + }; + + + class NFC_Prop extends DProp { + BitSet bitset; + boolean filter = false; + boolean keepNonZero = true; + + NFC_Prop(int i) { + BitSet[] bitsets = new BitSet[3]; + switch(i) { + case NFC_Leading: bitsets[0] = bitset = new BitSet(); break; + case NFC_Resulting: bitsets[2] = bitset = new BitSet(); break; + case NFC_TrailingZero: keepNonZero = false; // FALL THRU + case NFC_TrailingNonZero: bitsets[1] = bitset = new BitSet(); break; + } + filter = bitsets[1] != null; + nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]); + + name = Names[i-NFC_Leading]; + shortName = SNames[i-NFC_Leading]; + header = "# Derived Property: " + name + + "\r\n# " + Description[i-NFC_Leading] + + "\r\n# NFKC characters are the same, after subtracting the NFKD = NO values." + + "\r\n# Generated according to UAX #15." + + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact." 
+ + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!"; + } + boolean hasProperty(int cp) { + boolean result = bitset.get(cp); + if (result && filter) { + result = (ucdData.getCombiningClass(cp) != 0) == keepNonZero; + } + return result; + } + final String[] Names = {"NFC_Leading", "NFC_TrailingNonZero", "NFC_TrailingZero", "NFC_Resulting"}; + final String[] SNames = {"NFC_L", "NFC_TNZ", "NFC_TZ", "NFC_R"}; + final String[] Description = { + "Characters that can combine with following characters in NFC", + "Characters that can combine with previous characters in NFC, and have non-zero combining class", + "Characters that can combine with previous characters in NFC, and have zero combining class", + "Characters that can result from a combination of other characters in NFC", + }; + }; + class GenDProp extends DProp { Normalizer nfx; Normalizer nfComp = null; GenDProp (int i) { + testStatus = true; nfx = nf[i-GenNFD]; name = NAME[i-GenNFD]; String compName = "the character itself"; @@ -201,6 +242,7 @@ public class DerivedProperty implements UCD_Types { name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase]; header = "# Derived Property: " + name + "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases"; + testStatus = true; } boolean hasProperty(int cp) { byte cat = ucdData.getCategory(cp); @@ -221,6 +263,7 @@ public class DerivedProperty implements UCD_Types { NO = NAME[i-QuickNFD] + "_NO"; MAYBE = NAME[i-QuickNFD] + "_MAYBE"; name = NAME[i-QuickNFD] + "_QuickCheck"; + shortName = NAME[i-QuickNFD] + "_QC"; header = "# Derived Property: " + name + "\r\n# Generated from computing decomposibles" + ((i == QuickNFC || i == QuickNFKC) @@ -250,9 +293,18 @@ public class DerivedProperty implements UCD_Types { dprops[i] = new GenDProp(i); } + for (int i = NFC_Leading; i <= NFC_Resulting; ++i) { + dprops[i] = new NFC_Prop(i); + } + + for (int i = NFD_UnsafeStart; i <= 
NFKC_UnsafeStart; ++i) { + dprops[i] = new NF_UnsafeStartProp(i); + } + dprops[ID_Start] = new DProp() { { name = "ID_Start"; + shortName = "IDS"; header = "# Derived Property: " + name + "\r\n# Characters that can start an identifier." + "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl"; @@ -265,6 +317,7 @@ public class DerivedProperty implements UCD_Types { dprops[ID_Continue_NO_Cf] = new DProp() { { name = "ID_Continue"; + shortName = "IDC"; header = "# Derived Property: " + name + "\r\n# Characters that can continue an identifier." + "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc" @@ -278,6 +331,7 @@ public class DerivedProperty implements UCD_Types { dprops[Mod_ID_Start] = new DProp() { { name = "XID_Start"; + shortName = "XIDS"; header = "# Derived Property: " + name + "\r\n# ID_Start modified for closure under NFKx" + "\r\n# Modified as described in UAX #15" @@ -292,6 +346,7 @@ public class DerivedProperty implements UCD_Types { dprops[Mod_ID_Continue_NO_Cf] = new DProp() { { name = "XID_Continue"; + shortName = "XIDC"; header = "# Derived Property: " + name + "\r\n# Mod_ID_Continue modified for closure under NFKx" + "\r\n# Modified as described in UAX #15" @@ -307,6 +362,7 @@ public class DerivedProperty implements UCD_Types { dprops[PropMath] = new DProp() { { name = "Math"; + shortName = name; header = "# Derived Property: " + name + "\r\n# Generated from: Sm + Other_Math"; } @@ -321,6 +377,7 @@ public class DerivedProperty implements UCD_Types { dprops[PropAlphabetic] = new DProp() { { name = "Alphabetic"; + shortName = "Alpha"; header = "# Derived Property: " + name + "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic"; } @@ -335,6 +392,7 @@ public class DerivedProperty implements UCD_Types { dprops[PropLowercase] = new DProp() { { name = "Lowercase"; + shortName = "Lower"; header = "# Derived Property: " + name + "\r\n# Generated from: Ll + Other_Lowercase"; } @@ -349,6 +407,7 @@ public class DerivedProperty implements UCD_Types { dprops[PropUppercase] 
= new DProp() { { name = "Uppercase"; + shortName = "Upper"; header = "# Derived Property: " + name + "\r\n# Generated from: Lu + Other_Uppercase"; } @@ -373,7 +432,9 @@ of characters, the first of which has a non-zero combining class. */ dprops[FullCompExclusion] = new DProp() { { - name = "Comp_Ex"; + name = "Full_Composition_Exclusion"; + shortName = "Comp_Ex"; + defaultStyle = SHORT; header = "# Derived Property: " + name + ": Full Composition Exclusion" + "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions"; @@ -390,7 +451,10 @@ of characters, the first of which has a non-zero combining class. dprops[FullCompInclusion] = new DProp() { { - name = "Comp_In"; + name = "Full_Composition_Inclusion"; + shortName = "Comp_In"; + defaultStyle = SHORT; + testStatus = true; header = "# Derived Property: " + name + ": Full Composition Inclusion" + "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion"; @@ -408,6 +472,7 @@ of characters, the first of which has a non-zero combining class. dprops[FC_NFKC_Closure] = new DProp() { { name = "FC_NFKC_Closure"; + shortName = "FC_NFKC"; header = "# Derived Property: " + name + "\r\n# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));" + "\r\n# Then if (c != b) add the mapping from a to c to the set of" @@ -427,6 +492,7 @@ of characters, the first of which has a non-zero combining class. dprops[FC_NFC_Closure] = new DProp() { { name = "FC_NFC_Closure"; + shortName = "FC_NFC"; header = "# Derived Property: " + name + "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));" + "\r\n# Then if (c != b) add the mapping from a to c to the set of" @@ -450,8 +516,9 @@ of characters, the first of which has a non-zero combining class. 
dprops[DefaultIgnorable] = new DProp() { { name = "Default_Ignorable_Code_Point"; + shortName = "DI"; header = header = "# Derived Property: " + name - + "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - WhiteSpace"; + + "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - White_Space"; } boolean hasProperty(int cp) { if (ucdData.getBinaryProperty(cp, White_space)) return false; @@ -471,11 +538,12 @@ of characters, the first of which has a non-zero combining class. */ dprops[GraphemeExtend] = new DProp() { { - name = "GraphemeExtend"; + name = "Grapheme_Extend"; + shortName = "GrExt"; header = header = "# Derived Property: " + name - + "\r\n# Generated from: Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink" + + "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link" + "\r\n# Used in the definition of GraphemeCluster: " - + "\r\n# GraphemeCluster ::= GraphameBase? ( GraphemeExtend | GraphemeLink Join_Control? GraphemeBase? )*"; + + "\r\n# GraphemeCluster ::= GraphameBase? ( Grapheme_Extend | Grapheme_Link Join_Control? Grapheme_Base? )*"; } boolean hasProperty(int cp) { if (ucdData.getBinaryProperty(cp, GraphemeExtend)) return false; @@ -486,13 +554,80 @@ of characters, the first of which has a non-zero combining class. 
} }; + dprops[Other_Case_Ignorable] = new DProp() { + { + name = "Other_Case_Ignorable"; + shortName = "OCI"; + + header = header = "# Binary Property"; + } + boolean hasProperty(int cp) { + switch(cp) { + case 0x27: case 0x2019: case 0xAD: return true; + // case 0x2d: case 0x2010: case 0x2011: +/* +0027 ; Other_Case_Ignorable # Po APOSTROPHE +00AD ; Other_Case_Ignorable # Pd SOFT HYPHEN +2019 ; Other_Case_Ignorable # Pf RIGHT SINGLE QUOTATION MARK +*/ + } + return false; + } + }; + + dprops[Type_i] = new DProp() { + { + name = "Special_Dotted"; + shortName = "SDot"; + header = header = "# Derived Property: " + name + + "\r\n# Generated from: all characters whose canonical decompositions end with a combining character sequence that" + + "\r\n# - starts with i or j" + + "\r\n# - has no combining marks above" + + "\r\n# - has no combining marks with zero canonical combining class" + ; + } + boolean hasProperty(int cp) { + if (cp == 'i' || cp == 'j') return true; + if (!nfkd.hasDecomposition(cp)) return false; + String decomp = nfd.normalize(cp); + boolean ok = false; + for (int i = decomp.length()-1; i >= 0; --i) { + char ch = decomp.charAt(i); + int cc = ucdData.getCombiningClass(ch); + if (cc == 230) return false; + if (cc == 0) { + if (ch == 'i' || ch == 'j') ok = true; + else return false; + } + } + return ok; + } + }; + + dprops[Case_Ignorable] = new DProp() { + { + name = "Case_Ignorable"; + shortName = "CI"; + header = header = "# Derived Property: " + name + + "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf"; + } + boolean hasProperty(int cp) { + byte cat = ucdData.getCategory(cp); + if (cat == Lm || cat == Cf || cat == Mn || cat == Me) return true; + if (dprops[Other_Case_Ignorable].hasProperty(cp)) return true; + return false; + } + }; + dprops[GraphemeBase] = new DProp() { { - name = "GraphemeBase"; + name = "Grapheme_Base"; + shortName = "GrBase"; + header = header = "# Derived Property: " + name - + "\r\n# Generated from: [0..10FFFF] - Cc 
- Cf - Cs - Co - Cn - Zl - Zp - GraphemeLink - GraphemeExtend" + + "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Link - Grapheme_Extend" + "\r\n# Used in the definition of GraphemeCluster: " - + "\r\n# GraphemeCluster ::= GraphameBase? ( GraphemeExtend | GraphemeLink Join_Control? GraphemeBase? )*"; + + "\r\n# GraphemeCluster ::= GraphameBase? ( Grapheme_Extend | Grapheme_Link Join_Control? Grapheme_Base? )*"; } boolean hasProperty(int cp) { byte cat = ucdData.getCategory(cp); diff --git a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java index 6710c92effa..b0fb5c4d815 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $ -* $Date: 2001/09/19 23:33:16 $ -* $Revision: 1.3 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -25,7 +25,7 @@ class DiffPropertyLister extends PropertyLister { } public String propertyName(int cp) { - return ucdData.getVersion(); + return major_minor_only(ucdData.getVersion()); } /* @@ -49,9 +49,10 @@ class DiffPropertyLister extends PropertyLister { public String headerString() { if (oldUCD != null) { - return "# Differences between " + ucdData.getVersion() + " and " + oldUCD.getVersion(); + return "# Differences between " + major_minor_only(ucdData.getVersion()) + + " and " + major_minor_only(oldUCD.getVersion()); } else { - return "# Allocated as of " + ucdData.getVersion(); + return "# Designated as of " + major_minor_only(ucdData.getVersion()); } } @@ -80,6 +81,10 @@ class DiffPropertyLister extends PropertyLister { return count; } */ + + private String major_minor_only(String s) 
{ + return s.substring(0, s.lastIndexOf('.')); + } } diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index 878d6899abe..37b3afc8a9b 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2001/09/19 23:33:16 $ -* $Revision: 1.6 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -19,19 +19,25 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import com.ibm.text.utility.*; +import com.ibm.text.UTF16; + public class GenerateData implements UCD_Types { + static UnifiedBinaryProperty ubp; + public static void main (String inVersion, String[] args) throws IOException { System.out.println("START"); ucd = UCD.make(inVersion); + ubp = new UnifiedBinaryProperty(ucd); + System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate()))); String version = ucd.getVersion(); for (int i = 0; i < args.length; ++i) { String arg = args[i]; if (arg.charAt(0) == '#') return; // skip rest of line - int mask = 0; + long mask = 0; Utility.fixDot(); System.out.println("Argument: " + args[i]); @@ -39,7 +45,16 @@ public class GenerateData implements UCD_Types { if (arg.equalsIgnoreCase("partition")) { partitionProperties(); } else if (arg.equalsIgnoreCase("list")) { - listProperties(); + listProperties(); + } else if (arg.equalsIgnoreCase("listAccents")) { + listCombiningAccents(); + + } else if (arg.equalsIgnoreCase("listGreekVowels")) { + listGreekVowels(); + + } else if (arg.equalsIgnoreCase("listKatakana")) { + listKatakana(); + } else if (arg.equalsIgnoreCase("diff")) { listDifferences(); } else if 
(arg.equalsIgnoreCase("DerivedBidiClass")) { @@ -91,6 +106,18 @@ public class GenerateData implements UCD_Types { mask = Utility.setBits(mask, DerivedProperty.DefaultIgnorable, DerivedProperty.FC_NFC_Closure-1); generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-" + version ); + } else if (arg.equalsIgnoreCase("caseignorable")) { + mask = Utility.setBits(0, DerivedProperty.Other_Case_Ignorable, DerivedProperty.Type_i); + generateDerived(mask, HEADER_DERIVED, "CaseIgnorable-" + version ); + + } else if (arg.equalsIgnoreCase("nfcprops")) { + mask = Utility.setBits(0, NFC_Leading, NFC_Resulting); + generateDerived(mask, HEADER_DERIVED, "NFKC_SafeStart-" + version); + + } else if (arg.equalsIgnoreCase("nfunsafestart")) { + mask = Utility.setBits(0, NFD_UnsafeStart, NFKC_UnsafeStart); + generateDerived(mask, HEADER_DERIVED, "NFUnsafeStart-" + version); + } else if (arg.equalsIgnoreCase("DerivedAge")) { generateAge("DerivedAge-" + version ); @@ -202,11 +229,11 @@ public class GenerateData implements UCD_Types { output.println(); } - public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException { + public static void generateDerived (long bitMask, int headerChoice, String fileName) throws IOException { PrintWriter output = Utility.openPrintWriter(fileName + "dX.txt"); doHeader(fileName, output, headerChoice); for (int i = 0; i < DerivedProperty.LIMIT; ++i) { - if ((bitMask & (1<> 8) != last) { last = j >> 8; @@ -349,8 +376,8 @@ public class GenerateData implements UCD_Types { if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue; if (!ucd.isAllocated(cp)) continue; - boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, i); - boolean jProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, j); + boolean iProp = ubp.get(cp, i); + boolean jProp = ubp.get(cp, j); if (jProp) ++jCount; if (iProp) { @@ -361,8 +388,8 @@ public class GenerateData implements UCD_Types { } if (iCount == 0 || 
jCount == 0) continue; - String jNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.SHORT); - //String jNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.LONG); + String jNameShort = ubp.getFullID(j, SHORT); + //String jNameLong = ubp.getFullID(j, LONG); String rel = bothCount == 0 ? "DISJOINT" : i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS" @@ -384,26 +411,69 @@ public class GenerateData implements UCD_Types { } - public static void listProperties() { + public static void listProperties() throws IOException { + String propAbb = ""; + String prop = ""; + + Map duplicates = new TreeMap(); + Set sorted = new TreeSet(java.text.Collator.getInstance()); + String spacing; + + for(int k = 0; k < UCD_Names.NON_ENUMERATED.length; ++k) { + propAbb = UCD_Names.NON_ENUMERATED[k][0]; + prop = UCD_Names.NON_ENUMERATED[k][1]; + spacing = Utility.repeat(" ", 10-propAbb.length()); + sorted.add("AA; " + propAbb + spacing + "; " + prop); + checkDuplicate(duplicates, propAbb, prop); + if (!prop.equals(propAbb)) checkDuplicate(duplicates, prop, prop); + } + + sorted.add("xx; T ; True"); + checkDuplicate(duplicates, "T", "xx"); + sorted.add("xx; F ; False"); + checkDuplicate(duplicates, "F", "xx"); + sorted.add("qc; Y ; Yes"); + checkDuplicate(duplicates, "Y", "qc"); + sorted.add("qc; N ; No"); + checkDuplicate(duplicates, "Y", "qc"); + sorted.add("qc; M ; Maybe"); + checkDuplicate(duplicates, "Y", "qc"); + + for (int i = 0; i < LIMIT_ENUM; ++i) { int type = i & 0xFF00; - if (type == JOINING_GROUP || type == AGE) continue; - if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue; - String value = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG); + if (type == AGE) continue; + if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue; + + if (type == i && type != BINARY_PROPERTIES && type != DERIVED) { + propAbb = ubp.getPropertyName(i, SHORT); + prop = ubp.getPropertyName(i, 
LONG); + spacing = Utility.repeat(" ", 10-propAbb.length()); + sorted.add("BB; " + propAbb + spacing + "; " + prop); + checkDuplicate(duplicates, propAbb, prop); + if (!prop.equals(propAbb)) checkDuplicate(duplicates, prop, prop); + } + + if (!ubp.isDefined(i)) continue; + if (ubp.isTest(i)) continue; + + String value = ubp.getID(i, LONG); if (value.length() == 0) value = "none"; else if (value.equals("")) continue; - String abbvalue = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT); + value = fixGaps(value); + + if (type == SCRIPT) { + value = ucd.getCase(value, FULL, TITLE); + } + + String abbvalue = ubp.getID(i, SHORT); if (abbvalue.length() == 0) abbvalue = "no"; if (type == COMBINING_CLASS) { - value = MyPropertyLister.getCombiningName(i); - if (value.length() == 0) { - if ((i & 0xFF) == 0) value = "99"; - else continue; - } - abbvalue = value; + if (value.startsWith("Fixed_")) { continue; } } - + + /* String elide = ""; if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{" + abbvalue @@ -421,9 +491,78 @@ public class GenerateData implements UCD_Types { + value + "}"; System.out.println("" + elide + "" + abb + "" + norm + ""); + */ + + spacing = Utility.repeat(" ", 10-abbvalue.length()); + + if (type == BINARY_PROPERTIES || type == DERIVED) { + sorted.add("ZZ; " + abbvalue + spacing + "; " + value); + checkDuplicate(duplicates, value, value); + if (!value.equals(abbvalue)) checkDuplicate(duplicates, abbvalue, value); + continue; + } + + sorted.add(propAbb + "; " + abbvalue + spacing + "; " + value); + checkDuplicate(duplicates, value, prop + "=" + value); + if (!value.equals(abbvalue)) checkDuplicate(duplicates, abbvalue, prop + "=" + value); + } + + PrintWriter log = Utility.openPrintWriter("PropertyAliases.txt"); + Utility.appendFile("PropertyAliasHeader.txt", log); + Utility.print(log, sorted, "\r\n", new MyBreaker()); + log.close(); + } + + static class MyBreaker implements Utility.Breaker { + 
public String get(Object current, Object old) { + if (old == null) return ""; + String c = current.toString(); + String o = old.toString(); + if (c.length() >= 2 && o.length() >= 0 && !c.substring(0,2).equals(o.substring(0,2))) { + return "\r\n"; + } + return ""; } } - + + static void checkDuplicate(Map m, String toCheck, String comment) { + String result = (String) m.get(toCheck); + if (result != null) { + System.out.println("Collision with " + toCheck); + System.out.println(" Between " + comment); + System.out.println(" And " + result); + } else { + m.put(skeleton(toCheck), comment); + } + } + + static String fixGaps(String source) { + StringBuffer result = new StringBuffer(); + byte lastCat = -1; + for (int i = 0; i < source.length(); ++i) { + char c = source.charAt(i); + if (c == ' ' || c == '-') c = '_'; + byte cat = ucd.getCategory(c); + if (lastCat == Ll && cat == Lu) { + result.append('_'); + } + result.append(c); + lastCat = cat; + } + return result.toString(); + } + + static String skeleton(String source) { + StringBuffer result = new StringBuffer(); + source = source.toLowerCase(); + for (int i = 0; i < source.length(); ++i) { + char c = source.charAt(i); + if (c < 'a' || c > 'z') continue; + result.append(c); + } + return result.toString(); + } + static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1; public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial, @@ -445,7 +584,7 @@ public class GenerateData implements UCD_Types { doHeader(file, output, headerChoice); int last = -1; for (int i = startEnum; i < endEnum; ++i) { - if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue; + if (!ubp.isDefined(i)) continue; if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE || i == (BINARY_PROPERTIES | Non_break) || i == (BINARY_PROPERTIES | CaseFoldTurkishI) @@ -689,14 +828,19 @@ public class GenerateData implements UCD_Types { static final void generateAge(String filename) throws IOException { PrintWriter log = 
Utility.openPrintWriter(filename + "dX.txt"); try { - log.println("# Derived file showing when various code points were allocated in Unicode"); + log.println("# Derived file showing when various code points were designated in Unicode"); log.println("# author: M. Davis"); log.println("# generated: " + new Date()); log.println("# Notes:"); - log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 1.1.0 listing."); + log.println("# - The term 'designated' means that a previously reserved code point was specified"); + log.println("# to be a noncharacter or surrogate, or assigned as a character,"); + log.println("# control or format code."); + log.println("# - Versions are only tracked from 1.1 onwards, since version 1.0"); + log.println("# predated changes required by the ISO 10646 merger."); + log.println("# - The Hangul Syllables that were removed from 2.0 are not included in the 1.1 listing."); log.println("# - The supplementary private use code points and the non-character code points"); - log.println("# were allocated in version 2.0, but not specifically listed in the UCD"); - log.println("# until versions 3.0.1 and 3.1.0 respectively."); + log.println("# were designated in version 2.0, but not specifically listed in the UCD"); + log.println("# until versions 3.0 and 3.1 respectively."); log.println("# ================================================"); log.println(); @@ -713,6 +857,9 @@ public class GenerateData implements UCD_Types { log.println("# ================================================"); log.println(); new DiffPropertyLister("3.0.0", "3.1.0", log).print(); + log.println("# ================================================"); + log.println(); + new DiffPropertyLister("3.1.0", "3.2.0", log).print(); /* printDiff("110", "200"); UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false); @@ -761,5 +908,133 @@ public class GenerateData implements UCD_Types { } + public static void 
listCombiningAccents() throws IOException { + PrintWriter log = Utility.openPrintWriter("ListAccents.txt"); + Normalizer nfd = new Normalizer(Normalizer.NFD); + Set set = new TreeSet(); + Set set2 = new TreeSet(); + + for (int i = 0; i < 0x10FFFF; ++i) { + Utility.dot(i); + if (!ucd.isRepresented(i)) continue; + + if (!nfd.hasDecomposition(i)) { + if (ucd.getScript(i) == LATIN_SCRIPT) { + int cp = i; + String hex = "u" + Utility.hex(cp, 4); + set.add("# yyy $x <> \\" + hex + " ; # " + ucd.getName(cp)); + } + continue; + } + + String decomp = nfd.normalize(i); + int j; + for (j = 0; j < decomp.length(); j += UTF16.getCharCount(i)) { + int cp = UTF16.charAt(decomp, j); + byte cat = ucd.getCategory(cp); + if (cat != Mn) continue; + String hex = "u" + Utility.hex(cp, 4); + set.add("# xxx $x <> \\" + hex + " ; # " + ucd.getName(cp)); + } + } + + Iterator it = set.iterator(); + while (it.hasNext()) { + log.println(it.next()); + } + log.close(); + } + + public static void listGreekVowels() throws IOException { + PrintWriter log = Utility.openPrintWriter("ListGreekVowels.txt"); + Normalizer nfd = new Normalizer(Normalizer.NFD); + Normalizer nfc = new Normalizer(Normalizer.NFC); + Set set = new TreeSet(); + Set set2 = new TreeSet(); + + String vowels = "\u03B1\u03B5\u03B7\u03B9\u03BF\u03C5\u03C9\u0391\u0395\u0397\u0399\u039F\u03A5\u03A9"; + String diphthongEnd = "\u03B9\u03C5\u0399\u03A5"; + String diphthongStart = "\u03B1\u03B5\u03B7\u03BF\u03C5\u0391\u0395\u0397\u039F\u03A5"; + String etas = "\u03B7\u0397"; + String iotas = "\u03B9\u0399"; + + for (char i = 0; i < 0xFFFF; ++i) { + Utility.dot(i); + if (!ucd.isRepresented(i)) continue; + if (ucd.getScript(i) != GREEK_SCRIPT) continue; + String decomp = nfd.normalize(i); + + if (decomp.indexOf('\u0306') >= 0) continue; // skip breve + if (decomp.indexOf('\u0304') >= 0) continue; // skip macron + + String comp = nfc.normalize(decomp); + if (!comp.equals(String.valueOf(i))) continue; // skip compats + + char first = 
decomp.charAt(0); + + if (vowels.indexOf(first) < 0) continue; + + String h = ""; + if (decomp.indexOf('\u0314') >= 0) h = "\uFFFF"; + + if (diphthongEnd.indexOf(first) >= 0) { + for (int j = 0; j < diphthongStart.length(); ++j) { + String v = diphthongStart.substring(j, j+1); + char vc = v.charAt(0); + if (ucd.getCategory(vc) == Ll && ucd.getCategory(first) == Lu) continue; + if (etas.indexOf(vc) >= 0 && iotas.indexOf(first) >= 0) continue; + set.add(new Pair(h + v + first, new Pair(v + decomp, v + i))); + } + } + set.add(new Pair(h+first, new Pair(decomp, String.valueOf(i)))); + } + + Iterator it = set.iterator(); + Object last = ""; + while (it.hasNext()) { + Pair p = (Pair) it.next(); + if (!last.equals(p.first)) { + log.println(); + last = p.first; + } else { + log.print(", "); + } + p = (Pair) p.second; + log.print(p.second); + } + log.close(); + } + + public static void listKatakana() throws IOException { + + for (char i = 'a'; i <= 'z'; ++i) { + doKana(String.valueOf(i)); + if (i == 'c') doKana("ch"); + if (i == 's') doKana("sh"); + if (i == 'd') { + doKana("dz"); + doKana("dj"); + } + } + + System.out.println(); + } + + public static void doKana(String i) { + + String vowels = "aeiou"; + System.out.println(); + System.out.print(i + " " + i + i); + System.out.println(); + for (int j = 0; j < vowels.length(); ++j) { + char c = vowels.charAt(j); + System.out.print(" " + i + c); + } + System.out.println(); + for (int j = 0; j < vowels.length(); ++j) { + char c = vowels.charAt(j); + System.out.print(" " + i + "y" + c); + } + } } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index 74f813079fe..d4b6139e1e4 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: 
/xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2001/09/19 23:33:16 $ -* $Revision: 1.3 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -37,6 +37,7 @@ public final class Main { } else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i]; else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML(); else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed(); + else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main(); else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test(); else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase(); @@ -49,8 +50,9 @@ public final class Main { //else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo(); else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts(); else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest(); - else if (arg.equalsIgnoreCase("GenerateData")) GenerateData.main(ucdVersion, Utility.split(args[++i],',')); + else if (arg.equalsIgnoreCase("Generate")) GenerateData.main(ucdVersion, Utility.split(args[++i],',')); else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null); + else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned(); else if (arg.equalsIgnoreCase("writeNormalizerTestSuite")) GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt"); else { diff --git a/tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java index 9f125d833cf..22e1d378ef0 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/MyPropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $ 
-* $Date: 2001/09/19 23:33:16 $ -* $Revision: 1.3 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -21,64 +21,36 @@ final class MyPropertyLister extends PropertyLister { static final boolean BRIDGE = false; private int propMask; + + UnifiedBinaryProperty ubp; public MyPropertyLister(UCD ucd, int propMask, PrintWriter output) { this.propMask = propMask; this.output = output; this.ucdData = ucd; + ubp = new UnifiedBinaryProperty(ucd); if (propMask < COMBINING_CLASS) usePropertyComment = false; // skip gen cat } - static String getCombiningName (int propMask) { - String s = ""; - switch (propMask & 0xFF) { - case 0: s = "NotReordered"; break; - case 1: s = "Overlay"; break; - case 7: s = "Nukta"; break; - case 8: s = "KanaVoicing"; break; - case 9: s = "Virama"; break; - case 202: s = "AttachedBelowLeft"; break; - case 204: s = "AttachedBelow"; break; - case 206: s = "AttachedBelowRight"; break; - case 208: s = "AttachedLeft"; break; - case 210: s = "AttachedRight"; break; - case 212: s = "AttachedAboveLeft"; break; - case 214: s = "AttachedAbove"; break; - case 216: s = "AttachedAboveRight"; break; - case 218: s = "BelowLeft"; break; - case 220: s = "Below"; break; - case 222: s = "BelowRight"; break; - case 224: s = "Left"; break; - case 226: s = "Right"; break; - case 228: s = "AboveLeft"; break; - case 230: s = "Above"; break; - case 232: s = "AboveRight"; break; - case 233: s = "DoubleBelow"; break; - case 234: s = "DoubleAbove"; break; - case 240: s = "IotaSubscript"; break; - } - return s; - } - public String headerString() { int main = (propMask & 0xFF00); if (main == COMBINING_CLASS) { - String s = getCombiningName(propMask); - if (s.length() == 0) s = "Other Combining Class"; + String s = UCD.getCombiningID_fromIndex((short)(propMask & 0xFF), LONG); + if (s.startsWith("Fixed")) s = "Other Combining Class"; return "# " + s; } else if (main == BINARY_PROPERTIES) { 
return ""; } else if (main == JOINING_GROUP) { return ""; } else { - String shortID = getUnifiedBinaryPropertyID(ucdData, propMask, SHORT); - String longID = getUnifiedBinaryPropertyID(ucdData, propMask, LONG); + String shortID = ubp.getID(propMask, SHORT); + String longID = ubp.getID(propMask, LONG); return "# " + shortID + (shortID.equals(longID) ? "" : "\t(" + longID + ")"); } } public String propertyName(int cp) { - return getUnifiedBinaryPropertyID(propMask); + return ubp.getID(propMask); } public String optionalComment(int cp) { @@ -115,7 +87,7 @@ final class MyPropertyLister extends PropertyLister { else return EXCLUDE; } - boolean inSet = getUnifiedBinaryProperty(cp, propMask); + boolean inSet = ubp.get(cp, propMask); /* if (cp >= 0x1D400 && cp <= 0x1D7C9 && cat != Cn) { if (propMask == (SCRIPT | LATIN_SCRIPT)) inSet = cp <= 0x1D6A3; @@ -133,151 +105,6 @@ final class MyPropertyLister extends PropertyLister { return INCLUDE; } - /** - * @return unified property number - */ - public static boolean isUnifiedBinaryPropertyDefined(UCD ucd, int propMask) { - int enum = propMask >> 8; - propMask &= 0xFF; - switch (enum) { - case CATEGORY>>8: return propMask != UNUSED_CATEGORY && propMask < LIMIT_CATEGORY; - case COMBINING_CLASS>>8: return ucd.isCombiningClassUsed((byte)propMask); - case BIDI_CLASS>>8: return propMask != BIDI_UNUSED && propMask < LIMIT_BIDI_CLASS; - case DECOMPOSITION_TYPE>>8: return propMask < LIMIT_DECOMPOSITION_TYPE; - case NUMERIC_TYPE>>8: return propMask < LIMIT_NUMERIC_TYPE; - case EAST_ASIAN_WIDTH>>8: return propMask < LIMIT_EAST_ASIAN_WIDTH; - case LINE_BREAK>>8: return propMask < LIMIT_LINE_BREAK; - case JOINING_TYPE>>8: return propMask < LIMIT_JOINING_TYPE; - case JOINING_GROUP>>8: return propMask < LIMIT_JOINING_GROUP; - case BINARY_PROPERTIES>>8: return propMask < LIMIT_BINARY_PROPERTIES; - case SCRIPT>>8: return propMask != UNUSED_SCRIPT && propMask < LIMIT_SCRIPT; - case AGE>>8: return propMask < LIMIT_AGE; - default: return false; - 
} - } - - public boolean getUnifiedBinaryProperty(int cp, int propMask) { - return getUnifiedBinaryProperty(ucdData, cp, propMask); - } - - static public boolean getUnifiedBinaryProperty(UCD ucd, int cp, int propMask) { - int enum = propMask >> 8; - propMask &= 0xFF; - switch (enum) { - case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break; - return ucd.getCategory(cp) == propMask; - case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break; - return ucd.getCombiningClass(cp) == propMask; - case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break; - return ucd.getBidiClass(cp) == propMask; - case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break; - return ucd.getDecompositionType(cp) == propMask; - case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break; - return ucd.getNumericType(cp) == propMask; - case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break; - return ucd.getEastAsianWidth(cp) == propMask; - case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break; - return ucd.getLineBreak(cp) == propMask; - case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break; - return ucd.getJoiningType(cp) == propMask; - case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break; - return ucd.getJoiningGroup(cp) == propMask; - case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break; - return ucd.getBinaryProperty(cp, propMask); - case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break; - return ucd.getScript(cp) == propMask; - case AGE>>8: if (propMask >= LIMIT_AGE) break; - return ucd.getAge(cp) == propMask; - } - throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)}); - } - - static final int SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2; - - public String getUnifiedBinaryPropertyID(int unifiedPropMask) { - return getUnifiedBinaryPropertyID(ucdData, unifiedPropMask, NORMAL); - } - - public static String getUnifiedBinaryPropertyID(UCD ucd, int 
unifiedPropMask) { - String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG); - String shortOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT); - if (longOne.equals(shortOne)) return longOne; - return shortOne + "(" + longOne + ")"; - } - - public static String getFullUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) { - String pre = ""; - if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) { - String preShort = UCD_Names.ABB_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "="; - String preLong = UCD_Names.SHORT_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "="; - if (style < LONG) pre = preShort; - else if (style == LONG || preShort.equals(preLong)) pre = preLong; - else pre = preShort + "(" + preLong + ")"; - } - String shortOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT); - if (shortOne.length() == 0) shortOne = "xx"; - String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG); - if (longOne.length() == 0) longOne = "none"; - - String post; - if (style < LONG) post = shortOne; - else if (style == LONG || shortOne.equals(longOne)) post = longOne; - else post = shortOne + "(" + longOne + ")"; - - if (pre.length() == 0) { - pre = post + "="; - post = "T"; - } - - return pre + post; - } - - public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) { - int enum = unifiedPropMask >> 8; - byte propMask = (byte)unifiedPropMask; - switch (enum) { - case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break; - if (style != LONG) return ucd.getCategoryID_fromIndex(propMask); - return UCD_Names.LONG_GC[propMask]; - case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break; - String s = ""; - if (style == LONG) { - s = getCombiningName(unifiedPropMask); - if (s.length() != 0) return s; - s = "fixed_"; - } - return s + ucd.getCombiningClassID_fromIndex((short)(0xFF & propMask)); - case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break; - if (style != LONG) return 
ucd.getBidiClassID_fromIndex(propMask); - return UCD_Names.LONG_BC[propMask]; - case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break; - if (style != SHORT) return ucd.getDecompositionTypeID_fromIndex(propMask); - return UCD_Names.SHORT_DT[propMask]; - case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break; - if (style != SHORT) return ucd.getNumericTypeID_fromIndex(propMask); - return UCD_Names.SHORT_NT[propMask]; - case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break; - if (style != LONG) return ucd.getEastAsianWidthID_fromIndex(propMask); - return UCD_Names.SHORT_EA[propMask]; - case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break; - if (style != LONG) return ucd.getLineBreakID_fromIndex(propMask); - return UCD_Names.LONG_LB[propMask]; - case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break; - if (style != LONG) return ucd.getJoiningTypeID_fromIndex(propMask); - return UCD_Names.LONG_JOINING_TYPE[propMask]; - case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break; - return ucd.getJoiningGroupID_fromIndex(propMask); - case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break; - if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex(propMask); - return UCD_Names.SHORT_BP[propMask]; - case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break; - if (style != SHORT) return ucd.getScriptID_fromIndex(propMask); - return UCD_Names.ABB_SCRIPT[propMask]; - case AGE>>8: if (propMask >= LIMIT_AGE) break; - return ucd.getAgeID_fromIndex(propMask); - } - throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)}); - } } diff --git a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java index 3fe3e6f50c4..299de02b008 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java +++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java @@ -5,8 +5,8 @@ 
******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $ -* $Date: 2001/09/06 01:29:48 $ -* $Revision: 1.3 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -208,12 +208,27 @@ public final class Normalizer implements UCD_Types { * pair is firstChar << 16 | secondChar. * Will need to be fixed for surrogates. */ - /* - public IntHashtable.IntEnumeration getComposition() { - return data.getComposition(); - } - */ + public void getCompositionStatus(BitSet leading, BitSet trailing, BitSet resulting) { + Iterator it = data.compTable.keySet().iterator(); + while (it.hasNext()) { + Long key = (Long)it.next(); + Integer result = (Integer)data.compTable.get(key); + long keyLong = key.longValue(); + if (leading != null) leading.set((int)(keyLong >>> 32)); + if (trailing != null) trailing.set((int)keyLong); + if (resulting != null) resulting.set(result.intValue()); + } + for (int i = UCD.LBase; i < UCD.TLimit; ++i) { + if (leading != null && UCD.isLeadingJamo(i)) leading.set(i); // set all initial Jamo (that form syllables) + if (trailing != null && UCD.isTrailingJamo(i)) trailing.set(i); // set all final Jamo (that form syllables) + } + if (leading != null) { + for (int i = UCD.SBase; i < UCD.SLimit; ++i) { + if (UCD.isDoubleHangul(i)) leading.set(i); // set all two-Jamo syllables + } + } + } public boolean isTrailing(int cp) { return this.composition ? 
data.isTrailing(cp) : false; diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java index 8608921214d..6bbda961a3d 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestData.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $ -* $Date: 2001/09/19 23:33:16 $ -* $Revision: 1.4 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -30,7 +30,7 @@ public class TestData implements UCD_Types { checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F"); checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD"); - int mask = 0; + long mask = 0; if (false) { @@ -166,7 +166,7 @@ public class TestData implements UCD_Types { output.println(); } - public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException { + public static void generateDerived (long bitMask, int headerChoice, String fileName) throws IOException { ucd = UCD.make("3.1.0"); PrintWriter output = Utility.openPrintWriter(fileName); doHeader(fileName, output, headerChoice); @@ -251,9 +251,11 @@ public class TestData implements UCD_Types { PrintWriter output = Utility.openPrintWriter(file); doHeader(file, output, headerChoice); + UnifiedBinaryProperty ubp = new UnifiedBinaryProperty(ucd); + int last = -1; for (int i = startEnum; i < endEnum; ++i) { - if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue; + if (!ubp.isDefined(i)) continue; if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE || i == (CATEGORY | UNUSED_CATEGORY) || i == (BINARY_PROPERTIES | Non_break) diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 0fd0b54c1a9..4255f48ad1d 100644 --- 
a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2001/09/19 23:33:16 $ -* $Revision: 1.3 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -296,24 +296,38 @@ public final class UCD implements UCD_Types { } throw new IllegalArgumentException("getCase: " + caseType + ", " + simpleVsFull); } - + + static final char SHY = '\u00AD'; + + static final char APOSTROPHE = '\u2019'; + public String getCase(String s, byte simpleVsFull, byte caseType, String condition) { if (UTF32.length32(s) == 1) return getCase(UTF32.char32At(s, 0), simpleVsFull, caseType); StringBuffer result = new StringBuffer(); int cp; byte currentCaseType = caseType; + DerivedProperty dp = new DerivedProperty(this); + for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { cp = UTF32.char32At(s, i); String mappedVersion = getCase(cp, simpleVsFull, currentCaseType, condition); result.append(mappedVersion); - if (caseType == TITLE) { - // if letter is cased, change to lowercase, otherwise change to TITLE + if (caseType == TITLE) { // set the case type for the next character + + // certain characters are ignored + if (cp == '-' || cp == SHY || cp == '\'' || cp == APOSTROPHE) continue; byte cat = getCategory(cp); - if (cat == Mn || cat == Me || cat == Mc) { - // ignore! 
- } else if (cat == Lu || cat == Ll || cat == Lt - || getBinaryProperty(cp, Other_Lowercase) - || getBinaryProperty(cp, Other_Uppercase)) { + if (cat == Mn || cat == Me || cat == Cf || cat == Lm) continue; + if (dp.hasProperty(cp, DerivedProperty.DefaultIgnorable)) continue; + // if DefaultIgnorable is not supported, then + // check for (Cf + Cc + Cs) - White_Space + // if (cat == Cs && cp != 0x85 && (cp < 9 || cp > 0xD)) continue; + + // if letter is cased, change next to lowercase, otherwise revert to TITLE + if (cat == Lu || cat == Ll || cat == Lt + || getBinaryProperty(cp, Other_Lowercase) // skip if not supported + || getBinaryProperty(cp, Other_Uppercase) // skip if not supported + ) { currentCaseType = LOWER; } else { currentCaseType = TITLE; @@ -528,6 +542,43 @@ public final class UCD implements UCD_Types { public static String getCategoryID_fromIndex(byte prop) { return UCD_Names.GC[prop]; } + + public String getCombiningID(int codePoint, byte style) { + return getCombiningID_fromIndex(getCombiningClass(codePoint), style); + } + + static String getCombiningID_fromIndex (short index, byte style) { + String s = "Fixed"; + switch (index) { + case 0: s = style < LONG ? "NR" : "NotReordered"; break; + case 1: s = style < LONG ? "OV" : "Overlay"; break; + case 7: s = style < LONG ? "NK" : "Nukta"; break; + case 8: s = style < LONG ? "KV" : "KanaVoicing"; break; + case 9: s = style < LONG ? "VR" : "Virama"; break; + case 202: s = style < LONG ? "ATBL" : "AttachedBelowLeft"; break; + case 204: s = style < LONG ? "ATB" : "AttachedBelow"; break; + case 206: s = style < LONG ? "ATBR" : "AttachedBelowRight"; break; + case 208: s = style < LONG ? "ATL" : "AttachedLeft"; break; + case 210: s = style < LONG ? "ATR" : "AttachedRight"; break; + case 212: s = style < LONG ? "ATAL" : "AttachedAboveLeft"; break; + case 214: s = style < LONG ? "ATA" : "AttachedAbove"; break; + case 216: s = style < LONG ? "ATAR" : "AttachedAboveRight"; break; + case 218: s = style < LONG ? 
"BL" : "BelowLeft"; break; + case 220: s = style < LONG ? "B" : "Below"; break; + case 222: s = style < LONG ? "BR" : "BelowRight"; break; + case 224: s = style < LONG ? "L" : "Left"; break; + case 226: s = style < LONG ? "R" : "Right"; break; + case 228: s = style < LONG ? "AL" : "AboveLeft"; break; + case 230: s = style < LONG ? "A" : "Above"; break; + case 232: s = style < LONG ? "AR" : "AboveRight"; break; + case 233: s = style < LONG ? "DB" : "DoubleBelow"; break; + case 234: s = style < LONG ? "DA" : "DoubleAbove"; break; /* short ID must differ from 233 (DoubleBelow = "DB"); DoubleAbove is "DA" */ + case 240: s = style < LONG ? "IS" : "IotaSubscript"; break; + default: s += "_" + index; + } + return s; + } + public String getBidiClassID(int codePoint) { return getBidiClassID_fromIndex(getBidiClass(codePoint)); @@ -868,7 +919,7 @@ to guarantee identifier closure. // Hangul constants - static final int + public static final int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, LCount = 19, VCount = 21, TCount = 28, NCount = VCount * TCount, // 588 @@ -891,6 +942,14 @@ to guarantee identifier closure. } private static final char[] pair = new char[2]; + + static boolean isDoubleHangul(int s) { + int SIndex = s - SBase; + if (0 > SIndex || SIndex >= SCount) { + throw new IllegalArgumentException("Not a Hangul Syllable: " + s); + } + return (SIndex % TCount) == 0; + } static String getHangulDecompositionPair(int ch) { int SIndex = ch - SBase; @@ -923,6 +982,10 @@ to guarantee identifier closure. 
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit); } + static boolean isLeadingJamo(int cp) { + return (LBase <= cp && cp < LLimit); + } + private void fillFromFile(String version) { DataInputStream dataIn = null; String fileName = BIN_DIR + "UCD_Data" + version + ".bin"; diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index 222dd30c2b8..fb3438aa150 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2001/09/01 00:06:15 $ -* $Revision: 1.3 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -17,6 +17,22 @@ import com.ibm.text.utility.*; final class UCD_Names implements UCD_Types { + + public static String[][] NON_ENUMERATED = { + {"na", "Name"}, + {"dm", "Decomposition_Mapping"}, + {"nv", "Numeric_Value"}, + {"bmg", "Bidi_Mirroring_Glyph"}, + {"lc", "Lowercase_Mapping"}, + {"uc", "Uppercase_Mapping"}, + {"tc", "Titlecase_Mapping"}, + {"cf", "Case_Folding"}, + {"slc", "Simple_Lowercase_Mapping"}, + {"suc", "Simple_Uppercase_Mapping"}, + {"stc", "Simple_Titlecase_Mapping"}, + {"sfc", "Simple_Case_Folding"}, + {"scc", "Special_Case_Condition"} + }; static final String[] UNIFIED_PROPERTIES = { "General Category (listing UnicodeData.txt, field 2: see UnicodeData.html)", @@ -32,7 +48,8 @@ final class UCD_Names implements UCD_Types { "Joining Group (listing ArabicShaping.txt, field 2)", "BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)", "Script", - "Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)" + "Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)", + "Derived" }; 
static final String[] SHORT_UNIFIED_PROPERTIES = { @@ -45,9 +62,10 @@ final class UCD_Names implements UCD_Types { "LineBreak", "JoiningType", "JoiningGroup", - "Value", + "", "Script", - "Age" + "Age", + "" }; static final String[] ABB_UNIFIED_PROPERTIES = { @@ -60,15 +78,16 @@ final class UCD_Names implements UCD_Types { "lb", "jt", "jg", - "va", + "", "sc", - "Ag" + "ag", + "", }; static final String[] BP = { - "BidiMirrored", - "CompositionExclusion", + "Bidi_Mirrored", + "Composition_Exclusion", "White_Space", "NonBreak", "Bidi_Control", @@ -87,46 +106,46 @@ final class UCD_Names implements UCD_Types { "Other_Lowercase", "Other_Uppercase", "Noncharacter_Code_Point", - "CaseFoldTurkishI", - "Other_GraphemeExtend", - "GraphemeLink", - "IDS_BinaryOperator", - "IDS_TrinaryOperator", + "Case_Fold_Turkish_I", + "Other_Grapheme_Extend", + "Grapheme_Link", + "IDS_Binary_Operator", + "IDS_Trinary_Operator", "Radical", - "UnifiedIdeograph", + "Unified_Ideograph", "Other_Default_Ignorable_Code_Point", "Deprecated", }; static final String[] SHORT_BP = { "BidiM", - "CExc", - "WhSp", + "CE", + "WSpace", "NBrk", - "BdCon", - "JCon", + "BidiC", + "JoinC", "Dash", - "Hyph", + "Hyphen", "QMark", - "TPunc", + "Term", "OMath", - "HexD", - "AHexD", - "OAlph", + "Hex", + "AHex", + "OAlpha", "Ideo", - "Diac", + "Dia", "Ext", - "OLoc", - "OUpc", + "OLower", + "OUpper", "NChar", "TurkI", - "OGrX", + "OGrExt", "GrLink", "IDSB", "IDST", "Radical", - "UCJK", - "RCf", + "UIdeo", + "ODI", "Dep", }; @@ -196,7 +215,7 @@ final class UCD_Names implements UCD_Types { "Numeric", "Alphabetic", "Ideographic", "Inseperable", "Hyphen", "CombiningMark", "BreakBefore", "BreakAfter", "Space", "MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak", - "ComplexContext", "Ambiguous", "BreakBeforeAndAfter", "Surrogate", "ZWSpace" + "ComplexContext", "Ambiguous", "BreakBoth", "Surrogate", "ZWSpace" }; public static final String[] SCRIPT = { diff --git 
a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 53175278cea..ba98d049b52 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2001/09/19 23:33:16 $ -* $Revision: 1.3 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -37,6 +37,10 @@ public interface UCD_Types { 13 Lower case equivalent mapping. Similar to 12. This field is informative. 14 Title case equivalent mapping. Similar to 12. This field is informative. */ + + + // for IDs + static final byte SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2; // Binary ENUM Grouping public static final int @@ -52,8 +56,9 @@ public interface UCD_Types { BINARY_PROPERTIES = 0x900, SCRIPT = 0xA00, AGE = 0xB00, + DERIVED = 0xC00, NEXT_ENUM = 0x100, - LIMIT_ENUM = AGE + 0x100; + LIMIT_ENUM = DERIVED + 0x100; public static final int LIMIT_COMBINING_CLASS = 256; @@ -384,4 +389,65 @@ public static byte YUDH_HE = 48, ZAIN = 49, LIMIT_JOINING_GROUP = 50; + + // DERIVED PROPERTY + + static final int + PropMath = 0, + PropAlphabetic = 1, + PropLowercase = 2, + PropUppercase = 3, + + ID_Start = 4, + ID_Continue_NO_Cf = 5, + + Mod_ID_Start = 6, + Mod_ID_Continue_NO_Cf = 7, + + Missing_Uppercase = 8, + Missing_Lowercase = 9, + Missing_Mixedcase = 10, + + FC_NFKC_Closure = 11, + + FullCompExclusion = 12, + FullCompInclusion = 13, + + QuickNFD = 14, + QuickNFC = 15, + QuickNFKD = 16, + QuickNFKC = 17, + + ExpandsOnNFD = 18, + ExpandsOnNFC = 19, + ExpandsOnNFKD = 20, + ExpandsOnNFKC = 21, + + GenNFD = 22, + GenNFC = 23, + GenNFKD = 24, + GenNFKC = 25, + + DefaultIgnorable = 26, + GraphemeExtend = 27, + GraphemeBase = 28, + + FC_NFC_Closure = 29, + + 
Other_Case_Ignorable = 30, + Case_Ignorable = 31, + Type_i = 32, + + NFC_Leading = 33, + NFC_TrailingNonZero = 34, + NFC_TrailingZero = 35, + NFC_Resulting = 36, + + NFD_UnsafeStart = 37, + NFC_UnsafeStart = 38, + NFKD_UnsafeStart = 39, + NFKC_UnsafeStart = 40, + + LIMIT = 41; + } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/UData.java b/tools/unicodetools/com/ibm/text/UCD/UData.java index f70d7d77a51..90178fa5430 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UData.java +++ b/tools/unicodetools/com/ibm/text/UCD/UData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $ -* $Date: 2001/08/31 00:29:50 $ -* $Revision: 1.2 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -185,7 +185,7 @@ class UData implements UCD_Types { result.append(" n='").append(Utility.quoteXML(name)).append("'\r\n"); int lastPos = result.length(); - + if (full || generalCategory != Lo) result.append(" gc='").append(UCD_Names.GC[generalCategory]).append('\''); if (full || combiningClass != 0) result.append(" cc='").append(combiningClass & 0xFF).append('\''); if (full || decompositionType != NONE) result.append(" dt='").append(UCD_Names.DT[decompositionType]).append('\''); @@ -232,7 +232,7 @@ class UData implements UCD_Types { result.append("/>"); return result.toString(); } - + public void writeBytes(DataOutputStream os) throws IOException { compact(); os.writeInt(codePoint); diff --git a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java index c7da3dd5fdd..faf04c2c3e5 100644 --- a/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/VerifyUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: 
/xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $ -* $Date: 2001/09/19 23:33:15 $ -* $Revision: 1.5 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -82,6 +82,16 @@ public class VerifyUCD implements UCD_Types { Utility.fixDot(); System.out.println("checkCase"); ucd = UCD.make(Main.ucdVersion); + + String test = "The qui'ck br\u2019own 'fox jum\u00ADped ov\u200Ber th\u200Ce lazy dog."; + + String ttest = ucd.getCase(test, FULL, TITLE); + + PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt"); + titleTest.println(test); + titleTest.println(ttest); + titleTest.close(); + initNormalizers(); System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE)); String fileName = "CaseDifferences.txt"; diff --git a/tools/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java b/tools/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java index c74eb3799d0..2b0215e4d83 100644 --- a/tools/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java +++ b/tools/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java,v $ -* $Date: 2001/08/31 00:29:50 $ -* $Revision: 1.2 $ +* $Date: 2001/10/25 20:33:46 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -18,7 +18,34 @@ import java.io.*; //import java.text.*; import com.ibm.text.utility.*; -public class WriteJavaScriptInfo { +public class WriteJavaScriptInfo implements UCD_Types { + + static public void assigned() throws IOException { + PrintWriter log = Utility.openPrintWriter("assigned.js"); + UCD ucd = UCD.make(); + boolean wasIn = false; + int lastWritten = -100; + int i; + for (i = 0; i <= 0x10FFFF; ++i) { + byte cat = ucd.getCategory(i); + boolean in = cat != Cn && cat != Co && cat 
!= Cs; + if (wasIn == in) continue; + if (in) { + log.print(i + ","); + lastWritten = i; + } else { + if (lastWritten != i-1) log.print(i-1); + log.println(","); + } + wasIn = in; + } + if (wasIn) { + if (lastWritten != i-1) log.print(i-1); + log.println(","); + } + log.close(); + } + /* TODO: fix enumeration of compositions static public void writeJavascriptInfo() throws IOException {