diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA_Data.java b/tools/unicodetools/com/ibm/text/UCA/UCA_Data.java index 53272c18be9..17ba0e0e063 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA_Data.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA_Data.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Data.java,v $ -* $Date: 2002/07/14 22:07:00 $ -* $Revision: 1.1 $ +* $Date: 2004/03/11 19:03:19 $ +* $Revision: 1.2 $ * ******************************************************************************* */ @@ -27,7 +27,7 @@ import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; public class UCA_Data implements UCA_Types { - static final boolean DEBUG = true; + static final boolean DEBUG = false; static final boolean DEBUG_SHOW_ADD = false; private Normalizer toD; diff --git a/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt b/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt index f933abd0596..ad4ee67b6a2 100644 --- a/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt @@ -1,7 +1,4 @@ -# Correlated with Unicode 4.0 -# Note: The casing of block names is not normative. -# For example, "Basic Latin" and "BASIC LATIN" are equivalent. -# -# Code points not explicitly listed in this file are given the value No_Block. -# +# Note: The casing of block names is not normative. +# For example, "Basic Latin" and "BASIC LATIN" are equivalent. 
+# Format: # Start Code..End Code; Block Name diff --git a/tools/unicodetools/com/ibm/text/UCD/BuildNames.java b/tools/unicodetools/com/ibm/text/UCD/BuildNames.java index 564a51e50c8..b7f52f937a1 100644 --- a/tools/unicodetools/com/ibm/text/UCD/BuildNames.java +++ b/tools/unicodetools/com/ibm/text/UCD/BuildNames.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $ -* $Date: 2004/02/07 01:01:17 $ -* $Revision: 1.8 $ +* $Date: 2004/03/11 19:03:18 $ +* $Revision: 1.9 $ * ******************************************************************************* */ @@ -26,7 +26,7 @@ import com.ibm.text.utility.*; public class BuildNames implements UCD_Types { - static final boolean DEBUG = true; + static final boolean DEBUG = false; public static void main(String[] args) throws IOException { collectWords(); diff --git a/tools/unicodetools/com/ibm/text/UCD/CheckICU.java b/tools/unicodetools/com/ibm/text/UCD/CheckICU.java index d705335bd7c..68de58c0b9a 100644 --- a/tools/unicodetools/com/ibm/text/UCD/CheckICU.java +++ b/tools/unicodetools/com/ibm/text/UCD/CheckICU.java @@ -59,7 +59,7 @@ public class CheckICU { toolFactory = ToolUnicodePropertySource.make("4.0.0"); String[] quickList = { - "Canonical_Combining_Class", + // "Canonical_Combining_Class", // "Script", "Bidi_Mirroring_Glyph", "Case_Folding", //"Numeric_Value" }; diff --git a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java index 1a93de08d3d..67abe786fb1 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $ -* $Date: 2004/02/12 08:23:17 $ -* $Revision: 1.14 $ +* $Date: 2004/03/11 19:03:18 $ +* $Revision: 
1.15 $ * ******************************************************************************* */ @@ -305,6 +305,7 @@ public final class ConvertUCD implements UCD_Types { value.compact(); } + /* UData ud; ud = getEntry(0x5e); System.out.println("SPOT-CHECK: 5e: " + ud); @@ -320,6 +321,7 @@ public final class ConvertUCD implements UCD_Types { ud = getEntry(0xFFFF); System.out.println("SPOT-CHECK: FFFF: " + ud); + */ writeJavaData(); } @@ -410,7 +412,7 @@ public final class ConvertUCD implements UCD_Types { int count = Utility.split(line,';',parts); - if (parts[0].equals("2801")) { + if (false && parts[0].equals("2801")) { System.out.println("debug?"); } @@ -468,7 +470,7 @@ public final class ConvertUCD implements UCD_Types { if (end == 0) end = cpStart; for (int j = cpStart; j <= end; ++j) { - if (j != UCD.mapToRepresentative(j, false)) continue; + if (j != UCD.mapToRepresentative(j, Integer.MAX_VALUE)) continue; if (skipLetters && getEntry(cpStart).isLetter()) continue; appendCharProperties(j, prop); } @@ -490,7 +492,7 @@ public final class ConvertUCD implements UCD_Types { if (val.equals("")) continue; // skip empty values, they mean default for (int cps = cpStart; cps <= cpTop; ++cps) { - if (UCD.mapToRepresentative(cps, false) != cps) continue; // skip condensed ranges + if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) continue; // skip condensed ranges if (key.equals("binary")) { appendCharProperties(cps, val); @@ -508,7 +510,7 @@ public final class ConvertUCD implements UCD_Types { if (type.equals("I")) { data.simpleCaseFolding = val; setBinaryProperty(cps, CaseFoldTurkishI); - System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " + if (DEBUG) System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val)); } } else if (labels[0].equals("SpecialCasing") // special handling for special casing @@ -658,7 +660,7 @@ public final class ConvertUCD implements UCD_Types { System.out.println("Warning: NULL 
name\r\n" + uData); System.out.println(); } - if (uData.codePoint == 0x2801) { + if (false && uData.codePoint == 0x2801) { System.out.println("SPOT-CHECK: " + uData); } uData.writeBytes(dataOut); diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedAgeHeader.txt b/tools/unicodetools/com/ibm/text/UCD/DerivedAgeHeader.txt index 1c786cf31c8..3c071e9302f 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedAgeHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedAgeHeader.txt @@ -1,16 +1,29 @@ # # Unicode Character Database: Derived Property Data -# This file shows when various code points were designated in Unicode +# This file shows when various code points were first assigned in Unicode. +# +# Caution: When using the Age *property*, all assigned code points +# in each version are included, not just the newly assigned code points. +# For more information, see http://www.unicode.org/reports/tr18/ +# # Notes: -# - The term 'designated' means that a previously reserved code point was specified -# to be a noncharacter or surrogate, or assigned as a character, -# control or format code. +# +# - The term 'assigned' means that a previously reserved code point was assigned +# to be a character (graphic, format, control, or private-use); +# a noncharacter code point; or a surrogate code point. +# For more information, see The Unicode Standard Section 2.4 +# # - Versions are only tracked from 1.1 onwards, since version 1.0 # predated changes required by the ISO 10646 merger. +# # - The Hangul Syllables that were removed from 2.0 are not included in the 1.1 listing. +# # - The supplementary private use code points and the non-character code points -# were designated in version 2.0, but not specifically listed in the UCD +# were assigned in version 2.0, but not specifically listed in the UCD # until versions 3.0 and 3.1 respectively. 
# +# - Contiguous ranges are broken into separate lines where they would cross code point +# types: graphic, format, control, private-use, surrogate, noncharacter +# # For details on the contents of each version, see # http://www.unicode.org/versions/enumeratedversions.html. diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java index 9f92c520377..0addccbb127 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $ -* $Date: 2004/02/18 03:08:59 $ -* $Revision: 1.25 $ +* $Date: 2004/03/11 19:03:17 $ +* $Revision: 1.26 $ * ******************************************************************************* */ @@ -378,7 +378,7 @@ public final class DerivedProperty implements UCD_Types { shortName = "IDC"; header = "# Derived Property: " + name + "\r\n# Characters that can continue an identifier." 
- + "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc" + + "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc + Other_ID_Continue" + "\r\n# NOTE: Cf characters should be filtered out."; } public boolean hasValue(int cp) { diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index ba8af3285cd..a5a3efa731a 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2004/02/18 03:08:59 $ -* $Revision: 1.34 $ +* $Date: 2004/03/11 19:03:17 $ +* $Revision: 1.35 $ * ******************************************************************************* */ @@ -110,8 +110,8 @@ public class GenerateData implements UCD_Types { output.println("# Generated algorithmically from the Unicode Character Database"); } output.println("# For documentation, see UCD.html"); - output.println("# Note: Unassigned and Noncharacter codepoints may be omitted"); - output.println("# if they have default property values."); + //output.println("# Note: Unassigned and Noncharacter codepoints may be omitted"); + //output.println("# if they have default property values."); output.println(HORIZONTAL_LINE); output.println(); } diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java index 2902ec10ea6..759eef2c61d 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java @@ -5,6 +5,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; import java.io.UnsupportedEncodingException; +import java.lang.reflect.Field; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; @@ -20,42 
+21,18 @@ import java.util.TreeSet; import com.ibm.icu.dev.test.util.BagFormatter; import com.ibm.icu.dev.test.util.Tabber; +import com.ibm.icu.dev.test.util.UnicodeLabel; import com.ibm.icu.dev.test.util.UnicodeProperty; +import com.ibm.icu.text.NumberFormat; import com.ibm.icu.text.UnicodeSet; import com.ibm.text.utility.UnicodeDataFile; import com.ibm.text.utility.Utility; import com.ibm.icu.text.Collator; public class MakeUnicodeFiles { + public static int dVersion = 8; // change to fix the generated file D version. If less than zero, no "d" - static boolean DEBUG = true; - - public static void main() throws IOException { - generateFile("*"); - } - - static String[] FILE_OPTIONS = { - "Script nameStyle=none makeUppercase skipUnassigned=Common hackValues", - "Age nameStyle=none noLabel skipValue=unassigned", - "Numeric_Type nameStyle=none makeFirstLetterLowercase skipValue=None", - "General_Category nameStyle=none valueStyle=short noLabel", - "Line_Break nameStyle=none valueStyle=short skipUnassigned=Unknown", - "Joining_Type nameStyle=none valueStyle=short skipValue=Non_Joining", - "Joining_Group nameStyle=none skipValue=No_Joining_Group makeUppercase", - "East_Asian_Width nameStyle=none valueStyle=short skipUnassigned=Neutral", - "Decomposition_Type nameStyle=none skipValue=None makeFirstLetterLowercase hackValues", - "Bidi_Class nameStyle=none valueStyle=short skipUnassigned=Left_To_Right", - "Block nameStyle=none noLabel valueList", - "Canonical_Combining_Class nameStyle=none valueStyle=short skipUnassigned=Not_Reordered longValueHeading=ccc", - "Hangul_Syllable_Type nameStyle=none valueStyle=short skipValue=Not_Applicable", - "NFD_Quick_Check nameStyle=short valueStyle=short skipValue=Yes", - "NFC_Quick_Check nameStyle=short valueStyle=short skipValue=Yes", - "NFKC_Quick_Check nameStyle=short valueStyle=short skipValue=Yes", - "NFKD_Quick_Check nameStyle=short valueStyle=short skipValue=Yes", - "FC_NFKC_Closure nameStyle=short" - }; - - static String[] 
hackNameList = { + /*static String[] hackNameList = { "noBreak", "Arabic_Presentation_Forms-A", "Arabic_Presentation_Forms-B", "CJK_Symbols_and_Punctuation", "Combining_Diacritical_Marks_for_Symbols", "Enclosed_CJK_Letters_and_Months", "Greek_and_Coptic", @@ -66,218 +43,450 @@ public class MakeUnicodeFiles { "Supplementary_Private_Use_Area-A", "Supplementary_Private_Use_Area-B", "Canadian-Aboriginal", "Old-Italic" }; + static { + for (int i = 0; i < hackNameList.length; ++i) { + System.out.println("HackName:\t" + hackNameList[i]); + } + } + */ - static class PrintStyle { - static PrintStyle DEFAULT_PRINT_STYLE = new PrintStyle(); - static Map PRINT_STYLE_MAP = new TreeMap(UnicodeProperty.PROPERTY_COMPARATOR); - boolean noLabel = false; - boolean makeUppercase = false; - boolean makeFirstLetterLowercase = false; - boolean orderByRangeStart = false; - boolean interleaveValues = false; + static boolean DEBUG = false; + + public static void main() throws IOException { + generateFile(); + } + + static class Format { + public static Format theFormat = new Format(); // singleton + + + Map printStyleMap = new TreeMap(UnicodeProperty.PROPERTY_COMPARATOR); + static PrintStyle DEFAULT_PRINT_STYLE = new PrintStyle(); + Map fileToPropertySet = new TreeMap(); + Map fileToComments = new TreeMap(); + Map fileToDirectory = new TreeMap(); + TreeMap propertyToValueToComments = new TreeMap(); + Map hackMap = new HashMap(); + UnicodeProperty.MapFilter hackMapFilter; + String[] filesToDo; + + private Format(){ + build(); + } + /* + static String[] FILE_OPTIONS = { + "Script nameStyle=none makeUppercase skipUnassigned=Common hackValues", + "Age nameStyle=none noLabel skipValue=unassigned", + "Numeric_Type nameStyle=none makeFirstLetterLowercase skipValue=None", + "General_Category nameStyle=none valueStyle=short noLabel", + "Line_Break nameStyle=none valueStyle=short skipUnassigned=Unknown", + "Joining_Type nameStyle=none valueStyle=short skipValue=Non_Joining", + "Joining_Group 
nameStyle=none skipValue=No_Joining_Group makeUppercase", + "East_Asian_Width nameStyle=none valueStyle=short skipUnassigned=Neutral", + "Decomposition_Type nameStyle=none skipValue=None makeFirstLetterLowercase hackValues", + "Bidi_Class nameStyle=none valueStyle=short skipUnassigned=Left_To_Right", + "Block nameStyle=none noLabel valueList", + "Canonical_Combining_Class nameStyle=none valueStyle=short skipUnassigned=Not_Reordered longValueHeading=ccc", + "Hangul_Syllable_Type nameStyle=none valueStyle=short skipValue=Not_Applicable", + "NFD_Quick_Check nameStyle=short valueStyle=short skipValue=Yes", + "NFC_Quick_Check nameStyle=short valueStyle=short skipValue=Yes", + "NFKC_Quick_Check nameStyle=short valueStyle=short skipValue=Yes", + "NFKD_Quick_Check nameStyle=short valueStyle=short skipValue=Yes", + "FC_NFKC_Closure nameStyle=short" + }; + */ + + void printFileComments(PrintWriter pw, String filename) { + String fileComments = (String) fileToComments.get(filename); + if (fileComments != null) + pw.println(fileComments); + } + + private void addPrintStyle(String options) { + PrintStyle result = new PrintStyle(); + printStyleMap.put(result.parse(options), result); + } + + public PrintStyle getPrintStyle(String propname) { + PrintStyle result = (PrintStyle) printStyleMap.get(propname); + if (result != null) + return result; + if (DEBUG) + System.out.println("Using default style!"); + return DEFAULT_PRINT_STYLE; + } + + public static class PrintStyle { + boolean noLabel = false; + boolean makeUppercase = false; + boolean makeFirstLetterLowercase = false; + boolean orderByRangeStart = false; + boolean interleaveValues = false; boolean hackValues = false; String nameStyle = "none"; String valueStyle = "long"; String skipValue = null; String skipUnassigned = null; String longValueHeading = null; - - static void add(String options) { - PrintStyle result = new PrintStyle(); - PRINT_STYLE_MAP.put(result.parse(options), result); - } - static PrintStyle get(String 
propname) { - PrintStyle result = (PrintStyle) PRINT_STYLE_MAP.get(propname); - if (result != null) return result; - if (DEBUG) System.out.println("Using default style!"); - return DEFAULT_PRINT_STYLE; - } + boolean sortNumeric = false; + String parse(String options) { options = options.replace('\t', ' '); String[] pieces = Utility.split(options, ' '); for (int i = 1; i < pieces.length; ++i) { String piece = pieces[i]; // binary - if (piece.equals("noLabel")) noLabel = true; - else if (piece.equals("makeUppercase")) makeUppercase = true; - else if (piece.equals("makeFirstLetterLowercase")) makeFirstLetterLowercase = true; - else if (piece.equals("orderByRangeStart")) orderByRangeStart = true; - else if (piece.equals("valueList")) interleaveValues = true; - else if (piece.equals("hackValues")) hackValues = true; + if (piece.equals("noLabel")) + noLabel = true; + else if (piece.equals("makeUppercase")) + makeUppercase = true; + else if (piece.equals("makeFirstLetterLowercase")) + makeFirstLetterLowercase = true; + else if (piece.equals("orderByRangeStart")) + orderByRangeStart = true; + else if (piece.equals("valueList")) + interleaveValues = true; + else if (piece.equals("hackValues")) + hackValues = true; + else if (piece.equals("sortNumeric")) + sortNumeric = true; // with parameter - else if (piece.startsWith("valueStyle=")) valueStyle = afterEquals(piece); - else if (piece.startsWith("nameStyle=")) nameStyle = afterEquals(piece); - else if (piece.startsWith("longValueHeading=")) longValueHeading = afterEquals(piece); - else if (piece.startsWith("skipValue=")) skipValue = afterEquals(piece); - else if (piece.startsWith("skipUnassigned=")) skipUnassigned = afterEquals(piece); - else if (piece.length() != 0) { - throw new IllegalArgumentException("Illegal PrintStyle Parameter: " + piece + " in " + pieces[0]); + else if (piece.startsWith("valueStyle=")) + valueStyle = afterEquals(piece); + else if (piece.startsWith("nameStyle=")) + nameStyle = afterEquals(piece); + 
else if (piece.startsWith("longValueHeading=")) + longValueHeading = afterEquals(piece); + else if (piece.startsWith("skipValue=")) { + if (skipUnassigned != null) + throw new IllegalArgumentException("Can't have both skipUnassigned and skipValue"); + skipValue = afterEquals(piece); + } else if (piece.startsWith("skipUnassigned=")) { + if (skipValue != null) + throw new IllegalArgumentException("Can't have both skipUnassigned and skipValue"); + skipUnassigned = afterEquals(piece); + } else if (piece.length() != 0) { + throw new IllegalArgumentException( + "Illegal PrintStyle Parameter: " + + piece + + " in " + + pieces[0]); } } - if (DEBUG && options.indexOf('=') >= 0) { - System.out.println(pieces[0]); - if (longValueHeading != null)System.out.println(" name " + longValueHeading); - if (nameStyle != null) System.out.println(" nameStyle " + nameStyle); - if (longValueHeading != null) System.out.println(" longValueHeading " + longValueHeading); - if (skipValue != null) System.out.println(" skipValue " + skipValue); - if (skipUnassigned != null) System.out.println(" skipUnassigned " + skipUnassigned); - } return pieces[0]; } - String afterEquals(String source) { - return source.substring(source.indexOf('=')+1); + public String toString() { + Class myClass = getClass(); + String result = myClass.getName() + "\n"; + Field[] myFields = myClass.getDeclaredFields(); + for (int i = 0; i < myFields.length; ++i) { + String value = ""; + try { + Object obj = myFields[i].get(this); + if (obj == null) value = ""; + else value = obj.toString(); + } catch (Exception e) {} + result += "\t" + myFields[i].getName() + "=<" + value + ">\n"; + } + return result; } - } - static { - for (int i = 0; i < FILE_OPTIONS.length; ++i) { - PrintStyle.add(FILE_OPTIONS[i]); - } - } - - static Map hackMap = new HashMap(); - static { - for (int i = 0; i < hackNameList.length; ++i) { - String item = hackNameList[i]; - String regularItem = UnicodeProperty.regularize(item,true); - 
hackMap.put(regularItem, item); - } } - static UnicodeProperty.MapFilter hackMapFilter = new UnicodeProperty.MapFilter(hackMap); + /* + static { + for (int i = 0; i < FILE_OPTIONS.length; ++i) { + PrintStyle.add(FILE_OPTIONS[i]); + } + } + */ - static class ValueComments { - TreeMap propertyToValueToComments = new TreeMap(); - ValueComments add(String property, String value, String comments) { - TreeMap valueToComments = (TreeMap) propertyToValueToComments.get(property); - if (valueToComments == null) { - valueToComments = new TreeMap(); - propertyToValueToComments.put(property, valueToComments); - } - valueToComments.put(value, comments); - return this; - } - String get(String property, String value) { - TreeMap valueToComments = (TreeMap) propertyToValueToComments.get(property); - if (valueToComments != null) return (String) valueToComments.get(value); - return null; - } - } - static ValueComments valueComments = new ValueComments(); - static { - for (int i = 0; i < UCD_Names.UNIFIED_PROPERTIES.length; ++i) { - String name = Utility.getUnskeleton(UCD_Names.UNIFIED_PROPERTIES[i], false); - valueComments.add(name, "*", "# " + UCD_Names.UNIFIED_PROPERTY_HEADERS[i]); - } - // HACK - valueComments.add("Bidi_Mirroring", "*", "# " + UCD_Names.UNIFIED_PROPERTY_HEADERS[9]); - try { - BufferedReader br = Utility.openReadFile("MakeUnicodeFiles.txt", Utility.UTF8); - String key = null; - String value = ""; - while (true) { - String line = br.readLine(); - if (line == null) break; - if (!line.startsWith("#")) { - if (key != null) {// store - String[] pieces = Utility.split(key, '='); - if (pieces.length == 1) { - valueComments.add(pieces[0].trim(), "*", value); - } else { - valueComments.add(pieces[0].trim(), pieces[1].trim(), value); - } - value = ""; - } - key = line; - } else { - value += line + "\n"; + void addValueComments(String property, String value, String comments) { + if (DEBUG) + showPVC(property, value, comments); + TreeMap valueToComments = + (TreeMap) 
propertyToValueToComments.get(property); + if (valueToComments == null) { + valueToComments = new TreeMap(); + propertyToValueToComments.put(property, valueToComments); + } + valueToComments.put(value, comments); + if (DEBUG && property.equals("BidiClass")) { + getValueComments(property, value); + } + } + + private void showPVC(String property, String value, String comments) { + System.out.println( + "Putting Property: <" + + property + + ">, Value: <" + + value + + ">, Comments: <" + + comments + ">"); + } + + String getValueComments(String property, String value) { + TreeMap valueToComments = + (TreeMap) propertyToValueToComments.get(property); + String result = null; + if (valueToComments != null) + result = (String) valueToComments.get(value); + if (DEBUG) System.out.println("Getting Property: <" + property + ">, Value: <" + + value + ">, Comment: <" + result + ">"); + return result; + } + + Map getValue2CommentsMap(String property) { + return (Map) propertyToValueToComments.get(property); + } + + static String afterEquals(String source) { + return source.substring(source.indexOf('=') + 1); + } + + static String afterWhitespace(String source) { + // Note: don't need to be international + for (int i = 0; i < source.length(); ++i) { + char ch = source.charAt(i); + if (Character.isWhitespace(source.charAt(i))) { + return source.substring(i).trim(); } } - br.close(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - throw new IllegalArgumentException("File missing"); + return ""; + } + + /*private void add(String name, String[] properties) { + fileToPropertySet.put(name, properties); + }*/ + + private void build() { + /* + for (int i = 0; i < hackNameList.length; ++i) { + String item = hackNameList[i]; + String regularItem = UnicodeProperty.regularize(item,true); + hackMap.put(regularItem, item); + } + */ + + /* + for (int i = 0; i < UCD_Names.UNIFIED_PROPERTIES.length; ++i) { + String name = 
Utility.getUnskeleton(UCD_Names.UNIFIED_PROPERTIES[i], false); + valueComments.add(name, "*", "# " + UCD_Names.UNIFIED_PROPERTY_HEADERS[i]); + System.out.println(); + System.out.println(name); + System.out.println("# " + UCD_Names.UNIFIED_PROPERTY_HEADERS[i]); + } + // HACK + valueComments.add("Bidi_Mirroring", "*", "# " + UCD_Names.UNIFIED_PROPERTY_HEADERS[9]); + */ + try { + BufferedReader br = + Utility.openReadFile("MakeUnicodeFiles.txt", Utility.UTF8); + String key = null; + String file = null, property = null, value = "", comments = ""; + while (true) { + String line = br.readLine(); + if (line == null) + break; + line = line.trim(); + if (line.length() == 0) + continue; + if (DEBUG) + System.out.println("\t" + line); + String lineValue = afterWhitespace(line); + if (line.startsWith("Format:")) { + addPrintStyle(property + " " + lineValue); // fix later + } else if (line.startsWith("#")) { + if (comments.length() != 0) comments += "\n"; + comments += line; + } else { + // end of comments, roll up + if (property != null) + addValueComments(property, value, comments); + comments = ""; + if (line.startsWith("Generate:")) { + filesToDo = Utility.split(lineValue, ' '); + if (filesToDo.length == 0) { + filesToDo = new String[] {""}; + } + } else if (line.startsWith("DeltaVersion:")) { + dVersion = Integer.parseInt(lineValue); + } else if (line.startsWith("File:")) { + int p2 = lineValue.lastIndexOf('/'); + file = lineValue.substring(p2+1); + if (p2 >= 0) { + fileToDirectory.put(file, lineValue.substring(0,p2+1)); + } + property = null; + } else if (line.startsWith("Property:")) { + property = lineValue; + addPropertyToFile(file, property); + value = ""; + } else if (line.startsWith("Value:")) { + value = lineValue; + } else if (line.startsWith("HackName:")) { + String regularItem = + UnicodeProperty.regularize(lineValue, true); + hackMap.put(regularItem, lineValue); + } else if (line.startsWith("FinalComments")) { + break; + } else { + throw new 
IllegalArgumentException("Unknown command: " + line); + } + } + } + br.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + throw new IllegalArgumentException("File missing"); + } + hackMapFilter = new UnicodeProperty.MapFilter(hackMap); + /* + add("PropertyValueAliases", null); + add("PropertyAliases", null); + add("SpecialCasing", null); + add("NormalizationTest", null); + add("StandardizedVariants", null); + add("CaseFolding", null); + add("DerivedAge", new String[] {"Age"}); + add("Scripts", new String[] {"Script"}); + add("HangulSyllableType", new String[] {"HangulSyllableType"}); + add("DerivedBidiClass", new String[] {"BidiClass"}); + add("DerivedBinaryProperties", new String[] {"BidiMirrored"}); + add("DerivedCombiningClass", new String[] {"CanonicalCombiningClass"}); + add("DerivedDecompositionType", new String[] {"DecompositionType"}); + add("DerivedEastAsianWidth", new String[] {"EastAsianWidth"}); + add("DerivedGeneralCategory", new String[] {"GeneralCategory"}); + add("DerivedJoiningGroup", new String[] {"JoiningGroup"}); + add("DerivedJoiningType", new String[] {"JoiningType"}); + add("DerivedLineBreak", new String[] {"LineBreak"}); + add("DerivedNumericType", new String[] {"NumericType"}); + add("DerivedNumericValues", new String[] {"NumericValue"}); + add("PropList", new String[] { + "White_Space", "Bidi_Control", "Join_Control", + "Dash", "Hyphen", "Quotation_Mark", + "Terminal_Punctuation", "Other_Math", + "Hex_Digit", "ASCII_Hex_Digit", + "Other_Alphabetic", + "Ideographic", + "Diacritic", "Extender", + "Other_Lowercase", "Other_Uppercase", + "Noncharacter_Code_Point", + "Other_Grapheme_Extend", + "Grapheme_Link", + "IDS_Binary_Operator", "IDS_Trinary_Operator", + "Radical", "Unified_Ideograph", + "Other_Default_Ignorable_Code_Point", + "Deprecated", "Soft_Dotted", + "Logical_Order_Exception", + "Other_ID_Start" + }); + add("DerivedCoreProperties", new String[] { + "Math", "Alphabetic", "Lowercase", 
"Uppercase", + "ID_Start", "ID_Continue", + "XID_Start", "XID_Continue", + "Default_Ignorable_Code_Point", + "Grapheme_Extend", "Grapheme_Base" + }); + add("DerivedNormalizationProps", new String[] { + "FC_NFKC_Closure", + "Full_Composition_Exclusion", + "NFD_QuickCheck", "NFC_QuickCheck", "NFKD_QuickCheck", "NFKC_QuickCheck", + "Expands_On_NFD", "Expands_On_NFC", "Expands_On_NFKD", "Expands_On_NFKC" + }); + */ + write(); + } + + private void write() { + TreeMap fileoptions = new TreeMap(); + /*for (int i = 0; i < FILE_OPTIONS.length; ++i) { + String s = FILE_OPTIONS[i]; + int pos = s.indexOf(' '); + String name = s.substring(0,pos); + String options = s.substring(pos).trim(); + fileoptions.put(name, options); + } + */ + for (Iterator it = fileToPropertySet.keySet().iterator(); it.hasNext();) { + String key = (String) it.next(); + if (DEBUG) { + System.out.println(); + System.out.println("File:\t" + key); + } + List propList2 = (List) fileToPropertySet.get(key); + if (propList2 == null) { + System.out.println("SPECIAL"); + continue; + } + for (Iterator pIt = propList2.iterator(); pIt.hasNext();) { + String prop = (String) pIt.next(); + String options = (String)fileoptions.get(prop); + if (DEBUG) { + System.out.println(); + System.out.println("Property:\t" + prop); + if (options != null) { + System.out.println("Format:\t" + options); + } + } + Map vc = getValue2CommentsMap(prop); + if (vc == null) continue; + for (Iterator it2 = vc.keySet().iterator(); it2.hasNext();) { + String value = (String) it2.next(); + String comment = (String) vc.get(value); + if (DEBUG) { + if (!value.equals("")) { + System.out.println("Value:\t" + value); + } + System.out.println(comment); + } + } + } + } + } + + private void addCommentToFile(String filename, String comment) { + fileToComments.put(filename, comment); + } + + private void addPropertyToFile(String filename, String property) { + List properties = (List) fileToPropertySet.get(filename); + if (properties == null) { + properties 
= new ArrayList(1); + fileToPropertySet.put(filename, properties); + } + properties.add(property); + } + public List getPropertiesFromFile(String filename) { + return (List) fileToPropertySet.get(filename); + } + public Set getFiles() { + return fileToPropertySet.keySet(); } } - - - - //CompositionExclusions - //SpecialCasing - //NormalizationTest - //add("CaseFolding", new String[] {"CaseFolding"}); - static Map contents = new TreeMap(); - static void add(String name, String[] properties) { - contents.put(name, properties); - } - static { - add("PropertyValueAliases", null); - add("PropertyAliases", null); - add("SpecialCasing", null); - add("NormalizationTest", null); - add("StandardizedVariants", null); - add("CaseFolding", null); - add("DerivedAge", new String[] {"Age"}); - add("Scripts", new String[] {"Script"}); - add("HangulSyllableType", new String[] {"HangulSyllableType"}); - add("DerivedBidiClass", new String[] {"BidiClass"}); - add("DerivedBinaryProperties", new String[] {"BidiMirrored"}); - add("DerivedCombiningClass", new String[] {"CanonicalCombiningClass"}); - add("DerivedDecompositionType", new String[] {"DecompositionType"}); - add("DerivedEastAsianWidth", new String[] {"EastAsianWidth"}); - add("DerivedGeneralCategory", new String[] {"GeneralCategory"}); - add("DerivedJoiningGroup", new String[] {"JoiningGroup"}); - add("DerivedJoiningType", new String[] {"JoiningType"}); - add("DerivedLineBreak", new String[] {"LineBreak"}); - add("DerivedNumericType", new String[] {"NumericType"}); - add("DerivedNumericValues", new String[] {"NumericValue"}); - add("PropList", new String[] { - "White_Space", "Bidi_Control", "Join_Control", - "Dash", "Hyphen", "Quotation_Mark", - "Terminal_Punctuation", "Other_Math", - "Hex_Digit", "ASCII_Hex_Digit", - "Other_Alphabetic", - "Ideographic", - "Diacritic", "Extender", - "Other_Lowercase", "Other_Uppercase", - "Noncharacter_Code_Point", - "Other_Grapheme_Extend", - "Grapheme_Link", - "IDS_Binary_Operator", 
"IDS_Trinary_Operator", - "Radical", "Unified_Ideograph", - "Other_Default_Ignorable_Code_Point", - "Deprecated", "Soft_Dotted", - "Logical_Order_Exception", - "Other_ID_Start" - }); - add("DerivedCoreProperties", new String[] { - "Math", "Alphabetic", "Lowercase", "Uppercase", - "ID_Start", "ID_Continue", - "XID_Start", "XID_Continue", - "Default_Ignorable_Code_Point", - "Grapheme_Extend", "Grapheme_Base" - }); - add("DerivedNormalizationProps", new String[] { - "FC_NFKC_Closure", - "Full_Composition_Exclusion", - "NFD_QuickCheck", "NFC_QuickCheck", "NFKD_QuickCheck", "NFKC_QuickCheck", - "Expands_On_NFD", "Expands_On_NFC", "Expands_On_NFKD", "Expands_On_NFKC" - }); - } - public static void generateFile(String atOrAfter, String atOrBefore) throws IOException { - Iterator it = contents.keySet().iterator(); - while (it.hasNext()) { - String propname = (String) it.next(); - if (propname.compareToIgnoreCase(atOrAfter) < 0) continue; - if (propname.compareToIgnoreCase(atOrBefore) > 0) continue; - generateFile(propname); - } - } + public static void generateFile() throws IOException { + for (int i = 0; i < Format.theFormat.filesToDo.length; ++i) { + String fileName = + Format.theFormat.filesToDo[i].trim().toLowerCase( + Locale.ENGLISH); + Iterator it = Format.theFormat.getFiles().iterator(); + boolean gotOne = false; + while (it.hasNext()) { + String propname = (String) it.next(); + if (!propname.toLowerCase(Locale.ENGLISH).startsWith(fileName)) continue; + generateFile(propname); + gotOne = true; + } + if (!gotOne) { + throw new IllegalArgumentException( + "Non-matching file name: " + fileName); + } + } + } public static void generateFile(String filename) throws IOException { - if (filename.equals("*")) { - generateFile("", "\uFFFD"); - } else if (filename.endsWith("Aliases")) { + if (filename.endsWith("Aliases")) { if (filename.endsWith("ValueAliases")) generateValueAliasFile(filename); else generateAliasFile(filename); } else if (filename.equals("NormalizationTest")) 
{ @@ -296,7 +505,7 @@ public class MakeUnicodeFiles { static final String SEPARATOR = "# ================================================"; public static void generateAliasFile(String filename) throws IOException { - UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedDataTest/", filename); + UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", filename); PrintWriter pw = udf.out; UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(Default.ucdVersion()); @@ -354,7 +563,7 @@ public class MakeUnicodeFiles { } pw.println(); pw.println(SEPARATOR); - pw.println("#Total: " + count); + pw.println("# Total: " + count); pw.println(); udf.close(); } @@ -386,8 +595,9 @@ public class MakeUnicodeFiles { "gc\t;\tZ\t;\tSeparator\t# Zl | Zp | Zs"}; public static void generateValueAliasFile(String filename) throws IOException { - UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedDataTest/", filename); + UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", filename); PrintWriter pw = udf.out; + Format.theFormat.printFileComments(pw, filename); UnicodeProperty.Factory toolFactory = ToolUnicodePropertySource.make(Default.ucdVersion()); BagFormatter bf = new BagFormatter(toolFactory); @@ -428,7 +638,7 @@ public class MakeUnicodeFiles { for (Iterator it2 = up.getAvailableValues().iterator(); it2.hasNext();) { String value = (String) it2.next(); List l = up.getValueAliases(value); - System.out.println(value + "\t" + bf.join(l)); + if (DEBUG) System.out.println(value + "\t" + bf.join(l)); // HACK Tabber mt = mt2; @@ -442,7 +652,7 @@ public class MakeUnicodeFiles { } else { l.add(0, l.get(0)); // double up } - } else if (l.size() > 2) { + } else if (propName.equals("Canonical_Combining_Class")) { mt = mt3; } if (UnicodeProperty.equalNames(value,"Cyrillic_Supplement")) { @@ -464,6 +674,8 @@ public class MakeUnicodeFiles { } } pw.println(); + pw.println("# " + propName + " (" + shortProp + ")"); + pw.println(); for 
(Iterator it4 = sortedSet.iterator(); it4.hasNext();) { String line = (String) it4.next(); pw.println(line); @@ -473,28 +685,52 @@ public class MakeUnicodeFiles { } public static void generatePropertyFile(String filename) throws IOException { - String[] propList = (String[]) contents.get(filename); - UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedDataTest/", filename); - PrintWriter pw = udf.out; // bf2.openUTF8Writer(UCD_Types.GEN_DIR, "Test" + filename + ".txt"); - UnicodeProperty.Factory toolFactory - = ToolUnicodePropertySource.make(Default.ucdVersion()); - UnicodeSet unassigned = toolFactory.getSet("gc=cn") - .addAll(toolFactory.getSet("gc=cs")); - //System.out.println(unassigned.toPattern(true)); - // .removeAll(toolFactory.getSet("noncharactercodepoint=true")); - - for (int i = 0; i < propList.length; ++i) { + String dir = (String) Format.theFormat.fileToDirectory.get(filename); + if (dir == null) dir = ""; + UnicodeDataFile udf = + UnicodeDataFile.openAndWriteHeader("DerivedData/" + dir, filename); + PrintWriter pw = udf.out; + // bf2.openUTF8Writer(UCD_Types.GEN_DIR, "Test" + filename + ".txt"); + Format.theFormat.printFileComments(pw, filename); + UnicodeProperty.Factory toolFactory = + ToolUnicodePropertySource.make(Default.ucdVersion()); + UnicodeSet unassigned = + toolFactory.getSet("gc=cn").addAll(toolFactory.getSet("gc=cs")); + //System.out.println(unassigned.toPattern(true)); + // .removeAll(toolFactory.getSet("noncharactercodepoint=true")); + + List propList = Format.theFormat.getPropertiesFromFile(filename); + for (Iterator propIt = propList.iterator(); propIt.hasNext();) { BagFormatter bf = new BagFormatter(toolFactory); - UnicodeProperty prop = toolFactory.getProperty(propList[i]); + UnicodeProperty prop = toolFactory.getProperty((String) propIt.next()); String name = prop.getName(); System.out.println("Property: " + name + "; " + prop.getTypeName(prop.getType())); - pw.println("\n" + SEPARATOR + "\n"); - String propComment = 
valueComments.get(name, "*"); - if (propComment != null) { - pw.print(propComment); + pw.println(); + pw.println(SEPARATOR); + String propComment = Format.theFormat.getValueComments(name, ""); + if (propComment != null && propComment.length() != 0) { + pw.println(); + pw.println(propComment); + } else if (!prop.isType(UnicodeProperty.BINARY_MASK)) { + pw.println(); + pw.println("# Property:\t" + name); } - pw.println(); - PrintStyle ps = PrintStyle.get(name); + + Format.PrintStyle ps = Format.theFormat.getPrintStyle(name); + if (DEBUG) System.out.println(ps.toString()); + + if (!prop.isType(UnicodeProperty.BINARY_MASK) && + (ps.skipUnassigned != null || ps.skipValue != null)) { + String v = ps.skipValue; + if (v == null) { + v = ps.skipUnassigned; + } + String v2 = prop.getFirstValueAlias(v); + if (UnicodeProperty.compareNames(v,v2) != 0) v = v + " (" + v2 + ")"; + pw.println(); + pw.println("# All code points not explicitly listed for " + prop.getName()); + pw.println("# have the value " + v + "."); + } if (!ps.interleaveValues && prop.isType(UnicodeProperty.BINARY_MASK)) { if (DEBUG) System.out.println("Resetting Binary Values"); @@ -517,60 +753,73 @@ public class MakeUnicodeFiles { } else { writeEnumeratedValues(pw, bf, unassigned, prop, ps); } - pw.println(); } udf.close(); } + private static void writeEnumeratedValues( PrintWriter pw, BagFormatter bf, UnicodeSet unassigned, UnicodeProperty prop, - PrintStyle ps) { + Format.PrintStyle ps) { if (DEBUG) System.out.println("Writing Enumerated Values: " + prop.getName()); - bf.setValueSource(new UnicodeProperty.FilteredProperty(prop, hackMapFilter)); + bf.setValueSource(new UnicodeProperty.FilteredProperty(prop, Format.theFormat.hackMapFilter)); Collection aliases = prop.getAvailableValues(); if (ps.orderByRangeStart) { - System.out.println("Reordering"); + if (DEBUG) System.out.println("Reordering"); TreeSet temp2 = new TreeSet(new RangeStartComparator(prop)); temp2.addAll(aliases); aliases = temp2; - } + } + if 
(ps.sortNumeric) { + if (DEBUG) System.out.println("Reordering"); + TreeSet temp2 = new TreeSet(NUMERIC_STRING_COMPARATOR); + temp2.addAll(aliases); + aliases = temp2; + } for (Iterator it = aliases.iterator(); it.hasNext();) { String value = (String)it.next(); - UnicodeSet s = prop.getSet(value); if (DEBUG) System.out.println("Getting value " + value); - String valueComment = valueComments.get(prop.getName(), value); + UnicodeSet s = prop.getSet(value); + String valueComment = Format.theFormat.getValueComments(prop.getName(), value); if (DEBUG) { - System.out.println(value + "\t" + prop.getFirstValueAlias(value) + "\tskip:" + ps.skipValue); + System.out.println("Value:\t" + value + "\t" + prop.getFirstValueAlias(value) + "\tskip:" + ps.skipValue); + System.out.println("Value Comment\t" + valueComment); System.out.println(s.toPattern(true)); } int totalSize = s.size(); - if (s.size() == 0) continue; + if (s.size() == 0) { + System.out.println("\tSkipping Empty: " + prop.getName() + "=" + value); + continue; + } if (UnicodeProperty.compareNames(value, ps.skipValue) == 0) { - System.out.println("Skipping: " + value); + if (DEBUG) System.out.println("Skipping: " + value); continue; } if (UnicodeProperty.compareNames(value, ps.skipUnassigned) == 0) { - System.out.println("Removing Unassigneds: " + value); + bf.setFullTotal(s.size()); + if (DEBUG) System.out.println("Removing Unassigneds: " + value); s.removeAll(unassigned); } //if (s.size() == 0) continue; //if (unassigned.containsAll(s)) continue; // skip if all unassigned //if (s.contains(0xD0000)) continue; // skip unassigned - pw.print("\n" + SEPARATOR + "\n\n"); + boolean nonLongValue = false; String displayValue = value; if (ps.valueStyle.equals("none")) { displayValue = null; + nonLongValue = true; } else if (ps.valueStyle.equals("short")) { displayValue = prop.getFirstValueAlias(displayValue); + nonLongValue = true; if (DEBUG) System.out.println("Changing value " + displayValue); } if (ps.makeUppercase && 
displayValue != null) { @@ -585,29 +834,48 @@ public class MakeUnicodeFiles { } if (DEBUG) System.out.println("Setting value " + displayValue); bf.setValueSource(displayValue); - if (valueComment != null) { - pw.println(valueComment); + + if (!prop.isType(UnicodeProperty.BINARY_MASK)) { pw.println(); + pw.println(SEPARATOR); + if (nonLongValue) { + pw.println(); + pw.println("# " + prop.getName() + "=" + value); + } } - if (ps.longValueHeading != null) { + + if (valueComment != null) { + pw.println(); + pw.println(valueComment); + } + if (false && ps.longValueHeading != null) { String headingValue = value; if (ps.longValueHeading == "ccc") { headingValue = Utility.replace(value, "_", ""); char c = headingValue.charAt(0); if ('0' <= c && c <= '9') headingValue = "Other Combining Class"; } + pw.println(); pw.println("# " + headingValue); - pw.println(); - } - if (s.size() != 0) bf.showSetNames(pw, s); - if (s.size() != totalSize) { - pw.println(); - pw.print("# Not Listed: " + totalSize); } pw.println(); + //if (s.size() != 0) + bf.showSetNames(pw, s); } } + //static NumberFormat nf = NumberFormat.getInstance(); + static Comparator NUMERIC_STRING_COMPARATOR = new Comparator() { + public int compare(Object o1, Object o2) { + if (o1 == o2) return 0; + if (o1 == null) return -1; + if (o2 == null) return 1; + return Double.compare( + Double.parseDouble((String) o1), + Double.parseDouble((String) o2)); + } + + }; /* private static void writeBinaryValues( PrintWriter pw, @@ -625,8 +893,11 @@ public class MakeUnicodeFiles { BagFormatter bf, UnicodeProperty prop) { if (DEBUG) System.out.println("Writing Interleaved Values: " + prop.getName()); + pw.println(); bf.setValueSource(new UnicodeProperty.FilteredProperty(prop, new RestoreSpacesFilter())) .setNameSource(null) + .setLabelSource(null) + .setRangeBreakSource(null) .setShowCount(false) .showSetNames(pw,new UnicodeSet(0,0x10FFFF)); } @@ -636,8 +907,11 @@ public class MakeUnicodeFiles { BagFormatter bf, UnicodeProperty 
prop) { if (DEBUG) System.out.println("Writing String Values: " + prop.getName()); - bf.setValueSource(prop).setHexValue(true).setMergeRanges(false); - bf.showSetNames(pw,new UnicodeSet(0,0x10FFFF)); + pw.println(); + bf.setValueSource(prop) + .setHexValue(true) + .setMergeRanges(false) + .showSetNames(pw,new UnicodeSet(0,0x10FFFF)); } static class RangeStartComparator implements Comparator { @@ -660,7 +934,7 @@ public class MakeUnicodeFiles { static class RestoreSpacesFilter extends UnicodeProperty.StringFilter { public String remap(String original) { // ok, because doesn't change length - String mod = (String) hackMap.get(original); + String mod = (String) Format.theFormat.hackMap.get(original); if (mod != null) original = mod; return original.replace('_',' '); } @@ -673,6 +947,155 @@ public class MakeUnicodeFiles { return s.compareToIgnoreCase(t); } }; + + public static void showDiff() throws IOException { + PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "propertyDifference.txt"); + try { + showDifferences(out, "4.0.1", "LB", "GC"); + showDifferences(out, "4.0.1", "East Asian Width", "LB"); + showDifferences(out, "4.0.1", "East Asian Width", "GC"); + } finally { + out.close(); + } + } + + public static void showMatches() throws IOException { + PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "propertyDifference.txt"); + try { + showDifferences(out, "4.0.1", "LB", "GC"); + showDifferences(out, "4.0.1", "East Asian Width", "LB"); + showDifferences(out, "4.0.1", "East Asian Width", "GC"); + } finally { + out.close(); + } + } + + static NumberFormat nf = NumberFormat.getIntegerInstance(Locale.ENGLISH); + + static void showDifferences(PrintWriter out, String version, String prop1, String prop2) throws IOException { + UnicodeProperty p1 = ToolUnicodePropertySource.make(version).getProperty(prop1); + UnicodeProperty p2 = ToolUnicodePropertySource.make(version).getProperty(prop2); + BagFormatter bf = new BagFormatter(); + 
out.println("Comparing " + p1.getName() + " and " + p2.getName()); + System.out.println("Comparing " + p1.getName() + " and " + p2.getName()); + UnicodeSet intersection = new UnicodeSet(); + UnicodeSet disjoint = new UnicodeSet(); + main: + for (Iterator it1 = p1.getAvailableValues().iterator(); it1.hasNext();) { + String v1 = (String)it1.next(); + UnicodeSet s1 = p1.getSet(v1); + v1 += " (" + p1.getFirstValueAlias(v1) + ")"; + System.out.println(v1); + out.println(); + out.println(v1 + " [" + nf.format(s1.size()) + "]"); + + // create some containers so that the output is organized reasonably + String contains = ""; + String overlaps = ""; + UnicodeSet containsSet = new UnicodeSet(); + Set overlapsSet = new TreeSet(); + for (Iterator it2 = p2.getAvailableValues().iterator(); it2.hasNext();) { + String v2 = (String)it2.next(); + UnicodeSet s2 = p2.getSet(v2); + // v2 += "(" + p2.getFirstValueAlias(v2) + ")"; + v2 = p2.getFirstValueAlias(v2); + if (s1.containsNone(s2)) continue; + if (s1.equals(s2)) { + out.println("\t= " + v2); + continue main; // since they are partitions, we can stop here + } else if (s2.containsAll(s1)) { + out.println("\t\u2282 " + v2 + " [" + nf.format(s2.size()) + "]"); + continue main; // partition, stop + } else if (s1.containsAll(s2)) { + if (contains.length() != 0) contains += " \u222a "; + contains += v2 + " [" + nf.format(s2.size()) + "]"; + containsSet.addAll(s2); + if (containsSet.size() == s1.size()) break; + } else { // doesn't contain, isn't contained + if (overlaps.length() != 0) overlaps += "\r\n\t"; + intersection.clear().addAll(s2).retainAll(s1); + disjoint.clear().addAll(s1).removeAll(s2); + overlaps += "\u2283 " + v2 + " [" + nf.format(intersection.size()) + "]" + + " \u2285 " + v2 + " [" + nf.format(disjoint.size()) + "]"; + overlapsSet.add(v2); + } + } + if (contains.length() != 0) { + out.println((containsSet.size() == s1.size() ? 
"\t= " : "\t\u2283 ") + contains); + } + if (overlaps.length() != 0) out.println("\t" + overlaps); + if (false && overlapsSet.size() != 0) { + out.println("\t\u2260\u2284\u2285"); + for (Iterator it3 = overlapsSet.iterator(); it3.hasNext();) { + String v3 = (String) it3.next(); + UnicodeSet s3 = p2.getSet(v3); + out.println("\t" + v3); + bf.showSetDifferences(out,v1,s1,v3,s3); + } + } + } + } + + static class UnicodeDataHack extends UnicodeLabel { + private UnicodeProperty.Factory factory; + private UnicodeProperty name; + private UnicodeProperty bidiMirrored; + private UnicodeProperty numericValue; + private UnicodeProperty numericType; + private UnicodeProperty decompositionValue; + private UnicodeProperty decompositionType; + private UnicodeProperty bidiClass; + private UnicodeProperty combiningClass; + private UnicodeProperty category; + UnicodeDataHack(UnicodeProperty.Factory factory) { + this.factory = factory; + name = factory.getProperty("Name"); + category = factory.getProperty("General_Category"); + combiningClass = factory.getProperty("Canonical_Combining_Class"); + bidiClass = factory.getProperty("Bidi_Class"); + decompositionType = factory.getProperty("Decomposition_Type"); + decompositionValue = factory.getProperty("Decomposition_Value"); + numericType = factory.getProperty("Numeric_Type"); + numericValue = factory.getProperty("Numeric_Value"); + bidiMirrored = factory.getProperty("Bidi_Mirrored"); + //name10 + //isoComment + } + public String getValue(int codepoint, boolean isShort) { + String nameStr = name.getName(); + if (nameStr.startsWith(" 0) { + nameStr = nameStr.substring(0,pos) + "%" + nameStr.substring(pos + code.length()); + } + nameStr += ";" + + category.getValue(codepoint, true) + ";" + + combiningClass.getValue(codepoint, true) + ";" + + bidiClass.getValue(codepoint, true) + ";" + ; + String temp = decompositionType.getValue(codepoint, true); + if (!temp.equals("None")) { + nameStr += "<" + temp + "> " + 
Utility.hex(decompositionValue.getValue(codepoint)); + } + nameStr += ";"; + temp = numericType.getValue(codepoint, true); + if (temp.equals("Decimal")) { + nameStr += temp + ";" + temp + ";" + temp + ";"; + } else if (temp.equals("Digit")) { + nameStr += ";" + temp + ";" + temp + ";"; + } else if (temp.equals("Numeric")) { + nameStr += ";;" + temp + ";"; + } else if (temp.equals("Digit")) { + nameStr += ";;;"; + } + if (bidiMirrored.getValue(codepoint, true).equals("True")) { + nameStr += "Y" + ";"; + } + nameStr += ";"; + return nameStr; + } + } } diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt new file mode 100644 index 00000000000..d3ca1317db0 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt @@ -0,0 +1,355 @@ +Generate: DerivedCoreProperties +DeltaVersion: 11 + +File: Blocks +Property: Block +Format: valueList + +File: CaseFolding +Property: SPECIAL + +File: DerivedAge +Property: Age +Format: nameStyle=none noLabel skipValue=unassigned + +Value: 1.1 +# Assigned as of Unicode 1.1.0 (June, 1993) +# [excluding removed Hangul Syllables] + +Value: 2.0 +# Newly assigned in Unicode 2.0.0 (July, 1996) + +Value: 2.1 +# Newly assigned in Unicode 2.1.2 (May, 1998) + +Value: 3.0 +# Newly assigned in Unicode 3.0.0 (September, 1999) + +Value: 3.1 +# Newly assigned in Unicode 3.1.0 (March, 2001) + +Value: 3.2 +# Newly assigned in Unicode 3.2.0 (March, 2002) + +Value: 4.0 +# Newly assigned in Unicode 4.0.0 (April, 2003) + +File: extracted/DerivedBidiClass +Property: Bidi_Class +# Bidi Class (listing UnicodeData.txt, field 4: see UCD.html) +Format: valueStyle=short skipUnassigned=Left_To_Right + +File: extracted/DerivedBinaryProperties +Property: Bidi_Mirrored +# Bidi_Mirrored (listing UnicodeData.txt, field 9: see UCD.html) + +File: extracted/DerivedCombiningClass +Property: Canonical_Combining_Class +# Combining Class (listing UnicodeData.txt, field 3: see UCD.html) +# 
All code points not explicitly listed in this file have the property +# value: 0. +Format: nameStyle=none valueStyle=short skipUnassigned=Not_Reordered + +File: DerivedCoreProperties +Property: Math +# Derived Property: Math +# Generated from: Sm + Other_Math + +Property: Alphabetic +# Derived Property: Alphabetic +# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic + + +Property: Lowercase +# Derived Property: Lowercase +# Generated from: Ll + Other_Lowercase + + +Property: Uppercase +# Derived Property: Uppercase +# Generated from: Lu + Other_Uppercase + + +Property: ID_Start +# Derived Property: ID_Start +# Characters that can start an identifier. +# Generated from Lu+Ll+Lt+Lm+Lo+Nl+Other_ID_Start + + +Property: ID_Continue +# Derived Property: ID_Continue +# Characters that can continue an identifier. +# Generated from: ID_Start + Mn+Mc+Nd+Pc +# NOTE: Cf characters should be filtered out. + + +Property: XID_Start +# Derived Property: XID_Start +# ID_Start modified for closure under NFKx +# Modified as described in UAX #15 +# NOTE: Does NOT remove the non-NFKx characters. +# Merely ensures that if isIdentifier(string) then isIdentifier(NFKx(string)) + + +Property: XID_Continue +# Derived Property: XID_Continue +# Mod_ID_Continue modified for closure under NFKx +# Modified as described in UAX #15 +# NOTE: Cf characters should be filtered out. +# NOTE: Does NOT remove the non-NFKx characters. +# Merely ensures that if isIdentifier(string) then isIdentifier(NFKx(string)) + + +Property: Default_Ignorable_Code_Point +# Derived Property: Default_Ignorable_Code_Point +# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - White_Space + + +Property: Grapheme_Extend +# Derived Property: Grapheme_Extend +# Generated from: Me + Mn + Other_Grapheme_Extend +# Note: depending on an application's interpretation of Co (private use), +# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither. 
+ + +Property: Grapheme_Base +# Derived Property: Grapheme_Base +# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend +# Note: depending on an application's interpretation of Co (private use), +# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither. + + +File: extracted/DerivedDecompositionType +Property: Decomposition_Type +Format: skipValue=None +# Decomposition_Type (from UnicodeData.txt, field 5: see UCD.html) + +File: extracted/DerivedEastAsianWidth +Property: East_Asian_Width +Format: valueStyle=short skipUnassigned=Neutral +# East_Asian_Width (listing EastAsianWidth.txt, field 1) + +File: extracted/DerivedGeneralCategory +Property: General_Category +Format: valueStyle=short noLabel + +File: extracted/DerivedJoiningGroup +Property: Joining_Group +# Joining Group (listing ArabicShaping.txt, field 3) +Format: skipValue=No_Joining_Group + +File: extracted/DerivedJoiningType +Property: Joining_Type +# Type T is derived, as described in ArabicShaping.txt +Format: valueStyle=short skipValue=Non_Joining + +File: extracted/DerivedLineBreak +Property: Line_Break +Format: valueStyle=short skipUnassigned=Unknown + +File: DerivedNormalizationProps + +Property: FC_NFKC_Closure +# Derived Property: FC_NFKC_Closure +# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b)); +# Then if (c != b) add the mapping from a to c to the set of +# mappings that constitute the FC_NFKC_Closure list +# Uses the full case folding from CaseFolding.txt, without the T option. 
+Format: nameStyle=short + + +Property: Full_Composition_Exclusion +# Derived Property: Full_Composition_Exclusion +# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions + + +Property: NFD_QuickCheck +# Derived Property: NFD_QuickCheck +# Generated from computing decomposibles +Format: nameStyle=short valueStyle=short skipValue=Yes + + +Property: NFC_QuickCheck +# Derived Property: NFC_QuickCheck +# Generated from computing decomposibles (and characters that may compose with previous ones) +Format: nameStyle=short valueStyle=short skipValue=Yes + +Property: NFKD_QuickCheck +# Derived Property: NFKD_QuickCheck +# Generated from computing decomposibles +Format: nameStyle=short valueStyle=short skipValue=Yes + + +Property: NFKC_QuickCheck +# Derived Property: NFKC_QuickCheck +# Generated from computing decomposibles (and characters that may compose with previous ones) +Format: nameStyle=short valueStyle=short skipValue=Yes + +Property: Expands_On_NFD +# Derived Property: Expands_On_NFD +# Generated according to UAX #15. +# Characters whose normalized length is not one. +# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact. +# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters! + + +Property: Expands_On_NFC +# Derived Property: Expands_On_NFC +# Generated according to UAX #15. +# Characters whose normalized length is not one. +# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact. +# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters! + + +Property: Expands_On_NFKD +# Derived Property: Expands_On_NFKD +# Generated according to UAX #15. +# Characters whose normalized length is not one. +# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact. 
+# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters! + + +Property: Expands_On_NFKC +# Derived Property: Expands_On_NFKC +# Generated according to UAX #15. +# Characters whose normalized length is not one. +# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact. +# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters! + + +File: extracted/DerivedNumericType +Property: Numeric_Type +# Numeric Type (from UnicodeData.txt, field 6/7/8 plus Unihan.txt: see UCD.html) +Format: skipValue=None + +File: extracted/DerivedNumericValues +Property: Numeric_Value +# Numeric Values (from UnicodeData.txt, field 6/7/8) +# WARNING: Certain values, such as 0.16666667, are repeating fractions. +# Although they are only printed with a limited number of decimal places +# in this file, they should be expressed to the limits of the precision +# available when used. 
+Format: sortNumeric + +File: HangulSyllableType +Property: Hangul_Syllable_Type +Format: valueStyle=short skipValue=Not_Applicable + +File: NormalizationTest +Property: SPECIAL + +File: PropList + +Property: White_Space + +Property: Bidi_Control + +Property: Join_Control + +Property: Dash + +Property: Hyphen + +Property: Quotation_Mark + +Property: Terminal_Punctuation + +Property: Other_Math + +Property: Hex_Digit + +Property: ASCII_Hex_Digit + +Property: Other_Alphabetic + +Property: Ideographic + +Property: Diacritic + +Property: Extender + +Property: Other_Lowercase + +Property: Other_Uppercase + +Property: Noncharacter_Code_Point + +Property: Other_Grapheme_Extend + +Property: Grapheme_Link + +Property: IDS_Binary_Operator + +Property: IDS_Trinary_Operator + +Property: Radical + +Property: Unified_Ideograph + +Property: Other_Default_Ignorable_Code_Point + +Property: Deprecated + +Property: Soft_Dotted + +Property: Logical_Order_Exception + +Property: Other_ID_Start + +Property: Other_ID_Continue + +Property: STerm + +Property: Variation_Selector + +File: PropertyAliases +Property: SPECIAL + +File: PropertyValueAliases +Property: SPECIAL + +File: Scripts + +Property: Script +Format: nameStyle=none skipUnassigned=Common + +File: SpecialCasing +Property: SPECIAL + +File: StandardizedVariants +Property: SPECIAL + +HackName: noBreak +HackName: Arabic_Presentation_Forms-A +HackName: Arabic_Presentation_Forms-B +HackName: CJK_Symbols_and_Punctuation +HackName: Combining_Diacritical_Marks_for_Symbols +HackName: Enclosed_CJK_Letters_and_Months +HackName: Greek_and_Coptic +HackName: Halfwidth_and_Fullwidth_Forms +HackName: Latin-1_Supplement +HackName: Latin_Extended-A +HackName: Latin_Extended-B +HackName: Miscellaneous_Mathematical_Symbols-A +HackName: Miscellaneous_Mathematical_Symbols-B +HackName: Miscellaneous_Symbols_and_Arrows +HackName: Superscripts_and_Subscripts +HackName: Supplemental_Arrows-A +HackName: Supplemental_Arrows-B +HackName: 
Supplementary_Private_Use_Area-A +HackName: Supplementary_Private_Use_Area-B +HackName: Canadian-Aboriginal +HackName: Old-Italic + +FinalComments +Note that PropertyAliases sorts by the long name, while PropertyValueAliases +sorts by the short name +ArabicShaping +BidiMirroring +CompositionExclusions +EastAsianWidth +LineBreak +StandardizedVariants +UnicodeData + + diff --git a/tools/unicodetools/com/ibm/text/UCD/MyFloatLister.java b/tools/unicodetools/com/ibm/text/UCD/MyFloatLister.java index 73956614a00..d1e47eefd4b 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MyFloatLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/MyFloatLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyFloatLister.java,v $ -* $Date: 2003/03/12 16:01:26 $ -* $Revision: 1.5 $ +* $Date: 2004/03/11 19:03:17 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -40,7 +40,7 @@ class MyFloatLister extends PropertyLister { public byte status(int cp) { //if ((cp & 0xFFF) == 0) System.out.println("# " + Utility.hex(cp)); if (false && !ucdData.isRepresented(cp)) { - if (ucdData.mapToRepresentative(cp, false) != cp) return PropertyLister.CONTINUE; + if (ucdData.mapToRepresentative(cp, ucdData.getCompositeVersion()) != cp) return PropertyLister.CONTINUE; return PropertyLister.CONTINUE; } if (ucdData.getCategory(cp) == Cn) return PropertyLister.CONTINUE; diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt new file mode 100644 index 00000000000..f502de853f6 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt @@ -0,0 +1,40 @@ +# This file contains aliases for properties used in the UCD. 
+# These names can be used for XML formats of UCD data, for regular-expression +# property tests, and other programmatic textual descriptions of Unicode data. +# For information on which properties are normative, see UCD.html. +# +# The names may be translated in appropriate environments, and additional +# aliases may be useful. +# +# FORMAT +# +# Each line has two or more fields, separated by semicolons. +# +# First Field: The first field is an abbreviated name for the property. +# +# Second Field: The second field is a long name +# +# The above are the preferred aliases. Other aliases may be listed in additional fields. +# +# Loose matching should be applied to all property names and property values, with +# the exception of String Property values. With loose matching of property names and +# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property +# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1". +# +# NOTE: Property value names are NOT unique across properties. For example: +# +# AL means Arabic Letter for the Bidi_Class property, and +# AL means Alpha_Left for the Combining_Class property, and +# AL means Alphabetic for the Line_Break property. +# +# In addition, some property names may be the same as some property value names. +# For example: +# +# sc means the Script property, and +# Sc means the General_Category property value Currency_Symbol (Sc) +# +# The combination of property value and property name is, however, unique. 
+# +# For more information, see UTS #18: Regular Expression Guidelines +# ================================================ + diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt new file mode 100644 index 00000000000..0e9d5bec886 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt @@ -0,0 +1,48 @@ +# This file contains aliases for property values used in the UCD. +# These names can be used for XML formats of UCD data, for regular-expression +# property tests, and other programmatic textual descriptions of Unicode data. +# For information on which properties are normative, see UCD.html. +# +# The names may be translated in appropriate environments, and additional +# aliases may be useful. +# +# FORMAT +# +# Each line describes a property value name. +# This consists of three or more fields, separated by semicolons. +# +# First Field: The first field describes the property for which that +# property value name is used. +# +# Second Field: The second field is an abbreviated name. +# If there is no abbreviated name available, the field is marked with "n/a". +# +# Third Field: The third field is a long name. +# +# In the case of ccc, there are 4 fields. The second field is numeric, third +# is abbreviated, and fourth is long. +# +# The above are the preferred aliases. Other aliases may be listed in additional fields. +# +# Loose matching should be applied to all property names and property values, with +# the exception of String Property values. With loose matching of property names and +# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property +# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1". +# +# NOTE: Property value names are NOT unique across properties. 
For example: +# +# AL means Arabic Letter for the Bidi_Class property, and +# AL means Above_Left for the Combining_Class property, and +# AL means Alphabetic for the Line_Break property. +# +# In addition, some property names may be the same as some property value names. +# For example: +# +# sc means the Script property, and +# Sc means the General_Category property value Currency_Symbol (Sc) +# +# The combination of property value and property name is, however, unique. +# +# For more information, see UTS #18: Regular Expression Guidelines +# ================================================ + diff --git a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java index 4086ddd216f..b3a024bf43c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java +++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java @@ -1,5 +1,6 @@ package com.ibm.text.UCD; +import java.text.NumberFormat; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -53,7 +54,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { add(new UnicodeProperty.SimpleProperty() { public String _getValue(int codepoint) { - if (codepoint == 0x1D100) { + if (DEBUG && codepoint == 0x1D100) { System.out.println("here"); } //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; @@ -82,10 +83,17 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { .setValues("")); add(new UnicodeProperty.SimpleProperty() { + NumberFormat nf = NumberFormat.getInstance(); + { + nf.setGroupingUsed(false); + nf.setMaximumFractionDigits(8); + nf.setMinimumFractionDigits(1); + } public String _getValue(int codepoint) { + double num = ucd.getNumericValue(codepoint); if (Double.isNaN(num)) return null; - return Double.toString(num); + return nf.format(num); } }.setMain("Numeric_Value", "nv", UnicodeProperty.NUMERIC, version)); @@
-100,8 +108,9 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { public int getMaxWidth(boolean isShort) { return 14; } - }.setMain("FC_NFKC_Closure", "FNC", UnicodeProperty.STRING, version) - .addName("FC_NFKC")); + }.setMain("FC_NFKC_Closure", "FC_NFKC", UnicodeProperty.STRING, version) + //.addName("FNC") + ); add(new UnicodeProperty.SimpleProperty() { public String _getValue(int codepoint) { @@ -319,7 +328,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { case UCD_Types.COMBINING_CLASS>>8: temp = (ucd.getCombiningClassID_fromIndex((short)i, style)); break; case UCD_Types.BIDI_CLASS>>8: temp = (ucd.getBidiClassID_fromIndex((byte)i, style)); break; case UCD_Types.DECOMPOSITION_TYPE>>8: temp = (ucd.getDecompositionTypeID_fromIndex((byte)i, style)); - check = temp != null; + //check = temp != null; break; case UCD_Types.NUMERIC_TYPE>>8: temp = (ucd.getNumericTypeID_fromIndex((byte)i, style)); titlecase = true; @@ -389,7 +398,10 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { case UCD_Types.EAST_ASIAN_WIDTH>>8: return lookup(valueAlias, UCD_Names.LONG_EAST_ASIAN_WIDTH, UCD_Names.EAST_ASIAN_WIDTH, result); case UCD_Types.LINE_BREAK>>8: - return lookup(valueAlias, UCD_Names.LONG_LINE_BREAK, UCD_Names.LINE_BREAK, result); + lookup(valueAlias, UCD_Names.LONG_LINE_BREAK, UCD_Names.LINE_BREAK, result); + if (valueAlias.equals("Inseparable")) addUnique("Inseperable", result); + // Inseparable; Inseperable + return result; case UCD_Types.JOINING_TYPE>>8: return lookup(valueAlias, UCD_Names.LONG_JOINING_TYPE, UCD_Names.JOINING_TYPE, result); case UCD_Types.JOINING_GROUP>>8: @@ -445,10 +457,13 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { if (isType(BINARY_MASK)) { return up.hasValue(codepoint) ? 
"True" : "False"; } - return ""; + throw new IllegalArgumentException("Failed to find value for " + Utility.hex(codepoint)); } public String getAge(int codePoint) { + if (codePoint == 0xF0000) { + System.out.println("debug point"); + } if (needAgeCache) { for (int i = UCD_Types.AGE11; i < UCD_Types.LIMIT_AGE; ++i) { ucdCache[i] = UCD.make(UCD_Names.AGE_VERSIONS[i]); diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 390392e7e3a..09a07de3c07 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2004/02/18 03:09:01 $ -* $Revision: 1.32 $ +* $Date: 2004/03/11 19:03:16 $ +* $Revision: 1.33 $ * ******************************************************************************* */ @@ -86,7 +86,6 @@ public final class UCD implements UCD_Types { */ public boolean isAllocated(int codePoint) { if (getCategory(codePoint) != Cn) return true; - if (compositeVersion >= 0x20000 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true; if (isNoncharacter(codePoint)) return true; return false; } @@ -94,11 +93,9 @@ public final class UCD implements UCD_Types { public boolean isNoncharacter(int codePoint) { if ((codePoint & 0xFFFE) == 0xFFFE) { if (compositeVersion < 0x20000 && codePoint > 0xFFFF) return false; - // major < 2 return true; } if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && compositeVersion >= 0x30100) return true; - // major >= 3 && minor >= 1 return false; } @@ -113,8 +110,9 @@ public final class UCD implements UCD_Types { * Is the code point a PUA character (fast check) */ public boolean isPUA(int codePoint) { - return (codePoint >= 0xE000 && codePoint < 0xF900 - || codePoint >= 0xF0000 && codePoint < 0xFFFFE + if (codePoint >= 0xE000 && codePoint < 0xF900) return true; + if 
(compositeVersion < 0x20000) return false; + return (codePoint >= 0xF0000 && codePoint < 0xFFFFE || codePoint >= 0x100000 && codePoint < 0x10FFFE); } @@ -353,7 +351,7 @@ public final class UCD implements UCD_Types { return combiningClassSet.get(0xFF & value); } - static UnicodeSet BIDI_R_SET, BIDI_AL_SET; + static UnicodeSet BIDI_R_SET, BIDI_AL_SET, BIDI_BN_SET; /** * Get the bidi class @@ -424,10 +422,17 @@ public final class UCD implements UCD_Types { BIDI_R_SET.removeAll(noncharacters); BIDI_AL_SET.removeAll(noncharacters); - + BIDI_BN_SET = new UnicodeSet(); + if (compositeVersion >= 0x40001) { + BIDI_BN_SET.addAll(noncharacters); + UnicodeSet DefaultIg = DerivedProperty.make(DefaultIgnorable, this).getSet(); + System.out.println("DefaultIg: " + DefaultIg); + BIDI_BN_SET.addAll(DefaultIg); + } System.out.println("BIDI_R_SET: " + BIDI_R_SET); System.out.println("BIDI_AL_SET: " + BIDI_AL_SET); + System.out.println("BIDI_BN_SET: " + BIDI_BN_SET); if (BIDI_R_SET.containsSome(BIDI_AL_SET)) { throw new ChainException("BIDI values for Cf characters overlap!!", null); @@ -435,6 +440,9 @@ public final class UCD implements UCD_Types { } + if (BIDI_BN_SET.contains(codePoint)) { + return BIDI_BN; + } if (BIDI_R_SET.contains(codePoint)) { return BIDI_R; } @@ -1012,7 +1020,7 @@ public final class UCD implements UCD_Types { } public static String getScriptID_fromIndex(byte prop, byte length) { - return prop < 0 || prop >= UCD_Names.JOINING_GROUP.length ? null + return prop < 0 || prop >= UCD_Names.SCRIPT.length ? null : (length == SHORT) ? UCD_Names.SCRIPT[prop] : UCD_Names.LONG_SCRIPT[prop]; } @@ -1043,7 +1051,7 @@ public final class UCD implements UCD_Types { : style == SHORT ? 
UCD_Names.SHORT_BP[bit] : UCD_Names.BP[bit]; } - public static int mapToRepresentative(int ch, boolean lessThan20105) { + public static int mapToRepresentative(int ch, int rCompositeVersion) { if (ch <= 0xFFFD) { //if (ch <= 0x2800) return ch; //if (ch <= 0x28FF) return 0x2800; // braille @@ -1061,7 +1069,7 @@ public final class UCD implements UCD_Types { if (ch <= 0xDFFF) return 0xDC00; if (ch <= 0xE000) return ch; // Private Use if (ch <= 0xF8FF) return 0xE000; - if (lessThan20105) { + if (rCompositeVersion < 0x20105) { if (ch <= 0xF900) return ch; // CJK Compatibility Ideograp if (ch <= 0xFA2D) return 0xF900; } @@ -1069,14 +1077,20 @@ public final class UCD implements UCD_Types { if (ch <= 0xFDEF) return 0xFFFF; } else { if ((ch & 0xFFFE) == 0xFFFE) return 0xFFFF; // Noncharacter + if (ch <= 0x20000) return ch; // Extension B if (ch <= 0x2A6D6) return 0x20000; //if (ch <= 0x2F800) return ch; //if (ch <= 0x2FA1D) return 0x2F800; // compat ideographs - if (ch <= 0xF0000) return ch; // Plane 15 Private Use + if (ch < 0xF0000) return ch; // Plane 15 Private Use + if (rCompositeVersion >= 0x20000) { + return 0xE000; + } + /* if (ch <= 0xFFFFD) return 0xF0000; // Plane 16 Private Use if (ch <= 0x100000) return ch; // Plane 15 Private Use if (ch <= 0x10FFFD) return 0x100000; // Plane 16 Private Use + */ } return ch; } @@ -1106,6 +1120,7 @@ public final class UCD implements UCD_Types { byte cat = getCategory(cp); if (cat == Mn || cat == Mc || cat == Nd || cat == Pc) return true; if (getBinaryProperty(cp, Other_ID_Start)) return true; + if (getBinaryProperty(cp, Other_ID_Continue)) return true; return false; } @@ -1189,7 +1204,7 @@ to guarantee identifier closure. 
if (codePoint >= 0x2800 && codePoint <= 0x28FF) return true; if (codePoint >= 0x2F800 && codePoint <= 0x2FA1D) return true; - int rangeStart = mapToRepresentative(codePoint, compositeVersion < 0x020105); + int rangeStart = mapToRepresentative(codePoint, compositeVersion); switch (rangeStart) { default: return getRaw(codePoint) == null; @@ -1247,7 +1262,7 @@ to guarantee identifier closure. // do range stuff String constructedName = null; - int rangeStart = mapToRepresentative(codePoint, compositeVersion < 0x020105); + int rangeStart = mapToRepresentative(codePoint, compositeVersion); boolean isHangul = false; boolean isRemapped = false; switch (rangeStart) { @@ -1297,7 +1312,7 @@ to guarantee identifier closure. case 0xE000: // Private Use case 0xF0000: // Private Use case 0x100000: // Private Use - if (fixStrings) constructedName = ""; + if (fixStrings) constructedName = ""; isRemapped = true; break; case 0xD800: // Surrogate diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index 3722b19c704..cba0feb14d4 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2004/02/18 03:09:01 $ -* $Revision: 1.26 $ +* $Date: 2004/03/11 19:03:16 $ +* $Revision: 1.27 $ * ******************************************************************************* */ @@ -152,7 +152,8 @@ final class UCD_Names implements UCD_Types { "Logical_Order_Exception", "Other_ID_Start", "STerm", - "Variation_Selector" + "Variation_Selector", + "Other_ID_Continue", }; static final String[] SHORT_BP = { @@ -189,7 +190,8 @@ final class UCD_Names implements UCD_Types { "LOE", "OIDS", "STerm", - "VS" + "VS", + "OIDC" }; /* @@ -262,7 +264,7 @@ final class UCD_Names implements UCD_Types { "Unknown", 
"OpenPunctuation", "ClosePunctuation", "Quotation", "Glue", "Nonstarter", "Exclamation", "BreakSymbols", "InfixNumeric", "PrefixNumeric", "PostfixNumeric", - "Numeric", "Alphabetic", "Ideographic", "Inseperable", "Hyphen", + "Numeric", "Alphabetic", "Ideographic", "Inseparable", "Hyphen", "CombiningMark", "BreakBefore", "BreakAfter", "Space", "MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak", "ComplexContext", "Ambiguous", "BreakBoth", "Surrogate", "ZWSpace", @@ -327,7 +329,8 @@ final class UCD_Names implements UCD_Types { "SHAVIAN", "OSMANYA", "CYPRIOT", - "BRAILLE", + "BRAILLE", + "KATAKANA_OR_HIRAGANA", }; @@ -395,6 +398,7 @@ final class UCD_Names implements UCD_Types { "Osma", "Cprt", "Brai", + "Hrkt", }; @@ -643,11 +647,13 @@ final class UCD_Names implements UCD_Types { case 9: s = style < LONG ? "VR" : "Virama"; break; case 200: s = style < LONG ? "ATBL" : "AttachedBelowLeft"; break; case 202: s = style < LONG ? "ATB" : "AttachedBelow"; break; + /* case 204: s = style < LONG ? "ATBR" : "AttachedBelowRight"; break; case 208: s = style < LONG ? "ATL" : "AttachedLeft"; break; case 210: s = style < LONG ? "ATR" : "AttachedRight"; break; case 212: s = style < LONG ? "ATAL" : "AttachedAboveLeft"; break; - case 214: s = style < LONG ? "ATA" : "AttachedAbove"; break; + case 214: s = style < LONG ? "ATA" : "AttachedAbove"; break; + */ case 216: s = style < LONG ? "ATAR" : "AttachedAboveRight"; break; case 218: s = style < LONG ? "BL" : "BelowLeft"; break; case 220: s = style < LONG ? 
"B" : "Below"; break; diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 5a96efd992d..134eff7f7b9 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2004/02/18 03:09:01 $ -* $Revision: 1.27 $ +* $Date: 2004/03/11 19:03:16 $ +* $Revision: 1.28 $ * ******************************************************************************* */ @@ -15,9 +15,7 @@ package com.ibm.text.UCD; public interface UCD_Types { - public static final int dVersion = 6; // change to fix the generated file D version. If less than zero, no "d" - static final byte BINARY_FORMAT = 14; // bumped if binary format of UCD changes. Forces rebuild - + static final byte BINARY_FORMAT = 15; // bumped if binary format of UCD changes. 
Forces rebuild public static final String BASE_DIR = "C:\\DATA\\"; public static final String UCD_DIR = BASE_DIR + "UCD\\"; @@ -213,9 +211,10 @@ public interface UCD_Types { Soft_Dotted = 29, Logical_Order_Exception = 30, Other_ID_Start = 31, - Sentence_Terminal = 32, - Variation_Selector = 33, - LIMIT_BINARY_PROPERTIES = 34; + Sentence_Terminal = 32, + Variation_Selector = 33, + Other_ID_Continue = 34, + LIMIT_BINARY_PROPERTIES = 35; /* static final int @@ -383,7 +382,8 @@ public interface UCD_Types { OSMANYA = 51, CYPRIOT = 52, BRAILLE = 53, - LIMIT_SCRIPT = 54; + KATAKANA_OR_HIRAGANA = 54, + LIMIT_SCRIPT = 55; static final int UNKNOWN = 0, diff --git a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java index 38a56bac7fd..b0a1494ef9a 100644 --- a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java +++ b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java @@ -7,6 +7,7 @@ import java.io.PrintWriter; import com.ibm.text.UCD.Default; import com.ibm.text.UCD.GenerateData; +import com.ibm.text.UCD.MakeUnicodeFiles; import com.ibm.text.UCD.UCD_Types; public class UnicodeDataFile { @@ -26,16 +27,23 @@ public class UnicodeDataFile { result.out.println("# " + filename + UnicodeDataFile.getFileSuffix(false)); result.out.println(generateDateLine()); - result.out.println("#"); + result.out.println("#"); + result.out.println("# Unicode Character Database"); + result.out.println("# Copyright (c) 1991-2004 Unicode, Inc."); + result.out.println( + "# For terms of use, see http://www.unicode.org/terms_of_use.html"); + result.out.println("# For documentation, see UCD.html"); try { Utility.appendFile(filename + "Header.txt", Utility.LATIN1, result.out); } catch (FileNotFoundException e) { + /* result.out.println("# Unicode Character Database: Derived Property Data"); result.out.println("# Generated algorithmically from the Unicode Character Database"); result.out.println("# For documentation, 
see UCD.html"); result.out.println("# Note: Unassigned and Noncharacter codepoints may be omitted"); result.out.println("# if they have default property values."); result.out.println("# ================================================"); + */ } return result; @@ -51,14 +59,20 @@ public class UnicodeDataFile { } public static String getHTMLFileSuffix(boolean withDVersion) { - return "-" + Default.ucd().getVersion() - + ((withDVersion && UCD_Types.dVersion >= 0) ? ("d" + UCD_Types.dVersion) : "") + return "-" + + Default.ucd().getVersion() + + ((withDVersion && MakeUnicodeFiles.dVersion >= 0) + ? ("d" + MakeUnicodeFiles.dVersion) + : "") + ".html"; } public static String getFileSuffix(boolean withDVersion) { - return "-" + Default.ucd().getVersion() - + ((withDVersion && UCD_Types.dVersion >= 0) ? ("d" + UCD_Types.dVersion) : "") + return "-" + + Default.ucd().getVersion() + + ((withDVersion && MakeUnicodeFiles.dVersion >= 0) + ? ("d" + MakeUnicodeFiles.dVersion) + : "") + ".txt"; } diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index 4419dffcb53..789f5af93b4 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2004/02/18 03:09:02 $ -* $Revision: 1.40 $ +* $Date: 2004/03/11 19:03:16 $ +* $Revision: 1.41 $ * ******************************************************************************* */ @@ -725,8 +725,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES public static PrintWriter openPrintWriter(String directory, String filename, Encoding options) throws IOException { File file = new File(directory + filename); Utility.fixDot(); - System.out.print("Creating File: " + file); - System.out.println("\t" + 
file.getCanonicalPath()); + System.out.println("Creating File: " + file.getCanonicalPath()); File parent = new File(file.getParent()); //System.out.println("Creating File: "+ parent); parent.mkdirs();