diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java index 1637d1180e7..ceecc154718 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCharts.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $ -* $Date: 2002/10/03 22:58:17 $ -* $Revision: 1.13 $ +* $Date: 2002/10/05 01:28:56 $ +* $Revision: 1.14 $ * ******************************************************************************* */ @@ -921,14 +921,35 @@ public class WriteCharts implements UCD_Types { + "
" + Utility.hex(comp) + ""; } + public static void writeAllocation() throws IOException { Default.setUCD(); + String[] names = new String[300]; // HACK, 300 is plenty for now. Fix if it ever gets larger + int[] starts = new int[names.length]; + int[] ends = new int[names.length]; + + UCD.BlockData blockData = new UCD.BlockData(); + int counter = 0; - UnicodeSet[] values = new UnicodeSet[500]; - String[] names = new String[values.length]; - int[] starts = new int[values.length]; - int[] ends = new int[values.length]; + int blockId = 0; + while (Default.ucd.getBlockData(blockId++, blockData)) { + names[counter] = blockData.name; + starts[counter] = blockData.start; + ends[counter] = blockData.end; + //System.out.println(names[counter] + ", " + values[counter]); + ++counter; + + // HACK + if (blockData.name.equals("Tags")) { + names[counter] = "reserved default ignorable"; + starts[counter] = 0xE0080; + ends[counter] = 0xE0FFF; + ++counter; + } + } + + /* BufferedReader in = Utility.openUnicodeFile("Blocks", "", true, false); try { while (true) { @@ -947,42 +968,79 @@ public class WriteCharts implements UCD_Types { ends[counter] = end; //System.out.println(names[counter] + ", " + values[counter]); ++counter; + + // HACK + if (name.equals("Tags")) { + names[counter] = "reserved default ignorable"; + values[counter] = new UnicodeSet(0xE0080, 0xE0FFF); + starts[counter] = 0xE0080; + ends[counter] = 0xE0FFF; + ++counter; + } } } finally { in.close(); } + */ - PrintWriter out = Utility.openPrintWriter("Allocation.html", Utility.LATIN1_WINDOWS); + + /* + Graphic + Format + Control + Private Use + Surrogate + Noncharacter + Reserved (default ignorable) + Reserved (other) + */ + + PrintWriter out = Utility.openPrintWriter("allocation.html", Utility.LATIN1_WINDOWS); try { out.println(""); out.println("Unicode Allocation"); out.println("

Unicode Allocation

"); - out.println(""); - out.println(""); - UnicodeSetIterator it = new UnicodeSetIterator(); - int lastEnd = -1; - for (int i = 0; i < counter; ++i) { - if (starts[i] != lastEnd + 1) { - drawAllocation(out, lastEnd + 1, "reserved", starts[i] - lastEnd + 1, 0); + for (int textOnly = 0; textOnly < 2; ++textOnly) { + out.println("
StartBlock NameSize
"); // width='100%' + if (textOnly == 0) { + out.println(""); + } else { + out.println(""); } - int total = values[i].size(); - int alloc = 0; - it.reset(values[i]); - while (it.nextRange()) { - for (int j = it.codepoint; j <= it.codepointEnd; ++j) { + int lastEnd = -1; + for (int i = 0; i < counter; ++i) { + if (starts[i] != lastEnd + 1) { + drawAllocation(out, lastEnd + 1, "reserved", starts[i] - lastEnd + 1, 0, "#000000", "#000000", textOnly); + } + int total = ends[i] - starts[i] + 1; + int alloc = 0; + for (int j = starts[i]; j <= ends[i]; ++j) { if (Default.ucd.isAllocated(j)) ++alloc; } + //System.out.println(names[i] + "\t" + alloc + "\t" + total); + String color = names[i].indexOf("Surrogates") >= 0 ? "#FF0000" + : names[i].indexOf("Private") >= 0 ? "#0000FF" + : "#00FF00"; + String colorReserved = names[i].indexOf("reserved default ignorable") >= 0 ? "#CCCCCC" + : "#000000"; + drawAllocation(out, starts[i], names[i], total, alloc, color, colorReserved, textOnly); + lastEnd = ends[i]; } - System.out.println(names[i] + "\t" + alloc + "\t" + total); - drawAllocation(out, starts[i], names[i], total, alloc); - lastEnd = ends[i]; + out.println("
StartBlock NameSize
Block NameStartTotalAssigned

 

"); } - out.println(""); - out.println("

This chart lists all the Unicode blocks and their starting code points. " - + "The area of each bar is proportional to the total number of code points in each block, " - + "with green for the proportion of assigned code points. " + out.println("

Key

This chart lists all the Unicode blocks and their starting code points. " + + "The area of each bar is proportional to the total number of code points in each block. " + + "The colors have the following significance:
" + + "" + + "" + + "" + + "" + + "" + + "" + + "
GreenGraphic, Control, Format, Noncharacter* code points
RedSurrogate code points
BluePrivate Use code points
GrayReserved (default ignorable) code points
BlackReserved (other) code points

" + + "* Control, Format, and Noncharacter are not distinguished from Graphic characters by color, since they are mixed into other blocks. " + "Tooltips on the bars show the total number of code points and the number assigned. " + "(Remember that assigned code points are not necessarily assigned characters.)" + "

"); @@ -997,23 +1055,27 @@ public class WriteCharts implements UCD_Types { static NumberFormat nf = NumberFormat.getNumberInstance(Locale.US); static {nf.setMaximumFractionDigits(0);} - static void drawAllocation(PrintWriter out, int start, String title, int total, int alloc) { - int unalloc = total - alloc; - - double totalWidth = longestBar*(Math.sqrt(total) / Math.sqrt(longestBlock)); - double allocWidth = alloc * totalWidth / total; - double unallocWidth = totalWidth - allocWidth; - - out.println("" + Utility.hex(start) - + "" + title - + ""); - - if (alloc != 0) out.println(""); - if (unalloc != 0) out.println(""); - out.println("
  
"); + static void drawAllocation(PrintWriter out, int start, String title, int total, int alloc, String color, String colorReserved, int textOnly) { + if (textOnly == 0) { + int unalloc = total - alloc; + + double totalWidth = longestBar*(Math.sqrt(total) / Math.sqrt(longestBlock)); + double allocWidth = alloc * totalWidth / total; + double unallocWidth = totalWidth - allocWidth; + + out.println("" + Utility.hex(start) + + "" + title + + ""); + + if (alloc != 0) out.println(""); + if (unalloc != 0) out.println(""); + out.println("
  
"); + } else { + out.println("" + title + "" + start + "" + total + "" + alloc + ""); + } } } diff --git a/tools/unicodetools/com/ibm/text/UCD/CaseTestHeader.txt b/tools/unicodetools/com/ibm/text/UCD/CaseTestHeader.txt new file mode 100644 index 00000000000..1a80bf523aa --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/CaseTestHeader.txt @@ -0,0 +1,47 @@ +# +# This file is used to test (1) case conversion, (2) case detection, +# and (3) case-insensitive matching. +# (1) is represented below by function names such as toLower(), +# (2) is represented below by function names such as isLower(). +# (3) is represented below by the function name equalsCaseInsensitive(). +# (The actual function names will vary depending on software language and/or library.) +# +# The test cases also check whether canonical equivalence is preserved +# by these functions. +# +# Format: +# ; ; ; ; <fold> (# <comment>)? +# +# Test: +# +# A. For each line: +# 1. Verify the following equalities: +# lower == toLower(src) +# upper == toUpper(src) +# title == toTitle(src) +# fold == toFold(src) +# 2. Verify that all of the following are true: +# isLower(toLower(lower)) +# isUpper(toUpper(upper)) +# isTitle(toTitle(title)) +# isFold(toTitle(fold)) +# 3. Verify that all of the following are true: +# equalsCaseInsensitive(src, lower) +# equalsCaseInsensitive(src, upper) +# equalsCaseInsensitive(src, title) +# equalsCaseInsensitive(src, fold) +# +# B. For each code point that is NOT listed as a src: +# 1. Verify the following equalities: +# src == toLower(src) == toUpper(src) == toTitle(src) == toFold(src) +# 2. Verify that all of the following are true: +# isLower(toLower(lower)) +# isUpper(toUpper(upper)) +# isTitle(toTitle(title)) +# isFold(toTitle(fold)) +# 3. 
Verify that all of the following are true: +# equalsCaseInsensitive(src, lower) +# equalsCaseInsensitive(src, upper) +# equalsCaseInsensitive(src, title) +# equalsCaseInsensitive(src, fold) +# diff --git a/tools/unicodetools/com/ibm/text/UCD/Charts.java b/tools/unicodetools/com/ibm/text/UCD/Charts.java new file mode 100644 index 00000000000..991351f79e5 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/Charts.java @@ -0,0 +1,25 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Charts.java,v $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; +import com.ibm.icu.text.UnicodeSet; +import java.io.*; + +import java.util.*; +import com.ibm.icu.text.UTF16; + +import com.ibm.text.utility.*; + + +public class Charts { +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/CodePointProperty.java b/tools/unicodetools/com/ibm/text/UCD/CodePointProperty.java new file mode 100644 index 00000000000..2e69616a807 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/CodePointProperty.java @@ -0,0 +1,106 @@ +package com.ibm.text.UCD; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.text.utility.*; +import java.util.*; + +// Enumerated properties will be IntCodePointProperty. +// The string values they return will be the property value names. +// Binary properties are Enumerated properties. 
They return 0 or 1 + +abstract public class CodePointProperty { + // styles for names and string values + static final byte SHORT = 0, DEFAULT = 1, LONG = 2, NORMAL_LIMIT = 3; + + // gets the property name + abstract public String getName(byte style); + + // value may also be numeric, etc, but this returns string equivalent. + abstract public String getValue(int codePoint, byte style); + + // returns true if the code point has the value + // works with any style that getValue takes + abstract public boolean hasValue(int codePoint, String value); + + // returns the set of all code points with that value. + // same effect as using hasValue one by one, but faster internal implementation + abstract public UnicodeSet getSet(String value); + + // returns a list of all possible values + // logically the same as looping from 0..10FFFF with getValue and getStyleLimit, + // and throwing out duplicates, but much faster. + static Iterator getAllValues(byte style) { + return null; + } + + // gets top value style available for this property + public byte getStyleLimit(byte style) { + return NORMAL_LIMIT; + } + + // returns true if the value is known to be uniform over a type. + // this is used for various optimizations, especially for Cn & Co + public boolean isUniformOverCategory(byte generalCategory) { + return false; + } + + // subclasses + + static abstract public class IntCodePointProperty extends CodePointProperty { + abstract int getNumericValue(int codePoint); + abstract int getMaxValue(); + abstract int getMinValue(); + static Iterator getAllNumericValues() { + return null; + } + } + + static abstract public class DoubleCodePointProperty extends CodePointProperty { + abstract double getNumericValue(int codePoint); + abstract double getMaxValue(); + abstract double getMinValue(); + static Iterator getAllNumericValues() { + return null; + } + } + + // registration and lookup + + // register a new property + static void register(CodePointProperty newProp) { + //... 
+ } + + // finds a registered property by name + static CodePointProperty getInstance(String name) { + return null; + } + + // returns a list of all registered properties + static Iterator getAllRegistered() { + return null; + } + + // UnicodeSet would use these internally to handle properties. That is, when + // it encountered ... [:name=value:] ... + // it would do: + // CodePointProperty x = getInstance(name); + // if (x == null) doError(name, value); + // UnicodeSet s = x.getSet(value); + // and then use s. + + // open issue: we could have a property like: contains("dot") + // in that case, we would register "contains" as the 'base' name, + // but allow lookup with string parameters ("dot") + // Maybe just adding: + + public boolean hasParameters() { + return false; + } + public void setParameters(String parameters) {} + public String getParameters() { + return null; + } + + // that way we could have [[:letter:]&[:contains(dot):]] + +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java index f4c00f7277f..1f6e3657148 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/ConvertUCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $ -* $Date: 2002/06/13 21:14:05 $ -* $Revision: 1.8 $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.9 $ * ******************************************************************************* */ @@ -331,7 +331,7 @@ public final class ConvertUCD implements UCD_Types { static void readBlocks() throws Exception { System.out.println("Reading 'Blocks'"); - BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, false); + BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, Utility.LATIN1); String line = ""; try { String[] parts = new 
String[20]; @@ -376,7 +376,7 @@ public final class ConvertUCD implements UCD_Types { } String tempVersion = version; if (version.equals(UCD.latestVersion)) tempVersion = ""; - BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true, false); + BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true, Utility.LATIN1); if (input == null) { System.out.println("COULDN'T OPEN: " + labels[0]); return; @@ -834,7 +834,7 @@ public final class ConvertUCD implements UCD_Types { uData.numericType = Utility.lookup(fieldValue, UCD_Names.NT, true); } else if (fieldName.equals("ea")) { - uData.eastAsianWidth = Utility.lookup(fieldValue, UCD_Names.EA, true); + uData.eastAsianWidth = Utility.lookup(fieldValue, UCD_Names.SHORT_EA, true); } else if (fieldName.equals("lb")) { uData.lineBreak = Utility.lookup(fieldValue, UCD_Names.LB, true); diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java index 832bad46f83..cb7c6fe5734 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $ -* $Date: 2002/07/30 09:56:41 $ -* $Revision: 1.11 $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.12 $ * ******************************************************************************* */ @@ -63,7 +63,7 @@ public class GenerateCaseFolding implements UCD_Types { out.println("# CaseFolding" + GenerateData.getFileSuffix(false)); out.println(GenerateData.generateDateLine()); out.println("#"); - Utility.appendFile("CaseFoldingHeader.txt", false, out); + Utility.appendFile("CaseFoldingHeader.txt", Utility.LATIN1, out); /* PrintWriter out = new PrintWriter( @@ -561,7 +561,7 @@ public class GenerateCaseFolding implements UCD_Types { 
out.println("# SpecialCasing" + GenerateData.getFileSuffix(false)); out.println(GenerateData.generateDateLine()); out.println("#"); - Utility.appendFile("SpecialCasingHeader.txt", true, out); + Utility.appendFile("SpecialCasingHeader.txt", Utility.UTF8, out); Iterator it = sorted.keySet().iterator(); int lastOrder = -1; @@ -584,7 +584,7 @@ public class GenerateCaseFolding implements UCD_Types { case 3: out.println("# Ligatures"); break; case 4: skipLine = true; break; case 5: out.println("# No corresponding uppercase precomposed character"); break; - case 6: Utility.appendFile("SpecialCasingIota.txt", true, out); break; + case 6: Utility.appendFile("SpecialCasingIota.txt", Utility.UTF8, out); break; case 7: out.println("# Some characters with YPOGEGRAMMENI are also have no corresponding titlecases"); break; case 8: skipLine = true; break; } @@ -592,7 +592,7 @@ public class GenerateCaseFolding implements UCD_Types { } out.println(line); } - Utility.appendFile("SpecialCasingFooter.txt", true, out); + Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out); out.close(); Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile)); } diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseTest.java new file mode 100644 index 00000000000..ee6686e071f --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseTest.java @@ -0,0 +1,94 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseTest.java,v $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; + +import java.util.*; +import java.io.*; + +import com.ibm.text.utility.*; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; + +abstract public class GenerateCaseTest implements UCD_Types { + + public static void main(String[] args) throws IOException { + System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); + Default.setUCD(); + + PrintWriter out = Utility.openPrintWriter("CaseTest.txt", Utility.UTF8_WINDOWS); + + out.println("# CaseTest"); + out.println("# Generated: " + Default.getDate() + ", MED"); + Utility.appendFile("CaseTestHeader.txt", Utility.LATIN1, out); + + for (int cp = 0; cp <= 0x10FFFF; ++cp) { + Utility.dot(cp); + if (!Default.ucd.isAllocated(cp)) continue; + if (Default.ucd.isHangulSyllable(cp)) continue; + byte cat = Default.ucd.getCategory(cp); + if (cat == PRIVATE_USE) continue; + + String lower = Default.ucd.getCase(cp, FULL, LOWER); + String upper = Default.ucd.getCase(cp, FULL, UPPER); + String title = Default.ucd.getCase(cp, FULL, TITLE); + String fold = Default.ucd.getCase(cp, FULL, FOLD); + if (lower.equals(upper) + && lower.equals(title) + && lower.equals(fold)) continue; + + String s = UTF16.valueOf(cp); + write(out, s, true); + + // if (cp == '\u0345') continue; // don't add combining for this special case + + s = s + testChar; + + String s2 = Default.nfd.normalize(s); + + String lower1 = Default.nfc.normalize(Default.ucd.getCase(s2, FULL, LOWER)); + String upper1 = Default.nfc.normalize(Default.ucd.getCase(s2, FULL, UPPER)); + String title1 = Default.nfc.normalize(Default.ucd.getCase(s2, FULL, TITLE)); + String fold1 = 
Default.nfc.normalize(Default.ucd.getCase(s2, FULL, FOLD)); + + if (lower1.equals(Default.nfc.normalize(lower+testChar)) + && upper1.equals(Default.nfc.normalize(upper+testChar)) + && title1.equals(Default.nfc.normalize(title+testChar)) + && fold1.equals(Default.nfc.normalize(fold+testChar)) + ) continue; + + write(out, s, true); + } + out.println("# total lines: " + counter); + out.close(); + } + + static final char testChar = '\u0316'; + static int counter = 0; + + static void write(PrintWriter out, String ss, boolean doComment) { + String s = Default.nfd.normalize(ss); + String lower = Default.nfc.normalize(Default.ucd.getCase(s, FULL, LOWER)); + String upper = Default.nfc.normalize(Default.ucd.getCase(s, FULL, UPPER)); + String title = Default.nfc.normalize(Default.ucd.getCase(s, FULL, TITLE)); + String fold = Default.nfc.normalize(Default.ucd.getCase(s, FULL, FOLD)); + out.println(Utility.hex(ss) + "; " + + Utility.hex(lower) + "; " + + Utility.hex(upper) + "; " + + Utility.hex(title) + "; " + + Utility.hex(fold) + + (doComment ? 
"\t# " + Default.ucd.getName(ss) : "") + ); + counter++; + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index eeb0a11cebd..c39a0c8055c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2002/07/30 09:56:41 $ -* $Revision: 1.22 $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.23 $ * ******************************************************************************* */ @@ -545,6 +545,10 @@ public class GenerateData implements UCD_Types { if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue; if (i == (BINARY_PROPERTIES | Non_break)) continue; + if (type == NUMERIC_TYPE) { + //System.out.println("debug"); + } + UnicodeProperty up = UnifiedBinaryProperty.make(i, Default.ucd); if (up == null) continue; if (!up.isStandard()) continue; @@ -587,8 +591,9 @@ public class GenerateData implements UCD_Types { } valueAbb = up.getValue(SHORT); - if (valueAbb.length() == 0) valueAbb = "n/a"; valueAbb = Utility.getUnskeleton(valueAbb, false); + if (valueAbb.length() == 0) valueAbb = "n/a"; + //else if (valueAbb.equals(value)) valueAbb = "n/a"; if (type == COMBINING_CLASS) { @@ -643,6 +648,13 @@ public class GenerateData implements UCD_Types { } } + UCD.BlockData blockData = new UCD.BlockData(); + + int blockId = 0; + while (Default.ucd.getBlockData(blockId++, blockData)) { + addLine(sorted, "blk", "n/a", blockData.name); + } + String filename = "PropertyAliases"; String newFile = "DerivedData/" + filename + getFileSuffix(true); PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); @@ -651,7 +663,7 @@ public class GenerateData implements UCD_Types { log.println("# " + filename + 
getFileSuffix(false)); log.println(generateDateLine()); log.println("#"); - Utility.appendFile("PropertyAliasHeader.txt", false, log); + Utility.appendFile("PropertyAliasHeader.txt", Utility.LATIN1, log); log.println(HORIZONTAL_LINE); log.println(); Utility.print(log, sorted, "\r\n", new MyBreaker(true)); @@ -667,7 +679,7 @@ public class GenerateData implements UCD_Types { log.println("# " + filename + getFileSuffix(false)); log.println(generateDateLine()); log.println("#"); - Utility.appendFile("PropertyValueAliasHeader.txt", false, log); + Utility.appendFile("PropertyValueAliasHeader.txt", Utility.LATIN1, log); log.println(HORIZONTAL_LINE); log.println(); Utility.print(log, sorted, "\r\n", new MyBreaker(false)); diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java index 54195ea7625..f6af4ac20ef 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $ -* $Date: 2002/08/04 21:38:45 $ -* $Revision: 1.9 $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.10 $ * ******************************************************************************* */ @@ -45,7 +45,7 @@ public final class GenerateHanTransliterator implements UCD_Types { log = Utility.openPrintWriter("Unihan_log.html", Utility.UTF8_WINDOWS); log.println("<body>"); - BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true); + BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, Utility.UTF8); Map properties = new TreeMap(); @@ -502,7 +502,7 @@ public final class GenerateHanTransliterator implements UCD_Types { if (type == CHINESE) { System.out.println("Reading chinese_frequency.txt"); 
- br = Utility.openReadFile(BASE_DIR + "dict\\chinese_frequency.txt", true); + br = Utility.openReadFile(BASE_DIR + "dict\\chinese_frequency.txt", Utility.UTF8); counter = 0; while (true) { line = Utility.readDataLine(br); @@ -521,7 +521,7 @@ public final class GenerateHanTransliterator implements UCD_Types { if (type == JAPANESE) { System.out.println("Reading japanese_frequency.txt"); - br = Utility.openReadFile( BASE_DIR + "dict\\japanese_frequency.txt", true); + br = Utility.openReadFile( BASE_DIR + "dict\\japanese_frequency.txt", Utility.UTF8); Map japaneseMap = new HashMap(); while (true) { line = Utility.readDataLine(br); @@ -704,7 +704,7 @@ public final class GenerateHanTransliterator implements UCD_Types { if (type == JAPANESE) fname = "edict.txt"; System.out.println("Reading " + fname); - BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, true); + BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8); int counter = 0; String[] pieces = new String[50]; String line = ""; @@ -751,7 +751,7 @@ public final class GenerateHanTransliterator implements UCD_Types { String fname = "Chinese_override.txt"; System.out.println("Reading " + fname); - BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, true); + BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8); int counter = 0; String[] pieces = new String[50]; String line = ""; @@ -997,7 +997,7 @@ public final class GenerateHanTransliterator implements UCD_Types { static void readCDICT() throws IOException { System.out.println("Reading cdict.txt"); - BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", true); + BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", Utility.UTF8); int counter = 0; String[] pieces = new String[50]; String line = ""; @@ -1075,7 +1075,7 @@ public final class GenerateHanTransliterator implements UCD_Types { static void readUnihanData(String key) throws 
java.io.IOException { - BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true); + BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, Utility.UTF8); int count = 0; int lineCounter = 0; diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks-old.java b/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks-old.java new file mode 100644 index 00000000000..c89f330b005 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks-old.java @@ -0,0 +1,74 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks-old.java,v $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; +import java.io.*; +import com.ibm.text.utility.*; +import com.ibm.text.UnicodeSet; +import java.util.*; + +public class GenerateThaiBreaks { + public static void main(String [] args) throws IOException { + + BufferedReader br = new BufferedReader( + new InputStreamReader( + new FileInputStream("\\icu4j\\src\\data\\thai6.ucs"), "UnicodeLittle")); + try { + Main.setUCD(); + UnicodeSet ignorables = new UnicodeSet("[:M:]"); + ignorables.retain(0x0E00, 0x0E7F); // just Thai block + ignorables.add(0x0E40, 0x0E44); // add logical order exception + ignorables.add(0, ' '); // add controls + ignorables.add('.'); + + UnicodeSet initials = new UnicodeSet(); + UnicodeSet finals = new UnicodeSet(); + UnicodeSet medials = new UnicodeSet(); + while (true) { + String line = br.readLine(); + if (line == null) break; + int end; + + // find final consonant + for (int i = line.length() - 1; ; --i) { + char c = line.charAt(i); 
+ if (!ignorables.contains(c)) { + finals.add(c); + end = i; + break; + } + } + + boolean haveFirst = false; + for (int i = 0; i < end; ++i) { + char c = line.charAt(i); + if (ignorables.contains(c)) continue; + if (!haveFirst) { + initials.add(c); + haveFirst = true; + } else { + medials.add(c); + } + } + } + + initials.removeAll(medials); + finals.removeAll(medials); + Utility.showSetNames("initials: ", initials, false, Main.ucd); + Utility.showSetNames("finals: ", finals, false, Main.ucd); + Utility.showSetNames("medials: ", medials, false, Main.ucd); + } finally { + br.close(); + } + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/IANANames.java b/tools/unicodetools/com/ibm/text/UCD/IANANames.java index 6abfb651bc3..383b362cc78 100644 --- a/tools/unicodetools/com/ibm/text/UCD/IANANames.java +++ b/tools/unicodetools/com/ibm/text/UCD/IANANames.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/IANANames.java,v $ -* $Date: 2002/08/08 15:38:16 $ -* $Revision: 1.1 $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.2 $ * ******************************************************************************* */ @@ -65,7 +65,7 @@ public class IANANames implements UCD_Types { } public IANANames() throws IOException { - BufferedReader in = Utility.openReadFile(BASE_DIR + "IANA\\character-sets.txt", false); + BufferedReader in = Utility.openReadFile(BASE_DIR + "IANA\\character-sets.txt", Utility.LATIN1); try { boolean atStart = true; String lastName = ""; diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index 6a863f6c7ec..00d9e005056 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: 
/xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2002/10/01 01:19:16 $ -* $Revision: 1.24 $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.25 $ * ******************************************************************************* */ @@ -73,6 +73,8 @@ public final class Main implements UCD_Types { else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry(); + else if (arg.equalsIgnoreCase("testenum")) SampleEnum.test(); + else if (arg.equalsIgnoreCase("quicktest")) QuickTest.test(); else if (arg.equalsIgnoreCase("TernaryStore")) TernaryStore.test(); diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt index 73858efd205..80f1db25521 100644 --- a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt @@ -34,4 +34,4 @@ # In addition, some property names may be the same as some property value names. # # The combination of property value and property name is, however, unique. -# For more information, see UTR #24: Regular Expression Guidelines +# For more information, see UTR #18: Regular Expression Guidelines diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt index 92c7d7ca715..1b1c0b44866 100644 --- a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt @@ -29,7 +29,7 @@ # and '_' are ignored. # # NOTE: The Block property values are in Blocks.txt, and not repeated here. -# For more information on the use of blocks, see UTR #24: Regular Expression Guidelines +# For more information on the use of blocks, see UTR #18: Regular Expression Guidelines # # NOTE: Currently there is at most one abbreviated name and one long name for # property value. 
However, in the future additional aliases diff --git a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java new file mode 100644 index 00000000000..69f82ca8f28 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java @@ -0,0 +1,103 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; + +import java.util.*; +import java.io.*; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; + +import com.ibm.text.utility.*; + +public class QuickTest implements UCD_Types { + static final void test() { // prints how the XML NameStartChar/NameChar sets relate to several Unicode property sets + Default.setUCD(); +/* + [4] NameStartChar := ":" | [A-Z] | "_" | [a-z] | + [#xC0 - #x2FF] | [#x370 - #x37D] | [#x37F - #x1FFF] | + [#x200C - #x200D] | [#x2070 - #x218F] | [#x2C00 - #x2FEF] | + [#x3001 - #xD7FF] | [#xF900 - #xF9FF] | [#x10000 - #xDFFFF] + + [4a] NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] +*/ + UnicodeSet nameStartChar = new UnicodeSet("[\\: A-Z \\_ a-z" + + "\\u00c0-\\u02FF \\u0370-\\u037D \\u037F-\\u1FFF" + + "\\u200C-\\u200D \\u2070-\\u218F \\u2C00-\\u2FEF" + + "\\u3001-\\uD7FF \\uF900-\\uF9FF \\U00010000-\\U000DFFFF]"); + + UnicodeSet nameChar = new UnicodeSet("[\\- \\. 
0-9 \\u00B7 \\u0300-\\u036F]") + .addAll(nameStartChar); + + showSet("NameStartChar", nameStartChar); + showDiffs("NameChar", nameChar, "NameStartChar", nameStartChar); + + + UnicodeSet defaultIgnorable = UnifiedBinaryProperty.make(DERIVED | DefaultIgnorable).getSet(); + UnicodeSet whitespace = UnifiedBinaryProperty.make(BINARY_PROPERTIES | White_space).getSet(); + + UnicodeSet notNFKC = new UnicodeSet(); + UnicodeSet privateUse = new UnicodeSet(); + UnicodeSet noncharacter = new UnicodeSet(); + UnicodeSet format = new UnicodeSet("[:Cf:]"); + + for (int i = 0; i <= 0x10FFFF; ++i) { + if (!Default.ucd.isAllocated(i)) continue; + if (!Default.nfkc.isNormalized(i)) notNFKC.add(i); + if (Default.ucd.isNoncharacter(i)) noncharacter.add(i); + if (Default.ucd.getCategory(i) == PRIVATE_USE) privateUse.add(i); + } + + showSet("notNFKC in NameChar", new UnicodeSet(notNFKC).retainAll(nameChar)); + showSet("notNFKC outside of NameChar", new UnicodeSet(notNFKC).removeAll(nameChar)); + + showSet("Whitespace in NameChar", new UnicodeSet(nameChar).retainAll(whitespace)); + showSet("Whitespace not in NameChar", new UnicodeSet(whitespace).removeAll(nameChar)); + + + showSet("Noncharacters in NameChar", new UnicodeSet(noncharacter).retainAll(nameChar)); // intersect with nameChar (not with itself) so the set matches its title + showSet("Noncharacters outside of NameChar", new UnicodeSet(noncharacter).removeAll(nameChar)); + + showSet("Format in NameChar", new UnicodeSet(nameChar).retainAll(format)); + showSet("Other Default_Ignorables in NameChar", new UnicodeSet(defaultIgnorable).removeAll(format).retainAll(nameChar)); + showSet("PrivateUse in NameChar", new UnicodeSet(nameChar).retainAll(privateUse)); // start from nameChar, as the "Format in NameChar" line does + + UnicodeSet CID_Start = new UnicodeSet("[:ID_Start:]").removeAll(notNFKC); + UnicodeSet CID_Continue = new UnicodeSet("[:ID_Continue:]") + .removeAll(notNFKC).removeAll(format); + + UnicodeSet CID_Continue_extras = new UnicodeSet(CID_Continue).removeAll(CID_Start); + + showDiffs("NoK_ID_Start", CID_Start, "NameStartChar", nameStartChar); + 
showDiffs("NoK_ID_Continue_Extras", CID_Continue_extras, "NameChar", nameChar); + + System.out.println("Removing canonical singletons"); + } + + static void showDiffs(String title1, UnicodeSet set1, String title2, UnicodeSet set2) { // prints the members of set1 that are not in set2 + showSet(title1 + " - " + title2, new UnicodeSet(set1).removeAll(set2)); + } + + static void showSet(String title1, UnicodeSet set1) { // prints the title, then NONE or the count, pattern and member names + System.out.println(); + System.out.println(title1); + if (set1.size() == 0) { + System.out.println("\tNONE"); + return; + } + System.out.println("\tCount:" + set1.size()); + System.out.println("\tSet:" + set1.toPattern(true)); + System.out.println("\tDetails:"); + Utility.showSetNames("", set1, false, Default.ucd); + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/TernaryStore.java b/tools/unicodetools/com/ibm/text/UCD/TernaryStore.java new file mode 100644 index 00000000000..9525ebf63e9 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/TernaryStore.java @@ -0,0 +1,566 @@ +package com.ibm.text.UCD; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.lang.UCharacter; +import com.ibm.text.utility.*; +import java.util.*; +import java.io.*; + +// Enumerated properties will be IntCodePointProperty. +// The string values they return will be the property value names. +// Binary properties are Enumerated properties. 
They return 0 or 1 + +public final class TernaryStore { // ternary search tree from strings to int results (see Node: less/greater/next links) + + static final int DONE = Integer.MIN_VALUE; // Matcher.next(): no further result + static final int NOT_FOUND = Integer.MIN_VALUE+1; // Node.result value meaning no result is stored at that node + + // for testing + static DepthPrinter dp; + + static void test() throws java.io.IOException { + Default.setUCD(); + + PrintWriter pw = Utility.openPrintWriter("TestTernary.txt", Utility.LATIN1_WINDOWS); + try { + dp = new DepthPrinter(pw); + + String[] tests = {"the", "quick", "fish", "fisherman", "fishes", + "brown", "brow", "bracket", "bright", "brat", + "brough", "dogs", "upper", "zebra", + "fisher"}; + test("Simple: ", tests, tests.length); + + + tests = new String[300000]; + int counter = 0; + int i; + for (i = 0; counter < tests.length && i <= 0x10FFFF; ++i) { + if (Default.ucd.hasComputableName(i)) continue; + + String temp = UCharacter.getName(i); + if (temp != null) tests[counter++] = temp.trim(); + } + System.out.println("max-cp: " + Utility.hex(i)); + test("Unicode Names: ", tests, counter); + + //if (true) return; + + BufferedReader br = Utility.openReadFile(UCD_Types.BASE_DIR + "dict\\DiploFreq.txt", Utility.LATIN1); // NOTE(review): br is never closed; the finally block below only closes pw + String line; + counter = 0; + while (counter < tests.length) { + line = Utility.readDataLine(br); + if (line == null) break; + if (line.length() == 0) continue; + Utility.dot(counter); + int tabPos = line.indexOf('\t'); + if (tabPos < 0) { + System.out.println("???" 
+ line); + continue; + } + tests[counter++] = line.substring(tabPos+1); + } + test("French: ", tests, counter); + } finally { + pw.close(); + } + } + + static void test(String title, String[] tests, int len) { + System.out.println(); + System.out.println(title); + dp.println(); + dp.print(title, 0); + dp.println(); + TernaryStore.Builder builder = new TernaryStore.Builder(); + int charCount = 0; + for (int i = 0; i < len; ++i) { + builder.add(tests[i], i); + charCount += tests[i].length(); + } + System.out.println("charCount: " + charCount); + TernaryStore store = builder.build(); + store.showNodes(); + store.checkNodes(); + + dp.println("Storage"); + dp.println(store.stringStore.toString()); + System.out.println("StorageSize: " + store.stringStore.toString().length()); + + Matcher matcher = store.getMatcher(); + for (int i = 0; i < len; ++i) { + int check = test(tests[i], matcher); + if (check != i) { + System.out.println("\tFail, result: " + tests[i] + ", " + check); + } + } + } + + static int test(String s, Matcher matcher) { + matcher.reset(s, 0); + int lastResult = -1; + for (int result = matcher.next(); result != DONE; result = matcher.next()) { + lastResult = result; + } + return lastResult; + } + + static final class Node { + String getString(StringStore stringStore) { + if (stringCode < 0) return tempString; + return stringStore.get(stringCode); + } + void setString(String s) { + tempString = s; + } + String tempString; + int stringCode = -1; + Node less; + Node greater; + Node next; + int result = NOT_FOUND; + + public String toString(StringStore store) { + return getString(store) + + (result != NOT_FOUND ? "(" + result + ")" : "") + + (next != null ? 
next.toString(store) : ""); // recurse with the store; plain toString() would be Object.toString() + } + } + + Node base; + StringStore stringStore = new StringStore(); + + final static class Matcher { + TernaryStore store; + String s; + int position; + Node lastNode; + + void reset(String s, int position) { + this.s = s; + this.position = position; + this.lastNode = store.base; + } + + // returns the next result + // or DONE when done + // sets position to point after end of found string + + int next() { + while (lastNode != null && position < s.length()) { + char ch = s.charAt(position++); + do { + String nodeString = lastNode.getString(store.stringStore); + char first = nodeString.charAt(0); + if (ch == first) { + // now check the rest of the string + for (int i = 1; i < nodeString.length(); ++i) { + char other = nodeString.charAt(i); + if (position >= s.length() || other != s.charAt(position++)) { // input ending inside a multi-character node is a mismatch, not an exception + return DONE; + } + } + + // if we succeed, return result if there is one + int result = lastNode.result; + lastNode = lastNode.next; + if (result != NOT_FOUND) return result; + break; // get next char + } + // otherwise branch sideways, keeping same char + if (ch > first) { + lastNode = lastNode.greater; + } else { + lastNode = lastNode.less; + } + } while (lastNode != null); + } + return DONE; + } + } + + public Matcher getMatcher() { + Matcher result = new Matcher(); + result.store = this; + return result; + } + + public void showNodes() { + showNodes2(base, "", 5); + } + + public void showNodes2(Node n, String path, int depth) { + if (n.less != null) { + showNodes2(n.less, path+"-", depth); + } + dp.print("", depth); + if (false) dp.print(path); + dp.print(n.getString(stringStore)); + if (n.result != NOT_FOUND) dp.print("/" + n.result); + dp.println(); + if (n.next != null) { + showNodes2(n.next, path+".", depth+n.getString(stringStore).length()); + } + if (n.greater != null) { + showNodes2(n.greater, path+"+", depth); + } + } + + static class NodeInfo { + int nodeCount; + int resultCount; + int nullLessCount; + int nullGreaterCount; + int nullSimpleCount; + int 
nullNextCount; + } + + public void checkNodes() { + NodeInfo nodeInfo = new NodeInfo(); + checkNodes(base, nodeInfo); + System.out.println("Nodes: " + nodeInfo.nodeCount); + System.out.println("nullLessCount: " + nodeInfo.nullLessCount); + System.out.println("nullGreaterCount: " + nodeInfo.nullGreaterCount); + System.out.println("nullNextCount: " + nodeInfo.nullNextCount); + System.out.println("resultCount: " + nodeInfo.resultCount); + System.out.println("nullSimpleCount: " + nodeInfo.nullSimpleCount); + } + + public void checkNodes(Node n, NodeInfo nodeInfo) { + nodeInfo.nodeCount++; + if (n.result != NOT_FOUND) nodeInfo.resultCount++; + if (n.less != null) { + checkNodes(n.less, nodeInfo); + } else { + nodeInfo.nullLessCount++; + if (n.greater == null && n.result == NOT_FOUND) nodeInfo.nullSimpleCount++; + } + if (n.next != null) { + checkNodes(n.next, nodeInfo); + } else { + nodeInfo.nullNextCount++; + } + if (n.greater != null) { + checkNodes(n.greater, nodeInfo); + } else { + nodeInfo.nullGreaterCount++; + } + } + + final static class DepthPrinter { + private PrintWriter pw; + private int currentDepth = 0; + private String leader = "."; + + DepthPrinter(PrintWriter pw) { + this.pw = pw; + } + + void print(char ch) { + print(ch, 0); + } + + void print(String s) { + print(s, 0); + } + + void print(char ch, int depth) { + print(String.valueOf(ch), depth); + } + + void print(String s, int depth) { + int delta = depth - currentDepth; + if (delta > 0) { + pw.print(Utility.repeat(leader, delta - 1)); + currentDepth = depth; + } + pw.print(s); + currentDepth += s.length(); + } + + void println() { + pw.println(); + currentDepth = 0; + } + + void println(String s) { + pw.print(s); + pw.println(); + currentDepth = 0; + } + } + + final static class StringStore { + // initially, there is a simple strategy + + private String buffer = ""; + private static final char TERMINATOR = '\u007E'; + private static final int PIECE_LENGTH = 5; + private String[] pieces = new 
String[50]; // HACK (scratch buffer for Utility.split; per instance) + private Set strings = new HashSet(); // per instance, so strings added to one store are not carried into the next + + public void add(String s) { + strings.add(s); + } + + public void compact() { + System.out.println("Adding Pieces"); + // add all the pieces + Iterator it = strings.iterator(); + Set additions = new HashSet(); + while (it.hasNext()) { + String s = (String)it.next(); + int len = Utility.split(s, ' ', pieces); + for (int i = 0; i < len; ++i) { + additions.add(pieces[i]); + } + } + + store(additions); + store(strings); + } + + private void store(Set stuff) { + System.out.println("Sorting"); + // sort them by length, longest first + Set ordered = new TreeSet(); + Iterator it = stuff.iterator(); + while (it.hasNext()) { + String s = (String)it.next(); + ordered.add(new Pair(new Integer(-s.length()), s)); + } + System.out.println("Storing"); + // add them + it = ordered.iterator(); + while (it.hasNext()) { + String s = (String)(((Pair)it.next()).second); + get(s); + } + } + + private int get(String s) { + System.out.println("Adding: \'" + s + "\'"); + int index; + if (s.indexOf(' ') < 0) { + index = addNoSplit(s); + System.out.println("\tReturning: " + index); + return index; + } + int len = Utility.split(s, ' ', pieces); + StringBuffer itemCodes = new StringBuffer(); + for (int i = 0; i < len; ++i) { + String piece = pieces[i]; + itemCodes.append((char)addNoSplit(piece)); + /*for (int j = 0; j < piece.length(); j += PIECE_LENGTH) { + int maxLen = j + PIECE_LENGTH; + if (maxLen > piece.length()) maxLen = piece.length(); + itemCodes.append((char)addNoSplit(piece.substring(j, maxLen))); + }*/ + } + index = 0x8000 | addNoSplit(itemCodes.toString()); // mark it as composite + System.out.println("\tReturning: " + index); + return index; + } + + private int addNoSplit(String s) { + System.out.println("\tAdding2: \'" + s + "\'"); + String sTerm = s + TERMINATOR; + int index = buffer.indexOf(sTerm); + if (index >= 0) return index; + + index = buffer.length(); + buffer += sTerm; + 
System.out.println("\t\tReturning2: " + index); + return index; + } + + public String get(int index) { + String result; + System.out.println("Fetching: " + index); + + if ((index & 0x8000) == 0) { + int end = buffer.indexOf(TERMINATOR, index); + result = buffer.substring(index, end); + System.out.println("\tReturning: '" + result + "'"); + return result; + } + index &= ~0x8000; // remove 1 bit + + int end = buffer.indexOf(TERMINATOR, index); + result = ""; + for (int i = index; i < end; ++i) { + if (result.length() != 0) result += " "; + result += get(buffer.charAt(i)); + } + System.out.println("\tReturning: '" + result + "'"); + return result; + } + + public String toString() { + return buffer; + } + + } + + final static class Builder { + Map map = new TreeMap(); + String[] names; + TernaryStore store; + Set set = new TreeSet(); + + public void add(String name, int result) { + map.put(name, new Integer(result)); + } + + public TernaryStore build() { + // flatten strings into array + names = new String[map.size()]; + Iterator it = map.keySet().iterator(); + int count = 0; + while (it.hasNext()) { + names[count++] = (String) it.next(); + if (false) { + dp.print((count-1) + " " + names[count-1]); + dp.println(); + } + } + + // build nodes + store = new TernaryStore(); + addNode(0, names.length); + + // free storage + names = null; + map.clear(); + + System.out.println("compacting"); + compactStore(store.base); + store.stringStore.compact(); + + //compactStrings(store); + //set.clear(); // free more storage + + replaceStrings(store.base); + //map.clear(); // free storage + + // free storage + TernaryStore result = store; + store = null; + + return result; + } + + /* + void compactStrings(TernaryStore t) { + // we have a set of Pairs, first is length, second is string + // compact them, word by word + Iterator it = set.iterator(); + while (it.hasNext()) { + String string = ((String)((Pair)it.next()).second); + int index = t.stringStore.add(string); + if (true) { + 
System.out.println("Checking: " + index); + String reverse = t.stringStore.get(index); + if (!reverse.equals(string)) { + System.out.println("source: \'" + string + "\'"); + System.out.println("reverse: \'" + reverse + "\'"); + throw new IllegalArgumentException("Failed roundtrip"); + } + } + + map.put(string, new Integer(index)); + } + } + */ + + public void replaceStrings(Node n) { + n.stringCode = store.stringStore.get(n.getString(store.stringStore)); + n.setString(null); + if (n.less != null) replaceStrings(n.less); + if (n.next != null) replaceStrings(n.next); + if (n.greater != null) replaceStrings(n.greater); + } + + public void compactStore(Node n) { + Node nextNode = n.next; + if (false) dp.println(n.toString()); + while (n.result == NOT_FOUND && nextNode != null && nextNode.greater == null + && nextNode.less == null) { + n.setString(n.getString(store.stringStore) + nextNode.getString(store.stringStore)); + n.result = nextNode.result; + n.next = nextNode = nextNode.next; // remove old node + } + // add strings sorted by length, longest first + store.stringStore.add(n.getString(store.stringStore)); + + if (n.less != null) compactStore(n.less); + if (n.next != null) compactStore(n.next); + if (n.greater != null) compactStore(n.greater); + } + + private void addNode(int start, int limit) { + if (start >= limit) return; + int mid = (start + limit) / 2; + //System.out.println("start: " + start + ", mid: " + mid + ", limit: " + limit); + //System.out.println("adding: " + names[mid]); + addNode(names[mid], ((Integer)map.get(names[mid])).intValue()); + addNode(start, mid); + addNode(mid+1, limit); + } + + private void addNode(String s, int result) { + if (store.base == null) { + store.base = addRest(s, 0, result); + return; + } + Node n = store.base; + Node lastNode = n; + + for (int i = 0; i < s.length(); ++i) { + char ch = s.charAt(i); + while (true) { + char first = n.getString(store.stringStore).charAt(0); + if (ch == first) { + if (n.next == null) { + n.next 
= addRest(s, i+1, result); + return; + } + lastNode = n; + n = n.next; + break; // get next char + } + // otherwise branch sideways, keeping same char + if (ch > first) { + if (n.greater == null) { + n.greater = addRest(s, i, result); + return; + } + n = n.greater; + } else { + if (n.less == null) { + n.less = addRest(s, i, result); + return; + } + n = n.less; + } + } + } + lastNode.result = result; + } + + private Node addRest(String s, int position, int result) { + Node lastNode = null; + for (int i = s.length() - 1; i >= position; --i) { + Node n = new Node(); + n.setString(s.substring(i, i+1)); // + "" to force a new string + if (lastNode == null) { + n.result = result; + } + n.next = lastNode; + lastNode = n; + } + return lastNode; + } + } +} + diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 6ce3c43055a..223f7d271ac 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,21 +5,25 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2002/09/25 06:40:13 $ -* $Revision: 1.18 $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.19 $ * ******************************************************************************* */ package com.ibm.text.UCD; +import java.util.List; +import java.util.ArrayList; import java.util.HashMap; import java.util.BitSet; import java.util.Map; + import java.io.IOException; import java.io.DataInputStream; import java.io.BufferedInputStream; import java.io.FileInputStream; +import java.io.BufferedReader; import com.ibm.text.utility.*; @@ -31,7 +35,7 @@ public final class UCD implements UCD_Types { /** * Used for the default version. 
*/ - public static final String latestVersion = "3.2.0"; + public static final String latestVersion = "3.2.1"; /** * Create singleton instance for default (latest) version @@ -651,7 +655,7 @@ public final class UCD implements UCD_Types { } public static String getCategoryID_fromIndex(byte prop) { - return UCD_Names.GC[prop]; + return getCategoryID_fromIndex(prop, NORMAL); } public static String getCategoryID_fromIndex(byte prop, byte style) { @@ -660,7 +664,7 @@ public final class UCD implements UCD_Types { public String getCombiningClassID(int codePoint) { - return getCombiningClassID_fromIndex(getCombiningClass(codePoint), NORMAL); + return getCombiningClassID(codePoint, NORMAL); } public String getCombiningClassID(int codePoint, byte style) { @@ -681,9 +685,9 @@ public final class UCD implements UCD_Types { case 7: s = style < LONG ? "NK" : "Nukta"; break; case 8: s = style < LONG ? "KV" : "KanaVoicing"; break; case 9: s = style < LONG ? "VR" : "Virama"; break; - case 202: s = style < LONG ? "ATBL" : "AttachedBelowLeft"; break; - case 204: s = style < LONG ? "ATB" : "AttachedBelow"; break; - case 206: s = style < LONG ? "ATBR" : "AttachedBelowRight"; break; + case 200: s = style < LONG ? "ATBL" : "AttachedBelowLeft"; break; + case 202: s = style < LONG ? "ATB" : "AttachedBelow"; break; + case 204: s = style < LONG ? "ATBR" : "AttachedBelowRight"; break; case 208: s = style < LONG ? "ATL" : "AttachedLeft"; break; case 210: s = style < LONG ? "ATR" : "AttachedRight"; break; case 212: s = style < LONG ? 
"ATAL" : "AttachedAboveLeft"; break; @@ -734,7 +738,7 @@ public final class UCD implements UCD_Types { } public static String getNumericTypeID_fromIndex(byte prop) { - return UCD_Names.NT[prop]; + return getNumericTypeID_fromIndex(prop, NORMAL); } public static String getNumericTypeID_fromIndex(byte prop, byte style) { @@ -746,7 +750,7 @@ public final class UCD implements UCD_Types { } public static String getEastAsianWidthID_fromIndex(byte prop) { - return UCD_Names.EA[prop]; + return getEastAsianWidthID_fromIndex(prop, NORMAL); } public static String getEastAsianWidthID_fromIndex(byte prop, byte style) { @@ -758,7 +762,7 @@ public final class UCD implements UCD_Types { } public static String getLineBreakID_fromIndex(byte prop) { - return UCD_Names.LB[prop]; + return getLineBreakID_fromIndex(prop, NORMAL); } public static String getLineBreakID_fromIndex(byte prop, byte style) { @@ -770,7 +774,7 @@ public final class UCD implements UCD_Types { } public static String getJoiningTypeID_fromIndex(byte prop) { - return UCD_Names.JOINING_TYPE[prop]; + return getJoiningTypeID_fromIndex(prop, NORMAL); } public static String getJoiningTypeID_fromIndex(byte prop, byte style) { @@ -782,7 +786,7 @@ public final class UCD implements UCD_Types { } public static String getJoiningGroupID_fromIndex(byte prop) { - return UCD_Names.JOINING_GROUP[prop]; + return getJoiningGroupID_fromIndex(prop, NORMAL); } public static String getJoiningGroupID_fromIndex(byte prop, byte style) { @@ -795,7 +799,7 @@ public final class UCD implements UCD_Types { } public static String getScriptID_fromIndex(byte prop) { - return UCD_Names.SCRIPT[prop]; + return getScriptID_fromIndex(prop, NORMAL); } public static String getScriptID_fromIndex(byte prop, byte length) { @@ -808,7 +812,7 @@ public final class UCD implements UCD_Types { } public static String getAgeID_fromIndex(byte prop) { - return UCD_Names.AGE[prop]; + return getAgeID_fromIndex(prop, NORMAL); } public static String getAgeID_fromIndex(byte 
prop, byte style) { @@ -1306,4 +1310,60 @@ to guarantee identifier closure. } } } + + /** Start, end and name of one block, as filled in by getBlockData. */ + public static class BlockData { + public int start; + public int end; + public String name; + } + + /** + * Copies the block at position blockId (0-based, in file order) into output. + * @return false when blockId is outside the loaded block list (output is left unchanged), true otherwise + */ + public boolean getBlockData(int blockId, BlockData output) { + if (blocks == null) loadBlocks(); + // explicit range check instead of catching IndexOutOfBoundsException + if (blockId < 0 || blockId >= blocks.size()) { + return false; + } + BlockData temp = (BlockData) blocks.get(blockId); + output.name = temp.name; + output.start = temp.start; + output.end = temp.end; + return true; + } + + private List blocks = null; + + /** Reads the "Blocks" Unicode data file (opened with this object's version) into the blocks list. */ + private void loadBlocks() { + List loaded = new ArrayList(); + try { + BufferedReader in = Utility.openUnicodeFile("Blocks", version, true, Utility.LATIN1); + try { + while (true) { + // 0000..007F; Basic Latin + String line = Utility.readDataLine(in); + if (line == null) break; + if (line.length() == 0) continue; + int pos1 = line.indexOf('.'); + int pos2 = line.indexOf(';', pos1); + + BlockData blockData = new BlockData(); + blockData.start = Integer.parseInt(line.substring(0, pos1), 16); + blockData.end = Integer.parseInt(line.substring(pos1+2, pos2), 16); + blockData.name = line.substring(pos2+1).trim().replace(' ', '_'); + loaded.add(blockData); + } + } finally { + in.close(); + } + } catch (IOException e) { + throw new IllegalArgumentException("Can't read block file: " + e.getMessage()); + } + // publish only after a complete read, so a failed load is not mistaken for an empty block list + blocks = loaded; + } } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index ecca42fc2b9..ec9b7797b6c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2002/06/13 21:14:05 $ -* $Revision: 1.14 $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.15 $ * ******************************************************************************* 
*/ @@ -201,11 +201,11 @@ final class UCD_Names implements UCD_Types { static final String[] YN_TABLE = {"N", "Y"}; - static String[] EA = { + static String[] SHORT_EA = { "N", "A", "H", "W", "F", "Na" }; - static String[] SHORT_EA = { + static String[] EA = { "Neutral", "Ambiguous", "Halfwidth", "Wide", "Fullwidth", "Narrow" }; diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 74173aa16f4..94f82407c77 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2002/08/04 21:38:45 $ -* $Revision: 1.15 $ +* $Date: 2002/10/05 01:28:58 $ +* $Revision: 1.16 $ * ******************************************************************************* */ @@ -15,7 +15,7 @@ package com.ibm.text.UCD; public interface UCD_Types { - public static final int dVersion = 9; // change to fix the generated file D version. If less than zero, no "d" + public static final int dVersion = 2; // change to fix the generated file D version. 
If less than zero, no "d" public static final String BASE_DIR = "C:\\DATA\\"; public static final String UCD_DIR = BASE_DIR + "UCD\\"; diff --git a/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java b/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java index 57f4c1eda86..9633b3d1575 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $ -* $Date: 2002/08/04 21:38:44 $ -* $Revision: 1.9 $ +* $Date: 2002/10/05 01:28:57 $ +* $Revision: 1.10 $ * ******************************************************************************* */ @@ -299,13 +299,13 @@ public final class UnifiedBinaryProperty extends UnicodeProperty { case COMBINING_CLASS>>8: return ucd.getCombiningClassID_fromIndex((byte)propValue, style); case BIDI_CLASS>>8: return ucd.getBidiClassID_fromIndex((byte)propValue, style); case DECOMPOSITION_TYPE>>8: return ucd.getDecompositionTypeID_fromIndex((byte)propValue, style); - case NUMERIC_TYPE>>8: ucd.getNumericTypeID_fromIndex((byte)propValue, style); - case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex((byte)propValue); + case NUMERIC_TYPE>>8: return ucd.getNumericTypeID_fromIndex((byte)propValue, style); + case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex((byte)propValue, style); case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex((byte)propValue, style); - case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex((byte)propValue); - case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex((byte)propValue); + case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex((byte)propValue, style); + case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex((byte)propValue, style); case BINARY_PROPERTIES>>8: return 
ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style); - case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue); + case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue, style); case AGE>>8: return ucd.getAgeID_fromIndex((byte)propValue); /* case DERIVED>>8: diff --git a/tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java b/tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java index 88e0a2ac53e..92868108aa7 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedProperty.java,v $ -* $Date: 2002/08/08 15:38:16 $ -* $Revision: 1.1 $ +* $Date: 2002/10/05 01:28:57 $ +* $Revision: 1.2 $ * ******************************************************************************* */ @@ -175,12 +175,12 @@ public final class UnifiedProperty extends UnicodeProperty { case BIDI_CLASS>>8: return ucd.getBidiClassID_fromIndex(ucd.getBidiClass(cp), style); case DECOMPOSITION_TYPE>>8: return ucd.getDecompositionTypeID_fromIndex(ucd.getDecompositionType(cp), style); case NUMERIC_TYPE>>8: return ucd.getNumericTypeID_fromIndex(ucd.getNumericType(cp), style); - case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex(ucd.getEastAsianWidth(cp)); - case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp)); - case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex(ucd.getJoiningType(cp)); - case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(cp)); - case SCRIPT>>8: return ucd.getScriptID_fromIndex(ucd.getScript(cp)); - case AGE>>8: return ucd.getAgeID_fromIndex(ucd.getAge(cp)); + case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex(ucd.getEastAsianWidth(cp), style); + case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp), style); 
+ case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex(ucd.getJoiningType(cp), style); + case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(cp), style); + case SCRIPT>>8: return ucd.getScriptID_fromIndex(ucd.getScript(cp), style); + case AGE>>8: return ucd.getAgeID_fromIndex(ucd.getAge(cp), style); default: throw new IllegalArgumentException("Internal Error"); } } diff --git a/tools/unicodetools/com/ibm/text/utility/PoorMansEnum.java b/tools/unicodetools/com/ibm/text/utility/PoorMansEnum.java new file mode 100644 index 00000000000..caae2cd4131 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/utility/PoorMansEnum.java @@ -0,0 +1,99 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/PoorMansEnum.java,v $ +* $Date: 2002/10/05 01:28:57 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +/* Goal for enum is: + * Easy to use + * ID <-> int + * ID <-> string name + */ +package com.ibm.text.utility; + +import java.util.*; + +public class PoorMansEnum { // base class for hand-written enums; EnumStore.add assigns value, name and next + protected int value; + protected String name; + protected PoorMansEnum next; + + public int toInt() { + return value; + } + + public String toString() { + return name; + } + + // for subclassers + + protected PoorMansEnum() { + } + + /** Utility for subclasses + */ + protected static class EnumStore { + private List int2Id = new ArrayList(); + private Map string2Id = new HashMap(); + private PoorMansEnum last = null; + + public PoorMansEnum add(PoorMansEnum id, String name) { + // both string and id must be new! 
+ if (int2Id.indexOf(id) >= 0) { + throw new IllegalArgumentException("ID already stored for \"" + name + '"'); + } else if (string2Id.containsKey(name)) { + throw new IllegalArgumentException('"' + name + "\" already stored for ID " + string2Id.get(name)); // name the ID that already owns this string + } + id.value = int2Id.size(); + id.name = name; + if (last != null) { + last.next = id; + } + int2Id.add(id); + string2Id.put(name, id); + last = id; + return id; + } + + public PoorMansEnum addAlias(PoorMansEnum id, String name) { + // id must be old, string must be new + if (int2Id.indexOf(id) < 0) { + throw new IllegalArgumentException("ID must already be stored for \"" + name + '"'); + } else if (string2Id.containsKey(name)) { + throw new IllegalArgumentException('"' + name + "\" already stored for ID " + string2Id.get(name)); // name the ID that already owns this string + } + string2Id.put(name, id); + return id; + } + + public Collection getAliases(PoorMansEnum id, Collection output) { // adds to output every stored name, other than id.name, that maps to id + Iterator it = string2Id.keySet().iterator(); + while (it.hasNext()) { + Object s = it.next(); + if (s.equals(id.name)) continue; // compare by value; == only worked because the key and id.name are the same object + if (id == string2Id.get(s)) output.add(s); + } + return output; + } + + public int getMax() { + return int2Id.size(); + } + + public PoorMansEnum get(int value) { + return (PoorMansEnum) int2Id.get(value); + } + + public PoorMansEnum get(String name) { + return (PoorMansEnum) string2Id.get(name); + } + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/utility/SampleEnum.java b/tools/unicodetools/com/ibm/text/utility/SampleEnum.java new file mode 100644 index 00000000000..39049b16117 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/utility/SampleEnum.java @@ -0,0 +1,76 @@ +/******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/SampleEnum.java,v $ +* $Date: 2002/10/05 01:28:56 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.utility; + +import java.util.*; + +/** Sample Poor-Man's Enum. + * To use as a template, copy and + * <ul> + * <li>replace all instances of "SampleEnum" by your enum's name</li> + * <li>change the enum values to your values</li> + * <li>set any aliases (or remove that section)</li> + * </ul> + */ +public final class SampleEnum extends PoorMansEnum { + private static PoorMansEnum.EnumStore store = new PoorMansEnum.EnumStore(); + + public static final SampleEnum + ALPHA = add("The"), + BETA = add("Quick"), + GAMMA = add("Brown"), + + FIRST = ALPHA; + + static { + store.addAlias(ALPHA, "A"); + } + + /* Boilerplate */ + public SampleEnum next() { return (SampleEnum) next; } + public void getAliases(Collection output) { store.getAliases(this, output); } + public static SampleEnum get(String s) { return (SampleEnum) store.get(s); } + public static SampleEnum get(int v) { return (SampleEnum) store.get(v); } + public static int getMax() { return store.getMax(); } + + private SampleEnum() {} + private static SampleEnum add(String name) { return (SampleEnum) store.add(new SampleEnum(), name);} + + + + /* just for testing */ + public static void test() { + // int to string, collecting strings as we go + Set s = new TreeSet(); + for (int i = 0; i < SampleEnum.getMax(); ++i) { + String n = SampleEnum.get(i).toString(); + System.out.println(i + ", " + n); + s.add(n); + } + // String to int + Iterator it = s.iterator(); + while (it.hasNext()) { + String n = (String)it.next(); + System.out.println(n + ", " + SampleEnum.get(n).toInt()); + } + + // iteration + for (SampleEnum current = FIRST; current != null; current = current.next()) { + s.clear(); + 
current.getAliases(s); + System.out.println(current.toInt() + ", " + current + ", " + s); + } + } + + +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index 57c8a170b2d..149f0e22529 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2002/09/25 06:40:14 $ -* $Revision: 1.25 $ +* $Date: 2002/10/05 01:28:56 $ +* $Revision: 1.26 $ * ******************************************************************************* */ @@ -26,7 +26,7 @@ import com.ibm.text.UCD.*; public final class Utility implements UCD_Types { // COMMON UTILITIES - static final boolean UTF8 = true; // TODO -- make argument + // static final boolean UTF8 = true; // TODO -- make argument public static final char BOM = '\uFEFF'; public static String[] append(String[] array1, String[] array2) { @@ -521,7 +521,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES return "<codepoint hex=\"" + hex(c,1) + "\"/>"; } - if (c <= 0x7E || UTF8) { + if (c <= 0x7E) { return UTF32.valueOf32(c); } @@ -634,17 +634,45 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES } */ - static final byte WINDOWS_MASK = 1, UTF8_MASK = 2; - public static final byte - LATIN1_UNIX = 0, - LATIN1_WINDOWS = WINDOWS_MASK, - UTF8_UNIX = UTF8_MASK, - UTF8_WINDOWS = UTF8_MASK | WINDOWS_MASK; + public static final class Encoding extends PoorMansEnum { + private static PoorMansEnum.EnumStore store = new PoorMansEnum.EnumStore(); + + /* Boilerplate */ + public Encoding next() { return (Encoding) next; } + public void getAliases(Collection output) { store.getAliases(this, output); } + public static Encoding get(String s) { return (Encoding) 
store.get(s); } + public static Encoding get(int v) { return (Encoding) store.get(v); } + public static int getMax() { return store.getMax(); } + + private Encoding() {} + private static Encoding add(String name) { return (Encoding) store.add(new Encoding(), name);} + } + public static final Encoding + LATIN1_UNIX = Encoding.add("LATIN1_UNIX"), + LATIN1_WINDOWS = Encoding.add("LATIN1_WINDOWS"), + UTF8_UNIX = Encoding.add("UTF8_UNIX"), + UTF8_WINDOWS = Encoding.add("UTF8_WINDOWS"), + + UTF8 = Encoding.add("UTF8"), // for read-only + LATIN1 = Encoding.add("LATIN1"), // for read-only + + FIRST = LATIN1_UNIX; + + + /* + public static final Encoding + LATIN1_UNIX = Encoding.LATIN1_UNIX, + LATIN1_WINDOWS = Encoding.LATIN1_WINDOWS, + UTF8_UNIX = Encoding.UTF8_UNIX, + UTF8_WINDOWS = Encoding.UTF8_WINDOWS; + */ + + // Normally use false, false. // But for UCD files use true, true // Or if they are UTF8, use true, false - public static PrintWriter openPrintWriter(String filename, byte options) throws IOException { + public static PrintWriter openPrintWriter(String filename, Encoding options) throws IOException { File file = new File(getOutputName(filename)); Utility.fixDot(); System.out.println("Creating File: " + file.getCanonicalPath()); @@ -655,7 +683,8 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES new UTF8StreamWriter( new FileOutputStream(file), 32*1024, - (options & WINDOWS_MASK) == 0, (options & UTF8_MASK) == 0)); + options == LATIN1_UNIX || options == UTF8_UNIX, + options == LATIN1_UNIX || options == LATIN1_WINDOWS)); } public static String getOutputName(String filename) { @@ -714,13 +743,9 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES } } - public static void appendFile(String filename, boolean utf8, PrintWriter output) throws IOException { - appendFile(filename, utf8, output, null); - } - - public static BufferedReader openReadFile(String filename, boolean UTF8) throws FileNotFoundException, 
UnsupportedEncodingException { + public static BufferedReader openReadFile(String filename, Encoding encoding) throws FileNotFoundException, UnsupportedEncodingException { FileInputStream fis = new FileInputStream(filename); - InputStreamReader isr = UTF8 ? new InputStreamReader(fis, "UTF8") : new InputStreamReader(fis); + InputStreamReader isr = (encoding == UTF8_UNIX || encoding == UTF8_WINDOWS) ? new InputStreamReader(fis, "UTF8") : new InputStreamReader(fis); BufferedReader br = new BufferedReader(isr, 32*1024); return br; } @@ -769,10 +794,17 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES return line; } - public static void appendFile(String filename, boolean utf8, PrintWriter output, String[] replacementList) throws IOException { + public static void appendFile(String filename, Encoding encoding, PrintWriter output) throws IOException { + appendFile(filename, encoding, output, null); + } + + public static void appendFile(String filename, Encoding encoding, PrintWriter output, String[] replacementList) throws IOException { + BufferedReader br = openReadFile(filename, encoding); + /* FileInputStream fis = new FileInputStream(filename); - InputStreamReader isr = utf8 ? new InputStreamReader(fis, "UTF8") : new InputStreamReader(fis); + InputStreamReader isr = (encoding == UTF8_UNIX || encoding == UTF8_WINDOWS) ? 
new InputStreamReader(fis, "UTF8") : new InputStreamReader(fis); BufferedReader br = new BufferedReader(isr, 32*1024); + */ while (true) { String line = br.readLine(); if (line == null) break; @@ -861,20 +893,20 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES return -1; } - public static void copyTextFile(String filename, boolean utf8, String newName, String[] replacementList) throws IOException { + public static void copyTextFile(String filename, Encoding encoding, String newName, String[] replacementList) throws IOException { PrintWriter out = Utility.openPrintWriter(newName, UTF8_WINDOWS); - appendFile(filename, utf8, out, replacementList); + appendFile(filename, encoding, out, replacementList); out.close(); } - public static void copyTextFile(String filename, boolean utf8, String newName) throws IOException { - copyTextFile(filename, utf8, newName, null); + public static void copyTextFile(String filename, Encoding encoding, String newName) throws IOException { + copyTextFile(filename, encoding, newName, null); } - public static BufferedReader openUnicodeFile(String filename, String version, boolean show, boolean UTF8) throws IOException { + public static BufferedReader openUnicodeFile(String filename, String version, boolean show, Encoding encoding) throws IOException { String name = getMostRecentUnicodeDataFile(filename, version, true, show); if (name == null) return null; - return openReadFile(name, UTF8); // new BufferedReader(new FileReader(name),32*1024); + return openReadFile(name, encoding); // new BufferedReader(new FileReader(name),32*1024); } public static String getMostRecentUnicodeDataFile(String filename, String version,