diff --git a/tools/unicodetools/com/ibm/text/UCD/Default.java b/tools/unicodetools/com/ibm/text/UCD/Default.java index ad5365c6ac7..815d7580df8 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Default.java +++ b/tools/unicodetools/com/ibm/text/UCD/Default.java @@ -15,6 +15,7 @@ public final class Default implements UCD_Types { private static Normalizer nfkc; private static Normalizer nfkd; private static Normalizer[] nf = new Normalizer[4]; + private static String year; public static void setUCD(String version) { ucdVersion = version; @@ -37,14 +38,21 @@ public final class Default implements UCD_Types { } static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd', 'HH:mm:ss' GMT'"); + static DateFormat yearFormat = new SimpleDateFormat("yyyy"); + static { myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); + year = yearFormat.format(new Date()); } public static String getDate() { return myDateFormat.format(new Date()); } + public static String getYear() { + return year; + } + public static String ucdVersion() { if (ucd == null) setUCD(); return ucdVersion; @@ -75,4 +83,11 @@ public final class Default implements UCD_Types { return nf[index]; } + /** + * @param lineValue + */ + public static void setYear(String lineValue) { + year = lineValue; + } + } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java index c4f48ac846b..e229463c9cb 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java @@ -34,12 +34,13 @@ import com.ibm.icu.text.SymbolTable; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeMatcher; import com.ibm.icu.text.UnicodeSet; +import com.ibm.text.UCD.MakeUnicodeFiles.Format.PrintStyle; import com.ibm.text.utility.UnicodeDataFile; import com.ibm.text.utility.Utility; import com.ibm.icu.text.Collator; public class MakeUnicodeFiles { - public static int 
dVersion = 6; // change to fix the generated file D version. If less than zero, no "d" + public static int dVersion = -1; // change to fix the generated file D version. If less than zero, no "d" /*static String[] hackNameList = { "noBreak", "Arabic_Presentation_Forms-A", "Arabic_Presentation_Forms-B", @@ -62,8 +63,7 @@ public class MakeUnicodeFiles { static boolean DEBUG = false; public static void main(String[] args) throws IOException { - //generateFile(); - testInvariants(); + generateFile(); } static class Format { @@ -321,6 +321,8 @@ public class MakeUnicodeFiles { } } else if (line.startsWith("DeltaVersion:")) { dVersion = Integer.parseInt(lineValue); + } else if (line.startsWith("CopyrightYear:")) { + Default.setYear(lineValue); } else if (line.startsWith("File:")) { int p2 = lineValue.lastIndexOf('/'); file = lineValue.substring(p2+1); @@ -758,7 +760,7 @@ public class MakeUnicodeFiles { else bf.setPropName(name); if (ps.interleaveValues) { - writeInterleavedValues(pw, bf, prop); + writeInterleavedValues(pw, bf, prop, ps); } else if (prop.isType(UnicodeProperty.STRING_OR_MISC_MASK)) { writeStringValues(pw, bf, prop); //} else if (prop.isType(UnicodeProperty.BINARY_MASK)) { @@ -904,10 +906,10 @@ public class MakeUnicodeFiles { private static void writeInterleavedValues( PrintWriter pw, BagFormatter bf, - UnicodeProperty prop) { + UnicodeProperty prop, PrintStyle ps) { if (DEBUG) System.out.println("Writing Interleaved Values: " + prop.getName()); pw.println(); - bf.setValueSource(new UnicodeProperty.FilteredProperty(prop, new RestoreSpacesFilter())) + bf.setValueSource(new UnicodeProperty.FilteredProperty(prop, new RestoreSpacesFilter(ps))) .setNameSource(null) .setLabelSource(null) .setRangeBreakSource(null) @@ -945,10 +947,20 @@ public class MakeUnicodeFiles { } static class RestoreSpacesFilter extends UnicodeProperty.StringFilter { - public String remap(String original) { + String skipValue; + /** + * @param ps + */ + public RestoreSpacesFilter(PrintStyle 
ps) { + skipValue = ps.skipValue; + if (skipValue == null) skipValue = ps.skipUnassigned; + } + + public String remap(String original) { // ok, because doesn't change length String mod = (String) Format.theFormat.hackMap.get(original); if (mod != null) original = mod; + if (original.equals(skipValue)) return null; return original.replace('_',' '); } } @@ -1147,185 +1159,7 @@ public class MakeUnicodeFiles { } } - /** - * Chain together several SymbolTables. - * @author Davis - */ - static class ChainedSymbolTable implements SymbolTable { - // TODO: add accessors? - private List symbolTables; - /** - * Each SymbolTable is each accessed in order by the other methods, - * so the first in the list is accessed first, etc. - * @param symbolTables - */ - ChainedSymbolTable(SymbolTable[] symbolTables) { - this.symbolTables = Arrays.asList(symbolTables); - } - public char[] lookup(String s) { - for (Iterator it = symbolTables.iterator(); it.hasNext();) { - SymbolTable st = (SymbolTable) it.next(); - char[] result = st.lookup(s); - if (result != null) return result; - } - return null; - } - - public UnicodeMatcher lookupMatcher(int ch) { - for (Iterator it = symbolTables.iterator(); it.hasNext();) { - SymbolTable st = (SymbolTable) it.next(); - UnicodeMatcher result = st.lookupMatcher(ch); - if (result != null) return result; - } - return null; - } - - // Warning: this depends on pos being left alone unless a string is returned!! - public String parseReference(String text, ParsePosition pos, int limit) { - for (Iterator it = symbolTables.iterator(); it.hasNext();) { - SymbolTable st = (SymbolTable) it.next(); - String result = st.parseReference(text, pos, limit); - if (result != null) return result; - } - return null; - } - } - - static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\~ \\= \\! \\? 
\\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]"); - - public static void testInvariants() throws IOException { - String[][] variables = new String[100][2]; - int variableCount = 0; - PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt"); - out.write('\uFEFF'); // BOM - BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt"); - BagFormatter bf = new BagFormatter(); - ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] { - ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"), - ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")}); - ParsePosition pp = new ParsePosition(0); - int parseErrorCount = 0; - int testFailureCount = 0; - while (true) { - String line = in.readLine(); - if (line == null) break; - if (line.startsWith("\uFEFF")) line = line.substring(1); - out.println(line); - line = line.trim(); - int pos = line.indexOf('#'); - if (pos >= 0) line = line.substring(0,pos).trim(); - if (line.length() == 0) continue; - - // fix all the variables - String oldLine = line; - line = Utility.replace(line, variables, variableCount); - - // detect variables - if (line.startsWith("Let")) { - int x = line.indexOf('='); - variables[variableCount][0] = line.substring(3,x).trim(); - variables[variableCount][1] = line.substring(x+1).trim(); - variableCount++; - System.out.println("Added variable: <" + variables[variableCount-1][0] + "><" - + variables[variableCount-1][1] + ">"); - continue; - } - - char relation = 0; - String rightSide = null; - String leftSide = null; - UnicodeSet leftSet = null; - UnicodeSet rightSet = null; - try { - pp.setIndex(0); - leftSet = new UnicodeSet(line, pp, st); - leftSide = line.substring(0,pp.getIndex()); - eatWhitespace(line, pp); - relation = line.charAt(pp.getIndex()); - if (!INVARIANT_RELATIONS.contains(relation)) { - throw new ParseException("Invalid relation, must be one of " + INVARIANT_RELATIONS.toPattern(false), - 
pp.getIndex()); - } - pp.setIndex(pp.getIndex()+1); // skip char - eatWhitespace(line, pp); - int start = pp.getIndex(); - rightSet = new UnicodeSet(line, pp, st); - rightSide = line.substring(start,pp.getIndex()); - eatWhitespace(line, pp); - if (line.length() != pp.getIndex()) { - throw new ParseException("Extra characters at end", pp.getIndex()); - } - } catch (ParseException e) { - out.println("PARSE ERROR:\t" + line.substring(0,e.getErrorOffset()) - + "<@>" + line.substring(e.getErrorOffset())); - out.println(); - out.println("**** START Error Info ****"); - out.println(e.getMessage()); - out.println("**** END Error Info ****"); - out.println(); - parseErrorCount++; - continue; - } catch (IllegalArgumentException e) { - out.println("PARSE ERROR:\t" + line); - out.println(); - out.println("**** START Error Info ****"); - out.println(e.getMessage()); - out.println("**** END Error Info ****"); - out.println(); - parseErrorCount++; - continue; - } - - boolean ok = true; - switch(relation) { - case '=': ok = leftSet.equals(rightSet); break; - case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break; - case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break; - case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break; - case '\u2265': case '\u2287': ok = leftSet.containsAll(rightSet); break; - case '!': ok = leftSet.containsNone(rightSet); break; - case '?': ok = !leftSet.equals(rightSet) - && !leftSet.containsAll(rightSet) - && !rightSet.containsAll(leftSet) - && !leftSet.containsNone(rightSet); - break; - default: throw new IllegalArgumentException("Internal Error"); - } - if (ok) continue; - out.println(); - out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH)); - out.println("**** START Error Info ****"); - bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet); - out.println("**** END Error Info ****"); - out.println(); - testFailureCount++; - } - 
out.println(); - out.println("**** SUMMARY ****"); - out.println(); - out.println("ParseErrorCount=" + parseErrorCount); - out.println("TestFailureCount=" + testFailureCount); - out.close(); - System.out.println("ParseErrorCount=" + parseErrorCount); - System.out.println("TestFailureCount=" + testFailureCount); - } - - /** - * @param line - * @param pp - */ - private static void eatWhitespace(String line, ParsePosition pp) { - int cp = 0; - int i; - for (i = pp.getIndex(); i < line.length(); i += UTF16.getCharCount(cp)) { - cp = UTF16.charAt(line, i); - if (!com.ibm.icu.lang.UCharacter.isUWhiteSpace(cp)) { - break; - } - } - pp.setIndex(i); - } - + /* static class PropertySymbolTable implements SymbolTable { static boolean DEBUG = false; diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt index da50fd4c514..cf17829de5c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt @@ -1,15 +1,16 @@ Generate: -DeltaVersion: 7 +DeltaVersion: 8 +CopyrightYear: 2005 -File: GraphemeClusterBreakProperty +File: uax29/GraphemeBreakProperty Property: Grapheme_Cluster_Break Format: skipValue=Other -File: WordBreakProperty +File: uax29/WordBreakProperty Property: Word_Break Format: skipValue=Other -File: SentenceBreakProperty +File: uax29/SentenceBreakProperty Property: Sentence_Break Format: skipValue=Other @@ -20,9 +21,7 @@ Property: Block # For example, "Latin Extended-A" and "latin extended a" are equivalent. # For more information on the comparison of property values, # see UCD.html. -# -# Code points not explicitly listed in this file are given the value No_Block. 
-Format: valueList +Format: valueList skipUnassigned=No_Block File: CaseFolding Property: SPECIAL @@ -363,7 +362,7 @@ HackName: Supplemental_Arrows-B HackName: Supplementary_Private_Use_Area-A HackName: Supplementary_Private_Use_Area-B HackName: Canadian-Aboriginal -HackName: Old-Italic +#HackName: Old-Italic FinalComments Note that PropertyAliases sorts by the long name, while PropertyValueAliases diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java index 06d9750b6aa..574ed63c991 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestData.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $ -* $Date: 2004/02/18 03:09:00 $ -* $Revision: 1.15 $ +* $Date: 2004/12/11 06:03:08 $ +* $Revision: 1.16 $ * ******************************************************************************* */ @@ -137,8 +137,71 @@ public class TestData implements UCD_Types { log.close(); } } + + static PrintWriter log; + + public static void checkShaping() throws IOException { + log = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "checklog.txt"); + checkProperty("Joining_Type", "Non_Joining", "Joining_Type", "Transparent"); + checkProperty("Joining_Group", "No_Joining_Group", "Joining_Type", "Transparent"); + checkProperty("Line_Break", "Unknown", "Line_Break", "Combining_Mark"); + checkProperty("East_Asian_Width", null, "Line_Break", "Combining_Mark"); + checkProperty("Bidi_Class", null, "Line_Break", "Combining_Mark"); + checkProperty("Script", null, "Script", new String[]{"Common", "Inherited"}); + checkProperty("General_Category", null, "General_Category", new String[]{"Spacing_Mark", + "Enclosing_Mark", "Nonspacing_Mark"}); + log.close(); + } - public static class RegexMatcher implements UnicodeProperty.Matcher { + /** + * @param propertyName + * @param 
exclusion value of propertyName whose code points are skipped (null = check every code point) + * @param ignorePropertyName property whose value decides whether a decomposition code point is ignored + * @param ignoreValueList a String, or an Object[] of Strings, naming the ignorePropertyName values to ignore + */ + private static void checkProperty(String propertyName, String exclusion, String ignorePropertyName, Object ignoreValueList) { + log.println(); + log.println(propertyName + " Check"); + log.println(); + Set ignoreValueSet = new HashSet(); + if (ignoreValueList instanceof String) ignoreValueSet.add(ignoreValueList); + else ignoreValueSet.addAll(Arrays.asList((Object[])ignoreValueList)); + + ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("4.0.1"); + UnicodeProperty up = ups.getProperty(propertyName); + UnicodeProperty ignProp = ups.getProperty(ignorePropertyName); + UnicodeProperty name = ups.getProperty("Name"); + UnicodeSet significant = (exclusion != null ? up.getSet(exclusion) : new UnicodeSet()).complement(); + UnicodeSetIterator it = new UnicodeSetIterator(significant); + Normalizer n = new Normalizer(Normalizer.NFD, "4.0.1"); + int counter = 0; + while (it.next()) { + String baseValue = up.getValue(it.codepoint); + String nfd = n.normalize(it.codepoint); + if (n.isNormalized(it.codepoint)) continue; + //if (nfd.equals(it.getString())) continue; + boolean shown = false; // per source code point, so the CONFLICT header is logged (and counted) only once + int cp; + for (int i = 0; i < nfd.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(nfd, i); + String newValue = up.getValue(cp); + String possIgnValue = ignProp.getValue(cp); + if (ignoreValueSet.contains(possIgnValue)) { + //log.println("--- " + newValue + "\t" + Utility.hex(cp) + " " + name.getValue(cp)); + continue; + } + //log.println("*** " + newValue + "\t" + Utility.hex(cp) + " " + name.getValue(cp)); + + if (!baseValue.equals(newValue)) { + if (!shown) log.println((++counter) + "\tCONFLICT\t" + baseValue + "\t" + Utility.hex(it.codepoint) + " " + name.getValue(it.codepoint)); + log.println("\tNFD(" + Utility.hex(it.codepoint) + ") contains:\t" + newValue + "\t" + Utility.hex(cp) + " " + name.getValue(cp)); + shown = true; + } + } + } + } + + public static class RegexMatcher implements
UnicodeProperty.Matcher { private Matcher matcher; public UnicodeProperty.Matcher set(String pattern) { diff --git a/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java b/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java new file mode 100644 index 00000000000..a3136148903 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java @@ -0,0 +1,203 @@ +package com.ibm.text.UCD; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.text.ParseException; +import java.text.ParsePosition; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; + +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.text.SymbolTable; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeMatcher; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.text.utility.Utility; + +public class TestUnicodeInvariants { + + public static void main(String[] args) throws IOException { + testInvariants(); + } + + /** + * Chain together several SymbolTables. + * @author Davis + */ + static class ChainedSymbolTable implements SymbolTable { + // TODO: add accessors? + private List symbolTables; + /** + * Each SymbolTable is each accessed in order by the other methods, + * so the first in the list is accessed first, etc. 
+ * @param symbolTables + */ + ChainedSymbolTable(SymbolTable[] symbolTables) { + this.symbolTables = Arrays.asList(symbolTables); + } + public char[] lookup(String s) { + for (Iterator it = symbolTables.iterator(); it.hasNext();) { + SymbolTable st = (SymbolTable) it.next(); + char[] result = st.lookup(s); + if (result != null) return result; + } + return null; + } + + public UnicodeMatcher lookupMatcher(int ch) { + for (Iterator it = symbolTables.iterator(); it.hasNext();) { + SymbolTable st = (SymbolTable) it.next(); + UnicodeMatcher result = st.lookupMatcher(ch); + if (result != null) return result; + } + return null; + } + + // Warning: this depends on pos being left alone unless a string is returned!! + public String parseReference(String text, ParsePosition pos, int limit) { + for (Iterator it = symbolTables.iterator(); it.hasNext();) { + SymbolTable st = (SymbolTable) it.next(); + String result = st.parseReference(text, pos, limit); + if (result != null) return result; + } + return null; + } + } + + static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\~ \\= \\! \\? 
\\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]"); + + public static void testInvariants() throws IOException { + String[][] variables = new String[100][2]; + int variableCount = 0; + PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt"); + out.write('\uFEFF'); // BOM + BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt"); + BagFormatter bf = new BagFormatter(); + ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] { + ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"), + ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")}); + ParsePosition pp = new ParsePosition(0); + int parseErrorCount = 0; + int testFailureCount = 0; + while (true) { + String line = in.readLine(); + if (line == null) break; + if (line.startsWith("\uFEFF")) line = line.substring(1); + out.println(line); + line = line.trim(); + int pos = line.indexOf('#'); + if (pos >= 0) line = line.substring(0,pos).trim(); + if (line.length() == 0) continue; + + // fix all the variables + String oldLine = line; + line = Utility.replace(line, variables, variableCount); + + // detect variables + if (line.startsWith("Let")) { + int x = line.indexOf('='); + variables[variableCount][0] = line.substring(3,x).trim(); + variables[variableCount][1] = line.substring(x+1).trim(); + variableCount++; + if (false) System.out.println("Added variable: <" + variables[variableCount-1][0] + "><" + + variables[variableCount-1][1] + ">"); + continue; + } + + char relation = 0; + String rightSide = null; + String leftSide = null; + UnicodeSet leftSet = null; + UnicodeSet rightSet = null; + try { + pp.setIndex(0); + leftSet = new UnicodeSet(line, pp, st); + leftSide = line.substring(0,pp.getIndex()); + eatWhitespace(line, pp); + relation = line.charAt(pp.getIndex()); + if (!INVARIANT_RELATIONS.contains(relation)) { + throw new ParseException("Invalid relation, must be one of " + INVARIANT_RELATIONS.toPattern(false), + 
pp.getIndex()); + } + pp.setIndex(pp.getIndex()+1); // skip char + eatWhitespace(line, pp); + int start = pp.getIndex(); + rightSet = new UnicodeSet(line, pp, st); + rightSide = line.substring(start,pp.getIndex()); + eatWhitespace(line, pp); + if (line.length() != pp.getIndex()) { + throw new ParseException("Extra characters at end", pp.getIndex()); + } + } catch (ParseException e) { + out.println("PARSE ERROR:\t" + line.substring(0,e.getErrorOffset()) + + "<@>" + line.substring(e.getErrorOffset())); + out.println(); + out.println("**** START Error Info ****"); + out.println(e.getMessage()); + out.println("**** END Error Info ****"); + out.println(); + parseErrorCount++; + continue; + } catch (IllegalArgumentException e) { + out.println("PARSE ERROR:\t" + line); + out.println(); + out.println("**** START Error Info ****"); + out.println(e.getMessage()); + out.println("**** END Error Info ****"); + out.println(); + parseErrorCount++; + continue; + } + + boolean ok = true; + switch(relation) { + case '=': ok = leftSet.equals(rightSet); break; + case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break; + case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break; + case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break; + case '\u2265': case '\u2287': ok = leftSet.containsAll(rightSet); break; + case '!': ok = leftSet.containsNone(rightSet); break; + case '?': ok = !leftSet.equals(rightSet) + && !leftSet.containsAll(rightSet) + && !rightSet.containsAll(leftSet) + && !leftSet.containsNone(rightSet); + break; + default: throw new IllegalArgumentException("Internal Error"); + } + if (ok) continue; + out.println(); + out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH)); + out.println("**** START Error Info ****"); + bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet); + out.println("**** END Error Info ****"); + out.println(); + testFailureCount++; + } + 
out.println(); + out.println("**** SUMMARY ****"); + out.println(); + out.println("ParseErrorCount=" + parseErrorCount); + out.println("TestFailureCount=" + testFailureCount); + out.close(); + System.out.println("ParseErrorCount=" + parseErrorCount); + System.out.println("TestFailureCount=" + testFailureCount); + } + + /** + * @param line + * @param pp + */ + private static void eatWhitespace(String line, ParsePosition pp) { + int cp = 0; + int i; + for (i = pp.getIndex(); i < line.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(line, i); + if (!com.ibm.icu.lang.UCharacter.isUWhiteSpace(cp)) { + break; + } + } + pp.setIndex(i); + } +} diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 2396b895832..a0813e20a60 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2004/11/13 23:10:32 $ -* $Revision: 1.36 $ +* $Date: 2004/12/11 06:03:08 $ +* $Revision: 1.37 $ * ******************************************************************************* */ @@ -1094,6 +1094,7 @@ public final class UCD implements UCD_Types { if (ch <= 0x4DB5) return 0x3400; if (ch <= 0x4E00) return ch; // CJK Ideograph if (ch <= 0x9FA5) return 0x4E00; + if (ch <= 0x9FBB && rCompositeVersion >= 0x40100) return 0x4E00; if (ch <= 0xAC00) return ch; // Hangul Syllable if (ch <= 0xD7A3) return 0xAC00; if (ch <= 0xD800) return ch; // Non Private Use High Surrogate diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index b0f96b57f6c..61658b1839d 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ 
******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2004/11/13 23:10:32 $ -* $Revision: 1.29 $ +* $Date: 2004/12/11 06:03:08 $ +* $Revision: 1.30 $ * ******************************************************************************* */ @@ -318,7 +318,7 @@ final class UCD_Names implements UCD_Types { "BOPOMOFO", // BOPOMOFO "HAN", // HAN "YI", // YI - "OLD-ITALIC", + "OLD_ITALIC", "GOTHIC", "DESERET", "INHERITED", // nonspacing marks @@ -335,6 +335,14 @@ final class UCD_Names implements UCD_Types { "CYPRIOT", "BRAILLE", "KATAKANA_OR_HIRAGANA", + "BUGINESE", + "COPTIC", + "NEW_TAI_LUE", + "GLAGOLITIC", + "TIFINAGH", + "SYLOTI_NAGRI", + "OLD_PERSIAN", + "KHAROSHTHI", }; @@ -403,6 +411,14 @@ final class UCD_Names implements UCD_Types { "Cprt", "Brai", "Hrkt", + "Bugi", + "Copt", + "Talu", + "Glag", + "Tfng", + "Sylo", + "Xpeo", + "Khar", }; diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index 88c0869abef..28439cc037c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2004/11/13 23:10:32 $ -* $Revision: 1.29 $ +* $Date: 2004/12/11 06:03:08 $ +* $Revision: 1.30 $ * ******************************************************************************* */ @@ -390,7 +390,16 @@ public interface UCD_Types { CYPRIOT = 52, BRAILLE = 53, KATAKANA_OR_HIRAGANA = 54, - LIMIT_SCRIPT = 55; + BUGINESE = 55, + COPTIC = 56, + NEW_TAI_LUE = 57, + GLAGOLITIC = 58, + TIFINAGH = 59, + SYLOTI_NAGRI = 60, + OLD_PERSIAN = 61, + KHAROSHTHI = 62, + + LIMIT_SCRIPT = 63; static final int UNKNOWN = 0, diff --git a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt 
b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt index ac528434a88..31e899a3f6f 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt +++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt @@ -41,7 +41,7 @@ #$East_Asian_Width:Neutral ? $GC:Uppercase_Letter $GC:Zs ? $Name:«.*SPACE.*» -[$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126] +# [$script:greek&$gc:«.*letter.*»] = 
[;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126] # Examples of parsing errors @@ -56,6 +56,10 @@ $Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse] $LB:OP = $GC:Ps $General_Category:Decimal_Number = $Numeric_Type:Decimal $Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl] +$Dash ⊃ [$GC:Pd] +$Script:Common ! [$GC:Mn $GC:Me] +$Script:Common ! 
[$Alphabetic - $Math] +$Alphabetic ⊃ [$Uppercase $Lowercase] # Comparisons across versions @@ -71,7 +75,7 @@ $Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic] $Lowercase = [$GC:Ll $Other_Lowercase] $Uppercase = [$GC:Lu $Other_Uppercase] $ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start] -$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc] $Other_ID_Continue +$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc $Other_ID_Continue] $Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]] $Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend] $Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend] @@ -87,8 +91,8 @@ $Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]] # Testing -$script:greek = $×script:greek -$gc:lm = $script:inherited +# $script:greek = $×script:greek +# $gc:lm = $script:inherited # =========================== @@ -110,7 +114,7 @@ Let $ZWJ = [\u200D] # [\N{ZERO WIDTH JOINER}] Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation] Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol] Let $gcAllMarks = [$gc:Nonspacing_Mark $gc:Enclosing_Mark $gc:Spacing_Mark] -Let $strange = [\u24B6-\u24CF] +Let $strange = [\u24B6-\u24E9] # Unassigned, Control, Format, Private_Use, Surrogate, # Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter, @@ -122,9 +126,9 @@ Let $strange = [\u24B6-\u24CF] # UTS Rules -Let $alpha = [$Alphabetic $Lowercase] # $Uppercase $ZWNJ $ZWJ] +Let $alpha = [$Alphabetic $strange] # $Uppercase $ZWNJ $ZWJ] Let $lower = $Lowercase -Let $upper = [$Uppercase - $strange] 
+Let $upper = [$Uppercase] Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha] Let $digit = $gc:Decimal_Number Let $xdigit = [$gc:Decimal_Number $Hex_Digit] # in both! @@ -132,7 +136,7 @@ Let $alnum = [$alpha $digit] Let $space = $Whitespace Let $blank = [$Whitespace - [$LF $VTAB $FF $CR $NEL $gc:Line_Separator $gc:Paragraph_Separator]] Let $cntrl = $gc:Control -Let $graph = [^$space $gc:Control $gc:Format $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ] +Let $graph = [^$space $gc:Control $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ] Let $print = [$graph $blank - $cntrl] Let $word = [$alpha $gcAllMarks $digit $gc:Connector_Punctuation] diff --git a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java index fd47781d715..6bc06639392 100644 --- a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java +++ b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java @@ -29,7 +29,7 @@ public class UnicodeDataFile { result.out.println(generateDateLine()); result.out.println("#"); result.out.println("# Unicode Character Database"); - result.out.println("# Copyright (c) 1991-2004 Unicode, Inc."); + result.out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc."); result.out.println( "# For terms of use, see http://www.unicode.org/terms_of_use.html"); result.out.println("# For documentation, see UCD.html");