From 641a6d6d79dc61c93dc6d4dbdf67ce5f03aef2dd Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Sat, 26 Mar 2005 05:40:05 +0000 Subject: [PATCH] ICU-0 update for U4.1.0 X-SVN-Rev: 17400 --- .../com/ibm/text/UCD/BlocksHeader.txt | 2 + .../com/ibm/text/UCD/CaseFoldingHeader.txt | 1 + .../com/ibm/text/UCD/GenerateCaseFolding.java | 21 +++++--- .../com/ibm/text/UCD/GenerateData.java | 32 +++++++----- .../com/ibm/text/UCD/MakeUnicodeFiles.txt | 13 +++-- .../ibm/text/UCD/NormalizationTestHeader.txt | 32 ++++++++++++ .../ibm/text/UCD/PropertyAliasesHeader.txt | 1 + .../text/UCD/PropertyValueAliasesHeader.txt | 1 + .../com/ibm/text/UCD/SpecialCasingHeader.txt | 1 + .../com/ibm/text/UCD/TestData.java | 52 +++++++++++-------- .../text/UCD/ToolUnicodePropertySource.java | 9 ++-- .../com/ibm/text/utility/UnicodeDataFile.java | 7 ++- 12 files changed, 118 insertions(+), 54 deletions(-) create mode 100644 tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt diff --git a/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt b/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt index ad4ee67b6a2..1f1a02b7761 100644 --- a/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/BlocksHeader.txt @@ -1,4 +1,6 @@ +# # Note: The casing of block names is not normative. # For example, "Basic Latin" and "BASIC LATIN" are equivalent. +# # Format: # Start Code..End Code; Block Name diff --git a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt index ca8da1ac349..ef6ad4e18fa 100644 --- a/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt @@ -1,3 +1,4 @@ +# # Case Folding Properties # # This file is a supplement to the UnicodeData file. diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java index 6e2d6382efe..772ec22a7c0 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $ -* $Date: 2004/02/12 08:23:15 $ -* $Revision: 1.16 $ +* $Date: 2005/03/26 05:40:04 $ +* $Revision: 1.17 $ * ******************************************************************************* */ @@ -574,14 +574,19 @@ public class GenerateCaseFolding implements UCD_Types { log.close(); System.out.println("Writing"); - String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true); - PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); - String[] batName = {""}; + //String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true); + //PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); + + UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", "SpecialCasing" + suffix2); + PrintWriter out = udf.out; + + /* String[] batName = {""}; String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "SpecialCasing", suffix2 + UnicodeDataFile.getFileSuffix(true), batName); out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false)); out.println(UnicodeDataFile.generateDateLine()); out.println("#"); Utility.appendFile("SpecialCasingHeader.txt", Utility.UTF8, out); +*/ Iterator it = sorted.keySet().iterator(); int lastOrder = -1; @@ -612,8 +617,8 @@ public class GenerateCaseFolding implements UCD_Types { } out.println(line); } - Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out); - out.close(); - Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); + //Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out); + udf.close(); + //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); } } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java index 8d7e1de3f75..4019fec01fa 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateData.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $ -* $Date: 2005/03/10 02:37:19 $ -* $Revision: 1.37 $ +* $Date: 2005/03/26 05:40:04 $ +* $Revision: 1.38 $ * ******************************************************************************* */ @@ -744,16 +744,19 @@ public class GenerateData implements UCD_Types { static public void writeNormalizerTestSuite(String directory, String fileName) throws IOException { + UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, fileName); + PrintWriter log = fc.out; + String newFile = directory + fileName + UnicodeDataFile.getFileSuffix(true); - PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX); - String[] batName = {""}; - String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName); + //PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX); + //String[] batName = {""}; + //String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName); String[] example = new String[256]; - log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false)); - log.println(UnicodeDataFile.generateDateLine()); - log.println("#"); + //log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false)); + //log.println(UnicodeDataFile.generateDateLine()); + /*log.println("#"); log.println("# Normalization Test Suite"); log.println("# Format:"); log.println("#"); @@ -787,7 +790,7 @@ public class GenerateData implements UCD_Types { log.println("#"); log.println("@Part0 # Specific cases"); - log.println("#"); + log.println("#");*/ for (int j = 0; j < testSuiteCases.length; ++j) { writeLine(testSuiteCases[j], log, false); @@ -891,8 +894,8 @@ public class GenerateData implements UCD_Types { Utility.fixDot(); log.println("#"); log.println("# END OF FILE"); - log.close(); - Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); + fc.close(); + //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); } static void handleIdentical() throws IOException { @@ -942,12 +945,13 @@ public class GenerateData implements UCD_Types { // not recursive!!! static final String comma(String s) { + //if (true) return s; commaResult.setLength(0); int cp; - for (int i = 0; i < s.length(); i += UTF32.count16(i)) { - cp = UTF32.char32At(s, i); + for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(s, i); if (Default.ucd().getCategory(cp) == Mn) commaResult.append('\u25CC'); - UTF32.append32(commaResult, cp); + UTF16.append(commaResult, cp); } return commaResult.toString(); } diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt index 16fc4c750b1..5067563f555 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.txt @@ -1,5 +1,5 @@ -Generate: Derived.* -DeltaVersion: 12 +Generate: +DeltaVersion: 13 CopyrightYear: 2005 File: auxiliary/GraphemeBreakProperty @@ -58,6 +58,13 @@ Value: 4.1 File: extracted/DerivedBidiClass Property: Bidi_Class # Bidi Class (listing UnicodeData.txt, field 4: see UCD.html) +# Unlike other properties, unassigned code points in blocks reserved for right-to-left scripts are given either types R or AL. +# The unassigned characters that default to R are: +# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF +# The unassigned characters that default to AL are: +# Arabic, Syriac, Thaana, Arabic_Presentation_Forms_A, Arabic_Presentation_Forms_B, Arabic_Supplement, +# and the range \u0750-\u077F, minus the Noncharacter_Code_Points +# For all other cases: Format: valueStyle=short skipUnassigned=Left_To_Right File: extracted/DerivedBinaryProperties @@ -67,8 +74,6 @@ Property: Bidi_Mirrored File: extracted/DerivedCombiningClass Property: Canonical_Combining_Class # Combining Class (listing UnicodeData.txt, field 3: see UCD.html) -# All code points not explicitly listed in this file have the property -# value: 0. Format: nameStyle=none valueStyle=short skipUnassigned=Not_Reordered File: DerivedCoreProperties diff --git a/tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt b/tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt new file mode 100644 index 00000000000..32aa458b912 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/NormalizationTestHeader.txt @@ -0,0 +1,32 @@ +# +# Normalization Test Suite +# Format: +# +# Columns (c1, c2,...) are separated by semicolons +# Comments are indicated with hash marks +# +# CONFORMANCE: +# 1. The following invariants must be true for all conformant implementations +# +# NFC +# c2 == NFC(c1) == NFC(c2) == NFC(c3) +# c4 == NFC(c4) == NFC(c5) +# +# NFD +# c3 == NFD(c1) == NFD(c2) == NFD(c3) +# c5 == NFD(c4) == NFD(c5) +# +# NFKC +# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5) +# +# NFKD +# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5) +# +# 2. For every code point X assigned in this version of Unicode that is not specifically +# listed in Part 1, the following invariants must be true for all conformant +# implementations: +# +# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X) +# +@Part0 # Specific cases +# \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt index f502de853f6..50fef4b0cff 100644 --- a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasesHeader.txt @@ -1,3 +1,4 @@ +# # This file contains aliases for properties used in the UCD. # These names can be used for XML formats of UCD data, for regular-expression # property tests, and other programmatic textual descriptions of Unicode data. diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt index 0e9d5bec886..282326d9a2c 100644 --- a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasesHeader.txt @@ -1,3 +1,4 @@ +# # This file contains aliases for property values used in the UCD. # These names can be used for XML formats of UCD data, for regular-expression # property tests, and other programmatic textual descriptions of Unicode data. diff --git a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt index fcf77089488..0fcfa85a34e 100644 --- a/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt +++ b/tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt @@ -1,3 +1,4 @@ +# # Special Casing Properties # # This file is a supplement to the UnicodeData file. diff --git a/tools/unicodetools/com/ibm/text/UCD/TestData.java b/tools/unicodetools/com/ibm/text/UCD/TestData.java index 5932c04c34f..81135b10ddb 100644 --- a/tools/unicodetools/com/ibm/text/UCD/TestData.java +++ b/tools/unicodetools/com/ibm/text/UCD/TestData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $ -* $Date: 2005/03/10 02:37:19 $ -* $Revision: 1.18 $ +* $Date: 2005/03/26 05:40:05 $ +* $Revision: 1.19 $ * ******************************************************************************* */ @@ -151,7 +151,12 @@ public class TestData implements UCD_Types { static class GenStringPrep { UnicodeSet[] coreChars = new UnicodeSet[100]; - UnicodeSet[] decompChars = new UnicodeSet[100]; + UnicodeSet decomposable = new UnicodeSet(); + UnicodeSet pattern = new UnicodeSet(); + ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(""); + //UnicodeSet id_continue = ups.getSet("ID_Continue=true"); + UnicodeSet xid_continue = ups.getSet("XID_Continue=true"); + //UnicodeSet[] decompChars = new UnicodeSet[100]; UCD ucd = Default.ucd(); Collator uca = Collator.getInstance(ULocale.ENGLISH); @@ -167,10 +172,13 @@ public class TestData implements UCD_Types { void genStringPrep() throws IOException { + //BagFormatter bf = new BagFormatter(); + //System.out.println(bf.showSetDifferences("ID_Continue", id_continue, "XID_Continue", xid_continue)); StringBuffer inbuffer = new StringBuffer(); StringBuffer intermediate, outbuffer; for (int cp = 0; cp <= 0x10FFFF; ++cp) { Utility.dot(cp); + if (!Default.nfd().isNormalized(cp)) decomposable.add(cp); inbuffer.setLength(0); UTF16.append(inbuffer, cp); try { @@ -189,15 +197,9 @@ public class TestData implements UCD_Types { if (!TestData.equals(inbuffer, outbuffer)) continue; int script = ucd.getScript(cp); - if (!Default.nfd().isNormalized(cp)) { - if (decompChars[script] == null) - decompChars[script] = new UnicodeSet(); - decompChars[script].add(cp); - } else { - if (coreChars[script] == null) - coreChars[script] = new UnicodeSet(); - coreChars[script].add(cp); - } + if (coreChars[script] == null) + coreChars[script] = new UnicodeSet(); + coreChars[script].add(cp); } // find characters with no uppercase for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) { @@ -212,8 +214,11 @@ public class TestData implements UCD_Types { .println(""); out.println("IDN Characters"); @@ -240,15 +245,16 @@ public class TestData implements UCD_Types { * @param scriptCode */ private void showCodes(PrintWriter out, int scriptCode) { - if (coreChars[scriptCode] == null - && decompChars[scriptCode] == null) - return; + if (coreChars[scriptCode] == null) return; System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode)); - String script = Default.ucd().getScriptID_fromIndex( - (byte) scriptCode); + String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode); out.println(); out.println(""); UnicodeSet core = new UnicodeSet(coreChars[scriptCode]); + UnicodeSet decomp = new UnicodeSet(core).retainAll(decomposable); + core.removeAll(decomp); + UnicodeSet non_id = new UnicodeSet(core).removeAll(xid_continue); + core.removeAll(non_id); UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper); core.removeAll(otherCore); if (core.size() == 0) { @@ -257,9 +263,9 @@ public class TestData implements UCD_Types { otherCore = temp; } printlnSet(out, "Atomic", core, scriptCode); - if (otherCore.size() != 0) printlnSet(out, "Atomic [noUpper]", otherCore, scriptCode); - UnicodeSet decomp = decompChars[scriptCode]; - if (decomp != null && decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode); + if (otherCore.size() != 0) printlnSet(out, "Atomic-no-uppercase", otherCore, scriptCode); + if (non_id.size() != 0) printlnSet(out, "Non-ID", non_id, scriptCode); + if (decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode); } /** @@ -277,7 +283,7 @@ public class TestData implements UCD_Types { && unicodeset.containsNone(bidiL) ? " dir='rtl'" : ""; out.println(""); - out.print(""); + out.print("
Script: " + script + "
" + title + " (" + nf.format(size) + ")
"); UnicodeSetIterator usi = new UnicodeSetIterator(); if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) { usi.reset(unicodeset); diff --git a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java index 1aafeb37581..9d951c0a651 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java +++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java @@ -264,7 +264,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { unicodeMap.putAll(lineBreak.getSet("Infix_Numeric") .remove(0x003A), "MidNum"); unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric"); - unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "Numeric"); + unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "ExtendNumLet"); unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it. unicodeMap.setMissing("Other"); } @@ -479,9 +479,10 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory { public List _getValueAliases(String valueAlias, List result) { if (result == null) result = new ArrayList(); int type = getType() & CORE_MASK; - if (type == STRING || type == MISC) return result; - else if (type == NUMERIC) return result; - else if (type == BINARY) { + if (type == STRING || type == MISC || type == NUMERIC) { + UnicodeProperty.addUnique(valueAlias, result); + return result; + } else if (type == BINARY) { UnicodeProperty.addUnique(valueAlias, result); return lookup(valueAlias, UCD_Names.YN_TABLE_LONG, UCD_Names.YN_TABLE, null, result); } else if (type == ENUMERATED || type == CATALOG) { diff --git a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java index 6bc06639392..c15ed90343d 100644 --- a/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java +++ b/tools/unicodetools/com/ibm/text/utility/UnicodeDataFile.java @@ -15,15 +15,17 @@ public class UnicodeDataFile { private String newFile; private String batName; private String mostRecent; + private String filename; private UnicodeDataFile(){}; public static UnicodeDataFile openAndWriteHeader(String directory, String filename) throws IOException { UnicodeDataFile result = new UnicodeDataFile(); result.newFile = directory + filename + UnicodeDataFile.getFileSuffix(true); - result.out = Utility.openPrintWriter(result.newFile, Utility.LATIN1_UNIX); + result.out = Utility.openPrintWriter(result.newFile, Utility.UTF8_UNIX); String[] batName = {""}; result.mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName); result.batName = batName[0]; + result.filename = filename; result.out.println("# " + filename + UnicodeDataFile.getFileSuffix(false)); result.out.println(generateDateLine()); @@ -50,6 +52,9 @@ public class UnicodeDataFile { } public void close() throws IOException { + try { + Utility.appendFile(filename + "Footer.txt", Utility.LATIN1, out); + } catch (FileNotFoundException e) {} out.close(); Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName); }