ICU-0 update for U4.1.0

X-SVN-Rev: 17400
This commit is contained in:
Mark Davis 2005-03-26 05:40:05 +00:00
parent 599dbb508c
commit 641a6d6d79
12 changed files with 118 additions and 54 deletions

View file

@ -1,4 +1,6 @@
#
# Note: The casing of block names is not normative.
# For example, "Basic Latin" and "BASIC LATIN" are equivalent.
#
# Format:
# Start Code..End Code; Block Name

View file

@ -1,3 +1,4 @@
#
# Case Folding Properties
#
# This file is a supplement to the UnicodeData file.

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2004/02/12 08:23:15 $
* $Revision: 1.16 $
* $Date: 2005/03/26 05:40:04 $
* $Revision: 1.17 $
*
*******************************************************************************
*/
@ -574,14 +574,19 @@ public class GenerateCaseFolding implements UCD_Types {
log.close();
System.out.println("Writing");
String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String[] batName = {""};
//String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true);
//PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader("DerivedData/", "SpecialCasing" + suffix2);
PrintWriter out = udf.out;
/* String[] batName = {""};
String mostRecent = UnicodeDataFile.generateBat("DerivedData/", "SpecialCasing", suffix2 + UnicodeDataFile.getFileSuffix(true), batName);
out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false));
out.println(UnicodeDataFile.generateDateLine());
out.println("#");
Utility.appendFile("SpecialCasingHeader.txt", Utility.UTF8, out);
*/
Iterator it = sorted.keySet().iterator();
int lastOrder = -1;
@ -612,8 +617,8 @@ public class GenerateCaseFolding implements UCD_Types {
}
out.println(line);
}
Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
out.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
//Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
udf.close();
//Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2005/03/10 02:37:19 $
* $Revision: 1.37 $
* $Date: 2005/03/26 05:40:04 $
* $Revision: 1.38 $
*
*******************************************************************************
*/
@ -744,16 +744,19 @@ public class GenerateData implements UCD_Types {
static public void writeNormalizerTestSuite(String directory, String fileName) throws IOException {
UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, fileName);
PrintWriter log = fc.out;
String newFile = directory + fileName + UnicodeDataFile.getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
String[] batName = {""};
String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName);
//PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
//String[] batName = {""};
//String mostRecent = UnicodeDataFile.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName);
String[] example = new String[256];
log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false));
log.println(UnicodeDataFile.generateDateLine());
log.println("#");
//log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false));
//log.println(UnicodeDataFile.generateDateLine());
/*log.println("#");
log.println("# Normalization Test Suite");
log.println("# Format:");
log.println("#");
@ -787,7 +790,7 @@ public class GenerateData implements UCD_Types {
log.println("#");
log.println("@Part0 # Specific cases");
log.println("#");
log.println("#");*/
for (int j = 0; j < testSuiteCases.length; ++j) {
writeLine(testSuiteCases[j], log, false);
@ -891,8 +894,8 @@ public class GenerateData implements UCD_Types {
Utility.fixDot();
log.println("#");
log.println("# END OF FILE");
log.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
fc.close();
//Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
static void handleIdentical() throws IOException {
@ -942,12 +945,13 @@ public class GenerateData implements UCD_Types {
// not recursive!!!
static final String comma(String s) {
//if (true) return s;
commaResult.setLength(0);
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(i)) {
cp = UTF32.char32At(s, i);
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(s, i);
if (Default.ucd().getCategory(cp) == Mn) commaResult.append('\u25CC');
UTF32.append32(commaResult, cp);
UTF16.append(commaResult, cp);
}
return commaResult.toString();
}

View file

@ -1,5 +1,5 @@
Generate: Derived.*
DeltaVersion: 12
Generate:
DeltaVersion: 13
CopyrightYear: 2005
File: auxiliary/GraphemeBreakProperty
@ -58,6 +58,13 @@ Value: 4.1
File: extracted/DerivedBidiClass
Property: Bidi_Class
# Bidi Class (listing UnicodeData.txt, field 4: see UCD.html)
# Unlike other properties, unassigned code points in blocks reserved for right-to-left scripts are assigned either type R or type AL.
# The unassigned characters that default to R are:
# Hebrew, Cypriot_Syllabary, Kharoshthi, and the ranges \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF
# The unassigned characters that default to AL are:
# Arabic, Syriac, Thaana, Arabic_Presentation_Forms_A, Arabic_Presentation_Forms_B, Arabic_Supplement,
# and the range \u0750-\u077F, minus the Noncharacter_Code_Points
# For all other cases:
Format: valueStyle=short skipUnassigned=Left_To_Right
File: extracted/DerivedBinaryProperties
@ -67,8 +74,6 @@ Property: Bidi_Mirrored
File: extracted/DerivedCombiningClass
Property: Canonical_Combining_Class
# Combining Class (listing UnicodeData.txt, field 3: see UCD.html)
# All code points not explicitly listed in this file have the property
# value: 0.
Format: nameStyle=none valueStyle=short skipUnassigned=Not_Reordered
File: DerivedCoreProperties

View file

@ -0,0 +1,32 @@
#
# Normalization Test Suite
# Format:
#
# Columns (c1, c2,...) are separated by semicolons
# Comments are indicated with hash marks
#
# CONFORMANCE:
# 1. The following invariants must be true for all conformant implementations
#
# NFC
# c2 == NFC(c1) == NFC(c2) == NFC(c3)
# c4 == NFC(c4) == NFC(c5)
#
# NFD
# c3 == NFD(c1) == NFD(c2) == NFD(c3)
# c5 == NFD(c4) == NFD(c5)
#
# NFKC
# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
#
# NFKD
# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
#
# 2. For every code point X assigned in this version of Unicode that is not specifically
# listed in Part 1, the following invariants must be true for all conformant
# implementations:
#
# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
#
@Part0 # Specific cases
#

View file

@ -1,3 +1,4 @@
#
# This file contains aliases for properties used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
# property tests, and other programmatic textual descriptions of Unicode data.

View file

@ -1,3 +1,4 @@
#
# This file contains aliases for property values used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
# property tests, and other programmatic textual descriptions of Unicode data.

View file

@ -1,3 +1,4 @@
#
# Special Casing Properties
#
# This file is a supplement to the UnicodeData file.

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2005/03/10 02:37:19 $
* $Revision: 1.18 $
* $Date: 2005/03/26 05:40:05 $
* $Revision: 1.19 $
*
*******************************************************************************
*/
@ -151,7 +151,12 @@ public class TestData implements UCD_Types {
static class GenStringPrep {
UnicodeSet[] coreChars = new UnicodeSet[100];
UnicodeSet[] decompChars = new UnicodeSet[100];
UnicodeSet decomposable = new UnicodeSet();
UnicodeSet pattern = new UnicodeSet();
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
//UnicodeSet[] decompChars = new UnicodeSet[100];
UCD ucd = Default.ucd();
Collator uca = Collator.getInstance(ULocale.ENGLISH);
@ -167,10 +172,13 @@ public class TestData implements UCD_Types {
void genStringPrep() throws IOException {
//BagFormatter bf = new BagFormatter();
//System.out.println(bf.showSetDifferences("ID_Continue", id_continue, "XID_Continue", xid_continue));
StringBuffer inbuffer = new StringBuffer();
StringBuffer intermediate, outbuffer;
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
inbuffer.setLength(0);
UTF16.append(inbuffer, cp);
try {
@ -189,15 +197,9 @@ public class TestData implements UCD_Types {
if (!TestData.equals(inbuffer, outbuffer))
continue;
int script = ucd.getScript(cp);
if (!Default.nfd().isNormalized(cp)) {
if (decompChars[script] == null)
decompChars[script] = new UnicodeSet();
decompChars[script].add(cp);
} else {
if (coreChars[script] == null)
coreChars[script] = new UnicodeSet();
coreChars[script].add(cp);
}
if (coreChars[script] == null)
coreChars[script] = new UnicodeSet();
coreChars[script].add(cp);
}
// find characters with no uppercase
for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) {
@ -212,8 +214,11 @@ public class TestData implements UCD_Types {
.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
out.println("<title>IDN Characters</title><style>");
out.println("<!--");
out
.println(".script { font-size: 150%; background-color: #C0C0C0 }");
out.println(".script { font-size: 150%; background-color: #CCCCCC }");
out.println(".Atomic { background-color: #CCCCFF }");
out.println(".Atomic-no-uppercase { background-color: #CCFFCC }");
out.println(".Non-ID { background-color: #FFCCCC }");
out.println(".Decomposable { background-color: #FFFFCC }");
out.println("th { text-align: left }");
out.println("-->");
out.println("</style></head><body><table>");
@ -240,15 +245,16 @@ public class TestData implements UCD_Types {
* @param scriptCode
*/
private void showCodes(PrintWriter out, int scriptCode) {
if (coreChars[scriptCode] == null
&& decompChars[scriptCode] == null)
return;
if (coreChars[scriptCode] == null) return;
System.out.println(ucd.getScriptID_fromIndex((byte) scriptCode));
String script = Default.ucd().getScriptID_fromIndex(
(byte) scriptCode);
String script = Default.ucd().getScriptID_fromIndex((byte) scriptCode);
out.println();
out.println("<tr><th class='script'>Script: " + script + "</th></tr>");
UnicodeSet core = new UnicodeSet(coreChars[scriptCode]);
UnicodeSet decomp = new UnicodeSet(core).retainAll(decomposable);
core.removeAll(decomp);
UnicodeSet non_id = new UnicodeSet(core).removeAll(xid_continue);
core.removeAll(non_id);
UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
core.removeAll(otherCore);
if (core.size() == 0) {
@ -257,9 +263,9 @@ public class TestData implements UCD_Types {
otherCore = temp;
}
printlnSet(out, "Atomic", core, scriptCode);
if (otherCore.size() != 0) printlnSet(out, "Atomic [noUpper]", otherCore, scriptCode);
UnicodeSet decomp = decompChars[scriptCode];
if (decomp != null && decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
if (otherCore.size() != 0) printlnSet(out, "Atomic-no-uppercase", otherCore, scriptCode);
if (non_id.size() != 0) printlnSet(out, "Non-ID", non_id, scriptCode);
if (decomp.size() != 0) printlnSet(out, "Decomposable", decomp, scriptCode);
}
/**
@ -277,7 +283,7 @@ public class TestData implements UCD_Types {
&& unicodeset.containsNone(bidiL) ? " dir='rtl'" : "";
out.println("<tr><th class='" + title + "'>" + title + " ("
+ nf.format(size) + ")</th></tr>");
out.print("<tr><td" + dir + ">");
out.print("<tr><td class='" + title + "'" + dir + ">");
UnicodeSetIterator usi = new UnicodeSetIterator();
if (scriptCode == HAN_SCRIPT || scriptCode == HANGUL_SCRIPT) {
usi.reset(unicodeset);

View file

@ -264,7 +264,7 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
unicodeMap.putAll(lineBreak.getSet("Infix_Numeric")
.remove(0x003A), "MidNum");
unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");
unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "Numeric");
unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "ExtendNumLet");
unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it.
unicodeMap.setMissing("Other");
}
@ -479,9 +479,10 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
public List _getValueAliases(String valueAlias, List result) {
if (result == null) result = new ArrayList();
int type = getType() & CORE_MASK;
if (type == STRING || type == MISC) return result;
else if (type == NUMERIC) return result;
else if (type == BINARY) {
if (type == STRING || type == MISC || type == NUMERIC) {
UnicodeProperty.addUnique(valueAlias, result);
return result;
} else if (type == BINARY) {
UnicodeProperty.addUnique(valueAlias, result);
return lookup(valueAlias, UCD_Names.YN_TABLE_LONG, UCD_Names.YN_TABLE, null, result);
} else if (type == ENUMERATED || type == CATALOG) {

View file

@ -15,15 +15,17 @@ public class UnicodeDataFile {
private String newFile;
private String batName;
private String mostRecent;
private String filename;
private UnicodeDataFile(){};
public static UnicodeDataFile openAndWriteHeader(String directory, String filename) throws IOException {
UnicodeDataFile result = new UnicodeDataFile();
result.newFile = directory + filename + UnicodeDataFile.getFileSuffix(true);
result.out = Utility.openPrintWriter(result.newFile, Utility.LATIN1_UNIX);
result.out = Utility.openPrintWriter(result.newFile, Utility.UTF8_UNIX);
String[] batName = {""};
result.mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName);
result.batName = batName[0];
result.filename = filename;
result.out.println("# " + filename + UnicodeDataFile.getFileSuffix(false));
result.out.println(generateDateLine());
@ -50,6 +52,9 @@ public class UnicodeDataFile {
}
/**
 * Finishes this data file: appends the footer file named
 * {@code filename + "Footer.txt"} (if one exists) to {@code out}, closes
 * {@code out}, and then passes {@code mostRecent}, the output name of
 * {@code newFile}, and {@code batName} to {@code Utility.renameIdentical}.
 *
 * <p>A missing footer file is tolerated and simply results in no footer being
 * written. Any other failure while appending the footer is propagated, but
 * {@code out} is still closed first so the writer is not leaked.
 *
 * <p>NOTE(review): the footer is requested with {@code Utility.LATIN1} while
 * {@code openAndWriteHeader} opens the writer with {@code Utility.UTF8_UNIX};
 * presumably footer files are ASCII-only -- confirm.
 *
 * @throws IOException if appending the footer fails for a reason other than
 *         the footer file not being found, or if the rename step fails
 */
public void close() throws IOException {
    try {
        try {
            Utility.appendFile(filename + "Footer.txt", Utility.LATIN1, out);
        } catch (FileNotFoundException ignored) {
            // Deliberately ignored: not every data file has a footer file.
        }
    } finally {
        // Always release the writer, even when appending the footer failed.
        out.close();
    }
    Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName);
}