ICU-5149 update unicode tools after cvs problems.

X-SVN-Rev: 19520
This commit is contained in:
Mark Davis 2006-04-05 22:13:04 +00:00
parent ea4cd7f0fa
commit 557bade86a
33 changed files with 964 additions and 346 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2005/06/08 01:44:48 $
* $Revision: 1.42 $
* $Date: 2006/04/05 22:12:46 $
* $Revision: 1.43 $
*
*******************************************************************************
*/
@ -18,6 +18,7 @@ import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.dev.test.util.UnicodePropertySource;
import com.ibm.icu.impl.UCharacterProperty;
@ -33,7 +34,6 @@ import java.text.DateFormat;
import java.text.SimpleDateFormat;
import com.ibm.text.UCD.*;
import com.ibm.text.UCD.UCD_Types;
import com.ibm.text.utility.*;
import com.ibm.text.UCD.Normalizer;
@ -4104,8 +4104,8 @@ F900..FAFF; CJK Compatibility Ideographs
bf.setLineSeparator("<br>\r\n");
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
bf.setUnicodePropertyFactory(ups);
bf.setShowLiteral(bf.toHTML);
bf.setFixName(bf.toHTML);
bf.setShowLiteral(TransliteratorUtilities.toHTML);
bf.setFixName(TransliteratorUtilities.toHTML);
UCD ucd = Default.ucd();
UnicodeProperty cat = ups.getProperty("gc");
UnicodeSet ucd410 = cat.getSet("Cn")

View file

@ -10,7 +10,7 @@
# The data supports both implementations that require simple case foldings
# (where string lengths don't change), and implementations that allow full case folding
# (where string lengths may grow). Note that where they can be supported, the
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
#
# All code points not listed in this file map to themselves.
#

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
* $Date: 2005/11/01 00:10:53 $
* $Revision: 1.17 $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
@ -840,6 +840,13 @@ public final class ConvertUCD implements UCD_Types {
} else if (fieldName.equals("gc")) {
uData.generalCategory = Utility.lookup(fieldValue, UCD_Names.GENERAL_CATEGORY, true);
// if (major >= 5 && uData.script == Unknown_Script
// && uData.generalCategory != Cn
// && uData.generalCategory != Cs
// && uData.generalCategory != Co) {
// uData.script = COMMON_SCRIPT;
// System.out.println("Resetting to Common Script: " + Utility.hex(uData.codePoint));
// }
} else if (fieldName.equals("bc")) {
uData.bidiClass = Utility.lookup(fieldValue, UCD_Names.BIDI_CLASS, true);
} else if (fieldName.equals("dt")) {
@ -878,8 +885,17 @@ public final class ConvertUCD implements UCD_Types {
uData.numericValue = Utility.doubleFrom(fieldValue);
} else if (fieldName.equals("cc")) {
uData.combiningClass = (byte)Utility.intFrom(fieldValue);
if (uData.combiningClass == 9 && major >= 5) {
System.out.println("setting Grapheme_Link " + Utility.hex(uData.codePoint) + "\t" + uData.name);
uData.binaryProperties |= (1<<GraphemeLink);
System.out.println(uData);
}
} else if (fieldName.equals("bp")) {
uData.binaryProperties = (byte)Utility.longFrom(fieldValue);
// if (major >= 5 && (uData.binaryProperties & 1<<Noncharacter_Code_Point) != 0) {
// uData.script = Unknown_Script;
// }
System.out.println("Resetting: " + uData);
} else {
throw new IllegalArgumentException("Unknown fieldName");
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
* $Date: 2005/03/30 17:19:32 $
* $Revision: 1.13 $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -310,7 +310,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
UnicodeDataFile fc = UnicodeDataFile.openHTMLAndWriteHeader("auxiliary\\", fileName + "BreakTest");
UnicodeDataFile fc = UnicodeDataFile.openHTMLAndWriteHeader("DerivedData\\auxiliary\\", fileName + "BreakTest");
PrintWriter out = fc.out;
/* PrintWriter out = Utility.openPrintWriter("auxiliary\\"
@ -354,7 +354,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
String[] testCase = new String[50];
// do main test
UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader("auxiliary\\", fileName + "BreakTest"
UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader("DerivedData\\auxiliary\\", fileName + "BreakTest"
+ (shortVersion ? "_SHORT" : ""));
PrintWriter out = fc.out;
/* PrintWriter out = Utility.openPrintWriter("TR29\\" + fileName + "BreakTest"

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2005/03/26 05:40:04 $
* $Revision: 1.17 $
* $Date: 2006/04/05 22:12:45 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
@ -585,8 +585,8 @@ public class GenerateCaseFolding implements UCD_Types {
out.println("# SpecialCasing" + UnicodeDataFile.getFileSuffix(false));
out.println(UnicodeDataFile.generateDateLine());
out.println("#");
Utility.appendFile("SpecialCasingHeader.txt", Utility.UTF8, out);
*/
//Utility.appendFile("com/ibm/text/UCD/SpecialCasingHeader.txt", Utility.UTF8, out);
Iterator it = sorted.keySet().iterator();
int lastOrder = -1;
@ -609,7 +609,7 @@ public class GenerateCaseFolding implements UCD_Types {
case 3: out.println("# Ligatures"); break;
case 4: skipLine = true; break;
case 5: out.println("# No corresponding uppercase precomposed character"); break;
case 6: Utility.appendFile("SpecialCasingIota.txt", Utility.UTF8, out); break;
case 6: Utility.appendFile("com/ibm/text/UCD/SpecialCasingIota.txt", Utility.UTF8, out); break;
case 7: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break;
case 8: skipLine = true; break;
}
@ -617,7 +617,7 @@ public class GenerateCaseFolding implements UCD_Types {
}
out.println(line);
}
//Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
Utility.appendFile("com/ibm/text/UCD/SpecialCasingFooter.txt", Utility.UTF8, out);
udf.close();
//Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
* $Date: 2005/11/19 05:39:39 $
* $Revision: 1.9 $
* $Date: 2006/04/05 22:12:45 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -38,6 +38,7 @@ import com.ibm.icu.dev.demo.translit.InfoDialog;
import com.ibm.icu.dev.test.util.ArrayComparator;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.test.util.UnicodeLabel;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty;
@ -423,15 +424,15 @@ public class GenerateConfusables {
BagFormatter bf = new BagFormatter();
bf.setUnicodePropertyFactory(ups);
bf.setLabelSource(null);
bf.setShowLiteral(bf.toHTMLControl);
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
bf.setMergeRanges(true);
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
//reviews.putAll(UNASSIGNED, "");
out.print("\uFEFF");
out.println("# Review List for IDN");
out.println("# $Revision: 1.9 $");
out.println("# $Date: 2005/11/19 05:39:39 $");
out.println("# $Revision: 1.10 $");
out.println("# $Date: 2006/04/05 22:12:45 $");
out.println("");
UnicodeSet fullSet = reviews.getSet("").complement();
@ -478,7 +479,7 @@ public class GenerateConfusables {
BagFormatter bf = new BagFormatter();
bf.setUnicodePropertyFactory(ups);
bf.setLabelSource(null);
bf.setShowLiteral(bf.toHTMLControl);
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
bf.setMergeRanges(true);
UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]");
@ -486,8 +487,8 @@ public class GenerateConfusables {
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
out.println("# Recommended Identifier Profiles for IDN");
out.println("# $Revision: 1.9 $");
out.println("# $Date: 2005/11/19 05:39:39 $");
out.println("# $Revision: 1.10 $");
out.println("# $Date: 2006/04/05 22:12:45 $");
out.println("");
out.println("# Output Characters");
@ -549,15 +550,15 @@ public class GenerateConfusables {
BagFormatter bf = new BagFormatter();
bf.setUnicodePropertyFactory(ups);
bf.setLabelSource(null);
bf.setShowLiteral(bf.toHTMLControl);
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
bf.setMergeRanges(true);
PrintWriter out = BagFormatter.openUTF8Writer(outdir,
"xidmodifications.txt");
out.println("# Security Profile for General Identifiers");
out.println("# $Revision: 1.9 $");
out.println("# $Date: 2005/11/19 05:39:39 $");
out.println("# $Revision: 1.10 $");
out.println("# $Date: 2006/04/05 22:12:45 $");
out.println("");
out.println("# Characters restricted");
@ -613,8 +614,8 @@ public class GenerateConfusables {
//someRemovals = removals;
out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
out.println("# Characters restricted in domain names");
out.println("# $Revision: 1.9 $");
out.println("# $Date: 2005/11/19 05:39:39 $");
out.println("# $Revision: 1.10 $");
out.println("# $Date: 2006/04/05 22:12:45 $");
out.println("#");
out.println("# This file contains a draft list of characters for use in");
out.println("# UTR #36: Unicode Security Considerations");
@ -1148,8 +1149,8 @@ public class GenerateConfusables {
public void writeSource(String directory, String filename) throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
out.println("# Source File for IDN Confusables");
out.println("# $Revision: 1.9 $");
out.println("# $Date: 2005/11/19 05:39:39 $");
out.println("# $Revision: 1.10 $");
out.println("# $Date: 2006/04/05 22:12:45 $");
out.println("");
dataMixedAnycase.writeSource(out);
out.close();
@ -1159,8 +1160,8 @@ public class GenerateConfusables {
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
out.print('\uFEFF');
out.println("# Recommended confusable mapping for IDN");
out.println("# $Revision: 1.9 $");
out.println("# $Date: 2005/11/19 05:39:39 $");
out.println("# $Revision: 1.10 $");
out.println("# $Date: 2006/04/05 22:12:45 $");
out.println("");
if (appendFile) {
@ -1347,7 +1348,7 @@ public class GenerateConfusables {
}
}
static class MyCollectionFilter implements CollectionUtilities.Filter {
static class MyCollectionFilter implements CollectionUtilities.ObjectMatcher {
UnicodeSet outputAllowed;
int minLength;
public boolean matches(Object o) {
@ -1368,8 +1369,8 @@ public class GenerateConfusables {
UnicodeSet representable = new UnicodeSet();
out.print('\uFEFF');
out.println("# Summary: Recommended confusable mapping for IDN");
out.println("# $Revision: 1.9 $");
out.println("# $Date: 2005/11/19 05:39:39 $");
out.println("# $Revision: 1.10 $");
out.println("# $Date: 2006/04/05 22:12:45 $");
out.println("");
MyEquivalenceClass data = dataMixedAnycase;
Set items = data.getOrderedExplicitItems();
@ -1446,7 +1447,7 @@ public class GenerateConfusables {
representable.removeAll(script);
BagFormatter bf = new BagFormatter();
bf.setValueSource(ups.getProperty("script"));
bf.setShowLiteral(bf.toHTMLControl);
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
bf.showSetNames(out, representable);
}
out.close();
@ -1493,8 +1494,8 @@ public class GenerateConfusables {
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
out.print('\uFEFF');
out.println("# Summary: Whole-Script Confusables");
out.println("# $Revision: 1.9 $");
out.println("# $Date: 2005/11/19 05:39:39 $");
out.println("# $Revision: 1.10 $");
out.println("# $Date: 2006/04/05 22:12:45 $");
out.println("# This data is used for determining whether a strings is a");
out.println("# whole-script or mixed-script confusable.");
out.println("# The mappings here ignore common and inherited script characters,");
@ -1539,7 +1540,7 @@ public class GenerateConfusables {
script_set[i] = new UnicodeSet("[:script=" + UScript.getName(i) + ":]"); // ugly hack
}
bf.setValueSource(ups.getProperty("script"));
bf.setShowLiteral(bf.toHTMLControl);
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
bf.setLabelSource(UnicodeLabel.NULL);
}
WholeScript(UnicodeSet filterSet, String label) {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2005/10/11 19:39:15 $
* $Revision: 1.39 $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.40 $
*
*******************************************************************************
*/
@ -756,41 +756,41 @@ public class GenerateData implements UCD_Types {
//log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false));
//log.println(UnicodeDataFile.generateDateLine());
log.println("#");
log.println("# Normalization Test Suite");
log.println("# Format:");
log.println("#");
log.println("# Columns (c1, c2,...) are separated by semicolons");
log.println("# Comments are indicated with hash marks");
log.println("#");
log.println("# CONFORMANCE:");
log.println("# 1. The following invariants must be true for all conformant implementations");
log.println("#");
log.println("# NFC");
log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)");
log.println("# c4 == NFC(c4) == NFC(c5)");
log.println("#");
log.println("# NFD");
log.println("# c3 == NFD(c1) == NFD(c2) == NFD(c3)");
log.println("# c5 == NFD(c4) == NFD(c5)");
log.println("#");
log.println("# NFKC");
log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)");
log.println("#");
log.println("# NFKD");
log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
log.println("#");
log.println("# 2. For every code point X assigned in this version of Unicode that is not specifically");
log.println("# listed in Part 1, the following invariants must be true for all conformant");
log.println("# implementations:");
log.println("#");
log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
// log.println("#");
// log.println("# Normalization Test Suite");
// log.println("# Format:");
// log.println("#");
// log.println("# Columns (c1, c2,...) are separated by semicolons");
// log.println("# Comments are indicated with hash marks");
// log.println("#");
// log.println("# CONFORMANCE:");
// log.println("# 1. The following invariants must be true for all conformant implementations");
// log.println("#");
// log.println("# NFC");
// log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)");
// log.println("# c4 == NFC(c4) == NFC(c5)");
// log.println("#");
// log.println("# NFD");
// log.println("# c3 == NFD(c1) == NFD(c2) == NFD(c3)");
// log.println("# c5 == NFD(c4) == NFD(c5)");
// log.println("#");
// log.println("# NFKC");
// log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)");
// log.println("#");
// log.println("# NFKD");
// log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
// log.println("#");
// log.println("# 2. For every code point X assigned in this version of Unicode that is not specifically");
// log.println("# listed in Part 1, the following invariants must be true for all conformant");
// log.println("# implementations:");
// log.println("#");
// log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)");
System.out.println("Writing Part 1");
log.println("#");
log.println("@Part0 # Specific cases");
log.println("#");
// log.println("#");
// log.println("@Part0 # Specific cases");
// log.println("#");
for (int j = 0; j < testSuiteCases.length; ++j) {
writeLine(testSuiteCases[j], log, false);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateNamedSequences.java,v $
* $Date: 2005/05/27 21:40:51 $
* $Revision: 1.1 $
* $Date: 2006/04/05 22:12:45 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -117,7 +117,7 @@ public final class GenerateNamedSequences implements UCD_Types {
"@date@", Default.getDate(),
"@table@", table};
Utility.appendFile("NamedSequences-Template.html", Utility.UTF8, out, replacementList);
Utility.appendFile("com/ibm/text/UCD/NamedSequences-Template.html", Utility.UTF8, out, replacementList);
out.close();
//Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java,v $
* $Date: 2005/10/11 19:39:15 $
* $Revision: 1.6 $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -104,18 +104,31 @@ public final class GenerateStandardizedVariants implements UCD_Types {
String version = Default.ucd().getVersion();
int lastDot = version.lastIndexOf('.');
String updateDirectory = version.substring(0,lastDot) + "-Update";
int updateV = version.charAt(version.length()-1) - '0';
if (updateV != 0) updateDirectory += (char)('1' + updateV);
if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
String updateDirectory;
String partialFilename;
if (version.compareTo("4.1.0") < 0) {
updateDirectory = version.substring(0,lastDot) + "-Update";
int updateV = version.charAt(version.length()-1) - '0';
if (updateV != 0) updateDirectory += (char)('1' + updateV);
if (DEBUG) System.out.println("updateDirectory: " + updateDirectory);
partialFilename = "StandardizedVariants-" + Default.ucd().getVersion();
} else if (version.compareTo("4.1.0") == 0) {
updateDirectory = version.substring(0,lastDot) + "/ucd";
partialFilename = "StandardizedVariants";
} else {
updateDirectory = version + "/ucd";
partialFilename = "StandardizedVariants";
}
String[] replacementList = {
"@revision@", Default.ucd().getVersion(),
"@updateDirectory@", updateDirectory,
"@filename@", partialFilename,
"@date@", Default.getDate(),
"@table@", table};
Utility.appendFile("StandardizedVariants-Template.html", Utility.UTF8, out, replacementList);
Utility.appendFile("com/ibm/text/UCD/StandardizedVariants-Template.html", Utility.UTF8, out, replacementList);
out.close();
//Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);

View file

@ -15,6 +15,7 @@ import java.util.Set;
import java.util.TreeSet;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.test.util.UnicodeLabel;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
@ -91,7 +92,7 @@ class GenerateStringPrep implements UCD_Types {
void genStringPrep() throws IOException {
//showScriptToBlock();
bf.setShowLiteral(BagFormatter.toHTMLControl);
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
bf.setUnicodePropertyFactory(ups);
//bf.setValueSource(UnicodeLabel.NULL);
if (false) {
@ -508,7 +509,7 @@ class GenerateStringPrep implements UCD_Types {
}
return "<span title='" + ucd.getCodeAndName(string) + "'>"
+ pad1
+ BagFormatter.toHTMLControl.transliterate(string)
+ TransliteratorUtilities.toHTMLControl.transliterate(string)
+ pad
+ "</span> ";
}

View file

@ -0,0 +1,142 @@
package com.ibm.text.UCD;
import java.io.IOException;
import java.io.PrintWriter;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.impl.PrettyPrinter;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.Utility;
/**
 * Command-line tool that walks the Unicode code space and sorts code points
 * into "output" and "input only" sets, once according to IDNA round-tripping
 * and once according to identifier rules for Unicode 3.2 and Unicode 5.0.
 * {@link #main(String[])} writes the size and pattern of each set (plus a few
 * set differences/intersections) to <code>idnCount.html</code> in
 * <code>Utility.GEN_DIR</code>.
 *
 * <p>All state is held in static fields and {@link #getIDNAType(int)} reuses
 * shared static buffers, so this class must not be used from several threads
 * at once.
 */
public class IDNTester {
    /** Scratch buffers reused by {@link #getIDNAType(int)} on every call. */
    static StringBuffer inbuffer = new StringBuffer();
    static StringBuffer intermediate, outbuffer;

    /** Result codes returned by {@link #getIDNAType(int)}. */
    static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4;

    /** Code points IDNA accepts but does not return unchanged (DELETED or REMAPPED). */
    static UnicodeSet IDNInputOnly = new UnicodeSet();
    /** Code points that survive the IDNA round trip unchanged (OK). */
    static UnicodeSet IDNOutput = new UnicodeSet();
    /** Set to true once {@link #initialize()} has filled the sets. */
    static boolean initialized = false;

    /** Identifier sets computed against Unicode 3.2 data. */
    static UnicodeSet IDInputOnly32 = new UnicodeSet();
    static UnicodeSet IDOutput32 = new UnicodeSet();
    /** Identifier sets computed against Unicode 5.0 data. */
    static UnicodeSet IDInputOnly50 = new UnicodeSet();
    static UnicodeSet IDOutput50 = new UnicodeSet();

    static PrettyPrinter pp = new PrettyPrinter();
    /** Destination of {@link #showSet(String, UnicodeSet)}; opened by {@link #main(String[])}. */
    static PrintWriter pw;

    /**
     * Builds all sets and writes the HTML report.
     *
     * @param args ignored
     * @throws IOException if the report file cannot be opened
     */
    public static void main(String[] args) throws IOException {
        initialize();
        pw = BagFormatter.openUTF8Writer(Utility.GEN_DIR, "idnCount.html");
        try {
            pw.println("<html><body>");
            showSet("IDN InputOnly: ", IDNInputOnly);
            showSet("IDN Output: ", IDNOutput);
            showSet("ID InputOnly, U3.2: ", IDInputOnly32);
            showSet("ID Output, U3.2: ", IDOutput32);
            showSet("IDN Output - ID Output, U3.2: ", new UnicodeSet(IDNOutput).removeAll(IDOutput32));
            showSet("IDN Output & ID Output, U3.2: ", new UnicodeSet(IDNOutput).retainAll(IDOutput32));
            showSet("ID Output - IDN Output, U3.2: ", new UnicodeSet(IDOutput32).removeAll(IDNOutput));
            showSet("ID InputOnly, U5.0: ", IDInputOnly50);
            showSet("ID Output, U5.0: ", IDOutput50);
            showSet("ID Output, U5.0 - U3.2: ", new UnicodeSet(IDOutput50).removeAll(IDOutput32));
            pw.println("</body></html>");
        } finally {
            // Close the report even if formatting one of the sets fails.
            pw.close();
        }
    }

    /**
     * Writes one report section to {@link #pw}: a heading made of the title
     * followed by the set size, then the set's pattern in a paragraph.
     *
     * @param title heading text; the set size is appended directly to it
     * @param set   the set to report
     */
    public static void showSet(String title, UnicodeSet set) {
        pw.println("<h2>" + title + set.size() + "</h2>" + "<p>" + pp.toPattern(set) + "</p>");
        pw.println();
    }

    /**
     * Returns the IDNA input-only set, building all sets on first use.
     *
     * @return the shared {@link #IDNInputOnly} instance (not a copy)
     */
    static UnicodeSet getIDNInput() {
        if (!initialized) initialize();
        return IDNInputOnly;
    }

    /**
     * Returns the IDNA output set, building all sets on first use.
     *
     * @return the shared {@link #IDNOutput} instance (not a copy)
     */
    static UnicodeSet getIDNOutput() {
        if (!initialized) initialize();
        // Fixed: this previously returned IDNInputOnly.
        return IDNOutput;
    }

    /**
     * Fills every static set by scanning all code points U+0000..U+10FFFF.
     * Prints the current code point to standard output every 0x1000 code
     * points as a progress indicator.
     */
    private static void initialize() {
        // Removed from both identifier input sets below.
        UnicodeSet oddballs = new UnicodeSet("[\u034F \u180B-\u180D \uFE00-\uFE0F _]");

        UCD U32 = UCD.make("3.2.0");
        Normalizer nfkc32 = new Normalizer(Normalizer.NFKC, "3.2.0");
        UCDProperty xid32 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf, U32);
        UnicodeSet IDInput32 = xid32.getSet();
        IDInput32.add('-').removeAll(oddballs);

        UCD U50 = UCD.make("5.0.0");
        Normalizer nfkc50 = new Normalizer(Normalizer.NFKC, "5.0.0");
        UCDProperty xid50 = DerivedProperty.make(UCD.Mod_ID_Continue_NO_Cf, U50);
        UnicodeSet IDInput50 = xid50.getSet();
        IDInput50.add('-').removeAll(oddballs);

        // Inclusive upper bound: U+10FFFF is the last code point and was
        // previously skipped by a strict '<' comparison.
        for (int i = 0; i <= 0x10FFFF; ++i) {
            if ((i & 0xFFF) == 0) {
                System.out.println(i);
                System.out.flush();
            }
            int type = getIDNAType(i);
            if (type == OK) {
                IDNOutput.add(i);
            } else if (type != ILLEGAL) {
                // DELETED or REMAPPED: accepted on input, never produced.
                IDNInputOnly.add(i);
            }
            if (IDInput32.contains(i)) {
                splitSet(IDInputOnly32, IDOutput32, U32, nfkc32, i);
            }
            if (IDInput50.contains(i)) {
                splitSet(IDInputOnly50, IDOutput50, U50, nfkc50, i);
            }
        }
        initialized = true;
    }

    /**
     * Adds code point <code>i</code> to exactly one of the two sets. Code
     * points below 0x7F always go to <code>outputSet</code>. Any other code
     * point goes to <code>outputSet</code> only if it is unchanged by full
     * case folding, then NFKC normalization, then full case folding again;
     * otherwise it goes to <code>inputOnlySet</code>.
     *
     * @param inputOnlySet receives code points changed by folding/normalization
     * @param outputSet    receives code points that are stable
     * @param ucd          Unicode data used for case folding
     * @param nfkc         normalizer applied between the two folding passes
     * @param i            the code point to classify
     */
    private static void splitSet(UnicodeSet inputOnlySet, UnicodeSet outputSet, UCD ucd, Normalizer nfkc, int i) {
        if (i < 0x7F) {
            outputSet.add(i);
            return;
        }
        String v = UTF16.valueOf(i);
        String s = ucd.getCase(i, UCD.FULL, UCD.FOLD);
        if (s.equals(v)) {
            s = nfkc.normalize(s);
            if (s.equals(v)) {
                s = ucd.getCase(s, UCD.FULL, UCD.FOLD);
                if (s.equals(v)) {
                    outputSet.add(i);
                    return;
                }
            }
        }
        inputOnlySet.add(i);
    }

    /**
     * Classifies a single code point by running it through
     * <code>IDNA.convertToASCII</code> and back through
     * <code>IDNA.convertToUnicode</code>.
     *
     * <p>Overwrites the static {@link #inbuffer}, {@link #intermediate} and
     * {@link #outbuffer} fields, so calls must not overlap.
     *
     * @param cp the code point to test
     * @return {@link #OK} for '-' or when <code>TestData.equals</code> accepts
     *         the round-tripped text as equal to the input; {@link #DELETED}
     *         when the ASCII form is empty; {@link #REMAPPED} when the round
     *         trip yields different text; {@link #ILLEGAL} when either
     *         conversion throws
     */
    static public int getIDNAType(int cp) {
        if (cp == '-') return OK;
        inbuffer.setLength(0);
        UTF16.append(inbuffer, cp);
        try {
            intermediate = IDNA.convertToASCII(inbuffer,
                    IDNA.DEFAULT); // USE_STD3_RULES
            if (intermediate.length() == 0)
                return DELETED;
            outbuffer = IDNA.convertToUnicode(intermediate,
                    IDNA.USE_STD3_RULES);
        } catch (StringPrepParseException e) {
            return ILLEGAL;
        } catch (Exception e) {
            // Unexpected failure: report the code point but still treat it as illegal.
            System.out.println("Failure at: " + Utility.hex(cp));
            return ILLEGAL;
        }
        if (!TestData.equals(inbuffer, outbuffer))
            return REMAPPED;
        return OK;
    }
}

View file

@ -0,0 +1,75 @@
Let $letter = [$gc:Lu $gc:Ll $gc:Lt $gc:Lo $gc:Lm];
Let $number = [$gc:Nd $gc:Nl $gc:No]
Let $mark = [$gc:mn $gc:me $gc:mc]
Let $LMN = [$letter $number $mark]
Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation]
Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol]
Let $nfc = [^$NFC_Quick_Check:No]
Show $nfc
Show [$alphabetic - [$mark $letter $number]]
Let $oldCJK = [\u1100-\u11FF \u3040-\u30FF \u3130-\u318F \u31F0-\u31FF \u3400-\u4DBF \u4E00-\u9FFF \uAC00-\uD7AF \uF900-\uFAFF \uFF65-\uFFDC]
Show [$oldCJK & $gc:cn]
Let $fixedOld = [$oldCJK-$gc:cn]
#List the non-alphabetic old items
#Show [$oldCJK-$gc:cn-$alphabetic]
#Check for differences
#Test $fixedOld = $trialNew
#ShowEach $mark
Let $uax29_outliers = [\u3031-\u3035 \u309B-\u309C \u30A0 \u30FC \uFF70 \uFF9E-\uFF9F]
Let $other_outliers = [\u3099-\u309A \u3006 \u303C \u302A-\u302E \u302F \U000E0100-\U000E01EF]
# ==========================================
# Outliers from UAX29
Show $uax29_outliers
# Additional outliers
Show $other_outliers
# Take the 5 CJK scripts
Let $trialScripts = [$script:hani $script:hang $script:kana $script:hira $script:bopo]
# Remove the non-LMN
Let $trialNewBase = [$trialScripts & $LMN]
# Add the outliers
Let $trialNew = [$trialNewBase $uax29_outliers $other_outliers]
# Show our result
Show $trialNew
# As a double-check, show script characters we're tossing
Show [$trialScripts - $trialNew]
# Compare snippets stuff
Let $guessClose = [$lb:QU $lb:Close_Punctuation]
Let $__closing_punc = ["')>\]`\}\u00AB\u00BB\u2018\u2019\u201C\u201D\u2039\u203A\u207E\u208E\u27E7\u27E9\u27EB\u2984\u2986\u2988\u298A\u298C\u298E\u2990\u2992\u2994\u2996\u2998\u29D9\u29DB\u29FD\u3009\u300B\u300D\u300F\u3011\u3015\u3017\u3019\u301B\u301E\u301F\uFD3F\uFE42\uFE44\uFE5A\uFE5C\uFF02\uFF07\uFF09\uFF3D\uFF5D\uFF63]
$guessClose = $__closing_punc
Let $guessClose = [$gc:pf $gc:pe $gc:pi]
$guessClose = $__closing_punc
Let $guessTerm = [$sb:aterm $sb:sterm]
$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? … ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]
Let $__issymotherr = [\u00A6\u00A7\u06FD\u06FE\u0F01-\u0F03\u0F13-\u0F17\u0F1A-\u0F1F\u0FBE-\u0FC5\u0FC7-\u0FCC\u2100\u2101\u2104-\u2106\u2108\u2109\u2117\u2118\u211E-\u2121\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u2613\u2619-\u266E\u2670\u2671\u2701-\u2704\u2706-\u2709\u270C-\u2727\u2729-\u274B\u274F-\u2752\u2758-\u275E\u2761-\u2794\u2798-\u27AF\u27B1-\u27BE\u2800-\u28FF\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3012\u3013\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u3200-\u321C\u322A-\u3243\u3260-\u327B\u328A-\u32B0\u32C0-\u32CB\u32D0-\u32FE\u3300-\u3376\u337B-\u33DD\u33E0-\u33FE\uA490-\uA4A1\uA4A4-\uA4B3\uA4B5-\uA4C0\uA4C2-\uA4C4\uFFED\uFFEE\uFFFC\uFFFD]
Let $__issymothers = [\u00B6\u0482\u06E9\u09FA\u0B70\u0F34\u0F36\u0F38\u0FCF\u2114\u2123\u2125\u2127\u2129\u212E\u2132\u213A\u21D3\u220E\u2617\u274D\u2756\u3004\u3020\u327F\uA4C6\uFFE4\uFFE8]
Let $symOther = [$__issymotherr $__issymothers]
$symOther = $gcAllSymbols
[$symOther & $nfc] = [$gcAllSymbols & $nfc]

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2005/10/11 19:39:15 $
* $Revision: 1.36 $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.37 $
*
*******************************************************************************
*/
@ -160,8 +160,9 @@ public final class Main implements UCD_Types {
//else if (arg.equalsIgnoreCase("TrailingZeros")) GenerateData.genTrailingZeros();
else if (arg.equalsIgnoreCase("GenerateThaiBreaks")) GenerateThaiBreaks.main(null);
else if (arg.equalsIgnoreCase("TestData")) TestData.main(new String[]{args[++i]});
else if (arg.equalsIgnoreCase("TestData")) TestData.main(new String[]{args[++i]});
else if (arg.equalsIgnoreCase("MakeUnicodeFiles")) MakeUnicodeFiles.main(new String[]{});
//else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo();
else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts();
else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest();

View file

@ -16,6 +16,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodePropertySource;
import com.ibm.icu.text.Collator;
@ -71,7 +72,7 @@ public class MakeNamesChart {
System.out.println("file: " + chartPrefix + fileName);
PrintWriter out = BagFormatter.openUTF8Writer("C:/DATA/GEN/charts/namelist/", chartPrefix + fileName);
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>" +
BagFormatter.toHTML.transliterate(getHeading(lineParts[2])) +
TransliteratorUtilities.toHTML.transliterate(getHeading(lineParts[2])) +
"</title><link rel='stylesheet' type='text/css' href='namelist.css'>" +
"<base target='names'></head><body>");
@ -117,7 +118,7 @@ public class MakeNamesChart {
String hexcp = Utility.hex(it.codepoint, 4);
String title = "";
String name = Default.ucd().getName(it.codepoint);
if (name != null) title = " title='" + BagFormatter.toHTML.transliterate(name.toLowerCase()) + "'";
if (name != null) title = " title='" + TransliteratorUtilities.toHTML.transliterate(name.toLowerCase()) + "'";
out.println("<td class='" + tdclass + "'"
+ title
+ ">\u00A0"
@ -347,7 +348,7 @@ public class MakeNamesChart {
static Matcher escapeMatch = Pattern.compile("\\&[A-Z][a-z]*\\;").matcher("");
private static String showTextConvertingHex(String body, boolean addCharToHex) {
body = BagFormatter.toHTML.transliterate(body);
body = TransliteratorUtilities.toHTML.transliterate(body);
if (addCharToHex) {
int position = 0;
while (position < body.length()) {
@ -411,7 +412,7 @@ public class MakeNamesChart {
if (type == UCD.Cn || type == UCD.Co || type == UCD.Cs) {
return "\u2588";
}
String result = BagFormatter.toHTML.transliterate(UTF16.valueOf(cp));
String result = TransliteratorUtilities.toHTML.transliterate(UTF16.valueOf(cp));
if (type == UCD.Me || type == UCD.Mn) {
result = "\u25CC" + result;
} else if (rtl.contains(cp)) {

View file

@ -68,6 +68,7 @@ public class MakeUnicodeFiles {
public static void main(String[] args) throws IOException {
generateFile();
System.out.println("DONE");
}
static class Format {
@ -294,7 +295,7 @@ public class MakeUnicodeFiles {
*/
try {
BufferedReader br =
Utility.openReadFile("MakeUnicodeFiles.txt", Utility.UTF8);
Utility.openReadFile("com/ibm/text/UCD/MakeUnicodeFiles.txt", Utility.UTF8);
String key = null;
String file = null, property = null, value = "", comments = "";
while (true) {
@ -594,6 +595,7 @@ public class MakeUnicodeFiles {
pw.println(SEPARATOR);
pw.println("# Total: " + count);
pw.println();
pw.println("# EOF");
udf.close();
}
@ -710,6 +712,8 @@ public class MakeUnicodeFiles {
pw.println(line);
}
}
pw.println();
pw.println("# EOF");
udf.close();
}
@ -769,10 +773,16 @@ public class MakeUnicodeFiles {
ps.valueStyle = "none";
}
if (ps.noLabel) bf.setLabelSource(null);
if (ps.nameStyle.equals("none")) bf.setPropName(null);
else if (ps.nameStyle.equals("short")) bf.setPropName(prop.getFirstNameAlias());
else bf.setPropName(name);
if (ps.noLabel) {
bf.setLabelSource(null);
}
if (ps.nameStyle.equals("none")) {
bf.setPropName(null);
} else if (ps.nameStyle.equals("short")) {
bf.setPropName(prop.getFirstNameAlias());
} else {
bf.setPropName(name);
}
if (ps.interleaveValues) {
writeInterleavedValues(pw, bf, prop, ps);
@ -784,6 +794,8 @@ public class MakeUnicodeFiles {
writeEnumeratedValues(pw, bf, unassigned, prop, ps);
}
}
pw.println();
pw.println("# EOF");
udf.close();
}
@ -809,6 +821,15 @@ public class MakeUnicodeFiles {
temp2.addAll(aliases);
aliases = temp2;
}
System.out.println("Check: " + prop.getValue(0xE000));
String missing = ps.skipUnassigned != null ? ps.skipUnassigned : ps.skipValue;
if (missing != null && !missing.equals("False")) {
pw.println();
String propName = bf.getPropName();
if (propName == null) propName = "";
else if (propName.length() != 0) propName = propName + "; ";
pw.println("# @missing: 0000..10FFFF; " + propName + missing);
}
for (Iterator it = aliases.iterator(); it.hasNext();) {
String value = (String)it.next();
if (DEBUG) System.out.println("Getting value " + value);
@ -891,6 +912,7 @@ public class MakeUnicodeFiles {
pw.println();
//if (s.size() != 0)
bf.showSetNames(pw, s);
//System.out.println(bf.showSetNames(s));
}
}

View file

@ -1,6 +1,6 @@
Generate: NamedSequences
Generate: .*
DeltaVersion: 14
CopyrightYear: 2005
CopyrightYear: 2006
File: auxiliary/GraphemeBreakProperty
Property: Grapheme_Cluster_Break
@ -65,7 +65,10 @@ Value: 4.0
# Newly assigned in Unicode 4.0.0 (April, 2003)
Value: 4.1
# Newly assigned in Unicode 4.1.0 (XXX, 2005)
# Newly assigned in Unicode 4.1.0 (March, 2005)
Value: 5.0
# Newly assigned in Unicode 5.0.0 (XXX, 2006)
File: extracted/DerivedBidiClass
Property: Bidi_Class
@ -158,6 +161,10 @@ Property: Grapheme_Base
# Note: depending on an application's interpretation of Co (private use),
# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither.
Property: Grapheme_Link
# Derived Property: Grapheme_Link (deprecated)
# Generated from: Canonical_Combining_Class=Virama
# Use Canonical_Combining_Class=Virama directly instead
File: extracted/DerivedDecompositionType
Property: Decomposition_Type
@ -316,8 +323,6 @@ Property: Noncharacter_Code_Point
Property: Other_Grapheme_Extend
Property: Grapheme_Link
Property: IDS_Binary_Operator
Property: IDS_Trinary_Operator
@ -353,7 +358,7 @@ Property: SPECIAL
File: Scripts
Property: Script
Format: nameStyle=none skipUnassigned=Common
Format: nameStyle=none skipValue=Unknown
File: SpecialCasing
Property: SPECIAL

View file

@ -1,6 +1,10 @@
package com.ibm.text.UCD;
import com.ibm.icu.impl.CollectionUtilities;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
import java.util.BitSet;
import com.ibm.text.utility.*;
import java.io.PrintWriter;
@ -194,6 +198,7 @@ public final class NFSkippable extends UCDProperty {
PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt", Utility.UTF8_WINDOWS);
out.println(Utility.BOM);
out.println("NFSafeSets");
out.println("Version: " + Default.ucd().getVersion());
out.println("Date: " + Default.getDate());
@ -212,6 +217,8 @@ public final class NFSkippable extends UCDProperty {
out.close();
}
static Collator UCA = Collator.getInstance(ULocale.ROOT);
static void generateSet(PrintWriter out, String label, UCDProperty up) {
System.out.println("Generating: " + up.getName(NORMAL));
UnicodeSet result = new UnicodeSet();
@ -227,11 +234,17 @@ public final class NFSkippable extends UCDProperty {
out.println(label + " = new UnicodeSet(");
writeStringInPieces(out, rSet, ", false);");
rSet = result.toPattern(false);
if (true) {
rSet = result.toPattern(false);
} else {
rSet = CollectionUtilities.prettyPrint(result, true, null, null, UCA, UCA);
}
out.println("/*Unicode: ");
writeStringInPieces(out, rSet, "*/");
out.println();
out.flush();
System.out.println("Done");
}
/*

View file

@ -5,30 +5,42 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
* $Date: 2005/11/19 05:39:39 $
* $Revision: 1.10 $
* $Date: 2006/04/05 22:12:43 $
* $Revision: 1.11 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import com.ibm.icu.dev.demo.translit.CaseIterator;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.dev.test.util.UnicodePropertySource;
import com.ibm.icu.dev.test.util.UnicodeMap.MapIterator;
import com.ibm.icu.impl.PrettyPrinter;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer;
//import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
@ -36,27 +48,27 @@ import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.text.utility.*;
public class QuickTest implements UCD_Types {
public static void main(String[] args) throws IOException {
try {
checkCase();
if (true) return;
getCaseFoldingUnstable();
getCaseLengths("Lower", UCD.LOWER);
getCaseLengths("Upper", UCD.UPPER);
getCaseLengths("Title", UCD.TITLE);
getCaseLengths("Fold", UCD.FOLD);
if (true) return;
checkUnicodeSet();
getLengths("NFC", Default.nfc());
getLengths("NFD", Default.nfd());
getLengths("NFKC", Default.nfkc());
getLengths("NFKD", Default.nfkd());
//getCaseFoldingUnstable();
checkCase();
if (true) return;
tem();
//checkPrettyPrint();
@ -643,13 +655,13 @@ public class QuickTest implements UCD_Types {
if (!text.equals(x)) alpha.put("Lowercase", x);
String title = x = UCharacter.toTitleCase(ULocale.ENGLISH,text,null);
if (!text.equals(x)) alpha.put("Titlecase", x);
String nfc = x = Normalizer.normalize(text,Normalizer.NFC);
String nfc = x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFC);
if (!text.equals(x)) alpha.put("NFC", x);
String nfd = x = Normalizer.normalize(text,Normalizer.NFD);
String nfd = x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFD);
if (!text.equals(x)) alpha.put("NFD", x);
x = Normalizer.normalize(text,Normalizer.NFKD);
x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFKD);
if (!text.equals(x)) alpha.put("NFKD", x);
x = Normalizer.normalize(text,Normalizer.NFKC);
x = com.ibm.icu.text.Normalizer.normalize(text,com.ibm.icu.text.Normalizer.NFKC);
if (!text.equals(x)) alpha.put("NFKC", x);
CanonicalIterator ci = new CanonicalIterator(text);

View file

@ -70,3 +70,6 @@
# Note: the following case is already in the UnicodeData file.
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
# EOF

View file

@ -31,9 +31,10 @@
# A locale ID is defined by taking any language tag as defined by
# RFC 3066 (or its successor), and replacing '-' by '_'.
#
# A context for a character C is defined by Section 3.13 Default Case Operations,
# on p. 89-90 of The Unicode Standard, Version 4.0, as amended by Unicode 4.1.0,
# as specified in http://www.unicode.org/versions/Unicode4.1.0/
# A context for a character C is defined by Section 3.13 Default Case
# Operations, of The Unicode Standard, Version 5.0.
# (This is identical to the context defined by Unicode 4.1.0,
# as specified in http://www.unicode.org/versions/Unicode4.1.0/)
#
# Parsers of this file must be prepared to deal with future additions to this format:
# * Additional contexts

View file

@ -1,13 +1,10 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/loose.dtd">
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="GENERATOR" content="Microsoft FrontPage 5.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<meta name="keywords" content="unicode, variant glyphs">
<meta name="description" content="Describes and displays standardized variant glyphs">
@ -19,8 +16,9 @@
<table class="header">
<tr>
<td class="icon"><a href="http://www.unicode.org"><img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="http://www.unicode.org/ucd">Unicode
Character Database</a></td>
<td class="icon"><a href="http://www.unicode.org">
<img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="http://www.unicode.org/ucd">Unicode
Character Database</a></td>
</tr>
<tr>
<td class="gray">&nbsp;</td>
@ -29,105 +27,78 @@
<blockquote>
<h1>Standardized Variants</h1>
<table class="wide">
<tbody>
<tr>
<td valign="top" width="144">Revision</td>
<td valign="top">@revision@</td>
</tr>
<tr>
<td valign="top" width="144">Authors</td>
<td valign="top">Members of the Editorial Committee</td>
</tr>
<tr>
<td valign="top" width="144">Date</td>
<td valign="top">@date@</td>
</tr>
<tr>
<td valign="top" width="144">This Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/@updateDirectory@/StandardizedVariants-@revision@.html">http://www.unicode.org/Public/@updateDirectory@/StandardizedVariants-@revision@.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Previous Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html">http://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Latest Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html">http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html</a></td>
</tr>
</tbody>
<tr>
<td valign="top" width="144">Revision</td>
<td valign="top">@revision@</td>
</tr>
<tr>
<td valign="top" width="144">Authors</td>
<td valign="top">Members of the Editorial Committee</td>
</tr>
<tr>
<td valign="top" width="144">Date</td>
<td valign="top">@date@</td>
</tr>
<tr>
<td valign="top" width="144">This Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/@updateDirectory@/@filename@.html">
http://www.unicode.org/Public/@updateDirectory@/@filename@.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Previous Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/4.1.0/ucd/StandardizedVariants.html">
http://www.unicode.org/Public/4.1.0/ucd/StandardizedVariants.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Latest Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html">
http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html</a></td>
</tr>
</table>
<h3><br>
<i>Summary</i></h3>
<blockquote>
<p>This file provides a visual display of the standard variant sequences
derived from StandardizedVariants.txt.</p>
<p>This file provides a visual display of the standard variant sequences derived from
StandardizedVariants.txt.</p>
</blockquote>
<h3><i>Status</i></h3>
<blockquote>
<p><i>The file and the files described herein are part of the <a href="http://www.unicode.org/ucd">Unicode
Character Database</a> (UCD) and are governed by the <a href="#Terms of Use">UCD
Terms of Use</a> stated at the end.</i></p>
<p><i>This file and the files described herein are part of the Unicode Character Database and
are governed by the terms of use at <a href="http://www.unicode.org/terms_of_use.html">
http://www.unicode.org/terms_of_use.html</a>.</i></p>
</blockquote>
<hr width="50%">
<h2>Introduction</h2>
<p>The tables here <i>exhaustively</i> lists the valid, registered
combinations of base character plus variation indicator. All combinations not
listed in StandardizedVariants.txt are unspecified and are reserved for future
standardization; no conformant process may interpret them as standardized
variants. Variation selectors and their use are described in The Unicode
Standard.</p>
<p>These mathematical variants are all produced with the addition of Variation
Selector 1 (VS1 or U+FE00) to mathematical operator base characters. There is
no variation according to context. The Mongolian variants use the Mongolian
Variant Selectors, and may vary according to context. That is, if a contextual
shape is not listed below, then the variation sequence has an unmodified
<p>The tables here <i>exhaustively</i> list the valid, registered combinations of base character
plus variation indicator. All combinations not listed in StandardizedVariants.txt are unspecified
and are reserved for future standardization; no conformant process may interpret them as
standardized variants. Variation selectors and their use are described in The Unicode Standard.</p>
<p>These mathematical variants are all produced with the addition of Variation Selector 1 (VS1 or
U+FE00) to mathematical operator base characters. There is no variation according to context. The
Mongolian variants use the Mongolian Variant Selectors, and may vary according to context. That
is, if a contextual shape is not listed below, then the variation sequence has an unmodified
appearance. At this time no Han variants exist.</p>
<blockquote>
<p><a name="fonts"><b>Note: </b></a>The glyphs used to show the variations
are often derived from different physical fonts than the representative
glyphs in the standard. They may therefore exhibit minor differences in
size, proportion, or weight <i>unrelated</i> to the intentional difference
in feature that is the defining element of the variation. Such minor
differences should be ignored. Likewise, in some cases the existing
representative fonts may not yet contain newly encoded characters and hence
some representative glyphs shown in these tables may have a slightly
different style than others.</p>
<p><a name="fonts"><b>Note: </b></a>The glyphs used to show the variations are often derived
from different physical fonts than the representative glyphs in the standard. They may therefore
exhibit minor differences in size, proportion, or weight <i>unrelated</i> to the intentional
difference in feature that is the defining element of the variation. Such minor differences
should be ignored. Likewise, in some cases the existing representative fonts may not yet contain
newly encoded characters and hence some representative glyphs shown in these tables may have a
slightly different style than others.</p>
</blockquote>
<p>@table@</p>
<hr width="50%">
<h2>UCD <a name="Terms of Use">Terms of Use</a></h2>
<h3><i>Disclaimer</i></h3>
<blockquote>
<p><i>The Unicode Character Database is provided as is by Unicode, Inc. No
claims are made as to fitness for any particular purpose. No warranties of
any kind are expressed or implied. The recipient agrees to determine
applicability of information provided. If this file has been purchased on
magnetic or optical media from Unicode, Inc., the sole remedy for any claim
will be exchange of defective media within 90 days of receipt.</i></p>
<p><i>This disclaimer is applicable for all other data files accompanying
the Unicode Character Database, some of which have been compiled by the
Unicode Consortium, and some of which have been supplied by other sources.</i></p>
</blockquote>
<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
<blockquote>
<p><i>Recipient is granted the right to make copies in any form for internal
distribution and to freely use the information supplied in the creation of
products supporting the Unicode<sup>TM</sup> Standard. The files in the
Unicode Character Database can be redistributed to third parties or other
organizations (whether for profit or not) as long as this notice and the
disclaimer notice are retained. Information can be extracted from these
files and used in documentation or programs, as long as there is an
accompanying notice indicating the source.</i></p>
</blockquote>
<hr width="50%">
<div align="center">
<center>
<table cellspacing="0" cellpadding="0" border="0">
<tr>
<td><a href="http://www.unicode.org/unicode/copyright.html"><img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
<td><a href="http://www.unicode.org/unicode/copyright.html">
<img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
</tr>
</table>
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js"></script>
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js">
</script>
</center>
</div>
</blockquote>

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2005/11/19 05:39:39 $
* $Revision: 1.24 $
* $Date: 2006/04/05 22:12:43 $
* $Revision: 1.25 $
*
*******************************************************************************
*/
@ -27,6 +27,7 @@ import com.ibm.icu.impl.CollectionUtilities;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.UCharArrayIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.StringPrep;
import com.ibm.icu.text.StringPrepParseException;
@ -45,8 +46,17 @@ public class TestData implements UCD_Types {
static UnicodeProperty.Factory upf;
public static void main (String[] args) throws IOException {
//checkChars(false);
tryConsole2();
if (true) return;
showNonCompatFull(false);
showNonCompatFull(true);
checkForCaseStability(false);
//countChars();
foo();
System.out.println("main: " + Default.getDate());
upf = ICUPropertyFactory.make();
System.out.println("after factory: " + Default.getDate());
@ -146,8 +156,152 @@ public class TestData implements UCD_Types {
}
} finally {
log.close();
}
}
/**
 * Prints every code point of Unicode 4.1.0 (other than Cn and private use)
 * whose full case folding differs from its simple lowercase mapping,
 * filtered by whether the full folding equals the simple lowercase of the
 * code point's NFKC form.
 *
 * @param compat if true, list the code points where the full folding
 *               equals the simple lowercase of the NFKC form; if false,
 *               list the ones where it does not
 */
private static void showNonCompatFull(boolean compat) {
    UCD ucd = UCD.make("4.1.0");
    Normalizer nfkc = new Normalizer(Normalizer.NFKC, ucd.getVersion());

    String heading;
    if (compat) {
        heading = "Full Fold = Simple Lower of NFKC";
    } else {
        heading = "Full Fold != Simple Lower of NFKC";
    }
    System.out.println();
    System.out.println(heading);
    System.out.println();

    int listed = 0;
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
        int category = ucd.getCategory(cp);
        if (category == Cn || category == PRIVATE_USE) {
            continue;
        }
        String text = UTF16.valueOf(cp);
        String lowered = ucd.getCase(text, SIMPLE, LOWER);
        String folded = ucd.getCase(text, FULL, FOLD);
        if (lowered.equals(folded)) {
            // Nothing to report when folding and lowercasing already agree.
            continue;
        }
        String loweredNfkc = ucd.getCase(nfkc.normalize(text), SIMPLE, LOWER);
        boolean foldEqualsLoweredNfkc = folded.equals(loweredNfkc);
        if (foldEqualsLoweredNfkc != compat) {
            continue;
        }
        System.out.println(ucd.getCodeAndName(cp));
        System.out.println("\tSimple Lower:\t" + ucd.getCodeAndName(lowered));
        System.out.println("\tFull Fold:\t" + ucd.getCodeAndName(folded));
        listed++;
    }
    System.out.println("Count:\t" + listed);
}
/**
 * Prints one line per code point from U+0001 up to (but not including)
 * U+FFFF: the hex code point, the character itself, the hex values of its
 * UTF-8 encoded bytes, and its extended name.
 *
 * @throws UnsupportedEncodingException if the "UTF-8" charset is unavailable
 */
private static void tryConsole() throws UnsupportedEncodingException {
    for (int cp = 1; cp < 0xFFFF; ++cp) {
        String text = UTF32.valueOf32(cp);
        byte[] encoded = text.getBytes("UTF-8");

        // Space-separated two-digit hex dump of the encoded bytes.
        StringBuffer hexDump = new StringBuffer();
        for (int k = 0; k < encoded.length; ++k) {
            if (k > 0) {
                hexDump.append(" ");
            }
            hexDump.append(Utility.hex(encoded[k] & 0xFF, 2));
        }

        String charName = UCharacter.getExtendedName(cp);
        System.out.println(Utility.hex(cp) + "\t(" + text + ")\t[" + hexDump.toString() + "]\t" + charName);
    }
}
/**
 * Collects every code point in U+0001..U+10FFFF whose UTF-8 encoding
 * contains at least one of the bytes 0x81, 0x8D, 0x8F, 0x90 or 0x9D, then
 * prints how many were found and what percentage of 0x110000 that is.
 * NOTE(review): these look like the byte values left unassigned in Windows
 * code page 1252 -- confirm that this is the intended meaning of "corrupted".
 *
 * @throws UnsupportedEncodingException if the "UTF-8" charset is unavailable
 */
private static void tryConsole2() throws UnsupportedEncodingException {
    UnicodeSet failures = new UnicodeSet();
    for (int cp = 1; cp <= 0x10FFFF; ++cp) {
        byte[] encoded = UTF32.valueOf32(cp).getBytes("UTF-8");
        for (int k = 0; k < encoded.length; ++k) {
            int unit = encoded[k] & 0xFF;
            boolean problematic = unit == 0x81 || unit == 0x8D || unit == 0x8F
                    || unit == 0x90 || unit == 0x9D;
            if (problematic) {
                failures.add(cp);
                // One offending byte is enough; move on to the next code point.
                break;
            }
        }
    }
    int total = failures.size();
    double percent = (total + 0.0) / 0x110000 * 100.0;
    System.out.println("Total corrupted characters: " + total);
    System.out.println("Percent corrupted characters: " + percent + "%");
}
/**
 * Prints a tab-separated table of code point counts per general category
 * for each entry of AGE_VERSIONS from index 1 onward. Two extra rows are
 * appended after the categories: "Noncharacter" and "Alphabetic".
 */
private static void countChars() {
// count[versionIndex][row]; rows LIMIT_CATEGORY and LIMIT_CATEGORY+1 are the
// two extra rows. NOTE(review): the second dimension is hard-coded to 50 and
// assumes LIMIT_CATEGORY+1 < 50 -- confirm.
int[][] count = new int[AGE_VERSIONS.length][50];
for (int j = 1; j < AGE_VERSIONS.length; ++j) {
UCD ucd = UCD.make(AGE_VERSIONS[j]);
UCDProperty alpha = DerivedProperty.make(ucd.PropAlphabetic, ucd);
// NOTE(review): alphaCount is never read or updated below.
int alphaCount = 0;
for (int i = 0; i <=0x10FFFF; ++i) {
int type = ucd.getCategory(i);
// Noncharacters are tallied in their own row instead of their category.
if (ucd.isNoncharacter(i)) type = LIMIT_CATEGORY;
++count[j][type];
// Extra row: code points with the alphabetic property, plus category Nd.
if (alpha.hasValue(i) || type == ucd.Nd) ++count[j][LIMIT_CATEGORY+1];
}
}
// Row -1 is the header row; rows 0..LIMIT_CATEGORY+1 are data rows.
for (byte i = -1; i < LIMIT_CATEGORY+2; ++i) {
switch(i) {
case -1: System.out.print("\t\t"); break;
// The default label sits mid-switch; it handles the ordinary category rows.
default: System.out.print(UCD.getCategoryID_fromIndex(i,UCD.LONG) + "\t" + UCD.getCategoryID_fromIndex(i)); break;
case LIMIT_CATEGORY: System.out.print("Noncharacter" + "\t" + "NCCP"); break;
case LIMIT_CATEGORY+1: System.out.print("Alphabetic" + "\t" + "alpha"); break;
}
for (int j = 1; j < AGE_VERSIONS.length; ++j) {
// Header row prints the version label between asterisks; data rows print the count.
if (i < 0) System.out.print("\t*" + AGE_VERSIONS[j] + "*");
else System.out.print("\t" + count[j][i]);
}
System.out.println();
}
}
/**
 * Sorts a fixed list of "vice president" spelling variants with a French
 * collator (quaternary strength, alternate handling not shifted), prints
 * each sorted entry on its own tab-prefixed line, and finally prints all
 * entries on one line separated by semicolons.
 */
private static void foo() {
    String[] samples = {
        "vicepresident",
        "vice president",
        "vice-president",
        "vice-président",
        "vice-president's offices",
        "vice-presidents' offices",
        "vice-presidents offices",
        "vice-presidentsoffices",
    };
    RuleBasedCollator collator = (RuleBasedCollator) Collator.getInstance(new ULocale("fr"));
    collator.setStrength(Collator.QUATERNARY);
    collator.setAlternateHandlingShifted(false);
    Arrays.sort(samples, collator);

    String previous = "";
    int[] level = new int[1];
    for (int idx = 0; idx < samples.length; ++idx) {
        String current = samples[idx];
        // The comparison result is not printed, but the call is kept because
        // levelCompare changes the collator's strength.
        levelCompare(collator, previous, current, level);
        System.out.println("\t" + current);
        previous = current;
    }

    StringBuffer joined = new StringBuffer();
    for (int idx = 0; idx < samples.length; ++idx) {
        joined.append(samples[idx]).append(";");
    }
    System.out.print(joined.toString());
    System.out.println();
}
// Dot markers of increasing length (one to five dots), meant to be indexed by
// the level reported through levelCompare's level[0]; in this part of the file
// it is referenced only from a commented-out print statement in foo().
static String[] levelStrings = {".", "..", "...", "....", "....."};
/**
 * Finds the weakest collation strength at which two strings differ.
 *
 * The strengths are tried in order PRIMARY, SECONDARY, TERTIARY, QUATERNARY,
 * IDENTICAL. The previous version passed the raw loop counter 0..14 to
 * setStrength; the only valid values are the five constants above
 * (IDENTICAL is 15), so it could never test the identical level and handed
 * the collator an invalid strength as soon as the first four levels compared
 * equal.
 *
 * @param col   collator used for the comparisons; its strength is changed
 *              temporarily and restored before returning
 * @param a     first string
 * @param b     second string
 * @param level one-element output array; level[0] receives the index
 *              (0 = primary ... 4 = identical) of the first strength at which
 *              the strings differ, or 0 if they never differ
 * @return the comparison result at that strength, or 0 if the strings are
 *         equal at every strength
 */
static int levelCompare(RuleBasedCollator col, String a, String b, int[] level) {
    final int[] strengths = {
        Collator.PRIMARY,
        Collator.SECONDARY,
        Collator.TERTIARY,
        Collator.QUATERNARY,
        Collator.IDENTICAL,
    };
    // Remember the caller's setting so probing does not leak out of this method.
    final int savedStrength = col.getStrength();
    int diff = 0;
    level[0] = 0;
    try {
        for (int i = 0; i < strengths.length; ++i) {
            col.setStrength(strengths[i]);
            diff = col.compare(a, b);
            if (diff != 0) {
                level[0] = i;
                break;
            }
        }
    } finally {
        col.setStrength(savedStrength);
    }
    return diff;
}
Matcher m;
/**
@ -163,12 +317,12 @@ public class TestData implements UCD_Types {
return true;
}
private static void checkChars(boolean mergeRanges) {
private static void checkForCaseStability(boolean mergeRanges) {
UCD ucd = Default.ucd();
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
UnicodeSet isUpper = ups.getSet("Uppercase=true");
UnicodeSet isLower = ups.getSet("Lowercase=true");
UnicodeSet isTitle = ups.getSet("gc=Lt");
UnicodeSet propUppercase = ups.getSet("Uppercase=true");
UnicodeSet propLowercase = ups.getSet("Lowercase=true");
UnicodeSet isGcLt = ups.getSet("gc=Lt");
UnicodeSet otherAlphabetic = ups.getSet("Alphabetic=true").addAll(ups.getSet("gc=Sk"));
// create the following
UnicodeSet hasFold = new UnicodeSet();
@ -177,6 +331,10 @@ public class TestData implements UCD_Types {
UnicodeSet hasTitle = new UnicodeSet();
UnicodeSet compat = new UnicodeSet();
UnicodeSet bicameralsScripts = new UnicodeSet();
UnicodeSet isFUppercase = new UnicodeSet();
UnicodeSet isFLowercase = new UnicodeSet();
UnicodeSet isFTitlecase = new UnicodeSet();
UCD u40 = UCD.make("4.0.0");
BitSet scripts = new BitSet();
@ -184,41 +342,83 @@ public class TestData implements UCD_Types {
int gc = ucd.getCategory(i);
if (gc == Cn || gc == PRIVATE_USE) continue;
String str = UTF16.valueOf(i);
if (!str.equals(ucd.getCase(str, FULL, FOLD))) hasFold.add(i);
if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(i);
if (!str.equals(ucd.getCase(str, FULL, FOLD))) {
hasFold.add(i);
scripts.set(ucd.getScript(i));
}
if (!str.equals(ucd.getCase(str, FULL, UPPER))) {
hasUpper.add(i);
scripts.set(ucd.getScript(i));
} else {
isFUppercase.add(i);
}
if (!str.equals(ucd.getCase(str, FULL, LOWER))) {
hasLower.add(i);
scripts.set(ucd.getScript(i));
} else {
isFLowercase.add(i);
}
if (!str.equals(ucd.getCase(str, FULL, TITLE))) {
hasTitle.add(i);
scripts.set(ucd.getScript(i));
} else {
isFTitlecase.add(i);
}
if (!str.equals(ucd.getCase(str, FULL, TITLE))) hasTitle.add(i);
if (!str.equals(Default.nfkd().normalize(str))) compat.add(i);
//System.out.println(ucd.getCodeAndName(i) + "\t" + (u40.isAllocated(i) ? "already in 4.0" : "new in 4.1"));
}
BagFormatter bf = new BagFormatter();
Transliterator nullTrans = Transliterator.getInstance("null");
bf.setShowLiteral(nullTrans);
bf.setMergeRanges(mergeRanges);
bf.setUnicodePropertyFactory(ups);
printItems(bf, compat, "isUpper or isTitle without hasLower",
new UnicodeSet(isUpper).addAll(isTitle).removeAll(hasLower));
printItems(bf, compat, "hasLower, but not isUpper or isTitle",
new UnicodeSet(hasLower).removeAll(isTitle).removeAll(isUpper));
printItems(bf, compat, "isLower without hasUpper",
new UnicodeSet(isLower).addAll(isTitle).removeAll(hasUpper));
printItems(bf, compat, "hasUpper, but not isLower or isTitle",
new UnicodeSet(hasUpper).removeAll(isTitle).removeAll(isLower));
UnicodeSet allCased = new UnicodeSet().addAll(hasUpper).addAll(hasLower).addAll(hasTitle);
isFUppercase.retainAll(allCased);
isFLowercase.retainAll(allCased);
isFTitlecase.retainAll(allCased);
System.out.println(Utility.BOM);
printItems(bf, compat, "Uppercase=true or gc=Lt without hasLower",
new UnicodeSet(propUppercase).addAll(isGcLt).removeAll(hasLower));
printItems(bf, compat, "hasLower, but not (Uppercase=true or gc=Lt)",
new UnicodeSet(hasLower).removeAll(isGcLt).removeAll(propUppercase));
printItems(bf, compat, "Lowercase=true without hasUpper",
new UnicodeSet(propLowercase).addAll(isGcLt).removeAll(hasUpper));
printItems(bf, compat, "hasUpper, but not (Lowercase=true or gc=Lt)",
new UnicodeSet(hasUpper).removeAll(isGcLt).removeAll(propLowercase));
printItems(bf, compat, "Functionally Uppercase, but not Uppercase=true",
new UnicodeSet(isFUppercase).removeAll(propUppercase));
printItems(bf, compat, "Uppercase=true, but not functionally Uppercase",
new UnicodeSet(propUppercase).removeAll(isFUppercase));
printItems(bf, compat, "Functionally Lowercase, but not Lowercase=true",
new UnicodeSet(isFLowercase).removeAll(propLowercase));
printItems(bf, compat, "Lowercase=true, but not functionally Lowercase",
new UnicodeSet(propLowercase).removeAll(isFLowercase));
UnicodeSet scriptSet = new UnicodeSet();
UnicodeProperty scriptProp = ups.getProperty("Script");
bf.setMergeRanges(true);
System.out.println();
System.out.println("Bicameral Scripts: those with at least one functionally cased character.");
System.out.println();
for (int i = 0; i < scripts.size(); ++i) {
if (!scripts.get(i)) continue;
if (i == COMMON_SCRIPT) continue;
//if (i == COMMON_SCRIPT) continue;
String scriptName = ucd.getScriptID_fromIndex((byte)i);
System.out.println(scriptName);
scriptSet.addAll(scriptProp.getSet(scriptName));
UnicodeSet scriptUSet = scriptProp.getSet(scriptName);
scriptSet.addAll(scriptUSet);
printItems(bf, compat, "Bicameral Script: " + scriptName,
new UnicodeSet(allCased).retainAll(scriptUSet));
}
UnicodeSet allCased = new UnicodeSet().addAll(isUpper).addAll(isLower).addAll(isTitle);
printItems(bf, compat, "(Bicameral) isAlpha or Symbol Modifier, but not isCased",
bf.setMergeRanges(false);
printItems(bf, compat, "Bicameral Script: isAlpha or Symbol Modifier, but not isCased",
new UnicodeSet(scriptSet).retainAll(otherAlphabetic).removeAll(allCased));
printItems(bf, compat, "(Bicameral) isCased, but not isAlpha or Symbol Modifier",
printItems(bf, compat, "Bicameral Script: isCased, but not isAlpha or Symbol Modifier",
new UnicodeSet(scriptSet).retainAll(allCased).removeAll(otherAlphabetic));
}
@ -302,21 +502,21 @@ public class TestData implements UCD_Types {
}
}
public static class RegexMatcher implements UnicodeProperty.Matcher {
public static class RegexMatcher implements UnicodeProperty.PatternMatcher {
private Matcher matcher;
public UnicodeProperty.Matcher set(String pattern) {
public UnicodeProperty.PatternMatcher set(String pattern) {
matcher = Pattern.compile(pattern).matcher("");
return this;
}
public boolean matches(String value) {
matcher.reset(value);
public boolean matches(Object value) {
matcher.reset((String)value);
return matcher.matches();
}
}
static BagFormatter bf = new BagFormatter();
static UnicodeProperty.Matcher matcher = new RegexMatcher();
static UnicodeProperty.PatternMatcher matcher = new RegexMatcher();
private static void showPropDiff(String p1, UnicodeSet s1, String p2, UnicodeSet s2) {
System.out.println("Property Listing");

View file

@ -26,8 +26,16 @@ public class TestIdentifiers {
public static void main(String[] args) throws IOException {
String[] tests = { "SØS", "façade", "MOPE", "VOP", "scope", "ibm", "vop",
"toys-я-us", "1iνе", "back", "boгing" };
TestIdentifiers ti = new TestIdentifiers("L");
TestIdentifiers tiany = new TestIdentifiers("A");
ti.loadIdentifiers();
UnicodeSet idnCharSet = ti.idnChars.getSet("output", new UnicodeSet());
System.out.println("idnCharSet: " + idnCharSet.size());
UnicodeSet idnCharNonStarting = ti.nonstarting;
System.out.println("idnCharNonStarting: " + idnCharSet);
if (true) return;
for (int i = 0; i < tests.length; ++i) {
System.out.print(tests[i]);
String folded = UCharacter.foldCase(tests[i], true);

View file

@ -10,6 +10,7 @@ import java.util.List;
import java.util.Locale;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.tool.UOption;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeMatcher;
@ -17,9 +18,26 @@ import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.Utility;
public class TestUnicodeInvariants {
// Indices into the options array below; each value must match the position
// of the corresponding entry in that array.
private static final int
HELP1 = 0,
FILE = 1,
RANGE = 2
;
// Command-line options parsed in main(): help, "file"/'f' (takes an
// argument) and "range"/'r' (a flag without an argument).
private static final UOption[] options = {
UOption.HELP_H(),
UOption.create("file", 'f', UOption.REQUIRES_ARG),
UOption.create("range", 'r', UOption.NO_ARG),
};
public static void main(String[] args) throws IOException {
testInvariants();
UOption.parseArgs(args, options);
String file = "UnicodeInvariants.txt";
if (options[FILE].doesOccur) file = options[FILE].value;
boolean doRange = options[RANGE].doesOccur;
testInvariants(file, doRange);
}
/**
@ -68,19 +86,19 @@ public class TestUnicodeInvariants {
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\~ \\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
public static void testInvariants() throws IOException {
public static void testInvariants(String outputFile, boolean doRange) throws IOException {
String[][] variables = new String[100][2];
int variableCount = 0;
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
out.write('\uFEFF'); // BOM
BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", "UnicodeInvariants.txt");
BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", outputFile);
BagFormatter bf = new BagFormatter();
bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
BagFormatter bf2 = new BagFormatter();
bf2.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
bf2.setMergeRanges(false);
bf2.setMergeRanges(doRange);
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"),
ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"),
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
ParsePosition pp = new ParsePosition(0);
int parseErrorCount = 0;
@ -113,11 +131,20 @@ public class TestUnicodeInvariants {
// detect variables
if (line.startsWith("Show")) {
String part = line.substring(4).trim();
if (part.startsWith("Each")) {
part = part.substring(4).trim();
bf2.setMergeRanges(false);
}
pp.setIndex(0);
UnicodeSet leftSet = new UnicodeSet(part, pp, st);
bf2.showSetNames(out, leftSet);
bf2.setMergeRanges(doRange);
continue;
}
if (line.startsWith("Test")) {
line = line.substring(4).trim();
}
char relation = 0;
String rightSide = null;
@ -166,7 +193,7 @@ public class TestUnicodeInvariants {
boolean ok = true;
switch(relation) {
case '=': ok = leftSet.equals(rightSet); break;
case '=': case '\u2261': ok = leftSet.equals(rightSet); break;
case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break;
case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break;
case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break;

View file

@ -254,18 +254,19 @@ public class ToolUnicodePropertySource extends UnicodeProperty.Factory {
, "Katakana");
Object foo = unicodeMap.getSet("Katakana");
UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true");
UnicodeProperty lineBreak = getProperty("Line_Break");
unicodeMap.putAll(getProperty("Alphabetic").getSet("true")
.add(0xA0).add(0x05F3)
.removeAll(getProperty("Ideographic").getSet("true"))
.removeAll(unicodeMap.getSet("Katakana"))
.removeAll(script.getSet("Thai"))
.removeAll(script.getSet("Lao"))
//.removeAll(script.getSet("Thai"))
//.removeAll(script.getSet("Lao"))
.removeAll(lineBreak.getSet("SA"))
.removeAll(script.getSet("Hiragana"))
.removeAll(graphemeExtend),
"ALetter");
unicodeMap.putAll(new UnicodeSet("[\\u0027\\u00B7\\u05F4\\u2019\\u2027\\u003A]")
,"MidLetter");
UnicodeProperty lineBreak = getProperty("Line_Break");
unicodeMap.putAll(lineBreak.getSet("Infix_Numeric")
.remove(0x003A), "MidNum");
unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric");

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2005/11/01 00:10:54 $
* $Revision: 1.40 $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.41 $
*
*******************************************************************************
*/
@ -43,7 +43,8 @@ public final class UCD implements UCD_Types {
/**
* Used for the default version.
*/
public static final String latestVersion = "5.1.0";
public static final String latestVersion = "5.0.0";
public static final String lastVersion = "4.1.0";
/**
* Create singleton instance for default (latest) version
@ -803,6 +804,9 @@ public final class UCD implements UCD_Types {
}
/**
 * Returns the script value stored for a code point.
 *
 * @param codePoint the code point to look up
 * @return the {@code script} field of the record obtained from
 *         {@code get(codePoint, false)}
 */
public byte getScript(int codePoint) {
    // The lookup is the whole job; the earlier special case for 0xE000 only
    // added zero to the argument (a debugger hook) and had no effect.
    return get(codePoint, false).script;
}
@ -1398,6 +1402,7 @@ to guarantee identifier closure.
}
if (isHangul) {
if (fixStrings) result.decompositionMapping = getHangulDecompositionPair(codePoint);
if (isLV(codePoint)) result.lineBreak = LB_H2; else result.lineBreak = LB_H3;
result.decompositionType = CANONICAL;
}
return result;
@ -1612,6 +1617,9 @@ to guarantee identifier closure.
}
combiningClassSet.set(uData.combiningClass & 0xFF);
if (cp == 0xE000) {
System.out.println("Check: " + uData.script);
}
add(uData);
}
/*

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2005/03/10 02:37:20 $
* $Revision: 1.31 $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.32 $
*
*******************************************************************************
*/
@ -345,7 +345,12 @@ final class UCD_Names implements UCD_Types {
"SYLOTI_NAGRI",
"OLD_PERSIAN",
"KHAROSHTHI",
"Balinese",
"Cuneiform",
"Phoenician",
"Phags-pa",
"Nko",
"Unknown"
};
public static final Map EXTRA_SCRIPT = new HashMap();
@ -426,11 +431,14 @@ final class UCD_Names implements UCD_Types {
"Sylo",
"Xpeo",
"Khar",
"Bali",
"Xsux",
"Phnx",
"Phag",
"Nkoo",
"Zzzz"
};
static final String[] AGE = {
"unassigned",
"1.1",
@ -441,9 +449,9 @@ final class UCD_Names implements UCD_Types {
"3.2",
"4.0",
"4.1",
"5.0",
};
static final String[] GENERAL_CATEGORY = {
"Cn", // = Other, Not Assigned 0

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2005/11/01 00:10:54 $
* $Revision: 1.32 $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.33 $
*
*******************************************************************************
*/
@ -398,8 +398,14 @@ public interface UCD_Types {
SYLOTI_NAGRI = 60,
OLD_PERSIAN = 61,
KHAROSHTHI = 62,
Balinese = 63,
Cuneiform = 64,
Phoenician = 65,
Phags_Pa = 66,
NKo = 67,
Unknown_Script = 68,
LIMIT_SCRIPT = 63;
LIMIT_SCRIPT = 69;
static final int
UNKNOWN = 0,
@ -411,7 +417,8 @@ public interface UCD_Types {
AGE32 = 6,
AGE40 = 7,
AGE41 = 8,
LIMIT_AGE = 9;
AGE50 = 9,
LIMIT_AGE = 10;
static final String[] AGE_VERSIONS = {
"?",
@ -422,7 +429,8 @@ public interface UCD_Types {
"3.1.0",
"3.2.0",
"4.0.0",
"4.1.0"
"4.1.0",
"5.0.0"
};
public static byte

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
* $Date: 2004/02/12 08:23:16 $
* $Revision: 1.11 $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.12 $
*
*******************************************************************************
*/
@ -45,7 +45,7 @@ class UData implements UCD_Types {
byte lineBreak = LB_XX;
byte joiningType = -1;
byte joiningGroup = NO_SHAPING;
byte script = COMMON_SCRIPT;
byte script = Unknown_Script;
byte age = 0;
static final UData UNASSIGNED = new UData();

View file

@ -1,10 +1,4 @@
#/**
# *******************************************************************************
# * Copyright (C) 2002-2004, International Business Machines Corporation and *
# * others. All Rights Reserved. *
# *******************************************************************************
# */
#Override List
#Override List
#Format is <code><tab><char><tab><pinyin>(<tab><comment>)?
#Note: the 'code' field is currently discarded; only the char is important.
#Note: if there is conflict, the FIRST char wins.

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Counter.java,v $
* $Date: 2005/10/11 19:39:15 $
* $Revision: 1.3 $
* $Date: 2006/04/05 22:12:45 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -21,7 +21,7 @@ import java.io.*;
import java.text.*;
public final class Counter {
Map map = new HashMap();
private Map map = new HashMap();
static public final class RWInteger implements Comparable {
static int uniqueCount;
@ -92,5 +92,11 @@ public final class Counter {
return result;
}
/**
 * Returns the key set of the backing map.
 * The result comes straight from {@link Map#keySet()}, so it is a live view
 * of the internal map rather than a copy.
 *
 * @return the keys currently held in the backing map
 */
public Set keySet() {
    final Set keys = map.keySet();
    return keys;
}
/**
 * Exposes the backing map through a read-only wrapper.
 *
 * @return the result of {@link Collections#unmodifiableMap(Map)} applied to
 *         the internal map; attempts to modify it throw
 *         {@link UnsupportedOperationException}
 */
public Map getMap() {
    final Map readOnlyView = Collections.unmodifiableMap(map);
    return readOnlyView;
}
}

View file

@ -48,7 +48,7 @@ public class UnicodeDataFile {
out.println("# For documentation, see UCD.html");
}
try {
Utility.appendFile(filename + "Header" + fileType, Utility.UTF8_UNIX, out);
Utility.appendFile("com/ibm/text/UCD/" + filename + "Header" + fileType, Utility.UTF8_UNIX, out);
} catch (FileNotFoundException e) {
/*
out.println("# Unicode Character Database: Derived Property Data");

View file

@ -77,7 +77,10 @@ exist:<br>
&lt;UCD_DIR&gt;/EXTRAS-Update</p>
<h3>2. Download all of the UnicodeData files for each version into UCD_DIR.</h3>
<p>The folder names must be of the form: &quot;3.2.0-Update&quot;, so rename the folders on the<br>
Unicode site to this format.</p>
Unicode site to this format. <span style="background-color: #FFFF00">If the
folder contains a ucd subdirectory, then make the contents of that subdirectory be the contents of
the x.x.x-Update directory. That is, each directory will directly contain files
like PropList....txt</span></p>
<h4>2a Ensure Complete Release</h4>
<p>If you are downloading any &quot;incomplete&quot; release (one that does not contain a complete set of data
files for that release), you need to also download the previous complete release. Most of the N.M-Update
@ -87,6 +90,7 @@ directoriess are complete, *except*:</p>
<p>Also, make the following changes to UnicodeData for 1.1.5:</p>
<p><b>Delete</b></p>
<pre>3400;HANGUL SYLLABLE KIYEOK A;Lo;0;L;1100 1161;;;;N;;;;;
...
4DFF;HANGUL SYLLABLE MIEUM WEO RIEUL-THIEUTH;Lo;0;L;1106 116F 11B4;;;;N;;;;;
4E00;<cjk IDEOGRAPH REPRESENTATIVE>;Lo;0;L;;;;;N;;;;;</pre>
<p><b>Add:</b></p>
@ -106,18 +110,19 @@ BASE_DIR + &quot;Collation\allkeys&quot; + VERSION + &quot;.txt&quot;.<br>
<br>
If you have it in a different location, change that value for KEYS in UCA.java, and <br>
the value for BASE_DIR</p>
<h4>2c. Here is an example of the default directory structure with files:</h4>
<h4>2c. Here is an example of the default directory structure with files. All of
the yellow ones should exist</h4>
<pre>C://DATA/
BIN/
Collation/
<span style="background-color: #FFFF00"> Collation/
allkeys-3.1.1.txt
</span>
GEN/
DerivedData/
ExtractedProperties
UCD/
<span style="background-color: #FFFF00"> </span><span style="background-color: #FFFF00">UCD/
3.0.0-Update/
Unihan-3.2.0.txt
...
@ -133,69 +138,145 @@ the value for BASE_DIR</p>
ArabicShaping-4.0.0d14b.txt
BidiMirroring-4.0.0d1b.txt
...
EXTRAS-Update/</pre>
EXTRAS-Update/</span></pre>
<h3>3. Versions</h3>
<p>All of the following have &quot;version X&quot; in the options you give to Java (either on the&nbsp;
command line, or in the Eclipse 'run' options). If you want a specific version like 3.1.1, then you
would write &quot;version 3.1.1&quot;. If you want the latest version (4.1.0), you can omit the &quot;version X&quot;.</p>
<h3>4. Running UCD, you will use com.ibm.text.UCD.Main as your main class.</h3>
<p>The Working directory has to be C:\ICU4J\unicodetools\com\ibm\text\UCD<br>
(In Eclipse you can also use ${workspace_loc:UnicodeTools/com/ibm/text/UCD}, which abstracts away
the location.)<br>
<br>
The same for UCA:</p>
<p>main: com.ibm.text.UCA.Main<br>
directory: <a href="file:///C:/ICU4J/unicodetools/com/ibm/text/UCA">
C:\ICU4J\unicodetools\com\ibm\text\UCA</a></p>
<h4>4a. BIN</h4>
<p>For each version, the tools build a set of binary data in BIN that contain the information for
that release. This is done automatically, or you can manually do it with the options<br>
<br>
version X build<br>
<br>
This builds a compressed format of all the UCD data (except blocks and Unihan) into the BIN
directory. Don't worry about the voluminous console messages, unless one says &quot;FAIL&quot;.<br>
<br>
<font color="#FF0000"><i>You have to manually do this if you change any of the data files in that
version!!</i></font></p>
<p>Note: if for any reason you modify the binary format of the BIN files, you also have to bump the
value in that file:<br>
<br>
static final byte BINARY_FORMAT = 8; // bumped if binary format of UCD changes</p>
<h4>4b. To build the Unicode files for a particular version X, run the Main with the following
argument:</h4>
<p>MakeUnicodeFiles.generateFile</p>
<p>This will execute the commands in the file MakeUnicodeFiles.txt.</p>
<p>You will edit that file if you want a different 'd' version for the files, OR if you want to
change which files are built. At the top of the file you will see the following text:</p>
<pre>Generate: </pre>
<pre>DeltaVersion: 7</pre>
<h4>4c. To change which files are built, put any number of regular expressions separated by spaces
after Generate. Eg,</h4>
<pre>Generate: .*line.* prop.*</pre>
<p>The matching is case-insensitive.</p>
<h4>4d. To change the 'd' number that is appended to the generated files names, change the
DeltaVersion.</h4>
<h4>4e. To run basic consistency checking, run:</h4>
<p>version X verify<br>
<br>
Don't worry about any console messages except those that say FAIL.</p>
<h4>4f. Output</h4>
<p>The files will be generated in the GEN directories.</p>
<ul>
<li>If they are the same as previous files (except for the first line and Date), they will be
renamed to UNCHANGED... </li>
<li>If they are not, then a bat file will be generated in the DIFF directory. Double-clicking on
this file will launch CompareIt, which is a nice diff program. Get compareIt from
<a class="xurl" href="http://www.grigsoft.com/files.htm">http://www.grigsoft.com/files.htm</a> (be
sure to get the Unicode version); then you can also set it as the diff program in CVS with
Admin/Preferences/WinCVS, External Diff = C:\Program Files\Compare It!\wincmp3.exe (or equiv).</li>
</ul>
<h3>5. Running UCA, you will use com.ibm.text.UCA.Main as your main class.</h3>
<h4>5a. To build all the UCA files used by ICU, use the option:</h4>
<p>java &lt;UCA&gt;Main ICU</p>
<h4>6. To build all the charts, use the UCA project, with options: normalizationChart caseChart
scriptChart indexChart</h4>
<h3>4. Building Files</h3>
<ol>
<li><b>Setup</b><ol>
<li>In Eclipse, open the Package Explorer (Use Window&gt;Show View if you
don't see it)</li>
<li>Open UnicodeTools<ul>
<li>com.ibm.text.UCD<ul>
<li>MakeUnicodeFiles.<span style="background-color: #FFFF00">txt</span><p>This file drives the production of
the derived Unicode files. The first three lines contain
parameters that you may want to modify at some times:</p>
<pre>Generate: <b>.*script.*</b> <i>// this is a regular expression. Use .* for all files</i>
DeltaVersion: <b>10</b> <i> // This gets appended to the file name. Pick 1+ the highest value in Public</i>
CopyrightYear: <b>2006</b> <i> // Pick the current year</i></pre>
</li>
</ul>
</li>
</ul>
</li>
<li>Open in Package Explorer
<ul>
<li>com.ibm.text.UCD<ul>
<li>Main</li>
</ul>
</li>
</ul>
</li>
<li>Run&gt;Run As...<ol>
<li>Choose Java Application<ul>
<li>it will fail, don't worry; you need to set some parameters</li>
</ul>
</li>
</ol>
</li>
<li>Run&gt;Run...<ul>
<li>Select the Arguments tab, and fill in the following<ul>
<li>Program arguments:<pre>build 5.0 MakeUnicodeFiles</pre>
</li>
<li>VM arguments:
<pre>-Xms512m -Xmx512m</pre>
</li>
</ul>
</li>
<li>Close and Save</li>
</ul>
</li>
</ol>
</li>
<li><b>Run</b><ol>
<li>You'll see it build the 5.0 files, with something like the following
results:<pre>Writing UCD_Data5.0.0
Data Size: 109,802
Wrote Data 109802</pre>
</li>
<li>For each version, the tools build a set of binary data in BIN that
contain the information for that release. This is done automatically, or
you can manually do it with the Program Arguments<pre>version X build</pre>
<p>This builds a compressed format of all the UCD data (except blocks
and Unihan) into the BIN directory. Don't worry about the voluminous
console messages, unless one says &quot;FAIL&quot;.</p>
<p><font color="#FF0000"><i>You have to manually do this if you change
any of the data files in that version!</i></font></p>
<p>Note: if for any reason you modify the binary format of the BIN files, you also have to bump the
value in that file:</p>
<pre>static final byte BINARY_FORMAT = 8; // bumped if binary format of UCD changes</pre>
</li>
</ol>
</li>
<li>Results in <a href="file:///C:/DATA/GEN/DerivedData">
C:\DATA\GEN\DerivedData</a><ol>
<li>The files will be in this directory.</li>
<li>There are also DIFF folders, that contain BAT files that you can run
on Windows with CompareIt. (You can modify the code to build BATs with
another Diff program if you want).<ol>
<li>For any file with a significant difference, it will build two
BAT files, such as the first two below.<pre>Diff_PropList-5.0.0d10.txt.bat
OLDER-Diff_PropList-5.0.0d10.txt.bat
UNCHANGED-Diff_PropertyValueAliases-5.0.0d10.txt.bat</pre>
</li>
</ol>
</li>
<li>Any files without significant changes will have &quot;UNCHANGED&quot; as a
prefix: ignore them.&nbsp; The OLDER prefix is the comparison to the
last version of Unicode.</li>
<li>On Windows you can run these BATs to compare files:</li>
</ol>
</li>
</ol>
<h3>5. Invariant Checking</h3>
<ol>
<li>Setup<ol>
<li>Open in Package Explorer<ul>
<li>com.ibm.text.UCD<ul>
<li>TestUnicodeInvariants.java</li>
</ul>
</li>
</ul>
</li>
<li>Run&gt;Run As... Java Application<br>
Will create the following file of results:<pre><a href="file:///C:/DATA/GEN/UnicodeInvariantResults.txt">C:\DATA\GEN\UnicodeInvariantResults.txt</a></pre>
</li>
<li>Open that file and search for &quot;**** START Error Info ****&quot;. Each such
point provides a dump of comparison information.</li>
</ol>
</li>
</ol>
<h3>6. Options</h3>
<ol>
<li>If you want to see files that are opened while processing, do the
following:<ol>
<li>Run&gt;Run</li>
<li>Select the Arguments tab, and add the following<ol>
<li>VM arguments:
<pre>-DSHOW_FILES</pre>
</li>
</ol>
</li>
</ol>
</li>
</ol>
<h3>7. UCA</h3>
<ol>
<li>
<h3>You will use com.ibm.text.UCA.Main as your main class, creating a run configuration along
the same lines as above.</h3></li>
<li>
<h4>To build all the UCA files used by ICU, use the Program arguments:</h4>
<pre>Main ICU</pre>
</li>
<li>
<h4>To build all the charts, use the UCA project, with options: </h4>
<pre>normalizationChart caseChart scriptChart indexChart</pre>
</li>
</ol>
</body>